diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,140988 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 201340, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.966722956193504e-05, + "grad_norm": 3.921875, + "learning_rate": 0.0008, + "loss": 3.0588, + "step": 1 + }, + { + "epoch": 0.0004966722956193504, + "grad_norm": 1.3125, + "learning_rate": 0.0007999642395947154, + "loss": 2.3153, + "step": 10 + }, + { + "epoch": 0.0009933445912387007, + "grad_norm": 0.53125, + "learning_rate": 0.0007999245058110659, + "loss": 1.8684, + "step": 20 + }, + { + "epoch": 0.001490016886858051, + "grad_norm": 0.53515625, + "learning_rate": 0.0007998847720274163, + "loss": 1.8397, + "step": 30 + }, + { + "epoch": 0.0019866891824774015, + "grad_norm": 0.609375, + "learning_rate": 0.0007998450382437668, + "loss": 1.7934, + "step": 40 + }, + { + "epoch": 0.0024833614780967518, + "grad_norm": 1.09375, + "learning_rate": 0.0007998053044601173, + "loss": 1.7761, + "step": 50 + }, + { + "epoch": 0.002980033773716102, + "grad_norm": 0.419921875, + "learning_rate": 0.0007997655706764677, + "loss": 1.7596, + "step": 60 + }, + { + "epoch": 0.0034767060693354523, + "grad_norm": 0.8671875, + "learning_rate": 0.0007997258368928181, + "loss": 1.7166, + "step": 70 + }, + { + "epoch": 0.003973378364954803, + "grad_norm": 0.482421875, + "learning_rate": 0.0007996861031091686, + "loss": 1.6598, + "step": 80 + }, + { + "epoch": 0.004470050660574153, + "grad_norm": 0.5234375, + "learning_rate": 0.0007996463693255191, + "loss": 1.6145, + "step": 90 + }, + { + "epoch": 0.0049667229561935035, + "grad_norm": 0.53125, + "learning_rate": 0.0007996066355418695, + "loss": 1.5857, + "step": 100 + }, + { + "epoch": 0.005463395251812854, + "grad_norm": 0.40625, + "learning_rate": 0.00079956690175822, + "loss": 1.5669, + "step": 110 + }, + { + "epoch": 0.005960067547432204, + "grad_norm": 0.50390625, + "learning_rate": 0.0007995271679745704, + "loss": 1.5253, + "step": 120 + }, + { + "epoch": 0.006456739843051554, + "grad_norm": 0.396484375, + "learning_rate": 0.0007994874341909208, + "loss": 1.5324, + "step": 130 + }, + { + "epoch": 0.006953412138670905, + "grad_norm": 0.60546875, + "learning_rate": 0.0007994477004072714, + "loss": 1.4808, + "step": 140 + }, + { + "epoch": 0.007450084434290256, + "grad_norm": 0.48046875, + "learning_rate": 0.0007994079666236218, + "loss": 1.4696, + "step": 150 + }, + { + "epoch": 0.007946756729909606, + "grad_norm": 0.3984375, + "learning_rate": 0.0007993682328399723, + "loss": 1.45, + "step": 160 + }, + { + "epoch": 0.008443429025528956, + "grad_norm": 0.408203125, + "learning_rate": 0.0007993284990563226, + "loss": 1.4663, + "step": 170 + }, + { + "epoch": 0.008940101321148307, + "grad_norm": 0.44921875, + "learning_rate": 0.0007992887652726731, + "loss": 1.4412, + "step": 180 + }, + { + "epoch": 0.009436773616767657, + "grad_norm": 0.427734375, + "learning_rate": 0.0007992490314890237, + "loss": 1.3908, + "step": 190 + }, + { + "epoch": 0.009933445912387007, + "grad_norm": 0.51953125, + "learning_rate": 0.000799209297705374, + "loss": 1.3938, + "step": 200 + }, + { + "epoch": 0.010430118208006357, + "grad_norm": 0.43359375, + "learning_rate": 0.0007991695639217245, + "loss": 1.3474, + "step": 210 + }, + { + "epoch": 0.010926790503625708, + "grad_norm": 0.36328125, + "learning_rate": 0.0007991298301380749, + "loss": 1.3319, + "step": 220 + }, + { + "epoch": 0.011423462799245058, + "grad_norm": 0.35546875, + "learning_rate": 0.0007990900963544253, + "loss": 1.3428, + "step": 230 + }, + { + "epoch": 0.011920135094864408, + "grad_norm": 0.431640625, + "learning_rate": 0.0007990503625707759, + "loss": 1.3505, + "step": 240 + }, + { + "epoch": 0.012416807390483758, + "grad_norm": 0.53125, + "learning_rate": 0.0007990106287871263, + "loss": 1.3445, + "step": 250 + }, + { + "epoch": 0.012913479686103109, + "grad_norm": 0.345703125, + "learning_rate": 0.0007989708950034767, + "loss": 1.3059, + "step": 260 + }, + { + "epoch": 0.013410151981722459, + "grad_norm": 0.318359375, + "learning_rate": 0.0007989311612198272, + "loss": 1.3184, + "step": 270 + }, + { + "epoch": 0.01390682427734181, + "grad_norm": 0.359375, + "learning_rate": 0.0007988914274361776, + "loss": 1.2753, + "step": 280 + }, + { + "epoch": 0.01440349657296116, + "grad_norm": 0.4140625, + "learning_rate": 0.0007988516936525282, + "loss": 1.2823, + "step": 290 + }, + { + "epoch": 0.014900168868580511, + "grad_norm": 0.451171875, + "learning_rate": 0.0007988119598688786, + "loss": 1.3007, + "step": 300 + }, + { + "epoch": 0.015396841164199862, + "grad_norm": 0.4296875, + "learning_rate": 0.000798772226085229, + "loss": 1.2959, + "step": 310 + }, + { + "epoch": 0.015893513459819212, + "grad_norm": 0.3828125, + "learning_rate": 0.0007987324923015795, + "loss": 1.2481, + "step": 320 + }, + { + "epoch": 0.01639018575543856, + "grad_norm": 0.53515625, + "learning_rate": 0.0007986927585179299, + "loss": 1.2824, + "step": 330 + }, + { + "epoch": 0.016886858051057912, + "grad_norm": 0.412109375, + "learning_rate": 0.0007986530247342804, + "loss": 1.242, + "step": 340 + }, + { + "epoch": 0.01738353034667726, + "grad_norm": 0.42578125, + "learning_rate": 0.0007986132909506309, + "loss": 1.2588, + "step": 350 + }, + { + "epoch": 0.017880202642296613, + "grad_norm": 0.35546875, + "learning_rate": 0.0007985735571669812, + "loss": 1.2736, + "step": 360 + }, + { + "epoch": 0.01837687493791596, + "grad_norm": 0.4765625, + "learning_rate": 0.0007985338233833317, + "loss": 1.2348, + "step": 370 + }, + { + "epoch": 0.018873547233535314, + "grad_norm": 0.283203125, + "learning_rate": 0.0007984940895996822, + "loss": 1.2293, + "step": 380 + }, + { + "epoch": 0.019370219529154662, + "grad_norm": 0.361328125, + "learning_rate": 0.0007984543558160325, + "loss": 1.1832, + "step": 390 + }, + { + "epoch": 0.019866891824774014, + "grad_norm": 0.326171875, + "learning_rate": 0.0007984146220323831, + "loss": 1.2007, + "step": 400 + }, + { + "epoch": 0.020363564120393366, + "grad_norm": 0.314453125, + "learning_rate": 0.0007983748882487336, + "loss": 1.2085, + "step": 410 + }, + { + "epoch": 0.020860236416012715, + "grad_norm": 0.333984375, + "learning_rate": 0.0007983351544650839, + "loss": 1.2127, + "step": 420 + }, + { + "epoch": 0.021356908711632067, + "grad_norm": 0.388671875, + "learning_rate": 0.0007982954206814344, + "loss": 1.1983, + "step": 430 + }, + { + "epoch": 0.021853581007251415, + "grad_norm": 0.408203125, + "learning_rate": 0.0007982556868977848, + "loss": 1.2332, + "step": 440 + }, + { + "epoch": 0.022350253302870767, + "grad_norm": 0.349609375, + "learning_rate": 0.0007982159531141354, + "loss": 1.1835, + "step": 450 + }, + { + "epoch": 0.022846925598490116, + "grad_norm": 0.345703125, + "learning_rate": 0.0007981762193304858, + "loss": 1.21, + "step": 460 + }, + { + "epoch": 0.023343597894109468, + "grad_norm": 0.3515625, + "learning_rate": 0.0007981364855468362, + "loss": 1.1797, + "step": 470 + }, + { + "epoch": 0.023840270189728816, + "grad_norm": 0.34765625, + "learning_rate": 0.0007980967517631867, + "loss": 1.1599, + "step": 480 + }, + { + "epoch": 0.024336942485348168, + "grad_norm": 0.318359375, + "learning_rate": 0.0007980570179795371, + "loss": 1.1898, + "step": 490 + }, + { + "epoch": 0.024833614780967517, + "grad_norm": 0.291015625, + "learning_rate": 0.0007980172841958876, + "loss": 1.1622, + "step": 500 + }, + { + "epoch": 0.02533028707658687, + "grad_norm": 0.373046875, + "learning_rate": 0.0007979775504122381, + "loss": 1.182, + "step": 510 + }, + { + "epoch": 0.025826959372206217, + "grad_norm": 0.3203125, + "learning_rate": 0.0007979378166285885, + "loss": 1.1502, + "step": 520 + }, + { + "epoch": 0.02632363166782557, + "grad_norm": 0.365234375, + "learning_rate": 0.0007978980828449389, + "loss": 1.1264, + "step": 530 + }, + { + "epoch": 0.026820303963444918, + "grad_norm": 0.37890625, + "learning_rate": 0.0007978583490612895, + "loss": 1.1711, + "step": 540 + }, + { + "epoch": 0.02731697625906427, + "grad_norm": 0.333984375, + "learning_rate": 0.0007978186152776398, + "loss": 1.1233, + "step": 550 + }, + { + "epoch": 0.02781364855468362, + "grad_norm": 0.396484375, + "learning_rate": 0.0007977788814939903, + "loss": 1.1435, + "step": 560 + }, + { + "epoch": 0.02831032085030297, + "grad_norm": 0.330078125, + "learning_rate": 0.0007977391477103408, + "loss": 1.1424, + "step": 570 + }, + { + "epoch": 0.02880699314592232, + "grad_norm": 0.353515625, + "learning_rate": 0.0007976994139266911, + "loss": 1.1642, + "step": 580 + }, + { + "epoch": 0.02930366544154167, + "grad_norm": 0.337890625, + "learning_rate": 0.0007976596801430416, + "loss": 1.11, + "step": 590 + }, + { + "epoch": 0.029800337737161023, + "grad_norm": 0.33984375, + "learning_rate": 0.0007976199463593922, + "loss": 1.152, + "step": 600 + }, + { + "epoch": 0.03029701003278037, + "grad_norm": 0.3203125, + "learning_rate": 0.0007975802125757426, + "loss": 1.1054, + "step": 610 + }, + { + "epoch": 0.030793682328399723, + "grad_norm": 0.369140625, + "learning_rate": 0.000797540478792093, + "loss": 1.1078, + "step": 620 + }, + { + "epoch": 0.03129035462401907, + "grad_norm": 0.333984375, + "learning_rate": 0.0007975007450084434, + "loss": 1.1475, + "step": 630 + }, + { + "epoch": 0.031787026919638424, + "grad_norm": 0.296875, + "learning_rate": 0.000797461011224794, + "loss": 1.1248, + "step": 640 + }, + { + "epoch": 0.032283699215257776, + "grad_norm": 0.349609375, + "learning_rate": 0.0007974212774411444, + "loss": 1.1052, + "step": 650 + }, + { + "epoch": 0.03278037151087712, + "grad_norm": 0.3515625, + "learning_rate": 0.0007973815436574948, + "loss": 1.1349, + "step": 660 + }, + { + "epoch": 0.03327704380649647, + "grad_norm": 0.337890625, + "learning_rate": 0.0007973418098738453, + "loss": 1.1056, + "step": 670 + }, + { + "epoch": 0.033773716102115825, + "grad_norm": 0.396484375, + "learning_rate": 0.0007973020760901957, + "loss": 1.1263, + "step": 680 + }, + { + "epoch": 0.03427038839773518, + "grad_norm": 0.30078125, + "learning_rate": 0.0007972623423065461, + "loss": 1.1334, + "step": 690 + }, + { + "epoch": 0.03476706069335452, + "grad_norm": 0.3203125, + "learning_rate": 0.0007972226085228967, + "loss": 1.1171, + "step": 700 + }, + { + "epoch": 0.035263732988973874, + "grad_norm": 0.33984375, + "learning_rate": 0.0007971828747392471, + "loss": 1.1007, + "step": 710 + }, + { + "epoch": 0.035760405284593226, + "grad_norm": 0.326171875, + "learning_rate": 0.0007971431409555975, + "loss": 1.1092, + "step": 720 + }, + { + "epoch": 0.03625707758021258, + "grad_norm": 0.35546875, + "learning_rate": 0.000797103407171948, + "loss": 1.1299, + "step": 730 + }, + { + "epoch": 0.03675374987583192, + "grad_norm": 0.314453125, + "learning_rate": 0.0007970636733882984, + "loss": 1.1215, + "step": 740 + }, + { + "epoch": 0.037250422171451275, + "grad_norm": 0.294921875, + "learning_rate": 0.0007970239396046489, + "loss": 1.0959, + "step": 750 + }, + { + "epoch": 0.03774709446707063, + "grad_norm": 0.34765625, + "learning_rate": 0.0007969842058209994, + "loss": 1.0947, + "step": 760 + }, + { + "epoch": 0.03824376676268998, + "grad_norm": 0.3984375, + "learning_rate": 0.0007969444720373498, + "loss": 1.0743, + "step": 770 + }, + { + "epoch": 0.038740439058309324, + "grad_norm": 0.3125, + "learning_rate": 0.0007969047382537002, + "loss": 1.0936, + "step": 780 + }, + { + "epoch": 0.039237111353928676, + "grad_norm": 0.306640625, + "learning_rate": 0.0007968650044700507, + "loss": 1.0967, + "step": 790 + }, + { + "epoch": 0.03973378364954803, + "grad_norm": 0.314453125, + "learning_rate": 0.0007968252706864012, + "loss": 1.0531, + "step": 800 + }, + { + "epoch": 0.04023045594516738, + "grad_norm": 0.2734375, + "learning_rate": 0.0007967855369027516, + "loss": 1.0668, + "step": 810 + }, + { + "epoch": 0.04072712824078673, + "grad_norm": 0.294921875, + "learning_rate": 0.000796745803119102, + "loss": 1.0908, + "step": 820 + }, + { + "epoch": 0.04122380053640608, + "grad_norm": 0.265625, + "learning_rate": 0.0007967060693354525, + "loss": 1.0599, + "step": 830 + }, + { + "epoch": 0.04172047283202543, + "grad_norm": 0.29296875, + "learning_rate": 0.0007966663355518029, + "loss": 1.0601, + "step": 840 + }, + { + "epoch": 0.04221714512764478, + "grad_norm": 0.28515625, + "learning_rate": 0.0007966266017681534, + "loss": 1.0303, + "step": 850 + }, + { + "epoch": 0.04271381742326413, + "grad_norm": 0.271484375, + "learning_rate": 0.0007965868679845039, + "loss": 1.0721, + "step": 860 + }, + { + "epoch": 0.04321048971888348, + "grad_norm": 0.322265625, + "learning_rate": 0.0007965471342008543, + "loss": 1.0544, + "step": 870 + }, + { + "epoch": 0.04370716201450283, + "grad_norm": 0.32421875, + "learning_rate": 0.0007965074004172047, + "loss": 1.0499, + "step": 880 + }, + { + "epoch": 0.04420383431012218, + "grad_norm": 0.326171875, + "learning_rate": 0.0007964676666335552, + "loss": 1.0578, + "step": 890 + }, + { + "epoch": 0.044700506605741534, + "grad_norm": 0.3046875, + "learning_rate": 0.0007964279328499057, + "loss": 1.103, + "step": 900 + }, + { + "epoch": 0.04519717890136088, + "grad_norm": 0.2734375, + "learning_rate": 0.0007963881990662561, + "loss": 1.0236, + "step": 910 + }, + { + "epoch": 0.04569385119698023, + "grad_norm": 0.31640625, + "learning_rate": 0.0007963484652826066, + "loss": 1.0282, + "step": 920 + }, + { + "epoch": 0.04619052349259958, + "grad_norm": 0.33984375, + "learning_rate": 0.000796308731498957, + "loss": 1.0686, + "step": 930 + }, + { + "epoch": 0.046687195788218935, + "grad_norm": 0.291015625, + "learning_rate": 0.0007962689977153074, + "loss": 1.0468, + "step": 940 + }, + { + "epoch": 0.04718386808383828, + "grad_norm": 0.31640625, + "learning_rate": 0.000796229263931658, + "loss": 1.0379, + "step": 950 + }, + { + "epoch": 0.04768054037945763, + "grad_norm": 0.349609375, + "learning_rate": 0.0007961895301480084, + "loss": 1.0553, + "step": 960 + }, + { + "epoch": 0.048177212675076984, + "grad_norm": 0.2890625, + "learning_rate": 0.0007961497963643588, + "loss": 1.0768, + "step": 970 + }, + { + "epoch": 0.048673884970696336, + "grad_norm": 0.34765625, + "learning_rate": 0.0007961100625807093, + "loss": 1.0261, + "step": 980 + }, + { + "epoch": 0.04917055726631568, + "grad_norm": 0.255859375, + "learning_rate": 0.0007960703287970597, + "loss": 1.0311, + "step": 990 + }, + { + "epoch": 0.04966722956193503, + "grad_norm": 0.326171875, + "learning_rate": 0.0007960305950134102, + "loss": 1.0556, + "step": 1000 + }, + { + "epoch": 0.050163901857554385, + "grad_norm": 0.28125, + "learning_rate": 0.0007959908612297607, + "loss": 1.034, + "step": 1010 + }, + { + "epoch": 0.05066057415317374, + "grad_norm": 0.330078125, + "learning_rate": 0.0007959511274461111, + "loss": 1.0009, + "step": 1020 + }, + { + "epoch": 0.05115724644879309, + "grad_norm": 0.310546875, + "learning_rate": 0.0007959113936624615, + "loss": 1.0109, + "step": 1030 + }, + { + "epoch": 0.051653918744412435, + "grad_norm": 0.27734375, + "learning_rate": 0.0007958716598788119, + "loss": 1.0085, + "step": 1040 + }, + { + "epoch": 0.052150591040031787, + "grad_norm": 0.27734375, + "learning_rate": 0.0007958319260951625, + "loss": 1.0898, + "step": 1050 + }, + { + "epoch": 0.05264726333565114, + "grad_norm": 0.34765625, + "learning_rate": 0.000795792192311513, + "loss": 1.0379, + "step": 1060 + }, + { + "epoch": 0.05314393563127049, + "grad_norm": 0.291015625, + "learning_rate": 0.0007957524585278633, + "loss": 0.9888, + "step": 1070 + }, + { + "epoch": 0.053640607926889836, + "grad_norm": 0.275390625, + "learning_rate": 0.0007957127247442138, + "loss": 0.9989, + "step": 1080 + }, + { + "epoch": 0.05413728022250919, + "grad_norm": 0.333984375, + "learning_rate": 0.0007956729909605642, + "loss": 1.046, + "step": 1090 + }, + { + "epoch": 0.05463395251812854, + "grad_norm": 0.306640625, + "learning_rate": 0.0007956332571769147, + "loss": 0.9776, + "step": 1100 + }, + { + "epoch": 0.05513062481374789, + "grad_norm": 0.3125, + "learning_rate": 0.0007955935233932652, + "loss": 1.0245, + "step": 1110 + }, + { + "epoch": 0.05562729710936724, + "grad_norm": 0.302734375, + "learning_rate": 0.0007955537896096156, + "loss": 1.0557, + "step": 1120 + }, + { + "epoch": 0.05612396940498659, + "grad_norm": 0.2734375, + "learning_rate": 0.000795514055825966, + "loss": 1.0043, + "step": 1130 + }, + { + "epoch": 0.05662064170060594, + "grad_norm": 0.279296875, + "learning_rate": 0.0007954743220423165, + "loss": 0.9955, + "step": 1140 + }, + { + "epoch": 0.05711731399622529, + "grad_norm": 0.328125, + "learning_rate": 0.000795434588258667, + "loss": 1.0221, + "step": 1150 + }, + { + "epoch": 0.05761398629184464, + "grad_norm": 0.279296875, + "learning_rate": 0.0007953948544750174, + "loss": 0.9994, + "step": 1160 + }, + { + "epoch": 0.05811065858746399, + "grad_norm": 0.275390625, + "learning_rate": 0.0007953551206913679, + "loss": 1.031, + "step": 1170 + }, + { + "epoch": 0.05860733088308334, + "grad_norm": 0.279296875, + "learning_rate": 0.0007953153869077183, + "loss": 0.9968, + "step": 1180 + }, + { + "epoch": 0.059104003178702694, + "grad_norm": 0.2734375, + "learning_rate": 0.0007952756531240688, + "loss": 1.0061, + "step": 1190 + }, + { + "epoch": 0.059600675474322046, + "grad_norm": 0.298828125, + "learning_rate": 0.0007952359193404193, + "loss": 1.0162, + "step": 1200 + }, + { + "epoch": 0.06009734776994139, + "grad_norm": 0.29296875, + "learning_rate": 0.0007951961855567697, + "loss": 0.9925, + "step": 1210 + }, + { + "epoch": 0.06059402006556074, + "grad_norm": 0.271484375, + "learning_rate": 0.0007951564517731202, + "loss": 1.0197, + "step": 1220 + }, + { + "epoch": 0.061090692361180095, + "grad_norm": 0.265625, + "learning_rate": 0.0007951167179894705, + "loss": 1.0029, + "step": 1230 + }, + { + "epoch": 0.06158736465679945, + "grad_norm": 0.27734375, + "learning_rate": 0.000795076984205821, + "loss": 0.9652, + "step": 1240 + }, + { + "epoch": 0.06208403695241879, + "grad_norm": 0.361328125, + "learning_rate": 0.0007950372504221716, + "loss": 1.0034, + "step": 1250 + }, + { + "epoch": 0.06258070924803814, + "grad_norm": 0.2734375, + "learning_rate": 0.0007949975166385219, + "loss": 1.0125, + "step": 1260 + }, + { + "epoch": 0.0630773815436575, + "grad_norm": 0.3203125, + "learning_rate": 0.0007949577828548724, + "loss": 0.999, + "step": 1270 + }, + { + "epoch": 0.06357405383927685, + "grad_norm": 0.30078125, + "learning_rate": 0.0007949180490712229, + "loss": 0.9831, + "step": 1280 + }, + { + "epoch": 0.0640707261348962, + "grad_norm": 0.30859375, + "learning_rate": 0.0007948783152875732, + "loss": 0.9889, + "step": 1290 + }, + { + "epoch": 0.06456739843051555, + "grad_norm": 0.259765625, + "learning_rate": 0.0007948385815039238, + "loss": 0.9954, + "step": 1300 + }, + { + "epoch": 0.06506407072613489, + "grad_norm": 0.40234375, + "learning_rate": 0.0007947988477202742, + "loss": 1.0029, + "step": 1310 + }, + { + "epoch": 0.06556074302175424, + "grad_norm": 0.279296875, + "learning_rate": 0.0007947591139366246, + "loss": 0.9666, + "step": 1320 + }, + { + "epoch": 0.0660574153173736, + "grad_norm": 0.31640625, + "learning_rate": 0.0007947193801529751, + "loss": 0.9921, + "step": 1330 + }, + { + "epoch": 0.06655408761299295, + "grad_norm": 0.302734375, + "learning_rate": 0.0007946796463693255, + "loss": 0.9799, + "step": 1340 + }, + { + "epoch": 0.0670507599086123, + "grad_norm": 0.279296875, + "learning_rate": 0.0007946399125856761, + "loss": 1.0026, + "step": 1350 + }, + { + "epoch": 0.06754743220423165, + "grad_norm": 0.26171875, + "learning_rate": 0.0007946001788020265, + "loss": 0.9782, + "step": 1360 + }, + { + "epoch": 0.068044104499851, + "grad_norm": 0.283203125, + "learning_rate": 0.0007945604450183769, + "loss": 1.0086, + "step": 1370 + }, + { + "epoch": 0.06854077679547035, + "grad_norm": 0.271484375, + "learning_rate": 0.0007945207112347274, + "loss": 0.9763, + "step": 1380 + }, + { + "epoch": 0.0690374490910897, + "grad_norm": 0.30078125, + "learning_rate": 0.0007944809774510778, + "loss": 1.0011, + "step": 1390 + }, + { + "epoch": 0.06953412138670904, + "grad_norm": 0.267578125, + "learning_rate": 0.0007944412436674283, + "loss": 0.9751, + "step": 1400 + }, + { + "epoch": 0.0700307936823284, + "grad_norm": 0.29296875, + "learning_rate": 0.0007944015098837788, + "loss": 1.0073, + "step": 1410 + }, + { + "epoch": 0.07052746597794775, + "grad_norm": 0.287109375, + "learning_rate": 0.0007943617761001291, + "loss": 0.9551, + "step": 1420 + }, + { + "epoch": 0.0710241382735671, + "grad_norm": 0.271484375, + "learning_rate": 0.0007943220423164796, + "loss": 0.9414, + "step": 1430 + }, + { + "epoch": 0.07152081056918645, + "grad_norm": 0.3125, + "learning_rate": 0.0007942823085328301, + "loss": 0.9908, + "step": 1440 + }, + { + "epoch": 0.0720174828648058, + "grad_norm": 0.294921875, + "learning_rate": 0.0007942425747491804, + "loss": 0.9451, + "step": 1450 + }, + { + "epoch": 0.07251415516042516, + "grad_norm": 0.234375, + "learning_rate": 0.000794202840965531, + "loss": 0.9761, + "step": 1460 + }, + { + "epoch": 0.07301082745604451, + "grad_norm": 0.322265625, + "learning_rate": 0.0007941631071818815, + "loss": 0.9389, + "step": 1470 + }, + { + "epoch": 0.07350749975166385, + "grad_norm": 0.267578125, + "learning_rate": 0.0007941233733982319, + "loss": 0.9704, + "step": 1480 + }, + { + "epoch": 0.0740041720472832, + "grad_norm": 0.283203125, + "learning_rate": 0.0007940836396145823, + "loss": 0.9748, + "step": 1490 + }, + { + "epoch": 0.07450084434290255, + "grad_norm": 0.3046875, + "learning_rate": 0.0007940439058309328, + "loss": 0.982, + "step": 1500 + }, + { + "epoch": 0.0749975166385219, + "grad_norm": 0.26953125, + "learning_rate": 0.0007940041720472833, + "loss": 0.9661, + "step": 1510 + }, + { + "epoch": 0.07549418893414125, + "grad_norm": 0.2578125, + "learning_rate": 0.0007939644382636337, + "loss": 0.9424, + "step": 1520 + }, + { + "epoch": 0.0759908612297606, + "grad_norm": 0.29296875, + "learning_rate": 0.0007939247044799841, + "loss": 0.9927, + "step": 1530 + }, + { + "epoch": 0.07648753352537996, + "grad_norm": 0.2451171875, + "learning_rate": 0.0007938849706963346, + "loss": 0.9794, + "step": 1540 + }, + { + "epoch": 0.07698420582099931, + "grad_norm": 0.333984375, + "learning_rate": 0.000793845236912685, + "loss": 0.9505, + "step": 1550 + }, + { + "epoch": 0.07748087811661865, + "grad_norm": 0.296875, + "learning_rate": 0.0007938055031290355, + "loss": 0.9565, + "step": 1560 + }, + { + "epoch": 0.077977550412238, + "grad_norm": 0.3046875, + "learning_rate": 0.000793765769345386, + "loss": 0.9998, + "step": 1570 + }, + { + "epoch": 0.07847422270785735, + "grad_norm": 0.287109375, + "learning_rate": 0.0007937260355617364, + "loss": 0.9816, + "step": 1580 + }, + { + "epoch": 0.0789708950034767, + "grad_norm": 0.30078125, + "learning_rate": 0.0007936863017780868, + "loss": 0.9523, + "step": 1590 + }, + { + "epoch": 0.07946756729909606, + "grad_norm": 0.255859375, + "learning_rate": 0.0007936465679944374, + "loss": 0.9512, + "step": 1600 + }, + { + "epoch": 0.07996423959471541, + "grad_norm": 0.26171875, + "learning_rate": 0.0007936068342107878, + "loss": 0.9186, + "step": 1610 + }, + { + "epoch": 0.08046091189033476, + "grad_norm": 0.296875, + "learning_rate": 0.0007935671004271382, + "loss": 0.9518, + "step": 1620 + }, + { + "epoch": 0.08095758418595411, + "grad_norm": 0.33203125, + "learning_rate": 0.0007935273666434887, + "loss": 0.9269, + "step": 1630 + }, + { + "epoch": 0.08145425648157346, + "grad_norm": 0.25390625, + "learning_rate": 0.0007934876328598391, + "loss": 0.9667, + "step": 1640 + }, + { + "epoch": 0.0819509287771928, + "grad_norm": 0.2578125, + "learning_rate": 0.0007934478990761896, + "loss": 0.947, + "step": 1650 + }, + { + "epoch": 0.08244760107281215, + "grad_norm": 0.30859375, + "learning_rate": 0.0007934081652925401, + "loss": 0.9563, + "step": 1660 + }, + { + "epoch": 0.0829442733684315, + "grad_norm": 0.28515625, + "learning_rate": 0.0007933684315088905, + "loss": 0.9397, + "step": 1670 + }, + { + "epoch": 0.08344094566405086, + "grad_norm": 0.275390625, + "learning_rate": 0.0007933286977252409, + "loss": 0.965, + "step": 1680 + }, + { + "epoch": 0.08393761795967021, + "grad_norm": 0.25, + "learning_rate": 0.0007932889639415913, + "loss": 0.9461, + "step": 1690 + }, + { + "epoch": 0.08443429025528956, + "grad_norm": 0.26171875, + "learning_rate": 0.0007932492301579419, + "loss": 0.9537, + "step": 1700 + }, + { + "epoch": 0.08493096255090891, + "grad_norm": 0.2392578125, + "learning_rate": 0.0007932094963742923, + "loss": 0.9146, + "step": 1710 + }, + { + "epoch": 0.08542763484652827, + "grad_norm": 0.259765625, + "learning_rate": 0.0007931697625906427, + "loss": 0.9558, + "step": 1720 + }, + { + "epoch": 0.0859243071421476, + "grad_norm": 0.263671875, + "learning_rate": 0.0007931300288069932, + "loss": 0.9614, + "step": 1730 + }, + { + "epoch": 0.08642097943776696, + "grad_norm": 0.2412109375, + "learning_rate": 0.0007930902950233436, + "loss": 0.9227, + "step": 1740 + }, + { + "epoch": 0.08691765173338631, + "grad_norm": 0.279296875, + "learning_rate": 0.000793050561239694, + "loss": 0.9533, + "step": 1750 + }, + { + "epoch": 0.08741432402900566, + "grad_norm": 0.2373046875, + "learning_rate": 0.0007930108274560446, + "loss": 0.9488, + "step": 1760 + }, + { + "epoch": 0.08791099632462501, + "grad_norm": 0.25390625, + "learning_rate": 0.000792971093672395, + "loss": 0.9396, + "step": 1770 + }, + { + "epoch": 0.08840766862024436, + "grad_norm": 0.294921875, + "learning_rate": 0.0007929313598887454, + "loss": 0.9199, + "step": 1780 + }, + { + "epoch": 0.08890434091586372, + "grad_norm": 0.267578125, + "learning_rate": 0.0007928916261050959, + "loss": 0.9593, + "step": 1790 + }, + { + "epoch": 0.08940101321148307, + "grad_norm": 0.2490234375, + "learning_rate": 0.0007928518923214463, + "loss": 0.9639, + "step": 1800 + }, + { + "epoch": 0.08989768550710242, + "grad_norm": 0.2578125, + "learning_rate": 0.0007928121585377968, + "loss": 0.9267, + "step": 1810 + }, + { + "epoch": 0.09039435780272176, + "grad_norm": 0.287109375, + "learning_rate": 0.0007927724247541473, + "loss": 0.9629, + "step": 1820 + }, + { + "epoch": 0.09089103009834111, + "grad_norm": 0.248046875, + "learning_rate": 0.0007927326909704977, + "loss": 0.9051, + "step": 1830 + }, + { + "epoch": 0.09138770239396046, + "grad_norm": 0.25390625, + "learning_rate": 0.0007926929571868481, + "loss": 0.9445, + "step": 1840 + }, + { + "epoch": 0.09188437468957981, + "grad_norm": 0.2490234375, + "learning_rate": 0.0007926532234031987, + "loss": 0.9441, + "step": 1850 + }, + { + "epoch": 0.09238104698519917, + "grad_norm": 0.28515625, + "learning_rate": 0.0007926134896195491, + "loss": 0.9459, + "step": 1860 + }, + { + "epoch": 0.09287771928081852, + "grad_norm": 0.263671875, + "learning_rate": 0.0007925737558358995, + "loss": 0.944, + "step": 1870 + }, + { + "epoch": 0.09337439157643787, + "grad_norm": 0.2431640625, + "learning_rate": 0.00079253402205225, + "loss": 0.9664, + "step": 1880 + }, + { + "epoch": 0.09387106387205722, + "grad_norm": 0.25390625, + "learning_rate": 0.0007924942882686004, + "loss": 0.9384, + "step": 1890 + }, + { + "epoch": 0.09436773616767656, + "grad_norm": 0.25390625, + "learning_rate": 0.0007924545544849508, + "loss": 0.936, + "step": 1900 + }, + { + "epoch": 0.09486440846329591, + "grad_norm": 0.259765625, + "learning_rate": 0.0007924148207013013, + "loss": 0.9484, + "step": 1910 + }, + { + "epoch": 0.09536108075891526, + "grad_norm": 0.28125, + "learning_rate": 0.0007923750869176518, + "loss": 0.8862, + "step": 1920 + }, + { + "epoch": 0.09585775305453462, + "grad_norm": 0.2412109375, + "learning_rate": 0.0007923353531340023, + "loss": 0.933, + "step": 1930 + }, + { + "epoch": 0.09635442535015397, + "grad_norm": 0.220703125, + "learning_rate": 0.0007922956193503526, + "loss": 0.9179, + "step": 1940 + }, + { + "epoch": 0.09685109764577332, + "grad_norm": 0.302734375, + "learning_rate": 0.0007922558855667031, + "loss": 0.9129, + "step": 1950 + }, + { + "epoch": 0.09734776994139267, + "grad_norm": 0.259765625, + "learning_rate": 0.0007922161517830536, + "loss": 0.9578, + "step": 1960 + }, + { + "epoch": 0.09784444223701202, + "grad_norm": 0.314453125, + "learning_rate": 0.000792176417999404, + "loss": 0.8939, + "step": 1970 + }, + { + "epoch": 0.09834111453263136, + "grad_norm": 0.28515625, + "learning_rate": 0.0007921366842157545, + "loss": 0.9061, + "step": 1980 + }, + { + "epoch": 0.09883778682825071, + "grad_norm": 0.271484375, + "learning_rate": 0.0007920969504321049, + "loss": 0.9174, + "step": 1990 + }, + { + "epoch": 0.09933445912387007, + "grad_norm": 0.2373046875, + "learning_rate": 0.0007920572166484553, + "loss": 0.9491, + "step": 2000 + }, + { + "epoch": 0.09983113141948942, + "grad_norm": 0.265625, + "learning_rate": 0.0007920174828648059, + "loss": 0.9257, + "step": 2010 + }, + { + "epoch": 0.10032780371510877, + "grad_norm": 0.2578125, + "learning_rate": 0.0007919777490811563, + "loss": 0.8943, + "step": 2020 + }, + { + "epoch": 0.10082447601072812, + "grad_norm": 0.25390625, + "learning_rate": 0.0007919380152975067, + "loss": 0.9324, + "step": 2030 + }, + { + "epoch": 0.10132114830634747, + "grad_norm": 0.2373046875, + "learning_rate": 0.0007918982815138572, + "loss": 0.9209, + "step": 2040 + }, + { + "epoch": 0.10181782060196683, + "grad_norm": 0.302734375, + "learning_rate": 0.0007918585477302076, + "loss": 0.9143, + "step": 2050 + }, + { + "epoch": 0.10231449289758618, + "grad_norm": 0.28125, + "learning_rate": 0.0007918188139465581, + "loss": 0.8881, + "step": 2060 + }, + { + "epoch": 0.10281116519320552, + "grad_norm": 0.255859375, + "learning_rate": 0.0007917790801629086, + "loss": 0.9361, + "step": 2070 + }, + { + "epoch": 0.10330783748882487, + "grad_norm": 0.23828125, + "learning_rate": 0.000791739346379259, + "loss": 0.8759, + "step": 2080 + }, + { + "epoch": 0.10380450978444422, + "grad_norm": 0.2421875, + "learning_rate": 0.0007916996125956095, + "loss": 0.9115, + "step": 2090 + }, + { + "epoch": 0.10430118208006357, + "grad_norm": 0.25, + "learning_rate": 0.0007916598788119598, + "loss": 0.9272, + "step": 2100 + }, + { + "epoch": 0.10479785437568293, + "grad_norm": 0.294921875, + "learning_rate": 0.0007916201450283104, + "loss": 0.9501, + "step": 2110 + }, + { + "epoch": 0.10529452667130228, + "grad_norm": 0.2490234375, + "learning_rate": 0.0007915804112446609, + "loss": 0.8966, + "step": 2120 + }, + { + "epoch": 0.10579119896692163, + "grad_norm": 0.2490234375, + "learning_rate": 0.0007915406774610112, + "loss": 0.9069, + "step": 2130 + }, + { + "epoch": 0.10628787126254098, + "grad_norm": 0.267578125, + "learning_rate": 0.0007915009436773617, + "loss": 0.9063, + "step": 2140 + }, + { + "epoch": 0.10678454355816032, + "grad_norm": 0.26171875, + "learning_rate": 0.0007914612098937123, + "loss": 0.9479, + "step": 2150 + }, + { + "epoch": 0.10728121585377967, + "grad_norm": 0.23828125, + "learning_rate": 0.0007914214761100626, + "loss": 0.8811, + "step": 2160 + }, + { + "epoch": 0.10777788814939902, + "grad_norm": 0.275390625, + "learning_rate": 0.0007913817423264131, + "loss": 0.9181, + "step": 2170 + }, + { + "epoch": 0.10827456044501838, + "grad_norm": 0.275390625, + "learning_rate": 0.0007913420085427635, + "loss": 0.8811, + "step": 2180 + }, + { + "epoch": 0.10877123274063773, + "grad_norm": 0.248046875, + "learning_rate": 0.0007913022747591139, + "loss": 0.8769, + "step": 2190 + }, + { + "epoch": 0.10926790503625708, + "grad_norm": 0.275390625, + "learning_rate": 0.0007912625409754644, + "loss": 0.9298, + "step": 2200 + }, + { + "epoch": 0.10976457733187643, + "grad_norm": 0.25390625, + "learning_rate": 0.0007912228071918149, + "loss": 0.8929, + "step": 2210 + }, + { + "epoch": 0.11026124962749578, + "grad_norm": 0.296875, + "learning_rate": 0.0007911830734081653, + "loss": 0.9101, + "step": 2220 + }, + { + "epoch": 0.11075792192311514, + "grad_norm": 0.234375, + "learning_rate": 0.0007911433396245158, + "loss": 0.9398, + "step": 2230 + }, + { + "epoch": 0.11125459421873447, + "grad_norm": 0.2333984375, + "learning_rate": 0.0007911036058408662, + "loss": 0.9077, + "step": 2240 + }, + { + "epoch": 0.11175126651435383, + "grad_norm": 0.294921875, + "learning_rate": 0.0007910638720572167, + "loss": 0.9285, + "step": 2250 + }, + { + "epoch": 0.11224793880997318, + "grad_norm": 0.26953125, + "learning_rate": 0.0007910241382735672, + "loss": 0.8824, + "step": 2260 + }, + { + "epoch": 0.11274461110559253, + "grad_norm": 0.2734375, + "learning_rate": 0.0007909844044899176, + "loss": 0.8869, + "step": 2270 + }, + { + "epoch": 0.11324128340121188, + "grad_norm": 0.302734375, + "learning_rate": 0.0007909446707062681, + "loss": 0.9341, + "step": 2280 + }, + { + "epoch": 0.11373795569683123, + "grad_norm": 0.240234375, + "learning_rate": 0.0007909049369226184, + "loss": 0.9054, + "step": 2290 + }, + { + "epoch": 0.11423462799245059, + "grad_norm": 0.2578125, + "learning_rate": 0.0007908652031389689, + "loss": 0.91, + "step": 2300 + }, + { + "epoch": 0.11473130028806994, + "grad_norm": 0.2353515625, + "learning_rate": 0.0007908254693553195, + "loss": 0.9317, + "step": 2310 + }, + { + "epoch": 0.11522797258368928, + "grad_norm": 0.267578125, + "learning_rate": 0.0007907857355716698, + "loss": 0.9031, + "step": 2320 + }, + { + "epoch": 0.11572464487930863, + "grad_norm": 0.2734375, + "learning_rate": 0.0007907460017880203, + "loss": 0.9089, + "step": 2330 + }, + { + "epoch": 0.11622131717492798, + "grad_norm": 0.2578125, + "learning_rate": 0.0007907062680043708, + "loss": 0.9035, + "step": 2340 + }, + { + "epoch": 0.11671798947054733, + "grad_norm": 0.29296875, + "learning_rate": 0.0007906665342207211, + "loss": 0.8807, + "step": 2350 + }, + { + "epoch": 0.11721466176616668, + "grad_norm": 0.24609375, + "learning_rate": 0.0007906268004370717, + "loss": 0.8563, + "step": 2360 + }, + { + "epoch": 0.11771133406178604, + "grad_norm": 0.2353515625, + "learning_rate": 0.0007905870666534221, + "loss": 0.8702, + "step": 2370 + }, + { + "epoch": 0.11820800635740539, + "grad_norm": 0.2109375, + "learning_rate": 0.0007905473328697726, + "loss": 0.9268, + "step": 2380 + }, + { + "epoch": 0.11870467865302474, + "grad_norm": 0.240234375, + "learning_rate": 0.000790507599086123, + "loss": 0.902, + "step": 2390 + }, + { + "epoch": 0.11920135094864409, + "grad_norm": 0.26171875, + "learning_rate": 0.0007904678653024734, + "loss": 0.9036, + "step": 2400 + }, + { + "epoch": 0.11969802324426343, + "grad_norm": 0.263671875, + "learning_rate": 0.000790428131518824, + "loss": 0.9041, + "step": 2410 + }, + { + "epoch": 0.12019469553988278, + "grad_norm": 0.25390625, + "learning_rate": 0.0007903883977351744, + "loss": 0.8601, + "step": 2420 + }, + { + "epoch": 0.12069136783550213, + "grad_norm": 0.27734375, + "learning_rate": 0.0007903486639515248, + "loss": 0.8791, + "step": 2430 + }, + { + "epoch": 0.12118804013112149, + "grad_norm": 0.263671875, + "learning_rate": 0.0007903089301678753, + "loss": 0.9051, + "step": 2440 + }, + { + "epoch": 0.12168471242674084, + "grad_norm": 0.255859375, + "learning_rate": 0.0007902691963842257, + "loss": 0.8775, + "step": 2450 + }, + { + "epoch": 0.12218138472236019, + "grad_norm": 0.2255859375, + "learning_rate": 0.0007902294626005762, + "loss": 0.8495, + "step": 2460 + }, + { + "epoch": 0.12267805701797954, + "grad_norm": 0.25, + "learning_rate": 0.0007901897288169267, + "loss": 0.9162, + "step": 2470 + }, + { + "epoch": 0.1231747293135989, + "grad_norm": 0.259765625, + "learning_rate": 0.000790149995033277, + "loss": 0.8772, + "step": 2480 + }, + { + "epoch": 0.12367140160921823, + "grad_norm": 0.2490234375, + "learning_rate": 0.0007901102612496275, + "loss": 0.8825, + "step": 2490 + }, + { + "epoch": 0.12416807390483758, + "grad_norm": 0.251953125, + "learning_rate": 0.000790070527465978, + "loss": 0.9092, + "step": 2500 + }, + { + "epoch": 0.12466474620045694, + "grad_norm": 0.263671875, + "learning_rate": 0.0007900307936823284, + "loss": 0.8917, + "step": 2510 + }, + { + "epoch": 0.1251614184960763, + "grad_norm": 0.3125, + "learning_rate": 0.0007899910598986789, + "loss": 0.8767, + "step": 2520 + }, + { + "epoch": 0.12565809079169563, + "grad_norm": 0.2353515625, + "learning_rate": 0.0007899513261150294, + "loss": 0.8803, + "step": 2530 + }, + { + "epoch": 0.126154763087315, + "grad_norm": 0.2412109375, + "learning_rate": 0.0007899115923313798, + "loss": 0.895, + "step": 2540 + }, + { + "epoch": 0.12665143538293433, + "grad_norm": 0.259765625, + "learning_rate": 0.0007898718585477302, + "loss": 0.9083, + "step": 2550 + }, + { + "epoch": 0.1271481076785537, + "grad_norm": 0.240234375, + "learning_rate": 0.0007898321247640807, + "loss": 0.8967, + "step": 2560 + }, + { + "epoch": 0.12764477997417303, + "grad_norm": 0.2373046875, + "learning_rate": 0.0007897923909804312, + "loss": 0.8881, + "step": 2570 + }, + { + "epoch": 0.1281414522697924, + "grad_norm": 0.220703125, + "learning_rate": 0.0007897526571967816, + "loss": 0.9021, + "step": 2580 + }, + { + "epoch": 0.12863812456541174, + "grad_norm": 0.306640625, + "learning_rate": 0.000789712923413132, + "loss": 0.8832, + "step": 2590 + }, + { + "epoch": 0.1291347968610311, + "grad_norm": 0.255859375, + "learning_rate": 0.0007896731896294825, + "loss": 0.9154, + "step": 2600 + }, + { + "epoch": 0.12963146915665044, + "grad_norm": 0.2255859375, + "learning_rate": 0.000789633455845833, + "loss": 0.8943, + "step": 2610 + }, + { + "epoch": 0.13012814145226978, + "grad_norm": 0.248046875, + "learning_rate": 0.0007895937220621834, + "loss": 0.8884, + "step": 2620 + }, + { + "epoch": 0.13062481374788915, + "grad_norm": 0.248046875, + "learning_rate": 0.0007895539882785339, + "loss": 0.8645, + "step": 2630 + }, + { + "epoch": 0.13112148604350848, + "grad_norm": 0.26171875, + "learning_rate": 0.0007895142544948843, + "loss": 0.8769, + "step": 2640 + }, + { + "epoch": 0.13161815833912785, + "grad_norm": 0.271484375, + "learning_rate": 0.0007894745207112347, + "loss": 0.9108, + "step": 2650 + }, + { + "epoch": 0.1321148306347472, + "grad_norm": 0.267578125, + "learning_rate": 0.0007894347869275853, + "loss": 0.8442, + "step": 2660 + }, + { + "epoch": 0.13261150293036655, + "grad_norm": 0.25, + "learning_rate": 0.0007893950531439357, + "loss": 0.8799, + "step": 2670 + }, + { + "epoch": 0.1331081752259859, + "grad_norm": 0.2265625, + "learning_rate": 0.0007893553193602861, + "loss": 0.9038, + "step": 2680 + }, + { + "epoch": 0.13360484752160526, + "grad_norm": 0.224609375, + "learning_rate": 0.0007893155855766366, + "loss": 0.8603, + "step": 2690 + }, + { + "epoch": 0.1341015198172246, + "grad_norm": 0.22265625, + "learning_rate": 0.000789275851792987, + "loss": 0.8633, + "step": 2700 + }, + { + "epoch": 0.13459819211284393, + "grad_norm": 0.234375, + "learning_rate": 0.0007892361180093375, + "loss": 0.8866, + "step": 2710 + }, + { + "epoch": 0.1350948644084633, + "grad_norm": 0.25, + "learning_rate": 0.000789196384225688, + "loss": 0.873, + "step": 2720 + }, + { + "epoch": 0.13559153670408264, + "grad_norm": 0.248046875, + "learning_rate": 0.0007891566504420384, + "loss": 0.85, + "step": 2730 + }, + { + "epoch": 0.136088208999702, + "grad_norm": 0.23046875, + "learning_rate": 0.0007891169166583888, + "loss": 0.8394, + "step": 2740 + }, + { + "epoch": 0.13658488129532134, + "grad_norm": 0.251953125, + "learning_rate": 0.0007890771828747392, + "loss": 0.8567, + "step": 2750 + }, + { + "epoch": 0.1370815535909407, + "grad_norm": 0.265625, + "learning_rate": 0.0007890374490910898, + "loss": 0.9067, + "step": 2760 + }, + { + "epoch": 0.13757822588656005, + "grad_norm": 0.25, + "learning_rate": 0.0007889977153074402, + "loss": 0.8816, + "step": 2770 + }, + { + "epoch": 0.1380748981821794, + "grad_norm": 0.2265625, + "learning_rate": 0.0007889579815237906, + "loss": 0.8963, + "step": 2780 + }, + { + "epoch": 0.13857157047779875, + "grad_norm": 0.251953125, + "learning_rate": 0.0007889182477401411, + "loss": 0.838, + "step": 2790 + }, + { + "epoch": 0.1390682427734181, + "grad_norm": 0.212890625, + "learning_rate": 0.0007888785139564915, + "loss": 0.8586, + "step": 2800 + }, + { + "epoch": 0.13956491506903745, + "grad_norm": 0.23046875, + "learning_rate": 0.000788838780172842, + "loss": 0.8659, + "step": 2810 + }, + { + "epoch": 0.1400615873646568, + "grad_norm": 0.2421875, + "learning_rate": 0.0007887990463891925, + "loss": 0.9058, + "step": 2820 + }, + { + "epoch": 0.14055825966027616, + "grad_norm": 0.228515625, + "learning_rate": 0.0007887593126055429, + "loss": 0.8714, + "step": 2830 + }, + { + "epoch": 0.1410549319558955, + "grad_norm": 0.2294921875, + "learning_rate": 0.0007887195788218933, + "loss": 0.8725, + "step": 2840 + }, + { + "epoch": 0.14155160425151486, + "grad_norm": 0.2353515625, + "learning_rate": 0.0007886798450382438, + "loss": 0.8797, + "step": 2850 + }, + { + "epoch": 0.1420482765471342, + "grad_norm": 0.2294921875, + "learning_rate": 0.0007886401112545943, + "loss": 0.866, + "step": 2860 + }, + { + "epoch": 0.14254494884275354, + "grad_norm": 0.2470703125, + "learning_rate": 0.0007886003774709447, + "loss": 0.8559, + "step": 2870 + }, + { + "epoch": 0.1430416211383729, + "grad_norm": 0.25, + "learning_rate": 0.0007885606436872952, + "loss": 0.8683, + "step": 2880 + }, + { + "epoch": 0.14353829343399224, + "grad_norm": 0.216796875, + "learning_rate": 0.0007885209099036456, + "loss": 0.8658, + "step": 2890 + }, + { + "epoch": 0.1440349657296116, + "grad_norm": 0.234375, + "learning_rate": 0.000788481176119996, + "loss": 0.8778, + "step": 2900 + }, + { + "epoch": 0.14453163802523095, + "grad_norm": 0.2255859375, + "learning_rate": 0.0007884414423363466, + "loss": 0.8682, + "step": 2910 + }, + { + "epoch": 0.1450283103208503, + "grad_norm": 0.25390625, + "learning_rate": 0.000788401708552697, + "loss": 0.8524, + "step": 2920 + }, + { + "epoch": 0.14552498261646965, + "grad_norm": 0.251953125, + "learning_rate": 0.0007883619747690474, + "loss": 0.8535, + "step": 2930 + }, + { + "epoch": 0.14602165491208902, + "grad_norm": 0.216796875, + "learning_rate": 0.0007883222409853979, + "loss": 0.8414, + "step": 2940 + }, + { + "epoch": 0.14651832720770835, + "grad_norm": 0.271484375, + "learning_rate": 0.0007882825072017483, + "loss": 0.8294, + "step": 2950 + }, + { + "epoch": 0.1470149995033277, + "grad_norm": 0.22265625, + "learning_rate": 0.0007882427734180987, + "loss": 0.8701, + "step": 2960 + }, + { + "epoch": 0.14751167179894706, + "grad_norm": 0.259765625, + "learning_rate": 0.0007882030396344492, + "loss": 0.8428, + "step": 2970 + }, + { + "epoch": 0.1480083440945664, + "grad_norm": 0.2265625, + "learning_rate": 0.0007881633058507997, + "loss": 0.8341, + "step": 2980 + }, + { + "epoch": 0.14850501639018576, + "grad_norm": 0.2578125, + "learning_rate": 0.0007881235720671502, + "loss": 0.839, + "step": 2990 + }, + { + "epoch": 0.1490016886858051, + "grad_norm": 0.2333984375, + "learning_rate": 0.0007880838382835005, + "loss": 0.8262, + "step": 3000 + }, + { + "epoch": 0.14949836098142447, + "grad_norm": 0.2265625, + "learning_rate": 0.000788044104499851, + "loss": 0.8626, + "step": 3010 + }, + { + "epoch": 0.1499950332770438, + "grad_norm": 0.240234375, + "learning_rate": 0.0007880043707162015, + "loss": 0.8461, + "step": 3020 + }, + { + "epoch": 0.15049170557266317, + "grad_norm": 0.208984375, + "learning_rate": 0.0007879646369325519, + "loss": 0.8441, + "step": 3030 + }, + { + "epoch": 0.1509883778682825, + "grad_norm": 0.216796875, + "learning_rate": 0.0007879249031489024, + "loss": 0.8595, + "step": 3040 + }, + { + "epoch": 0.15148505016390185, + "grad_norm": 0.265625, + "learning_rate": 0.0007878851693652528, + "loss": 0.877, + "step": 3050 + }, + { + "epoch": 0.1519817224595212, + "grad_norm": 0.21875, + "learning_rate": 0.0007878454355816032, + "loss": 0.8823, + "step": 3060 + }, + { + "epoch": 0.15247839475514055, + "grad_norm": 0.25390625, + "learning_rate": 0.0007878057017979538, + "loss": 0.8481, + "step": 3070 + }, + { + "epoch": 0.15297506705075992, + "grad_norm": 0.23828125, + "learning_rate": 0.0007877659680143042, + "loss": 0.8838, + "step": 3080 + }, + { + "epoch": 0.15347173934637925, + "grad_norm": 0.2392578125, + "learning_rate": 0.0007877262342306546, + "loss": 0.8341, + "step": 3090 + }, + { + "epoch": 0.15396841164199862, + "grad_norm": 0.2412109375, + "learning_rate": 0.0007876865004470051, + "loss": 0.8893, + "step": 3100 + }, + { + "epoch": 0.15446508393761796, + "grad_norm": 0.2412109375, + "learning_rate": 0.0007876467666633555, + "loss": 0.8442, + "step": 3110 + }, + { + "epoch": 0.1549617562332373, + "grad_norm": 0.26171875, + "learning_rate": 0.000787607032879706, + "loss": 0.837, + "step": 3120 + }, + { + "epoch": 0.15545842852885666, + "grad_norm": 0.2197265625, + "learning_rate": 0.0007875672990960565, + "loss": 0.8601, + "step": 3130 + }, + { + "epoch": 0.155955100824476, + "grad_norm": 0.216796875, + "learning_rate": 0.0007875275653124069, + "loss": 0.8514, + "step": 3140 + }, + { + "epoch": 0.15645177312009537, + "grad_norm": 0.2451171875, + "learning_rate": 0.0007874878315287574, + "loss": 0.886, + "step": 3150 + }, + { + "epoch": 0.1569484454157147, + "grad_norm": 0.212890625, + "learning_rate": 0.0007874480977451077, + "loss": 0.8572, + "step": 3160 + }, + { + "epoch": 0.15744511771133407, + "grad_norm": 0.2294921875, + "learning_rate": 0.0007874083639614583, + "loss": 0.862, + "step": 3170 + }, + { + "epoch": 0.1579417900069534, + "grad_norm": 0.2138671875, + "learning_rate": 0.0007873686301778088, + "loss": 0.8112, + "step": 3180 + }, + { + "epoch": 0.15843846230257277, + "grad_norm": 0.2470703125, + "learning_rate": 0.0007873288963941591, + "loss": 0.8726, + "step": 3190 + }, + { + "epoch": 0.1589351345981921, + "grad_norm": 0.2197265625, + "learning_rate": 0.0007872891626105096, + "loss": 0.8618, + "step": 3200 + }, + { + "epoch": 0.15943180689381145, + "grad_norm": 0.228515625, + "learning_rate": 0.0007872494288268602, + "loss": 0.8528, + "step": 3210 + }, + { + "epoch": 0.15992847918943082, + "grad_norm": 0.24609375, + "learning_rate": 0.0007872096950432105, + "loss": 0.8395, + "step": 3220 + }, + { + "epoch": 0.16042515148505015, + "grad_norm": 0.25, + "learning_rate": 0.000787169961259561, + "loss": 0.827, + "step": 3230 + }, + { + "epoch": 0.16092182378066952, + "grad_norm": 0.232421875, + "learning_rate": 0.0007871302274759114, + "loss": 0.8478, + "step": 3240 + }, + { + "epoch": 0.16141849607628886, + "grad_norm": 0.236328125, + "learning_rate": 0.0007870904936922618, + "loss": 0.8247, + "step": 3250 + }, + { + "epoch": 0.16191516837190822, + "grad_norm": 0.2275390625, + "learning_rate": 0.0007870507599086123, + "loss": 0.8557, + "step": 3260 + }, + { + "epoch": 0.16241184066752756, + "grad_norm": 0.2265625, + "learning_rate": 0.0007870110261249628, + "loss": 0.8532, + "step": 3270 + }, + { + "epoch": 0.16290851296314693, + "grad_norm": 0.23828125, + "learning_rate": 0.0007869712923413133, + "loss": 0.8545, + "step": 3280 + }, + { + "epoch": 0.16340518525876627, + "grad_norm": 0.2578125, + "learning_rate": 0.0007869315585576637, + "loss": 0.8675, + "step": 3290 + }, + { + "epoch": 0.1639018575543856, + "grad_norm": 0.203125, + "learning_rate": 0.0007868918247740141, + "loss": 0.8209, + "step": 3300 + }, + { + "epoch": 0.16439852985000497, + "grad_norm": 0.2265625, + "learning_rate": 0.0007868520909903647, + "loss": 0.8332, + "step": 3310 + }, + { + "epoch": 0.1648952021456243, + "grad_norm": 0.212890625, + "learning_rate": 0.0007868123572067151, + "loss": 0.8373, + "step": 3320 + }, + { + "epoch": 0.16539187444124367, + "grad_norm": 0.2333984375, + "learning_rate": 0.0007867726234230655, + "loss": 0.862, + "step": 3330 + }, + { + "epoch": 0.165888546736863, + "grad_norm": 0.2060546875, + "learning_rate": 0.000786732889639416, + "loss": 0.8397, + "step": 3340 + }, + { + "epoch": 0.16638521903248238, + "grad_norm": 0.248046875, + "learning_rate": 0.0007866931558557663, + "loss": 0.8448, + "step": 3350 + }, + { + "epoch": 0.16688189132810172, + "grad_norm": 0.2294921875, + "learning_rate": 0.0007866534220721168, + "loss": 0.8382, + "step": 3360 + }, + { + "epoch": 0.16737856362372106, + "grad_norm": 0.23046875, + "learning_rate": 0.0007866136882884674, + "loss": 0.837, + "step": 3370 + }, + { + "epoch": 0.16787523591934042, + "grad_norm": 0.2294921875, + "learning_rate": 0.0007865739545048177, + "loss": 0.8321, + "step": 3380 + }, + { + "epoch": 0.16837190821495976, + "grad_norm": 0.2255859375, + "learning_rate": 0.0007865342207211682, + "loss": 0.8347, + "step": 3390 + }, + { + "epoch": 0.16886858051057912, + "grad_norm": 0.26953125, + "learning_rate": 0.0007864944869375187, + "loss": 0.879, + "step": 3400 + }, + { + "epoch": 0.16936525280619846, + "grad_norm": 0.2021484375, + "learning_rate": 0.000786454753153869, + "loss": 0.8377, + "step": 3410 + }, + { + "epoch": 0.16986192510181783, + "grad_norm": 0.2138671875, + "learning_rate": 0.0007864150193702196, + "loss": 0.7883, + "step": 3420 + }, + { + "epoch": 0.17035859739743717, + "grad_norm": 0.228515625, + "learning_rate": 0.00078637528558657, + "loss": 0.8456, + "step": 3430 + }, + { + "epoch": 0.17085526969305653, + "grad_norm": 0.26171875, + "learning_rate": 0.0007863355518029205, + "loss": 0.8309, + "step": 3440 + }, + { + "epoch": 0.17135194198867587, + "grad_norm": 0.2314453125, + "learning_rate": 0.0007862958180192709, + "loss": 0.8715, + "step": 3450 + }, + { + "epoch": 0.1718486142842952, + "grad_norm": 0.2197265625, + "learning_rate": 0.0007862560842356213, + "loss": 0.8108, + "step": 3460 + }, + { + "epoch": 0.17234528657991458, + "grad_norm": 0.2216796875, + "learning_rate": 0.0007862163504519719, + "loss": 0.8348, + "step": 3470 + }, + { + "epoch": 0.1728419588755339, + "grad_norm": 0.2138671875, + "learning_rate": 0.0007861766166683223, + "loss": 0.8579, + "step": 3480 + }, + { + "epoch": 0.17333863117115328, + "grad_norm": 0.201171875, + "learning_rate": 0.0007861368828846727, + "loss": 0.8149, + "step": 3490 + }, + { + "epoch": 0.17383530346677262, + "grad_norm": 0.2412109375, + "learning_rate": 0.0007860971491010232, + "loss": 0.8031, + "step": 3500 + }, + { + "epoch": 0.17433197576239198, + "grad_norm": 0.2421875, + "learning_rate": 0.0007860574153173736, + "loss": 0.8666, + "step": 3510 + }, + { + "epoch": 0.17482864805801132, + "grad_norm": 0.2060546875, + "learning_rate": 0.0007860176815337241, + "loss": 0.8271, + "step": 3520 + }, + { + "epoch": 0.1753253203536307, + "grad_norm": 0.220703125, + "learning_rate": 0.0007859779477500746, + "loss": 0.8059, + "step": 3530 + }, + { + "epoch": 0.17582199264925003, + "grad_norm": 0.2001953125, + "learning_rate": 0.000785938213966425, + "loss": 0.8332, + "step": 3540 + }, + { + "epoch": 0.17631866494486936, + "grad_norm": 0.2412109375, + "learning_rate": 0.0007858984801827754, + "loss": 0.827, + "step": 3550 + }, + { + "epoch": 0.17681533724048873, + "grad_norm": 0.23046875, + "learning_rate": 0.000785858746399126, + "loss": 0.8217, + "step": 3560 + }, + { + "epoch": 0.17731200953610807, + "grad_norm": 0.2265625, + "learning_rate": 0.0007858190126154763, + "loss": 0.8426, + "step": 3570 + }, + { + "epoch": 0.17780868183172743, + "grad_norm": 0.2294921875, + "learning_rate": 0.0007857792788318268, + "loss": 0.8383, + "step": 3580 + }, + { + "epoch": 0.17830535412734677, + "grad_norm": 0.2060546875, + "learning_rate": 0.0007857395450481773, + "loss": 0.8361, + "step": 3590 + }, + { + "epoch": 0.17880202642296614, + "grad_norm": 0.2060546875, + "learning_rate": 0.0007856998112645277, + "loss": 0.815, + "step": 3600 + }, + { + "epoch": 0.17929869871858548, + "grad_norm": 0.2099609375, + "learning_rate": 0.0007856600774808781, + "loss": 0.822, + "step": 3610 + }, + { + "epoch": 0.17979537101420484, + "grad_norm": 0.25390625, + "learning_rate": 0.0007856203436972286, + "loss": 0.8272, + "step": 3620 + }, + { + "epoch": 0.18029204330982418, + "grad_norm": 0.251953125, + "learning_rate": 0.0007855806099135791, + "loss": 0.8239, + "step": 3630 + }, + { + "epoch": 0.18078871560544352, + "grad_norm": 0.2138671875, + "learning_rate": 0.0007855408761299295, + "loss": 0.8254, + "step": 3640 + }, + { + "epoch": 0.18128538790106288, + "grad_norm": 0.236328125, + "learning_rate": 0.0007855011423462799, + "loss": 0.8422, + "step": 3650 + }, + { + "epoch": 0.18178206019668222, + "grad_norm": 0.26171875, + "learning_rate": 0.0007854614085626304, + "loss": 0.8412, + "step": 3660 + }, + { + "epoch": 0.1822787324923016, + "grad_norm": 0.2373046875, + "learning_rate": 0.0007854216747789809, + "loss": 0.8247, + "step": 3670 + }, + { + "epoch": 0.18277540478792093, + "grad_norm": 0.20703125, + "learning_rate": 0.0007853819409953313, + "loss": 0.8094, + "step": 3680 + }, + { + "epoch": 0.1832720770835403, + "grad_norm": 0.267578125, + "learning_rate": 0.0007853422072116818, + "loss": 0.8558, + "step": 3690 + }, + { + "epoch": 0.18376874937915963, + "grad_norm": 0.2333984375, + "learning_rate": 0.0007853024734280322, + "loss": 0.8463, + "step": 3700 + }, + { + "epoch": 0.18426542167477897, + "grad_norm": 0.2412109375, + "learning_rate": 0.0007852627396443826, + "loss": 0.834, + "step": 3710 + }, + { + "epoch": 0.18476209397039833, + "grad_norm": 0.2373046875, + "learning_rate": 0.0007852230058607332, + "loss": 0.8033, + "step": 3720 + }, + { + "epoch": 0.18525876626601767, + "grad_norm": 0.1982421875, + "learning_rate": 0.0007851832720770836, + "loss": 0.8285, + "step": 3730 + }, + { + "epoch": 0.18575543856163704, + "grad_norm": 0.2353515625, + "learning_rate": 0.000785143538293434, + "loss": 0.7968, + "step": 3740 + }, + { + "epoch": 0.18625211085725638, + "grad_norm": 0.2451171875, + "learning_rate": 0.0007851038045097845, + "loss": 0.823, + "step": 3750 + }, + { + "epoch": 0.18674878315287574, + "grad_norm": 0.2470703125, + "learning_rate": 0.0007850640707261349, + "loss": 0.8429, + "step": 3760 + }, + { + "epoch": 0.18724545544849508, + "grad_norm": 0.2373046875, + "learning_rate": 0.0007850243369424854, + "loss": 0.8029, + "step": 3770 + }, + { + "epoch": 0.18774212774411445, + "grad_norm": 0.216796875, + "learning_rate": 0.0007849846031588359, + "loss": 0.7974, + "step": 3780 + }, + { + "epoch": 0.18823880003973378, + "grad_norm": 0.2119140625, + "learning_rate": 0.0007849448693751863, + "loss": 0.8167, + "step": 3790 + }, + { + "epoch": 0.18873547233535312, + "grad_norm": 0.216796875, + "learning_rate": 0.0007849051355915367, + "loss": 0.8322, + "step": 3800 + }, + { + "epoch": 0.1892321446309725, + "grad_norm": 0.23046875, + "learning_rate": 0.0007848654018078872, + "loss": 0.8332, + "step": 3810 + }, + { + "epoch": 0.18972881692659183, + "grad_norm": 0.2041015625, + "learning_rate": 0.0007848256680242377, + "loss": 0.8058, + "step": 3820 + }, + { + "epoch": 0.1902254892222112, + "grad_norm": 0.2197265625, + "learning_rate": 0.0007847859342405881, + "loss": 0.8551, + "step": 3830 + }, + { + "epoch": 0.19072216151783053, + "grad_norm": 0.2099609375, + "learning_rate": 0.0007847462004569385, + "loss": 0.8265, + "step": 3840 + }, + { + "epoch": 0.1912188338134499, + "grad_norm": 0.2421875, + "learning_rate": 0.000784706466673289, + "loss": 0.8347, + "step": 3850 + }, + { + "epoch": 0.19171550610906923, + "grad_norm": 0.216796875, + "learning_rate": 0.0007846667328896394, + "loss": 0.803, + "step": 3860 + }, + { + "epoch": 0.1922121784046886, + "grad_norm": 0.2275390625, + "learning_rate": 0.0007846269991059899, + "loss": 0.8361, + "step": 3870 + }, + { + "epoch": 0.19270885070030794, + "grad_norm": 0.2392578125, + "learning_rate": 0.0007845872653223404, + "loss": 0.8125, + "step": 3880 + }, + { + "epoch": 0.19320552299592728, + "grad_norm": 0.197265625, + "learning_rate": 0.0007845475315386908, + "loss": 0.8293, + "step": 3890 + }, + { + "epoch": 0.19370219529154664, + "grad_norm": 0.205078125, + "learning_rate": 0.0007845077977550412, + "loss": 0.8149, + "step": 3900 + }, + { + "epoch": 0.19419886758716598, + "grad_norm": 0.2265625, + "learning_rate": 0.0007844680639713917, + "loss": 0.8187, + "step": 3910 + }, + { + "epoch": 0.19469553988278535, + "grad_norm": 0.251953125, + "learning_rate": 0.0007844283301877422, + "loss": 0.8111, + "step": 3920 + }, + { + "epoch": 0.19519221217840468, + "grad_norm": 0.21875, + "learning_rate": 0.0007843885964040926, + "loss": 0.8085, + "step": 3930 + }, + { + "epoch": 0.19568888447402405, + "grad_norm": 0.2294921875, + "learning_rate": 0.0007843488626204431, + "loss": 0.8489, + "step": 3940 + }, + { + "epoch": 0.1961855567696434, + "grad_norm": 0.2041015625, + "learning_rate": 0.0007843091288367935, + "loss": 0.8231, + "step": 3950 + }, + { + "epoch": 0.19668222906526273, + "grad_norm": 0.2109375, + "learning_rate": 0.0007842693950531439, + "loss": 0.8353, + "step": 3960 + }, + { + "epoch": 0.1971789013608821, + "grad_norm": 0.2314453125, + "learning_rate": 0.0007842296612694945, + "loss": 0.8116, + "step": 3970 + }, + { + "epoch": 0.19767557365650143, + "grad_norm": 0.2451171875, + "learning_rate": 0.0007841899274858449, + "loss": 0.8392, + "step": 3980 + }, + { + "epoch": 0.1981722459521208, + "grad_norm": 0.21484375, + "learning_rate": 0.0007841501937021953, + "loss": 0.8145, + "step": 3990 + }, + { + "epoch": 0.19866891824774013, + "grad_norm": 0.1953125, + "learning_rate": 0.0007841104599185458, + "loss": 0.8646, + "step": 4000 + }, + { + "epoch": 0.1991655905433595, + "grad_norm": 0.2216796875, + "learning_rate": 0.0007840707261348962, + "loss": 0.792, + "step": 4010 + }, + { + "epoch": 0.19966226283897884, + "grad_norm": 0.19921875, + "learning_rate": 0.0007840309923512468, + "loss": 0.7947, + "step": 4020 + }, + { + "epoch": 0.2001589351345982, + "grad_norm": 0.216796875, + "learning_rate": 0.0007839912585675971, + "loss": 0.7848, + "step": 4030 + }, + { + "epoch": 0.20065560743021754, + "grad_norm": 0.21875, + "learning_rate": 0.0007839515247839476, + "loss": 0.8112, + "step": 4040 + }, + { + "epoch": 0.20115227972583688, + "grad_norm": 0.23046875, + "learning_rate": 0.0007839117910002981, + "loss": 0.8311, + "step": 4050 + }, + { + "epoch": 0.20164895202145625, + "grad_norm": 0.2021484375, + "learning_rate": 0.0007838720572166484, + "loss": 0.7896, + "step": 4060 + }, + { + "epoch": 0.20214562431707558, + "grad_norm": 0.21875, + "learning_rate": 0.000783832323432999, + "loss": 0.7951, + "step": 4070 + }, + { + "epoch": 0.20264229661269495, + "grad_norm": 0.228515625, + "learning_rate": 0.0007837925896493495, + "loss": 0.757, + "step": 4080 + }, + { + "epoch": 0.2031389689083143, + "grad_norm": 0.2158203125, + "learning_rate": 0.0007837528558656998, + "loss": 0.7988, + "step": 4090 + }, + { + "epoch": 0.20363564120393365, + "grad_norm": 0.19140625, + "learning_rate": 0.0007837131220820503, + "loss": 0.8171, + "step": 4100 + }, + { + "epoch": 0.204132313499553, + "grad_norm": 0.2275390625, + "learning_rate": 0.0007836733882984007, + "loss": 0.806, + "step": 4110 + }, + { + "epoch": 0.20462898579517236, + "grad_norm": 0.197265625, + "learning_rate": 0.0007836336545147512, + "loss": 0.8052, + "step": 4120 + }, + { + "epoch": 0.2051256580907917, + "grad_norm": 0.2080078125, + "learning_rate": 0.0007835939207311017, + "loss": 0.814, + "step": 4130 + }, + { + "epoch": 0.20562233038641103, + "grad_norm": 0.2099609375, + "learning_rate": 0.0007835541869474521, + "loss": 0.8323, + "step": 4140 + }, + { + "epoch": 0.2061190026820304, + "grad_norm": 0.1953125, + "learning_rate": 0.0007835144531638025, + "loss": 0.7967, + "step": 4150 + }, + { + "epoch": 0.20661567497764974, + "grad_norm": 0.197265625, + "learning_rate": 0.000783474719380153, + "loss": 0.7966, + "step": 4160 + }, + { + "epoch": 0.2071123472732691, + "grad_norm": 0.205078125, + "learning_rate": 0.0007834349855965035, + "loss": 0.8208, + "step": 4170 + }, + { + "epoch": 0.20760901956888844, + "grad_norm": 0.23046875, + "learning_rate": 0.000783395251812854, + "loss": 0.8258, + "step": 4180 + }, + { + "epoch": 0.2081056918645078, + "grad_norm": 0.251953125, + "learning_rate": 0.0007833555180292044, + "loss": 0.8061, + "step": 4190 + }, + { + "epoch": 0.20860236416012715, + "grad_norm": 0.1953125, + "learning_rate": 0.0007833157842455548, + "loss": 0.824, + "step": 4200 + }, + { + "epoch": 0.2090990364557465, + "grad_norm": 0.2236328125, + "learning_rate": 0.0007832760504619053, + "loss": 0.8106, + "step": 4210 + }, + { + "epoch": 0.20959570875136585, + "grad_norm": 0.23046875, + "learning_rate": 0.0007832363166782556, + "loss": 0.8123, + "step": 4220 + }, + { + "epoch": 0.2100923810469852, + "grad_norm": 0.2138671875, + "learning_rate": 0.0007831965828946062, + "loss": 0.863, + "step": 4230 + }, + { + "epoch": 0.21058905334260455, + "grad_norm": 0.25, + "learning_rate": 0.0007831568491109567, + "loss": 0.8205, + "step": 4240 + }, + { + "epoch": 0.2110857256382239, + "grad_norm": 0.2236328125, + "learning_rate": 0.000783117115327307, + "loss": 0.8325, + "step": 4250 + }, + { + "epoch": 0.21158239793384326, + "grad_norm": 0.2412109375, + "learning_rate": 0.0007830773815436575, + "loss": 0.8257, + "step": 4260 + }, + { + "epoch": 0.2120790702294626, + "grad_norm": 0.2041015625, + "learning_rate": 0.0007830376477600081, + "loss": 0.8101, + "step": 4270 + }, + { + "epoch": 0.21257574252508196, + "grad_norm": 0.2197265625, + "learning_rate": 0.0007829979139763584, + "loss": 0.8094, + "step": 4280 + }, + { + "epoch": 0.2130724148207013, + "grad_norm": 0.19921875, + "learning_rate": 0.0007829581801927089, + "loss": 0.8194, + "step": 4290 + }, + { + "epoch": 0.21356908711632064, + "grad_norm": 0.234375, + "learning_rate": 0.0007829184464090593, + "loss": 0.8438, + "step": 4300 + }, + { + "epoch": 0.21406575941194, + "grad_norm": 0.248046875, + "learning_rate": 0.0007828787126254097, + "loss": 0.8385, + "step": 4310 + }, + { + "epoch": 0.21456243170755934, + "grad_norm": 0.1953125, + "learning_rate": 0.0007828389788417603, + "loss": 0.8205, + "step": 4320 + }, + { + "epoch": 0.2150591040031787, + "grad_norm": 0.2099609375, + "learning_rate": 0.0007827992450581107, + "loss": 0.7999, + "step": 4330 + }, + { + "epoch": 0.21555577629879805, + "grad_norm": 0.2099609375, + "learning_rate": 0.0007827595112744612, + "loss": 0.7847, + "step": 4340 + }, + { + "epoch": 0.2160524485944174, + "grad_norm": 0.2001953125, + "learning_rate": 0.0007827197774908116, + "loss": 0.7887, + "step": 4350 + }, + { + "epoch": 0.21654912089003675, + "grad_norm": 0.232421875, + "learning_rate": 0.000782680043707162, + "loss": 0.8135, + "step": 4360 + }, + { + "epoch": 0.21704579318565612, + "grad_norm": 0.255859375, + "learning_rate": 0.0007826403099235126, + "loss": 0.8124, + "step": 4370 + }, + { + "epoch": 0.21754246548127545, + "grad_norm": 0.208984375, + "learning_rate": 0.000782600576139863, + "loss": 0.7823, + "step": 4380 + }, + { + "epoch": 0.2180391377768948, + "grad_norm": 0.1953125, + "learning_rate": 0.0007825608423562134, + "loss": 0.8115, + "step": 4390 + }, + { + "epoch": 0.21853581007251416, + "grad_norm": 0.2001953125, + "learning_rate": 0.0007825211085725639, + "loss": 0.8017, + "step": 4400 + }, + { + "epoch": 0.2190324823681335, + "grad_norm": 0.2236328125, + "learning_rate": 0.0007824813747889143, + "loss": 0.7821, + "step": 4410 + }, + { + "epoch": 0.21952915466375286, + "grad_norm": 0.2041015625, + "learning_rate": 0.0007824416410052647, + "loss": 0.8001, + "step": 4420 + }, + { + "epoch": 0.2200258269593722, + "grad_norm": 0.2158203125, + "learning_rate": 0.0007824019072216153, + "loss": 0.7993, + "step": 4430 + }, + { + "epoch": 0.22052249925499157, + "grad_norm": 0.2099609375, + "learning_rate": 0.0007823621734379656, + "loss": 0.8269, + "step": 4440 + }, + { + "epoch": 0.2210191715506109, + "grad_norm": 0.2451171875, + "learning_rate": 0.0007823224396543161, + "loss": 0.8173, + "step": 4450 + }, + { + "epoch": 0.22151584384623027, + "grad_norm": 0.2177734375, + "learning_rate": 0.0007822827058706666, + "loss": 0.7898, + "step": 4460 + }, + { + "epoch": 0.2220125161418496, + "grad_norm": 0.236328125, + "learning_rate": 0.000782242972087017, + "loss": 0.8034, + "step": 4470 + }, + { + "epoch": 0.22250918843746895, + "grad_norm": 0.228515625, + "learning_rate": 0.0007822032383033675, + "loss": 0.7929, + "step": 4480 + }, + { + "epoch": 0.2230058607330883, + "grad_norm": 0.2265625, + "learning_rate": 0.0007821635045197179, + "loss": 0.7875, + "step": 4490 + }, + { + "epoch": 0.22350253302870765, + "grad_norm": 0.212890625, + "learning_rate": 0.0007821237707360684, + "loss": 0.7699, + "step": 4500 + }, + { + "epoch": 0.22399920532432702, + "grad_norm": 0.2080078125, + "learning_rate": 0.0007820840369524188, + "loss": 0.815, + "step": 4510 + }, + { + "epoch": 0.22449587761994635, + "grad_norm": 0.2060546875, + "learning_rate": 0.0007820443031687692, + "loss": 0.7915, + "step": 4520 + }, + { + "epoch": 0.22499254991556572, + "grad_norm": 0.197265625, + "learning_rate": 0.0007820045693851198, + "loss": 0.8077, + "step": 4530 + }, + { + "epoch": 0.22548922221118506, + "grad_norm": 0.2216796875, + "learning_rate": 0.0007819648356014702, + "loss": 0.7716, + "step": 4540 + }, + { + "epoch": 0.2259858945068044, + "grad_norm": 0.2001953125, + "learning_rate": 0.0007819251018178206, + "loss": 0.7838, + "step": 4550 + }, + { + "epoch": 0.22648256680242376, + "grad_norm": 0.2197265625, + "learning_rate": 0.0007818853680341711, + "loss": 0.8343, + "step": 4560 + }, + { + "epoch": 0.2269792390980431, + "grad_norm": 0.2373046875, + "learning_rate": 0.0007818456342505215, + "loss": 0.7671, + "step": 4570 + }, + { + "epoch": 0.22747591139366247, + "grad_norm": 0.2060546875, + "learning_rate": 0.000781805900466872, + "loss": 0.7987, + "step": 4580 + }, + { + "epoch": 0.2279725836892818, + "grad_norm": 0.212890625, + "learning_rate": 0.0007817661666832225, + "loss": 0.7885, + "step": 4590 + }, + { + "epoch": 0.22846925598490117, + "grad_norm": 0.208984375, + "learning_rate": 0.0007817264328995729, + "loss": 0.7943, + "step": 4600 + }, + { + "epoch": 0.2289659282805205, + "grad_norm": 0.20703125, + "learning_rate": 0.0007816866991159233, + "loss": 0.8023, + "step": 4610 + }, + { + "epoch": 0.22946260057613987, + "grad_norm": 0.2373046875, + "learning_rate": 0.0007816469653322739, + "loss": 0.7773, + "step": 4620 + }, + { + "epoch": 0.2299592728717592, + "grad_norm": 0.232421875, + "learning_rate": 0.0007816072315486243, + "loss": 0.8065, + "step": 4630 + }, + { + "epoch": 0.23045594516737855, + "grad_norm": 0.19921875, + "learning_rate": 0.0007815674977649747, + "loss": 0.7635, + "step": 4640 + }, + { + "epoch": 0.23095261746299792, + "grad_norm": 0.1845703125, + "learning_rate": 0.0007815277639813252, + "loss": 0.7777, + "step": 4650 + }, + { + "epoch": 0.23144928975861725, + "grad_norm": 0.201171875, + "learning_rate": 0.0007814880301976756, + "loss": 0.7775, + "step": 4660 + }, + { + "epoch": 0.23194596205423662, + "grad_norm": 0.2265625, + "learning_rate": 0.000781448296414026, + "loss": 0.8366, + "step": 4670 + }, + { + "epoch": 0.23244263434985596, + "grad_norm": 0.2060546875, + "learning_rate": 0.0007814085626303766, + "loss": 0.7759, + "step": 4680 + }, + { + "epoch": 0.23293930664547532, + "grad_norm": 0.2060546875, + "learning_rate": 0.000781368828846727, + "loss": 0.7545, + "step": 4690 + }, + { + "epoch": 0.23343597894109466, + "grad_norm": 0.205078125, + "learning_rate": 0.0007813290950630774, + "loss": 0.8055, + "step": 4700 + }, + { + "epoch": 0.23393265123671403, + "grad_norm": 0.2021484375, + "learning_rate": 0.0007812893612794278, + "loss": 0.7886, + "step": 4710 + }, + { + "epoch": 0.23442932353233337, + "grad_norm": 0.189453125, + "learning_rate": 0.0007812496274957783, + "loss": 0.8074, + "step": 4720 + }, + { + "epoch": 0.2349259958279527, + "grad_norm": 0.2119140625, + "learning_rate": 0.0007812098937121288, + "loss": 0.8098, + "step": 4730 + }, + { + "epoch": 0.23542266812357207, + "grad_norm": 0.1845703125, + "learning_rate": 0.0007811701599284792, + "loss": 0.8287, + "step": 4740 + }, + { + "epoch": 0.2359193404191914, + "grad_norm": 0.2158203125, + "learning_rate": 0.0007811304261448297, + "loss": 0.7923, + "step": 4750 + }, + { + "epoch": 0.23641601271481077, + "grad_norm": 0.216796875, + "learning_rate": 0.0007810906923611801, + "loss": 0.8065, + "step": 4760 + }, + { + "epoch": 0.2369126850104301, + "grad_norm": 0.20703125, + "learning_rate": 0.0007810509585775305, + "loss": 0.7706, + "step": 4770 + }, + { + "epoch": 0.23740935730604948, + "grad_norm": 0.208984375, + "learning_rate": 0.0007810112247938811, + "loss": 0.8026, + "step": 4780 + }, + { + "epoch": 0.23790602960166882, + "grad_norm": 0.2294921875, + "learning_rate": 0.0007809714910102315, + "loss": 0.8084, + "step": 4790 + }, + { + "epoch": 0.23840270189728818, + "grad_norm": 0.21875, + "learning_rate": 0.0007809317572265819, + "loss": 0.7945, + "step": 4800 + }, + { + "epoch": 0.23889937419290752, + "grad_norm": 0.2138671875, + "learning_rate": 0.0007808920234429324, + "loss": 0.8042, + "step": 4810 + }, + { + "epoch": 0.23939604648852686, + "grad_norm": 0.1953125, + "learning_rate": 0.0007808522896592828, + "loss": 0.7626, + "step": 4820 + }, + { + "epoch": 0.23989271878414622, + "grad_norm": 0.193359375, + "learning_rate": 0.0007808125558756333, + "loss": 0.7616, + "step": 4830 + }, + { + "epoch": 0.24038939107976556, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007807728220919838, + "loss": 0.7974, + "step": 4840 + }, + { + "epoch": 0.24088606337538493, + "grad_norm": 0.2060546875, + "learning_rate": 0.0007807330883083342, + "loss": 0.8034, + "step": 4850 + }, + { + "epoch": 0.24138273567100427, + "grad_norm": 0.2216796875, + "learning_rate": 0.0007806933545246846, + "loss": 0.8325, + "step": 4860 + }, + { + "epoch": 0.24187940796662363, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007806536207410351, + "loss": 0.8055, + "step": 4870 + }, + { + "epoch": 0.24237608026224297, + "grad_norm": 0.20703125, + "learning_rate": 0.0007806138869573856, + "loss": 0.7872, + "step": 4880 + }, + { + "epoch": 0.2428727525578623, + "grad_norm": 0.2275390625, + "learning_rate": 0.000780574153173736, + "loss": 0.8107, + "step": 4890 + }, + { + "epoch": 0.24336942485348168, + "grad_norm": 0.197265625, + "learning_rate": 0.0007805344193900864, + "loss": 0.7817, + "step": 4900 + }, + { + "epoch": 0.243866097149101, + "grad_norm": 0.2021484375, + "learning_rate": 0.0007804946856064369, + "loss": 0.8126, + "step": 4910 + }, + { + "epoch": 0.24436276944472038, + "grad_norm": 0.1875, + "learning_rate": 0.0007804549518227874, + "loss": 0.7746, + "step": 4920 + }, + { + "epoch": 0.24485944174033972, + "grad_norm": 0.2119140625, + "learning_rate": 0.0007804152180391378, + "loss": 0.7759, + "step": 4930 + }, + { + "epoch": 0.24535611403595908, + "grad_norm": 0.203125, + "learning_rate": 0.0007803754842554883, + "loss": 0.77, + "step": 4940 + }, + { + "epoch": 0.24585278633157842, + "grad_norm": 0.181640625, + "learning_rate": 0.0007803357504718388, + "loss": 0.7883, + "step": 4950 + }, + { + "epoch": 0.2463494586271978, + "grad_norm": 0.19921875, + "learning_rate": 0.0007802960166881891, + "loss": 0.7896, + "step": 4960 + }, + { + "epoch": 0.24684613092281713, + "grad_norm": 0.208984375, + "learning_rate": 0.0007802562829045396, + "loss": 0.796, + "step": 4970 + }, + { + "epoch": 0.24734280321843646, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007802165491208901, + "loss": 0.8069, + "step": 4980 + }, + { + "epoch": 0.24783947551405583, + "grad_norm": 0.220703125, + "learning_rate": 0.0007801768153372405, + "loss": 0.8263, + "step": 4990 + }, + { + "epoch": 0.24833614780967517, + "grad_norm": 0.2275390625, + "learning_rate": 0.000780137081553591, + "loss": 0.7691, + "step": 5000 + }, + { + "epoch": 0.24883282010529453, + "grad_norm": 0.18359375, + "learning_rate": 0.0007800973477699414, + "loss": 0.7552, + "step": 5010 + }, + { + "epoch": 0.24932949240091387, + "grad_norm": 0.205078125, + "learning_rate": 0.0007800576139862918, + "loss": 0.7764, + "step": 5020 + }, + { + "epoch": 0.24982616469653324, + "grad_norm": 0.197265625, + "learning_rate": 0.0007800178802026424, + "loss": 0.7935, + "step": 5030 + }, + { + "epoch": 0.2503228369921526, + "grad_norm": 0.2255859375, + "learning_rate": 0.0007799781464189928, + "loss": 0.7723, + "step": 5040 + }, + { + "epoch": 0.2508195092877719, + "grad_norm": 0.1982421875, + "learning_rate": 0.0007799384126353432, + "loss": 0.7674, + "step": 5050 + }, + { + "epoch": 0.25131618158339125, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007798986788516937, + "loss": 0.7987, + "step": 5060 + }, + { + "epoch": 0.25181285387901065, + "grad_norm": 0.19921875, + "learning_rate": 0.0007798589450680441, + "loss": 0.8071, + "step": 5070 + }, + { + "epoch": 0.25230952617463, + "grad_norm": 0.203125, + "learning_rate": 0.0007798192112843947, + "loss": 0.8022, + "step": 5080 + }, + { + "epoch": 0.2528061984702493, + "grad_norm": 0.2255859375, + "learning_rate": 0.000779779477500745, + "loss": 0.7887, + "step": 5090 + }, + { + "epoch": 0.25330287076586866, + "grad_norm": 0.20703125, + "learning_rate": 0.0007797397437170955, + "loss": 0.7665, + "step": 5100 + }, + { + "epoch": 0.25379954306148805, + "grad_norm": 0.1982421875, + "learning_rate": 0.000779700009933446, + "loss": 0.8016, + "step": 5110 + }, + { + "epoch": 0.2542962153571074, + "grad_norm": 0.21484375, + "learning_rate": 0.0007796602761497963, + "loss": 0.7871, + "step": 5120 + }, + { + "epoch": 0.25479288765272673, + "grad_norm": 0.2021484375, + "learning_rate": 0.0007796205423661469, + "loss": 0.8054, + "step": 5130 + }, + { + "epoch": 0.25528955994834607, + "grad_norm": 0.208984375, + "learning_rate": 0.0007795808085824974, + "loss": 0.7859, + "step": 5140 + }, + { + "epoch": 0.2557862322439654, + "grad_norm": 0.1953125, + "learning_rate": 0.0007795410747988477, + "loss": 0.7812, + "step": 5150 + }, + { + "epoch": 0.2562829045395848, + "grad_norm": 0.181640625, + "learning_rate": 0.0007795013410151982, + "loss": 0.7794, + "step": 5160 + }, + { + "epoch": 0.25677957683520414, + "grad_norm": 0.23046875, + "learning_rate": 0.0007794616072315486, + "loss": 0.7664, + "step": 5170 + }, + { + "epoch": 0.2572762491308235, + "grad_norm": 0.1904296875, + "learning_rate": 0.000779421873447899, + "loss": 0.7864, + "step": 5180 + }, + { + "epoch": 0.2577729214264428, + "grad_norm": 0.255859375, + "learning_rate": 0.0007793821396642496, + "loss": 0.8203, + "step": 5190 + }, + { + "epoch": 0.2582695937220622, + "grad_norm": 0.1796875, + "learning_rate": 0.0007793424058806, + "loss": 0.7712, + "step": 5200 + }, + { + "epoch": 0.25876626601768155, + "grad_norm": 0.259765625, + "learning_rate": 0.0007793026720969504, + "loss": 0.7664, + "step": 5210 + }, + { + "epoch": 0.2592629383133009, + "grad_norm": 0.1953125, + "learning_rate": 0.0007792629383133009, + "loss": 0.7916, + "step": 5220 + }, + { + "epoch": 0.2597596106089202, + "grad_norm": 0.2197265625, + "learning_rate": 0.0007792232045296514, + "loss": 0.7596, + "step": 5230 + }, + { + "epoch": 0.26025628290453956, + "grad_norm": 0.2255859375, + "learning_rate": 0.0007791834707460019, + "loss": 0.7672, + "step": 5240 + }, + { + "epoch": 0.26075295520015895, + "grad_norm": 0.1982421875, + "learning_rate": 0.0007791437369623523, + "loss": 0.7835, + "step": 5250 + }, + { + "epoch": 0.2612496274957783, + "grad_norm": 0.208984375, + "learning_rate": 0.0007791040031787027, + "loss": 0.7874, + "step": 5260 + }, + { + "epoch": 0.26174629979139763, + "grad_norm": 0.21875, + "learning_rate": 0.0007790642693950532, + "loss": 0.7597, + "step": 5270 + }, + { + "epoch": 0.26224297208701697, + "grad_norm": 0.185546875, + "learning_rate": 0.0007790245356114037, + "loss": 0.7825, + "step": 5280 + }, + { + "epoch": 0.26273964438263636, + "grad_norm": 0.2177734375, + "learning_rate": 0.0007789848018277541, + "loss": 0.8065, + "step": 5290 + }, + { + "epoch": 0.2632363166782557, + "grad_norm": 0.21484375, + "learning_rate": 0.0007789450680441046, + "loss": 0.7338, + "step": 5300 + }, + { + "epoch": 0.26373298897387504, + "grad_norm": 0.1923828125, + "learning_rate": 0.0007789053342604549, + "loss": 0.769, + "step": 5310 + }, + { + "epoch": 0.2642296612694944, + "grad_norm": 0.193359375, + "learning_rate": 0.0007788656004768054, + "loss": 0.776, + "step": 5320 + }, + { + "epoch": 0.2647263335651137, + "grad_norm": 0.189453125, + "learning_rate": 0.000778825866693156, + "loss": 0.8244, + "step": 5330 + }, + { + "epoch": 0.2652230058607331, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007787861329095063, + "loss": 0.7883, + "step": 5340 + }, + { + "epoch": 0.26571967815635245, + "grad_norm": 0.212890625, + "learning_rate": 0.0007787463991258568, + "loss": 0.7951, + "step": 5350 + }, + { + "epoch": 0.2662163504519718, + "grad_norm": 0.2060546875, + "learning_rate": 0.0007787066653422072, + "loss": 0.8077, + "step": 5360 + }, + { + "epoch": 0.2667130227475911, + "grad_norm": 0.197265625, + "learning_rate": 0.0007786669315585577, + "loss": 0.7914, + "step": 5370 + }, + { + "epoch": 0.2672096950432105, + "grad_norm": 0.19140625, + "learning_rate": 0.0007786271977749082, + "loss": 0.7563, + "step": 5380 + }, + { + "epoch": 0.26770636733882985, + "grad_norm": 0.224609375, + "learning_rate": 0.0007785874639912586, + "loss": 0.7683, + "step": 5390 + }, + { + "epoch": 0.2682030396344492, + "grad_norm": 0.1845703125, + "learning_rate": 0.0007785477302076091, + "loss": 0.7949, + "step": 5400 + }, + { + "epoch": 0.26869971193006853, + "grad_norm": 0.18359375, + "learning_rate": 0.0007785079964239595, + "loss": 0.8195, + "step": 5410 + }, + { + "epoch": 0.26919638422568787, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007784682626403099, + "loss": 0.7995, + "step": 5420 + }, + { + "epoch": 0.26969305652130726, + "grad_norm": 0.173828125, + "learning_rate": 0.0007784285288566605, + "loss": 0.7776, + "step": 5430 + }, + { + "epoch": 0.2701897288169266, + "grad_norm": 0.19921875, + "learning_rate": 0.0007783887950730109, + "loss": 0.7899, + "step": 5440 + }, + { + "epoch": 0.27068640111254594, + "grad_norm": 0.2080078125, + "learning_rate": 0.0007783490612893613, + "loss": 0.8031, + "step": 5450 + }, + { + "epoch": 0.2711830734081653, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007783093275057118, + "loss": 0.749, + "step": 5460 + }, + { + "epoch": 0.27167974570378467, + "grad_norm": 0.224609375, + "learning_rate": 0.0007782695937220622, + "loss": 0.7685, + "step": 5470 + }, + { + "epoch": 0.272176417999404, + "grad_norm": 0.216796875, + "learning_rate": 0.0007782298599384127, + "loss": 0.765, + "step": 5480 + }, + { + "epoch": 0.27267309029502335, + "grad_norm": 0.2001953125, + "learning_rate": 0.0007781901261547632, + "loss": 0.7843, + "step": 5490 + }, + { + "epoch": 0.2731697625906427, + "grad_norm": 0.2080078125, + "learning_rate": 0.0007781503923711135, + "loss": 0.7801, + "step": 5500 + }, + { + "epoch": 0.273666434886262, + "grad_norm": 0.1865234375, + "learning_rate": 0.000778110658587464, + "loss": 0.7614, + "step": 5510 + }, + { + "epoch": 0.2741631071818814, + "grad_norm": 0.2119140625, + "learning_rate": 0.0007780709248038145, + "loss": 0.8162, + "step": 5520 + }, + { + "epoch": 0.27465977947750075, + "grad_norm": 0.1875, + "learning_rate": 0.000778031191020165, + "loss": 0.7625, + "step": 5530 + }, + { + "epoch": 0.2751564517731201, + "grad_norm": 0.1943359375, + "learning_rate": 0.0007779914572365154, + "loss": 0.7837, + "step": 5540 + }, + { + "epoch": 0.27565312406873943, + "grad_norm": 0.1953125, + "learning_rate": 0.0007779517234528659, + "loss": 0.7613, + "step": 5550 + }, + { + "epoch": 0.2761497963643588, + "grad_norm": 0.244140625, + "learning_rate": 0.0007779119896692163, + "loss": 0.7512, + "step": 5560 + }, + { + "epoch": 0.27664646865997816, + "grad_norm": 0.1865234375, + "learning_rate": 0.0007778722558855667, + "loss": 0.7763, + "step": 5570 + }, + { + "epoch": 0.2771431409555975, + "grad_norm": 0.1982421875, + "learning_rate": 0.0007778325221019171, + "loss": 0.7752, + "step": 5580 + }, + { + "epoch": 0.27763981325121684, + "grad_norm": 0.177734375, + "learning_rate": 0.0007777927883182677, + "loss": 0.8011, + "step": 5590 + }, + { + "epoch": 0.2781364855468362, + "grad_norm": 0.224609375, + "learning_rate": 0.0007777530545346181, + "loss": 0.7834, + "step": 5600 + }, + { + "epoch": 0.27863315784245557, + "grad_norm": 0.2119140625, + "learning_rate": 0.0007777133207509685, + "loss": 0.8148, + "step": 5610 + }, + { + "epoch": 0.2791298301380749, + "grad_norm": 0.1953125, + "learning_rate": 0.000777673586967319, + "loss": 0.7647, + "step": 5620 + }, + { + "epoch": 0.27962650243369425, + "grad_norm": 0.193359375, + "learning_rate": 0.0007776338531836695, + "loss": 0.7446, + "step": 5630 + }, + { + "epoch": 0.2801231747293136, + "grad_norm": 0.2001953125, + "learning_rate": 0.0007775941194000199, + "loss": 0.7964, + "step": 5640 + }, + { + "epoch": 0.2806198470249329, + "grad_norm": 0.197265625, + "learning_rate": 0.0007775543856163704, + "loss": 0.7563, + "step": 5650 + }, + { + "epoch": 0.2811165193205523, + "grad_norm": 0.1845703125, + "learning_rate": 0.0007775146518327208, + "loss": 0.7848, + "step": 5660 + }, + { + "epoch": 0.28161319161617165, + "grad_norm": 0.244140625, + "learning_rate": 0.0007774749180490712, + "loss": 0.7805, + "step": 5670 + }, + { + "epoch": 0.282109863911791, + "grad_norm": 0.205078125, + "learning_rate": 0.0007774351842654218, + "loss": 0.7383, + "step": 5680 + }, + { + "epoch": 0.28260653620741033, + "grad_norm": 0.2021484375, + "learning_rate": 0.0007773954504817722, + "loss": 0.7549, + "step": 5690 + }, + { + "epoch": 0.2831032085030297, + "grad_norm": 0.2177734375, + "learning_rate": 0.0007773557166981226, + "loss": 0.7672, + "step": 5700 + }, + { + "epoch": 0.28359988079864906, + "grad_norm": 0.21484375, + "learning_rate": 0.0007773159829144731, + "loss": 0.7659, + "step": 5710 + }, + { + "epoch": 0.2840965530942684, + "grad_norm": 0.1982421875, + "learning_rate": 0.0007772762491308235, + "loss": 0.7696, + "step": 5720 + }, + { + "epoch": 0.28459322538988774, + "grad_norm": 0.220703125, + "learning_rate": 0.000777236515347174, + "loss": 0.7976, + "step": 5730 + }, + { + "epoch": 0.2850898976855071, + "grad_norm": 0.1826171875, + "learning_rate": 0.0007771967815635245, + "loss": 0.7625, + "step": 5740 + }, + { + "epoch": 0.28558656998112647, + "grad_norm": 0.2021484375, + "learning_rate": 0.0007771570477798749, + "loss": 0.8234, + "step": 5750 + }, + { + "epoch": 0.2860832422767458, + "grad_norm": 0.244140625, + "learning_rate": 0.0007771173139962253, + "loss": 0.7991, + "step": 5760 + }, + { + "epoch": 0.28657991457236515, + "grad_norm": 0.1767578125, + "learning_rate": 0.0007770775802125757, + "loss": 0.7791, + "step": 5770 + }, + { + "epoch": 0.2870765868679845, + "grad_norm": 0.2119140625, + "learning_rate": 0.0007770378464289263, + "loss": 0.8082, + "step": 5780 + }, + { + "epoch": 0.2875732591636039, + "grad_norm": 0.193359375, + "learning_rate": 0.0007769981126452767, + "loss": 0.7729, + "step": 5790 + }, + { + "epoch": 0.2880699314592232, + "grad_norm": 0.2041015625, + "learning_rate": 0.0007769583788616271, + "loss": 0.7724, + "step": 5800 + }, + { + "epoch": 0.28856660375484255, + "grad_norm": 0.19140625, + "learning_rate": 0.0007769186450779776, + "loss": 0.7698, + "step": 5810 + }, + { + "epoch": 0.2890632760504619, + "grad_norm": 0.20703125, + "learning_rate": 0.000776878911294328, + "loss": 0.7587, + "step": 5820 + }, + { + "epoch": 0.28955994834608123, + "grad_norm": 0.1982421875, + "learning_rate": 0.0007768391775106784, + "loss": 0.7634, + "step": 5830 + }, + { + "epoch": 0.2900566206417006, + "grad_norm": 0.2080078125, + "learning_rate": 0.000776799443727029, + "loss": 0.7557, + "step": 5840 + }, + { + "epoch": 0.29055329293731996, + "grad_norm": 0.177734375, + "learning_rate": 0.0007767597099433794, + "loss": 0.7405, + "step": 5850 + }, + { + "epoch": 0.2910499652329393, + "grad_norm": 0.18359375, + "learning_rate": 0.0007767199761597298, + "loss": 0.7826, + "step": 5860 + }, + { + "epoch": 0.29154663752855864, + "grad_norm": 0.1884765625, + "learning_rate": 0.0007766802423760803, + "loss": 0.7608, + "step": 5870 + }, + { + "epoch": 0.29204330982417803, + "grad_norm": 0.2236328125, + "learning_rate": 0.0007766405085924307, + "loss": 0.739, + "step": 5880 + }, + { + "epoch": 0.29253998211979737, + "grad_norm": 0.19921875, + "learning_rate": 0.0007766007748087812, + "loss": 0.7819, + "step": 5890 + }, + { + "epoch": 0.2930366544154167, + "grad_norm": 0.19921875, + "learning_rate": 0.0007765610410251317, + "loss": 0.7584, + "step": 5900 + }, + { + "epoch": 0.29353332671103605, + "grad_norm": 0.24609375, + "learning_rate": 0.0007765213072414821, + "loss": 0.7396, + "step": 5910 + }, + { + "epoch": 0.2940299990066554, + "grad_norm": 0.205078125, + "learning_rate": 0.0007764815734578325, + "loss": 0.7617, + "step": 5920 + }, + { + "epoch": 0.2945266713022748, + "grad_norm": 0.2080078125, + "learning_rate": 0.000776441839674183, + "loss": 0.7464, + "step": 5930 + }, + { + "epoch": 0.2950233435978941, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007764021058905335, + "loss": 0.757, + "step": 5940 + }, + { + "epoch": 0.29552001589351345, + "grad_norm": 0.1767578125, + "learning_rate": 0.0007763623721068839, + "loss": 0.7461, + "step": 5950 + }, + { + "epoch": 0.2960166881891328, + "grad_norm": 0.1923828125, + "learning_rate": 0.0007763226383232343, + "loss": 0.77, + "step": 5960 + }, + { + "epoch": 0.2965133604847522, + "grad_norm": 0.181640625, + "learning_rate": 0.0007762829045395848, + "loss": 0.7571, + "step": 5970 + }, + { + "epoch": 0.2970100327803715, + "grad_norm": 0.2119140625, + "learning_rate": 0.0007762431707559354, + "loss": 0.7764, + "step": 5980 + }, + { + "epoch": 0.29750670507599086, + "grad_norm": 0.208984375, + "learning_rate": 0.0007762034369722857, + "loss": 0.7419, + "step": 5990 + }, + { + "epoch": 0.2980033773716102, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007761637031886362, + "loss": 0.7873, + "step": 6000 + }, + { + "epoch": 0.29850004966722954, + "grad_norm": 0.2001953125, + "learning_rate": 0.0007761239694049867, + "loss": 0.7525, + "step": 6010 + }, + { + "epoch": 0.29899672196284893, + "grad_norm": 0.1953125, + "learning_rate": 0.000776084235621337, + "loss": 0.7782, + "step": 6020 + }, + { + "epoch": 0.29949339425846827, + "grad_norm": 0.2216796875, + "learning_rate": 0.0007760445018376875, + "loss": 0.7807, + "step": 6030 + }, + { + "epoch": 0.2999900665540876, + "grad_norm": 0.1943359375, + "learning_rate": 0.000776004768054038, + "loss": 0.7586, + "step": 6040 + }, + { + "epoch": 0.30048673884970695, + "grad_norm": 0.169921875, + "learning_rate": 0.0007759650342703884, + "loss": 0.7722, + "step": 6050 + }, + { + "epoch": 0.30098341114532634, + "grad_norm": 0.181640625, + "learning_rate": 0.0007759253004867389, + "loss": 0.7659, + "step": 6060 + }, + { + "epoch": 0.3014800834409457, + "grad_norm": 0.1787109375, + "learning_rate": 0.0007758855667030893, + "loss": 0.7636, + "step": 6070 + }, + { + "epoch": 0.301976755736565, + "grad_norm": 0.1953125, + "learning_rate": 0.0007758458329194397, + "loss": 0.7604, + "step": 6080 + }, + { + "epoch": 0.30247342803218435, + "grad_norm": 0.177734375, + "learning_rate": 0.0007758060991357903, + "loss": 0.7528, + "step": 6090 + }, + { + "epoch": 0.3029701003278037, + "grad_norm": 0.189453125, + "learning_rate": 0.0007757663653521407, + "loss": 0.7645, + "step": 6100 + }, + { + "epoch": 0.3034667726234231, + "grad_norm": 0.201171875, + "learning_rate": 0.0007757266315684912, + "loss": 0.7779, + "step": 6110 + }, + { + "epoch": 0.3039634449190424, + "grad_norm": 0.2265625, + "learning_rate": 0.0007756868977848416, + "loss": 0.758, + "step": 6120 + }, + { + "epoch": 0.30446011721466176, + "grad_norm": 0.189453125, + "learning_rate": 0.000775647164001192, + "loss": 0.7451, + "step": 6130 + }, + { + "epoch": 0.3049567895102811, + "grad_norm": 0.1875, + "learning_rate": 0.0007756074302175426, + "loss": 0.7465, + "step": 6140 + }, + { + "epoch": 0.30545346180590044, + "grad_norm": 0.1796875, + "learning_rate": 0.0007755676964338929, + "loss": 0.7755, + "step": 6150 + }, + { + "epoch": 0.30595013410151983, + "grad_norm": 0.234375, + "learning_rate": 0.0007755279626502434, + "loss": 0.7573, + "step": 6160 + }, + { + "epoch": 0.30644680639713917, + "grad_norm": 0.1728515625, + "learning_rate": 0.0007754882288665939, + "loss": 0.7536, + "step": 6170 + }, + { + "epoch": 0.3069434786927585, + "grad_norm": 0.19140625, + "learning_rate": 0.0007754484950829442, + "loss": 0.7855, + "step": 6180 + }, + { + "epoch": 0.30744015098837785, + "grad_norm": 0.1865234375, + "learning_rate": 0.0007754087612992948, + "loss": 0.756, + "step": 6190 + }, + { + "epoch": 0.30793682328399724, + "grad_norm": 0.197265625, + "learning_rate": 0.0007753690275156453, + "loss": 0.7682, + "step": 6200 + }, + { + "epoch": 0.3084334955796166, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007753292937319956, + "loss": 0.7463, + "step": 6210 + }, + { + "epoch": 0.3089301678752359, + "grad_norm": 0.2080078125, + "learning_rate": 0.0007752895599483461, + "loss": 0.7839, + "step": 6220 + }, + { + "epoch": 0.30942684017085526, + "grad_norm": 0.2080078125, + "learning_rate": 0.0007752498261646965, + "loss": 0.7485, + "step": 6230 + }, + { + "epoch": 0.3099235124664746, + "grad_norm": 0.1826171875, + "learning_rate": 0.000775210092381047, + "loss": 0.7452, + "step": 6240 + }, + { + "epoch": 0.310420184762094, + "grad_norm": 0.19140625, + "learning_rate": 0.0007751703585973975, + "loss": 0.7546, + "step": 6250 + }, + { + "epoch": 0.3109168570577133, + "grad_norm": 0.1875, + "learning_rate": 0.0007751306248137479, + "loss": 0.7653, + "step": 6260 + }, + { + "epoch": 0.31141352935333266, + "grad_norm": 0.185546875, + "learning_rate": 0.0007750908910300984, + "loss": 0.7725, + "step": 6270 + }, + { + "epoch": 0.311910201648952, + "grad_norm": 0.1806640625, + "learning_rate": 0.0007750511572464488, + "loss": 0.7692, + "step": 6280 + }, + { + "epoch": 0.3124068739445714, + "grad_norm": 0.29296875, + "learning_rate": 0.0007750114234627993, + "loss": 0.7469, + "step": 6290 + }, + { + "epoch": 0.31290354624019073, + "grad_norm": 0.2001953125, + "learning_rate": 0.0007749716896791498, + "loss": 0.7579, + "step": 6300 + }, + { + "epoch": 0.31340021853581007, + "grad_norm": 0.197265625, + "learning_rate": 0.0007749319558955002, + "loss": 0.7455, + "step": 6310 + }, + { + "epoch": 0.3138968908314294, + "grad_norm": 0.2158203125, + "learning_rate": 0.0007748922221118506, + "loss": 0.7563, + "step": 6320 + }, + { + "epoch": 0.31439356312704875, + "grad_norm": 0.1875, + "learning_rate": 0.0007748524883282011, + "loss": 0.7535, + "step": 6330 + }, + { + "epoch": 0.31489023542266814, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007748127545445516, + "loss": 0.7547, + "step": 6340 + }, + { + "epoch": 0.3153869077182875, + "grad_norm": 0.1826171875, + "learning_rate": 0.000774773020760902, + "loss": 0.7392, + "step": 6350 + }, + { + "epoch": 0.3158835800139068, + "grad_norm": 0.189453125, + "learning_rate": 0.0007747332869772525, + "loss": 0.7651, + "step": 6360 + }, + { + "epoch": 0.31638025230952616, + "grad_norm": 0.1796875, + "learning_rate": 0.0007746935531936028, + "loss": 0.7575, + "step": 6370 + }, + { + "epoch": 0.31687692460514555, + "grad_norm": 0.21484375, + "learning_rate": 0.0007746538194099533, + "loss": 0.7446, + "step": 6380 + }, + { + "epoch": 0.3173735969007649, + "grad_norm": 0.166015625, + "learning_rate": 0.0007746140856263039, + "loss": 0.7425, + "step": 6390 + }, + { + "epoch": 0.3178702691963842, + "grad_norm": 0.1748046875, + "learning_rate": 0.0007745743518426542, + "loss": 0.7696, + "step": 6400 + }, + { + "epoch": 0.31836694149200356, + "grad_norm": 0.216796875, + "learning_rate": 0.0007745346180590047, + "loss": 0.7632, + "step": 6410 + }, + { + "epoch": 0.3188636137876229, + "grad_norm": 0.2138671875, + "learning_rate": 0.0007744948842753551, + "loss": 0.7838, + "step": 6420 + }, + { + "epoch": 0.3193602860832423, + "grad_norm": 0.1943359375, + "learning_rate": 0.0007744551504917056, + "loss": 0.7685, + "step": 6430 + }, + { + "epoch": 0.31985695837886163, + "grad_norm": 0.1689453125, + "learning_rate": 0.0007744154167080561, + "loss": 0.7652, + "step": 6440 + }, + { + "epoch": 0.32035363067448097, + "grad_norm": 0.1806640625, + "learning_rate": 0.0007743756829244065, + "loss": 0.755, + "step": 6450 + }, + { + "epoch": 0.3208503029701003, + "grad_norm": 0.197265625, + "learning_rate": 0.000774335949140757, + "loss": 0.7277, + "step": 6460 + }, + { + "epoch": 0.3213469752657197, + "grad_norm": 0.19140625, + "learning_rate": 0.0007742962153571074, + "loss": 0.7414, + "step": 6470 + }, + { + "epoch": 0.32184364756133904, + "grad_norm": 0.201171875, + "learning_rate": 0.0007742564815734578, + "loss": 0.7466, + "step": 6480 + }, + { + "epoch": 0.3223403198569584, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007742167477898084, + "loss": 0.7707, + "step": 6490 + }, + { + "epoch": 0.3228369921525777, + "grad_norm": 0.166015625, + "learning_rate": 0.0007741770140061588, + "loss": 0.7345, + "step": 6500 + }, + { + "epoch": 0.32333366444819706, + "grad_norm": 0.1923828125, + "learning_rate": 0.0007741372802225092, + "loss": 0.7477, + "step": 6510 + }, + { + "epoch": 0.32383033674381645, + "grad_norm": 0.1923828125, + "learning_rate": 0.0007740975464388597, + "loss": 0.7523, + "step": 6520 + }, + { + "epoch": 0.3243270090394358, + "grad_norm": 0.169921875, + "learning_rate": 0.0007740578126552101, + "loss": 0.7233, + "step": 6530 + }, + { + "epoch": 0.3248236813350551, + "grad_norm": 0.17578125, + "learning_rate": 0.0007740180788715606, + "loss": 0.7454, + "step": 6540 + }, + { + "epoch": 0.32532035363067446, + "grad_norm": 0.20703125, + "learning_rate": 0.0007739783450879111, + "loss": 0.8268, + "step": 6550 + }, + { + "epoch": 0.32581702592629386, + "grad_norm": 0.181640625, + "learning_rate": 0.0007739386113042615, + "loss": 0.7481, + "step": 6560 + }, + { + "epoch": 0.3263136982219132, + "grad_norm": 0.1826171875, + "learning_rate": 0.0007738988775206119, + "loss": 0.742, + "step": 6570 + }, + { + "epoch": 0.32681037051753253, + "grad_norm": 0.2001953125, + "learning_rate": 0.0007738591437369624, + "loss": 0.7556, + "step": 6580 + }, + { + "epoch": 0.32730704281315187, + "grad_norm": 0.19140625, + "learning_rate": 0.0007738194099533129, + "loss": 0.7327, + "step": 6590 + }, + { + "epoch": 0.3278037151087712, + "grad_norm": 0.1748046875, + "learning_rate": 0.0007737796761696633, + "loss": 0.7419, + "step": 6600 + }, + { + "epoch": 0.3283003874043906, + "grad_norm": 0.181640625, + "learning_rate": 0.0007737399423860138, + "loss": 0.7433, + "step": 6610 + }, + { + "epoch": 0.32879705970000994, + "grad_norm": 0.1748046875, + "learning_rate": 0.0007737002086023642, + "loss": 0.7658, + "step": 6620 + }, + { + "epoch": 0.3292937319956293, + "grad_norm": 0.1748046875, + "learning_rate": 0.0007736604748187146, + "loss": 0.7514, + "step": 6630 + }, + { + "epoch": 0.3297904042912486, + "grad_norm": 0.1796875, + "learning_rate": 0.000773620741035065, + "loss": 0.7481, + "step": 6640 + }, + { + "epoch": 0.330287076586868, + "grad_norm": 0.193359375, + "learning_rate": 0.0007735810072514156, + "loss": 0.741, + "step": 6650 + }, + { + "epoch": 0.33078374888248735, + "grad_norm": 0.2177734375, + "learning_rate": 0.000773541273467766, + "loss": 0.7469, + "step": 6660 + }, + { + "epoch": 0.3312804211781067, + "grad_norm": 0.1806640625, + "learning_rate": 0.0007735015396841164, + "loss": 0.7719, + "step": 6670 + }, + { + "epoch": 0.331777093473726, + "grad_norm": 0.1767578125, + "learning_rate": 0.0007734618059004669, + "loss": 0.7762, + "step": 6680 + }, + { + "epoch": 0.33227376576934536, + "grad_norm": 0.169921875, + "learning_rate": 0.0007734220721168174, + "loss": 0.7379, + "step": 6690 + }, + { + "epoch": 0.33277043806496476, + "grad_norm": 0.19140625, + "learning_rate": 0.0007733823383331678, + "loss": 0.7259, + "step": 6700 + }, + { + "epoch": 0.3332671103605841, + "grad_norm": 0.205078125, + "learning_rate": 0.0007733426045495183, + "loss": 0.7645, + "step": 6710 + }, + { + "epoch": 0.33376378265620343, + "grad_norm": 0.19140625, + "learning_rate": 0.0007733028707658687, + "loss": 0.7616, + "step": 6720 + }, + { + "epoch": 0.33426045495182277, + "grad_norm": 0.1669921875, + "learning_rate": 0.0007732631369822191, + "loss": 0.7393, + "step": 6730 + }, + { + "epoch": 0.3347571272474421, + "grad_norm": 0.1904296875, + "learning_rate": 0.0007732234031985697, + "loss": 0.7931, + "step": 6740 + }, + { + "epoch": 0.3352537995430615, + "grad_norm": 0.2158203125, + "learning_rate": 0.0007731836694149201, + "loss": 0.7229, + "step": 6750 + }, + { + "epoch": 0.33575047183868084, + "grad_norm": 0.1787109375, + "learning_rate": 0.0007731439356312705, + "loss": 0.7394, + "step": 6760 + }, + { + "epoch": 0.3362471441343002, + "grad_norm": 0.2119140625, + "learning_rate": 0.000773104201847621, + "loss": 0.7355, + "step": 6770 + }, + { + "epoch": 0.3367438164299195, + "grad_norm": 0.2080078125, + "learning_rate": 0.0007730644680639714, + "loss": 0.7552, + "step": 6780 + }, + { + "epoch": 0.3372404887255389, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007730247342803219, + "loss": 0.7536, + "step": 6790 + }, + { + "epoch": 0.33773716102115825, + "grad_norm": 0.203125, + "learning_rate": 0.0007729850004966724, + "loss": 0.7641, + "step": 6800 + }, + { + "epoch": 0.3382338333167776, + "grad_norm": 0.224609375, + "learning_rate": 0.0007729452667130228, + "loss": 0.7434, + "step": 6810 + }, + { + "epoch": 0.3387305056123969, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007729055329293732, + "loss": 0.7578, + "step": 6820 + }, + { + "epoch": 0.33922717790801626, + "grad_norm": 0.1787109375, + "learning_rate": 0.0007728657991457236, + "loss": 0.7551, + "step": 6830 + }, + { + "epoch": 0.33972385020363566, + "grad_norm": 0.1650390625, + "learning_rate": 0.0007728260653620742, + "loss": 0.7743, + "step": 6840 + }, + { + "epoch": 0.340220522499255, + "grad_norm": 0.1826171875, + "learning_rate": 0.0007727863315784246, + "loss": 0.7424, + "step": 6850 + }, + { + "epoch": 0.34071719479487433, + "grad_norm": 0.1884765625, + "learning_rate": 0.000772746597794775, + "loss": 0.7685, + "step": 6860 + }, + { + "epoch": 0.34121386709049367, + "grad_norm": 0.15625, + "learning_rate": 0.0007727068640111255, + "loss": 0.7318, + "step": 6870 + }, + { + "epoch": 0.34171053938611307, + "grad_norm": 0.169921875, + "learning_rate": 0.000772667130227476, + "loss": 0.7452, + "step": 6880 + }, + { + "epoch": 0.3422072116817324, + "grad_norm": 0.1953125, + "learning_rate": 0.0007726273964438263, + "loss": 0.7754, + "step": 6890 + }, + { + "epoch": 0.34270388397735174, + "grad_norm": 0.1787109375, + "learning_rate": 0.0007725876626601769, + "loss": 0.763, + "step": 6900 + }, + { + "epoch": 0.3432005562729711, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007725479288765273, + "loss": 0.7346, + "step": 6910 + }, + { + "epoch": 0.3436972285685904, + "grad_norm": 0.177734375, + "learning_rate": 0.0007725081950928777, + "loss": 0.7633, + "step": 6920 + }, + { + "epoch": 0.3441939008642098, + "grad_norm": 0.185546875, + "learning_rate": 0.0007724684613092282, + "loss": 0.7258, + "step": 6930 + }, + { + "epoch": 0.34469057315982915, + "grad_norm": 0.1806640625, + "learning_rate": 0.0007724287275255787, + "loss": 0.7323, + "step": 6940 + }, + { + "epoch": 0.3451872454554485, + "grad_norm": 0.166015625, + "learning_rate": 0.0007723889937419291, + "loss": 0.7417, + "step": 6950 + }, + { + "epoch": 0.3456839177510678, + "grad_norm": 0.18359375, + "learning_rate": 0.0007723492599582796, + "loss": 0.7511, + "step": 6960 + }, + { + "epoch": 0.3461805900466872, + "grad_norm": 0.2060546875, + "learning_rate": 0.00077230952617463, + "loss": 0.703, + "step": 6970 + }, + { + "epoch": 0.34667726234230656, + "grad_norm": 0.1943359375, + "learning_rate": 0.0007722697923909804, + "loss": 0.7316, + "step": 6980 + }, + { + "epoch": 0.3471739346379259, + "grad_norm": 0.185546875, + "learning_rate": 0.000772230058607331, + "loss": 0.7663, + "step": 6990 + }, + { + "epoch": 0.34767060693354523, + "grad_norm": 0.1806640625, + "learning_rate": 0.0007721903248236814, + "loss": 0.7679, + "step": 7000 + }, + { + "epoch": 0.3481672792291646, + "grad_norm": 0.1865234375, + "learning_rate": 0.0007721505910400319, + "loss": 0.7432, + "step": 7010 + }, + { + "epoch": 0.34866395152478397, + "grad_norm": 0.19921875, + "learning_rate": 0.0007721108572563822, + "loss": 0.7137, + "step": 7020 + }, + { + "epoch": 0.3491606238204033, + "grad_norm": 0.1513671875, + "learning_rate": 0.0007720711234727327, + "loss": 0.7484, + "step": 7030 + }, + { + "epoch": 0.34965729611602264, + "grad_norm": 0.193359375, + "learning_rate": 0.0007720313896890833, + "loss": 0.8066, + "step": 7040 + }, + { + "epoch": 0.350153968411642, + "grad_norm": 0.1904296875, + "learning_rate": 0.0007719916559054336, + "loss": 0.7591, + "step": 7050 + }, + { + "epoch": 0.3506506407072614, + "grad_norm": 0.17578125, + "learning_rate": 0.0007719519221217841, + "loss": 0.7204, + "step": 7060 + }, + { + "epoch": 0.3511473130028807, + "grad_norm": 0.18359375, + "learning_rate": 0.0007719121883381346, + "loss": 0.7619, + "step": 7070 + }, + { + "epoch": 0.35164398529850005, + "grad_norm": 0.171875, + "learning_rate": 0.0007718724545544849, + "loss": 0.7494, + "step": 7080 + }, + { + "epoch": 0.3521406575941194, + "grad_norm": 0.16796875, + "learning_rate": 0.0007718327207708355, + "loss": 0.7119, + "step": 7090 + }, + { + "epoch": 0.3526373298897387, + "grad_norm": 0.173828125, + "learning_rate": 0.0007717929869871859, + "loss": 0.74, + "step": 7100 + }, + { + "epoch": 0.3531340021853581, + "grad_norm": 0.1728515625, + "learning_rate": 0.0007717532532035363, + "loss": 0.7364, + "step": 7110 + }, + { + "epoch": 0.35363067448097746, + "grad_norm": 0.18359375, + "learning_rate": 0.0007717135194198868, + "loss": 0.723, + "step": 7120 + }, + { + "epoch": 0.3541273467765968, + "grad_norm": 0.1669921875, + "learning_rate": 0.0007716737856362372, + "loss": 0.7895, + "step": 7130 + }, + { + "epoch": 0.35462401907221613, + "grad_norm": 0.1787109375, + "learning_rate": 0.0007716340518525876, + "loss": 0.7586, + "step": 7140 + }, + { + "epoch": 0.35512069136783553, + "grad_norm": 0.18359375, + "learning_rate": 0.0007715943180689382, + "loss": 0.7063, + "step": 7150 + }, + { + "epoch": 0.35561736366345487, + "grad_norm": 0.19921875, + "learning_rate": 0.0007715545842852886, + "loss": 0.7826, + "step": 7160 + }, + { + "epoch": 0.3561140359590742, + "grad_norm": 0.1669921875, + "learning_rate": 0.0007715148505016391, + "loss": 0.7246, + "step": 7170 + }, + { + "epoch": 0.35661070825469354, + "grad_norm": 0.181640625, + "learning_rate": 0.0007714751167179895, + "loss": 0.7159, + "step": 7180 + }, + { + "epoch": 0.3571073805503129, + "grad_norm": 0.1787109375, + "learning_rate": 0.00077143538293434, + "loss": 0.7204, + "step": 7190 + }, + { + "epoch": 0.3576040528459323, + "grad_norm": 0.1904296875, + "learning_rate": 0.0007713956491506905, + "loss": 0.7538, + "step": 7200 + }, + { + "epoch": 0.3581007251415516, + "grad_norm": 0.1767578125, + "learning_rate": 0.0007713559153670409, + "loss": 0.7469, + "step": 7210 + }, + { + "epoch": 0.35859739743717095, + "grad_norm": 0.181640625, + "learning_rate": 0.0007713161815833913, + "loss": 0.7408, + "step": 7220 + }, + { + "epoch": 0.3590940697327903, + "grad_norm": 0.1748046875, + "learning_rate": 0.0007712764477997418, + "loss": 0.7432, + "step": 7230 + }, + { + "epoch": 0.3595907420284097, + "grad_norm": 0.1796875, + "learning_rate": 0.0007712367140160921, + "loss": 0.7366, + "step": 7240 + }, + { + "epoch": 0.360087414324029, + "grad_norm": 0.1806640625, + "learning_rate": 0.0007711969802324427, + "loss": 0.728, + "step": 7250 + }, + { + "epoch": 0.36058408661964836, + "grad_norm": 0.1806640625, + "learning_rate": 0.0007711572464487932, + "loss": 0.7584, + "step": 7260 + }, + { + "epoch": 0.3610807589152677, + "grad_norm": 0.158203125, + "learning_rate": 0.0007711175126651435, + "loss": 0.6997, + "step": 7270 + }, + { + "epoch": 0.36157743121088703, + "grad_norm": 0.1806640625, + "learning_rate": 0.000771077778881494, + "loss": 0.7144, + "step": 7280 + }, + { + "epoch": 0.36207410350650643, + "grad_norm": 0.19921875, + "learning_rate": 0.0007710380450978444, + "loss": 0.728, + "step": 7290 + }, + { + "epoch": 0.36257077580212577, + "grad_norm": 0.1650390625, + "learning_rate": 0.0007709983113141949, + "loss": 0.7011, + "step": 7300 + }, + { + "epoch": 0.3630674480977451, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007709585775305454, + "loss": 0.7655, + "step": 7310 + }, + { + "epoch": 0.36356412039336444, + "grad_norm": 0.1875, + "learning_rate": 0.0007709188437468958, + "loss": 0.7247, + "step": 7320 + }, + { + "epoch": 0.3640607926889838, + "grad_norm": 0.15625, + "learning_rate": 0.0007708791099632463, + "loss": 0.7441, + "step": 7330 + }, + { + "epoch": 0.3645574649846032, + "grad_norm": 0.21484375, + "learning_rate": 0.0007708393761795967, + "loss": 0.7515, + "step": 7340 + }, + { + "epoch": 0.3650541372802225, + "grad_norm": 0.177734375, + "learning_rate": 0.0007707996423959472, + "loss": 0.7171, + "step": 7350 + }, + { + "epoch": 0.36555080957584185, + "grad_norm": 0.1689453125, + "learning_rate": 0.0007707599086122977, + "loss": 0.6862, + "step": 7360 + }, + { + "epoch": 0.3660474818714612, + "grad_norm": 0.1630859375, + "learning_rate": 0.0007707201748286481, + "loss": 0.706, + "step": 7370 + }, + { + "epoch": 0.3665441541670806, + "grad_norm": 0.216796875, + "learning_rate": 0.0007706804410449985, + "loss": 0.7725, + "step": 7380 + }, + { + "epoch": 0.3670408264626999, + "grad_norm": 0.15625, + "learning_rate": 0.000770640707261349, + "loss": 0.7314, + "step": 7390 + }, + { + "epoch": 0.36753749875831926, + "grad_norm": 0.1796875, + "learning_rate": 0.0007706009734776995, + "loss": 0.7398, + "step": 7400 + }, + { + "epoch": 0.3680341710539386, + "grad_norm": 0.1806640625, + "learning_rate": 0.0007705612396940499, + "loss": 0.7266, + "step": 7410 + }, + { + "epoch": 0.36853084334955793, + "grad_norm": 0.173828125, + "learning_rate": 0.0007705215059104004, + "loss": 0.7625, + "step": 7420 + }, + { + "epoch": 0.36902751564517733, + "grad_norm": 0.1708984375, + "learning_rate": 0.0007704817721267507, + "loss": 0.716, + "step": 7430 + }, + { + "epoch": 0.36952418794079667, + "grad_norm": 0.1708984375, + "learning_rate": 0.0007704420383431012, + "loss": 0.7418, + "step": 7440 + }, + { + "epoch": 0.370020860236416, + "grad_norm": 0.17578125, + "learning_rate": 0.0007704023045594518, + "loss": 0.7125, + "step": 7450 + }, + { + "epoch": 0.37051753253203534, + "grad_norm": 0.1875, + "learning_rate": 0.0007703625707758022, + "loss": 0.7312, + "step": 7460 + }, + { + "epoch": 0.37101420482765474, + "grad_norm": 0.1708984375, + "learning_rate": 0.0007703228369921526, + "loss": 0.7424, + "step": 7470 + }, + { + "epoch": 0.3715108771232741, + "grad_norm": 0.1904296875, + "learning_rate": 0.0007702831032085031, + "loss": 0.7156, + "step": 7480 + }, + { + "epoch": 0.3720075494188934, + "grad_norm": 0.193359375, + "learning_rate": 0.0007702433694248535, + "loss": 0.7348, + "step": 7490 + }, + { + "epoch": 0.37250422171451275, + "grad_norm": 0.2021484375, + "learning_rate": 0.000770203635641204, + "loss": 0.6993, + "step": 7500 + }, + { + "epoch": 0.3730008940101321, + "grad_norm": 0.171875, + "learning_rate": 0.0007701639018575544, + "loss": 0.7259, + "step": 7510 + }, + { + "epoch": 0.3734975663057515, + "grad_norm": 0.1572265625, + "learning_rate": 0.0007701241680739049, + "loss": 0.681, + "step": 7520 + }, + { + "epoch": 0.3739942386013708, + "grad_norm": 0.189453125, + "learning_rate": 0.0007700844342902553, + "loss": 0.734, + "step": 7530 + }, + { + "epoch": 0.37449091089699016, + "grad_norm": 0.169921875, + "learning_rate": 0.0007700447005066057, + "loss": 0.7473, + "step": 7540 + }, + { + "epoch": 0.3749875831926095, + "grad_norm": 0.1875, + "learning_rate": 0.0007700049667229563, + "loss": 0.7197, + "step": 7550 + }, + { + "epoch": 0.3754842554882289, + "grad_norm": 0.1845703125, + "learning_rate": 0.0007699652329393067, + "loss": 0.7397, + "step": 7560 + }, + { + "epoch": 0.37598092778384823, + "grad_norm": 0.1650390625, + "learning_rate": 0.0007699254991556571, + "loss": 0.749, + "step": 7570 + }, + { + "epoch": 0.37647760007946757, + "grad_norm": 0.166015625, + "learning_rate": 0.0007698857653720076, + "loss": 0.7399, + "step": 7580 + }, + { + "epoch": 0.3769742723750869, + "grad_norm": 0.181640625, + "learning_rate": 0.000769846031588358, + "loss": 0.7477, + "step": 7590 + }, + { + "epoch": 0.37747094467070624, + "grad_norm": 0.1748046875, + "learning_rate": 0.0007698062978047085, + "loss": 0.7321, + "step": 7600 + }, + { + "epoch": 0.37796761696632564, + "grad_norm": 0.1533203125, + "learning_rate": 0.000769766564021059, + "loss": 0.7328, + "step": 7610 + }, + { + "epoch": 0.378464289261945, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007697268302374094, + "loss": 0.7521, + "step": 7620 + }, + { + "epoch": 0.3789609615575643, + "grad_norm": 0.2041015625, + "learning_rate": 0.0007696870964537598, + "loss": 0.762, + "step": 7630 + }, + { + "epoch": 0.37945763385318365, + "grad_norm": 0.169921875, + "learning_rate": 0.0007696473626701103, + "loss": 0.7221, + "step": 7640 + }, + { + "epoch": 0.37995430614880304, + "grad_norm": 0.1572265625, + "learning_rate": 0.0007696076288864608, + "loss": 0.7237, + "step": 7650 + }, + { + "epoch": 0.3804509784444224, + "grad_norm": 0.1728515625, + "learning_rate": 0.0007695678951028112, + "loss": 0.7303, + "step": 7660 + }, + { + "epoch": 0.3809476507400417, + "grad_norm": 0.220703125, + "learning_rate": 0.0007695281613191617, + "loss": 0.7715, + "step": 7670 + }, + { + "epoch": 0.38144432303566106, + "grad_norm": 0.181640625, + "learning_rate": 0.0007694884275355121, + "loss": 0.7481, + "step": 7680 + }, + { + "epoch": 0.3819409953312804, + "grad_norm": 0.1953125, + "learning_rate": 0.0007694486937518625, + "loss": 0.7587, + "step": 7690 + }, + { + "epoch": 0.3824376676268998, + "grad_norm": 0.166015625, + "learning_rate": 0.000769408959968213, + "loss": 0.7155, + "step": 7700 + }, + { + "epoch": 0.38293433992251913, + "grad_norm": 0.171875, + "learning_rate": 0.0007693692261845635, + "loss": 0.7326, + "step": 7710 + }, + { + "epoch": 0.38343101221813847, + "grad_norm": 0.1767578125, + "learning_rate": 0.0007693294924009139, + "loss": 0.7087, + "step": 7720 + }, + { + "epoch": 0.3839276845137578, + "grad_norm": 0.2060546875, + "learning_rate": 0.0007692897586172643, + "loss": 0.7131, + "step": 7730 + }, + { + "epoch": 0.3844243568093772, + "grad_norm": 0.1689453125, + "learning_rate": 0.0007692500248336148, + "loss": 0.7382, + "step": 7740 + }, + { + "epoch": 0.38492102910499654, + "grad_norm": 0.1611328125, + "learning_rate": 0.0007692102910499653, + "loss": 0.7186, + "step": 7750 + }, + { + "epoch": 0.3854177014006159, + "grad_norm": 0.1796875, + "learning_rate": 0.0007691705572663157, + "loss": 0.7351, + "step": 7760 + }, + { + "epoch": 0.3859143736962352, + "grad_norm": 0.17578125, + "learning_rate": 0.0007691308234826662, + "loss": 0.7372, + "step": 7770 + }, + { + "epoch": 0.38641104599185455, + "grad_norm": 0.1650390625, + "learning_rate": 0.0007690910896990166, + "loss": 0.751, + "step": 7780 + }, + { + "epoch": 0.38690771828747395, + "grad_norm": 0.1669921875, + "learning_rate": 0.000769051355915367, + "loss": 0.7708, + "step": 7790 + }, + { + "epoch": 0.3874043905830933, + "grad_norm": 0.1669921875, + "learning_rate": 0.0007690116221317176, + "loss": 0.7545, + "step": 7800 + }, + { + "epoch": 0.3879010628787126, + "grad_norm": 0.1806640625, + "learning_rate": 0.000768971888348068, + "loss": 0.7645, + "step": 7810 + }, + { + "epoch": 0.38839773517433196, + "grad_norm": 0.1650390625, + "learning_rate": 0.0007689321545644184, + "loss": 0.7348, + "step": 7820 + }, + { + "epoch": 0.38889440746995135, + "grad_norm": 0.162109375, + "learning_rate": 0.0007688924207807689, + "loss": 0.7003, + "step": 7830 + }, + { + "epoch": 0.3893910797655707, + "grad_norm": 0.1552734375, + "learning_rate": 0.0007688526869971193, + "loss": 0.7449, + "step": 7840 + }, + { + "epoch": 0.38988775206119003, + "grad_norm": 0.16015625, + "learning_rate": 0.0007688129532134698, + "loss": 0.7048, + "step": 7850 + }, + { + "epoch": 0.39038442435680937, + "grad_norm": 0.1767578125, + "learning_rate": 0.0007687732194298203, + "loss": 0.6932, + "step": 7860 + }, + { + "epoch": 0.3908810966524287, + "grad_norm": 0.1533203125, + "learning_rate": 0.0007687334856461707, + "loss": 0.6957, + "step": 7870 + }, + { + "epoch": 0.3913777689480481, + "grad_norm": 0.1796875, + "learning_rate": 0.0007686937518625211, + "loss": 0.7196, + "step": 7880 + }, + { + "epoch": 0.39187444124366744, + "grad_norm": 0.162109375, + "learning_rate": 0.0007686540180788715, + "loss": 0.7493, + "step": 7890 + }, + { + "epoch": 0.3923711135392868, + "grad_norm": 0.171875, + "learning_rate": 0.0007686142842952221, + "loss": 0.7667, + "step": 7900 + }, + { + "epoch": 0.3928677858349061, + "grad_norm": 0.1708984375, + "learning_rate": 0.0007685745505115726, + "loss": 0.7573, + "step": 7910 + }, + { + "epoch": 0.39336445813052545, + "grad_norm": 0.2060546875, + "learning_rate": 0.0007685348167279229, + "loss": 0.7438, + "step": 7920 + }, + { + "epoch": 0.39386113042614485, + "grad_norm": 0.177734375, + "learning_rate": 0.0007684950829442734, + "loss": 0.7369, + "step": 7930 + }, + { + "epoch": 0.3943578027217642, + "grad_norm": 0.15625, + "learning_rate": 0.0007684553491606239, + "loss": 0.7336, + "step": 7940 + }, + { + "epoch": 0.3948544750173835, + "grad_norm": 0.162109375, + "learning_rate": 0.0007684156153769743, + "loss": 0.7205, + "step": 7950 + }, + { + "epoch": 0.39535114731300286, + "grad_norm": 0.173828125, + "learning_rate": 0.0007683758815933248, + "loss": 0.699, + "step": 7960 + }, + { + "epoch": 0.39584781960862225, + "grad_norm": 0.189453125, + "learning_rate": 0.0007683361478096752, + "loss": 0.7284, + "step": 7970 + }, + { + "epoch": 0.3963444919042416, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007682964140260256, + "loss": 0.727, + "step": 7980 + }, + { + "epoch": 0.39684116419986093, + "grad_norm": 0.1787109375, + "learning_rate": 0.0007682566802423761, + "loss": 0.7521, + "step": 7990 + }, + { + "epoch": 0.39733783649548027, + "grad_norm": 0.193359375, + "learning_rate": 0.0007682169464587266, + "loss": 0.7427, + "step": 8000 + }, + { + "epoch": 0.3978345087910996, + "grad_norm": 0.16796875, + "learning_rate": 0.000768177212675077, + "loss": 0.6822, + "step": 8010 + }, + { + "epoch": 0.398331181086719, + "grad_norm": 0.1669921875, + "learning_rate": 0.0007681374788914275, + "loss": 0.718, + "step": 8020 + }, + { + "epoch": 0.39882785338233834, + "grad_norm": 0.16015625, + "learning_rate": 0.0007680977451077779, + "loss": 0.7153, + "step": 8030 + }, + { + "epoch": 0.3993245256779577, + "grad_norm": 0.15625, + "learning_rate": 0.0007680580113241283, + "loss": 0.7443, + "step": 8040 + }, + { + "epoch": 0.399821197973577, + "grad_norm": 0.1552734375, + "learning_rate": 0.0007680182775404789, + "loss": 0.7849, + "step": 8050 + }, + { + "epoch": 0.4003178702691964, + "grad_norm": 0.1748046875, + "learning_rate": 0.0007679785437568293, + "loss": 0.7288, + "step": 8060 + }, + { + "epoch": 0.40081454256481575, + "grad_norm": 0.1640625, + "learning_rate": 0.0007679388099731798, + "loss": 0.7566, + "step": 8070 + }, + { + "epoch": 0.4013112148604351, + "grad_norm": 0.158203125, + "learning_rate": 0.0007678990761895302, + "loss": 0.7008, + "step": 8080 + }, + { + "epoch": 0.4018078871560544, + "grad_norm": 0.150390625, + "learning_rate": 0.0007678593424058806, + "loss": 0.754, + "step": 8090 + }, + { + "epoch": 0.40230455945167376, + "grad_norm": 0.15625, + "learning_rate": 0.0007678196086222312, + "loss": 0.7161, + "step": 8100 + }, + { + "epoch": 0.40280123174729315, + "grad_norm": 0.1806640625, + "learning_rate": 0.0007677798748385815, + "loss": 0.7508, + "step": 8110 + }, + { + "epoch": 0.4032979040429125, + "grad_norm": 0.158203125, + "learning_rate": 0.000767740141054932, + "loss": 0.7183, + "step": 8120 + }, + { + "epoch": 0.40379457633853183, + "grad_norm": 0.1806640625, + "learning_rate": 0.0007677004072712825, + "loss": 0.7561, + "step": 8130 + }, + { + "epoch": 0.40429124863415117, + "grad_norm": 0.16015625, + "learning_rate": 0.0007676606734876328, + "loss": 0.7123, + "step": 8140 + }, + { + "epoch": 0.40478792092977056, + "grad_norm": 0.1806640625, + "learning_rate": 0.0007676209397039834, + "loss": 0.7231, + "step": 8150 + }, + { + "epoch": 0.4052845932253899, + "grad_norm": 0.197265625, + "learning_rate": 0.0007675812059203338, + "loss": 0.7267, + "step": 8160 + }, + { + "epoch": 0.40578126552100924, + "grad_norm": 0.171875, + "learning_rate": 0.0007675414721366842, + "loss": 0.7173, + "step": 8170 + }, + { + "epoch": 0.4062779378166286, + "grad_norm": 0.16796875, + "learning_rate": 0.0007675017383530347, + "loss": 0.7031, + "step": 8180 + }, + { + "epoch": 0.4067746101122479, + "grad_norm": 0.15234375, + "learning_rate": 0.0007674620045693851, + "loss": 0.7208, + "step": 8190 + }, + { + "epoch": 0.4072712824078673, + "grad_norm": 0.2021484375, + "learning_rate": 0.0007674222707857355, + "loss": 0.7613, + "step": 8200 + }, + { + "epoch": 0.40776795470348665, + "grad_norm": 0.181640625, + "learning_rate": 0.0007673825370020861, + "loss": 0.7161, + "step": 8210 + }, + { + "epoch": 0.408264626999106, + "grad_norm": 0.1640625, + "learning_rate": 0.0007673428032184365, + "loss": 0.7485, + "step": 8220 + }, + { + "epoch": 0.4087612992947253, + "grad_norm": 0.1767578125, + "learning_rate": 0.000767303069434787, + "loss": 0.7091, + "step": 8230 + }, + { + "epoch": 0.4092579715903447, + "grad_norm": 0.16015625, + "learning_rate": 0.0007672633356511374, + "loss": 0.6908, + "step": 8240 + }, + { + "epoch": 0.40975464388596405, + "grad_norm": 0.166015625, + "learning_rate": 0.0007672236018674879, + "loss": 0.7208, + "step": 8250 + }, + { + "epoch": 0.4102513161815834, + "grad_norm": 0.171875, + "learning_rate": 0.0007671838680838384, + "loss": 0.735, + "step": 8260 + }, + { + "epoch": 0.41074798847720273, + "grad_norm": 0.1552734375, + "learning_rate": 0.0007671441343001888, + "loss": 0.7397, + "step": 8270 + }, + { + "epoch": 0.41124466077282207, + "grad_norm": 0.16796875, + "learning_rate": 0.0007671044005165392, + "loss": 0.7125, + "step": 8280 + }, + { + "epoch": 0.41174133306844146, + "grad_norm": 0.1962890625, + "learning_rate": 0.0007670646667328897, + "loss": 0.7101, + "step": 8290 + }, + { + "epoch": 0.4122380053640608, + "grad_norm": 0.173828125, + "learning_rate": 0.00076702493294924, + "loss": 0.74, + "step": 8300 + }, + { + "epoch": 0.41273467765968014, + "grad_norm": 0.1708984375, + "learning_rate": 0.0007669851991655906, + "loss": 0.7184, + "step": 8310 + }, + { + "epoch": 0.4132313499552995, + "grad_norm": 0.15234375, + "learning_rate": 0.0007669454653819411, + "loss": 0.7223, + "step": 8320 + }, + { + "epoch": 0.41372802225091887, + "grad_norm": 0.1572265625, + "learning_rate": 0.0007669057315982914, + "loss": 0.7655, + "step": 8330 + }, + { + "epoch": 0.4142246945465382, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007668659978146419, + "loss": 0.7088, + "step": 8340 + }, + { + "epoch": 0.41472136684215755, + "grad_norm": 0.1728515625, + "learning_rate": 0.0007668262640309925, + "loss": 0.7325, + "step": 8350 + }, + { + "epoch": 0.4152180391377769, + "grad_norm": 0.169921875, + "learning_rate": 0.0007667865302473429, + "loss": 0.7189, + "step": 8360 + }, + { + "epoch": 0.4157147114333962, + "grad_norm": 0.1650390625, + "learning_rate": 0.0007667467964636933, + "loss": 0.7222, + "step": 8370 + }, + { + "epoch": 0.4162113837290156, + "grad_norm": 0.16796875, + "learning_rate": 0.0007667070626800437, + "loss": 0.7308, + "step": 8380 + }, + { + "epoch": 0.41670805602463495, + "grad_norm": 0.1728515625, + "learning_rate": 0.0007666673288963942, + "loss": 0.6973, + "step": 8390 + }, + { + "epoch": 0.4172047283202543, + "grad_norm": 0.166015625, + "learning_rate": 0.0007666275951127447, + "loss": 0.6914, + "step": 8400 + }, + { + "epoch": 0.41770140061587363, + "grad_norm": 0.162109375, + "learning_rate": 0.0007665878613290951, + "loss": 0.72, + "step": 8410 + }, + { + "epoch": 0.418198072911493, + "grad_norm": 0.15234375, + "learning_rate": 0.0007665481275454456, + "loss": 0.6967, + "step": 8420 + }, + { + "epoch": 0.41869474520711236, + "grad_norm": 0.15234375, + "learning_rate": 0.000766508393761796, + "loss": 0.7139, + "step": 8430 + }, + { + "epoch": 0.4191914175027317, + "grad_norm": 0.158203125, + "learning_rate": 0.0007664686599781464, + "loss": 0.7003, + "step": 8440 + }, + { + "epoch": 0.41968808979835104, + "grad_norm": 0.150390625, + "learning_rate": 0.000766428926194497, + "loss": 0.7177, + "step": 8450 + }, + { + "epoch": 0.4201847620939704, + "grad_norm": 0.16015625, + "learning_rate": 0.0007663891924108474, + "loss": 0.7192, + "step": 8460 + }, + { + "epoch": 0.42068143438958977, + "grad_norm": 0.1640625, + "learning_rate": 0.0007663494586271978, + "loss": 0.7029, + "step": 8470 + }, + { + "epoch": 0.4211781066852091, + "grad_norm": 0.16015625, + "learning_rate": 0.0007663097248435483, + "loss": 0.6977, + "step": 8480 + }, + { + "epoch": 0.42167477898082845, + "grad_norm": 0.15625, + "learning_rate": 0.0007662699910598986, + "loss": 0.7426, + "step": 8490 + }, + { + "epoch": 0.4221714512764478, + "grad_norm": 0.16015625, + "learning_rate": 0.0007662302572762491, + "loss": 0.7655, + "step": 8500 + }, + { + "epoch": 0.4226681235720671, + "grad_norm": 0.1787109375, + "learning_rate": 0.0007661905234925997, + "loss": 0.7211, + "step": 8510 + }, + { + "epoch": 0.4231647958676865, + "grad_norm": 0.1650390625, + "learning_rate": 0.0007661507897089501, + "loss": 0.7069, + "step": 8520 + }, + { + "epoch": 0.42366146816330585, + "grad_norm": 0.158203125, + "learning_rate": 0.0007661110559253005, + "loss": 0.7189, + "step": 8530 + }, + { + "epoch": 0.4241581404589252, + "grad_norm": 0.1875, + "learning_rate": 0.000766071322141651, + "loss": 0.7167, + "step": 8540 + }, + { + "epoch": 0.42465481275454453, + "grad_norm": 0.1845703125, + "learning_rate": 0.0007660315883580014, + "loss": 0.7307, + "step": 8550 + }, + { + "epoch": 0.4251514850501639, + "grad_norm": 0.173828125, + "learning_rate": 0.0007659918545743519, + "loss": 0.7289, + "step": 8560 + }, + { + "epoch": 0.42564815734578326, + "grad_norm": 0.1806640625, + "learning_rate": 0.0007659521207907023, + "loss": 0.7157, + "step": 8570 + }, + { + "epoch": 0.4261448296414026, + "grad_norm": 0.1708984375, + "learning_rate": 0.0007659123870070528, + "loss": 0.7354, + "step": 8580 + }, + { + "epoch": 0.42664150193702194, + "grad_norm": 0.181640625, + "learning_rate": 0.0007658726532234032, + "loss": 0.7431, + "step": 8590 + }, + { + "epoch": 0.4271381742326413, + "grad_norm": 0.1611328125, + "learning_rate": 0.0007658329194397536, + "loss": 0.7369, + "step": 8600 + }, + { + "epoch": 0.42763484652826067, + "grad_norm": 0.1865234375, + "learning_rate": 0.0007657931856561042, + "loss": 0.7348, + "step": 8610 + }, + { + "epoch": 0.42813151882388, + "grad_norm": 0.1826171875, + "learning_rate": 0.0007657534518724546, + "loss": 0.7231, + "step": 8620 + }, + { + "epoch": 0.42862819111949935, + "grad_norm": 0.1640625, + "learning_rate": 0.000765713718088805, + "loss": 0.7217, + "step": 8630 + }, + { + "epoch": 0.4291248634151187, + "grad_norm": 0.1728515625, + "learning_rate": 0.0007656739843051555, + "loss": 0.7138, + "step": 8640 + }, + { + "epoch": 0.4296215357107381, + "grad_norm": 0.1455078125, + "learning_rate": 0.000765634250521506, + "loss": 0.7112, + "step": 8650 + }, + { + "epoch": 0.4301182080063574, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007655945167378564, + "loss": 0.7388, + "step": 8660 + }, + { + "epoch": 0.43061488030197675, + "grad_norm": 0.189453125, + "learning_rate": 0.0007655547829542069, + "loss": 0.7088, + "step": 8670 + }, + { + "epoch": 0.4311115525975961, + "grad_norm": 0.1650390625, + "learning_rate": 0.0007655150491705573, + "loss": 0.7148, + "step": 8680 + }, + { + "epoch": 0.43160822489321543, + "grad_norm": 0.1572265625, + "learning_rate": 0.0007654753153869077, + "loss": 0.7137, + "step": 8690 + }, + { + "epoch": 0.4321048971888348, + "grad_norm": 0.1484375, + "learning_rate": 0.0007654355816032582, + "loss": 0.6875, + "step": 8700 + }, + { + "epoch": 0.43260156948445416, + "grad_norm": 0.158203125, + "learning_rate": 0.0007653958478196087, + "loss": 0.6891, + "step": 8710 + }, + { + "epoch": 0.4330982417800735, + "grad_norm": 0.1484375, + "learning_rate": 0.0007653561140359591, + "loss": 0.7064, + "step": 8720 + }, + { + "epoch": 0.43359491407569284, + "grad_norm": 0.146484375, + "learning_rate": 0.0007653163802523096, + "loss": 0.7231, + "step": 8730 + }, + { + "epoch": 0.43409158637131223, + "grad_norm": 0.1494140625, + "learning_rate": 0.00076527664646866, + "loss": 0.7066, + "step": 8740 + }, + { + "epoch": 0.43458825866693157, + "grad_norm": 0.1669921875, + "learning_rate": 0.0007652369126850104, + "loss": 0.7, + "step": 8750 + }, + { + "epoch": 0.4350849309625509, + "grad_norm": 0.1611328125, + "learning_rate": 0.0007651971789013609, + "loss": 0.7378, + "step": 8760 + }, + { + "epoch": 0.43558160325817025, + "grad_norm": 0.169921875, + "learning_rate": 0.0007651574451177114, + "loss": 0.7336, + "step": 8770 + }, + { + "epoch": 0.4360782755537896, + "grad_norm": 0.158203125, + "learning_rate": 0.0007651177113340618, + "loss": 0.7575, + "step": 8780 + }, + { + "epoch": 0.436574947849409, + "grad_norm": 0.150390625, + "learning_rate": 0.0007650779775504122, + "loss": 0.7354, + "step": 8790 + }, + { + "epoch": 0.4370716201450283, + "grad_norm": 0.15625, + "learning_rate": 0.0007650382437667627, + "loss": 0.7177, + "step": 8800 + }, + { + "epoch": 0.43756829244064765, + "grad_norm": 0.1630859375, + "learning_rate": 0.0007649985099831133, + "loss": 0.7363, + "step": 8810 + }, + { + "epoch": 0.438064964736267, + "grad_norm": 0.177734375, + "learning_rate": 0.0007649587761994636, + "loss": 0.7019, + "step": 8820 + }, + { + "epoch": 0.4385616370318864, + "grad_norm": 0.15234375, + "learning_rate": 0.0007649190424158141, + "loss": 0.6998, + "step": 8830 + }, + { + "epoch": 0.4390583093275057, + "grad_norm": 0.1640625, + "learning_rate": 0.0007648793086321645, + "loss": 0.7236, + "step": 8840 + }, + { + "epoch": 0.43955498162312506, + "grad_norm": 0.189453125, + "learning_rate": 0.0007648395748485149, + "loss": 0.7097, + "step": 8850 + }, + { + "epoch": 0.4400516539187444, + "grad_norm": 0.15625, + "learning_rate": 0.0007647998410648655, + "loss": 0.6748, + "step": 8860 + }, + { + "epoch": 0.44054832621436374, + "grad_norm": 0.2041015625, + "learning_rate": 0.0007647601072812159, + "loss": 0.6921, + "step": 8870 + }, + { + "epoch": 0.44104499850998313, + "grad_norm": 0.15234375, + "learning_rate": 0.0007647203734975663, + "loss": 0.7079, + "step": 8880 + }, + { + "epoch": 0.44154167080560247, + "grad_norm": 0.1455078125, + "learning_rate": 0.0007646806397139168, + "loss": 0.7137, + "step": 8890 + }, + { + "epoch": 0.4420383431012218, + "grad_norm": 0.1494140625, + "learning_rate": 0.0007646409059302672, + "loss": 0.7328, + "step": 8900 + }, + { + "epoch": 0.44253501539684115, + "grad_norm": 0.1748046875, + "learning_rate": 0.0007646011721466177, + "loss": 0.7409, + "step": 8910 + }, + { + "epoch": 0.44303168769246054, + "grad_norm": 0.1767578125, + "learning_rate": 0.0007645614383629682, + "loss": 0.7505, + "step": 8920 + }, + { + "epoch": 0.4435283599880799, + "grad_norm": 0.16796875, + "learning_rate": 0.0007645217045793186, + "loss": 0.7348, + "step": 8930 + }, + { + "epoch": 0.4440250322836992, + "grad_norm": 0.1591796875, + "learning_rate": 0.000764481970795669, + "loss": 0.7477, + "step": 8940 + }, + { + "epoch": 0.44452170457931856, + "grad_norm": 0.1640625, + "learning_rate": 0.0007644422370120194, + "loss": 0.71, + "step": 8950 + }, + { + "epoch": 0.4450183768749379, + "grad_norm": 0.1630859375, + "learning_rate": 0.00076440250322837, + "loss": 0.7137, + "step": 8960 + }, + { + "epoch": 0.4455150491705573, + "grad_norm": 0.1650390625, + "learning_rate": 0.0007643627694447205, + "loss": 0.7641, + "step": 8970 + }, + { + "epoch": 0.4460117214661766, + "grad_norm": 0.203125, + "learning_rate": 0.0007643230356610708, + "loss": 0.699, + "step": 8980 + }, + { + "epoch": 0.44650839376179596, + "grad_norm": 0.1806640625, + "learning_rate": 0.0007642833018774213, + "loss": 0.6906, + "step": 8990 + }, + { + "epoch": 0.4470050660574153, + "grad_norm": 0.18359375, + "learning_rate": 0.0007642435680937718, + "loss": 0.7058, + "step": 9000 + }, + { + "epoch": 0.4475017383530347, + "grad_norm": 0.166015625, + "learning_rate": 0.0007642038343101222, + "loss": 0.7377, + "step": 9010 + }, + { + "epoch": 0.44799841064865403, + "grad_norm": 0.1650390625, + "learning_rate": 0.0007641641005264727, + "loss": 0.6808, + "step": 9020 + }, + { + "epoch": 0.44849508294427337, + "grad_norm": 0.185546875, + "learning_rate": 0.0007641243667428231, + "loss": 0.7154, + "step": 9030 + }, + { + "epoch": 0.4489917552398927, + "grad_norm": 0.15625, + "learning_rate": 0.0007640846329591735, + "loss": 0.7077, + "step": 9040 + }, + { + "epoch": 0.44948842753551205, + "grad_norm": 0.162109375, + "learning_rate": 0.000764044899175524, + "loss": 0.6977, + "step": 9050 + }, + { + "epoch": 0.44998509983113144, + "grad_norm": 0.16796875, + "learning_rate": 0.0007640051653918745, + "loss": 0.7314, + "step": 9060 + }, + { + "epoch": 0.4504817721267508, + "grad_norm": 0.177734375, + "learning_rate": 0.0007639654316082249, + "loss": 0.7332, + "step": 9070 + }, + { + "epoch": 0.4509784444223701, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007639256978245754, + "loss": 0.7256, + "step": 9080 + }, + { + "epoch": 0.45147511671798946, + "grad_norm": 0.18359375, + "learning_rate": 0.0007638859640409258, + "loss": 0.6945, + "step": 9090 + }, + { + "epoch": 0.4519717890136088, + "grad_norm": 0.1513671875, + "learning_rate": 0.0007638462302572763, + "loss": 0.7337, + "step": 9100 + }, + { + "epoch": 0.4524684613092282, + "grad_norm": 0.15234375, + "learning_rate": 0.0007638064964736268, + "loss": 0.7143, + "step": 9110 + }, + { + "epoch": 0.4529651336048475, + "grad_norm": 0.162109375, + "learning_rate": 0.0007637667626899772, + "loss": 0.7248, + "step": 9120 + }, + { + "epoch": 0.45346180590046686, + "grad_norm": 0.15625, + "learning_rate": 0.0007637270289063277, + "loss": 0.7323, + "step": 9130 + }, + { + "epoch": 0.4539584781960862, + "grad_norm": 0.181640625, + "learning_rate": 0.0007636872951226781, + "loss": 0.7023, + "step": 9140 + }, + { + "epoch": 0.4544551504917056, + "grad_norm": 0.1533203125, + "learning_rate": 0.0007636475613390285, + "loss": 0.7105, + "step": 9150 + }, + { + "epoch": 0.45495182278732493, + "grad_norm": 0.158203125, + "learning_rate": 0.0007636078275553791, + "loss": 0.7187, + "step": 9160 + }, + { + "epoch": 0.45544849508294427, + "grad_norm": 0.189453125, + "learning_rate": 0.0007635680937717294, + "loss": 0.7122, + "step": 9170 + }, + { + "epoch": 0.4559451673785636, + "grad_norm": 0.15625, + "learning_rate": 0.0007635283599880799, + "loss": 0.7229, + "step": 9180 + }, + { + "epoch": 0.45644183967418295, + "grad_norm": 0.1708984375, + "learning_rate": 0.0007634886262044304, + "loss": 0.7008, + "step": 9190 + }, + { + "epoch": 0.45693851196980234, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007634488924207807, + "loss": 0.6972, + "step": 9200 + }, + { + "epoch": 0.4574351842654217, + "grad_norm": 0.140625, + "learning_rate": 0.0007634091586371313, + "loss": 0.6998, + "step": 9210 + }, + { + "epoch": 0.457931856561041, + "grad_norm": 0.177734375, + "learning_rate": 0.0007633694248534817, + "loss": 0.7409, + "step": 9220 + }, + { + "epoch": 0.45842852885666036, + "grad_norm": 0.1796875, + "learning_rate": 0.0007633296910698321, + "loss": 0.7144, + "step": 9230 + }, + { + "epoch": 0.45892520115227975, + "grad_norm": 0.158203125, + "learning_rate": 0.0007632899572861826, + "loss": 0.7273, + "step": 9240 + }, + { + "epoch": 0.4594218734478991, + "grad_norm": 0.171875, + "learning_rate": 0.000763250223502533, + "loss": 0.7022, + "step": 9250 + }, + { + "epoch": 0.4599185457435184, + "grad_norm": 0.162109375, + "learning_rate": 0.0007632104897188836, + "loss": 0.7172, + "step": 9260 + }, + { + "epoch": 0.46041521803913776, + "grad_norm": 0.1591796875, + "learning_rate": 0.000763170755935234, + "loss": 0.7561, + "step": 9270 + }, + { + "epoch": 0.4609118903347571, + "grad_norm": 0.1796875, + "learning_rate": 0.0007631310221515844, + "loss": 0.7032, + "step": 9280 + }, + { + "epoch": 0.4614085626303765, + "grad_norm": 0.197265625, + "learning_rate": 0.0007630912883679349, + "loss": 0.7463, + "step": 9290 + }, + { + "epoch": 0.46190523492599583, + "grad_norm": 0.15234375, + "learning_rate": 0.0007630515545842853, + "loss": 0.7138, + "step": 9300 + }, + { + "epoch": 0.46240190722161517, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007630118208006358, + "loss": 0.7258, + "step": 9310 + }, + { + "epoch": 0.4628985795172345, + "grad_norm": 0.18359375, + "learning_rate": 0.0007629720870169863, + "loss": 0.7383, + "step": 9320 + }, + { + "epoch": 0.4633952518128539, + "grad_norm": 0.146484375, + "learning_rate": 0.0007629323532333367, + "loss": 0.7449, + "step": 9330 + }, + { + "epoch": 0.46389192410847324, + "grad_norm": 0.16015625, + "learning_rate": 0.0007628926194496871, + "loss": 0.7348, + "step": 9340 + }, + { + "epoch": 0.4643885964040926, + "grad_norm": 0.16015625, + "learning_rate": 0.0007628528856660376, + "loss": 0.7234, + "step": 9350 + }, + { + "epoch": 0.4648852686997119, + "grad_norm": 0.146484375, + "learning_rate": 0.000762813151882388, + "loss": 0.6861, + "step": 9360 + }, + { + "epoch": 0.46538194099533126, + "grad_norm": 0.1611328125, + "learning_rate": 0.0007627734180987385, + "loss": 0.6995, + "step": 9370 + }, + { + "epoch": 0.46587861329095065, + "grad_norm": 0.169921875, + "learning_rate": 0.000762733684315089, + "loss": 0.6967, + "step": 9380 + }, + { + "epoch": 0.46637528558657, + "grad_norm": 0.16015625, + "learning_rate": 0.0007626939505314393, + "loss": 0.6964, + "step": 9390 + }, + { + "epoch": 0.4668719578821893, + "grad_norm": 0.162109375, + "learning_rate": 0.0007626542167477898, + "loss": 0.7155, + "step": 9400 + }, + { + "epoch": 0.46736863017780866, + "grad_norm": 0.1416015625, + "learning_rate": 0.0007626144829641404, + "loss": 0.7281, + "step": 9410 + }, + { + "epoch": 0.46786530247342806, + "grad_norm": 0.1640625, + "learning_rate": 0.0007625747491804908, + "loss": 0.7683, + "step": 9420 + }, + { + "epoch": 0.4683619747690474, + "grad_norm": 0.146484375, + "learning_rate": 0.0007625350153968412, + "loss": 0.6822, + "step": 9430 + }, + { + "epoch": 0.46885864706466673, + "grad_norm": 0.1494140625, + "learning_rate": 0.0007624952816131916, + "loss": 0.7145, + "step": 9440 + }, + { + "epoch": 0.46935531936028607, + "grad_norm": 0.14453125, + "learning_rate": 0.0007624555478295421, + "loss": 0.7266, + "step": 9450 + }, + { + "epoch": 0.4698519916559054, + "grad_norm": 0.1767578125, + "learning_rate": 0.0007624158140458926, + "loss": 0.7053, + "step": 9460 + }, + { + "epoch": 0.4703486639515248, + "grad_norm": 0.154296875, + "learning_rate": 0.000762376080262243, + "loss": 0.6892, + "step": 9470 + }, + { + "epoch": 0.47084533624714414, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007623363464785935, + "loss": 0.7253, + "step": 9480 + }, + { + "epoch": 0.4713420085427635, + "grad_norm": 0.150390625, + "learning_rate": 0.0007622966126949439, + "loss": 0.7233, + "step": 9490 + }, + { + "epoch": 0.4718386808383828, + "grad_norm": 0.173828125, + "learning_rate": 0.0007622568789112943, + "loss": 0.7241, + "step": 9500 + }, + { + "epoch": 0.4723353531340022, + "grad_norm": 0.1552734375, + "learning_rate": 0.0007622171451276449, + "loss": 0.7205, + "step": 9510 + }, + { + "epoch": 0.47283202542962155, + "grad_norm": 0.1767578125, + "learning_rate": 0.0007621774113439953, + "loss": 0.7085, + "step": 9520 + }, + { + "epoch": 0.4733286977252409, + "grad_norm": 0.150390625, + "learning_rate": 0.0007621376775603457, + "loss": 0.6921, + "step": 9530 + }, + { + "epoch": 0.4738253700208602, + "grad_norm": 0.13671875, + "learning_rate": 0.0007620979437766962, + "loss": 0.7269, + "step": 9540 + }, + { + "epoch": 0.47432204231647956, + "grad_norm": 0.1572265625, + "learning_rate": 0.0007620582099930466, + "loss": 0.7659, + "step": 9550 + }, + { + "epoch": 0.47481871461209896, + "grad_norm": 0.1611328125, + "learning_rate": 0.000762018476209397, + "loss": 0.7237, + "step": 9560 + }, + { + "epoch": 0.4753153869077183, + "grad_norm": 0.14453125, + "learning_rate": 0.0007619787424257476, + "loss": 0.7036, + "step": 9570 + }, + { + "epoch": 0.47581205920333763, + "grad_norm": 0.189453125, + "learning_rate": 0.000761939008642098, + "loss": 0.7232, + "step": 9580 + }, + { + "epoch": 0.47630873149895697, + "grad_norm": 0.154296875, + "learning_rate": 0.0007618992748584484, + "loss": 0.6916, + "step": 9590 + }, + { + "epoch": 0.47680540379457637, + "grad_norm": 0.1669921875, + "learning_rate": 0.0007618595410747989, + "loss": 0.6853, + "step": 9600 + }, + { + "epoch": 0.4773020760901957, + "grad_norm": 0.1689453125, + "learning_rate": 0.0007618198072911494, + "loss": 0.6997, + "step": 9610 + }, + { + "epoch": 0.47779874838581504, + "grad_norm": 0.1552734375, + "learning_rate": 0.0007617800735074998, + "loss": 0.7772, + "step": 9620 + }, + { + "epoch": 0.4782954206814344, + "grad_norm": 0.15625, + "learning_rate": 0.0007617403397238502, + "loss": 0.7037, + "step": 9630 + }, + { + "epoch": 0.4787920929770537, + "grad_norm": 0.16015625, + "learning_rate": 0.0007617006059402007, + "loss": 0.7249, + "step": 9640 + }, + { + "epoch": 0.4792887652726731, + "grad_norm": 0.1435546875, + "learning_rate": 0.0007616608721565511, + "loss": 0.7063, + "step": 9650 + }, + { + "epoch": 0.47978543756829245, + "grad_norm": 0.1533203125, + "learning_rate": 0.0007616211383729015, + "loss": 0.6947, + "step": 9660 + }, + { + "epoch": 0.4802821098639118, + "grad_norm": 0.1572265625, + "learning_rate": 0.0007615814045892521, + "loss": 0.6985, + "step": 9670 + }, + { + "epoch": 0.4807787821595311, + "grad_norm": 0.16796875, + "learning_rate": 0.0007615416708056025, + "loss": 0.7313, + "step": 9680 + }, + { + "epoch": 0.48127545445515046, + "grad_norm": 0.140625, + "learning_rate": 0.0007615019370219529, + "loss": 0.708, + "step": 9690 + }, + { + "epoch": 0.48177212675076986, + "grad_norm": 0.142578125, + "learning_rate": 0.0007614622032383034, + "loss": 0.7357, + "step": 9700 + }, + { + "epoch": 0.4822687990463892, + "grad_norm": 0.1474609375, + "learning_rate": 0.0007614224694546539, + "loss": 0.7357, + "step": 9710 + }, + { + "epoch": 0.48276547134200853, + "grad_norm": 0.1513671875, + "learning_rate": 0.0007613827356710043, + "loss": 0.6737, + "step": 9720 + }, + { + "epoch": 0.4832621436376279, + "grad_norm": 0.166015625, + "learning_rate": 0.0007613430018873548, + "loss": 0.6925, + "step": 9730 + }, + { + "epoch": 0.48375881593324727, + "grad_norm": 0.1484375, + "learning_rate": 0.0007613032681037052, + "loss": 0.6754, + "step": 9740 + }, + { + "epoch": 0.4842554882288666, + "grad_norm": 0.1484375, + "learning_rate": 0.0007612635343200556, + "loss": 0.7491, + "step": 9750 + }, + { + "epoch": 0.48475216052448594, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007612238005364062, + "loss": 0.7246, + "step": 9760 + }, + { + "epoch": 0.4852488328201053, + "grad_norm": 0.158203125, + "learning_rate": 0.0007611840667527566, + "loss": 0.6921, + "step": 9770 + }, + { + "epoch": 0.4857455051157246, + "grad_norm": 0.1572265625, + "learning_rate": 0.000761144332969107, + "loss": 0.6853, + "step": 9780 + }, + { + "epoch": 0.486242177411344, + "grad_norm": 0.1474609375, + "learning_rate": 0.0007611045991854575, + "loss": 0.6736, + "step": 9790 + }, + { + "epoch": 0.48673884970696335, + "grad_norm": 0.138671875, + "learning_rate": 0.0007610648654018079, + "loss": 0.7246, + "step": 9800 + }, + { + "epoch": 0.4872355220025827, + "grad_norm": 0.1455078125, + "learning_rate": 0.0007610251316181583, + "loss": 0.7048, + "step": 9810 + }, + { + "epoch": 0.487732194298202, + "grad_norm": 0.1630859375, + "learning_rate": 0.0007609853978345088, + "loss": 0.688, + "step": 9820 + }, + { + "epoch": 0.4882288665938214, + "grad_norm": 0.1435546875, + "learning_rate": 0.0007609456640508593, + "loss": 0.7032, + "step": 9830 + }, + { + "epoch": 0.48872553888944076, + "grad_norm": 0.150390625, + "learning_rate": 0.0007609059302672097, + "loss": 0.6997, + "step": 9840 + }, + { + "epoch": 0.4892222111850601, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007608661964835601, + "loss": 0.6674, + "step": 9850 + }, + { + "epoch": 0.48971888348067943, + "grad_norm": 0.1416015625, + "learning_rate": 0.0007608264626999106, + "loss": 0.7512, + "step": 9860 + }, + { + "epoch": 0.4902155557762988, + "grad_norm": 0.150390625, + "learning_rate": 0.0007607867289162612, + "loss": 0.7171, + "step": 9870 + }, + { + "epoch": 0.49071222807191817, + "grad_norm": 0.158203125, + "learning_rate": 0.0007607469951326115, + "loss": 0.7149, + "step": 9880 + }, + { + "epoch": 0.4912089003675375, + "grad_norm": 0.140625, + "learning_rate": 0.000760707261348962, + "loss": 0.6987, + "step": 9890 + }, + { + "epoch": 0.49170557266315684, + "grad_norm": 0.154296875, + "learning_rate": 0.0007606675275653124, + "loss": 0.728, + "step": 9900 + }, + { + "epoch": 0.4922022449587762, + "grad_norm": 0.13671875, + "learning_rate": 0.0007606277937816628, + "loss": 0.669, + "step": 9910 + }, + { + "epoch": 0.4926989172543956, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007605880599980134, + "loss": 0.6901, + "step": 9920 + }, + { + "epoch": 0.4931955895500149, + "grad_norm": 0.1533203125, + "learning_rate": 0.0007605483262143638, + "loss": 0.7053, + "step": 9930 + }, + { + "epoch": 0.49369226184563425, + "grad_norm": 0.171875, + "learning_rate": 0.0007605085924307142, + "loss": 0.7132, + "step": 9940 + }, + { + "epoch": 0.4941889341412536, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007604688586470647, + "loss": 0.6702, + "step": 9950 + }, + { + "epoch": 0.4946856064368729, + "grad_norm": 0.126953125, + "learning_rate": 0.0007604291248634151, + "loss": 0.7449, + "step": 9960 + }, + { + "epoch": 0.4951822787324923, + "grad_norm": 0.1826171875, + "learning_rate": 0.0007603893910797656, + "loss": 0.6888, + "step": 9970 + }, + { + "epoch": 0.49567895102811166, + "grad_norm": 0.142578125, + "learning_rate": 0.0007603496572961161, + "loss": 0.7041, + "step": 9980 + }, + { + "epoch": 0.496175623323731, + "grad_norm": 0.1533203125, + "learning_rate": 0.0007603099235124665, + "loss": 0.7158, + "step": 9990 + }, + { + "epoch": 0.49667229561935033, + "grad_norm": 0.15234375, + "learning_rate": 0.000760270189728817, + "loss": 0.7205, + "step": 10000 + }, + { + "epoch": 0.49716896791496973, + "grad_norm": 0.130859375, + "learning_rate": 0.0007602304559451674, + "loss": 0.7384, + "step": 10010 + }, + { + "epoch": 0.49766564021058907, + "grad_norm": 0.1376953125, + "learning_rate": 0.0007601907221615179, + "loss": 0.7441, + "step": 10020 + }, + { + "epoch": 0.4981623125062084, + "grad_norm": 0.1416015625, + "learning_rate": 0.0007601509883778684, + "loss": 0.6916, + "step": 10030 + }, + { + "epoch": 0.49865898480182774, + "grad_norm": 0.15234375, + "learning_rate": 0.0007601112545942187, + "loss": 0.7187, + "step": 10040 + }, + { + "epoch": 0.4991556570974471, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007600715208105692, + "loss": 0.7011, + "step": 10050 + }, + { + "epoch": 0.4996523293930665, + "grad_norm": 0.13671875, + "learning_rate": 0.0007600317870269198, + "loss": 0.6988, + "step": 10060 + }, + { + "epoch": 0.5001490016886858, + "grad_norm": 0.1328125, + "learning_rate": 0.0007599920532432701, + "loss": 0.6829, + "step": 10070 + }, + { + "epoch": 0.5006456739843052, + "grad_norm": 0.1318359375, + "learning_rate": 0.0007599523194596206, + "loss": 0.7256, + "step": 10080 + }, + { + "epoch": 0.5011423462799245, + "grad_norm": 0.134765625, + "learning_rate": 0.000759912585675971, + "loss": 0.741, + "step": 10090 + }, + { + "epoch": 0.5016390185755438, + "grad_norm": 0.126953125, + "learning_rate": 0.0007598728518923214, + "loss": 0.6929, + "step": 10100 + }, + { + "epoch": 0.5021356908711632, + "grad_norm": 0.13671875, + "learning_rate": 0.0007598331181086719, + "loss": 0.7239, + "step": 10110 + }, + { + "epoch": 0.5026323631667825, + "grad_norm": 0.1572265625, + "learning_rate": 0.0007597933843250224, + "loss": 0.7027, + "step": 10120 + }, + { + "epoch": 0.503129035462402, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007597536505413728, + "loss": 0.6563, + "step": 10130 + }, + { + "epoch": 0.5036257077580213, + "grad_norm": 0.140625, + "learning_rate": 0.0007597139167577233, + "loss": 0.727, + "step": 10140 + }, + { + "epoch": 0.5041223800536406, + "grad_norm": 0.234375, + "learning_rate": 0.0007596741829740737, + "loss": 0.7071, + "step": 10150 + }, + { + "epoch": 0.50461905234926, + "grad_norm": 0.1376953125, + "learning_rate": 0.0007596344491904242, + "loss": 0.7103, + "step": 10160 + }, + { + "epoch": 0.5051157246448793, + "grad_norm": 0.154296875, + "learning_rate": 0.0007595947154067747, + "loss": 0.6813, + "step": 10170 + }, + { + "epoch": 0.5056123969404986, + "grad_norm": 0.1416015625, + "learning_rate": 0.0007595549816231251, + "loss": 0.7, + "step": 10180 + }, + { + "epoch": 0.506109069236118, + "grad_norm": 0.146484375, + "learning_rate": 0.0007595152478394756, + "loss": 0.7448, + "step": 10190 + }, + { + "epoch": 0.5066057415317373, + "grad_norm": 0.181640625, + "learning_rate": 0.000759475514055826, + "loss": 0.6848, + "step": 10200 + }, + { + "epoch": 0.5071024138273567, + "grad_norm": 0.150390625, + "learning_rate": 0.0007594357802721764, + "loss": 0.7022, + "step": 10210 + }, + { + "epoch": 0.5075990861229761, + "grad_norm": 0.1455078125, + "learning_rate": 0.000759396046488527, + "loss": 0.6825, + "step": 10220 + }, + { + "epoch": 0.5080957584185954, + "grad_norm": 0.1435546875, + "learning_rate": 0.0007593563127048773, + "loss": 0.6914, + "step": 10230 + }, + { + "epoch": 0.5085924307142148, + "grad_norm": 0.134765625, + "learning_rate": 0.0007593165789212278, + "loss": 0.737, + "step": 10240 + }, + { + "epoch": 0.5090891030098341, + "grad_norm": 0.140625, + "learning_rate": 0.0007592768451375783, + "loss": 0.6901, + "step": 10250 + }, + { + "epoch": 0.5095857753054535, + "grad_norm": 0.1474609375, + "learning_rate": 0.0007592371113539286, + "loss": 0.7328, + "step": 10260 + }, + { + "epoch": 0.5100824476010728, + "grad_norm": 0.123046875, + "learning_rate": 0.0007591973775702792, + "loss": 0.7049, + "step": 10270 + }, + { + "epoch": 0.5105791198966921, + "grad_norm": 0.1484375, + "learning_rate": 0.0007591576437866297, + "loss": 0.6783, + "step": 10280 + }, + { + "epoch": 0.5110757921923115, + "grad_norm": 0.14453125, + "learning_rate": 0.00075911791000298, + "loss": 0.7185, + "step": 10290 + }, + { + "epoch": 0.5115724644879308, + "grad_norm": 0.1435546875, + "learning_rate": 0.0007590781762193305, + "loss": 0.7185, + "step": 10300 + }, + { + "epoch": 0.5120691367835503, + "grad_norm": 0.1357421875, + "learning_rate": 0.0007590384424356809, + "loss": 0.7252, + "step": 10310 + }, + { + "epoch": 0.5125658090791696, + "grad_norm": 0.140625, + "learning_rate": 0.0007589987086520315, + "loss": 0.7113, + "step": 10320 + }, + { + "epoch": 0.5130624813747889, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007589589748683819, + "loss": 0.6794, + "step": 10330 + }, + { + "epoch": 0.5135591536704083, + "grad_norm": 0.146484375, + "learning_rate": 0.0007589192410847323, + "loss": 0.7233, + "step": 10340 + }, + { + "epoch": 0.5140558259660276, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007588795073010828, + "loss": 0.6812, + "step": 10350 + }, + { + "epoch": 0.514552498261647, + "grad_norm": 0.130859375, + "learning_rate": 0.0007588397735174332, + "loss": 0.7091, + "step": 10360 + }, + { + "epoch": 0.5150491705572663, + "grad_norm": 0.13671875, + "learning_rate": 0.0007588000397337837, + "loss": 0.734, + "step": 10370 + }, + { + "epoch": 0.5155458428528856, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007587603059501342, + "loss": 0.7, + "step": 10380 + }, + { + "epoch": 0.516042515148505, + "grad_norm": 0.140625, + "learning_rate": 0.0007587205721664846, + "loss": 0.6981, + "step": 10390 + }, + { + "epoch": 0.5165391874441244, + "grad_norm": 0.1611328125, + "learning_rate": 0.000758680838382835, + "loss": 0.7074, + "step": 10400 + }, + { + "epoch": 0.5170358597397438, + "grad_norm": 0.1357421875, + "learning_rate": 0.0007586411045991855, + "loss": 0.722, + "step": 10410 + }, + { + "epoch": 0.5175325320353631, + "grad_norm": 0.126953125, + "learning_rate": 0.0007586013708155359, + "loss": 0.7269, + "step": 10420 + }, + { + "epoch": 0.5180292043309824, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007585616370318864, + "loss": 0.7256, + "step": 10430 + }, + { + "epoch": 0.5185258766266018, + "grad_norm": 0.1572265625, + "learning_rate": 0.0007585219032482369, + "loss": 0.711, + "step": 10440 + }, + { + "epoch": 0.5190225489222211, + "grad_norm": 0.1328125, + "learning_rate": 0.0007584821694645873, + "loss": 0.6722, + "step": 10450 + }, + { + "epoch": 0.5195192212178404, + "grad_norm": 0.130859375, + "learning_rate": 0.0007584424356809377, + "loss": 0.6736, + "step": 10460 + }, + { + "epoch": 0.5200158935134598, + "grad_norm": 0.1357421875, + "learning_rate": 0.0007584027018972883, + "loss": 0.6942, + "step": 10470 + }, + { + "epoch": 0.5205125658090791, + "grad_norm": 0.1240234375, + "learning_rate": 0.0007583629681136387, + "loss": 0.6936, + "step": 10480 + }, + { + "epoch": 0.5210092381046986, + "grad_norm": 0.125, + "learning_rate": 0.0007583232343299891, + "loss": 0.7083, + "step": 10490 + }, + { + "epoch": 0.5215059104003179, + "grad_norm": 0.134765625, + "learning_rate": 0.0007582835005463395, + "loss": 0.7019, + "step": 10500 + }, + { + "epoch": 0.5220025826959372, + "grad_norm": 0.126953125, + "learning_rate": 0.00075824376676269, + "loss": 0.7379, + "step": 10510 + }, + { + "epoch": 0.5224992549915566, + "grad_norm": 0.142578125, + "learning_rate": 0.0007582040329790405, + "loss": 0.6855, + "step": 10520 + }, + { + "epoch": 0.5229959272871759, + "grad_norm": 0.1376953125, + "learning_rate": 0.0007581642991953909, + "loss": 0.7362, + "step": 10530 + }, + { + "epoch": 0.5234925995827953, + "grad_norm": 0.1474609375, + "learning_rate": 0.0007581245654117414, + "loss": 0.6975, + "step": 10540 + }, + { + "epoch": 0.5239892718784146, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007580848316280918, + "loss": 0.668, + "step": 10550 + }, + { + "epoch": 0.5244859441740339, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007580450978444422, + "loss": 0.7077, + "step": 10560 + }, + { + "epoch": 0.5249826164696533, + "grad_norm": 0.1533203125, + "learning_rate": 0.0007580053640607928, + "loss": 0.7183, + "step": 10570 + }, + { + "epoch": 0.5254792887652727, + "grad_norm": 0.1494140625, + "learning_rate": 0.0007579656302771432, + "loss": 0.6935, + "step": 10580 + }, + { + "epoch": 0.5259759610608921, + "grad_norm": 0.134765625, + "learning_rate": 0.0007579258964934936, + "loss": 0.7131, + "step": 10590 + }, + { + "epoch": 0.5264726333565114, + "grad_norm": 0.1240234375, + "learning_rate": 0.0007578861627098441, + "loss": 0.6672, + "step": 10600 + }, + { + "epoch": 0.5269693056521307, + "grad_norm": 0.12890625, + "learning_rate": 0.0007578464289261945, + "loss": 0.6699, + "step": 10610 + }, + { + "epoch": 0.5274659779477501, + "grad_norm": 0.1279296875, + "learning_rate": 0.000757806695142545, + "loss": 0.6993, + "step": 10620 + }, + { + "epoch": 0.5279626502433694, + "grad_norm": 0.12353515625, + "learning_rate": 0.0007577669613588955, + "loss": 0.6946, + "step": 10630 + }, + { + "epoch": 0.5284593225389888, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007577272275752459, + "loss": 0.6683, + "step": 10640 + }, + { + "epoch": 0.5289559948346081, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007576874937915963, + "loss": 0.7298, + "step": 10650 + }, + { + "epoch": 0.5294526671302274, + "grad_norm": 0.1552734375, + "learning_rate": 0.0007576477600079468, + "loss": 0.6767, + "step": 10660 + }, + { + "epoch": 0.5299493394258469, + "grad_norm": 0.166015625, + "learning_rate": 0.0007576080262242973, + "loss": 0.7152, + "step": 10670 + }, + { + "epoch": 0.5304460117214662, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007575682924406477, + "loss": 0.7154, + "step": 10680 + }, + { + "epoch": 0.5309426840170856, + "grad_norm": 0.126953125, + "learning_rate": 0.0007575285586569981, + "loss": 0.6851, + "step": 10690 + }, + { + "epoch": 0.5314393563127049, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007574888248733486, + "loss": 0.7228, + "step": 10700 + }, + { + "epoch": 0.5319360286083242, + "grad_norm": 0.13671875, + "learning_rate": 0.000757449091089699, + "loss": 0.6955, + "step": 10710 + }, + { + "epoch": 0.5324327009039436, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007574093573060495, + "loss": 0.6805, + "step": 10720 + }, + { + "epoch": 0.5329293731995629, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007573696235224, + "loss": 0.6945, + "step": 10730 + }, + { + "epoch": 0.5334260454951822, + "grad_norm": 0.1474609375, + "learning_rate": 0.0007573298897387504, + "loss": 0.6913, + "step": 10740 + }, + { + "epoch": 0.5339227177908016, + "grad_norm": 0.12451171875, + "learning_rate": 0.0007572901559551008, + "loss": 0.6735, + "step": 10750 + }, + { + "epoch": 0.534419390086421, + "grad_norm": 0.13671875, + "learning_rate": 0.0007572504221714513, + "loss": 0.7134, + "step": 10760 + }, + { + "epoch": 0.5349160623820404, + "grad_norm": 0.140625, + "learning_rate": 0.0007572106883878018, + "loss": 0.6831, + "step": 10770 + }, + { + "epoch": 0.5354127346776597, + "grad_norm": 0.1533203125, + "learning_rate": 0.0007571709546041522, + "loss": 0.7085, + "step": 10780 + }, + { + "epoch": 0.535909406973279, + "grad_norm": 0.12353515625, + "learning_rate": 0.0007571312208205027, + "loss": 0.6748, + "step": 10790 + }, + { + "epoch": 0.5364060792688984, + "grad_norm": 0.12353515625, + "learning_rate": 0.0007570914870368531, + "loss": 0.7158, + "step": 10800 + }, + { + "epoch": 0.5369027515645177, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007570517532532035, + "loss": 0.7052, + "step": 10810 + }, + { + "epoch": 0.5373994238601371, + "grad_norm": 0.130859375, + "learning_rate": 0.0007570120194695541, + "loss": 0.7261, + "step": 10820 + }, + { + "epoch": 0.5378960961557564, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007569722856859045, + "loss": 0.7455, + "step": 10830 + }, + { + "epoch": 0.5383927684513757, + "grad_norm": 0.1240234375, + "learning_rate": 0.0007569325519022549, + "loss": 0.6688, + "step": 10840 + }, + { + "epoch": 0.5388894407469952, + "grad_norm": 0.1318359375, + "learning_rate": 0.0007568928181186054, + "loss": 0.6693, + "step": 10850 + }, + { + "epoch": 0.5393861130426145, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007568530843349558, + "loss": 0.7121, + "step": 10860 + }, + { + "epoch": 0.5398827853382339, + "grad_norm": 0.1181640625, + "learning_rate": 0.0007568133505513063, + "loss": 0.6963, + "step": 10870 + }, + { + "epoch": 0.5403794576338532, + "grad_norm": 0.1376953125, + "learning_rate": 0.0007567736167676568, + "loss": 0.7068, + "step": 10880 + }, + { + "epoch": 0.5408761299294725, + "grad_norm": 0.12255859375, + "learning_rate": 0.0007567338829840072, + "loss": 0.6676, + "step": 10890 + }, + { + "epoch": 0.5413728022250919, + "grad_norm": 0.125, + "learning_rate": 0.0007566941492003577, + "loss": 0.6815, + "step": 10900 + }, + { + "epoch": 0.5418694745207112, + "grad_norm": 0.1357421875, + "learning_rate": 0.000756654415416708, + "loss": 0.7308, + "step": 10910 + }, + { + "epoch": 0.5423661468163306, + "grad_norm": 0.1318359375, + "learning_rate": 0.0007566146816330586, + "loss": 0.6691, + "step": 10920 + }, + { + "epoch": 0.5428628191119499, + "grad_norm": 0.12451171875, + "learning_rate": 0.0007565749478494091, + "loss": 0.7001, + "step": 10930 + }, + { + "epoch": 0.5433594914075693, + "grad_norm": 0.1396484375, + "learning_rate": 0.0007565352140657594, + "loss": 0.6703, + "step": 10940 + }, + { + "epoch": 0.5438561637031887, + "grad_norm": 0.1318359375, + "learning_rate": 0.0007564954802821099, + "loss": 0.6909, + "step": 10950 + }, + { + "epoch": 0.544352835998808, + "grad_norm": 0.1318359375, + "learning_rate": 0.0007564557464984603, + "loss": 0.6848, + "step": 10960 + }, + { + "epoch": 0.5448495082944274, + "grad_norm": 0.1357421875, + "learning_rate": 0.0007564160127148107, + "loss": 0.6682, + "step": 10970 + }, + { + "epoch": 0.5453461805900467, + "grad_norm": 0.15625, + "learning_rate": 0.0007563762789311613, + "loss": 0.7165, + "step": 10980 + }, + { + "epoch": 0.545842852885666, + "grad_norm": 0.12890625, + "learning_rate": 0.0007563365451475117, + "loss": 0.6916, + "step": 10990 + }, + { + "epoch": 0.5463395251812854, + "grad_norm": 0.13671875, + "learning_rate": 0.0007562968113638621, + "loss": 0.7004, + "step": 11000 + }, + { + "epoch": 0.5468361974769047, + "grad_norm": 0.1650390625, + "learning_rate": 0.0007562570775802126, + "loss": 0.7482, + "step": 11010 + }, + { + "epoch": 0.547332869772524, + "grad_norm": 0.115234375, + "learning_rate": 0.000756217343796563, + "loss": 0.6794, + "step": 11020 + }, + { + "epoch": 0.5478295420681435, + "grad_norm": 0.1357421875, + "learning_rate": 0.0007561776100129135, + "loss": 0.7104, + "step": 11030 + }, + { + "epoch": 0.5483262143637628, + "grad_norm": 0.11083984375, + "learning_rate": 0.000756137876229264, + "loss": 0.681, + "step": 11040 + }, + { + "epoch": 0.5488228866593822, + "grad_norm": 0.12255859375, + "learning_rate": 0.0007560981424456144, + "loss": 0.6743, + "step": 11050 + }, + { + "epoch": 0.5493195589550015, + "grad_norm": 0.12158203125, + "learning_rate": 0.0007560584086619649, + "loss": 0.7129, + "step": 11060 + }, + { + "epoch": 0.5498162312506208, + "grad_norm": 0.1513671875, + "learning_rate": 0.0007560186748783154, + "loss": 0.7309, + "step": 11070 + }, + { + "epoch": 0.5503129035462402, + "grad_norm": 0.1376953125, + "learning_rate": 0.0007559789410946658, + "loss": 0.6973, + "step": 11080 + }, + { + "epoch": 0.5508095758418595, + "grad_norm": 0.134765625, + "learning_rate": 0.0007559392073110163, + "loss": 0.6736, + "step": 11090 + }, + { + "epoch": 0.5513062481374789, + "grad_norm": 0.12158203125, + "learning_rate": 0.0007558994735273666, + "loss": 0.6728, + "step": 11100 + }, + { + "epoch": 0.5518029204330982, + "grad_norm": 0.11865234375, + "learning_rate": 0.0007558597397437171, + "loss": 0.6845, + "step": 11110 + }, + { + "epoch": 0.5522995927287176, + "grad_norm": 0.12158203125, + "learning_rate": 0.0007558200059600677, + "loss": 0.6922, + "step": 11120 + }, + { + "epoch": 0.552796265024337, + "grad_norm": 0.1142578125, + "learning_rate": 0.000755780272176418, + "loss": 0.6936, + "step": 11130 + }, + { + "epoch": 0.5532929373199563, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007557405383927685, + "loss": 0.6968, + "step": 11140 + }, + { + "epoch": 0.5537896096155757, + "grad_norm": 0.12109375, + "learning_rate": 0.000755700804609119, + "loss": 0.6866, + "step": 11150 + }, + { + "epoch": 0.554286281911195, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007556610708254693, + "loss": 0.6958, + "step": 11160 + }, + { + "epoch": 0.5547829542068143, + "grad_norm": 0.130859375, + "learning_rate": 0.0007556213370418198, + "loss": 0.6782, + "step": 11170 + }, + { + "epoch": 0.5552796265024337, + "grad_norm": 0.130859375, + "learning_rate": 0.0007555816032581703, + "loss": 0.6872, + "step": 11180 + }, + { + "epoch": 0.555776298798053, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007555418694745208, + "loss": 0.6499, + "step": 11190 + }, + { + "epoch": 0.5562729710936724, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007555021356908712, + "loss": 0.6933, + "step": 11200 + }, + { + "epoch": 0.5567696433892917, + "grad_norm": 0.12451171875, + "learning_rate": 0.0007554624019072216, + "loss": 0.7007, + "step": 11210 + }, + { + "epoch": 0.5572663156849111, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007554226681235722, + "loss": 0.7212, + "step": 11220 + }, + { + "epoch": 0.5577629879805305, + "grad_norm": 0.1240234375, + "learning_rate": 0.0007553829343399226, + "loss": 0.6869, + "step": 11230 + }, + { + "epoch": 0.5582596602761498, + "grad_norm": 0.12353515625, + "learning_rate": 0.000755343200556273, + "loss": 0.7108, + "step": 11240 + }, + { + "epoch": 0.5587563325717692, + "grad_norm": 0.126953125, + "learning_rate": 0.0007553034667726235, + "loss": 0.7133, + "step": 11250 + }, + { + "epoch": 0.5592530048673885, + "grad_norm": 0.125, + "learning_rate": 0.0007552637329889739, + "loss": 0.6634, + "step": 11260 + }, + { + "epoch": 0.5597496771630078, + "grad_norm": 0.1201171875, + "learning_rate": 0.0007552239992053243, + "loss": 0.7046, + "step": 11270 + }, + { + "epoch": 0.5602463494586272, + "grad_norm": 0.126953125, + "learning_rate": 0.0007551842654216749, + "loss": 0.6988, + "step": 11280 + }, + { + "epoch": 0.5607430217542465, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007551445316380252, + "loss": 0.681, + "step": 11290 + }, + { + "epoch": 0.5612396940498658, + "grad_norm": 0.12890625, + "learning_rate": 0.0007551047978543757, + "loss": 0.7067, + "step": 11300 + }, + { + "epoch": 0.5617363663454853, + "grad_norm": 0.11328125, + "learning_rate": 0.0007550650640707262, + "loss": 0.6846, + "step": 11310 + }, + { + "epoch": 0.5622330386411046, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007550253302870765, + "loss": 0.6974, + "step": 11320 + }, + { + "epoch": 0.562729710936724, + "grad_norm": 0.1337890625, + "learning_rate": 0.0007549855965034271, + "loss": 0.6877, + "step": 11330 + }, + { + "epoch": 0.5632263832323433, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007549458627197776, + "loss": 0.6674, + "step": 11340 + }, + { + "epoch": 0.5637230555279626, + "grad_norm": 0.12255859375, + "learning_rate": 0.000754906128936128, + "loss": 0.6544, + "step": 11350 + }, + { + "epoch": 0.564219727823582, + "grad_norm": 0.123046875, + "learning_rate": 0.0007548663951524784, + "loss": 0.6775, + "step": 11360 + }, + { + "epoch": 0.5647164001192013, + "grad_norm": 0.1201171875, + "learning_rate": 0.0007548266613688288, + "loss": 0.6838, + "step": 11370 + }, + { + "epoch": 0.5652130724148207, + "grad_norm": 0.14453125, + "learning_rate": 0.0007547869275851794, + "loss": 0.6775, + "step": 11380 + }, + { + "epoch": 0.56570974471044, + "grad_norm": 0.11181640625, + "learning_rate": 0.0007547471938015298, + "loss": 0.6907, + "step": 11390 + }, + { + "epoch": 0.5662064170060594, + "grad_norm": 0.119140625, + "learning_rate": 0.0007547074600178802, + "loss": 0.669, + "step": 11400 + }, + { + "epoch": 0.5667030893016788, + "grad_norm": 0.11865234375, + "learning_rate": 0.0007546677262342307, + "loss": 0.681, + "step": 11410 + }, + { + "epoch": 0.5671997615972981, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007546279924505811, + "loss": 0.685, + "step": 11420 + }, + { + "epoch": 0.5676964338929175, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007545882586669316, + "loss": 0.7101, + "step": 11430 + }, + { + "epoch": 0.5681931061885368, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007545485248832821, + "loss": 0.7297, + "step": 11440 + }, + { + "epoch": 0.5686897784841561, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007545087910996325, + "loss": 0.7116, + "step": 11450 + }, + { + "epoch": 0.5691864507797755, + "grad_norm": 0.123046875, + "learning_rate": 0.0007544690573159829, + "loss": 0.6823, + "step": 11460 + }, + { + "epoch": 0.5696831230753948, + "grad_norm": 0.1201171875, + "learning_rate": 0.0007544293235323334, + "loss": 0.7039, + "step": 11470 + }, + { + "epoch": 0.5701797953710142, + "grad_norm": 0.111328125, + "learning_rate": 0.0007543895897486839, + "loss": 0.6877, + "step": 11480 + }, + { + "epoch": 0.5706764676666336, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007543498559650343, + "loss": 0.7188, + "step": 11490 + }, + { + "epoch": 0.5711731399622529, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007543101221813848, + "loss": 0.6998, + "step": 11500 + }, + { + "epoch": 0.5716698122578723, + "grad_norm": 0.1689453125, + "learning_rate": 0.0007542703883977352, + "loss": 0.6943, + "step": 11510 + }, + { + "epoch": 0.5721664845534916, + "grad_norm": 0.12353515625, + "learning_rate": 0.0007542306546140856, + "loss": 0.6909, + "step": 11520 + }, + { + "epoch": 0.572663156849111, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007541909208304362, + "loss": 0.6551, + "step": 11530 + }, + { + "epoch": 0.5731598291447303, + "grad_norm": 0.15234375, + "learning_rate": 0.0007541511870467866, + "loss": 0.6907, + "step": 11540 + }, + { + "epoch": 0.5736565014403496, + "grad_norm": 0.12353515625, + "learning_rate": 0.000754111453263137, + "loss": 0.6967, + "step": 11550 + }, + { + "epoch": 0.574153173735969, + "grad_norm": 0.11865234375, + "learning_rate": 0.0007540717194794874, + "loss": 0.6866, + "step": 11560 + }, + { + "epoch": 0.5746498460315883, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007540319856958379, + "loss": 0.6793, + "step": 11570 + }, + { + "epoch": 0.5751465183272078, + "grad_norm": 0.1240234375, + "learning_rate": 0.0007539922519121884, + "loss": 0.71, + "step": 11580 + }, + { + "epoch": 0.5756431906228271, + "grad_norm": 0.12158203125, + "learning_rate": 0.0007539525181285388, + "loss": 0.6951, + "step": 11590 + }, + { + "epoch": 0.5761398629184464, + "grad_norm": 0.1201171875, + "learning_rate": 0.0007539127843448893, + "loss": 0.6879, + "step": 11600 + }, + { + "epoch": 0.5766365352140658, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007538730505612397, + "loss": 0.6827, + "step": 11610 + }, + { + "epoch": 0.5771332075096851, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007538333167775901, + "loss": 0.666, + "step": 11620 + }, + { + "epoch": 0.5776298798053044, + "grad_norm": 0.11572265625, + "learning_rate": 0.0007537935829939407, + "loss": 0.6948, + "step": 11630 + }, + { + "epoch": 0.5781265521009238, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007537538492102911, + "loss": 0.6813, + "step": 11640 + }, + { + "epoch": 0.5786232243965431, + "grad_norm": 0.1181640625, + "learning_rate": 0.0007537141154266415, + "loss": 0.6757, + "step": 11650 + }, + { + "epoch": 0.5791198966921625, + "grad_norm": 0.11865234375, + "learning_rate": 0.000753674381642992, + "loss": 0.7123, + "step": 11660 + }, + { + "epoch": 0.5796165689877819, + "grad_norm": 0.1328125, + "learning_rate": 0.0007536346478593424, + "loss": 0.7048, + "step": 11670 + }, + { + "epoch": 0.5801132412834012, + "grad_norm": 0.11181640625, + "learning_rate": 0.0007535949140756929, + "loss": 0.6927, + "step": 11680 + }, + { + "epoch": 0.5806099135790206, + "grad_norm": 0.119140625, + "learning_rate": 0.0007535551802920434, + "loss": 0.6679, + "step": 11690 + }, + { + "epoch": 0.5811065858746399, + "grad_norm": 0.1201171875, + "learning_rate": 0.0007535154465083938, + "loss": 0.729, + "step": 11700 + }, + { + "epoch": 0.5816032581702593, + "grad_norm": 0.111328125, + "learning_rate": 0.0007534757127247442, + "loss": 0.7099, + "step": 11710 + }, + { + "epoch": 0.5820999304658786, + "grad_norm": 0.1240234375, + "learning_rate": 0.0007534359789410947, + "loss": 0.6524, + "step": 11720 + }, + { + "epoch": 0.5825966027614979, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007533962451574452, + "loss": 0.7253, + "step": 11730 + }, + { + "epoch": 0.5830932750571173, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007533565113737956, + "loss": 0.6756, + "step": 11740 + }, + { + "epoch": 0.5835899473527366, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007533167775901461, + "loss": 0.6726, + "step": 11750 + }, + { + "epoch": 0.5840866196483561, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007532770438064965, + "loss": 0.6854, + "step": 11760 + }, + { + "epoch": 0.5845832919439754, + "grad_norm": 0.1181640625, + "learning_rate": 0.0007532373100228469, + "loss": 0.6881, + "step": 11770 + }, + { + "epoch": 0.5850799642395947, + "grad_norm": 0.1201171875, + "learning_rate": 0.0007531975762391974, + "loss": 0.7094, + "step": 11780 + }, + { + "epoch": 0.5855766365352141, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007531578424555479, + "loss": 0.6962, + "step": 11790 + }, + { + "epoch": 0.5860733088308334, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007531181086718984, + "loss": 0.7148, + "step": 11800 + }, + { + "epoch": 0.5865699811264528, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007530783748882487, + "loss": 0.6397, + "step": 11810 + }, + { + "epoch": 0.5870666534220721, + "grad_norm": 0.154296875, + "learning_rate": 0.0007530386411045992, + "loss": 0.6775, + "step": 11820 + }, + { + "epoch": 0.5875633257176914, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007529989073209497, + "loss": 0.7079, + "step": 11830 + }, + { + "epoch": 0.5880599980133108, + "grad_norm": 0.134765625, + "learning_rate": 0.0007529591735373001, + "loss": 0.7033, + "step": 11840 + }, + { + "epoch": 0.5885566703089302, + "grad_norm": 0.14453125, + "learning_rate": 0.0007529194397536506, + "loss": 0.6891, + "step": 11850 + }, + { + "epoch": 0.5890533426045496, + "grad_norm": 0.1328125, + "learning_rate": 0.000752879705970001, + "loss": 0.6991, + "step": 11860 + }, + { + "epoch": 0.5895500149001689, + "grad_norm": 0.126953125, + "learning_rate": 0.0007528399721863514, + "loss": 0.7138, + "step": 11870 + }, + { + "epoch": 0.5900466871957882, + "grad_norm": 0.12353515625, + "learning_rate": 0.000752800238402702, + "loss": 0.694, + "step": 11880 + }, + { + "epoch": 0.5905433594914076, + "grad_norm": 0.115234375, + "learning_rate": 0.0007527605046190524, + "loss": 0.6693, + "step": 11890 + }, + { + "epoch": 0.5910400317870269, + "grad_norm": 0.12451171875, + "learning_rate": 0.0007527207708354028, + "loss": 0.6626, + "step": 11900 + }, + { + "epoch": 0.5915367040826462, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007526810370517533, + "loss": 0.6696, + "step": 11910 + }, + { + "epoch": 0.5920333763782656, + "grad_norm": 0.138671875, + "learning_rate": 0.0007526413032681037, + "loss": 0.7056, + "step": 11920 + }, + { + "epoch": 0.5925300486738849, + "grad_norm": 0.15234375, + "learning_rate": 0.0007526015694844542, + "loss": 0.7229, + "step": 11930 + }, + { + "epoch": 0.5930267209695044, + "grad_norm": 0.130859375, + "learning_rate": 0.0007525618357008047, + "loss": 0.6533, + "step": 11940 + }, + { + "epoch": 0.5935233932651237, + "grad_norm": 0.119140625, + "learning_rate": 0.0007525221019171551, + "loss": 0.6976, + "step": 11950 + }, + { + "epoch": 0.594020065560743, + "grad_norm": 0.12353515625, + "learning_rate": 0.0007524823681335056, + "loss": 0.6907, + "step": 11960 + }, + { + "epoch": 0.5945167378563624, + "grad_norm": 0.12109375, + "learning_rate": 0.0007524426343498559, + "loss": 0.6898, + "step": 11970 + }, + { + "epoch": 0.5950134101519817, + "grad_norm": 0.12158203125, + "learning_rate": 0.0007524029005662065, + "loss": 0.6755, + "step": 11980 + }, + { + "epoch": 0.5955100824476011, + "grad_norm": 0.1162109375, + "learning_rate": 0.000752363166782557, + "loss": 0.7117, + "step": 11990 + }, + { + "epoch": 0.5960067547432204, + "grad_norm": 0.12109375, + "learning_rate": 0.0007523234329989073, + "loss": 0.6984, + "step": 12000 + }, + { + "epoch": 0.5965034270388397, + "grad_norm": 0.15234375, + "learning_rate": 0.0007522836992152578, + "loss": 0.693, + "step": 12010 + }, + { + "epoch": 0.5970000993344591, + "grad_norm": 0.11572265625, + "learning_rate": 0.0007522439654316083, + "loss": 0.6968, + "step": 12020 + }, + { + "epoch": 0.5974967716300785, + "grad_norm": 0.11181640625, + "learning_rate": 0.0007522042316479587, + "loss": 0.726, + "step": 12030 + }, + { + "epoch": 0.5979934439256979, + "grad_norm": 0.109375, + "learning_rate": 0.0007521644978643092, + "loss": 0.6597, + "step": 12040 + }, + { + "epoch": 0.5984901162213172, + "grad_norm": 0.1396484375, + "learning_rate": 0.0007521247640806596, + "loss": 0.6991, + "step": 12050 + }, + { + "epoch": 0.5989867885169365, + "grad_norm": 0.11767578125, + "learning_rate": 0.00075208503029701, + "loss": 0.69, + "step": 12060 + }, + { + "epoch": 0.5994834608125559, + "grad_norm": 0.1357421875, + "learning_rate": 0.0007520452965133605, + "loss": 0.6338, + "step": 12070 + }, + { + "epoch": 0.5999801331081752, + "grad_norm": 0.123046875, + "learning_rate": 0.000752005562729711, + "loss": 0.6749, + "step": 12080 + }, + { + "epoch": 0.6004768054037946, + "grad_norm": 0.109375, + "learning_rate": 0.0007519658289460615, + "loss": 0.6509, + "step": 12090 + }, + { + "epoch": 0.6009734776994139, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007519260951624119, + "loss": 0.6835, + "step": 12100 + }, + { + "epoch": 0.6014701499950332, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007518863613787623, + "loss": 0.6922, + "step": 12110 + }, + { + "epoch": 0.6019668222906527, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007518466275951128, + "loss": 0.6965, + "step": 12120 + }, + { + "epoch": 0.602463494586272, + "grad_norm": 0.11865234375, + "learning_rate": 0.0007518068938114633, + "loss": 0.6965, + "step": 12130 + }, + { + "epoch": 0.6029601668818914, + "grad_norm": 0.1435546875, + "learning_rate": 0.0007517671600278137, + "loss": 0.6884, + "step": 12140 + }, + { + "epoch": 0.6034568391775107, + "grad_norm": 0.11572265625, + "learning_rate": 0.0007517274262441642, + "loss": 0.6822, + "step": 12150 + }, + { + "epoch": 0.60395351147313, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007516876924605145, + "loss": 0.6637, + "step": 12160 + }, + { + "epoch": 0.6044501837687494, + "grad_norm": 0.11962890625, + "learning_rate": 0.000751647958676865, + "loss": 0.6784, + "step": 12170 + }, + { + "epoch": 0.6049468560643687, + "grad_norm": 0.1181640625, + "learning_rate": 0.0007516082248932156, + "loss": 0.6741, + "step": 12180 + }, + { + "epoch": 0.605443528359988, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007515684911095659, + "loss": 0.6529, + "step": 12190 + }, + { + "epoch": 0.6059402006556074, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007515287573259164, + "loss": 0.6806, + "step": 12200 + }, + { + "epoch": 0.6064368729512268, + "grad_norm": 0.111328125, + "learning_rate": 0.0007514890235422669, + "loss": 0.6758, + "step": 12210 + }, + { + "epoch": 0.6069335452468462, + "grad_norm": 0.1357421875, + "learning_rate": 0.0007514492897586172, + "loss": 0.7012, + "step": 12220 + }, + { + "epoch": 0.6074302175424655, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007514095559749678, + "loss": 0.7264, + "step": 12230 + }, + { + "epoch": 0.6079268898380848, + "grad_norm": 0.1240234375, + "learning_rate": 0.0007513698221913182, + "loss": 0.6513, + "step": 12240 + }, + { + "epoch": 0.6084235621337042, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007513300884076687, + "loss": 0.6646, + "step": 12250 + }, + { + "epoch": 0.6089202344293235, + "grad_norm": 0.11865234375, + "learning_rate": 0.0007512903546240191, + "loss": 0.6786, + "step": 12260 + }, + { + "epoch": 0.6094169067249429, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007512506208403695, + "loss": 0.6541, + "step": 12270 + }, + { + "epoch": 0.6099135790205622, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007512108870567201, + "loss": 0.6807, + "step": 12280 + }, + { + "epoch": 0.6104102513161815, + "grad_norm": 0.12890625, + "learning_rate": 0.0007511711532730705, + "loss": 0.6737, + "step": 12290 + }, + { + "epoch": 0.6109069236118009, + "grad_norm": 0.11572265625, + "learning_rate": 0.0007511314194894209, + "loss": 0.6765, + "step": 12300 + }, + { + "epoch": 0.6114035959074203, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007510916857057714, + "loss": 0.7203, + "step": 12310 + }, + { + "epoch": 0.6119002682030397, + "grad_norm": 0.12255859375, + "learning_rate": 0.0007510519519221218, + "loss": 0.665, + "step": 12320 + }, + { + "epoch": 0.612396940498659, + "grad_norm": 0.1181640625, + "learning_rate": 0.0007510122181384722, + "loss": 0.6752, + "step": 12330 + }, + { + "epoch": 0.6128936127942783, + "grad_norm": 0.140625, + "learning_rate": 0.0007509724843548228, + "loss": 0.6806, + "step": 12340 + }, + { + "epoch": 0.6133902850898977, + "grad_norm": 0.11181640625, + "learning_rate": 0.0007509327505711731, + "loss": 0.6697, + "step": 12350 + }, + { + "epoch": 0.613886957385517, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007508930167875236, + "loss": 0.6807, + "step": 12360 + }, + { + "epoch": 0.6143836296811364, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007508532830038741, + "loss": 0.6599, + "step": 12370 + }, + { + "epoch": 0.6148803019767557, + "grad_norm": 0.125, + "learning_rate": 0.0007508135492202244, + "loss": 0.6782, + "step": 12380 + }, + { + "epoch": 0.615376974272375, + "grad_norm": 0.126953125, + "learning_rate": 0.000750773815436575, + "loss": 0.6866, + "step": 12390 + }, + { + "epoch": 0.6158736465679945, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007507340816529255, + "loss": 0.6832, + "step": 12400 + }, + { + "epoch": 0.6163703188636138, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007506943478692759, + "loss": 0.6741, + "step": 12410 + }, + { + "epoch": 0.6168669911592332, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007506546140856263, + "loss": 0.6638, + "step": 12420 + }, + { + "epoch": 0.6173636634548525, + "grad_norm": 0.134765625, + "learning_rate": 0.0007506148803019767, + "loss": 0.6734, + "step": 12430 + }, + { + "epoch": 0.6178603357504718, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007505751465183273, + "loss": 0.6718, + "step": 12440 + }, + { + "epoch": 0.6183570080460912, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007505354127346777, + "loss": 0.7069, + "step": 12450 + }, + { + "epoch": 0.6188536803417105, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007504956789510281, + "loss": 0.7089, + "step": 12460 + }, + { + "epoch": 0.6193503526373298, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007504559451673786, + "loss": 0.6918, + "step": 12470 + }, + { + "epoch": 0.6198470249329492, + "grad_norm": 0.1591796875, + "learning_rate": 0.000750416211383729, + "loss": 0.695, + "step": 12480 + }, + { + "epoch": 0.6203436972285686, + "grad_norm": 0.134765625, + "learning_rate": 0.0007503764776000795, + "loss": 0.6542, + "step": 12490 + }, + { + "epoch": 0.620840369524188, + "grad_norm": 0.14453125, + "learning_rate": 0.00075033674381643, + "loss": 0.7126, + "step": 12500 + }, + { + "epoch": 0.6213370418198073, + "grad_norm": 0.1171875, + "learning_rate": 0.0007502970100327804, + "loss": 0.6595, + "step": 12510 + }, + { + "epoch": 0.6218337141154267, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007502572762491308, + "loss": 0.6853, + "step": 12520 + }, + { + "epoch": 0.622330386411046, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007502175424654814, + "loss": 0.6536, + "step": 12530 + }, + { + "epoch": 0.6228270587066653, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007501778086818318, + "loss": 0.6682, + "step": 12540 + }, + { + "epoch": 0.6233237310022847, + "grad_norm": 0.11181640625, + "learning_rate": 0.0007501380748981822, + "loss": 0.6557, + "step": 12550 + }, + { + "epoch": 0.623820403297904, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007500983411145327, + "loss": 0.6871, + "step": 12560 + }, + { + "epoch": 0.6243170755935233, + "grad_norm": 0.12451171875, + "learning_rate": 0.0007500586073308831, + "loss": 0.6744, + "step": 12570 + }, + { + "epoch": 0.6248137478891428, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007500188735472335, + "loss": 0.6408, + "step": 12580 + }, + { + "epoch": 0.6253104201847621, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007499791397635841, + "loss": 0.6815, + "step": 12590 + }, + { + "epoch": 0.6258070924803815, + "grad_norm": 0.10693359375, + "learning_rate": 0.0007499394059799345, + "loss": 0.6647, + "step": 12600 + }, + { + "epoch": 0.6263037647760008, + "grad_norm": 0.12255859375, + "learning_rate": 0.0007498996721962849, + "loss": 0.666, + "step": 12610 + }, + { + "epoch": 0.6268004370716201, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007498599384126353, + "loss": 0.6886, + "step": 12620 + }, + { + "epoch": 0.6272971093672395, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007498202046289858, + "loss": 0.7055, + "step": 12630 + }, + { + "epoch": 0.6277937816628588, + "grad_norm": 0.12109375, + "learning_rate": 0.0007497804708453363, + "loss": 0.6681, + "step": 12640 + }, + { + "epoch": 0.6282904539584782, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007497407370616867, + "loss": 0.6827, + "step": 12650 + }, + { + "epoch": 0.6287871262540975, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007497010032780372, + "loss": 0.6637, + "step": 12660 + }, + { + "epoch": 0.6292837985497169, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007496612694943876, + "loss": 0.6761, + "step": 12670 + }, + { + "epoch": 0.6297804708453363, + "grad_norm": 0.1162109375, + "learning_rate": 0.000749621535710738, + "loss": 0.6713, + "step": 12680 + }, + { + "epoch": 0.6302771431409556, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007495818019270886, + "loss": 0.6797, + "step": 12690 + }, + { + "epoch": 0.630773815436575, + "grad_norm": 0.11474609375, + "learning_rate": 0.000749542068143439, + "loss": 0.6841, + "step": 12700 + }, + { + "epoch": 0.6312704877321943, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007495023343597894, + "loss": 0.6869, + "step": 12710 + }, + { + "epoch": 0.6317671600278136, + "grad_norm": 0.140625, + "learning_rate": 0.0007494626005761399, + "loss": 0.6889, + "step": 12720 + }, + { + "epoch": 0.632263832323433, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007494228667924903, + "loss": 0.7024, + "step": 12730 + }, + { + "epoch": 0.6327605046190523, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007493831330088408, + "loss": 0.6904, + "step": 12740 + }, + { + "epoch": 0.6332571769146716, + "grad_norm": 0.1337890625, + "learning_rate": 0.0007493433992251913, + "loss": 0.6811, + "step": 12750 + }, + { + "epoch": 0.6337538492102911, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007493036654415417, + "loss": 0.6713, + "step": 12760 + }, + { + "epoch": 0.6342505215059104, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007492639316578921, + "loss": 0.6732, + "step": 12770 + }, + { + "epoch": 0.6347471938015298, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007492241978742426, + "loss": 0.6961, + "step": 12780 + }, + { + "epoch": 0.6352438660971491, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007491844640905931, + "loss": 0.672, + "step": 12790 + }, + { + "epoch": 0.6357405383927685, + "grad_norm": 0.1171875, + "learning_rate": 0.0007491447303069435, + "loss": 0.7121, + "step": 12800 + }, + { + "epoch": 0.6362372106883878, + "grad_norm": 0.10400390625, + "learning_rate": 0.000749104996523294, + "loss": 0.6957, + "step": 12810 + }, + { + "epoch": 0.6367338829840071, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007490652627396444, + "loss": 0.6893, + "step": 12820 + }, + { + "epoch": 0.6372305552796265, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007490255289559948, + "loss": 0.6501, + "step": 12830 + }, + { + "epoch": 0.6377272275752458, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007489857951723453, + "loss": 0.6912, + "step": 12840 + }, + { + "epoch": 0.6382238998708653, + "grad_norm": 0.1181640625, + "learning_rate": 0.0007489460613886958, + "loss": 0.6945, + "step": 12850 + }, + { + "epoch": 0.6387205721664846, + "grad_norm": 0.130859375, + "learning_rate": 0.0007489063276050463, + "loss": 0.66, + "step": 12860 + }, + { + "epoch": 0.6392172444621039, + "grad_norm": 0.1171875, + "learning_rate": 0.0007488665938213966, + "loss": 0.7147, + "step": 12870 + }, + { + "epoch": 0.6397139167577233, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007488268600377471, + "loss": 0.668, + "step": 12880 + }, + { + "epoch": 0.6402105890533426, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007487871262540976, + "loss": 0.6798, + "step": 12890 + }, + { + "epoch": 0.6407072613489619, + "grad_norm": 0.11767578125, + "learning_rate": 0.000748747392470448, + "loss": 0.6702, + "step": 12900 + }, + { + "epoch": 0.6412039336445813, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007487076586867985, + "loss": 0.661, + "step": 12910 + }, + { + "epoch": 0.6417006059402006, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007486679249031489, + "loss": 0.6679, + "step": 12920 + }, + { + "epoch": 0.64219727823582, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007486281911194993, + "loss": 0.659, + "step": 12930 + }, + { + "epoch": 0.6426939505314394, + "grad_norm": 0.10205078125, + "learning_rate": 0.0007485884573358499, + "loss": 0.6966, + "step": 12940 + }, + { + "epoch": 0.6431906228270587, + "grad_norm": 0.1328125, + "learning_rate": 0.0007485487235522003, + "loss": 0.7063, + "step": 12950 + }, + { + "epoch": 0.6436872951226781, + "grad_norm": 0.111328125, + "learning_rate": 0.0007485089897685507, + "loss": 0.6778, + "step": 12960 + }, + { + "epoch": 0.6441839674182974, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007484692559849012, + "loss": 0.7112, + "step": 12970 + }, + { + "epoch": 0.6446806397139168, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007484295222012516, + "loss": 0.7005, + "step": 12980 + }, + { + "epoch": 0.6451773120095361, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007483897884176022, + "loss": 0.657, + "step": 12990 + }, + { + "epoch": 0.6456739843051554, + "grad_norm": 0.140625, + "learning_rate": 0.0007483500546339526, + "loss": 0.6676, + "step": 13000 + }, + { + "epoch": 0.6461706566007748, + "grad_norm": 0.1123046875, + "learning_rate": 0.000748310320850303, + "loss": 0.6943, + "step": 13010 + }, + { + "epoch": 0.6466673288963941, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007482705870666535, + "loss": 0.6727, + "step": 13020 + }, + { + "epoch": 0.6471640011920136, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007482308532830038, + "loss": 0.6818, + "step": 13030 + }, + { + "epoch": 0.6476606734876329, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007481911194993544, + "loss": 0.6715, + "step": 13040 + }, + { + "epoch": 0.6481573457832522, + "grad_norm": 0.107421875, + "learning_rate": 0.0007481513857157049, + "loss": 0.6993, + "step": 13050 + }, + { + "epoch": 0.6486540180788716, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007481116519320552, + "loss": 0.6996, + "step": 13060 + }, + { + "epoch": 0.6491506903744909, + "grad_norm": 0.154296875, + "learning_rate": 0.0007480719181484057, + "loss": 0.6714, + "step": 13070 + }, + { + "epoch": 0.6496473626701103, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007480321843647562, + "loss": 0.702, + "step": 13080 + }, + { + "epoch": 0.6501440349657296, + "grad_norm": 0.11328125, + "learning_rate": 0.0007479924505811066, + "loss": 0.6975, + "step": 13090 + }, + { + "epoch": 0.6506407072613489, + "grad_norm": 0.12255859375, + "learning_rate": 0.0007479527167974571, + "loss": 0.7115, + "step": 13100 + }, + { + "epoch": 0.6511373795569683, + "grad_norm": 0.107421875, + "learning_rate": 0.0007479129830138075, + "loss": 0.7013, + "step": 13110 + }, + { + "epoch": 0.6516340518525877, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007478732492301579, + "loss": 0.6469, + "step": 13120 + }, + { + "epoch": 0.652130724148207, + "grad_norm": 0.1484375, + "learning_rate": 0.0007478335154465084, + "loss": 0.7094, + "step": 13130 + }, + { + "epoch": 0.6526273964438264, + "grad_norm": 0.134765625, + "learning_rate": 0.0007477937816628589, + "loss": 0.6609, + "step": 13140 + }, + { + "epoch": 0.6531240687394457, + "grad_norm": 0.11328125, + "learning_rate": 0.0007477540478792094, + "loss": 0.684, + "step": 13150 + }, + { + "epoch": 0.6536207410350651, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007477143140955598, + "loss": 0.6833, + "step": 13160 + }, + { + "epoch": 0.6541174133306844, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007476745803119102, + "loss": 0.6783, + "step": 13170 + }, + { + "epoch": 0.6546140856263037, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007476348465282607, + "loss": 0.6597, + "step": 13180 + }, + { + "epoch": 0.6551107579219231, + "grad_norm": 0.123046875, + "learning_rate": 0.0007475951127446112, + "loss": 0.6951, + "step": 13190 + }, + { + "epoch": 0.6556074302175424, + "grad_norm": 0.1240234375, + "learning_rate": 0.0007475553789609616, + "loss": 0.6596, + "step": 13200 + }, + { + "epoch": 0.6561041025131619, + "grad_norm": 0.12353515625, + "learning_rate": 0.0007475156451773121, + "loss": 0.6579, + "step": 13210 + }, + { + "epoch": 0.6566007748087812, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007474759113936624, + "loss": 0.669, + "step": 13220 + }, + { + "epoch": 0.6570974471044005, + "grad_norm": 0.1181640625, + "learning_rate": 0.0007474361776100129, + "loss": 0.6524, + "step": 13230 + }, + { + "epoch": 0.6575941194000199, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007473964438263635, + "loss": 0.6781, + "step": 13240 + }, + { + "epoch": 0.6580907916956392, + "grad_norm": 0.111328125, + "learning_rate": 0.0007473567100427138, + "loss": 0.6868, + "step": 13250 + }, + { + "epoch": 0.6585874639912586, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007473169762590643, + "loss": 0.6969, + "step": 13260 + }, + { + "epoch": 0.6590841362868779, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007472772424754148, + "loss": 0.7009, + "step": 13270 + }, + { + "epoch": 0.6595808085824972, + "grad_norm": 0.109375, + "learning_rate": 0.0007472375086917652, + "loss": 0.681, + "step": 13280 + }, + { + "epoch": 0.6600774808781166, + "grad_norm": 0.1015625, + "learning_rate": 0.0007471977749081157, + "loss": 0.6342, + "step": 13290 + }, + { + "epoch": 0.660574153173736, + "grad_norm": 0.111328125, + "learning_rate": 0.0007471580411244661, + "loss": 0.6394, + "step": 13300 + }, + { + "epoch": 0.6610708254693554, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007471183073408166, + "loss": 0.7037, + "step": 13310 + }, + { + "epoch": 0.6615674977649747, + "grad_norm": 0.12109375, + "learning_rate": 0.000747078573557167, + "loss": 0.6478, + "step": 13320 + }, + { + "epoch": 0.662064170060594, + "grad_norm": 0.11865234375, + "learning_rate": 0.0007470388397735174, + "loss": 0.6727, + "step": 13330 + }, + { + "epoch": 0.6625608423562134, + "grad_norm": 0.123046875, + "learning_rate": 0.000746999105989868, + "loss": 0.6829, + "step": 13340 + }, + { + "epoch": 0.6630575146518327, + "grad_norm": 0.125, + "learning_rate": 0.0007469593722062184, + "loss": 0.6971, + "step": 13350 + }, + { + "epoch": 0.663554186947452, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007469196384225688, + "loss": 0.6622, + "step": 13360 + }, + { + "epoch": 0.6640508592430714, + "grad_norm": 0.10546875, + "learning_rate": 0.0007468799046389193, + "loss": 0.6789, + "step": 13370 + }, + { + "epoch": 0.6645475315386907, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007468401708552697, + "loss": 0.6986, + "step": 13380 + }, + { + "epoch": 0.6650442038343102, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007468004370716202, + "loss": 0.7204, + "step": 13390 + }, + { + "epoch": 0.6655408761299295, + "grad_norm": 0.111328125, + "learning_rate": 0.0007467607032879707, + "loss": 0.6522, + "step": 13400 + }, + { + "epoch": 0.6660375484255489, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007467209695043211, + "loss": 0.6546, + "step": 13410 + }, + { + "epoch": 0.6665342207211682, + "grad_norm": 0.10693359375, + "learning_rate": 0.0007466812357206715, + "loss": 0.6535, + "step": 13420 + }, + { + "epoch": 0.6670308930167875, + "grad_norm": 0.1123046875, + "learning_rate": 0.000746641501937022, + "loss": 0.6729, + "step": 13430 + }, + { + "epoch": 0.6675275653124069, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007466017681533725, + "loss": 0.6886, + "step": 13440 + }, + { + "epoch": 0.6680242376080262, + "grad_norm": 0.134765625, + "learning_rate": 0.0007465620343697229, + "loss": 0.6564, + "step": 13450 + }, + { + "epoch": 0.6685209099036455, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007465223005860734, + "loss": 0.6938, + "step": 13460 + }, + { + "epoch": 0.6690175821992649, + "grad_norm": 0.1357421875, + "learning_rate": 0.0007464825668024238, + "loss": 0.6715, + "step": 13470 + }, + { + "epoch": 0.6695142544948842, + "grad_norm": 0.11865234375, + "learning_rate": 0.0007464428330187742, + "loss": 0.6789, + "step": 13480 + }, + { + "epoch": 0.6700109267905037, + "grad_norm": 0.1337890625, + "learning_rate": 0.0007464030992351247, + "loss": 0.6537, + "step": 13490 + }, + { + "epoch": 0.670507599086123, + "grad_norm": 0.109375, + "learning_rate": 0.0007463633654514752, + "loss": 0.6739, + "step": 13500 + }, + { + "epoch": 0.6710042713817423, + "grad_norm": 0.103515625, + "learning_rate": 0.0007463236316678256, + "loss": 0.6662, + "step": 13510 + }, + { + "epoch": 0.6715009436773617, + "grad_norm": 0.10888671875, + "learning_rate": 0.000746283897884176, + "loss": 0.6451, + "step": 13520 + }, + { + "epoch": 0.671997615972981, + "grad_norm": 0.12158203125, + "learning_rate": 0.0007462441641005265, + "loss": 0.6754, + "step": 13530 + }, + { + "epoch": 0.6724942882686004, + "grad_norm": 0.12255859375, + "learning_rate": 0.000746204430316877, + "loss": 0.6632, + "step": 13540 + }, + { + "epoch": 0.6729909605642197, + "grad_norm": 0.119140625, + "learning_rate": 0.0007461646965332274, + "loss": 0.7027, + "step": 13550 + }, + { + "epoch": 0.673487632859839, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007461249627495779, + "loss": 0.6755, + "step": 13560 + }, + { + "epoch": 0.6739843051554584, + "grad_norm": 0.109375, + "learning_rate": 0.0007460852289659283, + "loss": 0.688, + "step": 13570 + }, + { + "epoch": 0.6744809774510778, + "grad_norm": 0.11865234375, + "learning_rate": 0.0007460454951822787, + "loss": 0.69, + "step": 13580 + }, + { + "epoch": 0.6749776497466972, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007460057613986293, + "loss": 0.7026, + "step": 13590 + }, + { + "epoch": 0.6754743220423165, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007459660276149797, + "loss": 0.6592, + "step": 13600 + }, + { + "epoch": 0.6759709943379358, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007459262938313301, + "loss": 0.6609, + "step": 13610 + }, + { + "epoch": 0.6764676666335552, + "grad_norm": 0.125, + "learning_rate": 0.0007458865600476806, + "loss": 0.6837, + "step": 13620 + }, + { + "epoch": 0.6769643389291745, + "grad_norm": 0.11181640625, + "learning_rate": 0.000745846826264031, + "loss": 0.6865, + "step": 13630 + }, + { + "epoch": 0.6774610112247939, + "grad_norm": 0.10107421875, + "learning_rate": 0.0007458070924803814, + "loss": 0.6354, + "step": 13640 + }, + { + "epoch": 0.6779576835204132, + "grad_norm": 0.12060546875, + "learning_rate": 0.000745767358696732, + "loss": 0.7152, + "step": 13650 + }, + { + "epoch": 0.6784543558160325, + "grad_norm": 0.12890625, + "learning_rate": 0.0007457276249130824, + "loss": 0.6739, + "step": 13660 + }, + { + "epoch": 0.678951028111652, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007456878911294328, + "loss": 0.6947, + "step": 13670 + }, + { + "epoch": 0.6794477004072713, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007456481573457833, + "loss": 0.6693, + "step": 13680 + }, + { + "epoch": 0.6799443727028907, + "grad_norm": 0.13671875, + "learning_rate": 0.0007456084235621338, + "loss": 0.6785, + "step": 13690 + }, + { + "epoch": 0.68044104499851, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007455686897784842, + "loss": 0.6698, + "step": 13700 + }, + { + "epoch": 0.6809377172941293, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007455289559948346, + "loss": 0.6377, + "step": 13710 + }, + { + "epoch": 0.6814343895897487, + "grad_norm": 0.130859375, + "learning_rate": 0.0007454892222111851, + "loss": 0.6667, + "step": 13720 + }, + { + "epoch": 0.681931061885368, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007454494884275356, + "loss": 0.7018, + "step": 13730 + }, + { + "epoch": 0.6824277341809873, + "grad_norm": 0.10888671875, + "learning_rate": 0.000745409754643886, + "loss": 0.6803, + "step": 13740 + }, + { + "epoch": 0.6829244064766067, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007453700208602365, + "loss": 0.6521, + "step": 13750 + }, + { + "epoch": 0.6834210787722261, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007453302870765869, + "loss": 0.696, + "step": 13760 + }, + { + "epoch": 0.6839177510678455, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007452905532929373, + "loss": 0.6783, + "step": 13770 + }, + { + "epoch": 0.6844144233634648, + "grad_norm": 0.10693359375, + "learning_rate": 0.0007452508195092878, + "loss": 0.6695, + "step": 13780 + }, + { + "epoch": 0.6849110956590841, + "grad_norm": 0.111328125, + "learning_rate": 0.0007452110857256382, + "loss": 0.694, + "step": 13790 + }, + { + "epoch": 0.6854077679547035, + "grad_norm": 0.12158203125, + "learning_rate": 0.0007451713519419887, + "loss": 0.6664, + "step": 13800 + }, + { + "epoch": 0.6859044402503228, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007451316181583392, + "loss": 0.6734, + "step": 13810 + }, + { + "epoch": 0.6864011125459422, + "grad_norm": 0.111328125, + "learning_rate": 0.0007450918843746896, + "loss": 0.6455, + "step": 13820 + }, + { + "epoch": 0.6868977848415615, + "grad_norm": 0.10546875, + "learning_rate": 0.00074505215059104, + "loss": 0.6509, + "step": 13830 + }, + { + "epoch": 0.6873944571371808, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007450124168073906, + "loss": 0.6705, + "step": 13840 + }, + { + "epoch": 0.6878911294328003, + "grad_norm": 0.1171875, + "learning_rate": 0.000744972683023741, + "loss": 0.6952, + "step": 13850 + }, + { + "epoch": 0.6883878017284196, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007449329492400914, + "loss": 0.6787, + "step": 13860 + }, + { + "epoch": 0.688884474024039, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007448932154564419, + "loss": 0.6715, + "step": 13870 + }, + { + "epoch": 0.6893811463196583, + "grad_norm": 0.123046875, + "learning_rate": 0.0007448534816727923, + "loss": 0.6722, + "step": 13880 + }, + { + "epoch": 0.6898778186152776, + "grad_norm": 0.1669921875, + "learning_rate": 0.0007448137478891429, + "loss": 0.6656, + "step": 13890 + }, + { + "epoch": 0.690374490910897, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007447740141054932, + "loss": 0.6341, + "step": 13900 + }, + { + "epoch": 0.6908711632065163, + "grad_norm": 0.123046875, + "learning_rate": 0.0007447342803218437, + "loss": 0.687, + "step": 13910 + }, + { + "epoch": 0.6913678355021357, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007446945465381942, + "loss": 0.7099, + "step": 13920 + }, + { + "epoch": 0.691864507797755, + "grad_norm": 0.1337890625, + "learning_rate": 0.0007446548127545445, + "loss": 0.6738, + "step": 13930 + }, + { + "epoch": 0.6923611800933744, + "grad_norm": 0.111328125, + "learning_rate": 0.000744615078970895, + "loss": 0.6398, + "step": 13940 + }, + { + "epoch": 0.6928578523889938, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007445753451872456, + "loss": 0.66, + "step": 13950 + }, + { + "epoch": 0.6933545246846131, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007445356114035959, + "loss": 0.6886, + "step": 13960 + }, + { + "epoch": 0.6938511969802325, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007444958776199464, + "loss": 0.6649, + "step": 13970 + }, + { + "epoch": 0.6943478692758518, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007444561438362968, + "loss": 0.6863, + "step": 13980 + }, + { + "epoch": 0.6948445415714711, + "grad_norm": 0.126953125, + "learning_rate": 0.0007444164100526472, + "loss": 0.6929, + "step": 13990 + }, + { + "epoch": 0.6953412138670905, + "grad_norm": 0.107421875, + "learning_rate": 0.0007443766762689978, + "loss": 0.6631, + "step": 14000 + }, + { + "epoch": 0.6958378861627098, + "grad_norm": 0.134765625, + "learning_rate": 0.0007443369424853482, + "loss": 0.6998, + "step": 14010 + }, + { + "epoch": 0.6963345584583291, + "grad_norm": 0.1474609375, + "learning_rate": 0.0007442972087016986, + "loss": 0.6604, + "step": 14020 + }, + { + "epoch": 0.6968312307539486, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007442574749180491, + "loss": 0.6781, + "step": 14030 + }, + { + "epoch": 0.6973279030495679, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007442177411343995, + "loss": 0.653, + "step": 14040 + }, + { + "epoch": 0.6978245753451873, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007441780073507501, + "loss": 0.6524, + "step": 14050 + }, + { + "epoch": 0.6983212476408066, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007441382735671005, + "loss": 0.6796, + "step": 14060 + }, + { + "epoch": 0.698817919936426, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007440985397834509, + "loss": 0.6614, + "step": 14070 + }, + { + "epoch": 0.6993145922320453, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007440588059998014, + "loss": 0.6559, + "step": 14080 + }, + { + "epoch": 0.6998112645276646, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007440190722161517, + "loss": 0.6349, + "step": 14090 + }, + { + "epoch": 0.700307936823284, + "grad_norm": 0.1171875, + "learning_rate": 0.0007439793384325023, + "loss": 0.6741, + "step": 14100 + }, + { + "epoch": 0.7008046091189033, + "grad_norm": 0.1474609375, + "learning_rate": 0.0007439396046488528, + "loss": 0.6455, + "step": 14110 + }, + { + "epoch": 0.7013012814145227, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007438998708652031, + "loss": 0.6997, + "step": 14120 + }, + { + "epoch": 0.7017979537101421, + "grad_norm": 0.109375, + "learning_rate": 0.0007438601370815536, + "loss": 0.661, + "step": 14130 + }, + { + "epoch": 0.7022946260057614, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007438204032979041, + "loss": 0.6776, + "step": 14140 + }, + { + "epoch": 0.7027912983013808, + "grad_norm": 0.123046875, + "learning_rate": 0.0007437806695142545, + "loss": 0.6656, + "step": 14150 + }, + { + "epoch": 0.7032879705970001, + "grad_norm": 0.1064453125, + "learning_rate": 0.000743740935730605, + "loss": 0.6608, + "step": 14160 + }, + { + "epoch": 0.7037846428926194, + "grad_norm": 0.10693359375, + "learning_rate": 0.0007437012019469554, + "loss": 0.6501, + "step": 14170 + }, + { + "epoch": 0.7042813151882388, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007436614681633059, + "loss": 0.6619, + "step": 14180 + }, + { + "epoch": 0.7047779874838581, + "grad_norm": 0.125, + "learning_rate": 0.0007436217343796563, + "loss": 0.6832, + "step": 14190 + }, + { + "epoch": 0.7052746597794775, + "grad_norm": 0.111328125, + "learning_rate": 0.0007435820005960068, + "loss": 0.66, + "step": 14200 + }, + { + "epoch": 0.7057713320750969, + "grad_norm": 0.12109375, + "learning_rate": 0.0007435422668123573, + "loss": 0.6392, + "step": 14210 + }, + { + "epoch": 0.7062680043707162, + "grad_norm": 0.1171875, + "learning_rate": 0.0007435025330287077, + "loss": 0.6357, + "step": 14220 + }, + { + "epoch": 0.7067646766663356, + "grad_norm": 0.1171875, + "learning_rate": 0.0007434627992450581, + "loss": 0.6774, + "step": 14230 + }, + { + "epoch": 0.7072613489619549, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007434230654614086, + "loss": 0.63, + "step": 14240 + }, + { + "epoch": 0.7077580212575743, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007433833316777591, + "loss": 0.6643, + "step": 14250 + }, + { + "epoch": 0.7082546935531936, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007433435978941095, + "loss": 0.6845, + "step": 14260 + }, + { + "epoch": 0.7087513658488129, + "grad_norm": 0.1044921875, + "learning_rate": 0.00074330386411046, + "loss": 0.6639, + "step": 14270 + }, + { + "epoch": 0.7092480381444323, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007432641303268104, + "loss": 0.7032, + "step": 14280 + }, + { + "epoch": 0.7097447104400516, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007432243965431608, + "loss": 0.6655, + "step": 14290 + }, + { + "epoch": 0.7102413827356711, + "grad_norm": 0.158203125, + "learning_rate": 0.0007431846627595114, + "loss": 0.6785, + "step": 14300 + }, + { + "epoch": 0.7107380550312904, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007431449289758617, + "loss": 0.673, + "step": 14310 + }, + { + "epoch": 0.7112347273269097, + "grad_norm": 0.12255859375, + "learning_rate": 0.0007431051951922122, + "loss": 0.6534, + "step": 14320 + }, + { + "epoch": 0.7117313996225291, + "grad_norm": 0.109375, + "learning_rate": 0.0007430654614085627, + "loss": 0.6963, + "step": 14330 + }, + { + "epoch": 0.7122280719181484, + "grad_norm": 0.111328125, + "learning_rate": 0.0007430257276249131, + "loss": 0.6584, + "step": 14340 + }, + { + "epoch": 0.7127247442137677, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007429859938412636, + "loss": 0.6742, + "step": 14350 + }, + { + "epoch": 0.7132214165093871, + "grad_norm": 0.1025390625, + "learning_rate": 0.000742946260057614, + "loss": 0.6555, + "step": 14360 + }, + { + "epoch": 0.7137180888050064, + "grad_norm": 0.1376953125, + "learning_rate": 0.0007429065262739645, + "loss": 0.6705, + "step": 14370 + }, + { + "epoch": 0.7142147611006258, + "grad_norm": 0.10693359375, + "learning_rate": 0.0007428667924903149, + "loss": 0.661, + "step": 14380 + }, + { + "epoch": 0.7147114333962452, + "grad_norm": 0.11328125, + "learning_rate": 0.0007428270587066653, + "loss": 0.6886, + "step": 14390 + }, + { + "epoch": 0.7152081056918645, + "grad_norm": 0.10205078125, + "learning_rate": 0.0007427873249230159, + "loss": 0.6865, + "step": 14400 + }, + { + "epoch": 0.7157047779874839, + "grad_norm": 0.12353515625, + "learning_rate": 0.0007427475911393663, + "loss": 0.6566, + "step": 14410 + }, + { + "epoch": 0.7162014502831032, + "grad_norm": 0.1171875, + "learning_rate": 0.0007427078573557167, + "loss": 0.6595, + "step": 14420 + }, + { + "epoch": 0.7166981225787226, + "grad_norm": 0.1201171875, + "learning_rate": 0.0007426681235720672, + "loss": 0.6846, + "step": 14430 + }, + { + "epoch": 0.7171947948743419, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007426283897884176, + "loss": 0.6815, + "step": 14440 + }, + { + "epoch": 0.7176914671699612, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007425886560047681, + "loss": 0.6479, + "step": 14450 + }, + { + "epoch": 0.7181881394655806, + "grad_norm": 0.1181640625, + "learning_rate": 0.0007425489222211186, + "loss": 0.6771, + "step": 14460 + }, + { + "epoch": 0.7186848117611999, + "grad_norm": 0.1064453125, + "learning_rate": 0.000742509188437469, + "loss": 0.6768, + "step": 14470 + }, + { + "epoch": 0.7191814840568194, + "grad_norm": 0.12451171875, + "learning_rate": 0.0007424694546538194, + "loss": 0.6603, + "step": 14480 + }, + { + "epoch": 0.7196781563524387, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007424297208701699, + "loss": 0.6645, + "step": 14490 + }, + { + "epoch": 0.720174828648058, + "grad_norm": 0.10546875, + "learning_rate": 0.0007423899870865204, + "loss": 0.6885, + "step": 14500 + }, + { + "epoch": 0.7206715009436774, + "grad_norm": 0.1181640625, + "learning_rate": 0.0007423502533028708, + "loss": 0.6573, + "step": 14510 + }, + { + "epoch": 0.7211681732392967, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007423105195192213, + "loss": 0.6914, + "step": 14520 + }, + { + "epoch": 0.721664845534916, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007422707857355717, + "loss": 0.6392, + "step": 14530 + }, + { + "epoch": 0.7221615178305354, + "grad_norm": 0.111328125, + "learning_rate": 0.0007422310519519221, + "loss": 0.6606, + "step": 14540 + }, + { + "epoch": 0.7226581901261547, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007421913181682727, + "loss": 0.7015, + "step": 14550 + }, + { + "epoch": 0.7231548624217741, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007421515843846231, + "loss": 0.6693, + "step": 14560 + }, + { + "epoch": 0.7236515347173935, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007421118506009735, + "loss": 0.6702, + "step": 14570 + }, + { + "epoch": 0.7241482070130129, + "grad_norm": 0.109375, + "learning_rate": 0.0007420721168173239, + "loss": 0.7158, + "step": 14580 + }, + { + "epoch": 0.7246448793086322, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007420323830336744, + "loss": 0.6439, + "step": 14590 + }, + { + "epoch": 0.7251415516042515, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007419926492500249, + "loss": 0.6677, + "step": 14600 + }, + { + "epoch": 0.7256382238998709, + "grad_norm": 0.11181640625, + "learning_rate": 0.0007419529154663753, + "loss": 0.6892, + "step": 14610 + }, + { + "epoch": 0.7261348961954902, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007419131816827258, + "loss": 0.6827, + "step": 14620 + }, + { + "epoch": 0.7266315684911095, + "grad_norm": 0.099609375, + "learning_rate": 0.0007418734478990762, + "loss": 0.6628, + "step": 14630 + }, + { + "epoch": 0.7271282407867289, + "grad_norm": 0.11328125, + "learning_rate": 0.0007418337141154266, + "loss": 0.6387, + "step": 14640 + }, + { + "epoch": 0.7276249130823482, + "grad_norm": 0.12060546875, + "learning_rate": 0.0007417939803317772, + "loss": 0.6581, + "step": 14650 + }, + { + "epoch": 0.7281215853779676, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007417542465481276, + "loss": 0.6713, + "step": 14660 + }, + { + "epoch": 0.728618257673587, + "grad_norm": 0.10107421875, + "learning_rate": 0.000741714512764478, + "loss": 0.6754, + "step": 14670 + }, + { + "epoch": 0.7291149299692063, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007416747789808285, + "loss": 0.6784, + "step": 14680 + }, + { + "epoch": 0.7296116022648257, + "grad_norm": 0.10107421875, + "learning_rate": 0.0007416350451971789, + "loss": 0.6548, + "step": 14690 + }, + { + "epoch": 0.730108274560445, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007415953114135294, + "loss": 0.6828, + "step": 14700 + }, + { + "epoch": 0.7306049468560644, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007415555776298799, + "loss": 0.6802, + "step": 14710 + }, + { + "epoch": 0.7311016191516837, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007415158438462303, + "loss": 0.657, + "step": 14720 + }, + { + "epoch": 0.731598291447303, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007414761100625807, + "loss": 0.6715, + "step": 14730 + }, + { + "epoch": 0.7320949637429224, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007414363762789312, + "loss": 0.633, + "step": 14740 + }, + { + "epoch": 0.7325916360385417, + "grad_norm": 0.12255859375, + "learning_rate": 0.0007413966424952817, + "loss": 0.643, + "step": 14750 + }, + { + "epoch": 0.7330883083341612, + "grad_norm": 0.09228515625, + "learning_rate": 0.0007413569087116321, + "loss": 0.6803, + "step": 14760 + }, + { + "epoch": 0.7335849806297805, + "grad_norm": 0.13671875, + "learning_rate": 0.0007413171749279825, + "loss": 0.6491, + "step": 14770 + }, + { + "epoch": 0.7340816529253998, + "grad_norm": 0.10498046875, + "learning_rate": 0.000741277441144333, + "loss": 0.6822, + "step": 14780 + }, + { + "epoch": 0.7345783252210192, + "grad_norm": 0.123046875, + "learning_rate": 0.0007412377073606835, + "loss": 0.6758, + "step": 14790 + }, + { + "epoch": 0.7350749975166385, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007411979735770338, + "loss": 0.6977, + "step": 14800 + }, + { + "epoch": 0.7355716698122579, + "grad_norm": 0.1318359375, + "learning_rate": 0.0007411582397933844, + "loss": 0.6655, + "step": 14810 + }, + { + "epoch": 0.7360683421078772, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007411185060097349, + "loss": 0.6554, + "step": 14820 + }, + { + "epoch": 0.7365650144034965, + "grad_norm": 0.1650390625, + "learning_rate": 0.0007410787722260852, + "loss": 0.623, + "step": 14830 + }, + { + "epoch": 0.7370616866991159, + "grad_norm": 0.10546875, + "learning_rate": 0.0007410390384424357, + "loss": 0.6476, + "step": 14840 + }, + { + "epoch": 0.7375583589947353, + "grad_norm": 0.126953125, + "learning_rate": 0.0007409993046587862, + "loss": 0.6634, + "step": 14850 + }, + { + "epoch": 0.7380550312903547, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007409595708751366, + "loss": 0.6542, + "step": 14860 + }, + { + "epoch": 0.738551703585974, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007409198370914871, + "loss": 0.6161, + "step": 14870 + }, + { + "epoch": 0.7390483758815933, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007408801033078375, + "loss": 0.6548, + "step": 14880 + }, + { + "epoch": 0.7395450481772127, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007408403695241879, + "loss": 0.6417, + "step": 14890 + }, + { + "epoch": 0.740041720472832, + "grad_norm": 0.12109375, + "learning_rate": 0.0007408006357405385, + "loss": 0.6701, + "step": 14900 + }, + { + "epoch": 0.7405383927684513, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007407609019568889, + "loss": 0.6672, + "step": 14910 + }, + { + "epoch": 0.7410350650640707, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007407211681732393, + "loss": 0.6359, + "step": 14920 + }, + { + "epoch": 0.74153173735969, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007406814343895898, + "loss": 0.668, + "step": 14930 + }, + { + "epoch": 0.7420284096553095, + "grad_norm": 0.10546875, + "learning_rate": 0.0007406417006059402, + "loss": 0.6952, + "step": 14940 + }, + { + "epoch": 0.7425250819509288, + "grad_norm": 0.12890625, + "learning_rate": 0.0007406019668222908, + "loss": 0.6725, + "step": 14950 + }, + { + "epoch": 0.7430217542465481, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007405622330386411, + "loss": 0.6643, + "step": 14960 + }, + { + "epoch": 0.7435184265421675, + "grad_norm": 0.115234375, + "learning_rate": 0.0007405224992549916, + "loss": 0.6796, + "step": 14970 + }, + { + "epoch": 0.7440150988377868, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007404827654713421, + "loss": 0.6462, + "step": 14980 + }, + { + "epoch": 0.7445117711334062, + "grad_norm": 0.12060546875, + "learning_rate": 0.0007404430316876924, + "loss": 0.6689, + "step": 14990 + }, + { + "epoch": 0.7450084434290255, + "grad_norm": 0.1162109375, + "learning_rate": 0.000740403297904043, + "loss": 0.7093, + "step": 15000 + }, + { + "epoch": 0.7455051157246448, + "grad_norm": 0.1181640625, + "learning_rate": 0.0007403635641203935, + "loss": 0.6595, + "step": 15010 + }, + { + "epoch": 0.7460017880202642, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007403238303367438, + "loss": 0.7084, + "step": 15020 + }, + { + "epoch": 0.7464984603158836, + "grad_norm": 0.1171875, + "learning_rate": 0.0007402840965530943, + "loss": 0.6587, + "step": 15030 + }, + { + "epoch": 0.746995132611503, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007402443627694447, + "loss": 0.6821, + "step": 15040 + }, + { + "epoch": 0.7474918049071223, + "grad_norm": 0.109375, + "learning_rate": 0.0007402046289857951, + "loss": 0.6574, + "step": 15050 + }, + { + "epoch": 0.7479884772027416, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007401648952021457, + "loss": 0.6512, + "step": 15060 + }, + { + "epoch": 0.748485149498361, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007401251614184961, + "loss": 0.6612, + "step": 15070 + }, + { + "epoch": 0.7489818217939803, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007400854276348466, + "loss": 0.6923, + "step": 15080 + }, + { + "epoch": 0.7494784940895997, + "grad_norm": 0.1162109375, + "learning_rate": 0.000740045693851197, + "loss": 0.6737, + "step": 15090 + }, + { + "epoch": 0.749975166385219, + "grad_norm": 0.12060546875, + "learning_rate": 0.0007400059600675474, + "loss": 0.6633, + "step": 15100 + }, + { + "epoch": 0.7504718386808383, + "grad_norm": 0.10791015625, + "learning_rate": 0.000739966226283898, + "loss": 0.6599, + "step": 15110 + }, + { + "epoch": 0.7509685109764578, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007399264925002484, + "loss": 0.6532, + "step": 15120 + }, + { + "epoch": 0.7514651832720771, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007398867587165988, + "loss": 0.6475, + "step": 15130 + }, + { + "epoch": 0.7519618555676965, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007398470249329493, + "loss": 0.627, + "step": 15140 + }, + { + "epoch": 0.7524585278633158, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007398072911492998, + "loss": 0.6591, + "step": 15150 + }, + { + "epoch": 0.7529552001589351, + "grad_norm": 0.1318359375, + "learning_rate": 0.0007397675573656502, + "loss": 0.678, + "step": 15160 + }, + { + "epoch": 0.7534518724545545, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007397278235820007, + "loss": 0.6714, + "step": 15170 + }, + { + "epoch": 0.7539485447501738, + "grad_norm": 0.10205078125, + "learning_rate": 0.000739688089798351, + "loss": 0.6759, + "step": 15180 + }, + { + "epoch": 0.7544452170457931, + "grad_norm": 0.15234375, + "learning_rate": 0.0007396483560147015, + "loss": 0.7192, + "step": 15190 + }, + { + "epoch": 0.7549418893414125, + "grad_norm": 0.138671875, + "learning_rate": 0.000739608622231052, + "loss": 0.6535, + "step": 15200 + }, + { + "epoch": 0.7554385616370319, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007395688884474024, + "loss": 0.6535, + "step": 15210 + }, + { + "epoch": 0.7559352339326513, + "grad_norm": 0.119140625, + "learning_rate": 0.0007395291546637529, + "loss": 0.6493, + "step": 15220 + }, + { + "epoch": 0.7564319062282706, + "grad_norm": 0.123046875, + "learning_rate": 0.0007394894208801033, + "loss": 0.6432, + "step": 15230 + }, + { + "epoch": 0.75692857852389, + "grad_norm": 0.1318359375, + "learning_rate": 0.0007394496870964538, + "loss": 0.6387, + "step": 15240 + }, + { + "epoch": 0.7574252508195093, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007394099533128042, + "loss": 0.6892, + "step": 15250 + }, + { + "epoch": 0.7579219231151286, + "grad_norm": 0.099609375, + "learning_rate": 0.0007393702195291547, + "loss": 0.6835, + "step": 15260 + }, + { + "epoch": 0.758418595410748, + "grad_norm": 0.109375, + "learning_rate": 0.0007393304857455052, + "loss": 0.6625, + "step": 15270 + }, + { + "epoch": 0.7589152677063673, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007392907519618556, + "loss": 0.687, + "step": 15280 + }, + { + "epoch": 0.7594119400019866, + "grad_norm": 0.111328125, + "learning_rate": 0.000739251018178206, + "loss": 0.63, + "step": 15290 + }, + { + "epoch": 0.7599086122976061, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007392112843945566, + "loss": 0.6379, + "step": 15300 + }, + { + "epoch": 0.7604052845932254, + "grad_norm": 0.1015625, + "learning_rate": 0.000739171550610907, + "loss": 0.6866, + "step": 15310 + }, + { + "epoch": 0.7609019568888448, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007391318168272574, + "loss": 0.651, + "step": 15320 + }, + { + "epoch": 0.7613986291844641, + "grad_norm": 0.10546875, + "learning_rate": 0.0007390920830436079, + "loss": 0.6841, + "step": 15330 + }, + { + "epoch": 0.7618953014800834, + "grad_norm": 0.126953125, + "learning_rate": 0.0007390523492599583, + "loss": 0.6586, + "step": 15340 + }, + { + "epoch": 0.7623919737757028, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007390126154763087, + "loss": 0.6854, + "step": 15350 + }, + { + "epoch": 0.7628886460713221, + "grad_norm": 0.10107421875, + "learning_rate": 0.0007389728816926593, + "loss": 0.668, + "step": 15360 + }, + { + "epoch": 0.7633853183669415, + "grad_norm": 0.1376953125, + "learning_rate": 0.0007389331479090096, + "loss": 0.6417, + "step": 15370 + }, + { + "epoch": 0.7638819906625608, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007388934141253601, + "loss": 0.6731, + "step": 15380 + }, + { + "epoch": 0.7643786629581802, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007388536803417106, + "loss": 0.6381, + "step": 15390 + }, + { + "epoch": 0.7648753352537996, + "grad_norm": 0.10791015625, + "learning_rate": 0.000738813946558061, + "loss": 0.6457, + "step": 15400 + }, + { + "epoch": 0.7653720075494189, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007387742127744115, + "loss": 0.6486, + "step": 15410 + }, + { + "epoch": 0.7658686798450383, + "grad_norm": 0.1259765625, + "learning_rate": 0.000738734478990762, + "loss": 0.6659, + "step": 15420 + }, + { + "epoch": 0.7663653521406576, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007386947452071124, + "loss": 0.6527, + "step": 15430 + }, + { + "epoch": 0.7668620244362769, + "grad_norm": 0.09765625, + "learning_rate": 0.0007386550114234628, + "loss": 0.6608, + "step": 15440 + }, + { + "epoch": 0.7673586967318963, + "grad_norm": 0.09765625, + "learning_rate": 0.0007386152776398132, + "loss": 0.6618, + "step": 15450 + }, + { + "epoch": 0.7678553690275156, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007385755438561638, + "loss": 0.6524, + "step": 15460 + }, + { + "epoch": 0.768352041323135, + "grad_norm": 0.11181640625, + "learning_rate": 0.0007385358100725142, + "loss": 0.6715, + "step": 15470 + }, + { + "epoch": 0.7688487136187544, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007384960762888646, + "loss": 0.6431, + "step": 15480 + }, + { + "epoch": 0.7693453859143737, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007384563425052151, + "loss": 0.6363, + "step": 15490 + }, + { + "epoch": 0.7698420582099931, + "grad_norm": 0.142578125, + "learning_rate": 0.0007384166087215655, + "loss": 0.6678, + "step": 15500 + }, + { + "epoch": 0.7703387305056124, + "grad_norm": 0.1123046875, + "learning_rate": 0.000738376874937916, + "loss": 0.6862, + "step": 15510 + }, + { + "epoch": 0.7708354028012318, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007383371411542665, + "loss": 0.6488, + "step": 15520 + }, + { + "epoch": 0.7713320750968511, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007382974073706169, + "loss": 0.6608, + "step": 15530 + }, + { + "epoch": 0.7718287473924704, + "grad_norm": 0.12255859375, + "learning_rate": 0.0007382576735869673, + "loss": 0.6887, + "step": 15540 + }, + { + "epoch": 0.7723254196880898, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007382179398033178, + "loss": 0.65, + "step": 15550 + }, + { + "epoch": 0.7728220919837091, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007381782060196683, + "loss": 0.6542, + "step": 15560 + }, + { + "epoch": 0.7733187642793286, + "grad_norm": 0.09765625, + "learning_rate": 0.0007381384722360187, + "loss": 0.6324, + "step": 15570 + }, + { + "epoch": 0.7738154365749479, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007380987384523692, + "loss": 0.6682, + "step": 15580 + }, + { + "epoch": 0.7743121088705672, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007380590046687196, + "loss": 0.6871, + "step": 15590 + }, + { + "epoch": 0.7748087811661866, + "grad_norm": 0.125, + "learning_rate": 0.00073801927088507, + "loss": 0.6528, + "step": 15600 + }, + { + "epoch": 0.7753054534618059, + "grad_norm": 0.1337890625, + "learning_rate": 0.0007379795371014206, + "loss": 0.6642, + "step": 15610 + }, + { + "epoch": 0.7758021257574252, + "grad_norm": 0.1259765625, + "learning_rate": 0.000737939803317771, + "loss": 0.657, + "step": 15620 + }, + { + "epoch": 0.7762987980530446, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007379000695341214, + "loss": 0.672, + "step": 15630 + }, + { + "epoch": 0.7767954703486639, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007378603357504718, + "loss": 0.6719, + "step": 15640 + }, + { + "epoch": 0.7772921426442833, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007378206019668223, + "loss": 0.6735, + "step": 15650 + }, + { + "epoch": 0.7777888149399027, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007377808681831728, + "loss": 0.6648, + "step": 15660 + }, + { + "epoch": 0.778285487235522, + "grad_norm": 0.11865234375, + "learning_rate": 0.0007377411343995232, + "loss": 0.6676, + "step": 15670 + }, + { + "epoch": 0.7787821595311414, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007377014006158737, + "loss": 0.685, + "step": 15680 + }, + { + "epoch": 0.7792788318267607, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007376616668322241, + "loss": 0.6626, + "step": 15690 + }, + { + "epoch": 0.7797755041223801, + "grad_norm": 0.107421875, + "learning_rate": 0.0007376219330485745, + "loss": 0.6375, + "step": 15700 + }, + { + "epoch": 0.7802721764179994, + "grad_norm": 0.140625, + "learning_rate": 0.0007375821992649251, + "loss": 0.6498, + "step": 15710 + }, + { + "epoch": 0.7807688487136187, + "grad_norm": 0.13671875, + "learning_rate": 0.0007375424654812755, + "loss": 0.6627, + "step": 15720 + }, + { + "epoch": 0.7812655210092381, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007375027316976259, + "loss": 0.6476, + "step": 15730 + }, + { + "epoch": 0.7817621933048574, + "grad_norm": 0.12255859375, + "learning_rate": 0.0007374629979139764, + "loss": 0.6621, + "step": 15740 + }, + { + "epoch": 0.7822588656004769, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007374232641303268, + "loss": 0.6777, + "step": 15750 + }, + { + "epoch": 0.7827555378960962, + "grad_norm": 0.10546875, + "learning_rate": 0.0007373835303466773, + "loss": 0.6634, + "step": 15760 + }, + { + "epoch": 0.7832522101917155, + "grad_norm": 0.099609375, + "learning_rate": 0.0007373437965630278, + "loss": 0.6365, + "step": 15770 + }, + { + "epoch": 0.7837488824873349, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007373040627793782, + "loss": 0.6644, + "step": 15780 + }, + { + "epoch": 0.7842455547829542, + "grad_norm": 0.12890625, + "learning_rate": 0.0007372643289957286, + "loss": 0.6768, + "step": 15790 + }, + { + "epoch": 0.7847422270785736, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007372245952120791, + "loss": 0.6666, + "step": 15800 + }, + { + "epoch": 0.7852388993741929, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007371848614284296, + "loss": 0.6616, + "step": 15810 + }, + { + "epoch": 0.7857355716698122, + "grad_norm": 0.11572265625, + "learning_rate": 0.0007371451276447801, + "loss": 0.6727, + "step": 15820 + }, + { + "epoch": 0.7862322439654316, + "grad_norm": 0.10546875, + "learning_rate": 0.0007371053938611304, + "loss": 0.6297, + "step": 15830 + }, + { + "epoch": 0.7867289162610509, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007370656600774809, + "loss": 0.661, + "step": 15840 + }, + { + "epoch": 0.7872255885566704, + "grad_norm": 0.0927734375, + "learning_rate": 0.0007370259262938314, + "loss": 0.6471, + "step": 15850 + }, + { + "epoch": 0.7877222608522897, + "grad_norm": 0.10693359375, + "learning_rate": 0.0007369861925101818, + "loss": 0.6412, + "step": 15860 + }, + { + "epoch": 0.788218933147909, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007369464587265323, + "loss": 0.647, + "step": 15870 + }, + { + "epoch": 0.7887156054435284, + "grad_norm": 0.10107421875, + "learning_rate": 0.0007369067249428828, + "loss": 0.6365, + "step": 15880 + }, + { + "epoch": 0.7892122777391477, + "grad_norm": 0.130859375, + "learning_rate": 0.0007368669911592331, + "loss": 0.6588, + "step": 15890 + }, + { + "epoch": 0.789708950034767, + "grad_norm": 0.130859375, + "learning_rate": 0.0007368272573755836, + "loss": 0.6499, + "step": 15900 + }, + { + "epoch": 0.7902056223303864, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007367875235919341, + "loss": 0.6606, + "step": 15910 + }, + { + "epoch": 0.7907022946260057, + "grad_norm": 0.109375, + "learning_rate": 0.0007367477898082845, + "loss": 0.6713, + "step": 15920 + }, + { + "epoch": 0.7911989669216251, + "grad_norm": 0.11474609375, + "learning_rate": 0.000736708056024635, + "loss": 0.633, + "step": 15930 + }, + { + "epoch": 0.7916956392172445, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007366683222409854, + "loss": 0.6714, + "step": 15940 + }, + { + "epoch": 0.7921923115128638, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007366285884573358, + "loss": 0.6649, + "step": 15950 + }, + { + "epoch": 0.7926889838084832, + "grad_norm": 0.111328125, + "learning_rate": 0.0007365888546736864, + "loss": 0.68, + "step": 15960 + }, + { + "epoch": 0.7931856561041025, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007365491208900368, + "loss": 0.6256, + "step": 15970 + }, + { + "epoch": 0.7936823283997219, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007365093871063873, + "loss": 0.6611, + "step": 15980 + }, + { + "epoch": 0.7941790006953412, + "grad_norm": 0.109375, + "learning_rate": 0.0007364696533227377, + "loss": 0.6605, + "step": 15990 + }, + { + "epoch": 0.7946756729909605, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007364299195390881, + "loss": 0.6494, + "step": 16000 + }, + { + "epoch": 0.7951723452865799, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007363901857554387, + "loss": 0.6674, + "step": 16010 + }, + { + "epoch": 0.7956690175821992, + "grad_norm": 0.12353515625, + "learning_rate": 0.000736350451971789, + "loss": 0.6483, + "step": 16020 + }, + { + "epoch": 0.7961656898778187, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007363107181881395, + "loss": 0.6441, + "step": 16030 + }, + { + "epoch": 0.796662362173438, + "grad_norm": 0.1533203125, + "learning_rate": 0.00073627098440449, + "loss": 0.6642, + "step": 16040 + }, + { + "epoch": 0.7971590344690573, + "grad_norm": 0.10205078125, + "learning_rate": 0.0007362312506208403, + "loss": 0.6259, + "step": 16050 + }, + { + "epoch": 0.7976557067646767, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007361915168371909, + "loss": 0.6641, + "step": 16060 + }, + { + "epoch": 0.798152379060296, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007361517830535414, + "loss": 0.6431, + "step": 16070 + }, + { + "epoch": 0.7986490513559154, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007361120492698917, + "loss": 0.6418, + "step": 16080 + }, + { + "epoch": 0.7991457236515347, + "grad_norm": 0.109375, + "learning_rate": 0.0007360723154862422, + "loss": 0.6362, + "step": 16090 + }, + { + "epoch": 0.799642395947154, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007360325817025926, + "loss": 0.6457, + "step": 16100 + }, + { + "epoch": 0.8001390682427734, + "grad_norm": 0.10888671875, + "learning_rate": 0.000735992847918943, + "loss": 0.6353, + "step": 16110 + }, + { + "epoch": 0.8006357405383928, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007359531141352936, + "loss": 0.6309, + "step": 16120 + }, + { + "epoch": 0.8011324128340122, + "grad_norm": 0.11572265625, + "learning_rate": 0.000735913380351644, + "loss": 0.6537, + "step": 16130 + }, + { + "epoch": 0.8016290851296315, + "grad_norm": 0.099609375, + "learning_rate": 0.0007358736465679945, + "loss": 0.6506, + "step": 16140 + }, + { + "epoch": 0.8021257574252508, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007358339127843449, + "loss": 0.6716, + "step": 16150 + }, + { + "epoch": 0.8026224297208702, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007357941790006954, + "loss": 0.6404, + "step": 16160 + }, + { + "epoch": 0.8031191020164895, + "grad_norm": 0.103515625, + "learning_rate": 0.0007357544452170459, + "loss": 0.6493, + "step": 16170 + }, + { + "epoch": 0.8036157743121088, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007357147114333963, + "loss": 0.6635, + "step": 16180 + }, + { + "epoch": 0.8041124466077282, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007356749776497467, + "loss": 0.619, + "step": 16190 + }, + { + "epoch": 0.8046091189033475, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007356352438660972, + "loss": 0.6638, + "step": 16200 + }, + { + "epoch": 0.805105791198967, + "grad_norm": 0.10546875, + "learning_rate": 0.0007355955100824477, + "loss": 0.665, + "step": 16210 + }, + { + "epoch": 0.8056024634945863, + "grad_norm": 0.1474609375, + "learning_rate": 0.0007355557762987981, + "loss": 0.6213, + "step": 16220 + }, + { + "epoch": 0.8060991357902056, + "grad_norm": 0.12353515625, + "learning_rate": 0.0007355160425151486, + "loss": 0.6451, + "step": 16230 + }, + { + "epoch": 0.806595808085825, + "grad_norm": 0.115234375, + "learning_rate": 0.0007354763087314989, + "loss": 0.6398, + "step": 16240 + }, + { + "epoch": 0.8070924803814443, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007354365749478494, + "loss": 0.6553, + "step": 16250 + }, + { + "epoch": 0.8075891526770637, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007353968411642, + "loss": 0.691, + "step": 16260 + }, + { + "epoch": 0.808085824972683, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007353571073805504, + "loss": 0.6825, + "step": 16270 + }, + { + "epoch": 0.8085824972683023, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007353173735969008, + "loss": 0.6685, + "step": 16280 + }, + { + "epoch": 0.8090791695639217, + "grad_norm": 0.1181640625, + "learning_rate": 0.0007352776398132512, + "loss": 0.6356, + "step": 16290 + }, + { + "epoch": 0.8095758418595411, + "grad_norm": 0.12158203125, + "learning_rate": 0.0007352379060296017, + "loss": 0.6552, + "step": 16300 + }, + { + "epoch": 0.8100725141551605, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007351981722459522, + "loss": 0.6721, + "step": 16310 + }, + { + "epoch": 0.8105691864507798, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007351584384623026, + "loss": 0.6464, + "step": 16320 + }, + { + "epoch": 0.8110658587463991, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007351187046786531, + "loss": 0.6584, + "step": 16330 + }, + { + "epoch": 0.8115625310420185, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007350789708950035, + "loss": 0.6583, + "step": 16340 + }, + { + "epoch": 0.8120592033376378, + "grad_norm": 0.1171875, + "learning_rate": 0.0007350392371113539, + "loss": 0.6785, + "step": 16350 + }, + { + "epoch": 0.8125558756332572, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007349995033277045, + "loss": 0.6511, + "step": 16360 + }, + { + "epoch": 0.8130525479288765, + "grad_norm": 0.1474609375, + "learning_rate": 0.0007349597695440549, + "loss": 0.6634, + "step": 16370 + }, + { + "epoch": 0.8135492202244958, + "grad_norm": 0.111328125, + "learning_rate": 0.0007349200357604053, + "loss": 0.7017, + "step": 16380 + }, + { + "epoch": 0.8140458925201153, + "grad_norm": 0.115234375, + "learning_rate": 0.0007348803019767558, + "loss": 0.6716, + "step": 16390 + }, + { + "epoch": 0.8145425648157346, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007348405681931062, + "loss": 0.6769, + "step": 16400 + }, + { + "epoch": 0.815039237111354, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007348008344094566, + "loss": 0.6457, + "step": 16410 + }, + { + "epoch": 0.8155359094069733, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007347611006258072, + "loss": 0.6589, + "step": 16420 + }, + { + "epoch": 0.8160325817025926, + "grad_norm": 0.119140625, + "learning_rate": 0.0007347213668421576, + "loss": 0.6282, + "step": 16430 + }, + { + "epoch": 0.816529253998212, + "grad_norm": 0.1328125, + "learning_rate": 0.000734681633058508, + "loss": 0.6914, + "step": 16440 + }, + { + "epoch": 0.8170259262938313, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007346418992748585, + "loss": 0.6693, + "step": 16450 + }, + { + "epoch": 0.8175225985894506, + "grad_norm": 0.11376953125, + "learning_rate": 0.000734602165491209, + "loss": 0.6435, + "step": 16460 + }, + { + "epoch": 0.81801927088507, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007345624317075594, + "loss": 0.6676, + "step": 16470 + }, + { + "epoch": 0.8185159431806894, + "grad_norm": 0.109375, + "learning_rate": 0.0007345226979239099, + "loss": 0.6642, + "step": 16480 + }, + { + "epoch": 0.8190126154763088, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007344829641402603, + "loss": 0.677, + "step": 16490 + }, + { + "epoch": 0.8195092877719281, + "grad_norm": 0.123046875, + "learning_rate": 0.0007344432303566107, + "loss": 0.6594, + "step": 16500 + }, + { + "epoch": 0.8200059600675474, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007344034965729611, + "loss": 0.6555, + "step": 16510 + }, + { + "epoch": 0.8205026323631668, + "grad_norm": 0.115234375, + "learning_rate": 0.0007343637627893117, + "loss": 0.6422, + "step": 16520 + }, + { + "epoch": 0.8209993046587861, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007343240290056621, + "loss": 0.6481, + "step": 16530 + }, + { + "epoch": 0.8214959769544055, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007342842952220125, + "loss": 0.678, + "step": 16540 + }, + { + "epoch": 0.8219926492500248, + "grad_norm": 0.1044921875, + "learning_rate": 0.000734244561438363, + "loss": 0.638, + "step": 16550 + }, + { + "epoch": 0.8224893215456441, + "grad_norm": 0.12060546875, + "learning_rate": 0.0007342048276547134, + "loss": 0.6303, + "step": 16560 + }, + { + "epoch": 0.8229859938412636, + "grad_norm": 0.119140625, + "learning_rate": 0.0007341650938710639, + "loss": 0.6615, + "step": 16570 + }, + { + "epoch": 0.8234826661368829, + "grad_norm": 0.1015625, + "learning_rate": 0.0007341253600874144, + "loss": 0.6814, + "step": 16580 + }, + { + "epoch": 0.8239793384325023, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007340856263037648, + "loss": 0.6632, + "step": 16590 + }, + { + "epoch": 0.8244760107281216, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007340458925201152, + "loss": 0.6387, + "step": 16600 + }, + { + "epoch": 0.8249726830237409, + "grad_norm": 0.115234375, + "learning_rate": 0.0007340061587364658, + "loss": 0.6394, + "step": 16610 + }, + { + "epoch": 0.8254693553193603, + "grad_norm": 0.11328125, + "learning_rate": 0.0007339664249528162, + "loss": 0.6571, + "step": 16620 + }, + { + "epoch": 0.8259660276149796, + "grad_norm": 0.11328125, + "learning_rate": 0.0007339266911691666, + "loss": 0.662, + "step": 16630 + }, + { + "epoch": 0.826462699910599, + "grad_norm": 0.109375, + "learning_rate": 0.0007338869573855171, + "loss": 0.6662, + "step": 16640 + }, + { + "epoch": 0.8269593722062183, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007338472236018675, + "loss": 0.6464, + "step": 16650 + }, + { + "epoch": 0.8274560445018377, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007338074898182179, + "loss": 0.6808, + "step": 16660 + }, + { + "epoch": 0.8279527167974571, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007337677560345685, + "loss": 0.6563, + "step": 16670 + }, + { + "epoch": 0.8284493890930764, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007337280222509189, + "loss": 0.6661, + "step": 16680 + }, + { + "epoch": 0.8289460613886958, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007336882884672693, + "loss": 0.6458, + "step": 16690 + }, + { + "epoch": 0.8294427336843151, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007336485546836197, + "loss": 0.6569, + "step": 16700 + }, + { + "epoch": 0.8299394059799344, + "grad_norm": 0.109375, + "learning_rate": 0.0007336088208999702, + "loss": 0.6661, + "step": 16710 + }, + { + "epoch": 0.8304360782755538, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007335690871163208, + "loss": 0.6804, + "step": 16720 + }, + { + "epoch": 0.8309327505711731, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007335293533326711, + "loss": 0.6887, + "step": 16730 + }, + { + "epoch": 0.8314294228667924, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007334896195490216, + "loss": 0.6213, + "step": 16740 + }, + { + "epoch": 0.8319260951624119, + "grad_norm": 0.107421875, + "learning_rate": 0.0007334498857653721, + "loss": 0.649, + "step": 16750 + }, + { + "epoch": 0.8324227674580312, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007334101519817224, + "loss": 0.6543, + "step": 16760 + }, + { + "epoch": 0.8329194397536506, + "grad_norm": 0.1103515625, + "learning_rate": 0.000733370418198073, + "loss": 0.6634, + "step": 16770 + }, + { + "epoch": 0.8334161120492699, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007333306844144234, + "loss": 0.6675, + "step": 16780 + }, + { + "epoch": 0.8339127843448892, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007332909506307738, + "loss": 0.6521, + "step": 16790 + }, + { + "epoch": 0.8344094566405086, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007332512168471243, + "loss": 0.6378, + "step": 16800 + }, + { + "epoch": 0.8349061289361279, + "grad_norm": 0.11181640625, + "learning_rate": 0.0007332114830634747, + "loss": 0.6496, + "step": 16810 + }, + { + "epoch": 0.8354028012317473, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007331717492798252, + "loss": 0.6638, + "step": 16820 + }, + { + "epoch": 0.8358994735273666, + "grad_norm": 0.10546875, + "learning_rate": 0.0007331320154961757, + "loss": 0.6344, + "step": 16830 + }, + { + "epoch": 0.836396145822986, + "grad_norm": 0.185546875, + "learning_rate": 0.0007330922817125261, + "loss": 0.6616, + "step": 16840 + }, + { + "epoch": 0.8368928181186054, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007330525479288765, + "loss": 0.6469, + "step": 16850 + }, + { + "epoch": 0.8373894904142247, + "grad_norm": 0.107421875, + "learning_rate": 0.000733012814145227, + "loss": 0.6803, + "step": 16860 + }, + { + "epoch": 0.8378861627098441, + "grad_norm": 0.12255859375, + "learning_rate": 0.0007329730803615775, + "loss": 0.6452, + "step": 16870 + }, + { + "epoch": 0.8383828350054634, + "grad_norm": 0.09423828125, + "learning_rate": 0.000732933346577928, + "loss": 0.6699, + "step": 16880 + }, + { + "epoch": 0.8388795073010827, + "grad_norm": 0.1015625, + "learning_rate": 0.0007328936127942783, + "loss": 0.6362, + "step": 16890 + }, + { + "epoch": 0.8393761795967021, + "grad_norm": 0.107421875, + "learning_rate": 0.0007328538790106288, + "loss": 0.6622, + "step": 16900 + }, + { + "epoch": 0.8398728518923214, + "grad_norm": 0.099609375, + "learning_rate": 0.0007328141452269793, + "loss": 0.6484, + "step": 16910 + }, + { + "epoch": 0.8403695241879408, + "grad_norm": 0.15234375, + "learning_rate": 0.0007327744114433297, + "loss": 0.6497, + "step": 16920 + }, + { + "epoch": 0.8408661964835602, + "grad_norm": 0.12890625, + "learning_rate": 0.0007327346776596802, + "loss": 0.6783, + "step": 16930 + }, + { + "epoch": 0.8413628687791795, + "grad_norm": 0.1552734375, + "learning_rate": 0.0007326949438760307, + "loss": 0.6665, + "step": 16940 + }, + { + "epoch": 0.8418595410747989, + "grad_norm": 0.1142578125, + "learning_rate": 0.000732655210092381, + "loss": 0.6902, + "step": 16950 + }, + { + "epoch": 0.8423562133704182, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007326154763087315, + "loss": 0.6522, + "step": 16960 + }, + { + "epoch": 0.8428528856660376, + "grad_norm": 0.10986328125, + "learning_rate": 0.000732575742525082, + "loss": 0.6571, + "step": 16970 + }, + { + "epoch": 0.8433495579616569, + "grad_norm": 0.1396484375, + "learning_rate": 0.0007325360087414324, + "loss": 0.657, + "step": 16980 + }, + { + "epoch": 0.8438462302572762, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007324962749577829, + "loss": 0.648, + "step": 16990 + }, + { + "epoch": 0.8443429025528956, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007324565411741333, + "loss": 0.6394, + "step": 17000 + }, + { + "epoch": 0.8448395748485149, + "grad_norm": 0.119140625, + "learning_rate": 0.0007324168073904837, + "loss": 0.6504, + "step": 17010 + }, + { + "epoch": 0.8453362471441342, + "grad_norm": 0.1181640625, + "learning_rate": 0.0007323770736068343, + "loss": 0.6483, + "step": 17020 + }, + { + "epoch": 0.8458329194397537, + "grad_norm": 0.115234375, + "learning_rate": 0.0007323373398231847, + "loss": 0.6492, + "step": 17030 + }, + { + "epoch": 0.846329591735373, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007322976060395352, + "loss": 0.6662, + "step": 17040 + }, + { + "epoch": 0.8468262640309924, + "grad_norm": 0.14453125, + "learning_rate": 0.0007322578722558856, + "loss": 0.6364, + "step": 17050 + }, + { + "epoch": 0.8473229363266117, + "grad_norm": 0.10302734375, + "learning_rate": 0.000732218138472236, + "loss": 0.6674, + "step": 17060 + }, + { + "epoch": 0.847819608622231, + "grad_norm": 0.109375, + "learning_rate": 0.0007321784046885866, + "loss": 0.6588, + "step": 17070 + }, + { + "epoch": 0.8483162809178504, + "grad_norm": 0.11181640625, + "learning_rate": 0.000732138670904937, + "loss": 0.6432, + "step": 17080 + }, + { + "epoch": 0.8488129532134697, + "grad_norm": 0.162109375, + "learning_rate": 0.0007320989371212874, + "loss": 0.6654, + "step": 17090 + }, + { + "epoch": 0.8493096255090891, + "grad_norm": 0.177734375, + "learning_rate": 0.0007320592033376379, + "loss": 0.6632, + "step": 17100 + }, + { + "epoch": 0.8498062978047084, + "grad_norm": 0.125, + "learning_rate": 0.0007320194695539882, + "loss": 0.6595, + "step": 17110 + }, + { + "epoch": 0.8503029701003278, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007319797357703388, + "loss": 0.6614, + "step": 17120 + }, + { + "epoch": 0.8507996423959472, + "grad_norm": 0.1201171875, + "learning_rate": 0.0007319400019866893, + "loss": 0.6409, + "step": 17130 + }, + { + "epoch": 0.8512963146915665, + "grad_norm": 0.103515625, + "learning_rate": 0.0007319002682030396, + "loss": 0.6245, + "step": 17140 + }, + { + "epoch": 0.8517929869871859, + "grad_norm": 0.126953125, + "learning_rate": 0.0007318605344193901, + "loss": 0.6708, + "step": 17150 + }, + { + "epoch": 0.8522896592828052, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007318208006357405, + "loss": 0.6608, + "step": 17160 + }, + { + "epoch": 0.8527863315784245, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007317810668520911, + "loss": 0.6411, + "step": 17170 + }, + { + "epoch": 0.8532830038740439, + "grad_norm": 0.140625, + "learning_rate": 0.0007317413330684415, + "loss": 0.6536, + "step": 17180 + }, + { + "epoch": 0.8537796761696632, + "grad_norm": 0.11328125, + "learning_rate": 0.0007317015992847919, + "loss": 0.6427, + "step": 17190 + }, + { + "epoch": 0.8542763484652826, + "grad_norm": 0.12451171875, + "learning_rate": 0.0007316618655011424, + "loss": 0.6913, + "step": 17200 + }, + { + "epoch": 0.854773020760902, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007316221317174928, + "loss": 0.6436, + "step": 17210 + }, + { + "epoch": 0.8552696930565213, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007315823979338433, + "loss": 0.6381, + "step": 17220 + }, + { + "epoch": 0.8557663653521407, + "grad_norm": 0.1357421875, + "learning_rate": 0.0007315426641501938, + "loss": 0.66, + "step": 17230 + }, + { + "epoch": 0.85626303764776, + "grad_norm": 0.14453125, + "learning_rate": 0.0007315029303665442, + "loss": 0.6569, + "step": 17240 + }, + { + "epoch": 0.8567597099433794, + "grad_norm": 0.115234375, + "learning_rate": 0.0007314631965828946, + "loss": 0.6972, + "step": 17250 + }, + { + "epoch": 0.8572563822389987, + "grad_norm": 0.126953125, + "learning_rate": 0.0007314234627992451, + "loss": 0.6668, + "step": 17260 + }, + { + "epoch": 0.857753054534618, + "grad_norm": 0.1328125, + "learning_rate": 0.0007313837290155956, + "loss": 0.6675, + "step": 17270 + }, + { + "epoch": 0.8582497268302374, + "grad_norm": 0.12255859375, + "learning_rate": 0.000731343995231946, + "loss": 0.6405, + "step": 17280 + }, + { + "epoch": 0.8587463991258567, + "grad_norm": 0.09765625, + "learning_rate": 0.0007313042614482965, + "loss": 0.657, + "step": 17290 + }, + { + "epoch": 0.8592430714214762, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007312645276646468, + "loss": 0.6507, + "step": 17300 + }, + { + "epoch": 0.8597397437170955, + "grad_norm": 0.1376953125, + "learning_rate": 0.0007312247938809973, + "loss": 0.6625, + "step": 17310 + }, + { + "epoch": 0.8602364160127148, + "grad_norm": 0.109375, + "learning_rate": 0.0007311850600973479, + "loss": 0.6755, + "step": 17320 + }, + { + "epoch": 0.8607330883083342, + "grad_norm": 0.103515625, + "learning_rate": 0.0007311453263136983, + "loss": 0.6671, + "step": 17330 + }, + { + "epoch": 0.8612297606039535, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007311055925300487, + "loss": 0.6828, + "step": 17340 + }, + { + "epoch": 0.8617264328995728, + "grad_norm": 0.09130859375, + "learning_rate": 0.0007310658587463992, + "loss": 0.6169, + "step": 17350 + }, + { + "epoch": 0.8622231051951922, + "grad_norm": 0.109375, + "learning_rate": 0.0007310261249627496, + "loss": 0.6387, + "step": 17360 + }, + { + "epoch": 0.8627197774908115, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007309863911791001, + "loss": 0.7025, + "step": 17370 + }, + { + "epoch": 0.8632164497864309, + "grad_norm": 0.095703125, + "learning_rate": 0.0007309466573954505, + "loss": 0.6599, + "step": 17380 + }, + { + "epoch": 0.8637131220820503, + "grad_norm": 0.1005859375, + "learning_rate": 0.000730906923611801, + "loss": 0.6288, + "step": 17390 + }, + { + "epoch": 0.8642097943776696, + "grad_norm": 0.130859375, + "learning_rate": 0.0007308671898281514, + "loss": 0.6718, + "step": 17400 + }, + { + "epoch": 0.864706466673289, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007308274560445018, + "loss": 0.6607, + "step": 17410 + }, + { + "epoch": 0.8652031389689083, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007307877222608524, + "loss": 0.6593, + "step": 17420 + }, + { + "epoch": 0.8656998112645277, + "grad_norm": 0.109375, + "learning_rate": 0.0007307479884772028, + "loss": 0.6401, + "step": 17430 + }, + { + "epoch": 0.866196483560147, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007307082546935532, + "loss": 0.6316, + "step": 17440 + }, + { + "epoch": 0.8666931558557663, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007306685209099037, + "loss": 0.6319, + "step": 17450 + }, + { + "epoch": 0.8671898281513857, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007306287871262541, + "loss": 0.6675, + "step": 17460 + }, + { + "epoch": 0.867686500447005, + "grad_norm": 0.12109375, + "learning_rate": 0.0007305890533426046, + "loss": 0.6245, + "step": 17470 + }, + { + "epoch": 0.8681831727426245, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007305493195589551, + "loss": 0.6518, + "step": 17480 + }, + { + "epoch": 0.8686798450382438, + "grad_norm": 0.1376953125, + "learning_rate": 0.0007305095857753055, + "loss": 0.6574, + "step": 17490 + }, + { + "epoch": 0.8691765173338631, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007304698519916559, + "loss": 0.6386, + "step": 17500 + }, + { + "epoch": 0.8696731896294825, + "grad_norm": 0.1357421875, + "learning_rate": 0.0007304301182080064, + "loss": 0.6485, + "step": 17510 + }, + { + "epoch": 0.8701698619251018, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007303903844243569, + "loss": 0.653, + "step": 17520 + }, + { + "epoch": 0.8706665342207212, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007303506506407073, + "loss": 0.6777, + "step": 17530 + }, + { + "epoch": 0.8711632065163405, + "grad_norm": 0.150390625, + "learning_rate": 0.0007303109168570578, + "loss": 0.6541, + "step": 17540 + }, + { + "epoch": 0.8716598788119598, + "grad_norm": 0.12890625, + "learning_rate": 0.0007302711830734082, + "loss": 0.6746, + "step": 17550 + }, + { + "epoch": 0.8721565511075792, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007302314492897586, + "loss": 0.6732, + "step": 17560 + }, + { + "epoch": 0.8726532234031986, + "grad_norm": 0.1171875, + "learning_rate": 0.000730191715506109, + "loss": 0.6948, + "step": 17570 + }, + { + "epoch": 0.873149895698818, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007301519817224596, + "loss": 0.6498, + "step": 17580 + }, + { + "epoch": 0.8736465679944373, + "grad_norm": 0.115234375, + "learning_rate": 0.00073011224793881, + "loss": 0.6505, + "step": 17590 + }, + { + "epoch": 0.8741432402900566, + "grad_norm": 0.099609375, + "learning_rate": 0.0007300725141551604, + "loss": 0.6503, + "step": 17600 + }, + { + "epoch": 0.874639912585676, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007300327803715109, + "loss": 0.6586, + "step": 17610 + }, + { + "epoch": 0.8751365848812953, + "grad_norm": 0.138671875, + "learning_rate": 0.0007299930465878615, + "loss": 0.6732, + "step": 17620 + }, + { + "epoch": 0.8756332571769146, + "grad_norm": 0.10693359375, + "learning_rate": 0.0007299533128042118, + "loss": 0.6564, + "step": 17630 + }, + { + "epoch": 0.876129929472534, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007299135790205623, + "loss": 0.6678, + "step": 17640 + }, + { + "epoch": 0.8766266017681533, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007298738452369127, + "loss": 0.6443, + "step": 17650 + }, + { + "epoch": 0.8771232740637728, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007298341114532631, + "loss": 0.643, + "step": 17660 + }, + { + "epoch": 0.8776199463593921, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007297943776696137, + "loss": 0.6385, + "step": 17670 + }, + { + "epoch": 0.8781166186550114, + "grad_norm": 0.09423828125, + "learning_rate": 0.0007297546438859641, + "loss": 0.6425, + "step": 17680 + }, + { + "epoch": 0.8786132909506308, + "grad_norm": 0.09716796875, + "learning_rate": 0.0007297149101023145, + "loss": 0.6334, + "step": 17690 + }, + { + "epoch": 0.8791099632462501, + "grad_norm": 0.134765625, + "learning_rate": 0.000729675176318665, + "loss": 0.6628, + "step": 17700 + }, + { + "epoch": 0.8796066355418695, + "grad_norm": 0.134765625, + "learning_rate": 0.0007296354425350154, + "loss": 0.674, + "step": 17710 + }, + { + "epoch": 0.8801033078374888, + "grad_norm": 0.107421875, + "learning_rate": 0.0007295957087513658, + "loss": 0.6799, + "step": 17720 + }, + { + "epoch": 0.8805999801331081, + "grad_norm": 0.11328125, + "learning_rate": 0.0007295559749677164, + "loss": 0.6566, + "step": 17730 + }, + { + "epoch": 0.8810966524287275, + "grad_norm": 0.10107421875, + "learning_rate": 0.0007295162411840668, + "loss": 0.6363, + "step": 17740 + }, + { + "epoch": 0.8815933247243469, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007294765074004172, + "loss": 0.681, + "step": 17750 + }, + { + "epoch": 0.8820899970199663, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007294367736167676, + "loss": 0.6795, + "step": 17760 + }, + { + "epoch": 0.8825866693155856, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007293970398331182, + "loss": 0.668, + "step": 17770 + }, + { + "epoch": 0.8830833416112049, + "grad_norm": 0.12060546875, + "learning_rate": 0.0007293573060494687, + "loss": 0.6346, + "step": 17780 + }, + { + "epoch": 0.8835800139068243, + "grad_norm": 0.11328125, + "learning_rate": 0.000729317572265819, + "loss": 0.6488, + "step": 17790 + }, + { + "epoch": 0.8840766862024436, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007292778384821695, + "loss": 0.6297, + "step": 17800 + }, + { + "epoch": 0.884573358498063, + "grad_norm": 0.099609375, + "learning_rate": 0.00072923810469852, + "loss": 0.6498, + "step": 17810 + }, + { + "epoch": 0.8850700307936823, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007291983709148703, + "loss": 0.6379, + "step": 17820 + }, + { + "epoch": 0.8855667030893016, + "grad_norm": 0.12890625, + "learning_rate": 0.0007291586371312209, + "loss": 0.6485, + "step": 17830 + }, + { + "epoch": 0.8860633753849211, + "grad_norm": 0.119140625, + "learning_rate": 0.0007291189033475713, + "loss": 0.6255, + "step": 17840 + }, + { + "epoch": 0.8865600476805404, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007290791695639217, + "loss": 0.6269, + "step": 17850 + }, + { + "epoch": 0.8870567199761598, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007290394357802722, + "loss": 0.658, + "step": 17860 + }, + { + "epoch": 0.8875533922717791, + "grad_norm": 0.12060546875, + "learning_rate": 0.0007289997019966226, + "loss": 0.6499, + "step": 17870 + }, + { + "epoch": 0.8880500645673984, + "grad_norm": 0.12060546875, + "learning_rate": 0.0007289599682129731, + "loss": 0.6215, + "step": 17880 + }, + { + "epoch": 0.8885467368630178, + "grad_norm": 0.103515625, + "learning_rate": 0.0007289202344293236, + "loss": 0.638, + "step": 17890 + }, + { + "epoch": 0.8890434091586371, + "grad_norm": 0.12109375, + "learning_rate": 0.000728880500645674, + "loss": 0.6508, + "step": 17900 + }, + { + "epoch": 0.8895400814542564, + "grad_norm": 0.197265625, + "learning_rate": 0.0007288407668620245, + "loss": 0.6515, + "step": 17910 + }, + { + "epoch": 0.8900367537498758, + "grad_norm": 0.12158203125, + "learning_rate": 0.000728801033078375, + "loss": 0.6798, + "step": 17920 + }, + { + "epoch": 0.8905334260454952, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007287612992947254, + "loss": 0.6558, + "step": 17930 + }, + { + "epoch": 0.8910300983411146, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007287215655110759, + "loss": 0.6627, + "step": 17940 + }, + { + "epoch": 0.8915267706367339, + "grad_norm": 0.1357421875, + "learning_rate": 0.0007286818317274263, + "loss": 0.6434, + "step": 17950 + }, + { + "epoch": 0.8920234429323532, + "grad_norm": 0.111328125, + "learning_rate": 0.0007286420979437767, + "loss": 0.636, + "step": 17960 + }, + { + "epoch": 0.8925201152279726, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007286023641601273, + "loss": 0.6488, + "step": 17970 + }, + { + "epoch": 0.8930167875235919, + "grad_norm": 0.119140625, + "learning_rate": 0.0007285626303764776, + "loss": 0.6529, + "step": 17980 + }, + { + "epoch": 0.8935134598192113, + "grad_norm": 0.12158203125, + "learning_rate": 0.0007285228965928281, + "loss": 0.6544, + "step": 17990 + }, + { + "epoch": 0.8940101321148306, + "grad_norm": 0.111328125, + "learning_rate": 0.0007284831628091786, + "loss": 0.6757, + "step": 18000 + }, + { + "epoch": 0.8945068044104499, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007284434290255289, + "loss": 0.6446, + "step": 18010 + }, + { + "epoch": 0.8950034767060694, + "grad_norm": 0.10546875, + "learning_rate": 0.0007284036952418794, + "loss": 0.6686, + "step": 18020 + }, + { + "epoch": 0.8955001490016887, + "grad_norm": 0.146484375, + "learning_rate": 0.0007283639614582299, + "loss": 0.6257, + "step": 18030 + }, + { + "epoch": 0.8959968212973081, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007283242276745803, + "loss": 0.684, + "step": 18040 + }, + { + "epoch": 0.8964934935929274, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007282844938909308, + "loss": 0.6261, + "step": 18050 + }, + { + "epoch": 0.8969901658885467, + "grad_norm": 0.123046875, + "learning_rate": 0.0007282447601072812, + "loss": 0.6511, + "step": 18060 + }, + { + "epoch": 0.8974868381841661, + "grad_norm": 0.1416015625, + "learning_rate": 0.0007282050263236317, + "loss": 0.669, + "step": 18070 + }, + { + "epoch": 0.8979835104797854, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007281652925399822, + "loss": 0.6212, + "step": 18080 + }, + { + "epoch": 0.8984801827754048, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007281255587563326, + "loss": 0.6908, + "step": 18090 + }, + { + "epoch": 0.8989768550710241, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007280858249726831, + "loss": 0.6437, + "step": 18100 + }, + { + "epoch": 0.8994735273666435, + "grad_norm": 0.203125, + "learning_rate": 0.0007280460911890335, + "loss": 0.6976, + "step": 18110 + }, + { + "epoch": 0.8999701996622629, + "grad_norm": 0.119140625, + "learning_rate": 0.0007280063574053839, + "loss": 0.6431, + "step": 18120 + }, + { + "epoch": 0.9004668719578822, + "grad_norm": 0.1474609375, + "learning_rate": 0.0007279666236217345, + "loss": 0.6197, + "step": 18130 + }, + { + "epoch": 0.9009635442535016, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007279268898380849, + "loss": 0.6198, + "step": 18140 + }, + { + "epoch": 0.9014602165491209, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007278871560544353, + "loss": 0.6532, + "step": 18150 + }, + { + "epoch": 0.9019568888447402, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007278474222707858, + "loss": 0.6735, + "step": 18160 + }, + { + "epoch": 0.9024535611403596, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007278076884871361, + "loss": 0.6289, + "step": 18170 + }, + { + "epoch": 0.9029502334359789, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007277679547034867, + "loss": 0.6466, + "step": 18180 + }, + { + "epoch": 0.9034469057315982, + "grad_norm": 0.109375, + "learning_rate": 0.0007277282209198372, + "loss": 0.6393, + "step": 18190 + }, + { + "epoch": 0.9039435780272176, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007276884871361875, + "loss": 0.6556, + "step": 18200 + }, + { + "epoch": 0.904440250322837, + "grad_norm": 0.1005859375, + "learning_rate": 0.000727648753352538, + "loss": 0.6643, + "step": 18210 + }, + { + "epoch": 0.9049369226184564, + "grad_norm": 0.1513671875, + "learning_rate": 0.0007276090195688885, + "loss": 0.6401, + "step": 18220 + }, + { + "epoch": 0.9054335949140757, + "grad_norm": 0.1455078125, + "learning_rate": 0.000727569285785239, + "loss": 0.6655, + "step": 18230 + }, + { + "epoch": 0.905930267209695, + "grad_norm": 0.11328125, + "learning_rate": 0.0007275295520015894, + "loss": 0.6161, + "step": 18240 + }, + { + "epoch": 0.9064269395053144, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007274898182179398, + "loss": 0.6177, + "step": 18250 + }, + { + "epoch": 0.9069236118009337, + "grad_norm": 0.1484375, + "learning_rate": 0.0007274500844342903, + "loss": 0.6307, + "step": 18260 + }, + { + "epoch": 0.9074202840965531, + "grad_norm": 0.11181640625, + "learning_rate": 0.0007274103506506407, + "loss": 0.6363, + "step": 18270 + }, + { + "epoch": 0.9079169563921724, + "grad_norm": 0.1533203125, + "learning_rate": 0.0007273706168669912, + "loss": 0.6561, + "step": 18280 + }, + { + "epoch": 0.9084136286877917, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007273308830833417, + "loss": 0.6347, + "step": 18290 + }, + { + "epoch": 0.9089103009834112, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007272911492996921, + "loss": 0.6362, + "step": 18300 + }, + { + "epoch": 0.9094069732790305, + "grad_norm": 0.10546875, + "learning_rate": 0.0007272514155160425, + "loss": 0.6456, + "step": 18310 + }, + { + "epoch": 0.9099036455746499, + "grad_norm": 0.1025390625, + "learning_rate": 0.000727211681732393, + "loss": 0.6526, + "step": 18320 + }, + { + "epoch": 0.9104003178702692, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007271719479487435, + "loss": 0.6516, + "step": 18330 + }, + { + "epoch": 0.9108969901658885, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007271322141650939, + "loss": 0.6862, + "step": 18340 + }, + { + "epoch": 0.9113936624615079, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007270924803814444, + "loss": 0.6343, + "step": 18350 + }, + { + "epoch": 0.9118903347571272, + "grad_norm": 0.1015625, + "learning_rate": 0.0007270527465977948, + "loss": 0.6607, + "step": 18360 + }, + { + "epoch": 0.9123870070527466, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007270130128141452, + "loss": 0.6355, + "step": 18370 + }, + { + "epoch": 0.9128836793483659, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007269732790304958, + "loss": 0.6391, + "step": 18380 + }, + { + "epoch": 0.9133803516439853, + "grad_norm": 0.10205078125, + "learning_rate": 0.0007269335452468462, + "loss": 0.638, + "step": 18390 + }, + { + "epoch": 0.9138770239396047, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007268938114631966, + "loss": 0.6353, + "step": 18400 + }, + { + "epoch": 0.914373696235224, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007268540776795471, + "loss": 0.6596, + "step": 18410 + }, + { + "epoch": 0.9148703685308434, + "grad_norm": 0.115234375, + "learning_rate": 0.0007268143438958975, + "loss": 0.6434, + "step": 18420 + }, + { + "epoch": 0.9153670408264627, + "grad_norm": 0.126953125, + "learning_rate": 0.000726774610112248, + "loss": 0.6595, + "step": 18430 + }, + { + "epoch": 0.915863713122082, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007267348763285984, + "loss": 0.6257, + "step": 18440 + }, + { + "epoch": 0.9163603854177014, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007266951425449489, + "loss": 0.6293, + "step": 18450 + }, + { + "epoch": 0.9168570577133207, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007266554087612993, + "loss": 0.6326, + "step": 18460 + }, + { + "epoch": 0.91735373000894, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007266156749776497, + "loss": 0.6738, + "step": 18470 + }, + { + "epoch": 0.9178504023045595, + "grad_norm": 0.1015625, + "learning_rate": 0.0007265759411940003, + "loss": 0.64, + "step": 18480 + }, + { + "epoch": 0.9183470746001788, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007265362074103507, + "loss": 0.6349, + "step": 18490 + }, + { + "epoch": 0.9188437468957982, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007264964736267011, + "loss": 0.6403, + "step": 18500 + }, + { + "epoch": 0.9193404191914175, + "grad_norm": 0.10107421875, + "learning_rate": 0.0007264567398430516, + "loss": 0.666, + "step": 18510 + }, + { + "epoch": 0.9198370914870369, + "grad_norm": 0.11767578125, + "learning_rate": 0.000726417006059402, + "loss": 0.642, + "step": 18520 + }, + { + "epoch": 0.9203337637826562, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007263772722757525, + "loss": 0.6183, + "step": 18530 + }, + { + "epoch": 0.9208304360782755, + "grad_norm": 0.0986328125, + "learning_rate": 0.000726337538492103, + "loss": 0.6488, + "step": 18540 + }, + { + "epoch": 0.9213271083738949, + "grad_norm": 0.11572265625, + "learning_rate": 0.0007262978047084534, + "loss": 0.6425, + "step": 18550 + }, + { + "epoch": 0.9218237806695142, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007262580709248038, + "loss": 0.6517, + "step": 18560 + }, + { + "epoch": 0.9223204529651337, + "grad_norm": 0.146484375, + "learning_rate": 0.0007262183371411543, + "loss": 0.6648, + "step": 18570 + }, + { + "epoch": 0.922817125260753, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007261786033575048, + "loss": 0.6293, + "step": 18580 + }, + { + "epoch": 0.9233137975563723, + "grad_norm": 0.138671875, + "learning_rate": 0.0007261388695738552, + "loss": 0.6352, + "step": 18590 + }, + { + "epoch": 0.9238104698519917, + "grad_norm": 0.103515625, + "learning_rate": 0.0007260991357902057, + "loss": 0.6556, + "step": 18600 + }, + { + "epoch": 0.924307142147611, + "grad_norm": 0.09423828125, + "learning_rate": 0.0007260594020065561, + "loss": 0.6334, + "step": 18610 + }, + { + "epoch": 0.9248038144432303, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007260196682229065, + "loss": 0.6108, + "step": 18620 + }, + { + "epoch": 0.9253004867388497, + "grad_norm": 0.10205078125, + "learning_rate": 0.000725979934439257, + "loss": 0.6333, + "step": 18630 + }, + { + "epoch": 0.925797159034469, + "grad_norm": 0.11328125, + "learning_rate": 0.0007259402006556075, + "loss": 0.6463, + "step": 18640 + }, + { + "epoch": 0.9262938313300884, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007259004668719579, + "loss": 0.6333, + "step": 18650 + }, + { + "epoch": 0.9267905036257078, + "grad_norm": 0.08837890625, + "learning_rate": 0.0007258607330883083, + "loss": 0.676, + "step": 18660 + }, + { + "epoch": 0.9272871759213271, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007258209993046588, + "loss": 0.6571, + "step": 18670 + }, + { + "epoch": 0.9277838482169465, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007257812655210094, + "loss": 0.6544, + "step": 18680 + }, + { + "epoch": 0.9282805205125658, + "grad_norm": 0.10693359375, + "learning_rate": 0.0007257415317373597, + "loss": 0.6293, + "step": 18690 + }, + { + "epoch": 0.9287771928081852, + "grad_norm": 0.1015625, + "learning_rate": 0.0007257017979537102, + "loss": 0.6482, + "step": 18700 + }, + { + "epoch": 0.9292738651038045, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007256620641700606, + "loss": 0.6875, + "step": 18710 + }, + { + "epoch": 0.9297705373994238, + "grad_norm": 0.11181640625, + "learning_rate": 0.000725622330386411, + "loss": 0.6797, + "step": 18720 + }, + { + "epoch": 0.9302672096950432, + "grad_norm": 0.09130859375, + "learning_rate": 0.0007255825966027616, + "loss": 0.6506, + "step": 18730 + }, + { + "epoch": 0.9307638819906625, + "grad_norm": 0.11279296875, + "learning_rate": 0.000725542862819112, + "loss": 0.6089, + "step": 18740 + }, + { + "epoch": 0.931260554286282, + "grad_norm": 0.1708984375, + "learning_rate": 0.0007255031290354624, + "loss": 0.6626, + "step": 18750 + }, + { + "epoch": 0.9317572265819013, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007254633952518129, + "loss": 0.6508, + "step": 18760 + }, + { + "epoch": 0.9322538988775206, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007254236614681633, + "loss": 0.6255, + "step": 18770 + }, + { + "epoch": 0.93275057117314, + "grad_norm": 0.10205078125, + "learning_rate": 0.0007253839276845138, + "loss": 0.6643, + "step": 18780 + }, + { + "epoch": 0.9332472434687593, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007253441939008643, + "loss": 0.6427, + "step": 18790 + }, + { + "epoch": 0.9337439157643787, + "grad_norm": 0.1416015625, + "learning_rate": 0.0007253044601172147, + "loss": 0.641, + "step": 18800 + }, + { + "epoch": 0.934240588059998, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007252647263335652, + "loss": 0.6462, + "step": 18810 + }, + { + "epoch": 0.9347372603556173, + "grad_norm": 0.09765625, + "learning_rate": 0.0007252249925499155, + "loss": 0.6797, + "step": 18820 + }, + { + "epoch": 0.9352339326512367, + "grad_norm": 0.12451171875, + "learning_rate": 0.000725185258766266, + "loss": 0.6372, + "step": 18830 + }, + { + "epoch": 0.9357306049468561, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007251455249826166, + "loss": 0.6683, + "step": 18840 + }, + { + "epoch": 0.9362272772424755, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007251057911989669, + "loss": 0.6337, + "step": 18850 + }, + { + "epoch": 0.9367239495380948, + "grad_norm": 0.103515625, + "learning_rate": 0.0007250660574153174, + "loss": 0.6449, + "step": 18860 + }, + { + "epoch": 0.9372206218337141, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007250263236316679, + "loss": 0.6515, + "step": 18870 + }, + { + "epoch": 0.9377172941293335, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007249865898480182, + "loss": 0.6232, + "step": 18880 + }, + { + "epoch": 0.9382139664249528, + "grad_norm": 0.11328125, + "learning_rate": 0.0007249468560643688, + "loss": 0.6598, + "step": 18890 + }, + { + "epoch": 0.9387106387205721, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007249071222807192, + "loss": 0.6842, + "step": 18900 + }, + { + "epoch": 0.9392073110161915, + "grad_norm": 0.111328125, + "learning_rate": 0.0007248673884970696, + "loss": 0.6436, + "step": 18910 + }, + { + "epoch": 0.9397039833118108, + "grad_norm": 0.12353515625, + "learning_rate": 0.0007248276547134201, + "loss": 0.6466, + "step": 18920 + }, + { + "epoch": 0.9402006556074303, + "grad_norm": 0.10693359375, + "learning_rate": 0.0007247879209297706, + "loss": 0.6312, + "step": 18930 + }, + { + "epoch": 0.9406973279030496, + "grad_norm": 0.10302734375, + "learning_rate": 0.000724748187146121, + "loss": 0.676, + "step": 18940 + }, + { + "epoch": 0.941194000198669, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007247084533624715, + "loss": 0.6399, + "step": 18950 + }, + { + "epoch": 0.9416906724942883, + "grad_norm": 0.10546875, + "learning_rate": 0.0007246687195788219, + "loss": 0.6462, + "step": 18960 + }, + { + "epoch": 0.9421873447899076, + "grad_norm": 0.1328125, + "learning_rate": 0.0007246289857951724, + "loss": 0.6322, + "step": 18970 + }, + { + "epoch": 0.942684017085527, + "grad_norm": 0.095703125, + "learning_rate": 0.0007245892520115229, + "loss": 0.6487, + "step": 18980 + }, + { + "epoch": 0.9431806893811463, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007245495182278733, + "loss": 0.6321, + "step": 18990 + }, + { + "epoch": 0.9436773616767656, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007245097844442238, + "loss": 0.6716, + "step": 19000 + }, + { + "epoch": 0.944174033972385, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007244700506605742, + "loss": 0.6537, + "step": 19010 + }, + { + "epoch": 0.9446707062680044, + "grad_norm": 0.1171875, + "learning_rate": 0.0007244303168769246, + "loss": 0.6177, + "step": 19020 + }, + { + "epoch": 0.9451673785636238, + "grad_norm": 0.146484375, + "learning_rate": 0.0007243905830932752, + "loss": 0.6269, + "step": 19030 + }, + { + "epoch": 0.9456640508592431, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007243508493096255, + "loss": 0.6405, + "step": 19040 + }, + { + "epoch": 0.9461607231548624, + "grad_norm": 0.10791015625, + "learning_rate": 0.000724311115525976, + "loss": 0.6169, + "step": 19050 + }, + { + "epoch": 0.9466573954504818, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007242713817423265, + "loss": 0.6676, + "step": 19060 + }, + { + "epoch": 0.9471540677461011, + "grad_norm": 0.09619140625, + "learning_rate": 0.0007242316479586768, + "loss": 0.6352, + "step": 19070 + }, + { + "epoch": 0.9476507400417205, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007241919141750274, + "loss": 0.6393, + "step": 19080 + }, + { + "epoch": 0.9481474123373398, + "grad_norm": 0.09423828125, + "learning_rate": 0.0007241521803913778, + "loss": 0.6591, + "step": 19090 + }, + { + "epoch": 0.9486440846329591, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007241124466077282, + "loss": 0.6649, + "step": 19100 + }, + { + "epoch": 0.9491407569285786, + "grad_norm": 0.123046875, + "learning_rate": 0.0007240727128240787, + "loss": 0.6774, + "step": 19110 + }, + { + "epoch": 0.9496374292241979, + "grad_norm": 0.09619140625, + "learning_rate": 0.0007240329790404291, + "loss": 0.6421, + "step": 19120 + }, + { + "epoch": 0.9501341015198173, + "grad_norm": 0.1630859375, + "learning_rate": 0.0007239932452567797, + "loss": 0.6432, + "step": 19130 + }, + { + "epoch": 0.9506307738154366, + "grad_norm": 0.150390625, + "learning_rate": 0.0007239535114731301, + "loss": 0.6313, + "step": 19140 + }, + { + "epoch": 0.9511274461110559, + "grad_norm": 0.11181640625, + "learning_rate": 0.0007239137776894805, + "loss": 0.6938, + "step": 19150 + }, + { + "epoch": 0.9516241184066753, + "grad_norm": 0.1572265625, + "learning_rate": 0.000723874043905831, + "loss": 0.6474, + "step": 19160 + }, + { + "epoch": 0.9521207907022946, + "grad_norm": 0.125, + "learning_rate": 0.0007238343101221814, + "loss": 0.66, + "step": 19170 + }, + { + "epoch": 0.9526174629979139, + "grad_norm": 0.1357421875, + "learning_rate": 0.0007237945763385318, + "loss": 0.6707, + "step": 19180 + }, + { + "epoch": 0.9531141352935333, + "grad_norm": 0.12353515625, + "learning_rate": 0.0007237548425548824, + "loss": 0.6322, + "step": 19190 + }, + { + "epoch": 0.9536108075891527, + "grad_norm": 0.09130859375, + "learning_rate": 0.0007237151087712328, + "loss": 0.6423, + "step": 19200 + }, + { + "epoch": 0.9541074798847721, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007236753749875832, + "loss": 0.6438, + "step": 19210 + }, + { + "epoch": 0.9546041521803914, + "grad_norm": 0.154296875, + "learning_rate": 0.0007236356412039337, + "loss": 0.65, + "step": 19220 + }, + { + "epoch": 0.9551008244760107, + "grad_norm": 0.099609375, + "learning_rate": 0.000723595907420284, + "loss": 0.6024, + "step": 19230 + }, + { + "epoch": 0.9555974967716301, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007235561736366346, + "loss": 0.6478, + "step": 19240 + }, + { + "epoch": 0.9560941690672494, + "grad_norm": 0.107421875, + "learning_rate": 0.0007235164398529851, + "loss": 0.6551, + "step": 19250 + }, + { + "epoch": 0.9565908413628688, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007234767060693355, + "loss": 0.6244, + "step": 19260 + }, + { + "epoch": 0.9570875136584881, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007234369722856859, + "loss": 0.6126, + "step": 19270 + }, + { + "epoch": 0.9575841859541074, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007233972385020365, + "loss": 0.6673, + "step": 19280 + }, + { + "epoch": 0.9580808582497269, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007233575047183869, + "loss": 0.6338, + "step": 19290 + }, + { + "epoch": 0.9585775305453462, + "grad_norm": 0.140625, + "learning_rate": 0.0007233177709347373, + "loss": 0.6413, + "step": 19300 + }, + { + "epoch": 0.9590742028409656, + "grad_norm": 0.1015625, + "learning_rate": 0.0007232780371510877, + "loss": 0.6325, + "step": 19310 + }, + { + "epoch": 0.9595708751365849, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007232383033674382, + "loss": 0.6368, + "step": 19320 + }, + { + "epoch": 0.9600675474322042, + "grad_norm": 0.1015625, + "learning_rate": 0.0007231985695837886, + "loss": 0.6868, + "step": 19330 + }, + { + "epoch": 0.9605642197278236, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007231588358001391, + "loss": 0.6554, + "step": 19340 + }, + { + "epoch": 0.9610608920234429, + "grad_norm": 0.109375, + "learning_rate": 0.0007231191020164896, + "loss": 0.6511, + "step": 19350 + }, + { + "epoch": 0.9615575643190623, + "grad_norm": 0.1015625, + "learning_rate": 0.00072307936823284, + "loss": 0.601, + "step": 19360 + }, + { + "epoch": 0.9620542366146816, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007230396344491904, + "loss": 0.6424, + "step": 19370 + }, + { + "epoch": 0.9625509089103009, + "grad_norm": 0.126953125, + "learning_rate": 0.000722999900665541, + "loss": 0.6213, + "step": 19380 + }, + { + "epoch": 0.9630475812059204, + "grad_norm": 0.10205078125, + "learning_rate": 0.0007229601668818914, + "loss": 0.6322, + "step": 19390 + }, + { + "epoch": 0.9635442535015397, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007229204330982418, + "loss": 0.639, + "step": 19400 + }, + { + "epoch": 0.964040925797159, + "grad_norm": 0.09814453125, + "learning_rate": 0.0007228806993145923, + "loss": 0.6287, + "step": 19410 + }, + { + "epoch": 0.9645375980927784, + "grad_norm": 0.12060546875, + "learning_rate": 0.0007228409655309427, + "loss": 0.6506, + "step": 19420 + }, + { + "epoch": 0.9650342703883977, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007228012317472931, + "loss": 0.6512, + "step": 19430 + }, + { + "epoch": 0.9655309426840171, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007227614979636437, + "loss": 0.6371, + "step": 19440 + }, + { + "epoch": 0.9660276149796364, + "grad_norm": 0.1240234375, + "learning_rate": 0.0007227217641799941, + "loss": 0.6314, + "step": 19450 + }, + { + "epoch": 0.9665242872752557, + "grad_norm": 0.125, + "learning_rate": 0.0007226820303963445, + "loss": 0.633, + "step": 19460 + }, + { + "epoch": 0.9670209595708751, + "grad_norm": 0.09716796875, + "learning_rate": 0.000722642296612695, + "loss": 0.6049, + "step": 19470 + }, + { + "epoch": 0.9675176318664945, + "grad_norm": 0.123046875, + "learning_rate": 0.0007226025628290454, + "loss": 0.6461, + "step": 19480 + }, + { + "epoch": 0.9680143041621139, + "grad_norm": 0.11572265625, + "learning_rate": 0.0007225628290453959, + "loss": 0.6392, + "step": 19490 + }, + { + "epoch": 0.9685109764577332, + "grad_norm": 0.109375, + "learning_rate": 0.0007225230952617463, + "loss": 0.6055, + "step": 19500 + }, + { + "epoch": 0.9690076487533525, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007224833614780968, + "loss": 0.6556, + "step": 19510 + }, + { + "epoch": 0.9695043210489719, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007224436276944472, + "loss": 0.6339, + "step": 19520 + }, + { + "epoch": 0.9700009933445912, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007224038939107976, + "loss": 0.655, + "step": 19530 + }, + { + "epoch": 0.9704976656402106, + "grad_norm": 0.1708984375, + "learning_rate": 0.0007223641601271482, + "loss": 0.6511, + "step": 19540 + }, + { + "epoch": 0.9709943379358299, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007223244263434986, + "loss": 0.6243, + "step": 19550 + }, + { + "epoch": 0.9714910102314492, + "grad_norm": 0.10302734375, + "learning_rate": 0.000722284692559849, + "loss": 0.6578, + "step": 19560 + }, + { + "epoch": 0.9719876825270687, + "grad_norm": 0.103515625, + "learning_rate": 0.0007222449587761995, + "loss": 0.6603, + "step": 19570 + }, + { + "epoch": 0.972484354822688, + "grad_norm": 0.1171875, + "learning_rate": 0.0007222052249925499, + "loss": 0.6494, + "step": 19580 + }, + { + "epoch": 0.9729810271183074, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007221654912089004, + "loss": 0.6439, + "step": 19590 + }, + { + "epoch": 0.9734776994139267, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007221257574252509, + "loss": 0.6378, + "step": 19600 + }, + { + "epoch": 0.973974371709546, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007220860236416013, + "loss": 0.6525, + "step": 19610 + }, + { + "epoch": 0.9744710440051654, + "grad_norm": 0.1318359375, + "learning_rate": 0.0007220462898579517, + "loss": 0.6577, + "step": 19620 + }, + { + "epoch": 0.9749677163007847, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007220065560743022, + "loss": 0.6399, + "step": 19630 + }, + { + "epoch": 0.975464388596404, + "grad_norm": 0.12890625, + "learning_rate": 0.0007219668222906527, + "loss": 0.6335, + "step": 19640 + }, + { + "epoch": 0.9759610608920234, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007219270885070031, + "loss": 0.6269, + "step": 19650 + }, + { + "epoch": 0.9764577331876428, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007218873547233536, + "loss": 0.653, + "step": 19660 + }, + { + "epoch": 0.9769544054832622, + "grad_norm": 0.107421875, + "learning_rate": 0.000721847620939704, + "loss": 0.6416, + "step": 19670 + }, + { + "epoch": 0.9774510777788815, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007218078871560544, + "loss": 0.6566, + "step": 19680 + }, + { + "epoch": 0.9779477500745009, + "grad_norm": 0.1416015625, + "learning_rate": 0.0007217681533724049, + "loss": 0.6215, + "step": 19690 + }, + { + "epoch": 0.9784444223701202, + "grad_norm": 0.11865234375, + "learning_rate": 0.0007217284195887554, + "loss": 0.6143, + "step": 19700 + }, + { + "epoch": 0.9789410946657395, + "grad_norm": 0.11181640625, + "learning_rate": 0.0007216886858051059, + "loss": 0.6546, + "step": 19710 + }, + { + "epoch": 0.9794377669613589, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007216489520214562, + "loss": 0.6309, + "step": 19720 + }, + { + "epoch": 0.9799344392569782, + "grad_norm": 0.1181640625, + "learning_rate": 0.0007216092182378067, + "loss": 0.6613, + "step": 19730 + }, + { + "epoch": 0.9804311115525975, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007215694844541573, + "loss": 0.634, + "step": 19740 + }, + { + "epoch": 0.980927783848217, + "grad_norm": 0.1240234375, + "learning_rate": 0.0007215297506705076, + "loss": 0.6508, + "step": 19750 + }, + { + "epoch": 0.9814244561438363, + "grad_norm": 0.12060546875, + "learning_rate": 0.0007214900168868581, + "loss": 0.6455, + "step": 19760 + }, + { + "epoch": 0.9819211284394557, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007214502831032085, + "loss": 0.6486, + "step": 19770 + }, + { + "epoch": 0.982417800735075, + "grad_norm": 0.12060546875, + "learning_rate": 0.0007214105493195589, + "loss": 0.6511, + "step": 19780 + }, + { + "epoch": 0.9829144730306943, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007213708155359095, + "loss": 0.6744, + "step": 19790 + }, + { + "epoch": 0.9834111453263137, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007213310817522599, + "loss": 0.6283, + "step": 19800 + }, + { + "epoch": 0.983907817621933, + "grad_norm": 0.15625, + "learning_rate": 0.0007212913479686103, + "loss": 0.666, + "step": 19810 + }, + { + "epoch": 0.9844044899175524, + "grad_norm": 0.12890625, + "learning_rate": 0.0007212516141849608, + "loss": 0.6695, + "step": 19820 + }, + { + "epoch": 0.9849011622131717, + "grad_norm": 0.109375, + "learning_rate": 0.0007212118804013112, + "loss": 0.6422, + "step": 19830 + }, + { + "epoch": 0.9853978345087911, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007211721466176617, + "loss": 0.6549, + "step": 19840 + }, + { + "epoch": 0.9858945068044105, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007211324128340122, + "loss": 0.6556, + "step": 19850 + }, + { + "epoch": 0.9863911791000298, + "grad_norm": 0.1171875, + "learning_rate": 0.0007210926790503626, + "loss": 0.645, + "step": 19860 + }, + { + "epoch": 0.9868878513956492, + "grad_norm": 0.130859375, + "learning_rate": 0.0007210529452667131, + "loss": 0.621, + "step": 19870 + }, + { + "epoch": 0.9873845236912685, + "grad_norm": 0.09716796875, + "learning_rate": 0.0007210132114830635, + "loss": 0.6638, + "step": 19880 + }, + { + "epoch": 0.9878811959868878, + "grad_norm": 0.130859375, + "learning_rate": 0.000720973477699414, + "loss": 0.6561, + "step": 19890 + }, + { + "epoch": 0.9883778682825072, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007209337439157645, + "loss": 0.6335, + "step": 19900 + }, + { + "epoch": 0.9888745405781265, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007208940101321148, + "loss": 0.6598, + "step": 19910 + }, + { + "epoch": 0.9893712128737459, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007208542763484653, + "loss": 0.6381, + "step": 19920 + }, + { + "epoch": 0.9898678851693653, + "grad_norm": 0.1396484375, + "learning_rate": 0.0007208145425648158, + "loss": 0.6276, + "step": 19930 + }, + { + "epoch": 0.9903645574649846, + "grad_norm": 0.1171875, + "learning_rate": 0.0007207748087811662, + "loss": 0.6665, + "step": 19940 + }, + { + "epoch": 0.990861229760604, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007207350749975167, + "loss": 0.6354, + "step": 19950 + }, + { + "epoch": 0.9913579020562233, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007206953412138671, + "loss": 0.6357, + "step": 19960 + }, + { + "epoch": 0.9918545743518427, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007206556074302175, + "loss": 0.6177, + "step": 19970 + }, + { + "epoch": 0.992351246647462, + "grad_norm": 0.107421875, + "learning_rate": 0.000720615873646568, + "loss": 0.6346, + "step": 19980 + }, + { + "epoch": 0.9928479189430813, + "grad_norm": 0.12255859375, + "learning_rate": 0.0007205761398629185, + "loss": 0.6313, + "step": 19990 + }, + { + "epoch": 0.9933445912387007, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007205364060792689, + "loss": 0.6217, + "step": 20000 + }, + { + "epoch": 0.99384126353432, + "grad_norm": 0.134765625, + "learning_rate": 0.0007204966722956194, + "loss": 0.6423, + "step": 20010 + }, + { + "epoch": 0.9943379358299395, + "grad_norm": 0.10205078125, + "learning_rate": 0.0007204569385119698, + "loss": 0.6366, + "step": 20020 + }, + { + "epoch": 0.9948346081255588, + "grad_norm": 0.1376953125, + "learning_rate": 0.0007204172047283203, + "loss": 0.6465, + "step": 20030 + }, + { + "epoch": 0.9953312804211781, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007203774709446708, + "loss": 0.6485, + "step": 20040 + }, + { + "epoch": 0.9958279527167975, + "grad_norm": 0.09619140625, + "learning_rate": 0.0007203377371610212, + "loss": 0.6753, + "step": 20050 + }, + { + "epoch": 0.9963246250124168, + "grad_norm": 0.09326171875, + "learning_rate": 0.0007202980033773717, + "loss": 0.6464, + "step": 20060 + }, + { + "epoch": 0.9968212973080361, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007202582695937221, + "loss": 0.6352, + "step": 20070 + }, + { + "epoch": 0.9973179696036555, + "grad_norm": 0.125, + "learning_rate": 0.0007202185358100725, + "loss": 0.6392, + "step": 20080 + }, + { + "epoch": 0.9978146418992748, + "grad_norm": 0.095703125, + "learning_rate": 0.0007201788020264231, + "loss": 0.6418, + "step": 20090 + }, + { + "epoch": 0.9983113141948942, + "grad_norm": 0.095703125, + "learning_rate": 0.0007201390682427734, + "loss": 0.6289, + "step": 20100 + }, + { + "epoch": 0.9988079864905136, + "grad_norm": 0.09814453125, + "learning_rate": 0.0007200993344591239, + "loss": 0.6458, + "step": 20110 + }, + { + "epoch": 0.999304658786133, + "grad_norm": 0.109375, + "learning_rate": 0.0007200596006754744, + "loss": 0.6511, + "step": 20120 + }, + { + "epoch": 0.9998013310817523, + "grad_norm": 0.11328125, + "learning_rate": 0.0007200198668918247, + "loss": 0.6132, + "step": 20130 + }, + { + "epoch": 1.0002980033773716, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007199801331081753, + "loss": 0.6431, + "step": 20140 + }, + { + "epoch": 1.0007946756729909, + "grad_norm": 0.11328125, + "learning_rate": 0.0007199403993245258, + "loss": 0.6414, + "step": 20150 + }, + { + "epoch": 1.0012913479686103, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007199006655408762, + "loss": 0.6431, + "step": 20160 + }, + { + "epoch": 1.0017880202642298, + "grad_norm": 0.09228515625, + "learning_rate": 0.0007198609317572266, + "loss": 0.6413, + "step": 20170 + }, + { + "epoch": 1.002284692559849, + "grad_norm": 0.0966796875, + "learning_rate": 0.000719821197973577, + "loss": 0.6061, + "step": 20180 + }, + { + "epoch": 1.0027813648554684, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007197814641899276, + "loss": 0.6099, + "step": 20190 + }, + { + "epoch": 1.0032780371510877, + "grad_norm": 0.11474609375, + "learning_rate": 0.000719741730406278, + "loss": 0.6247, + "step": 20200 + }, + { + "epoch": 1.003774709446707, + "grad_norm": 0.171875, + "learning_rate": 0.0007197019966226284, + "loss": 0.6223, + "step": 20210 + }, + { + "epoch": 1.0042713817423263, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007196622628389789, + "loss": 0.6178, + "step": 20220 + }, + { + "epoch": 1.0047680540379458, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007196225290553293, + "loss": 0.6223, + "step": 20230 + }, + { + "epoch": 1.005264726333565, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007195827952716798, + "loss": 0.6333, + "step": 20240 + }, + { + "epoch": 1.0057613986291845, + "grad_norm": 0.103515625, + "learning_rate": 0.0007195430614880303, + "loss": 0.6397, + "step": 20250 + }, + { + "epoch": 1.006258070924804, + "grad_norm": 0.109375, + "learning_rate": 0.0007195033277043807, + "loss": 0.6309, + "step": 20260 + }, + { + "epoch": 1.0067547432204231, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007194635939207311, + "loss": 0.6561, + "step": 20270 + }, + { + "epoch": 1.0072514155160426, + "grad_norm": 0.1357421875, + "learning_rate": 0.0007194238601370816, + "loss": 0.6394, + "step": 20280 + }, + { + "epoch": 1.0077480878116618, + "grad_norm": 0.1435546875, + "learning_rate": 0.0007193841263534319, + "loss": 0.6141, + "step": 20290 + }, + { + "epoch": 1.0082447601072813, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007193443925697825, + "loss": 0.6236, + "step": 20300 + }, + { + "epoch": 1.0087414324029005, + "grad_norm": 0.09716796875, + "learning_rate": 0.000719304658786133, + "loss": 0.6215, + "step": 20310 + }, + { + "epoch": 1.00923810469852, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007192649250024834, + "loss": 0.5914, + "step": 20320 + }, + { + "epoch": 1.0097347769941392, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007192251912188338, + "loss": 0.6805, + "step": 20330 + }, + { + "epoch": 1.0102314492897586, + "grad_norm": 0.09228515625, + "learning_rate": 0.0007191854574351844, + "loss": 0.6498, + "step": 20340 + }, + { + "epoch": 1.010728121585378, + "grad_norm": 0.1171875, + "learning_rate": 0.0007191457236515348, + "loss": 0.6419, + "step": 20350 + }, + { + "epoch": 1.0112247938809973, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007191059898678852, + "loss": 0.614, + "step": 20360 + }, + { + "epoch": 1.0117214661766167, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007190662560842356, + "loss": 0.6385, + "step": 20370 + }, + { + "epoch": 1.012218138472236, + "grad_norm": 0.142578125, + "learning_rate": 0.0007190265223005861, + "loss": 0.6472, + "step": 20380 + }, + { + "epoch": 1.0127148107678554, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007189867885169365, + "loss": 0.6339, + "step": 20390 + }, + { + "epoch": 1.0132114830634746, + "grad_norm": 0.1572265625, + "learning_rate": 0.000718947054733287, + "loss": 0.629, + "step": 20400 + }, + { + "epoch": 1.013708155359094, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007189073209496375, + "loss": 0.6393, + "step": 20410 + }, + { + "epoch": 1.0142048276547133, + "grad_norm": 0.09521484375, + "learning_rate": 0.0007188675871659879, + "loss": 0.615, + "step": 20420 + }, + { + "epoch": 1.0147014999503328, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007188278533823383, + "loss": 0.6203, + "step": 20430 + }, + { + "epoch": 1.0151981722459522, + "grad_norm": 0.142578125, + "learning_rate": 0.0007187881195986889, + "loss": 0.6366, + "step": 20440 + }, + { + "epoch": 1.0156948445415714, + "grad_norm": 0.12060546875, + "learning_rate": 0.0007187483858150393, + "loss": 0.6522, + "step": 20450 + }, + { + "epoch": 1.016191516837191, + "grad_norm": 0.12353515625, + "learning_rate": 0.0007187086520313897, + "loss": 0.6664, + "step": 20460 + }, + { + "epoch": 1.0166881891328101, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007186689182477402, + "loss": 0.6364, + "step": 20470 + }, + { + "epoch": 1.0171848614284296, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007186291844640906, + "loss": 0.6269, + "step": 20480 + }, + { + "epoch": 1.0176815337240488, + "grad_norm": 0.1103515625, + "learning_rate": 0.000718589450680441, + "loss": 0.6337, + "step": 20490 + }, + { + "epoch": 1.0181782060196682, + "grad_norm": 0.146484375, + "learning_rate": 0.0007185497168967916, + "loss": 0.6397, + "step": 20500 + }, + { + "epoch": 1.0186748783152875, + "grad_norm": 0.09716796875, + "learning_rate": 0.000718509983113142, + "loss": 0.6124, + "step": 20510 + }, + { + "epoch": 1.019171550610907, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007184702493294924, + "loss": 0.6223, + "step": 20520 + }, + { + "epoch": 1.0196682229065264, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007184305155458429, + "loss": 0.6654, + "step": 20530 + }, + { + "epoch": 1.0201648952021456, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007183907817621933, + "loss": 0.6354, + "step": 20540 + }, + { + "epoch": 1.020661567497765, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007183510479785438, + "loss": 0.6546, + "step": 20550 + }, + { + "epoch": 1.0211582397933843, + "grad_norm": 0.1240234375, + "learning_rate": 0.0007183113141948942, + "loss": 0.6305, + "step": 20560 + }, + { + "epoch": 1.0216549120890037, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007182715804112447, + "loss": 0.6463, + "step": 20570 + }, + { + "epoch": 1.022151584384623, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007182318466275951, + "loss": 0.6449, + "step": 20580 + }, + { + "epoch": 1.0226482566802424, + "grad_norm": 0.09521484375, + "learning_rate": 0.0007181921128439455, + "loss": 0.6536, + "step": 20590 + }, + { + "epoch": 1.0231449289758616, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007181523790602961, + "loss": 0.6502, + "step": 20600 + }, + { + "epoch": 1.023641601271481, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007181126452766466, + "loss": 0.6262, + "step": 20610 + }, + { + "epoch": 1.0241382735671005, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007180729114929969, + "loss": 0.6113, + "step": 20620 + }, + { + "epoch": 1.0246349458627197, + "grad_norm": 0.1708984375, + "learning_rate": 0.0007180331777093474, + "loss": 0.6546, + "step": 20630 + }, + { + "epoch": 1.0251316181583392, + "grad_norm": 0.09228515625, + "learning_rate": 0.0007179934439256978, + "loss": 0.6563, + "step": 20640 + }, + { + "epoch": 1.0256282904539584, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007179537101420483, + "loss": 0.6274, + "step": 20650 + }, + { + "epoch": 1.0261249627495779, + "grad_norm": 0.1015625, + "learning_rate": 0.0007179139763583988, + "loss": 0.636, + "step": 20660 + }, + { + "epoch": 1.026621635045197, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007178742425747492, + "loss": 0.6469, + "step": 20670 + }, + { + "epoch": 1.0271183073408165, + "grad_norm": 0.109375, + "learning_rate": 0.0007178345087910996, + "loss": 0.6293, + "step": 20680 + }, + { + "epoch": 1.0276149796364358, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007177947750074501, + "loss": 0.6118, + "step": 20690 + }, + { + "epoch": 1.0281116519320552, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007177550412238006, + "loss": 0.6164, + "step": 20700 + }, + { + "epoch": 1.0286083242276747, + "grad_norm": 0.134765625, + "learning_rate": 0.000717715307440151, + "loss": 0.6503, + "step": 20710 + }, + { + "epoch": 1.029104996523294, + "grad_norm": 0.10546875, + "learning_rate": 0.0007176755736565015, + "loss": 0.6428, + "step": 20720 + }, + { + "epoch": 1.0296016688189134, + "grad_norm": 0.099609375, + "learning_rate": 0.0007176358398728519, + "loss": 0.6194, + "step": 20730 + }, + { + "epoch": 1.0300983411145326, + "grad_norm": 0.12109375, + "learning_rate": 0.0007175961060892023, + "loss": 0.6299, + "step": 20740 + }, + { + "epoch": 1.030595013410152, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007175563723055529, + "loss": 0.6277, + "step": 20750 + }, + { + "epoch": 1.0310916857057713, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007175166385219033, + "loss": 0.6155, + "step": 20760 + }, + { + "epoch": 1.0315883580013907, + "grad_norm": 0.1376953125, + "learning_rate": 0.0007174769047382538, + "loss": 0.6441, + "step": 20770 + }, + { + "epoch": 1.03208503029701, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007174371709546041, + "loss": 0.6151, + "step": 20780 + }, + { + "epoch": 1.0325817025926294, + "grad_norm": 0.099609375, + "learning_rate": 0.0007173974371709546, + "loss": 0.6207, + "step": 20790 + }, + { + "epoch": 1.0330783748882488, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007173577033873052, + "loss": 0.6286, + "step": 20800 + }, + { + "epoch": 1.033575047183868, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007173179696036555, + "loss": 0.661, + "step": 20810 + }, + { + "epoch": 1.0340717194794875, + "grad_norm": 0.1416015625, + "learning_rate": 0.000717278235820006, + "loss": 0.6232, + "step": 20820 + }, + { + "epoch": 1.0345683917751067, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007172385020363564, + "loss": 0.6403, + "step": 20830 + }, + { + "epoch": 1.0350650640707262, + "grad_norm": 0.126953125, + "learning_rate": 0.0007171987682527068, + "loss": 0.6231, + "step": 20840 + }, + { + "epoch": 1.0355617363663454, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007171590344690574, + "loss": 0.6646, + "step": 20850 + }, + { + "epoch": 1.0360584086619649, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007171193006854078, + "loss": 0.6139, + "step": 20860 + }, + { + "epoch": 1.036555080957584, + "grad_norm": 0.1328125, + "learning_rate": 0.0007170795669017582, + "loss": 0.6497, + "step": 20870 + }, + { + "epoch": 1.0370517532532035, + "grad_norm": 0.1015625, + "learning_rate": 0.0007170398331181087, + "loss": 0.644, + "step": 20880 + }, + { + "epoch": 1.037548425548823, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007170000993344591, + "loss": 0.6199, + "step": 20890 + }, + { + "epoch": 1.0380450978444422, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007169603655508097, + "loss": 0.6642, + "step": 20900 + }, + { + "epoch": 1.0385417701400617, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007169206317671601, + "loss": 0.6234, + "step": 20910 + }, + { + "epoch": 1.0390384424356809, + "grad_norm": 0.09716796875, + "learning_rate": 0.0007168808979835105, + "loss": 0.6432, + "step": 20920 + }, + { + "epoch": 1.0395351147313003, + "grad_norm": 0.0927734375, + "learning_rate": 0.000716841164199861, + "loss": 0.6174, + "step": 20930 + }, + { + "epoch": 1.0400317870269196, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007168014304162114, + "loss": 0.6398, + "step": 20940 + }, + { + "epoch": 1.040528459322539, + "grad_norm": 0.109375, + "learning_rate": 0.0007167616966325619, + "loss": 0.6372, + "step": 20950 + }, + { + "epoch": 1.0410251316181582, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007167219628489124, + "loss": 0.6382, + "step": 20960 + }, + { + "epoch": 1.0415218039137777, + "grad_norm": 0.126953125, + "learning_rate": 0.0007166822290652627, + "loss": 0.6185, + "step": 20970 + }, + { + "epoch": 1.0420184762093971, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007166424952816132, + "loss": 0.6185, + "step": 20980 + }, + { + "epoch": 1.0425151485050164, + "grad_norm": 0.146484375, + "learning_rate": 0.0007166027614979637, + "loss": 0.6616, + "step": 20990 + }, + { + "epoch": 1.0430118208006358, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007165630277143141, + "loss": 0.6378, + "step": 21000 + }, + { + "epoch": 1.043508493096255, + "grad_norm": 0.11328125, + "learning_rate": 0.0007165232939306646, + "loss": 0.6186, + "step": 21010 + }, + { + "epoch": 1.0440051653918745, + "grad_norm": 0.1240234375, + "learning_rate": 0.0007164835601470151, + "loss": 0.6414, + "step": 21020 + }, + { + "epoch": 1.0445018376874937, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007164438263633654, + "loss": 0.6409, + "step": 21030 + }, + { + "epoch": 1.0449985099831132, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007164040925797159, + "loss": 0.6121, + "step": 21040 + }, + { + "epoch": 1.0454951822787324, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007163643587960664, + "loss": 0.614, + "step": 21050 + }, + { + "epoch": 1.0459918545743518, + "grad_norm": 0.12451171875, + "learning_rate": 0.0007163246250124169, + "loss": 0.6208, + "step": 21060 + }, + { + "epoch": 1.0464885268699713, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007162848912287673, + "loss": 0.6057, + "step": 21070 + }, + { + "epoch": 1.0469851991655905, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007162451574451177, + "loss": 0.6317, + "step": 21080 + }, + { + "epoch": 1.04748187146121, + "grad_norm": 0.1630859375, + "learning_rate": 0.0007162054236614682, + "loss": 0.6259, + "step": 21090 + }, + { + "epoch": 1.0479785437568292, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007161656898778187, + "loss": 0.6358, + "step": 21100 + }, + { + "epoch": 1.0484752160524486, + "grad_norm": 0.126953125, + "learning_rate": 0.0007161259560941691, + "loss": 0.6219, + "step": 21110 + }, + { + "epoch": 1.0489718883480679, + "grad_norm": 0.11328125, + "learning_rate": 0.0007160862223105196, + "loss": 0.6467, + "step": 21120 + }, + { + "epoch": 1.0494685606436873, + "grad_norm": 0.10693359375, + "learning_rate": 0.00071604648852687, + "loss": 0.5926, + "step": 21130 + }, + { + "epoch": 1.0499652329393065, + "grad_norm": 0.15234375, + "learning_rate": 0.0007160067547432204, + "loss": 0.6026, + "step": 21140 + }, + { + "epoch": 1.050461905234926, + "grad_norm": 0.09814453125, + "learning_rate": 0.000715967020959571, + "loss": 0.6293, + "step": 21150 + }, + { + "epoch": 1.0509585775305454, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007159272871759213, + "loss": 0.6204, + "step": 21160 + }, + { + "epoch": 1.0514552498261647, + "grad_norm": 0.1513671875, + "learning_rate": 0.0007158875533922718, + "loss": 0.6413, + "step": 21170 + }, + { + "epoch": 1.0519519221217841, + "grad_norm": 0.095703125, + "learning_rate": 0.0007158478196086223, + "loss": 0.6339, + "step": 21180 + }, + { + "epoch": 1.0524485944174033, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007158080858249726, + "loss": 0.6545, + "step": 21190 + }, + { + "epoch": 1.0529452667130228, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007157683520413232, + "loss": 0.6232, + "step": 21200 + }, + { + "epoch": 1.053441939008642, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007157286182576737, + "loss": 0.6383, + "step": 21210 + }, + { + "epoch": 1.0539386113042615, + "grad_norm": 0.09814453125, + "learning_rate": 0.0007156888844740241, + "loss": 0.647, + "step": 21220 + }, + { + "epoch": 1.0544352835998807, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007156491506903745, + "loss": 0.6368, + "step": 21230 + }, + { + "epoch": 1.0549319558955002, + "grad_norm": 0.134765625, + "learning_rate": 0.0007156094169067249, + "loss": 0.637, + "step": 21240 + }, + { + "epoch": 1.0554286281911196, + "grad_norm": 0.111328125, + "learning_rate": 0.0007155696831230755, + "loss": 0.6766, + "step": 21250 + }, + { + "epoch": 1.0559253004867388, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007155299493394259, + "loss": 0.6468, + "step": 21260 + }, + { + "epoch": 1.0564219727823583, + "grad_norm": 0.107421875, + "learning_rate": 0.0007154902155557763, + "loss": 0.6206, + "step": 21270 + }, + { + "epoch": 1.0569186450779775, + "grad_norm": 0.0966796875, + "learning_rate": 0.0007154504817721268, + "loss": 0.6368, + "step": 21280 + }, + { + "epoch": 1.057415317373597, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007154107479884772, + "loss": 0.6367, + "step": 21290 + }, + { + "epoch": 1.0579119896692162, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007153710142048277, + "loss": 0.593, + "step": 21300 + }, + { + "epoch": 1.0584086619648356, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007153312804211782, + "loss": 0.6003, + "step": 21310 + }, + { + "epoch": 1.0589053342604549, + "grad_norm": 0.125, + "learning_rate": 0.0007152915466375286, + "loss": 0.6045, + "step": 21320 + }, + { + "epoch": 1.0594020065560743, + "grad_norm": 0.1044921875, + "learning_rate": 0.000715251812853879, + "loss": 0.6367, + "step": 21330 + }, + { + "epoch": 1.0598986788516938, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007152120790702295, + "loss": 0.616, + "step": 21340 + }, + { + "epoch": 1.060395351147313, + "grad_norm": 0.10693359375, + "learning_rate": 0.00071517234528658, + "loss": 0.5995, + "step": 21350 + }, + { + "epoch": 1.0608920234429324, + "grad_norm": 0.12890625, + "learning_rate": 0.0007151326115029304, + "loss": 0.6266, + "step": 21360 + }, + { + "epoch": 1.0613886957385517, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007150928777192809, + "loss": 0.6305, + "step": 21370 + }, + { + "epoch": 1.061885368034171, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007150531439356313, + "loss": 0.6097, + "step": 21380 + }, + { + "epoch": 1.0623820403297903, + "grad_norm": 0.1357421875, + "learning_rate": 0.0007150134101519817, + "loss": 0.6639, + "step": 21390 + }, + { + "epoch": 1.0628787126254098, + "grad_norm": 0.1201171875, + "learning_rate": 0.0007149736763683323, + "loss": 0.5871, + "step": 21400 + }, + { + "epoch": 1.063375384921029, + "grad_norm": 0.1318359375, + "learning_rate": 0.0007149339425846827, + "loss": 0.6388, + "step": 21410 + }, + { + "epoch": 1.0638720572166485, + "grad_norm": 0.09814453125, + "learning_rate": 0.0007148942088010331, + "loss": 0.6103, + "step": 21420 + }, + { + "epoch": 1.064368729512268, + "grad_norm": 0.1591796875, + "learning_rate": 0.0007148544750173835, + "loss": 0.6276, + "step": 21430 + }, + { + "epoch": 1.0648654018078871, + "grad_norm": 0.109375, + "learning_rate": 0.000714814741233734, + "loss": 0.6362, + "step": 21440 + }, + { + "epoch": 1.0653620741035066, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007147750074500845, + "loss": 0.6413, + "step": 21450 + }, + { + "epoch": 1.0658587463991258, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007147352736664349, + "loss": 0.5809, + "step": 21460 + }, + { + "epoch": 1.0663554186947453, + "grad_norm": 0.1015625, + "learning_rate": 0.0007146955398827854, + "loss": 0.6338, + "step": 21470 + }, + { + "epoch": 1.0668520909903645, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007146558060991358, + "loss": 0.6367, + "step": 21480 + }, + { + "epoch": 1.067348763285984, + "grad_norm": 0.109375, + "learning_rate": 0.0007146160723154862, + "loss": 0.6383, + "step": 21490 + }, + { + "epoch": 1.0678454355816032, + "grad_norm": 0.10693359375, + "learning_rate": 0.0007145763385318368, + "loss": 0.6074, + "step": 21500 + }, + { + "epoch": 1.0683421078772226, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007145366047481872, + "loss": 0.606, + "step": 21510 + }, + { + "epoch": 1.068838780172842, + "grad_norm": 0.15234375, + "learning_rate": 0.0007144968709645376, + "loss": 0.641, + "step": 21520 + }, + { + "epoch": 1.0693354524684613, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007144571371808881, + "loss": 0.6228, + "step": 21530 + }, + { + "epoch": 1.0698321247640807, + "grad_norm": 0.158203125, + "learning_rate": 0.0007144174033972385, + "loss": 0.6205, + "step": 21540 + }, + { + "epoch": 1.0703287970597, + "grad_norm": 0.10498046875, + "learning_rate": 0.000714377669613589, + "loss": 0.6452, + "step": 21550 + }, + { + "epoch": 1.0708254693553194, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007143379358299395, + "loss": 0.638, + "step": 21560 + }, + { + "epoch": 1.0713221416509386, + "grad_norm": 0.09423828125, + "learning_rate": 0.0007142982020462899, + "loss": 0.6696, + "step": 21570 + }, + { + "epoch": 1.071818813946558, + "grad_norm": 0.09814453125, + "learning_rate": 0.0007142584682626403, + "loss": 0.6129, + "step": 21580 + }, + { + "epoch": 1.0723154862421773, + "grad_norm": 0.11572265625, + "learning_rate": 0.0007142187344789908, + "loss": 0.6459, + "step": 21590 + }, + { + "epoch": 1.0728121585377968, + "grad_norm": 0.123046875, + "learning_rate": 0.0007141790006953413, + "loss": 0.6551, + "step": 21600 + }, + { + "epoch": 1.0733088308334162, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007141392669116917, + "loss": 0.6482, + "step": 21610 + }, + { + "epoch": 1.0738055031290354, + "grad_norm": 0.103515625, + "learning_rate": 0.0007140995331280422, + "loss": 0.5979, + "step": 21620 + }, + { + "epoch": 1.074302175424655, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007140597993443926, + "loss": 0.6278, + "step": 21630 + }, + { + "epoch": 1.0747988477202741, + "grad_norm": 0.10595703125, + "learning_rate": 0.000714020065560743, + "loss": 0.627, + "step": 21640 + }, + { + "epoch": 1.0752955200158936, + "grad_norm": 0.09375, + "learning_rate": 0.0007139803317770934, + "loss": 0.6193, + "step": 21650 + }, + { + "epoch": 1.0757921923115128, + "grad_norm": 0.1279296875, + "learning_rate": 0.000713940597993444, + "loss": 0.6542, + "step": 21660 + }, + { + "epoch": 1.0762888646071322, + "grad_norm": 0.107421875, + "learning_rate": 0.0007139008642097945, + "loss": 0.6136, + "step": 21670 + }, + { + "epoch": 1.0767855369027515, + "grad_norm": 0.10693359375, + "learning_rate": 0.0007138611304261448, + "loss": 0.6423, + "step": 21680 + }, + { + "epoch": 1.077282209198371, + "grad_norm": 0.115234375, + "learning_rate": 0.0007138213966424953, + "loss": 0.6457, + "step": 21690 + }, + { + "epoch": 1.0777788814939901, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007137816628588457, + "loss": 0.6406, + "step": 21700 + }, + { + "epoch": 1.0782755537896096, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007137419290751962, + "loss": 0.6145, + "step": 21710 + }, + { + "epoch": 1.078772226085229, + "grad_norm": 0.1552734375, + "learning_rate": 0.0007137021952915467, + "loss": 0.6083, + "step": 21720 + }, + { + "epoch": 1.0792688983808483, + "grad_norm": 0.12255859375, + "learning_rate": 0.0007136624615078971, + "loss": 0.6084, + "step": 21730 + }, + { + "epoch": 1.0797655706764677, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007136227277242475, + "loss": 0.6413, + "step": 21740 + }, + { + "epoch": 1.080262242972087, + "grad_norm": 0.09814453125, + "learning_rate": 0.000713582993940598, + "loss": 0.6146, + "step": 21750 + }, + { + "epoch": 1.0807589152677064, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007135432601569485, + "loss": 0.6419, + "step": 21760 + }, + { + "epoch": 1.0812555875633256, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007135035263732989, + "loss": 0.6509, + "step": 21770 + }, + { + "epoch": 1.081752259858945, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007134637925896494, + "loss": 0.6484, + "step": 21780 + }, + { + "epoch": 1.0822489321545645, + "grad_norm": 0.107421875, + "learning_rate": 0.0007134240588059998, + "loss": 0.6472, + "step": 21790 + }, + { + "epoch": 1.0827456044501838, + "grad_norm": 0.134765625, + "learning_rate": 0.0007133843250223504, + "loss": 0.6434, + "step": 21800 + }, + { + "epoch": 1.0832422767458032, + "grad_norm": 0.14453125, + "learning_rate": 0.0007133445912387008, + "loss": 0.6067, + "step": 21810 + }, + { + "epoch": 1.0837389490414224, + "grad_norm": 0.1328125, + "learning_rate": 0.0007133048574550512, + "loss": 0.6255, + "step": 21820 + }, + { + "epoch": 1.0842356213370419, + "grad_norm": 0.09521484375, + "learning_rate": 0.0007132651236714017, + "loss": 0.6374, + "step": 21830 + }, + { + "epoch": 1.084732293632661, + "grad_norm": 0.087890625, + "learning_rate": 0.000713225389887752, + "loss": 0.6203, + "step": 21840 + }, + { + "epoch": 1.0852289659282806, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007131856561041025, + "loss": 0.6495, + "step": 21850 + }, + { + "epoch": 1.0857256382238998, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007131459223204531, + "loss": 0.6453, + "step": 21860 + }, + { + "epoch": 1.0862223105195192, + "grad_norm": 0.09765625, + "learning_rate": 0.0007131061885368034, + "loss": 0.5869, + "step": 21870 + }, + { + "epoch": 1.0867189828151385, + "grad_norm": 0.09326171875, + "learning_rate": 0.0007130664547531539, + "loss": 0.6145, + "step": 21880 + }, + { + "epoch": 1.087215655110758, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007130267209695044, + "loss": 0.6281, + "step": 21890 + }, + { + "epoch": 1.0877123274063774, + "grad_norm": 0.123046875, + "learning_rate": 0.0007129869871858547, + "loss": 0.6336, + "step": 21900 + }, + { + "epoch": 1.0882089997019966, + "grad_norm": 0.087890625, + "learning_rate": 0.0007129472534022053, + "loss": 0.6367, + "step": 21910 + }, + { + "epoch": 1.088705671997616, + "grad_norm": 0.103515625, + "learning_rate": 0.0007129075196185557, + "loss": 0.6467, + "step": 21920 + }, + { + "epoch": 1.0892023442932353, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007128677858349061, + "loss": 0.633, + "step": 21930 + }, + { + "epoch": 1.0896990165888547, + "grad_norm": 0.10107421875, + "learning_rate": 0.0007128280520512566, + "loss": 0.6505, + "step": 21940 + }, + { + "epoch": 1.090195688884474, + "grad_norm": 0.115234375, + "learning_rate": 0.000712788318267607, + "loss": 0.6666, + "step": 21950 + }, + { + "epoch": 1.0906923611800934, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007127485844839576, + "loss": 0.6298, + "step": 21960 + }, + { + "epoch": 1.0911890334757128, + "grad_norm": 0.11669921875, + "learning_rate": 0.000712708850700308, + "loss": 0.6172, + "step": 21970 + }, + { + "epoch": 1.091685705771332, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007126691169166584, + "loss": 0.6471, + "step": 21980 + }, + { + "epoch": 1.0921823780669515, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007126293831330089, + "loss": 0.649, + "step": 21990 + }, + { + "epoch": 1.0926790503625707, + "grad_norm": 0.11328125, + "learning_rate": 0.0007125896493493593, + "loss": 0.5827, + "step": 22000 + }, + { + "epoch": 1.0931757226581902, + "grad_norm": 0.1416015625, + "learning_rate": 0.0007125499155657098, + "loss": 0.6342, + "step": 22010 + }, + { + "epoch": 1.0936723949538094, + "grad_norm": 0.107421875, + "learning_rate": 0.0007125101817820603, + "loss": 0.6023, + "step": 22020 + }, + { + "epoch": 1.0941690672494289, + "grad_norm": 0.158203125, + "learning_rate": 0.0007124704479984106, + "loss": 0.6409, + "step": 22030 + }, + { + "epoch": 1.094665739545048, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007124307142147611, + "loss": 0.6335, + "step": 22040 + }, + { + "epoch": 1.0951624118406675, + "grad_norm": 0.1328125, + "learning_rate": 0.0007123909804311117, + "loss": 0.6375, + "step": 22050 + }, + { + "epoch": 1.0956590841362868, + "grad_norm": 0.1240234375, + "learning_rate": 0.000712351246647462, + "loss": 0.6516, + "step": 22060 + }, + { + "epoch": 1.0961557564319062, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007123115128638125, + "loss": 0.6722, + "step": 22070 + }, + { + "epoch": 1.0966524287275257, + "grad_norm": 0.11669921875, + "learning_rate": 0.000712271779080163, + "loss": 0.6483, + "step": 22080 + }, + { + "epoch": 1.097149101023145, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007122320452965133, + "loss": 0.6566, + "step": 22090 + }, + { + "epoch": 1.0976457733187643, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007121923115128638, + "loss": 0.6415, + "step": 22100 + }, + { + "epoch": 1.0981424456143836, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007121525777292143, + "loss": 0.6417, + "step": 22110 + }, + { + "epoch": 1.098639117910003, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007121128439455648, + "loss": 0.6333, + "step": 22120 + }, + { + "epoch": 1.0991357902056222, + "grad_norm": 0.10107421875, + "learning_rate": 0.0007120731101619152, + "loss": 0.6129, + "step": 22130 + }, + { + "epoch": 1.0996324625012417, + "grad_norm": 0.09326171875, + "learning_rate": 0.0007120333763782656, + "loss": 0.5927, + "step": 22140 + }, + { + "epoch": 1.1001291347968611, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007119936425946161, + "loss": 0.627, + "step": 22150 + }, + { + "epoch": 1.1006258070924804, + "grad_norm": 0.171875, + "learning_rate": 0.0007119539088109666, + "loss": 0.5961, + "step": 22160 + }, + { + "epoch": 1.1011224793880998, + "grad_norm": 0.1396484375, + "learning_rate": 0.000711914175027317, + "loss": 0.6226, + "step": 22170 + }, + { + "epoch": 1.101619151683719, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007118744412436675, + "loss": 0.6391, + "step": 22180 + }, + { + "epoch": 1.1021158239793385, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007118347074600179, + "loss": 0.6212, + "step": 22190 + }, + { + "epoch": 1.1026124962749577, + "grad_norm": 0.1396484375, + "learning_rate": 0.0007117949736763683, + "loss": 0.6268, + "step": 22200 + }, + { + "epoch": 1.1031091685705772, + "grad_norm": 0.12060546875, + "learning_rate": 0.0007117552398927189, + "loss": 0.6589, + "step": 22210 + }, + { + "epoch": 1.1036058408661964, + "grad_norm": 0.09326171875, + "learning_rate": 0.0007117155061090692, + "loss": 0.6267, + "step": 22220 + }, + { + "epoch": 1.1041025131618158, + "grad_norm": 0.103515625, + "learning_rate": 0.0007116757723254197, + "loss": 0.613, + "step": 22230 + }, + { + "epoch": 1.104599185457435, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007116360385417702, + "loss": 0.6083, + "step": 22240 + }, + { + "epoch": 1.1050958577530545, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007115963047581206, + "loss": 0.613, + "step": 22250 + }, + { + "epoch": 1.105592530048674, + "grad_norm": 0.1337890625, + "learning_rate": 0.0007115565709744711, + "loss": 0.5983, + "step": 22260 + }, + { + "epoch": 1.1060892023442932, + "grad_norm": 0.1494140625, + "learning_rate": 0.0007115168371908216, + "loss": 0.6171, + "step": 22270 + }, + { + "epoch": 1.1065858746399126, + "grad_norm": 0.11865234375, + "learning_rate": 0.000711477103407172, + "loss": 0.6404, + "step": 22280 + }, + { + "epoch": 1.1070825469355319, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007114373696235224, + "loss": 0.6187, + "step": 22290 + }, + { + "epoch": 1.1075792192311513, + "grad_norm": 0.09716796875, + "learning_rate": 0.0007113976358398728, + "loss": 0.6012, + "step": 22300 + }, + { + "epoch": 1.1080758915267706, + "grad_norm": 0.09423828125, + "learning_rate": 0.0007113579020562234, + "loss": 0.6123, + "step": 22310 + }, + { + "epoch": 1.10857256382239, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007113181682725738, + "loss": 0.6281, + "step": 22320 + }, + { + "epoch": 1.1090692361180094, + "grad_norm": 0.0947265625, + "learning_rate": 0.0007112784344889242, + "loss": 0.6013, + "step": 22330 + }, + { + "epoch": 1.1095659084136287, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007112387007052747, + "loss": 0.6682, + "step": 22340 + }, + { + "epoch": 1.1100625807092481, + "grad_norm": 0.09521484375, + "learning_rate": 0.0007111989669216251, + "loss": 0.6218, + "step": 22350 + }, + { + "epoch": 1.1105592530048674, + "grad_norm": 0.1328125, + "learning_rate": 0.0007111592331379756, + "loss": 0.6293, + "step": 22360 + }, + { + "epoch": 1.1110559253004868, + "grad_norm": 0.12158203125, + "learning_rate": 0.0007111194993543261, + "loss": 0.6096, + "step": 22370 + }, + { + "epoch": 1.111552597596106, + "grad_norm": 0.107421875, + "learning_rate": 0.0007110797655706765, + "loss": 0.628, + "step": 22380 + }, + { + "epoch": 1.1120492698917255, + "grad_norm": 0.103515625, + "learning_rate": 0.0007110400317870269, + "loss": 0.6113, + "step": 22390 + }, + { + "epoch": 1.1125459421873447, + "grad_norm": 0.1396484375, + "learning_rate": 0.0007110002980033774, + "loss": 0.5945, + "step": 22400 + }, + { + "epoch": 1.1130426144829642, + "grad_norm": 0.1533203125, + "learning_rate": 0.0007109605642197279, + "loss": 0.6667, + "step": 22410 + }, + { + "epoch": 1.1135392867785834, + "grad_norm": 0.13671875, + "learning_rate": 0.0007109208304360783, + "loss": 0.6661, + "step": 22420 + }, + { + "epoch": 1.1140359590742028, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007108810966524288, + "loss": 0.6505, + "step": 22430 + }, + { + "epoch": 1.1145326313698223, + "grad_norm": 0.1240234375, + "learning_rate": 0.0007108413628687792, + "loss": 0.6354, + "step": 22440 + }, + { + "epoch": 1.1150293036654415, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007108016290851296, + "loss": 0.6554, + "step": 22450 + }, + { + "epoch": 1.115525975961061, + "grad_norm": 0.1416015625, + "learning_rate": 0.0007107618953014802, + "loss": 0.6206, + "step": 22460 + }, + { + "epoch": 1.1160226482566802, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007107221615178306, + "loss": 0.6328, + "step": 22470 + }, + { + "epoch": 1.1165193205522996, + "grad_norm": 0.1005859375, + "learning_rate": 0.000710682427734181, + "loss": 0.5999, + "step": 22480 + }, + { + "epoch": 1.1170159928479189, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007106426939505314, + "loss": 0.6318, + "step": 22490 + }, + { + "epoch": 1.1175126651435383, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007106029601668819, + "loss": 0.6513, + "step": 22500 + }, + { + "epoch": 1.1180093374391578, + "grad_norm": 0.111328125, + "learning_rate": 0.0007105632263832324, + "loss": 0.6258, + "step": 22510 + }, + { + "epoch": 1.118506009734777, + "grad_norm": 0.103515625, + "learning_rate": 0.0007105234925995828, + "loss": 0.6303, + "step": 22520 + }, + { + "epoch": 1.1190026820303964, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007104837588159333, + "loss": 0.709, + "step": 22530 + }, + { + "epoch": 1.1194993543260157, + "grad_norm": 0.11328125, + "learning_rate": 0.0007104440250322838, + "loss": 0.6219, + "step": 22540 + }, + { + "epoch": 1.119996026621635, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007104042912486341, + "loss": 0.6181, + "step": 22550 + }, + { + "epoch": 1.1204926989172543, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007103645574649847, + "loss": 0.621, + "step": 22560 + }, + { + "epoch": 1.1209893712128738, + "grad_norm": 0.1015625, + "learning_rate": 0.0007103248236813351, + "loss": 0.619, + "step": 22570 + }, + { + "epoch": 1.121486043508493, + "grad_norm": 0.10107421875, + "learning_rate": 0.0007102850898976855, + "loss": 0.619, + "step": 22580 + }, + { + "epoch": 1.1219827158041125, + "grad_norm": 0.11181640625, + "learning_rate": 0.000710245356114036, + "loss": 0.6185, + "step": 22590 + }, + { + "epoch": 1.1224793880997317, + "grad_norm": 0.1015625, + "learning_rate": 0.0007102056223303864, + "loss": 0.6314, + "step": 22600 + }, + { + "epoch": 1.1229760603953511, + "grad_norm": 0.1396484375, + "learning_rate": 0.0007101658885467369, + "loss": 0.6068, + "step": 22610 + }, + { + "epoch": 1.1234727326909706, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007101261547630874, + "loss": 0.6185, + "step": 22620 + }, + { + "epoch": 1.1239694049865898, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007100864209794378, + "loss": 0.6095, + "step": 22630 + }, + { + "epoch": 1.1244660772822093, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007100466871957882, + "loss": 0.6314, + "step": 22640 + }, + { + "epoch": 1.1249627495778285, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007100069534121387, + "loss": 0.6239, + "step": 22650 + }, + { + "epoch": 1.125459421873448, + "grad_norm": 0.1015625, + "learning_rate": 0.0007099672196284892, + "loss": 0.6411, + "step": 22660 + }, + { + "epoch": 1.1259560941690672, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007099274858448396, + "loss": 0.6537, + "step": 22670 + }, + { + "epoch": 1.1264527664646866, + "grad_norm": 0.09765625, + "learning_rate": 0.0007098877520611901, + "loss": 0.6395, + "step": 22680 + }, + { + "epoch": 1.126949438760306, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007098480182775405, + "loss": 0.6465, + "step": 22690 + }, + { + "epoch": 1.1274461110559253, + "grad_norm": 0.123046875, + "learning_rate": 0.000709808284493891, + "loss": 0.5975, + "step": 22700 + }, + { + "epoch": 1.1279427833515447, + "grad_norm": 0.10107421875, + "learning_rate": 0.0007097685507102414, + "loss": 0.6149, + "step": 22710 + }, + { + "epoch": 1.128439455647164, + "grad_norm": 0.11279296875, + "learning_rate": 0.0007097288169265919, + "loss": 0.625, + "step": 22720 + }, + { + "epoch": 1.1289361279427834, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007096890831429424, + "loss": 0.6185, + "step": 22730 + }, + { + "epoch": 1.1294328002384026, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007096493493592927, + "loss": 0.6329, + "step": 22740 + }, + { + "epoch": 1.129929472534022, + "grad_norm": 0.1689453125, + "learning_rate": 0.0007096096155756432, + "loss": 0.6179, + "step": 22750 + }, + { + "epoch": 1.1304261448296413, + "grad_norm": 0.1484375, + "learning_rate": 0.0007095698817919937, + "loss": 0.6177, + "step": 22760 + }, + { + "epoch": 1.1309228171252608, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007095301480083441, + "loss": 0.6089, + "step": 22770 + }, + { + "epoch": 1.13141948942088, + "grad_norm": 0.103515625, + "learning_rate": 0.0007094904142246946, + "loss": 0.6127, + "step": 22780 + }, + { + "epoch": 1.1319161617164994, + "grad_norm": 0.1171875, + "learning_rate": 0.000709450680441045, + "loss": 0.6463, + "step": 22790 + }, + { + "epoch": 1.132412834012119, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007094109466573954, + "loss": 0.6197, + "step": 22800 + }, + { + "epoch": 1.1329095063077381, + "grad_norm": 0.09375, + "learning_rate": 0.000709371212873746, + "loss": 0.6144, + "step": 22810 + }, + { + "epoch": 1.1334061786033576, + "grad_norm": 0.134765625, + "learning_rate": 0.0007093314790900964, + "loss": 0.6395, + "step": 22820 + }, + { + "epoch": 1.1339028508989768, + "grad_norm": 0.09716796875, + "learning_rate": 0.0007092917453064468, + "loss": 0.6231, + "step": 22830 + }, + { + "epoch": 1.1343995231945962, + "grad_norm": 0.1328125, + "learning_rate": 0.0007092520115227973, + "loss": 0.6377, + "step": 22840 + }, + { + "epoch": 1.1348961954902155, + "grad_norm": 0.1513671875, + "learning_rate": 0.0007092122777391477, + "loss": 0.6589, + "step": 22850 + }, + { + "epoch": 1.135392867785835, + "grad_norm": 0.10546875, + "learning_rate": 0.0007091725439554983, + "loss": 0.6392, + "step": 22860 + }, + { + "epoch": 1.1358895400814544, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007091328101718487, + "loss": 0.6901, + "step": 22870 + }, + { + "epoch": 1.1363862123770736, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007090930763881991, + "loss": 0.6257, + "step": 22880 + }, + { + "epoch": 1.136882884672693, + "grad_norm": 0.130859375, + "learning_rate": 0.0007090533426045496, + "loss": 0.609, + "step": 22890 + }, + { + "epoch": 1.1373795569683123, + "grad_norm": 0.1484375, + "learning_rate": 0.0007090136088208999, + "loss": 0.6255, + "step": 22900 + }, + { + "epoch": 1.1378762292639317, + "grad_norm": 0.123046875, + "learning_rate": 0.0007089738750372505, + "loss": 0.6209, + "step": 22910 + }, + { + "epoch": 1.138372901559551, + "grad_norm": 0.1416015625, + "learning_rate": 0.000708934141253601, + "loss": 0.6024, + "step": 22920 + }, + { + "epoch": 1.1388695738551704, + "grad_norm": 0.10693359375, + "learning_rate": 0.0007088944074699513, + "loss": 0.6234, + "step": 22930 + }, + { + "epoch": 1.1393662461507896, + "grad_norm": 0.09765625, + "learning_rate": 0.0007088546736863018, + "loss": 0.635, + "step": 22940 + }, + { + "epoch": 1.139862918446409, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007088149399026523, + "loss": 0.6387, + "step": 22950 + }, + { + "epoch": 1.1403595907420283, + "grad_norm": 0.1015625, + "learning_rate": 0.0007087752061190026, + "loss": 0.6197, + "step": 22960 + }, + { + "epoch": 1.1408562630376478, + "grad_norm": 0.16015625, + "learning_rate": 0.0007087354723353532, + "loss": 0.6047, + "step": 22970 + }, + { + "epoch": 1.1413529353332672, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007086957385517036, + "loss": 0.6384, + "step": 22980 + }, + { + "epoch": 1.1418496076288864, + "grad_norm": 0.169921875, + "learning_rate": 0.0007086560047680541, + "loss": 0.6246, + "step": 22990 + }, + { + "epoch": 1.1423462799245059, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007086162709844045, + "loss": 0.6167, + "step": 23000 + }, + { + "epoch": 1.142842952220125, + "grad_norm": 0.1572265625, + "learning_rate": 0.000708576537200755, + "loss": 0.6267, + "step": 23010 + }, + { + "epoch": 1.1433396245157446, + "grad_norm": 0.1435546875, + "learning_rate": 0.0007085368034171055, + "loss": 0.628, + "step": 23020 + }, + { + "epoch": 1.1438362968113638, + "grad_norm": 0.130859375, + "learning_rate": 0.0007084970696334559, + "loss": 0.6131, + "step": 23030 + }, + { + "epoch": 1.1443329691069832, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007084573358498063, + "loss": 0.6425, + "step": 23040 + }, + { + "epoch": 1.1448296414026027, + "grad_norm": 0.10693359375, + "learning_rate": 0.0007084176020661568, + "loss": 0.6234, + "step": 23050 + }, + { + "epoch": 1.145326313698222, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007083778682825073, + "loss": 0.6404, + "step": 23060 + }, + { + "epoch": 1.1458229859938414, + "grad_norm": 0.11865234375, + "learning_rate": 0.0007083381344988577, + "loss": 0.617, + "step": 23070 + }, + { + "epoch": 1.1463196582894606, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007082984007152082, + "loss": 0.5922, + "step": 23080 + }, + { + "epoch": 1.14681633058508, + "grad_norm": 0.138671875, + "learning_rate": 0.0007082586669315585, + "loss": 0.618, + "step": 23090 + }, + { + "epoch": 1.1473130028806993, + "grad_norm": 0.12060546875, + "learning_rate": 0.000708218933147909, + "loss": 0.613, + "step": 23100 + }, + { + "epoch": 1.1478096751763187, + "grad_norm": 0.10693359375, + "learning_rate": 0.0007081791993642596, + "loss": 0.6369, + "step": 23110 + }, + { + "epoch": 1.148306347471938, + "grad_norm": 0.0927734375, + "learning_rate": 0.0007081394655806099, + "loss": 0.6305, + "step": 23120 + }, + { + "epoch": 1.1488030197675574, + "grad_norm": 0.12890625, + "learning_rate": 0.0007080997317969604, + "loss": 0.6211, + "step": 23130 + }, + { + "epoch": 1.1492996920631766, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007080599980133109, + "loss": 0.6008, + "step": 23140 + }, + { + "epoch": 1.149796364358796, + "grad_norm": 0.126953125, + "learning_rate": 0.0007080202642296613, + "loss": 0.6009, + "step": 23150 + }, + { + "epoch": 1.1502930366544155, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007079805304460117, + "loss": 0.6652, + "step": 23160 + }, + { + "epoch": 1.1507897089500347, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007079407966623622, + "loss": 0.6498, + "step": 23170 + }, + { + "epoch": 1.1512863812456542, + "grad_norm": 0.09716796875, + "learning_rate": 0.0007079010628787127, + "loss": 0.6265, + "step": 23180 + }, + { + "epoch": 1.1517830535412734, + "grad_norm": 0.10546875, + "learning_rate": 0.0007078613290950631, + "loss": 0.6222, + "step": 23190 + }, + { + "epoch": 1.1522797258368929, + "grad_norm": 0.095703125, + "learning_rate": 0.0007078215953114135, + "loss": 0.6105, + "step": 23200 + }, + { + "epoch": 1.152776398132512, + "grad_norm": 0.09619140625, + "learning_rate": 0.000707781861527764, + "loss": 0.6407, + "step": 23210 + }, + { + "epoch": 1.1532730704281315, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007077421277441145, + "loss": 0.6244, + "step": 23220 + }, + { + "epoch": 1.153769742723751, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007077023939604649, + "loss": 0.6351, + "step": 23230 + }, + { + "epoch": 1.1542664150193702, + "grad_norm": 0.107421875, + "learning_rate": 0.0007076626601768154, + "loss": 0.6386, + "step": 23240 + }, + { + "epoch": 1.1547630873149897, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007076229263931658, + "loss": 0.639, + "step": 23250 + }, + { + "epoch": 1.155259759610609, + "grad_norm": 0.091796875, + "learning_rate": 0.0007075831926095162, + "loss": 0.6513, + "step": 23260 + }, + { + "epoch": 1.1557564319062283, + "grad_norm": 0.119140625, + "learning_rate": 0.0007075434588258668, + "loss": 0.6354, + "step": 23270 + }, + { + "epoch": 1.1562531042018476, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007075037250422172, + "loss": 0.6214, + "step": 23280 + }, + { + "epoch": 1.156749776497467, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007074639912585676, + "loss": 0.6353, + "step": 23290 + }, + { + "epoch": 1.1572464487930862, + "grad_norm": 0.0966796875, + "learning_rate": 0.0007074242574749181, + "loss": 0.6349, + "step": 23300 + }, + { + "epoch": 1.1577431210887057, + "grad_norm": 0.103515625, + "learning_rate": 0.0007073845236912685, + "loss": 0.6332, + "step": 23310 + }, + { + "epoch": 1.158239793384325, + "grad_norm": 0.1005859375, + "learning_rate": 0.000707344789907619, + "loss": 0.6123, + "step": 23320 + }, + { + "epoch": 1.1587364656799444, + "grad_norm": 0.0966796875, + "learning_rate": 0.0007073050561239695, + "loss": 0.6024, + "step": 23330 + }, + { + "epoch": 1.1592331379755638, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007072653223403199, + "loss": 0.6049, + "step": 23340 + }, + { + "epoch": 1.159729810271183, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007072255885566703, + "loss": 0.6323, + "step": 23350 + }, + { + "epoch": 1.1602264825668025, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007071858547730207, + "loss": 0.643, + "step": 23360 + }, + { + "epoch": 1.1607231548624217, + "grad_norm": 0.09326171875, + "learning_rate": 0.0007071461209893713, + "loss": 0.6096, + "step": 23370 + }, + { + "epoch": 1.1612198271580412, + "grad_norm": 0.111328125, + "learning_rate": 0.0007071063872057217, + "loss": 0.6407, + "step": 23380 + }, + { + "epoch": 1.1617164994536604, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007070666534220721, + "loss": 0.5905, + "step": 23390 + }, + { + "epoch": 1.1622131717492798, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007070269196384226, + "loss": 0.5927, + "step": 23400 + }, + { + "epoch": 1.1627098440448993, + "grad_norm": 0.1142578125, + "learning_rate": 0.000706987185854773, + "loss": 0.6414, + "step": 23410 + }, + { + "epoch": 1.1632065163405185, + "grad_norm": 0.1259765625, + "learning_rate": 0.0007069474520711235, + "loss": 0.6149, + "step": 23420 + }, + { + "epoch": 1.1637031886361378, + "grad_norm": 0.10546875, + "learning_rate": 0.000706907718287474, + "loss": 0.6216, + "step": 23430 + }, + { + "epoch": 1.1641998609317572, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007068679845038244, + "loss": 0.6228, + "step": 23440 + }, + { + "epoch": 1.1646965332273767, + "grad_norm": 0.1123046875, + "learning_rate": 0.0007068282507201748, + "loss": 0.6317, + "step": 23450 + }, + { + "epoch": 1.1651932055229959, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007067885169365253, + "loss": 0.6172, + "step": 23460 + }, + { + "epoch": 1.1656898778186153, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007067487831528758, + "loss": 0.6362, + "step": 23470 + }, + { + "epoch": 1.1661865501142346, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007067090493692262, + "loss": 0.6011, + "step": 23480 + }, + { + "epoch": 1.166683222409854, + "grad_norm": 0.09326171875, + "learning_rate": 0.0007066693155855767, + "loss": 0.6261, + "step": 23490 + }, + { + "epoch": 1.1671798947054732, + "grad_norm": 0.10205078125, + "learning_rate": 0.0007066295818019271, + "loss": 0.6458, + "step": 23500 + }, + { + "epoch": 1.1676765670010927, + "grad_norm": 0.09765625, + "learning_rate": 0.0007065898480182775, + "loss": 0.6441, + "step": 23510 + }, + { + "epoch": 1.1681732392967121, + "grad_norm": 0.09130859375, + "learning_rate": 0.0007065501142346281, + "loss": 0.602, + "step": 23520 + }, + { + "epoch": 1.1686699115923314, + "grad_norm": 0.107421875, + "learning_rate": 0.0007065103804509785, + "loss": 0.6454, + "step": 23530 + }, + { + "epoch": 1.1691665838879508, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007064706466673289, + "loss": 0.6184, + "step": 23540 + }, + { + "epoch": 1.16966325618357, + "grad_norm": 0.0966796875, + "learning_rate": 0.0007064309128836794, + "loss": 0.6447, + "step": 23550 + }, + { + "epoch": 1.1701599284791895, + "grad_norm": 0.09619140625, + "learning_rate": 0.0007063911791000298, + "loss": 0.6074, + "step": 23560 + }, + { + "epoch": 1.1706566007748087, + "grad_norm": 0.08984375, + "learning_rate": 0.0007063514453163803, + "loss": 0.6108, + "step": 23570 + }, + { + "epoch": 1.1711532730704282, + "grad_norm": 0.1640625, + "learning_rate": 0.0007063117115327307, + "loss": 0.6396, + "step": 23580 + }, + { + "epoch": 1.1716499453660474, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007062719777490812, + "loss": 0.6014, + "step": 23590 + }, + { + "epoch": 1.1721466176616668, + "grad_norm": 0.12109375, + "learning_rate": 0.0007062322439654317, + "loss": 0.6143, + "step": 23600 + }, + { + "epoch": 1.172643289957286, + "grad_norm": 0.10205078125, + "learning_rate": 0.000706192510181782, + "loss": 0.5961, + "step": 23610 + }, + { + "epoch": 1.1731399622529055, + "grad_norm": 0.0966796875, + "learning_rate": 0.0007061527763981326, + "loss": 0.6356, + "step": 23620 + }, + { + "epoch": 1.173636634548525, + "grad_norm": 0.130859375, + "learning_rate": 0.000706113042614483, + "loss": 0.6159, + "step": 23630 + }, + { + "epoch": 1.1741333068441442, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007060733088308334, + "loss": 0.6085, + "step": 23640 + }, + { + "epoch": 1.1746299791397636, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007060335750471839, + "loss": 0.5906, + "step": 23650 + }, + { + "epoch": 1.1751266514353829, + "grad_norm": 0.099609375, + "learning_rate": 0.0007059938412635343, + "loss": 0.6355, + "step": 23660 + }, + { + "epoch": 1.1756233237310023, + "grad_norm": 0.09814453125, + "learning_rate": 0.0007059541074798848, + "loss": 0.6162, + "step": 23670 + }, + { + "epoch": 1.1761199960266215, + "grad_norm": 0.1376953125, + "learning_rate": 0.0007059143736962353, + "loss": 0.616, + "step": 23680 + }, + { + "epoch": 1.176616668322241, + "grad_norm": 0.1416015625, + "learning_rate": 0.0007058746399125857, + "loss": 0.5947, + "step": 23690 + }, + { + "epoch": 1.1771133406178604, + "grad_norm": 0.0947265625, + "learning_rate": 0.0007058349061289361, + "loss": 0.6463, + "step": 23700 + }, + { + "epoch": 1.1776100129134797, + "grad_norm": 0.12451171875, + "learning_rate": 0.0007057951723452866, + "loss": 0.6078, + "step": 23710 + }, + { + "epoch": 1.1781066852090991, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007057554385616371, + "loss": 0.6306, + "step": 23720 + }, + { + "epoch": 1.1786033575047183, + "grad_norm": 0.13671875, + "learning_rate": 0.0007057157047779875, + "loss": 0.6252, + "step": 23730 + }, + { + "epoch": 1.1791000298003378, + "grad_norm": 0.11328125, + "learning_rate": 0.000705675970994338, + "loss": 0.6268, + "step": 23740 + }, + { + "epoch": 1.179596702095957, + "grad_norm": 0.11328125, + "learning_rate": 0.0007056362372106884, + "loss": 0.6547, + "step": 23750 + }, + { + "epoch": 1.1800933743915765, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007055965034270389, + "loss": 0.5791, + "step": 23760 + }, + { + "epoch": 1.1805900466871957, + "grad_norm": 0.1474609375, + "learning_rate": 0.0007055567696433893, + "loss": 0.6184, + "step": 23770 + }, + { + "epoch": 1.1810867189828151, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007055170358597398, + "loss": 0.6027, + "step": 23780 + }, + { + "epoch": 1.1815833912784344, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007054773020760903, + "loss": 0.6148, + "step": 23790 + }, + { + "epoch": 1.1820800635740538, + "grad_norm": 0.1337890625, + "learning_rate": 0.0007054375682924406, + "loss": 0.6185, + "step": 23800 + }, + { + "epoch": 1.1825767358696733, + "grad_norm": 0.10546875, + "learning_rate": 0.0007053978345087911, + "loss": 0.6371, + "step": 23810 + }, + { + "epoch": 1.1830734081652925, + "grad_norm": 0.138671875, + "learning_rate": 0.0007053581007251417, + "loss": 0.6398, + "step": 23820 + }, + { + "epoch": 1.183570080460912, + "grad_norm": 0.1171875, + "learning_rate": 0.000705318366941492, + "loss": 0.6341, + "step": 23830 + }, + { + "epoch": 1.1840667527565312, + "grad_norm": 0.11962890625, + "learning_rate": 0.0007052786331578425, + "loss": 0.6218, + "step": 23840 + }, + { + "epoch": 1.1845634250521506, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007052388993741929, + "loss": 0.6058, + "step": 23850 + }, + { + "epoch": 1.1850600973477698, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007051991655905433, + "loss": 0.6475, + "step": 23860 + }, + { + "epoch": 1.1855567696433893, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007051594318068939, + "loss": 0.6322, + "step": 23870 + }, + { + "epoch": 1.1860534419390087, + "grad_norm": 0.099609375, + "learning_rate": 0.0007051196980232443, + "loss": 0.5993, + "step": 23880 + }, + { + "epoch": 1.186550114234628, + "grad_norm": 0.10546875, + "learning_rate": 0.0007050799642395948, + "loss": 0.6268, + "step": 23890 + }, + { + "epoch": 1.1870467865302474, + "grad_norm": 0.11767578125, + "learning_rate": 0.0007050402304559452, + "loss": 0.5902, + "step": 23900 + }, + { + "epoch": 1.1875434588258666, + "grad_norm": 0.0947265625, + "learning_rate": 0.0007050004966722956, + "loss": 0.6241, + "step": 23910 + }, + { + "epoch": 1.188040131121486, + "grad_norm": 0.11376953125, + "learning_rate": 0.0007049607628886462, + "loss": 0.6016, + "step": 23920 + }, + { + "epoch": 1.1885368034171053, + "grad_norm": 0.1142578125, + "learning_rate": 0.0007049210291049966, + "loss": 0.6004, + "step": 23930 + }, + { + "epoch": 1.1890334757127248, + "grad_norm": 0.1025390625, + "learning_rate": 0.000704881295321347, + "loss": 0.6316, + "step": 23940 + }, + { + "epoch": 1.189530148008344, + "grad_norm": 0.103515625, + "learning_rate": 0.0007048415615376975, + "loss": 0.6467, + "step": 23950 + }, + { + "epoch": 1.1900268203039635, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007048018277540478, + "loss": 0.6248, + "step": 23960 + }, + { + "epoch": 1.1905234925995827, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007047620939703984, + "loss": 0.6385, + "step": 23970 + }, + { + "epoch": 1.1910201648952021, + "grad_norm": 0.2041015625, + "learning_rate": 0.0007047223601867489, + "loss": 0.6219, + "step": 23980 + }, + { + "epoch": 1.1915168371908216, + "grad_norm": 0.1484375, + "learning_rate": 0.0007046826264030992, + "loss": 0.6193, + "step": 23990 + }, + { + "epoch": 1.1920135094864408, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007046428926194497, + "loss": 0.6297, + "step": 24000 + }, + { + "epoch": 1.1925101817820603, + "grad_norm": 0.09521484375, + "learning_rate": 0.0007046031588358002, + "loss": 0.6158, + "step": 24010 + }, + { + "epoch": 1.1930068540776795, + "grad_norm": 0.12890625, + "learning_rate": 0.0007045634250521506, + "loss": 0.6023, + "step": 24020 + }, + { + "epoch": 1.193503526373299, + "grad_norm": 0.115234375, + "learning_rate": 0.0007045236912685011, + "loss": 0.6306, + "step": 24030 + }, + { + "epoch": 1.1940001986689182, + "grad_norm": 0.1162109375, + "learning_rate": 0.0007044839574848515, + "loss": 0.6125, + "step": 24040 + }, + { + "epoch": 1.1944968709645376, + "grad_norm": 0.09716796875, + "learning_rate": 0.000704444223701202, + "loss": 0.5975, + "step": 24050 + }, + { + "epoch": 1.194993543260157, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007044044899175524, + "loss": 0.6436, + "step": 24060 + }, + { + "epoch": 1.1954902155557763, + "grad_norm": 0.11328125, + "learning_rate": 0.0007043647561339029, + "loss": 0.6371, + "step": 24070 + }, + { + "epoch": 1.1959868878513957, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007043250223502534, + "loss": 0.6063, + "step": 24080 + }, + { + "epoch": 1.196483560147015, + "grad_norm": 0.107421875, + "learning_rate": 0.0007042852885666038, + "loss": 0.6436, + "step": 24090 + }, + { + "epoch": 1.1969802324426344, + "grad_norm": 0.11181640625, + "learning_rate": 0.0007042455547829542, + "loss": 0.6478, + "step": 24100 + }, + { + "epoch": 1.1974769047382536, + "grad_norm": 0.099609375, + "learning_rate": 0.0007042058209993047, + "loss": 0.6237, + "step": 24110 + }, + { + "epoch": 1.197973577033873, + "grad_norm": 0.11328125, + "learning_rate": 0.0007041660872156552, + "loss": 0.5966, + "step": 24120 + }, + { + "epoch": 1.1984702493294923, + "grad_norm": 0.1845703125, + "learning_rate": 0.0007041263534320056, + "loss": 0.6514, + "step": 24130 + }, + { + "epoch": 1.1989669216251118, + "grad_norm": 0.1240234375, + "learning_rate": 0.0007040866196483561, + "loss": 0.6267, + "step": 24140 + }, + { + "epoch": 1.199463593920731, + "grad_norm": 0.12060546875, + "learning_rate": 0.0007040468858647065, + "loss": 0.6382, + "step": 24150 + }, + { + "epoch": 1.1999602662163504, + "grad_norm": 0.10546875, + "learning_rate": 0.0007040071520810569, + "loss": 0.6262, + "step": 24160 + }, + { + "epoch": 1.2004569385119699, + "grad_norm": 0.10546875, + "learning_rate": 0.0007039674182974075, + "loss": 0.6151, + "step": 24170 + }, + { + "epoch": 1.2009536108075891, + "grad_norm": 0.109375, + "learning_rate": 0.0007039276845137578, + "loss": 0.6319, + "step": 24180 + }, + { + "epoch": 1.2014502831032086, + "grad_norm": 0.12060546875, + "learning_rate": 0.0007038879507301083, + "loss": 0.6521, + "step": 24190 + }, + { + "epoch": 1.2019469553988278, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007038482169464588, + "loss": 0.5974, + "step": 24200 + }, + { + "epoch": 1.2024436276944472, + "grad_norm": 0.11572265625, + "learning_rate": 0.0007038084831628092, + "loss": 0.655, + "step": 24210 + }, + { + "epoch": 1.2029402999900665, + "grad_norm": 0.130859375, + "learning_rate": 0.0007037687493791597, + "loss": 0.6258, + "step": 24220 + }, + { + "epoch": 1.203436972285686, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007037290155955101, + "loss": 0.6048, + "step": 24230 + }, + { + "epoch": 1.2039336445813054, + "grad_norm": 0.13671875, + "learning_rate": 0.0007036892818118606, + "loss": 0.6375, + "step": 24240 + }, + { + "epoch": 1.2044303168769246, + "grad_norm": 0.10498046875, + "learning_rate": 0.000703649548028211, + "loss": 0.6155, + "step": 24250 + }, + { + "epoch": 1.204926989172544, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007036098142445614, + "loss": 0.595, + "step": 24260 + }, + { + "epoch": 1.2054236614681633, + "grad_norm": 0.099609375, + "learning_rate": 0.000703570080460912, + "loss": 0.605, + "step": 24270 + }, + { + "epoch": 1.2059203337637827, + "grad_norm": 0.126953125, + "learning_rate": 0.0007035303466772624, + "loss": 0.6043, + "step": 24280 + }, + { + "epoch": 1.206417006059402, + "grad_norm": 0.11474609375, + "learning_rate": 0.0007034906128936128, + "loss": 0.6582, + "step": 24290 + }, + { + "epoch": 1.2069136783550214, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007034508791099633, + "loss": 0.6373, + "step": 24300 + }, + { + "epoch": 1.2074103506506406, + "grad_norm": 0.12451171875, + "learning_rate": 0.0007034111453263137, + "loss": 0.6126, + "step": 24310 + }, + { + "epoch": 1.20790702294626, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007033714115426641, + "loss": 0.6255, + "step": 24320 + }, + { + "epoch": 1.2084036952418793, + "grad_norm": 0.134765625, + "learning_rate": 0.0007033316777590147, + "loss": 0.6219, + "step": 24330 + }, + { + "epoch": 1.2089003675374987, + "grad_norm": 0.1220703125, + "learning_rate": 0.0007032919439753651, + "loss": 0.6258, + "step": 24340 + }, + { + "epoch": 1.2093970398331182, + "grad_norm": 0.087890625, + "learning_rate": 0.0007032522101917155, + "loss": 0.6137, + "step": 24350 + }, + { + "epoch": 1.2098937121287374, + "grad_norm": 0.1328125, + "learning_rate": 0.000703212476408066, + "loss": 0.5958, + "step": 24360 + }, + { + "epoch": 1.2103903844243569, + "grad_norm": 0.111328125, + "learning_rate": 0.0007031727426244165, + "loss": 0.6078, + "step": 24370 + }, + { + "epoch": 1.210887056719976, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007031330088407669, + "loss": 0.6212, + "step": 24380 + }, + { + "epoch": 1.2113837290155955, + "grad_norm": 0.09521484375, + "learning_rate": 0.0007030932750571174, + "loss": 0.5888, + "step": 24390 + }, + { + "epoch": 1.2118804013112148, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007030535412734678, + "loss": 0.611, + "step": 24400 + }, + { + "epoch": 1.2123770736068342, + "grad_norm": 0.109375, + "learning_rate": 0.0007030138074898182, + "loss": 0.5952, + "step": 24410 + }, + { + "epoch": 1.2128737459024537, + "grad_norm": 0.111328125, + "learning_rate": 0.0007029740737061688, + "loss": 0.6439, + "step": 24420 + }, + { + "epoch": 1.213370418198073, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007029343399225192, + "loss": 0.6053, + "step": 24430 + }, + { + "epoch": 1.2138670904936923, + "grad_norm": 0.1181640625, + "learning_rate": 0.0007028946061388696, + "loss": 0.6181, + "step": 24440 + }, + { + "epoch": 1.2143637627893116, + "grad_norm": 0.11865234375, + "learning_rate": 0.00070285487235522, + "loss": 0.6271, + "step": 24450 + }, + { + "epoch": 1.214860435084931, + "grad_norm": 0.1484375, + "learning_rate": 0.0007028151385715705, + "loss": 0.6332, + "step": 24460 + }, + { + "epoch": 1.2153571073805503, + "grad_norm": 0.14453125, + "learning_rate": 0.000702775404787921, + "loss": 0.6302, + "step": 24470 + }, + { + "epoch": 1.2158537796761697, + "grad_norm": 0.103515625, + "learning_rate": 0.0007027356710042714, + "loss": 0.5947, + "step": 24480 + }, + { + "epoch": 1.216350451971789, + "grad_norm": 0.1416015625, + "learning_rate": 0.0007026959372206219, + "loss": 0.645, + "step": 24490 + }, + { + "epoch": 1.2168471242674084, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007026562034369723, + "loss": 0.6371, + "step": 24500 + }, + { + "epoch": 1.2173437965630276, + "grad_norm": 0.09814453125, + "learning_rate": 0.0007026164696533227, + "loss": 0.6097, + "step": 24510 + }, + { + "epoch": 1.217840468858647, + "grad_norm": 0.1455078125, + "learning_rate": 0.0007025767358696733, + "loss": 0.6174, + "step": 24520 + }, + { + "epoch": 1.2183371411542665, + "grad_norm": 0.1572265625, + "learning_rate": 0.0007025370020860237, + "loss": 0.6005, + "step": 24530 + }, + { + "epoch": 1.2188338134498857, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007024972683023741, + "loss": 0.59, + "step": 24540 + }, + { + "epoch": 1.2193304857455052, + "grad_norm": 0.11669921875, + "learning_rate": 0.0007024575345187246, + "loss": 0.6203, + "step": 24550 + }, + { + "epoch": 1.2198271580411244, + "grad_norm": 0.10302734375, + "learning_rate": 0.000702417800735075, + "loss": 0.6041, + "step": 24560 + }, + { + "epoch": 1.2203238303367439, + "grad_norm": 0.1416015625, + "learning_rate": 0.0007023780669514254, + "loss": 0.6207, + "step": 24570 + }, + { + "epoch": 1.220820502632363, + "grad_norm": 0.09765625, + "learning_rate": 0.000702338333167776, + "loss": 0.6145, + "step": 24580 + }, + { + "epoch": 1.2213171749279825, + "grad_norm": 0.103515625, + "learning_rate": 0.0007022985993841264, + "loss": 0.6016, + "step": 24590 + }, + { + "epoch": 1.221813847223602, + "grad_norm": 0.11572265625, + "learning_rate": 0.0007022588656004768, + "loss": 0.615, + "step": 24600 + }, + { + "epoch": 1.2223105195192212, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007022191318168273, + "loss": 0.6351, + "step": 24610 + }, + { + "epoch": 1.2228071918148407, + "grad_norm": 0.09521484375, + "learning_rate": 0.0007021793980331777, + "loss": 0.6276, + "step": 24620 + }, + { + "epoch": 1.2233038641104599, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007021396642495282, + "loss": 0.6095, + "step": 24630 + }, + { + "epoch": 1.2238005364060793, + "grad_norm": 0.1025390625, + "learning_rate": 0.0007020999304658786, + "loss": 0.6215, + "step": 24640 + }, + { + "epoch": 1.2242972087016986, + "grad_norm": 0.12890625, + "learning_rate": 0.0007020601966822291, + "loss": 0.5976, + "step": 24650 + }, + { + "epoch": 1.224793880997318, + "grad_norm": 0.1298828125, + "learning_rate": 0.0007020204628985796, + "loss": 0.5812, + "step": 24660 + }, + { + "epoch": 1.2252905532929372, + "grad_norm": 0.1337890625, + "learning_rate": 0.0007019807291149299, + "loss": 0.6122, + "step": 24670 + }, + { + "epoch": 1.2257872255885567, + "grad_norm": 0.134765625, + "learning_rate": 0.0007019409953312805, + "loss": 0.6131, + "step": 24680 + }, + { + "epoch": 1.226283897884176, + "grad_norm": 0.1005859375, + "learning_rate": 0.000701901261547631, + "loss": 0.6229, + "step": 24690 + }, + { + "epoch": 1.2267805701797954, + "grad_norm": 0.0927734375, + "learning_rate": 0.0007018615277639813, + "loss": 0.594, + "step": 24700 + }, + { + "epoch": 1.2272772424754148, + "grad_norm": 0.123046875, + "learning_rate": 0.0007018217939803318, + "loss": 0.5905, + "step": 24710 + }, + { + "epoch": 1.227773914771034, + "grad_norm": 0.1474609375, + "learning_rate": 0.0007017820601966822, + "loss": 0.6067, + "step": 24720 + }, + { + "epoch": 1.2282705870666535, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007017423264130327, + "loss": 0.6426, + "step": 24730 + }, + { + "epoch": 1.2287672593622727, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007017025926293832, + "loss": 0.6224, + "step": 24740 + }, + { + "epoch": 1.2292639316578922, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007016628588457336, + "loss": 0.6083, + "step": 24750 + }, + { + "epoch": 1.2297606039535114, + "grad_norm": 0.0927734375, + "learning_rate": 0.000701623125062084, + "loss": 0.6234, + "step": 24760 + }, + { + "epoch": 1.2302572762491308, + "grad_norm": 0.12451171875, + "learning_rate": 0.0007015833912784345, + "loss": 0.6315, + "step": 24770 + }, + { + "epoch": 1.2307539485447503, + "grad_norm": 0.1142578125, + "learning_rate": 0.000701543657494785, + "loss": 0.6274, + "step": 24780 + }, + { + "epoch": 1.2312506208403695, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007015039237111355, + "loss": 0.6201, + "step": 24790 + }, + { + "epoch": 1.231747293135989, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007014641899274859, + "loss": 0.6321, + "step": 24800 + }, + { + "epoch": 1.2322439654316082, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007014244561438363, + "loss": 0.6348, + "step": 24810 + }, + { + "epoch": 1.2327406377272276, + "grad_norm": 0.099609375, + "learning_rate": 0.0007013847223601868, + "loss": 0.5963, + "step": 24820 + }, + { + "epoch": 1.2332373100228469, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007013449885765372, + "loss": 0.6164, + "step": 24830 + }, + { + "epoch": 1.2337339823184663, + "grad_norm": 0.111328125, + "learning_rate": 0.0007013052547928877, + "loss": 0.6486, + "step": 24840 + }, + { + "epoch": 1.2342306546140855, + "grad_norm": 0.154296875, + "learning_rate": 0.0007012655210092382, + "loss": 0.6338, + "step": 24850 + }, + { + "epoch": 1.234727326909705, + "grad_norm": 0.09326171875, + "learning_rate": 0.0007012257872255885, + "loss": 0.6248, + "step": 24860 + }, + { + "epoch": 1.2352239992053242, + "grad_norm": 0.1044921875, + "learning_rate": 0.000701186053441939, + "loss": 0.622, + "step": 24870 + }, + { + "epoch": 1.2357206715009437, + "grad_norm": 0.0966796875, + "learning_rate": 0.0007011463196582896, + "loss": 0.5822, + "step": 24880 + }, + { + "epoch": 1.2362173437965631, + "grad_norm": 0.09765625, + "learning_rate": 0.0007011065858746399, + "loss": 0.61, + "step": 24890 + }, + { + "epoch": 1.2367140160921823, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007010668520909904, + "loss": 0.6277, + "step": 24900 + }, + { + "epoch": 1.2372106883878018, + "grad_norm": 0.134765625, + "learning_rate": 0.0007010271183073408, + "loss": 0.6312, + "step": 24910 + }, + { + "epoch": 1.237707360683421, + "grad_norm": 0.111328125, + "learning_rate": 0.0007009873845236912, + "loss": 0.5847, + "step": 24920 + }, + { + "epoch": 1.2382040329790405, + "grad_norm": 0.1201171875, + "learning_rate": 0.0007009476507400418, + "loss": 0.6328, + "step": 24930 + }, + { + "epoch": 1.2387007052746597, + "grad_norm": 0.09423828125, + "learning_rate": 0.0007009079169563922, + "loss": 0.6421, + "step": 24940 + }, + { + "epoch": 1.2391973775702791, + "grad_norm": 0.10498046875, + "learning_rate": 0.0007008681831727427, + "loss": 0.6063, + "step": 24950 + }, + { + "epoch": 1.2396940498658986, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007008284493890931, + "loss": 0.6355, + "step": 24960 + }, + { + "epoch": 1.2401907221615178, + "grad_norm": 0.1064453125, + "learning_rate": 0.0007007887156054435, + "loss": 0.6116, + "step": 24970 + }, + { + "epoch": 1.2406873944571373, + "grad_norm": 0.1484375, + "learning_rate": 0.0007007489818217941, + "loss": 0.6389, + "step": 24980 + }, + { + "epoch": 1.2411840667527565, + "grad_norm": 0.09765625, + "learning_rate": 0.0007007092480381445, + "loss": 0.641, + "step": 24990 + }, + { + "epoch": 1.241680739048376, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007006695142544949, + "loss": 0.6491, + "step": 25000 + }, + { + "epoch": 1.2421774113439952, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007006297804708454, + "loss": 0.6464, + "step": 25010 + }, + { + "epoch": 1.2426740836396146, + "grad_norm": 0.1103515625, + "learning_rate": 0.0007005900466871958, + "loss": 0.6064, + "step": 25020 + }, + { + "epoch": 1.2431707559352339, + "grad_norm": 0.1533203125, + "learning_rate": 0.0007005503129035463, + "loss": 0.6449, + "step": 25030 + }, + { + "epoch": 1.2436674282308533, + "grad_norm": 0.1279296875, + "learning_rate": 0.0007005105791198968, + "loss": 0.6289, + "step": 25040 + }, + { + "epoch": 1.2441641005264725, + "grad_norm": 0.10888671875, + "learning_rate": 0.0007004708453362471, + "loss": 0.6168, + "step": 25050 + }, + { + "epoch": 1.244660772822092, + "grad_norm": 0.09765625, + "learning_rate": 0.0007004311115525976, + "loss": 0.6012, + "step": 25060 + }, + { + "epoch": 1.2451574451177114, + "grad_norm": 0.11572265625, + "learning_rate": 0.0007003913777689481, + "loss": 0.6305, + "step": 25070 + }, + { + "epoch": 1.2456541174133307, + "grad_norm": 0.09033203125, + "learning_rate": 0.0007003516439852986, + "loss": 0.6238, + "step": 25080 + }, + { + "epoch": 1.24615078970895, + "grad_norm": 0.08740234375, + "learning_rate": 0.000700311910201649, + "loss": 0.6106, + "step": 25090 + }, + { + "epoch": 1.2466474620045693, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007002721764179994, + "loss": 0.6166, + "step": 25100 + }, + { + "epoch": 1.2471441343001888, + "grad_norm": 0.103515625, + "learning_rate": 0.0007002324426343499, + "loss": 0.6587, + "step": 25110 + }, + { + "epoch": 1.247640806595808, + "grad_norm": 0.10791015625, + "learning_rate": 0.0007001927088507003, + "loss": 0.632, + "step": 25120 + }, + { + "epoch": 1.2481374788914275, + "grad_norm": 0.1337890625, + "learning_rate": 0.0007001529750670508, + "loss": 0.6218, + "step": 25130 + }, + { + "epoch": 1.248634151187047, + "grad_norm": 0.08837890625, + "learning_rate": 0.0007001132412834013, + "loss": 0.605, + "step": 25140 + }, + { + "epoch": 1.2491308234826661, + "grad_norm": 0.10205078125, + "learning_rate": 0.0007000735074997517, + "loss": 0.6058, + "step": 25150 + }, + { + "epoch": 1.2496274957782856, + "grad_norm": 0.1611328125, + "learning_rate": 0.0007000337737161021, + "loss": 0.6327, + "step": 25160 + }, + { + "epoch": 1.2501241680739048, + "grad_norm": 0.091796875, + "learning_rate": 0.0006999940399324526, + "loss": 0.6189, + "step": 25170 + }, + { + "epoch": 1.2506208403695243, + "grad_norm": 0.18359375, + "learning_rate": 0.0006999543061488031, + "loss": 0.6172, + "step": 25180 + }, + { + "epoch": 1.2511175126651435, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006999145723651535, + "loss": 0.6432, + "step": 25190 + }, + { + "epoch": 1.251614184960763, + "grad_norm": 0.109375, + "learning_rate": 0.000699874838581504, + "loss": 0.6251, + "step": 25200 + }, + { + "epoch": 1.2521108572563822, + "grad_norm": 0.134765625, + "learning_rate": 0.0006998351047978544, + "loss": 0.6038, + "step": 25210 + }, + { + "epoch": 1.2526075295520016, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006997953710142048, + "loss": 0.6242, + "step": 25220 + }, + { + "epoch": 1.2531042018476208, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006997556372305554, + "loss": 0.612, + "step": 25230 + }, + { + "epoch": 1.2536008741432403, + "grad_norm": 0.12890625, + "learning_rate": 0.0006997159034469058, + "loss": 0.6356, + "step": 25240 + }, + { + "epoch": 1.2540975464388597, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006996761696632562, + "loss": 0.5935, + "step": 25250 + }, + { + "epoch": 1.254594218734479, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006996364358796067, + "loss": 0.5989, + "step": 25260 + }, + { + "epoch": 1.2550908910300984, + "grad_norm": 0.125, + "learning_rate": 0.0006995967020959571, + "loss": 0.615, + "step": 25270 + }, + { + "epoch": 1.2555875633257176, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006995569683123076, + "loss": 0.634, + "step": 25280 + }, + { + "epoch": 1.256084235621337, + "grad_norm": 0.109375, + "learning_rate": 0.0006995172345286581, + "loss": 0.6228, + "step": 25290 + }, + { + "epoch": 1.2565809079169563, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006994775007450085, + "loss": 0.6097, + "step": 25300 + }, + { + "epoch": 1.2570775802125758, + "grad_norm": 0.111328125, + "learning_rate": 0.0006994377669613589, + "loss": 0.6059, + "step": 25310 + }, + { + "epoch": 1.2575742525081952, + "grad_norm": 0.126953125, + "learning_rate": 0.0006993980331777093, + "loss": 0.6045, + "step": 25320 + }, + { + "epoch": 1.2580709248038144, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006993582993940599, + "loss": 0.6311, + "step": 25330 + }, + { + "epoch": 1.2585675970994337, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006993185656104103, + "loss": 0.5906, + "step": 25340 + }, + { + "epoch": 1.2590642693950531, + "grad_norm": 0.1640625, + "learning_rate": 0.0006992788318267607, + "loss": 0.6082, + "step": 25350 + }, + { + "epoch": 1.2595609416906726, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006992390980431112, + "loss": 0.5942, + "step": 25360 + }, + { + "epoch": 1.2600576139862918, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006991993642594616, + "loss": 0.606, + "step": 25370 + }, + { + "epoch": 1.2605542862819112, + "grad_norm": 0.11328125, + "learning_rate": 0.000699159630475812, + "loss": 0.6422, + "step": 25380 + }, + { + "epoch": 1.2610509585775305, + "grad_norm": 0.1171875, + "learning_rate": 0.0006991198966921626, + "loss": 0.6, + "step": 25390 + }, + { + "epoch": 1.26154763087315, + "grad_norm": 0.0908203125, + "learning_rate": 0.000699080162908513, + "loss": 0.6161, + "step": 25400 + }, + { + "epoch": 1.2620443031687691, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006990404291248634, + "loss": 0.5868, + "step": 25410 + }, + { + "epoch": 1.2625409754643886, + "grad_norm": 0.13671875, + "learning_rate": 0.0006990006953412139, + "loss": 0.5998, + "step": 25420 + }, + { + "epoch": 1.263037647760008, + "grad_norm": 0.140625, + "learning_rate": 0.0006989609615575644, + "loss": 0.5932, + "step": 25430 + }, + { + "epoch": 1.2635343200556273, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006989212277739148, + "loss": 0.6253, + "step": 25440 + }, + { + "epoch": 1.2640309923512467, + "grad_norm": 0.10546875, + "learning_rate": 0.0006988814939902653, + "loss": 0.6573, + "step": 25450 + }, + { + "epoch": 1.264527664646866, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006988417602066157, + "loss": 0.6146, + "step": 25460 + }, + { + "epoch": 1.2650243369424854, + "grad_norm": 0.146484375, + "learning_rate": 0.0006988020264229661, + "loss": 0.6223, + "step": 25470 + }, + { + "epoch": 1.2655210092381046, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006987622926393167, + "loss": 0.6207, + "step": 25480 + }, + { + "epoch": 1.266017681533724, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006987225588556671, + "loss": 0.6174, + "step": 25490 + }, + { + "epoch": 1.2665143538293435, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006986828250720175, + "loss": 0.6209, + "step": 25500 + }, + { + "epoch": 1.2670110261249627, + "grad_norm": 0.115234375, + "learning_rate": 0.0006986430912883679, + "loss": 0.6172, + "step": 25510 + }, + { + "epoch": 1.267507698420582, + "grad_norm": 0.1015625, + "learning_rate": 0.0006986033575047184, + "loss": 0.61, + "step": 25520 + }, + { + "epoch": 1.2680043707162014, + "grad_norm": 0.0966796875, + "learning_rate": 0.000698563623721069, + "loss": 0.6227, + "step": 25530 + }, + { + "epoch": 1.2685010430118209, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006985238899374193, + "loss": 0.6064, + "step": 25540 + }, + { + "epoch": 1.26899771530744, + "grad_norm": 0.099609375, + "learning_rate": 0.0006984841561537698, + "loss": 0.6189, + "step": 25550 + }, + { + "epoch": 1.2694943876030595, + "grad_norm": 0.130859375, + "learning_rate": 0.0006984444223701202, + "loss": 0.6138, + "step": 25560 + }, + { + "epoch": 1.2699910598986788, + "grad_norm": 0.1396484375, + "learning_rate": 0.0006984046885864706, + "loss": 0.6371, + "step": 25570 + }, + { + "epoch": 1.2704877321942982, + "grad_norm": 0.099609375, + "learning_rate": 0.0006983649548028212, + "loss": 0.6274, + "step": 25580 + }, + { + "epoch": 1.2709844044899175, + "grad_norm": 0.12109375, + "learning_rate": 0.0006983252210191716, + "loss": 0.613, + "step": 25590 + }, + { + "epoch": 1.271481076785537, + "grad_norm": 0.11181640625, + "learning_rate": 0.000698285487235522, + "loss": 0.5777, + "step": 25600 + }, + { + "epoch": 1.2719777490811564, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006982457534518725, + "loss": 0.6199, + "step": 25610 + }, + { + "epoch": 1.2724744213767756, + "grad_norm": 0.123046875, + "learning_rate": 0.0006982060196682229, + "loss": 0.6128, + "step": 25620 + }, + { + "epoch": 1.272971093672395, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006981662858845733, + "loss": 0.6039, + "step": 25630 + }, + { + "epoch": 1.2734677659680143, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006981265521009239, + "loss": 0.5967, + "step": 25640 + }, + { + "epoch": 1.2739644382636337, + "grad_norm": 0.109375, + "learning_rate": 0.0006980868183172743, + "loss": 0.6332, + "step": 25650 + }, + { + "epoch": 1.274461110559253, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006980470845336247, + "loss": 0.6355, + "step": 25660 + }, + { + "epoch": 1.2749577828548724, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006980073507499752, + "loss": 0.6018, + "step": 25670 + }, + { + "epoch": 1.2754544551504918, + "grad_norm": 0.12109375, + "learning_rate": 0.0006979676169663257, + "loss": 0.618, + "step": 25680 + }, + { + "epoch": 1.275951127446111, + "grad_norm": 0.2265625, + "learning_rate": 0.0006979278831826762, + "loss": 0.609, + "step": 25690 + }, + { + "epoch": 1.2764477997417303, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006978881493990265, + "loss": 0.631, + "step": 25700 + }, + { + "epoch": 1.2769444720373497, + "grad_norm": 0.099609375, + "learning_rate": 0.000697848415615377, + "loss": 0.6446, + "step": 25710 + }, + { + "epoch": 1.2774411443329692, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006978086818317275, + "loss": 0.6415, + "step": 25720 + }, + { + "epoch": 1.2779378166285884, + "grad_norm": 0.162109375, + "learning_rate": 0.0006977689480480778, + "loss": 0.6365, + "step": 25730 + }, + { + "epoch": 1.2784344889242079, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006977292142644284, + "loss": 0.669, + "step": 25740 + }, + { + "epoch": 1.278931161219827, + "grad_norm": 0.1201171875, + "learning_rate": 0.0006976894804807789, + "loss": 0.6098, + "step": 25750 + }, + { + "epoch": 1.2794278335154465, + "grad_norm": 0.09375, + "learning_rate": 0.0006976497466971292, + "loss": 0.6255, + "step": 25760 + }, + { + "epoch": 1.2799245058110658, + "grad_norm": 0.1171875, + "learning_rate": 0.0006976100129134797, + "loss": 0.6222, + "step": 25770 + }, + { + "epoch": 1.2804211781066852, + "grad_norm": 0.095703125, + "learning_rate": 0.0006975702791298301, + "loss": 0.6459, + "step": 25780 + }, + { + "epoch": 1.2809178504023047, + "grad_norm": 0.138671875, + "learning_rate": 0.0006975305453461806, + "loss": 0.6226, + "step": 25790 + }, + { + "epoch": 1.2814145226979239, + "grad_norm": 0.146484375, + "learning_rate": 0.0006974908115625311, + "loss": 0.6342, + "step": 25800 + }, + { + "epoch": 1.2819111949935433, + "grad_norm": 0.1474609375, + "learning_rate": 0.0006974510777788815, + "loss": 0.6535, + "step": 25810 + }, + { + "epoch": 1.2824078672891626, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006974113439952319, + "loss": 0.6288, + "step": 25820 + }, + { + "epoch": 1.282904539584782, + "grad_norm": 0.12109375, + "learning_rate": 0.0006973716102115825, + "loss": 0.6131, + "step": 25830 + }, + { + "epoch": 1.2834012118804012, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006973318764279329, + "loss": 0.6023, + "step": 25840 + }, + { + "epoch": 1.2838978841760207, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006972921426442834, + "loss": 0.6053, + "step": 25850 + }, + { + "epoch": 1.2843945564716401, + "grad_norm": 0.1650390625, + "learning_rate": 0.0006972524088606338, + "loss": 0.6479, + "step": 25860 + }, + { + "epoch": 1.2848912287672594, + "grad_norm": 0.109375, + "learning_rate": 0.0006972126750769842, + "loss": 0.6086, + "step": 25870 + }, + { + "epoch": 1.2853879010628786, + "grad_norm": 0.12255859375, + "learning_rate": 0.0006971729412933348, + "loss": 0.6075, + "step": 25880 + }, + { + "epoch": 1.285884573358498, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006971332075096851, + "loss": 0.616, + "step": 25890 + }, + { + "epoch": 1.2863812456541175, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006970934737260356, + "loss": 0.6025, + "step": 25900 + }, + { + "epoch": 1.2868779179497367, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006970537399423861, + "loss": 0.5992, + "step": 25910 + }, + { + "epoch": 1.2873745902453562, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006970140061587364, + "loss": 0.6205, + "step": 25920 + }, + { + "epoch": 1.2878712625409754, + "grad_norm": 0.1064453125, + "learning_rate": 0.000696974272375087, + "loss": 0.6286, + "step": 25930 + }, + { + "epoch": 1.2883679348365948, + "grad_norm": 0.095703125, + "learning_rate": 0.0006969345385914375, + "loss": 0.648, + "step": 25940 + }, + { + "epoch": 1.288864607132214, + "grad_norm": 0.138671875, + "learning_rate": 0.0006968948048077878, + "loss": 0.6122, + "step": 25950 + }, + { + "epoch": 1.2893612794278335, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006968550710241383, + "loss": 0.6291, + "step": 25960 + }, + { + "epoch": 1.289857951723453, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006968153372404887, + "loss": 0.6159, + "step": 25970 + }, + { + "epoch": 1.2903546240190722, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006967756034568392, + "loss": 0.6452, + "step": 25980 + }, + { + "epoch": 1.2908512963146916, + "grad_norm": 0.115234375, + "learning_rate": 0.0006967358696731897, + "loss": 0.6365, + "step": 25990 + }, + { + "epoch": 1.2913479686103109, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006966961358895401, + "loss": 0.6077, + "step": 26000 + }, + { + "epoch": 1.2918446409059303, + "grad_norm": 0.130859375, + "learning_rate": 0.0006966564021058906, + "loss": 0.6097, + "step": 26010 + }, + { + "epoch": 1.2923413132015495, + "grad_norm": 0.09765625, + "learning_rate": 0.000696616668322241, + "loss": 0.622, + "step": 26020 + }, + { + "epoch": 1.292837985497169, + "grad_norm": 0.09375, + "learning_rate": 0.0006965769345385914, + "loss": 0.5867, + "step": 26030 + }, + { + "epoch": 1.2933346577927884, + "grad_norm": 0.142578125, + "learning_rate": 0.000696537200754942, + "loss": 0.6268, + "step": 26040 + }, + { + "epoch": 1.2938313300884077, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006964974669712924, + "loss": 0.6172, + "step": 26050 + }, + { + "epoch": 1.294328002384027, + "grad_norm": 0.126953125, + "learning_rate": 0.0006964577331876428, + "loss": 0.61, + "step": 26060 + }, + { + "epoch": 1.2948246746796463, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006964179994039933, + "loss": 0.6454, + "step": 26070 + }, + { + "epoch": 1.2953213469752658, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006963782656203437, + "loss": 0.6162, + "step": 26080 + }, + { + "epoch": 1.295818019270885, + "grad_norm": 0.111328125, + "learning_rate": 0.0006963385318366942, + "loss": 0.6403, + "step": 26090 + }, + { + "epoch": 1.2963146915665045, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006962987980530447, + "loss": 0.6284, + "step": 26100 + }, + { + "epoch": 1.2968113638621237, + "grad_norm": 0.08935546875, + "learning_rate": 0.000696259064269395, + "loss": 0.5865, + "step": 26110 + }, + { + "epoch": 1.2973080361577431, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006962193304857455, + "loss": 0.5768, + "step": 26120 + }, + { + "epoch": 1.2978047084533624, + "grad_norm": 0.142578125, + "learning_rate": 0.000696179596702096, + "loss": 0.6388, + "step": 26130 + }, + { + "epoch": 1.2983013807489818, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006961398629184465, + "loss": 0.632, + "step": 26140 + }, + { + "epoch": 1.2987980530446013, + "grad_norm": 0.119140625, + "learning_rate": 0.0006961001291347969, + "loss": 0.6346, + "step": 26150 + }, + { + "epoch": 1.2992947253402205, + "grad_norm": 0.12353515625, + "learning_rate": 0.0006960603953511473, + "loss": 0.6329, + "step": 26160 + }, + { + "epoch": 1.29979139763584, + "grad_norm": 0.1796875, + "learning_rate": 0.0006960206615674978, + "loss": 0.6127, + "step": 26170 + }, + { + "epoch": 1.3002880699314592, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006959809277838482, + "loss": 0.6357, + "step": 26180 + }, + { + "epoch": 1.3007847422270786, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006959411940001987, + "loss": 0.6332, + "step": 26190 + }, + { + "epoch": 1.3012814145226979, + "grad_norm": 0.119140625, + "learning_rate": 0.0006959014602165492, + "loss": 0.612, + "step": 26200 + }, + { + "epoch": 1.3017780868183173, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006958617264328996, + "loss": 0.6465, + "step": 26210 + }, + { + "epoch": 1.3022747591139368, + "grad_norm": 0.099609375, + "learning_rate": 0.00069582199264925, + "loss": 0.629, + "step": 26220 + }, + { + "epoch": 1.302771431409556, + "grad_norm": 0.140625, + "learning_rate": 0.0006957822588656005, + "loss": 0.5934, + "step": 26230 + }, + { + "epoch": 1.3032681037051752, + "grad_norm": 0.099609375, + "learning_rate": 0.000695742525081951, + "loss": 0.6104, + "step": 26240 + }, + { + "epoch": 1.3037647760007947, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006957027912983014, + "loss": 0.6124, + "step": 26250 + }, + { + "epoch": 1.304261448296414, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006956630575146519, + "loss": 0.6126, + "step": 26260 + }, + { + "epoch": 1.3047581205920333, + "grad_norm": 0.13671875, + "learning_rate": 0.0006956233237310023, + "loss": 0.5908, + "step": 26270 + }, + { + "epoch": 1.3052547928876528, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006955835899473527, + "loss": 0.5986, + "step": 26280 + }, + { + "epoch": 1.305751465183272, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006955438561637033, + "loss": 0.6358, + "step": 26290 + }, + { + "epoch": 1.3062481374788915, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006955041223800537, + "loss": 0.6091, + "step": 26300 + }, + { + "epoch": 1.3067448097745107, + "grad_norm": 0.1484375, + "learning_rate": 0.0006954643885964041, + "loss": 0.6536, + "step": 26310 + }, + { + "epoch": 1.3072414820701301, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006954246548127546, + "loss": 0.6112, + "step": 26320 + }, + { + "epoch": 1.3077381543657496, + "grad_norm": 0.09423828125, + "learning_rate": 0.000695384921029105, + "loss": 0.6137, + "step": 26330 + }, + { + "epoch": 1.3082348266613688, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006953451872454555, + "loss": 0.6373, + "step": 26340 + }, + { + "epoch": 1.3087314989569883, + "grad_norm": 0.1103515625, + "learning_rate": 0.000695305453461806, + "loss": 0.6115, + "step": 26350 + }, + { + "epoch": 1.3092281712526075, + "grad_norm": 0.099609375, + "learning_rate": 0.0006952657196781564, + "loss": 0.6095, + "step": 26360 + }, + { + "epoch": 1.309724843548227, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006952259858945068, + "loss": 0.594, + "step": 26370 + }, + { + "epoch": 1.3102215158438462, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006951862521108572, + "loss": 0.6177, + "step": 26380 + }, + { + "epoch": 1.3107181881394656, + "grad_norm": 0.11669921875, + "learning_rate": 0.0006951465183272078, + "loss": 0.5909, + "step": 26390 + }, + { + "epoch": 1.311214860435085, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006951067845435582, + "loss": 0.5882, + "step": 26400 + }, + { + "epoch": 1.3117115327307043, + "grad_norm": 0.12353515625, + "learning_rate": 0.0006950670507599086, + "loss": 0.6251, + "step": 26410 + }, + { + "epoch": 1.3122082050263235, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006950273169762591, + "loss": 0.6158, + "step": 26420 + }, + { + "epoch": 1.312704877321943, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006949875831926095, + "loss": 0.636, + "step": 26430 + }, + { + "epoch": 1.3132015496175624, + "grad_norm": 0.1015625, + "learning_rate": 0.00069494784940896, + "loss": 0.6297, + "step": 26440 + }, + { + "epoch": 1.3136982219131816, + "grad_norm": 0.11669921875, + "learning_rate": 0.0006949081156253105, + "loss": 0.5981, + "step": 26450 + }, + { + "epoch": 1.314194894208801, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006948683818416609, + "loss": 0.6133, + "step": 26460 + }, + { + "epoch": 1.3146915665044203, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006948286480580113, + "loss": 0.6271, + "step": 26470 + }, + { + "epoch": 1.3151882388000398, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006947889142743618, + "loss": 0.6233, + "step": 26480 + }, + { + "epoch": 1.315684911095659, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006947491804907123, + "loss": 0.6188, + "step": 26490 + }, + { + "epoch": 1.3161815833912784, + "grad_norm": 0.099609375, + "learning_rate": 0.0006947094467070627, + "loss": 0.588, + "step": 26500 + }, + { + "epoch": 1.316678255686898, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006946697129234132, + "loss": 0.6201, + "step": 26510 + }, + { + "epoch": 1.3171749279825171, + "grad_norm": 0.107421875, + "learning_rate": 0.0006946299791397636, + "loss": 0.6294, + "step": 26520 + }, + { + "epoch": 1.3176716002781366, + "grad_norm": 0.1171875, + "learning_rate": 0.000694590245356114, + "loss": 0.6203, + "step": 26530 + }, + { + "epoch": 1.3181682725737558, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006945505115724646, + "loss": 0.6211, + "step": 26540 + }, + { + "epoch": 1.3186649448693752, + "grad_norm": 0.125, + "learning_rate": 0.000694510777788815, + "loss": 0.6181, + "step": 26550 + }, + { + "epoch": 1.3191616171649945, + "grad_norm": 0.220703125, + "learning_rate": 0.0006944710440051654, + "loss": 0.6553, + "step": 26560 + }, + { + "epoch": 1.319658289460614, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006944313102215158, + "loss": 0.5941, + "step": 26570 + }, + { + "epoch": 1.3201549617562334, + "grad_norm": 0.12255859375, + "learning_rate": 0.0006943915764378663, + "loss": 0.6245, + "step": 26580 + }, + { + "epoch": 1.3206516340518526, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006943518426542169, + "loss": 0.6374, + "step": 26590 + }, + { + "epoch": 1.3211483063474718, + "grad_norm": 0.1630859375, + "learning_rate": 0.0006943121088705672, + "loss": 0.6443, + "step": 26600 + }, + { + "epoch": 1.3216449786430913, + "grad_norm": 0.1962890625, + "learning_rate": 0.0006942723750869177, + "loss": 0.6219, + "step": 26610 + }, + { + "epoch": 1.3221416509387107, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006942326413032682, + "loss": 0.6492, + "step": 26620 + }, + { + "epoch": 1.32263832323433, + "grad_norm": 0.087890625, + "learning_rate": 0.0006941929075196185, + "loss": 0.6174, + "step": 26630 + }, + { + "epoch": 1.3231349955299494, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006941531737359691, + "loss": 0.5641, + "step": 26640 + }, + { + "epoch": 1.3236316678255686, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006941134399523195, + "loss": 0.6142, + "step": 26650 + }, + { + "epoch": 1.324128340121188, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006940737061686699, + "loss": 0.6077, + "step": 26660 + }, + { + "epoch": 1.3246250124168073, + "grad_norm": 0.099609375, + "learning_rate": 0.0006940339723850204, + "loss": 0.6305, + "step": 26670 + }, + { + "epoch": 1.3251216847124268, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006939942386013708, + "loss": 0.6343, + "step": 26680 + }, + { + "epoch": 1.3256183570080462, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006939545048177213, + "loss": 0.5751, + "step": 26690 + }, + { + "epoch": 1.3261150293036654, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006939147710340718, + "loss": 0.6058, + "step": 26700 + }, + { + "epoch": 1.3266117015992847, + "grad_norm": 0.1171875, + "learning_rate": 0.0006938750372504222, + "loss": 0.6087, + "step": 26710 + }, + { + "epoch": 1.327108373894904, + "grad_norm": 0.1357421875, + "learning_rate": 0.0006938353034667726, + "loss": 0.6055, + "step": 26720 + }, + { + "epoch": 1.3276050461905236, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006937955696831231, + "loss": 0.6121, + "step": 26730 + }, + { + "epoch": 1.3281017184861428, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006937558358994736, + "loss": 0.5855, + "step": 26740 + }, + { + "epoch": 1.3285983907817622, + "grad_norm": 0.1630859375, + "learning_rate": 0.0006937161021158241, + "loss": 0.5932, + "step": 26750 + }, + { + "epoch": 1.3290950630773817, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006936763683321744, + "loss": 0.6314, + "step": 26760 + }, + { + "epoch": 1.329591735373001, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006936366345485249, + "loss": 0.6129, + "step": 26770 + }, + { + "epoch": 1.3300884076686201, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006935969007648754, + "loss": 0.5807, + "step": 26780 + }, + { + "epoch": 1.3305850799642396, + "grad_norm": 0.12158203125, + "learning_rate": 0.0006935571669812257, + "loss": 0.5593, + "step": 26790 + }, + { + "epoch": 1.331081752259859, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006935174331975763, + "loss": 0.6105, + "step": 26800 + }, + { + "epoch": 1.3315784245554783, + "grad_norm": 0.1474609375, + "learning_rate": 0.0006934776994139268, + "loss": 0.589, + "step": 26810 + }, + { + "epoch": 1.3320750968510977, + "grad_norm": 0.123046875, + "learning_rate": 0.0006934379656302771, + "loss": 0.6002, + "step": 26820 + }, + { + "epoch": 1.332571769146717, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006933982318466276, + "loss": 0.5998, + "step": 26830 + }, + { + "epoch": 1.3330684414423364, + "grad_norm": 0.10888671875, + "learning_rate": 0.000693358498062978, + "loss": 0.6258, + "step": 26840 + }, + { + "epoch": 1.3335651137379556, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006933187642793285, + "loss": 0.628, + "step": 26850 + }, + { + "epoch": 1.334061786033575, + "grad_norm": 0.15625, + "learning_rate": 0.000693279030495679, + "loss": 0.6142, + "step": 26860 + }, + { + "epoch": 1.3345584583291945, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006932392967120294, + "loss": 0.6212, + "step": 26870 + }, + { + "epoch": 1.3350551306248137, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006931995629283799, + "loss": 0.6142, + "step": 26880 + }, + { + "epoch": 1.335551802920433, + "grad_norm": 0.1572265625, + "learning_rate": 0.0006931598291447304, + "loss": 0.6429, + "step": 26890 + }, + { + "epoch": 1.3360484752160524, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006931200953610808, + "loss": 0.6278, + "step": 26900 + }, + { + "epoch": 1.3365451475116719, + "grad_norm": 0.1611328125, + "learning_rate": 0.0006930803615774313, + "loss": 0.6065, + "step": 26910 + }, + { + "epoch": 1.337041819807291, + "grad_norm": 0.16796875, + "learning_rate": 0.0006930406277937817, + "loss": 0.62, + "step": 26920 + }, + { + "epoch": 1.3375384921029105, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006930008940101321, + "loss": 0.6124, + "step": 26930 + }, + { + "epoch": 1.33803516439853, + "grad_norm": 0.150390625, + "learning_rate": 0.0006929611602264827, + "loss": 0.618, + "step": 26940 + }, + { + "epoch": 1.3385318366941492, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006929214264428331, + "loss": 0.6175, + "step": 26950 + }, + { + "epoch": 1.3390285089897684, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006928816926591835, + "loss": 0.6139, + "step": 26960 + }, + { + "epoch": 1.339525181285388, + "grad_norm": 0.09765625, + "learning_rate": 0.000692841958875534, + "loss": 0.6035, + "step": 26970 + }, + { + "epoch": 1.3400218535810073, + "grad_norm": 0.1201171875, + "learning_rate": 0.0006928022250918843, + "loss": 0.6273, + "step": 26980 + }, + { + "epoch": 1.3405185258766266, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006927624913082349, + "loss": 0.5981, + "step": 26990 + }, + { + "epoch": 1.341015198172246, + "grad_norm": 0.15625, + "learning_rate": 0.0006927227575245854, + "loss": 0.6299, + "step": 27000 + }, + { + "epoch": 1.3415118704678652, + "grad_norm": 0.107421875, + "learning_rate": 0.0006926830237409357, + "loss": 0.6188, + "step": 27010 + }, + { + "epoch": 1.3420085427634847, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006926432899572862, + "loss": 0.6372, + "step": 27020 + }, + { + "epoch": 1.342505215059104, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006926035561736366, + "loss": 0.6362, + "step": 27030 + }, + { + "epoch": 1.3430018873547234, + "grad_norm": 0.1162109375, + "learning_rate": 0.0006925638223899872, + "loss": 0.6245, + "step": 27040 + }, + { + "epoch": 1.3434985596503428, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006925240886063376, + "loss": 0.6176, + "step": 27050 + }, + { + "epoch": 1.343995231945962, + "grad_norm": 0.0986328125, + "learning_rate": 0.000692484354822688, + "loss": 0.5695, + "step": 27060 + }, + { + "epoch": 1.3444919042415813, + "grad_norm": 0.095703125, + "learning_rate": 0.0006924446210390385, + "loss": 0.6038, + "step": 27070 + }, + { + "epoch": 1.3449885765372007, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006924048872553889, + "loss": 0.6044, + "step": 27080 + }, + { + "epoch": 1.3454852488328202, + "grad_norm": 0.177734375, + "learning_rate": 0.0006923651534717393, + "loss": 0.606, + "step": 27090 + }, + { + "epoch": 1.3459819211284394, + "grad_norm": 0.1767578125, + "learning_rate": 0.0006923254196880899, + "loss": 0.5946, + "step": 27100 + }, + { + "epoch": 1.3464785934240588, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006922856859044403, + "loss": 0.6438, + "step": 27110 + }, + { + "epoch": 1.3469752657196783, + "grad_norm": 0.1171875, + "learning_rate": 0.0006922459521207907, + "loss": 0.5927, + "step": 27120 + }, + { + "epoch": 1.3474719380152975, + "grad_norm": 0.1611328125, + "learning_rate": 0.0006922062183371412, + "loss": 0.5905, + "step": 27130 + }, + { + "epoch": 1.3479686103109167, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006921664845534917, + "loss": 0.6242, + "step": 27140 + }, + { + "epoch": 1.3484652826065362, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006921267507698421, + "loss": 0.6159, + "step": 27150 + }, + { + "epoch": 1.3489619549021556, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006920870169861926, + "loss": 0.6114, + "step": 27160 + }, + { + "epoch": 1.3494586271977749, + "grad_norm": 0.1513671875, + "learning_rate": 0.000692047283202543, + "loss": 0.5995, + "step": 27170 + }, + { + "epoch": 1.3499552994933943, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006920075494188934, + "loss": 0.6224, + "step": 27180 + }, + { + "epoch": 1.3504519717890135, + "grad_norm": 0.10791015625, + "learning_rate": 0.000691967815635244, + "loss": 0.6009, + "step": 27190 + }, + { + "epoch": 1.350948644084633, + "grad_norm": 0.142578125, + "learning_rate": 0.0006919280818515944, + "loss": 0.5902, + "step": 27200 + }, + { + "epoch": 1.3514453163802522, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006918883480679448, + "loss": 0.606, + "step": 27210 + }, + { + "epoch": 1.3519419886758717, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006918486142842953, + "loss": 0.6061, + "step": 27220 + }, + { + "epoch": 1.3524386609714911, + "grad_norm": 0.08984375, + "learning_rate": 0.0006918088805006457, + "loss": 0.6187, + "step": 27230 + }, + { + "epoch": 1.3529353332671104, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006917691467169961, + "loss": 0.6309, + "step": 27240 + }, + { + "epoch": 1.3534320055627296, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006917294129333466, + "loss": 0.625, + "step": 27250 + }, + { + "epoch": 1.353928677858349, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006916896791496971, + "loss": 0.6102, + "step": 27260 + }, + { + "epoch": 1.3544253501539685, + "grad_norm": 0.11669921875, + "learning_rate": 0.0006916499453660475, + "loss": 0.6332, + "step": 27270 + }, + { + "epoch": 1.3549220224495877, + "grad_norm": 0.10546875, + "learning_rate": 0.0006916102115823979, + "loss": 0.5904, + "step": 27280 + }, + { + "epoch": 1.3554186947452072, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006915704777987484, + "loss": 0.6378, + "step": 27290 + }, + { + "epoch": 1.3559153670408264, + "grad_norm": 0.109375, + "learning_rate": 0.0006915307440150989, + "loss": 0.61, + "step": 27300 + }, + { + "epoch": 1.3564120393364458, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006914910102314493, + "loss": 0.6269, + "step": 27310 + }, + { + "epoch": 1.356908711632065, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006914512764477998, + "loss": 0.603, + "step": 27320 + }, + { + "epoch": 1.3574053839276845, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006914115426641502, + "loss": 0.619, + "step": 27330 + }, + { + "epoch": 1.357902056223304, + "grad_norm": 0.103515625, + "learning_rate": 0.0006913718088805006, + "loss": 0.6086, + "step": 27340 + }, + { + "epoch": 1.3583987285189232, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006913320750968512, + "loss": 0.608, + "step": 27350 + }, + { + "epoch": 1.3588954008145426, + "grad_norm": 0.2197265625, + "learning_rate": 0.0006912923413132016, + "loss": 0.595, + "step": 27360 + }, + { + "epoch": 1.3593920731101619, + "grad_norm": 0.1064453125, + "learning_rate": 0.000691252607529552, + "loss": 0.6164, + "step": 27370 + }, + { + "epoch": 1.3598887454057813, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006912128737459025, + "loss": 0.6428, + "step": 27380 + }, + { + "epoch": 1.3603854177014005, + "grad_norm": 0.0986328125, + "learning_rate": 0.000691173139962253, + "loss": 0.6131, + "step": 27390 + }, + { + "epoch": 1.36088208999702, + "grad_norm": 0.146484375, + "learning_rate": 0.0006911334061786034, + "loss": 0.6363, + "step": 27400 + }, + { + "epoch": 1.3613787622926394, + "grad_norm": 0.0888671875, + "learning_rate": 0.0006910936723949539, + "loss": 0.5877, + "step": 27410 + }, + { + "epoch": 1.3618754345882587, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006910539386113043, + "loss": 0.6, + "step": 27420 + }, + { + "epoch": 1.3623721068838779, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006910142048276547, + "loss": 0.6287, + "step": 27430 + }, + { + "epoch": 1.3628687791794973, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006909744710440051, + "loss": 0.6438, + "step": 27440 + }, + { + "epoch": 1.3633654514751168, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006909347372603557, + "loss": 0.6215, + "step": 27450 + }, + { + "epoch": 1.363862123770736, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006908950034767061, + "loss": 0.6013, + "step": 27460 + }, + { + "epoch": 1.3643587960663555, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006908552696930565, + "loss": 0.6294, + "step": 27470 + }, + { + "epoch": 1.3648554683619747, + "grad_norm": 0.1015625, + "learning_rate": 0.000690815535909407, + "loss": 0.6018, + "step": 27480 + }, + { + "epoch": 1.3653521406575941, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006907758021257576, + "loss": 0.5911, + "step": 27490 + }, + { + "epoch": 1.3658488129532134, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006907360683421079, + "loss": 0.6479, + "step": 27500 + }, + { + "epoch": 1.3663454852488328, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006906963345584584, + "loss": 0.6117, + "step": 27510 + }, + { + "epoch": 1.3668421575444523, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006906566007748088, + "loss": 0.6127, + "step": 27520 + }, + { + "epoch": 1.3673388298400715, + "grad_norm": 0.099609375, + "learning_rate": 0.0006906168669911592, + "loss": 0.6233, + "step": 27530 + }, + { + "epoch": 1.367835502135691, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006905771332075097, + "loss": 0.6019, + "step": 27540 + }, + { + "epoch": 1.3683321744313102, + "grad_norm": 0.15234375, + "learning_rate": 0.0006905373994238602, + "loss": 0.6169, + "step": 27550 + }, + { + "epoch": 1.3688288467269296, + "grad_norm": 0.115234375, + "learning_rate": 0.0006904976656402106, + "loss": 0.5841, + "step": 27560 + }, + { + "epoch": 1.3693255190225488, + "grad_norm": 0.08740234375, + "learning_rate": 0.0006904579318565611, + "loss": 0.6046, + "step": 27570 + }, + { + "epoch": 1.3698221913181683, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006904181980729115, + "loss": 0.609, + "step": 27580 + }, + { + "epoch": 1.3703188636137877, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006903784642892619, + "loss": 0.6145, + "step": 27590 + }, + { + "epoch": 1.370815535909407, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006903387305056125, + "loss": 0.6483, + "step": 27600 + }, + { + "epoch": 1.3713122082050262, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006902989967219629, + "loss": 0.6199, + "step": 27610 + }, + { + "epoch": 1.3718088805006456, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006902592629383134, + "loss": 0.6271, + "step": 27620 + }, + { + "epoch": 1.372305552796265, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006902195291546637, + "loss": 0.6004, + "step": 27630 + }, + { + "epoch": 1.3728022250918843, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006901797953710142, + "loss": 0.6027, + "step": 27640 + }, + { + "epoch": 1.3732988973875038, + "grad_norm": 0.1884765625, + "learning_rate": 0.0006901400615873648, + "loss": 0.6139, + "step": 27650 + }, + { + "epoch": 1.373795569683123, + "grad_norm": 0.10546875, + "learning_rate": 0.0006901003278037151, + "loss": 0.6204, + "step": 27660 + }, + { + "epoch": 1.3742922419787424, + "grad_norm": 0.1572265625, + "learning_rate": 0.0006900605940200656, + "loss": 0.5861, + "step": 27670 + }, + { + "epoch": 1.3747889142743617, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006900208602364161, + "loss": 0.6278, + "step": 27680 + }, + { + "epoch": 1.3752855865699811, + "grad_norm": 0.1328125, + "learning_rate": 0.0006899811264527664, + "loss": 0.6139, + "step": 27690 + }, + { + "epoch": 1.3757822588656006, + "grad_norm": 0.099609375, + "learning_rate": 0.000689941392669117, + "loss": 0.6202, + "step": 27700 + }, + { + "epoch": 1.3762789311612198, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006899016588854674, + "loss": 0.5959, + "step": 27710 + }, + { + "epoch": 1.3767756034568392, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006898619251018178, + "loss": 0.6025, + "step": 27720 + }, + { + "epoch": 1.3772722757524585, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006898221913181683, + "loss": 0.5889, + "step": 27730 + }, + { + "epoch": 1.377768948048078, + "grad_norm": 0.16796875, + "learning_rate": 0.0006897824575345187, + "loss": 0.6065, + "step": 27740 + }, + { + "epoch": 1.3782656203436972, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006897427237508692, + "loss": 0.5845, + "step": 27750 + }, + { + "epoch": 1.3787622926393166, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006897029899672197, + "loss": 0.5899, + "step": 27760 + }, + { + "epoch": 1.379258964934936, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006896632561835701, + "loss": 0.6149, + "step": 27770 + }, + { + "epoch": 1.3797556372305553, + "grad_norm": 0.103515625, + "learning_rate": 0.0006896235223999206, + "loss": 0.6169, + "step": 27780 + }, + { + "epoch": 1.3802523095261745, + "grad_norm": 0.095703125, + "learning_rate": 0.000689583788616271, + "loss": 0.6069, + "step": 27790 + }, + { + "epoch": 1.380748981821794, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006895440548326215, + "loss": 0.6278, + "step": 27800 + }, + { + "epoch": 1.3812456541174134, + "grad_norm": 0.12060546875, + "learning_rate": 0.000689504321048972, + "loss": 0.5892, + "step": 27810 + }, + { + "epoch": 1.3817423264130326, + "grad_norm": 0.12158203125, + "learning_rate": 0.0006894645872653224, + "loss": 0.6149, + "step": 27820 + }, + { + "epoch": 1.382238998708652, + "grad_norm": 0.1416015625, + "learning_rate": 0.0006894248534816728, + "loss": 0.6042, + "step": 27830 + }, + { + "epoch": 1.3827356710042713, + "grad_norm": 0.107421875, + "learning_rate": 0.0006893851196980233, + "loss": 0.6224, + "step": 27840 + }, + { + "epoch": 1.3832323432998908, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006893453859143737, + "loss": 0.6311, + "step": 27850 + }, + { + "epoch": 1.38372901559551, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006893056521307242, + "loss": 0.6114, + "step": 27860 + }, + { + "epoch": 1.3842256878911294, + "grad_norm": 0.134765625, + "learning_rate": 0.0006892659183470747, + "loss": 0.6021, + "step": 27870 + }, + { + "epoch": 1.3847223601867489, + "grad_norm": 0.12255859375, + "learning_rate": 0.000689226184563425, + "loss": 0.6095, + "step": 27880 + }, + { + "epoch": 1.385219032482368, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006891864507797755, + "loss": 0.5971, + "step": 27890 + }, + { + "epoch": 1.3857157047779876, + "grad_norm": 0.109375, + "learning_rate": 0.000689146716996126, + "loss": 0.5634, + "step": 27900 + }, + { + "epoch": 1.3862123770736068, + "grad_norm": 0.12158203125, + "learning_rate": 0.0006891069832124764, + "loss": 0.6128, + "step": 27910 + }, + { + "epoch": 1.3867090493692262, + "grad_norm": 0.138671875, + "learning_rate": 0.0006890672494288269, + "loss": 0.6113, + "step": 27920 + }, + { + "epoch": 1.3872057216648455, + "grad_norm": 0.130859375, + "learning_rate": 0.0006890275156451773, + "loss": 0.5969, + "step": 27930 + }, + { + "epoch": 1.387702393960465, + "grad_norm": 0.12158203125, + "learning_rate": 0.0006889877818615278, + "loss": 0.6106, + "step": 27940 + }, + { + "epoch": 1.3881990662560844, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006889480480778783, + "loss": 0.6181, + "step": 27950 + }, + { + "epoch": 1.3886957385517036, + "grad_norm": 0.1015625, + "learning_rate": 0.0006889083142942287, + "loss": 0.6144, + "step": 27960 + }, + { + "epoch": 1.3891924108473228, + "grad_norm": 0.1015625, + "learning_rate": 0.0006888685805105792, + "loss": 0.5952, + "step": 27970 + }, + { + "epoch": 1.3896890831429423, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006888288467269296, + "loss": 0.6273, + "step": 27980 + }, + { + "epoch": 1.3901857554385617, + "grad_norm": 0.09814453125, + "learning_rate": 0.00068878911294328, + "loss": 0.611, + "step": 27990 + }, + { + "epoch": 1.390682427734181, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006887493791596306, + "loss": 0.6432, + "step": 28000 + }, + { + "epoch": 1.3911791000298004, + "grad_norm": 0.1318359375, + "learning_rate": 0.000688709645375981, + "loss": 0.5919, + "step": 28010 + }, + { + "epoch": 1.3916757723254196, + "grad_norm": 0.150390625, + "learning_rate": 0.0006886699115923314, + "loss": 0.5956, + "step": 28020 + }, + { + "epoch": 1.392172444621039, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006886301778086819, + "loss": 0.6022, + "step": 28030 + }, + { + "epoch": 1.3926691169166583, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006885904440250322, + "loss": 0.6028, + "step": 28040 + }, + { + "epoch": 1.3931657892122777, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006885507102413828, + "loss": 0.6129, + "step": 28050 + }, + { + "epoch": 1.3936624615078972, + "grad_norm": 0.1650390625, + "learning_rate": 0.0006885109764577333, + "loss": 0.5869, + "step": 28060 + }, + { + "epoch": 1.3941591338035164, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006884712426740837, + "loss": 0.6021, + "step": 28070 + }, + { + "epoch": 1.3946558060991359, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006884315088904341, + "loss": 0.6046, + "step": 28080 + }, + { + "epoch": 1.395152478394755, + "grad_norm": 0.12890625, + "learning_rate": 0.0006883917751067846, + "loss": 0.603, + "step": 28090 + }, + { + "epoch": 1.3956491506903745, + "grad_norm": 0.12255859375, + "learning_rate": 0.0006883520413231351, + "loss": 0.614, + "step": 28100 + }, + { + "epoch": 1.3961458229859938, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006883123075394855, + "loss": 0.5915, + "step": 28110 + }, + { + "epoch": 1.3966424952816132, + "grad_norm": 0.140625, + "learning_rate": 0.0006882725737558359, + "loss": 0.6009, + "step": 28120 + }, + { + "epoch": 1.3971391675772327, + "grad_norm": 0.1552734375, + "learning_rate": 0.0006882328399721864, + "loss": 0.6098, + "step": 28130 + }, + { + "epoch": 1.397635839872852, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006881931061885368, + "loss": 0.6106, + "step": 28140 + }, + { + "epoch": 1.3981325121684711, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006881533724048873, + "loss": 0.6075, + "step": 28150 + }, + { + "epoch": 1.3986291844640906, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006881136386212378, + "loss": 0.6005, + "step": 28160 + }, + { + "epoch": 1.39912585675971, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006880739048375882, + "loss": 0.5895, + "step": 28170 + }, + { + "epoch": 1.3996225290553292, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006880341710539386, + "loss": 0.6271, + "step": 28180 + }, + { + "epoch": 1.4001192013509487, + "grad_norm": 0.095703125, + "learning_rate": 0.0006879944372702891, + "loss": 0.6261, + "step": 28190 + }, + { + "epoch": 1.400615873646568, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006879547034866396, + "loss": 0.6023, + "step": 28200 + }, + { + "epoch": 1.4011125459421874, + "grad_norm": 0.11767578125, + "learning_rate": 0.00068791496970299, + "loss": 0.64, + "step": 28210 + }, + { + "epoch": 1.4016092182378066, + "grad_norm": 0.16015625, + "learning_rate": 0.0006878752359193405, + "loss": 0.5955, + "step": 28220 + }, + { + "epoch": 1.402105890533426, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006878355021356909, + "loss": 0.5777, + "step": 28230 + }, + { + "epoch": 1.4026025628290455, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006877957683520413, + "loss": 0.6048, + "step": 28240 + }, + { + "epoch": 1.4030992351246647, + "grad_norm": 0.1357421875, + "learning_rate": 0.0006877560345683919, + "loss": 0.6006, + "step": 28250 + }, + { + "epoch": 1.4035959074202842, + "grad_norm": 0.134765625, + "learning_rate": 0.0006877163007847423, + "loss": 0.5771, + "step": 28260 + }, + { + "epoch": 1.4040925797159034, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006876765670010927, + "loss": 0.6036, + "step": 28270 + }, + { + "epoch": 1.4045892520115228, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006876368332174432, + "loss": 0.5789, + "step": 28280 + }, + { + "epoch": 1.405085924307142, + "grad_norm": 0.109375, + "learning_rate": 0.0006875970994337936, + "loss": 0.6106, + "step": 28290 + }, + { + "epoch": 1.4055825966027615, + "grad_norm": 0.1640625, + "learning_rate": 0.000687557365650144, + "loss": 0.6219, + "step": 28300 + }, + { + "epoch": 1.406079268898381, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006875176318664945, + "loss": 0.6348, + "step": 28310 + }, + { + "epoch": 1.4065759411940002, + "grad_norm": 0.12255859375, + "learning_rate": 0.000687477898082845, + "loss": 0.6038, + "step": 28320 + }, + { + "epoch": 1.4070726134896194, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006874381642991954, + "loss": 0.5846, + "step": 28330 + }, + { + "epoch": 1.4075692857852389, + "grad_norm": 0.1171875, + "learning_rate": 0.0006873984305155458, + "loss": 0.6066, + "step": 28340 + }, + { + "epoch": 1.4080659580808583, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006873586967318964, + "loss": 0.6239, + "step": 28350 + }, + { + "epoch": 1.4085626303764776, + "grad_norm": 0.095703125, + "learning_rate": 0.0006873189629482468, + "loss": 0.5947, + "step": 28360 + }, + { + "epoch": 1.409059302672097, + "grad_norm": 0.1611328125, + "learning_rate": 0.0006872792291645972, + "loss": 0.615, + "step": 28370 + }, + { + "epoch": 1.4095559749677162, + "grad_norm": 0.103515625, + "learning_rate": 0.0006872394953809477, + "loss": 0.5876, + "step": 28380 + }, + { + "epoch": 1.4100526472633357, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006871997615972981, + "loss": 0.6149, + "step": 28390 + }, + { + "epoch": 1.410549319558955, + "grad_norm": 0.15625, + "learning_rate": 0.0006871600278136485, + "loss": 0.5882, + "step": 28400 + }, + { + "epoch": 1.4110459918545744, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006871202940299991, + "loss": 0.6271, + "step": 28410 + }, + { + "epoch": 1.4115426641501938, + "grad_norm": 0.123046875, + "learning_rate": 0.0006870805602463495, + "loss": 0.5835, + "step": 28420 + }, + { + "epoch": 1.412039336445813, + "grad_norm": 0.1650390625, + "learning_rate": 0.0006870408264626999, + "loss": 0.6312, + "step": 28430 + }, + { + "epoch": 1.4125360087414325, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006870010926790504, + "loss": 0.5719, + "step": 28440 + }, + { + "epoch": 1.4130326810370517, + "grad_norm": 0.146484375, + "learning_rate": 0.0006869613588954009, + "loss": 0.5973, + "step": 28450 + }, + { + "epoch": 1.4135293533326712, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006869216251117513, + "loss": 0.6259, + "step": 28460 + }, + { + "epoch": 1.4140260256282904, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006868818913281018, + "loss": 0.6359, + "step": 28470 + }, + { + "epoch": 1.4145226979239098, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006868421575444522, + "loss": 0.5925, + "step": 28480 + }, + { + "epoch": 1.4150193702195293, + "grad_norm": 0.1328125, + "learning_rate": 0.0006868024237608026, + "loss": 0.5965, + "step": 28490 + }, + { + "epoch": 1.4155160425151485, + "grad_norm": 0.125, + "learning_rate": 0.000686762689977153, + "loss": 0.6216, + "step": 28500 + }, + { + "epoch": 1.4160127148107677, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006867229561935036, + "loss": 0.5953, + "step": 28510 + }, + { + "epoch": 1.4165093871063872, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006866832224098541, + "loss": 0.6156, + "step": 28520 + }, + { + "epoch": 1.4170060594020066, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006866434886262044, + "loss": 0.6186, + "step": 28530 + }, + { + "epoch": 1.4175027316976259, + "grad_norm": 0.1416015625, + "learning_rate": 0.0006866037548425549, + "loss": 0.6166, + "step": 28540 + }, + { + "epoch": 1.4179994039932453, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006865640210589055, + "loss": 0.5936, + "step": 28550 + }, + { + "epoch": 1.4184960762888645, + "grad_norm": 0.10546875, + "learning_rate": 0.0006865242872752558, + "loss": 0.6375, + "step": 28560 + }, + { + "epoch": 1.418992748584484, + "grad_norm": 0.115234375, + "learning_rate": 0.0006864845534916063, + "loss": 0.6165, + "step": 28570 + }, + { + "epoch": 1.4194894208801032, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006864448197079567, + "loss": 0.5936, + "step": 28580 + }, + { + "epoch": 1.4199860931757227, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006864050859243071, + "loss": 0.5865, + "step": 28590 + }, + { + "epoch": 1.4204827654713421, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006863653521406576, + "loss": 0.5765, + "step": 28600 + }, + { + "epoch": 1.4209794377669613, + "grad_norm": 0.103515625, + "learning_rate": 0.0006863256183570081, + "loss": 0.5794, + "step": 28610 + }, + { + "epoch": 1.4214761100625808, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006862858845733585, + "loss": 0.6259, + "step": 28620 + }, + { + "epoch": 1.4219727823582, + "grad_norm": 0.11181640625, + "learning_rate": 0.000686246150789709, + "loss": 0.6048, + "step": 28630 + }, + { + "epoch": 1.4224694546538195, + "grad_norm": 0.1728515625, + "learning_rate": 0.0006862064170060594, + "loss": 0.606, + "step": 28640 + }, + { + "epoch": 1.4229661269494387, + "grad_norm": 0.09375, + "learning_rate": 0.0006861666832224098, + "loss": 0.6099, + "step": 28650 + }, + { + "epoch": 1.4234627992450581, + "grad_norm": 0.1337890625, + "learning_rate": 0.0006861269494387604, + "loss": 0.6104, + "step": 28660 + }, + { + "epoch": 1.4239594715406776, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006860872156551108, + "loss": 0.6408, + "step": 28670 + }, + { + "epoch": 1.4244561438362968, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006860474818714613, + "loss": 0.6107, + "step": 28680 + }, + { + "epoch": 1.424952816131916, + "grad_norm": 0.099609375, + "learning_rate": 0.0006860077480878117, + "loss": 0.6043, + "step": 28690 + }, + { + "epoch": 1.4254494884275355, + "grad_norm": 0.095703125, + "learning_rate": 0.0006859680143041621, + "loss": 0.6196, + "step": 28700 + }, + { + "epoch": 1.425946160723155, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006859282805205127, + "loss": 0.5761, + "step": 28710 + }, + { + "epoch": 1.4264428330187742, + "grad_norm": 0.146484375, + "learning_rate": 0.000685888546736863, + "loss": 0.61, + "step": 28720 + }, + { + "epoch": 1.4269395053143936, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006858488129532135, + "loss": 0.5906, + "step": 28730 + }, + { + "epoch": 1.4274361776100128, + "grad_norm": 0.1337890625, + "learning_rate": 0.000685809079169564, + "loss": 0.578, + "step": 28740 + }, + { + "epoch": 1.4279328499056323, + "grad_norm": 0.0908203125, + "learning_rate": 0.0006857693453859143, + "loss": 0.6389, + "step": 28750 + }, + { + "epoch": 1.4284295222012515, + "grad_norm": 0.109375, + "learning_rate": 0.0006857296116022649, + "loss": 0.6311, + "step": 28760 + }, + { + "epoch": 1.428926194496871, + "grad_norm": 0.125, + "learning_rate": 0.0006856898778186153, + "loss": 0.6115, + "step": 28770 + }, + { + "epoch": 1.4294228667924904, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006856501440349657, + "loss": 0.5909, + "step": 28780 + }, + { + "epoch": 1.4299195390881096, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006856104102513162, + "loss": 0.5808, + "step": 28790 + }, + { + "epoch": 1.430416211383729, + "grad_norm": 0.1328125, + "learning_rate": 0.0006855706764676666, + "loss": 0.6197, + "step": 28800 + }, + { + "epoch": 1.4309128836793483, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006855309426840171, + "loss": 0.5979, + "step": 28810 + }, + { + "epoch": 1.4314095559749678, + "grad_norm": 0.1015625, + "learning_rate": 0.0006854912089003676, + "loss": 0.6095, + "step": 28820 + }, + { + "epoch": 1.431906228270587, + "grad_norm": 0.1201171875, + "learning_rate": 0.000685451475116718, + "loss": 0.6002, + "step": 28830 + }, + { + "epoch": 1.4324029005662064, + "grad_norm": 0.1181640625, + "learning_rate": 0.0006854117413330685, + "loss": 0.6342, + "step": 28840 + }, + { + "epoch": 1.432899572861826, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006853720075494189, + "loss": 0.6459, + "step": 28850 + }, + { + "epoch": 1.4333962451574451, + "grad_norm": 0.13671875, + "learning_rate": 0.0006853322737657694, + "loss": 0.5952, + "step": 28860 + }, + { + "epoch": 1.4338929174530644, + "grad_norm": 0.111328125, + "learning_rate": 0.0006852925399821199, + "loss": 0.6145, + "step": 28870 + }, + { + "epoch": 1.4343895897486838, + "grad_norm": 0.11328125, + "learning_rate": 0.0006852528061984703, + "loss": 0.6077, + "step": 28880 + }, + { + "epoch": 1.4348862620443033, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006852130724148207, + "loss": 0.6086, + "step": 28890 + }, + { + "epoch": 1.4353829343399225, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006851733386311712, + "loss": 0.6101, + "step": 28900 + }, + { + "epoch": 1.435879606635542, + "grad_norm": 0.12255859375, + "learning_rate": 0.0006851336048475216, + "loss": 0.6171, + "step": 28910 + }, + { + "epoch": 1.4363762789311612, + "grad_norm": 0.1435546875, + "learning_rate": 0.0006850938710638721, + "loss": 0.5873, + "step": 28920 + }, + { + "epoch": 1.4368729512267806, + "grad_norm": 0.091796875, + "learning_rate": 0.0006850541372802226, + "loss": 0.638, + "step": 28930 + }, + { + "epoch": 1.4373696235223998, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006850144034965729, + "loss": 0.6042, + "step": 28940 + }, + { + "epoch": 1.4378662958180193, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006849746697129234, + "loss": 0.5868, + "step": 28950 + }, + { + "epoch": 1.4383629681136387, + "grad_norm": 0.1826171875, + "learning_rate": 0.0006849349359292739, + "loss": 0.6008, + "step": 28960 + }, + { + "epoch": 1.438859640409258, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006848952021456244, + "loss": 0.6159, + "step": 28970 + }, + { + "epoch": 1.4393563127048772, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006848554683619748, + "loss": 0.5899, + "step": 28980 + }, + { + "epoch": 1.4398529850004966, + "grad_norm": 0.1669921875, + "learning_rate": 0.0006848157345783252, + "loss": 0.5951, + "step": 28990 + }, + { + "epoch": 1.440349657296116, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006847760007946757, + "loss": 0.6214, + "step": 29000 + }, + { + "epoch": 1.4408463295917353, + "grad_norm": 0.134765625, + "learning_rate": 0.0006847362670110262, + "loss": 0.6028, + "step": 29010 + }, + { + "epoch": 1.4413430018873548, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006846965332273766, + "loss": 0.6098, + "step": 29020 + }, + { + "epoch": 1.4418396741829742, + "grad_norm": 0.0908203125, + "learning_rate": 0.0006846567994437271, + "loss": 0.6203, + "step": 29030 + }, + { + "epoch": 1.4423363464785934, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006846170656600775, + "loss": 0.6068, + "step": 29040 + }, + { + "epoch": 1.4428330187742127, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006845773318764279, + "loss": 0.6099, + "step": 29050 + }, + { + "epoch": 1.443329691069832, + "grad_norm": 0.142578125, + "learning_rate": 0.0006845375980927785, + "loss": 0.5605, + "step": 29060 + }, + { + "epoch": 1.4438263633654516, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006844978643091289, + "loss": 0.5866, + "step": 29070 + }, + { + "epoch": 1.4443230356610708, + "grad_norm": 0.119140625, + "learning_rate": 0.0006844581305254793, + "loss": 0.6057, + "step": 29080 + }, + { + "epoch": 1.4448197079566902, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006844183967418298, + "loss": 0.6059, + "step": 29090 + }, + { + "epoch": 1.4453163802523095, + "grad_norm": 0.099609375, + "learning_rate": 0.0006843786629581801, + "loss": 0.5785, + "step": 29100 + }, + { + "epoch": 1.445813052547929, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006843389291745307, + "loss": 0.5752, + "step": 29110 + }, + { + "epoch": 1.4463097248435481, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006842991953908812, + "loss": 0.5905, + "step": 29120 + }, + { + "epoch": 1.4468063971391676, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006842594616072316, + "loss": 0.5859, + "step": 29130 + }, + { + "epoch": 1.447303069434787, + "grad_norm": 0.130859375, + "learning_rate": 0.000684219727823582, + "loss": 0.6083, + "step": 29140 + }, + { + "epoch": 1.4477997417304063, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006841799940399325, + "loss": 0.5989, + "step": 29150 + }, + { + "epoch": 1.4482964140260255, + "grad_norm": 0.0986328125, + "learning_rate": 0.000684140260256283, + "loss": 0.5948, + "step": 29160 + }, + { + "epoch": 1.448793086321645, + "grad_norm": 0.103515625, + "learning_rate": 0.0006841005264726334, + "loss": 0.6228, + "step": 29170 + }, + { + "epoch": 1.4492897586172644, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006840607926889838, + "loss": 0.599, + "step": 29180 + }, + { + "epoch": 1.4497864309128836, + "grad_norm": 0.09765625, + "learning_rate": 0.0006840210589053343, + "loss": 0.6125, + "step": 29190 + }, + { + "epoch": 1.450283103208503, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006839813251216847, + "loss": 0.6139, + "step": 29200 + }, + { + "epoch": 1.4507797755041225, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006839415913380352, + "loss": 0.6245, + "step": 29210 + }, + { + "epoch": 1.4512764477997417, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006839018575543857, + "loss": 0.619, + "step": 29220 + }, + { + "epoch": 1.451773120095361, + "grad_norm": 0.185546875, + "learning_rate": 0.0006838621237707361, + "loss": 0.6243, + "step": 29230 + }, + { + "epoch": 1.4522697923909804, + "grad_norm": 0.130859375, + "learning_rate": 0.0006838223899870865, + "loss": 0.5803, + "step": 29240 + }, + { + "epoch": 1.4527664646865999, + "grad_norm": 0.216796875, + "learning_rate": 0.000683782656203437, + "loss": 0.6058, + "step": 29250 + }, + { + "epoch": 1.453263136982219, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006837429224197875, + "loss": 0.6069, + "step": 29260 + }, + { + "epoch": 1.4537598092778385, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006837031886361379, + "loss": 0.5811, + "step": 29270 + }, + { + "epoch": 1.4542564815734578, + "grad_norm": 0.1513671875, + "learning_rate": 0.0006836634548524884, + "loss": 0.5839, + "step": 29280 + }, + { + "epoch": 1.4547531538690772, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006836237210688388, + "loss": 0.6351, + "step": 29290 + }, + { + "epoch": 1.4552498261646964, + "grad_norm": 0.09765625, + "learning_rate": 0.0006835839872851892, + "loss": 0.6138, + "step": 29300 + }, + { + "epoch": 1.455746498460316, + "grad_norm": 0.1611328125, + "learning_rate": 0.0006835442535015398, + "loss": 0.6059, + "step": 29310 + }, + { + "epoch": 1.4562431707559353, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006835045197178902, + "loss": 0.5771, + "step": 29320 + }, + { + "epoch": 1.4567398430515546, + "grad_norm": 0.125, + "learning_rate": 0.0006834647859342406, + "loss": 0.5944, + "step": 29330 + }, + { + "epoch": 1.4572365153471738, + "grad_norm": 0.091796875, + "learning_rate": 0.0006834250521505911, + "loss": 0.5916, + "step": 29340 + }, + { + "epoch": 1.4577331876427932, + "grad_norm": 0.1591796875, + "learning_rate": 0.0006833853183669415, + "loss": 0.6127, + "step": 29350 + }, + { + "epoch": 1.4582298599384127, + "grad_norm": 0.109375, + "learning_rate": 0.000683345584583292, + "loss": 0.6273, + "step": 29360 + }, + { + "epoch": 1.458726532234032, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006833058507996424, + "loss": 0.6252, + "step": 29370 + }, + { + "epoch": 1.4592232045296514, + "grad_norm": 0.119140625, + "learning_rate": 0.0006832661170159929, + "loss": 0.5981, + "step": 29380 + }, + { + "epoch": 1.4597198768252708, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006832263832323433, + "loss": 0.6148, + "step": 29390 + }, + { + "epoch": 1.46021654912089, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006831866494486937, + "loss": 0.5942, + "step": 29400 + }, + { + "epoch": 1.4607132214165093, + "grad_norm": 0.11328125, + "learning_rate": 0.0006831469156650443, + "loss": 0.6089, + "step": 29410 + }, + { + "epoch": 1.4612098937121287, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006831071818813948, + "loss": 0.5826, + "step": 29420 + }, + { + "epoch": 1.4617065660077482, + "grad_norm": 0.11328125, + "learning_rate": 0.0006830674480977451, + "loss": 0.6103, + "step": 29430 + }, + { + "epoch": 1.4622032383033674, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006830277143140956, + "loss": 0.6019, + "step": 29440 + }, + { + "epoch": 1.4626999105989869, + "grad_norm": 0.10107421875, + "learning_rate": 0.000682987980530446, + "loss": 0.6102, + "step": 29450 + }, + { + "epoch": 1.463196582894606, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006829482467467965, + "loss": 0.6198, + "step": 29460 + }, + { + "epoch": 1.4636932551902255, + "grad_norm": 0.1376953125, + "learning_rate": 0.000682908512963147, + "loss": 0.5934, + "step": 29470 + }, + { + "epoch": 1.4641899274858448, + "grad_norm": 0.1328125, + "learning_rate": 0.0006828687791794974, + "loss": 0.5846, + "step": 29480 + }, + { + "epoch": 1.4646865997814642, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006828290453958478, + "loss": 0.6057, + "step": 29490 + }, + { + "epoch": 1.4651832720770837, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006827893116121983, + "loss": 0.633, + "step": 29500 + }, + { + "epoch": 1.4656799443727029, + "grad_norm": 0.1162109375, + "learning_rate": 0.0006827495778285488, + "loss": 0.6107, + "step": 29510 + }, + { + "epoch": 1.466176616668322, + "grad_norm": 0.2119140625, + "learning_rate": 0.0006827098440448992, + "loss": 0.6107, + "step": 29520 + }, + { + "epoch": 1.4666732889639416, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006826701102612497, + "loss": 0.5991, + "step": 29530 + }, + { + "epoch": 1.467169961259561, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006826303764776001, + "loss": 0.6115, + "step": 29540 + }, + { + "epoch": 1.4676666335551802, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006825906426939505, + "loss": 0.6148, + "step": 29550 + }, + { + "epoch": 1.4681633058507997, + "grad_norm": 0.1181640625, + "learning_rate": 0.000682550908910301, + "loss": 0.6007, + "step": 29560 + }, + { + "epoch": 1.468659978146419, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006825111751266515, + "loss": 0.6115, + "step": 29570 + }, + { + "epoch": 1.4691566504420384, + "grad_norm": 0.1337890625, + "learning_rate": 0.000682471441343002, + "loss": 0.6059, + "step": 29580 + }, + { + "epoch": 1.4696533227376576, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006824317075593523, + "loss": 0.6125, + "step": 29590 + }, + { + "epoch": 1.470149995033277, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006823919737757028, + "loss": 0.5979, + "step": 29600 + }, + { + "epoch": 1.4706466673288965, + "grad_norm": 0.119140625, + "learning_rate": 0.0006823522399920534, + "loss": 0.598, + "step": 29610 + }, + { + "epoch": 1.4711433396245157, + "grad_norm": 0.181640625, + "learning_rate": 0.0006823125062084037, + "loss": 0.6289, + "step": 29620 + }, + { + "epoch": 1.4716400119201352, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006822727724247542, + "loss": 0.6102, + "step": 29630 + }, + { + "epoch": 1.4721366842157544, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006822330386411046, + "loss": 0.5902, + "step": 29640 + }, + { + "epoch": 1.4726333565113738, + "grad_norm": 0.0986328125, + "learning_rate": 0.000682193304857455, + "loss": 0.5947, + "step": 29650 + }, + { + "epoch": 1.473130028806993, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006821535710738056, + "loss": 0.5978, + "step": 29660 + }, + { + "epoch": 1.4736267011026125, + "grad_norm": 0.10888671875, + "learning_rate": 0.000682113837290156, + "loss": 0.6403, + "step": 29670 + }, + { + "epoch": 1.474123373398232, + "grad_norm": 0.1953125, + "learning_rate": 0.0006820741035065064, + "loss": 0.607, + "step": 29680 + }, + { + "epoch": 1.4746200456938512, + "grad_norm": 0.09765625, + "learning_rate": 0.0006820343697228569, + "loss": 0.6096, + "step": 29690 + }, + { + "epoch": 1.4751167179894704, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006819946359392073, + "loss": 0.6089, + "step": 29700 + }, + { + "epoch": 1.4756133902850899, + "grad_norm": 0.1201171875, + "learning_rate": 0.0006819549021555579, + "loss": 0.6185, + "step": 29710 + }, + { + "epoch": 1.4761100625807093, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006819151683719083, + "loss": 0.5886, + "step": 29720 + }, + { + "epoch": 1.4766067348763285, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006818754345882587, + "loss": 0.6284, + "step": 29730 + }, + { + "epoch": 1.477103407171948, + "grad_norm": 0.111328125, + "learning_rate": 0.0006818357008046092, + "loss": 0.587, + "step": 29740 + }, + { + "epoch": 1.4776000794675672, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006817959670209596, + "loss": 0.5764, + "step": 29750 + }, + { + "epoch": 1.4780967517631867, + "grad_norm": 0.12353515625, + "learning_rate": 0.00068175623323731, + "loss": 0.6388, + "step": 29760 + }, + { + "epoch": 1.478593424058806, + "grad_norm": 0.103515625, + "learning_rate": 0.0006817164994536606, + "loss": 0.5818, + "step": 29770 + }, + { + "epoch": 1.4790900963544253, + "grad_norm": 0.109375, + "learning_rate": 0.0006816767656700109, + "loss": 0.6276, + "step": 29780 + }, + { + "epoch": 1.4795867686500448, + "grad_norm": 0.1474609375, + "learning_rate": 0.0006816370318863614, + "loss": 0.5938, + "step": 29790 + }, + { + "epoch": 1.480083440945664, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006815972981027119, + "loss": 0.5975, + "step": 29800 + }, + { + "epoch": 1.4805801132412835, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006815575643190622, + "loss": 0.6227, + "step": 29810 + }, + { + "epoch": 1.4810767855369027, + "grad_norm": 0.1474609375, + "learning_rate": 0.0006815178305354128, + "loss": 0.612, + "step": 29820 + }, + { + "epoch": 1.4815734578325221, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006814780967517632, + "loss": 0.5949, + "step": 29830 + }, + { + "epoch": 1.4820701301281414, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006814383629681136, + "loss": 0.5996, + "step": 29840 + }, + { + "epoch": 1.4825668024237608, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006813986291844641, + "loss": 0.5888, + "step": 29850 + }, + { + "epoch": 1.4830634747193803, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006813588954008145, + "loss": 0.6183, + "step": 29860 + }, + { + "epoch": 1.4835601470149995, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006813191616171651, + "loss": 0.626, + "step": 29870 + }, + { + "epoch": 1.4840568193106187, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006812794278335155, + "loss": 0.6085, + "step": 29880 + }, + { + "epoch": 1.4845534916062382, + "grad_norm": 0.1474609375, + "learning_rate": 0.0006812396940498659, + "loss": 0.5884, + "step": 29890 + }, + { + "epoch": 1.4850501639018576, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006811999602662164, + "loss": 0.6011, + "step": 29900 + }, + { + "epoch": 1.4855468361974768, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006811602264825668, + "loss": 0.6363, + "step": 29910 + }, + { + "epoch": 1.4860435084930963, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006811204926989173, + "loss": 0.6158, + "step": 29920 + }, + { + "epoch": 1.4865401807887155, + "grad_norm": 0.150390625, + "learning_rate": 0.0006810807589152678, + "loss": 0.5923, + "step": 29930 + }, + { + "epoch": 1.487036853084335, + "grad_norm": 0.1591796875, + "learning_rate": 0.0006810410251316182, + "loss": 0.6281, + "step": 29940 + }, + { + "epoch": 1.4875335253799542, + "grad_norm": 0.1591796875, + "learning_rate": 0.0006810012913479686, + "loss": 0.5919, + "step": 29950 + }, + { + "epoch": 1.4880301976755737, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006809615575643192, + "loss": 0.6249, + "step": 29960 + }, + { + "epoch": 1.488526869971193, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006809218237806695, + "loss": 0.5997, + "step": 29970 + }, + { + "epoch": 1.4890235422668123, + "grad_norm": 0.1298828125, + "learning_rate": 0.00068088208999702, + "loss": 0.6269, + "step": 29980 + }, + { + "epoch": 1.4895202145624318, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006808423562133705, + "loss": 0.6076, + "step": 29990 + }, + { + "epoch": 1.490016886858051, + "grad_norm": 0.1357421875, + "learning_rate": 0.0006808026224297208, + "loss": 0.5876, + "step": 30000 + }, + { + "epoch": 1.4905135591536705, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006807628886460713, + "loss": 0.6008, + "step": 30010 + }, + { + "epoch": 1.4910102314492897, + "grad_norm": 0.1806640625, + "learning_rate": 0.0006807231548624219, + "loss": 0.5796, + "step": 30020 + }, + { + "epoch": 1.4915069037449091, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006806834210787723, + "loss": 0.5966, + "step": 30030 + }, + { + "epoch": 1.4920035760405286, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006806436872951227, + "loss": 0.6028, + "step": 30040 + }, + { + "epoch": 1.4925002483361478, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006806039535114731, + "loss": 0.6235, + "step": 30050 + }, + { + "epoch": 1.492996920631767, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006805642197278236, + "loss": 0.6222, + "step": 30060 + }, + { + "epoch": 1.4934935929273865, + "grad_norm": 0.11328125, + "learning_rate": 0.0006805244859441741, + "loss": 0.5923, + "step": 30070 + }, + { + "epoch": 1.493990265223006, + "grad_norm": 0.09033203125, + "learning_rate": 0.0006804847521605245, + "loss": 0.5916, + "step": 30080 + }, + { + "epoch": 1.4944869375186252, + "grad_norm": 0.099609375, + "learning_rate": 0.000680445018376875, + "loss": 0.6256, + "step": 30090 + }, + { + "epoch": 1.4949836098142446, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006804052845932254, + "loss": 0.608, + "step": 30100 + }, + { + "epoch": 1.4954802821098638, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006803655508095758, + "loss": 0.5927, + "step": 30110 + }, + { + "epoch": 1.4959769544054833, + "grad_norm": 0.130859375, + "learning_rate": 0.0006803258170259264, + "loss": 0.6394, + "step": 30120 + }, + { + "epoch": 1.4964736267011025, + "grad_norm": 0.1376953125, + "learning_rate": 0.0006802860832422768, + "loss": 0.6129, + "step": 30130 + }, + { + "epoch": 1.496970298996722, + "grad_norm": 0.0908203125, + "learning_rate": 0.0006802463494586272, + "loss": 0.6006, + "step": 30140 + }, + { + "epoch": 1.4974669712923414, + "grad_norm": 0.134765625, + "learning_rate": 0.0006802066156749777, + "loss": 0.6076, + "step": 30150 + }, + { + "epoch": 1.4979636435879606, + "grad_norm": 0.14453125, + "learning_rate": 0.0006801668818913281, + "loss": 0.6152, + "step": 30160 + }, + { + "epoch": 1.49846031588358, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006801271481076786, + "loss": 0.6272, + "step": 30170 + }, + { + "epoch": 1.4989569881791993, + "grad_norm": 0.138671875, + "learning_rate": 0.0006800874143240291, + "loss": 0.6088, + "step": 30180 + }, + { + "epoch": 1.4994536604748188, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006800476805403795, + "loss": 0.6184, + "step": 30190 + }, + { + "epoch": 1.499950332770438, + "grad_norm": 0.111328125, + "learning_rate": 0.0006800079467567299, + "loss": 0.594, + "step": 30200 + }, + { + "epoch": 1.5004470050660574, + "grad_norm": 0.12255859375, + "learning_rate": 0.0006799682129730804, + "loss": 0.617, + "step": 30210 + }, + { + "epoch": 1.5009436773616769, + "grad_norm": 0.09375, + "learning_rate": 0.0006799284791894309, + "loss": 0.6101, + "step": 30220 + }, + { + "epoch": 1.5014403496572961, + "grad_norm": 0.1552734375, + "learning_rate": 0.0006798887454057813, + "loss": 0.6013, + "step": 30230 + }, + { + "epoch": 1.5019370219529153, + "grad_norm": 0.140625, + "learning_rate": 0.0006798490116221317, + "loss": 0.6145, + "step": 30240 + }, + { + "epoch": 1.5024336942485348, + "grad_norm": 0.158203125, + "learning_rate": 0.0006798092778384822, + "loss": 0.6136, + "step": 30250 + }, + { + "epoch": 1.5029303665441542, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006797695440548326, + "loss": 0.6326, + "step": 30260 + }, + { + "epoch": 1.5034270388397735, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006797298102711831, + "loss": 0.6127, + "step": 30270 + }, + { + "epoch": 1.503923711135393, + "grad_norm": 0.12060546875, + "learning_rate": 0.0006796900764875336, + "loss": 0.5979, + "step": 30280 + }, + { + "epoch": 1.5044203834310124, + "grad_norm": 0.08837890625, + "learning_rate": 0.000679650342703884, + "loss": 0.6082, + "step": 30290 + }, + { + "epoch": 1.5049170557266316, + "grad_norm": 0.140625, + "learning_rate": 0.0006796106089202344, + "loss": 0.6313, + "step": 30300 + }, + { + "epoch": 1.5054137280222508, + "grad_norm": 0.09375, + "learning_rate": 0.0006795708751365849, + "loss": 0.5912, + "step": 30310 + }, + { + "epoch": 1.5059104003178703, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006795311413529354, + "loss": 0.6078, + "step": 30320 + }, + { + "epoch": 1.5064070726134897, + "grad_norm": 0.103515625, + "learning_rate": 0.0006794914075692858, + "loss": 0.5964, + "step": 30330 + }, + { + "epoch": 1.506903744909109, + "grad_norm": 0.10546875, + "learning_rate": 0.0006794516737856363, + "loss": 0.5968, + "step": 30340 + }, + { + "epoch": 1.5074004172047282, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006794119400019867, + "loss": 0.6018, + "step": 30350 + }, + { + "epoch": 1.5078970895003476, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006793722062183371, + "loss": 0.5776, + "step": 30360 + }, + { + "epoch": 1.508393761795967, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006793324724346877, + "loss": 0.616, + "step": 30370 + }, + { + "epoch": 1.5088904340915863, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006792927386510381, + "loss": 0.6041, + "step": 30380 + }, + { + "epoch": 1.5093871063872057, + "grad_norm": 0.142578125, + "learning_rate": 0.0006792530048673885, + "loss": 0.6189, + "step": 30390 + }, + { + "epoch": 1.5098837786828252, + "grad_norm": 0.1220703125, + "learning_rate": 0.000679213271083739, + "loss": 0.6155, + "step": 30400 + }, + { + "epoch": 1.5103804509784444, + "grad_norm": 0.111328125, + "learning_rate": 0.0006791735373000894, + "loss": 0.596, + "step": 30410 + }, + { + "epoch": 1.5108771232740636, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006791338035164399, + "loss": 0.614, + "step": 30420 + }, + { + "epoch": 1.511373795569683, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006790940697327903, + "loss": 0.5913, + "step": 30430 + }, + { + "epoch": 1.5118704678653025, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006790543359491408, + "loss": 0.6084, + "step": 30440 + }, + { + "epoch": 1.5123671401609218, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006790146021654912, + "loss": 0.5904, + "step": 30450 + }, + { + "epoch": 1.5128638124565412, + "grad_norm": 0.1513671875, + "learning_rate": 0.0006789748683818416, + "loss": 0.5973, + "step": 30460 + }, + { + "epoch": 1.5133604847521607, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006789351345981922, + "loss": 0.6363, + "step": 30470 + }, + { + "epoch": 1.51385715704778, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006788954008145427, + "loss": 0.617, + "step": 30480 + }, + { + "epoch": 1.5143538293433991, + "grad_norm": 0.11865234375, + "learning_rate": 0.000678855667030893, + "loss": 0.6019, + "step": 30490 + }, + { + "epoch": 1.5148505016390186, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006788159332472435, + "loss": 0.5968, + "step": 30500 + }, + { + "epoch": 1.515347173934638, + "grad_norm": 0.1513671875, + "learning_rate": 0.0006787761994635939, + "loss": 0.6072, + "step": 30510 + }, + { + "epoch": 1.5158438462302573, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006787364656799444, + "loss": 0.6382, + "step": 30520 + }, + { + "epoch": 1.5163405185258765, + "grad_norm": 0.158203125, + "learning_rate": 0.0006786967318962949, + "loss": 0.5835, + "step": 30530 + }, + { + "epoch": 1.516837190821496, + "grad_norm": 0.091796875, + "learning_rate": 0.0006786569981126453, + "loss": 0.6065, + "step": 30540 + }, + { + "epoch": 1.5173338631171154, + "grad_norm": 0.126953125, + "learning_rate": 0.0006786172643289957, + "loss": 0.6027, + "step": 30550 + }, + { + "epoch": 1.5178305354127346, + "grad_norm": 0.1328125, + "learning_rate": 0.0006785775305453462, + "loss": 0.5881, + "step": 30560 + }, + { + "epoch": 1.518327207708354, + "grad_norm": 0.0888671875, + "learning_rate": 0.0006785377967616967, + "loss": 0.5985, + "step": 30570 + }, + { + "epoch": 1.5188238800039735, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006784980629780471, + "loss": 0.5662, + "step": 30580 + }, + { + "epoch": 1.5193205522995927, + "grad_norm": 0.154296875, + "learning_rate": 0.0006784583291943976, + "loss": 0.5797, + "step": 30590 + }, + { + "epoch": 1.519817224595212, + "grad_norm": 0.09814453125, + "learning_rate": 0.000678418595410748, + "loss": 0.565, + "step": 30600 + }, + { + "epoch": 1.5203138968908314, + "grad_norm": 0.14453125, + "learning_rate": 0.0006783788616270985, + "loss": 0.5841, + "step": 30610 + }, + { + "epoch": 1.5208105691864509, + "grad_norm": 0.09912109375, + "learning_rate": 0.000678339127843449, + "loss": 0.6007, + "step": 30620 + }, + { + "epoch": 1.52130724148207, + "grad_norm": 0.1328125, + "learning_rate": 0.0006782993940597994, + "loss": 0.5948, + "step": 30630 + }, + { + "epoch": 1.5218039137776895, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006782596602761499, + "loss": 0.5944, + "step": 30640 + }, + { + "epoch": 1.522300586073309, + "grad_norm": 0.162109375, + "learning_rate": 0.0006782199264925002, + "loss": 0.6135, + "step": 30650 + }, + { + "epoch": 1.5227972583689282, + "grad_norm": 0.11669921875, + "learning_rate": 0.0006781801927088507, + "loss": 0.6057, + "step": 30660 + }, + { + "epoch": 1.5232939306645474, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006781404589252013, + "loss": 0.6222, + "step": 30670 + }, + { + "epoch": 1.5237906029601669, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006781007251415516, + "loss": 0.5781, + "step": 30680 + }, + { + "epoch": 1.5242872752557863, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006780609913579021, + "loss": 0.6204, + "step": 30690 + }, + { + "epoch": 1.5247839475514056, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006780212575742525, + "loss": 0.6038, + "step": 30700 + }, + { + "epoch": 1.5252806198470248, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006779815237906029, + "loss": 0.6073, + "step": 30710 + }, + { + "epoch": 1.5257772921426442, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006779417900069535, + "loss": 0.6317, + "step": 30720 + }, + { + "epoch": 1.5262739644382637, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006779020562233039, + "loss": 0.6054, + "step": 30730 + }, + { + "epoch": 1.526770636733883, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006778623224396543, + "loss": 0.5999, + "step": 30740 + }, + { + "epoch": 1.5272673090295024, + "grad_norm": 0.2158203125, + "learning_rate": 0.0006778225886560048, + "loss": 0.6027, + "step": 30750 + }, + { + "epoch": 1.5277639813251218, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006777828548723552, + "loss": 0.6004, + "step": 30760 + }, + { + "epoch": 1.528260653620741, + "grad_norm": 0.111328125, + "learning_rate": 0.0006777431210887058, + "loss": 0.5881, + "step": 30770 + }, + { + "epoch": 1.5287573259163603, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006777033873050562, + "loss": 0.6004, + "step": 30780 + }, + { + "epoch": 1.5292539982119797, + "grad_norm": 0.134765625, + "learning_rate": 0.0006776636535214066, + "loss": 0.6021, + "step": 30790 + }, + { + "epoch": 1.5297506705075992, + "grad_norm": 0.1201171875, + "learning_rate": 0.0006776239197377571, + "loss": 0.6088, + "step": 30800 + }, + { + "epoch": 1.5302473428032184, + "grad_norm": 0.1162109375, + "learning_rate": 0.0006775841859541075, + "loss": 0.5897, + "step": 30810 + }, + { + "epoch": 1.5307440150988378, + "grad_norm": 0.1015625, + "learning_rate": 0.000677544452170458, + "loss": 0.5908, + "step": 30820 + }, + { + "epoch": 1.5312406873944573, + "grad_norm": 0.126953125, + "learning_rate": 0.0006775047183868085, + "loss": 0.5859, + "step": 30830 + }, + { + "epoch": 1.5317373596900765, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006774649846031588, + "loss": 0.5808, + "step": 30840 + }, + { + "epoch": 1.5322340319856957, + "grad_norm": 0.1591796875, + "learning_rate": 0.0006774252508195093, + "loss": 0.6055, + "step": 30850 + }, + { + "epoch": 1.5327307042813152, + "grad_norm": 0.1611328125, + "learning_rate": 0.0006773855170358598, + "loss": 0.5918, + "step": 30860 + }, + { + "epoch": 1.5332273765769346, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006773457832522101, + "loss": 0.6138, + "step": 30870 + }, + { + "epoch": 1.5337240488725539, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006773060494685607, + "loss": 0.6295, + "step": 30880 + }, + { + "epoch": 1.534220721168173, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006772663156849112, + "loss": 0.6049, + "step": 30890 + }, + { + "epoch": 1.5347173934637925, + "grad_norm": 0.1171875, + "learning_rate": 0.0006772265819012615, + "loss": 0.5895, + "step": 30900 + }, + { + "epoch": 1.535214065759412, + "grad_norm": 0.1337890625, + "learning_rate": 0.000677186848117612, + "loss": 0.6292, + "step": 30910 + }, + { + "epoch": 1.5357107380550312, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006771471143339625, + "loss": 0.6287, + "step": 30920 + }, + { + "epoch": 1.5362074103506507, + "grad_norm": 0.0908203125, + "learning_rate": 0.000677107380550313, + "loss": 0.5945, + "step": 30930 + }, + { + "epoch": 1.5367040826462701, + "grad_norm": 0.111328125, + "learning_rate": 0.0006770676467666634, + "loss": 0.6321, + "step": 30940 + }, + { + "epoch": 1.5372007549418893, + "grad_norm": 0.11328125, + "learning_rate": 0.0006770279129830138, + "loss": 0.6027, + "step": 30950 + }, + { + "epoch": 1.5376974272375086, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006769881791993643, + "loss": 0.6035, + "step": 30960 + }, + { + "epoch": 1.538194099533128, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006769484454157148, + "loss": 0.6127, + "step": 30970 + }, + { + "epoch": 1.5386907718287475, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006769087116320652, + "loss": 0.5827, + "step": 30980 + }, + { + "epoch": 1.5391874441243667, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006768689778484157, + "loss": 0.617, + "step": 30990 + }, + { + "epoch": 1.5396841164199861, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006768292440647661, + "loss": 0.6028, + "step": 31000 + }, + { + "epoch": 1.5401807887156056, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006767895102811165, + "loss": 0.5821, + "step": 31010 + }, + { + "epoch": 1.5406774610112248, + "grad_norm": 0.146484375, + "learning_rate": 0.0006767497764974671, + "loss": 0.602, + "step": 31020 + }, + { + "epoch": 1.541174133306844, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006767100427138174, + "loss": 0.6195, + "step": 31030 + }, + { + "epoch": 1.5416708056024635, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006766703089301679, + "loss": 0.6345, + "step": 31040 + }, + { + "epoch": 1.542167477898083, + "grad_norm": 0.1337890625, + "learning_rate": 0.0006766305751465184, + "loss": 0.6286, + "step": 31050 + }, + { + "epoch": 1.5426641501937022, + "grad_norm": 0.1455078125, + "learning_rate": 0.0006765908413628688, + "loss": 0.607, + "step": 31060 + }, + { + "epoch": 1.5431608224893214, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006765511075792192, + "loss": 0.6033, + "step": 31070 + }, + { + "epoch": 1.5436574947849409, + "grad_norm": 0.10546875, + "learning_rate": 0.0006765113737955698, + "loss": 0.5816, + "step": 31080 + }, + { + "epoch": 1.5441541670805603, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006764716400119202, + "loss": 0.6319, + "step": 31090 + }, + { + "epoch": 1.5446508393761795, + "grad_norm": 0.1357421875, + "learning_rate": 0.0006764319062282706, + "loss": 0.5986, + "step": 31100 + }, + { + "epoch": 1.545147511671799, + "grad_norm": 0.1494140625, + "learning_rate": 0.000676392172444621, + "loss": 0.6509, + "step": 31110 + }, + { + "epoch": 1.5456441839674184, + "grad_norm": 0.1171875, + "learning_rate": 0.0006763524386609716, + "loss": 0.5765, + "step": 31120 + }, + { + "epoch": 1.5461408562630377, + "grad_norm": 0.1787109375, + "learning_rate": 0.000676312704877322, + "loss": 0.5904, + "step": 31130 + }, + { + "epoch": 1.5466375285586569, + "grad_norm": 0.17578125, + "learning_rate": 0.0006762729710936724, + "loss": 0.5956, + "step": 31140 + }, + { + "epoch": 1.5471342008542763, + "grad_norm": 0.08984375, + "learning_rate": 0.0006762332373100229, + "loss": 0.5829, + "step": 31150 + }, + { + "epoch": 1.5476308731498958, + "grad_norm": 0.125, + "learning_rate": 0.0006761935035263733, + "loss": 0.5784, + "step": 31160 + }, + { + "epoch": 1.548127545445515, + "grad_norm": 0.15234375, + "learning_rate": 0.0006761537697427237, + "loss": 0.6014, + "step": 31170 + }, + { + "epoch": 1.5486242177411342, + "grad_norm": 0.09033203125, + "learning_rate": 0.0006761140359590743, + "loss": 0.6188, + "step": 31180 + }, + { + "epoch": 1.549120890036754, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006760743021754247, + "loss": 0.6035, + "step": 31190 + }, + { + "epoch": 1.5496175623323731, + "grad_norm": 0.103515625, + "learning_rate": 0.0006760345683917751, + "loss": 0.5998, + "step": 31200 + }, + { + "epoch": 1.5501142346279924, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006759948346081256, + "loss": 0.5868, + "step": 31210 + }, + { + "epoch": 1.5506109069236118, + "grad_norm": 0.146484375, + "learning_rate": 0.000675955100824476, + "loss": 0.6067, + "step": 31220 + }, + { + "epoch": 1.5511075792192313, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006759153670408265, + "loss": 0.6151, + "step": 31230 + }, + { + "epoch": 1.5516042515148505, + "grad_norm": 0.109375, + "learning_rate": 0.000675875633257177, + "loss": 0.6484, + "step": 31240 + }, + { + "epoch": 1.5521009238104697, + "grad_norm": 0.1572265625, + "learning_rate": 0.0006758358994735274, + "loss": 0.6143, + "step": 31250 + }, + { + "epoch": 1.5525975961060892, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006757961656898778, + "loss": 0.5907, + "step": 31260 + }, + { + "epoch": 1.5530942684017086, + "grad_norm": 0.22265625, + "learning_rate": 0.0006757564319062284, + "loss": 0.6016, + "step": 31270 + }, + { + "epoch": 1.5535909406973278, + "grad_norm": 0.20703125, + "learning_rate": 0.0006757166981225788, + "loss": 0.6084, + "step": 31280 + }, + { + "epoch": 1.5540876129929473, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006756769643389292, + "loss": 0.6154, + "step": 31290 + }, + { + "epoch": 1.5545842852885667, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006756372305552796, + "loss": 0.6008, + "step": 31300 + }, + { + "epoch": 1.555080957584186, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006755974967716301, + "loss": 0.5986, + "step": 31310 + }, + { + "epoch": 1.5555776298798052, + "grad_norm": 0.1171875, + "learning_rate": 0.0006755577629879805, + "loss": 0.5987, + "step": 31320 + }, + { + "epoch": 1.5560743021754246, + "grad_norm": 0.09326171875, + "learning_rate": 0.000675518029204331, + "loss": 0.5984, + "step": 31330 + }, + { + "epoch": 1.556570974471044, + "grad_norm": 0.16015625, + "learning_rate": 0.0006754782954206815, + "loss": 0.5834, + "step": 31340 + }, + { + "epoch": 1.5570676467666633, + "grad_norm": 0.1181640625, + "learning_rate": 0.0006754385616370319, + "loss": 0.6261, + "step": 31350 + }, + { + "epoch": 1.5575643190622825, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006753988278533823, + "loss": 0.5748, + "step": 31360 + }, + { + "epoch": 1.5580609913579022, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006753590940697328, + "loss": 0.5874, + "step": 31370 + }, + { + "epoch": 1.5585576636535214, + "grad_norm": 0.126953125, + "learning_rate": 0.0006753193602860833, + "loss": 0.5873, + "step": 31380 + }, + { + "epoch": 1.5590543359491407, + "grad_norm": 0.08837890625, + "learning_rate": 0.0006752796265024337, + "loss": 0.6083, + "step": 31390 + }, + { + "epoch": 1.5595510082447601, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006752398927187842, + "loss": 0.5917, + "step": 31400 + }, + { + "epoch": 1.5600476805403796, + "grad_norm": 0.14453125, + "learning_rate": 0.0006752001589351346, + "loss": 0.6035, + "step": 31410 + }, + { + "epoch": 1.5605443528359988, + "grad_norm": 0.10400390625, + "learning_rate": 0.000675160425151485, + "loss": 0.5833, + "step": 31420 + }, + { + "epoch": 1.561041025131618, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006751206913678356, + "loss": 0.5765, + "step": 31430 + }, + { + "epoch": 1.5615376974272375, + "grad_norm": 0.1103515625, + "learning_rate": 0.000675080957584186, + "loss": 0.5868, + "step": 31440 + }, + { + "epoch": 1.562034369722857, + "grad_norm": 0.115234375, + "learning_rate": 0.0006750412238005364, + "loss": 0.6094, + "step": 31450 + }, + { + "epoch": 1.5625310420184761, + "grad_norm": 0.154296875, + "learning_rate": 0.0006750014900168869, + "loss": 0.5841, + "step": 31460 + }, + { + "epoch": 1.5630277143140956, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006749617562332373, + "loss": 0.5807, + "step": 31470 + }, + { + "epoch": 1.563524386609715, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006749220224495878, + "loss": 0.5957, + "step": 31480 + }, + { + "epoch": 1.5640210589053343, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006748822886659383, + "loss": 0.6276, + "step": 31490 + }, + { + "epoch": 1.5645177312009535, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006748425548822887, + "loss": 0.5916, + "step": 31500 + }, + { + "epoch": 1.565014403496573, + "grad_norm": 0.1513671875, + "learning_rate": 0.0006748028210986392, + "loss": 0.5754, + "step": 31510 + }, + { + "epoch": 1.5655110757921924, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006747630873149895, + "loss": 0.6124, + "step": 31520 + }, + { + "epoch": 1.5660077480878116, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006747233535313401, + "loss": 0.579, + "step": 31530 + }, + { + "epoch": 1.5665044203834309, + "grad_norm": 0.0908203125, + "learning_rate": 0.0006746836197476906, + "loss": 0.6369, + "step": 31540 + }, + { + "epoch": 1.5670010926790505, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006746438859640409, + "loss": 0.6231, + "step": 31550 + }, + { + "epoch": 1.5674977649746697, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006746041521803914, + "loss": 0.6074, + "step": 31560 + }, + { + "epoch": 1.567994437270289, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006745644183967418, + "loss": 0.6241, + "step": 31570 + }, + { + "epoch": 1.5684911095659084, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006745246846130923, + "loss": 0.5816, + "step": 31580 + }, + { + "epoch": 1.5689877818615279, + "grad_norm": 0.13671875, + "learning_rate": 0.0006744849508294428, + "loss": 0.6155, + "step": 31590 + }, + { + "epoch": 1.569484454157147, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006744452170457932, + "loss": 0.6011, + "step": 31600 + }, + { + "epoch": 1.5699811264527663, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006744054832621436, + "loss": 0.6116, + "step": 31610 + }, + { + "epoch": 1.5704777987483858, + "grad_norm": 0.1484375, + "learning_rate": 0.0006743657494784941, + "loss": 0.5968, + "step": 31620 + }, + { + "epoch": 1.5709744710440052, + "grad_norm": 0.119140625, + "learning_rate": 0.0006743260156948446, + "loss": 0.5831, + "step": 31630 + }, + { + "epoch": 1.5714711433396245, + "grad_norm": 0.1259765625, + "learning_rate": 0.000674286281911195, + "loss": 0.614, + "step": 31640 + }, + { + "epoch": 1.571967815635244, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006742465481275455, + "loss": 0.5724, + "step": 31650 + }, + { + "epoch": 1.5724644879308634, + "grad_norm": 0.1396484375, + "learning_rate": 0.0006742068143438959, + "loss": 0.5828, + "step": 31660 + }, + { + "epoch": 1.5729611602264826, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006741670805602464, + "loss": 0.5952, + "step": 31670 + }, + { + "epoch": 1.5734578325221018, + "grad_norm": 0.1396484375, + "learning_rate": 0.0006741273467765969, + "loss": 0.6089, + "step": 31680 + }, + { + "epoch": 1.5739545048177213, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006740876129929473, + "loss": 0.6162, + "step": 31690 + }, + { + "epoch": 1.5744511771133407, + "grad_norm": 0.1015625, + "learning_rate": 0.0006740478792092978, + "loss": 0.5958, + "step": 31700 + }, + { + "epoch": 1.57494784940896, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006740081454256481, + "loss": 0.5772, + "step": 31710 + }, + { + "epoch": 1.5754445217045792, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006739684116419986, + "loss": 0.594, + "step": 31720 + }, + { + "epoch": 1.5759411940001988, + "grad_norm": 0.1376953125, + "learning_rate": 0.0006739286778583492, + "loss": 0.6195, + "step": 31730 + }, + { + "epoch": 1.576437866295818, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006738889440746995, + "loss": 0.6125, + "step": 31740 + }, + { + "epoch": 1.5769345385914373, + "grad_norm": 0.0986328125, + "learning_rate": 0.00067384921029105, + "loss": 0.6229, + "step": 31750 + }, + { + "epoch": 1.5774312108870567, + "grad_norm": 0.1376953125, + "learning_rate": 0.0006738094765074005, + "loss": 0.5876, + "step": 31760 + }, + { + "epoch": 1.5779278831826762, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006737697427237508, + "loss": 0.5819, + "step": 31770 + }, + { + "epoch": 1.5784245554782954, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006737300089401014, + "loss": 0.59, + "step": 31780 + }, + { + "epoch": 1.5789212277739146, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006736902751564518, + "loss": 0.6214, + "step": 31790 + }, + { + "epoch": 1.579417900069534, + "grad_norm": 0.123046875, + "learning_rate": 0.0006736505413728022, + "loss": 0.6092, + "step": 31800 + }, + { + "epoch": 1.5799145723651535, + "grad_norm": 0.08935546875, + "learning_rate": 0.0006736108075891527, + "loss": 0.601, + "step": 31810 + }, + { + "epoch": 1.5804112446607728, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006735710738055031, + "loss": 0.6004, + "step": 31820 + }, + { + "epoch": 1.5809079169563922, + "grad_norm": 0.1806640625, + "learning_rate": 0.0006735313400218537, + "loss": 0.5944, + "step": 31830 + }, + { + "epoch": 1.5814045892520117, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006734916062382041, + "loss": 0.6069, + "step": 31840 + }, + { + "epoch": 1.5819012615476309, + "grad_norm": 0.12890625, + "learning_rate": 0.0006734518724545545, + "loss": 0.5988, + "step": 31850 + }, + { + "epoch": 1.5823979338432501, + "grad_norm": 0.138671875, + "learning_rate": 0.000673412138670905, + "loss": 0.5958, + "step": 31860 + }, + { + "epoch": 1.5828946061388696, + "grad_norm": 0.091796875, + "learning_rate": 0.0006733724048872554, + "loss": 0.5858, + "step": 31870 + }, + { + "epoch": 1.583391278434489, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006733326711036059, + "loss": 0.6077, + "step": 31880 + }, + { + "epoch": 1.5838879507301082, + "grad_norm": 0.08984375, + "learning_rate": 0.0006732929373199564, + "loss": 0.5997, + "step": 31890 + }, + { + "epoch": 1.5843846230257275, + "grad_norm": 0.1171875, + "learning_rate": 0.0006732532035363067, + "loss": 0.6122, + "step": 31900 + }, + { + "epoch": 1.5848812953213471, + "grad_norm": 0.08984375, + "learning_rate": 0.0006732134697526572, + "loss": 0.5781, + "step": 31910 + }, + { + "epoch": 1.5853779676169664, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006731737359690077, + "loss": 0.5809, + "step": 31920 + }, + { + "epoch": 1.5858746399125856, + "grad_norm": 0.10400390625, + "learning_rate": 0.000673134002185358, + "loss": 0.6108, + "step": 31930 + }, + { + "epoch": 1.586371312208205, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006730942684017086, + "loss": 0.604, + "step": 31940 + }, + { + "epoch": 1.5868679845038245, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006730545346180591, + "loss": 0.6119, + "step": 31950 + }, + { + "epoch": 1.5873646567994437, + "grad_norm": 0.162109375, + "learning_rate": 0.0006730148008344095, + "loss": 0.5925, + "step": 31960 + }, + { + "epoch": 1.587861329095063, + "grad_norm": 0.103515625, + "learning_rate": 0.0006729750670507599, + "loss": 0.588, + "step": 31970 + }, + { + "epoch": 1.5883580013906824, + "grad_norm": 0.1376953125, + "learning_rate": 0.0006729353332671104, + "loss": 0.6016, + "step": 31980 + }, + { + "epoch": 1.5888546736863018, + "grad_norm": 0.140625, + "learning_rate": 0.0006728955994834609, + "loss": 0.6076, + "step": 31990 + }, + { + "epoch": 1.589351345981921, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006728558656998113, + "loss": 0.6245, + "step": 32000 + }, + { + "epoch": 1.5898480182775405, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006728161319161617, + "loss": 0.5935, + "step": 32010 + }, + { + "epoch": 1.59034469057316, + "grad_norm": 0.134765625, + "learning_rate": 0.0006727763981325122, + "loss": 0.6305, + "step": 32020 + }, + { + "epoch": 1.5908413628687792, + "grad_norm": 0.1162109375, + "learning_rate": 0.0006727366643488627, + "loss": 0.5974, + "step": 32030 + }, + { + "epoch": 1.5913380351643984, + "grad_norm": 0.08984375, + "learning_rate": 0.0006726969305652131, + "loss": 0.5709, + "step": 32040 + }, + { + "epoch": 1.5918347074600179, + "grad_norm": 0.126953125, + "learning_rate": 0.0006726571967815636, + "loss": 0.5996, + "step": 32050 + }, + { + "epoch": 1.5923313797556373, + "grad_norm": 0.09619140625, + "learning_rate": 0.000672617462997914, + "loss": 0.6167, + "step": 32060 + }, + { + "epoch": 1.5928280520512565, + "grad_norm": 0.0869140625, + "learning_rate": 0.0006725777292142644, + "loss": 0.6354, + "step": 32070 + }, + { + "epoch": 1.5933247243468758, + "grad_norm": 0.0986328125, + "learning_rate": 0.000672537995430615, + "loss": 0.5934, + "step": 32080 + }, + { + "epoch": 1.5938213966424954, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006724982616469653, + "loss": 0.5804, + "step": 32090 + }, + { + "epoch": 1.5943180689381147, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006724585278633158, + "loss": 0.5837, + "step": 32100 + }, + { + "epoch": 1.594814741233734, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006724187940796663, + "loss": 0.5765, + "step": 32110 + }, + { + "epoch": 1.5953114135293534, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006723790602960167, + "loss": 0.6098, + "step": 32120 + }, + { + "epoch": 1.5958080858249728, + "grad_norm": 0.126953125, + "learning_rate": 0.0006723393265123672, + "loss": 0.5863, + "step": 32130 + }, + { + "epoch": 1.596304758120592, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006722995927287177, + "loss": 0.5996, + "step": 32140 + }, + { + "epoch": 1.5968014304162113, + "grad_norm": 0.171875, + "learning_rate": 0.0006722598589450681, + "loss": 0.6006, + "step": 32150 + }, + { + "epoch": 1.5972981027118307, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006722201251614185, + "loss": 0.5707, + "step": 32160 + }, + { + "epoch": 1.5977947750074502, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006721803913777689, + "loss": 0.5769, + "step": 32170 + }, + { + "epoch": 1.5982914473030694, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006721406575941195, + "loss": 0.607, + "step": 32180 + }, + { + "epoch": 1.5987881195986888, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006721009238104699, + "loss": 0.5976, + "step": 32190 + }, + { + "epoch": 1.5992847918943083, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006720611900268203, + "loss": 0.6053, + "step": 32200 + }, + { + "epoch": 1.5997814641899275, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006720214562431708, + "loss": 0.6023, + "step": 32210 + }, + { + "epoch": 1.6002781364855467, + "grad_norm": 0.111328125, + "learning_rate": 0.0006719817224595212, + "loss": 0.6075, + "step": 32220 + }, + { + "epoch": 1.6007748087811662, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006719419886758716, + "loss": 0.5845, + "step": 32230 + }, + { + "epoch": 1.6012714810767856, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006719022548922222, + "loss": 0.6106, + "step": 32240 + }, + { + "epoch": 1.6017681533724049, + "grad_norm": 0.1015625, + "learning_rate": 0.0006718625211085726, + "loss": 0.6036, + "step": 32250 + }, + { + "epoch": 1.602264825668024, + "grad_norm": 0.171875, + "learning_rate": 0.000671822787324923, + "loss": 0.6072, + "step": 32260 + }, + { + "epoch": 1.6027614979636435, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006717830535412735, + "loss": 0.5904, + "step": 32270 + }, + { + "epoch": 1.603258170259263, + "grad_norm": 0.11865234375, + "learning_rate": 0.000671743319757624, + "loss": 0.5886, + "step": 32280 + }, + { + "epoch": 1.6037548425548822, + "grad_norm": 0.09765625, + "learning_rate": 0.0006717035859739744, + "loss": 0.6086, + "step": 32290 + }, + { + "epoch": 1.6042515148505017, + "grad_norm": 0.12060546875, + "learning_rate": 0.0006716638521903249, + "loss": 0.6034, + "step": 32300 + }, + { + "epoch": 1.604748187146121, + "grad_norm": 0.185546875, + "learning_rate": 0.0006716241184066753, + "loss": 0.6138, + "step": 32310 + }, + { + "epoch": 1.6052448594417403, + "grad_norm": 0.1591796875, + "learning_rate": 0.0006715843846230257, + "loss": 0.614, + "step": 32320 + }, + { + "epoch": 1.6057415317373596, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006715446508393763, + "loss": 0.5995, + "step": 32330 + }, + { + "epoch": 1.606238204032979, + "grad_norm": 0.099609375, + "learning_rate": 0.0006715049170557267, + "loss": 0.5998, + "step": 32340 + }, + { + "epoch": 1.6067348763285985, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006714651832720771, + "loss": 0.5739, + "step": 32350 + }, + { + "epoch": 1.6072315486242177, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006714254494884275, + "loss": 0.5946, + "step": 32360 + }, + { + "epoch": 1.6077282209198371, + "grad_norm": 0.11865234375, + "learning_rate": 0.000671385715704778, + "loss": 0.5991, + "step": 32370 + }, + { + "epoch": 1.6082248932154566, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006713459819211284, + "loss": 0.612, + "step": 32380 + }, + { + "epoch": 1.6087215655110758, + "grad_norm": 0.107421875, + "learning_rate": 0.0006713062481374789, + "loss": 0.5953, + "step": 32390 + }, + { + "epoch": 1.609218237806695, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006712665143538294, + "loss": 0.5863, + "step": 32400 + }, + { + "epoch": 1.6097149101023145, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006712267805701799, + "loss": 0.5835, + "step": 32410 + }, + { + "epoch": 1.610211582397934, + "grad_norm": 0.138671875, + "learning_rate": 0.0006711870467865302, + "loss": 0.6002, + "step": 32420 + }, + { + "epoch": 1.6107082546935532, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006711473130028808, + "loss": 0.5809, + "step": 32430 + }, + { + "epoch": 1.6112049269891724, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006711075792192312, + "loss": 0.6159, + "step": 32440 + }, + { + "epoch": 1.6117015992847918, + "grad_norm": 0.1513671875, + "learning_rate": 0.0006710678454355816, + "loss": 0.6001, + "step": 32450 + }, + { + "epoch": 1.6121982715804113, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006710281116519321, + "loss": 0.625, + "step": 32460 + }, + { + "epoch": 1.6126949438760305, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006709883778682825, + "loss": 0.6024, + "step": 32470 + }, + { + "epoch": 1.61319161617165, + "grad_norm": 0.10546875, + "learning_rate": 0.000670948644084633, + "loss": 0.5837, + "step": 32480 + }, + { + "epoch": 1.6136882884672694, + "grad_norm": 0.1357421875, + "learning_rate": 0.0006709089103009835, + "loss": 0.5987, + "step": 32490 + }, + { + "epoch": 1.6141849607628886, + "grad_norm": 0.107421875, + "learning_rate": 0.0006708691765173339, + "loss": 0.5843, + "step": 32500 + }, + { + "epoch": 1.6146816330585079, + "grad_norm": 0.08642578125, + "learning_rate": 0.0006708294427336843, + "loss": 0.5891, + "step": 32510 + }, + { + "epoch": 1.6151783053541273, + "grad_norm": 0.11328125, + "learning_rate": 0.0006707897089500348, + "loss": 0.593, + "step": 32520 + }, + { + "epoch": 1.6156749776497468, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006707499751663852, + "loss": 0.6336, + "step": 32530 + }, + { + "epoch": 1.616171649945366, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006707102413827357, + "loss": 0.5974, + "step": 32540 + }, + { + "epoch": 1.6166683222409854, + "grad_norm": 0.130859375, + "learning_rate": 0.0006706705075990862, + "loss": 0.5764, + "step": 32550 + }, + { + "epoch": 1.617164994536605, + "grad_norm": 0.1337890625, + "learning_rate": 0.0006706307738154366, + "loss": 0.6318, + "step": 32560 + }, + { + "epoch": 1.6176616668322241, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006705910400317871, + "loss": 0.5853, + "step": 32570 + }, + { + "epoch": 1.6181583391278433, + "grad_norm": 0.09765625, + "learning_rate": 0.0006705513062481374, + "loss": 0.5914, + "step": 32580 + }, + { + "epoch": 1.6186550114234628, + "grad_norm": 0.1005859375, + "learning_rate": 0.000670511572464488, + "loss": 0.5874, + "step": 32590 + }, + { + "epoch": 1.6191516837190822, + "grad_norm": 0.09375, + "learning_rate": 0.0006704718386808385, + "loss": 0.5953, + "step": 32600 + }, + { + "epoch": 1.6196483560147015, + "grad_norm": 0.138671875, + "learning_rate": 0.0006704321048971888, + "loss": 0.5655, + "step": 32610 + }, + { + "epoch": 1.6201450283103207, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006703923711135393, + "loss": 0.6293, + "step": 32620 + }, + { + "epoch": 1.6206417006059401, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006703526373298897, + "loss": 0.5849, + "step": 32630 + }, + { + "epoch": 1.6211383729015596, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006703129035462402, + "loss": 0.5957, + "step": 32640 + }, + { + "epoch": 1.6216350451971788, + "grad_norm": 0.154296875, + "learning_rate": 0.0006702731697625907, + "loss": 0.5986, + "step": 32650 + }, + { + "epoch": 1.6221317174927983, + "grad_norm": 0.12890625, + "learning_rate": 0.0006702334359789411, + "loss": 0.6059, + "step": 32660 + }, + { + "epoch": 1.6226283897884177, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006701937021952915, + "loss": 0.6147, + "step": 32670 + }, + { + "epoch": 1.623125062084037, + "grad_norm": 0.140625, + "learning_rate": 0.000670153968411642, + "loss": 0.602, + "step": 32680 + }, + { + "epoch": 1.6236217343796562, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006701142346279925, + "loss": 0.5796, + "step": 32690 + }, + { + "epoch": 1.6241184066752756, + "grad_norm": 0.1357421875, + "learning_rate": 0.000670074500844343, + "loss": 0.6129, + "step": 32700 + }, + { + "epoch": 1.624615078970895, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006700347670606934, + "loss": 0.5866, + "step": 32710 + }, + { + "epoch": 1.6251117512665143, + "grad_norm": 0.1201171875, + "learning_rate": 0.0006699950332770438, + "loss": 0.609, + "step": 32720 + }, + { + "epoch": 1.6256084235621338, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006699552994933944, + "loss": 0.639, + "step": 32730 + }, + { + "epoch": 1.6261050958577532, + "grad_norm": 0.125, + "learning_rate": 0.0006699155657097448, + "loss": 0.6427, + "step": 32740 + }, + { + "epoch": 1.6266017681533724, + "grad_norm": 0.0908203125, + "learning_rate": 0.0006698758319260952, + "loss": 0.5993, + "step": 32750 + }, + { + "epoch": 1.6270984404489917, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006698360981424457, + "loss": 0.5942, + "step": 32760 + }, + { + "epoch": 1.627595112744611, + "grad_norm": 0.10009765625, + "learning_rate": 0.000669796364358796, + "loss": 0.5955, + "step": 32770 + }, + { + "epoch": 1.6280917850402306, + "grad_norm": 0.13671875, + "learning_rate": 0.0006697566305751465, + "loss": 0.6021, + "step": 32780 + }, + { + "epoch": 1.6285884573358498, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006697168967914971, + "loss": 0.608, + "step": 32790 + }, + { + "epoch": 1.629085129631469, + "grad_norm": 0.1162109375, + "learning_rate": 0.0006696771630078474, + "loss": 0.572, + "step": 32800 + }, + { + "epoch": 1.6295818019270885, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006696374292241979, + "loss": 0.6285, + "step": 32810 + }, + { + "epoch": 1.630078474222708, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006695976954405484, + "loss": 0.5893, + "step": 32820 + }, + { + "epoch": 1.6305751465183271, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006695579616568987, + "loss": 0.5894, + "step": 32830 + }, + { + "epoch": 1.6310718188139466, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006695182278732493, + "loss": 0.6047, + "step": 32840 + }, + { + "epoch": 1.631568491109566, + "grad_norm": 0.2060546875, + "learning_rate": 0.0006694784940895997, + "loss": 0.6013, + "step": 32850 + }, + { + "epoch": 1.6320651634051853, + "grad_norm": 0.12060546875, + "learning_rate": 0.0006694387603059502, + "loss": 0.6161, + "step": 32860 + }, + { + "epoch": 1.6325618357008045, + "grad_norm": 0.1416015625, + "learning_rate": 0.0006693990265223006, + "loss": 0.583, + "step": 32870 + }, + { + "epoch": 1.633058507996424, + "grad_norm": 0.11181640625, + "learning_rate": 0.000669359292738651, + "loss": 0.6175, + "step": 32880 + }, + { + "epoch": 1.6335551802920434, + "grad_norm": 0.154296875, + "learning_rate": 0.0006693195589550016, + "loss": 0.5876, + "step": 32890 + }, + { + "epoch": 1.6340518525876626, + "grad_norm": 0.08935546875, + "learning_rate": 0.000669279825171352, + "loss": 0.5955, + "step": 32900 + }, + { + "epoch": 1.634548524883282, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006692400913877024, + "loss": 0.5851, + "step": 32910 + }, + { + "epoch": 1.6350451971789015, + "grad_norm": 0.19921875, + "learning_rate": 0.0006692003576040529, + "loss": 0.6354, + "step": 32920 + }, + { + "epoch": 1.6355418694745207, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006691606238204033, + "loss": 0.5891, + "step": 32930 + }, + { + "epoch": 1.63603854177014, + "grad_norm": 0.12890625, + "learning_rate": 0.0006691208900367538, + "loss": 0.6053, + "step": 32940 + }, + { + "epoch": 1.6365352140657594, + "grad_norm": 0.1015625, + "learning_rate": 0.0006690811562531043, + "loss": 0.5908, + "step": 32950 + }, + { + "epoch": 1.6370318863613789, + "grad_norm": 0.171875, + "learning_rate": 0.0006690414224694546, + "loss": 0.5854, + "step": 32960 + }, + { + "epoch": 1.637528558656998, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006690016886858051, + "loss": 0.5946, + "step": 32970 + }, + { + "epoch": 1.6380252309526173, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006689619549021556, + "loss": 0.6006, + "step": 32980 + }, + { + "epoch": 1.6385219032482368, + "grad_norm": 0.142578125, + "learning_rate": 0.000668922221118506, + "loss": 0.5873, + "step": 32990 + }, + { + "epoch": 1.6390185755438562, + "grad_norm": 0.138671875, + "learning_rate": 0.0006688824873348565, + "loss": 0.6015, + "step": 33000 + }, + { + "epoch": 1.6395152478394754, + "grad_norm": 0.099609375, + "learning_rate": 0.000668842753551207, + "loss": 0.5932, + "step": 33010 + }, + { + "epoch": 1.640011920135095, + "grad_norm": 0.1474609375, + "learning_rate": 0.0006688030197675574, + "loss": 0.601, + "step": 33020 + }, + { + "epoch": 1.6405085924307143, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006687632859839078, + "loss": 0.5883, + "step": 33030 + }, + { + "epoch": 1.6410052647263336, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006687235522002583, + "loss": 0.619, + "step": 33040 + }, + { + "epoch": 1.6415019370219528, + "grad_norm": 0.1640625, + "learning_rate": 0.0006686838184166088, + "loss": 0.5785, + "step": 33050 + }, + { + "epoch": 1.6419986093175722, + "grad_norm": 0.17578125, + "learning_rate": 0.0006686440846329592, + "loss": 0.6193, + "step": 33060 + }, + { + "epoch": 1.6424952816131917, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006686043508493096, + "loss": 0.6014, + "step": 33070 + }, + { + "epoch": 1.642991953908811, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006685646170656601, + "loss": 0.6019, + "step": 33080 + }, + { + "epoch": 1.6434886262044304, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006685248832820106, + "loss": 0.5937, + "step": 33090 + }, + { + "epoch": 1.6439852985000498, + "grad_norm": 0.11474609375, + "learning_rate": 0.000668485149498361, + "loss": 0.6061, + "step": 33100 + }, + { + "epoch": 1.644481970795669, + "grad_norm": 0.11328125, + "learning_rate": 0.0006684454157147115, + "loss": 0.571, + "step": 33110 + }, + { + "epoch": 1.6449786430912883, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006684056819310619, + "loss": 0.6038, + "step": 33120 + }, + { + "epoch": 1.6454753153869077, + "grad_norm": 0.107421875, + "learning_rate": 0.0006683659481474123, + "loss": 0.5675, + "step": 33130 + }, + { + "epoch": 1.6459719876825272, + "grad_norm": 0.119140625, + "learning_rate": 0.0006683262143637629, + "loss": 0.6265, + "step": 33140 + }, + { + "epoch": 1.6464686599781464, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006682864805801133, + "loss": 0.6078, + "step": 33150 + }, + { + "epoch": 1.6469653322737656, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006682467467964637, + "loss": 0.5742, + "step": 33160 + }, + { + "epoch": 1.647462004569385, + "grad_norm": 0.111328125, + "learning_rate": 0.0006682070130128142, + "loss": 0.5663, + "step": 33170 + }, + { + "epoch": 1.6479586768650045, + "grad_norm": 0.169921875, + "learning_rate": 0.0006681672792291646, + "loss": 0.5814, + "step": 33180 + }, + { + "epoch": 1.6484553491606238, + "grad_norm": 0.099609375, + "learning_rate": 0.0006681275454455151, + "loss": 0.6041, + "step": 33190 + }, + { + "epoch": 1.6489520214562432, + "grad_norm": 0.1572265625, + "learning_rate": 0.0006680878116618656, + "loss": 0.5971, + "step": 33200 + }, + { + "epoch": 1.6494486937518626, + "grad_norm": 0.12890625, + "learning_rate": 0.000668048077878216, + "loss": 0.5873, + "step": 33210 + }, + { + "epoch": 1.6499453660474819, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006680083440945664, + "loss": 0.5809, + "step": 33220 + }, + { + "epoch": 1.650442038343101, + "grad_norm": 0.1015625, + "learning_rate": 0.0006679686103109168, + "loss": 0.5429, + "step": 33230 + }, + { + "epoch": 1.6509387106387206, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006679288765272674, + "loss": 0.5753, + "step": 33240 + }, + { + "epoch": 1.65143538293434, + "grad_norm": 0.12890625, + "learning_rate": 0.0006678891427436178, + "loss": 0.5938, + "step": 33250 + }, + { + "epoch": 1.6519320552299592, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006678494089599682, + "loss": 0.6029, + "step": 33260 + }, + { + "epoch": 1.6524287275255787, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006678096751763187, + "loss": 0.5796, + "step": 33270 + }, + { + "epoch": 1.6529253998211981, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006677699413926691, + "loss": 0.5814, + "step": 33280 + }, + { + "epoch": 1.6534220721168174, + "grad_norm": 0.0869140625, + "learning_rate": 0.0006677302076090196, + "loss": 0.5828, + "step": 33290 + }, + { + "epoch": 1.6539187444124366, + "grad_norm": 0.10546875, + "learning_rate": 0.0006676904738253701, + "loss": 0.5606, + "step": 33300 + }, + { + "epoch": 1.654415416708056, + "grad_norm": 0.12060546875, + "learning_rate": 0.0006676507400417205, + "loss": 0.6069, + "step": 33310 + }, + { + "epoch": 1.6549120890036755, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006676110062580709, + "loss": 0.5975, + "step": 33320 + }, + { + "epoch": 1.6554087612992947, + "grad_norm": 0.126953125, + "learning_rate": 0.0006675712724744214, + "loss": 0.587, + "step": 33330 + }, + { + "epoch": 1.655905433594914, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006675315386907719, + "loss": 0.576, + "step": 33340 + }, + { + "epoch": 1.6564021058905334, + "grad_norm": 0.228515625, + "learning_rate": 0.0006674918049071223, + "loss": 0.6055, + "step": 33350 + }, + { + "epoch": 1.6568987781861528, + "grad_norm": 0.134765625, + "learning_rate": 0.0006674520711234728, + "loss": 0.5697, + "step": 33360 + }, + { + "epoch": 1.657395450481772, + "grad_norm": 0.107421875, + "learning_rate": 0.0006674123373398232, + "loss": 0.5907, + "step": 33370 + }, + { + "epoch": 1.6578921227773915, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006673726035561736, + "loss": 0.5925, + "step": 33380 + }, + { + "epoch": 1.658388795073011, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006673328697725242, + "loss": 0.6118, + "step": 33390 + }, + { + "epoch": 1.6588854673686302, + "grad_norm": 0.1181640625, + "learning_rate": 0.0006672931359888746, + "loss": 0.6064, + "step": 33400 + }, + { + "epoch": 1.6593821396642494, + "grad_norm": 0.11181640625, + "learning_rate": 0.000667253402205225, + "loss": 0.5998, + "step": 33410 + }, + { + "epoch": 1.6598788119598689, + "grad_norm": 0.14453125, + "learning_rate": 0.0006672136684215755, + "loss": 0.5894, + "step": 33420 + }, + { + "epoch": 1.6603754842554883, + "grad_norm": 0.12109375, + "learning_rate": 0.0006671739346379259, + "loss": 0.5991, + "step": 33430 + }, + { + "epoch": 1.6608721565511075, + "grad_norm": 0.2041015625, + "learning_rate": 0.0006671342008542764, + "loss": 0.5902, + "step": 33440 + }, + { + "epoch": 1.6613688288467268, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006670944670706268, + "loss": 0.6083, + "step": 33450 + }, + { + "epoch": 1.6618655011423464, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006670547332869773, + "loss": 0.5757, + "step": 33460 + }, + { + "epoch": 1.6623621734379657, + "grad_norm": 0.1572265625, + "learning_rate": 0.0006670149995033278, + "loss": 0.5965, + "step": 33470 + }, + { + "epoch": 1.662858845733585, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006669752657196781, + "loss": 0.6245, + "step": 33480 + }, + { + "epoch": 1.6633555180292043, + "grad_norm": 0.29296875, + "learning_rate": 0.0006669355319360287, + "loss": 0.6389, + "step": 33490 + }, + { + "epoch": 1.6638521903248238, + "grad_norm": 0.10546875, + "learning_rate": 0.0006668957981523791, + "loss": 0.5866, + "step": 33500 + }, + { + "epoch": 1.664348862620443, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006668560643687295, + "loss": 0.5713, + "step": 33510 + }, + { + "epoch": 1.6648455349160622, + "grad_norm": 0.1298828125, + "learning_rate": 0.00066681633058508, + "loss": 0.6158, + "step": 33520 + }, + { + "epoch": 1.6653422072116817, + "grad_norm": 0.20703125, + "learning_rate": 0.0006667765968014304, + "loss": 0.5874, + "step": 33530 + }, + { + "epoch": 1.6658388795073011, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006667368630177808, + "loss": 0.5814, + "step": 33540 + }, + { + "epoch": 1.6663355518029204, + "grad_norm": 0.158203125, + "learning_rate": 0.0006666971292341314, + "loss": 0.586, + "step": 33550 + }, + { + "epoch": 1.6668322240985398, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006666573954504818, + "loss": 0.5789, + "step": 33560 + }, + { + "epoch": 1.6673288963941593, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006666176616668322, + "loss": 0.5843, + "step": 33570 + }, + { + "epoch": 1.6678255686897785, + "grad_norm": 0.1455078125, + "learning_rate": 0.0006665779278831827, + "loss": 0.5828, + "step": 33580 + }, + { + "epoch": 1.6683222409853977, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006665381940995332, + "loss": 0.5808, + "step": 33590 + }, + { + "epoch": 1.6688189132810172, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006664984603158837, + "loss": 0.5988, + "step": 33600 + }, + { + "epoch": 1.6693155855766366, + "grad_norm": 0.11328125, + "learning_rate": 0.0006664587265322341, + "loss": 0.5865, + "step": 33610 + }, + { + "epoch": 1.6698122578722558, + "grad_norm": 0.15625, + "learning_rate": 0.0006664189927485845, + "loss": 0.5933, + "step": 33620 + }, + { + "epoch": 1.670308930167875, + "grad_norm": 0.11181640625, + "learning_rate": 0.000666379258964935, + "loss": 0.5899, + "step": 33630 + }, + { + "epoch": 1.6708056024634947, + "grad_norm": 0.1484375, + "learning_rate": 0.0006663395251812853, + "loss": 0.5751, + "step": 33640 + }, + { + "epoch": 1.671302274759114, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006662997913976359, + "loss": 0.5719, + "step": 33650 + }, + { + "epoch": 1.6717989470547332, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006662600576139864, + "loss": 0.5757, + "step": 33660 + }, + { + "epoch": 1.6722956193503526, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006662203238303367, + "loss": 0.6287, + "step": 33670 + }, + { + "epoch": 1.672792291645972, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006661805900466872, + "loss": 0.6213, + "step": 33680 + }, + { + "epoch": 1.6732889639415913, + "grad_norm": 0.12890625, + "learning_rate": 0.0006661408562630378, + "loss": 0.6159, + "step": 33690 + }, + { + "epoch": 1.6737856362372106, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006661011224793881, + "loss": 0.5905, + "step": 33700 + }, + { + "epoch": 1.67428230853283, + "grad_norm": 0.103515625, + "learning_rate": 0.0006660613886957386, + "loss": 0.6113, + "step": 33710 + }, + { + "epoch": 1.6747789808284494, + "grad_norm": 0.095703125, + "learning_rate": 0.000666021654912089, + "loss": 0.5731, + "step": 33720 + }, + { + "epoch": 1.6752756531240687, + "grad_norm": 0.11669921875, + "learning_rate": 0.0006659819211284394, + "loss": 0.5678, + "step": 33730 + }, + { + "epoch": 1.6757723254196881, + "grad_norm": 0.1240234375, + "learning_rate": 0.00066594218734479, + "loss": 0.5949, + "step": 33740 + }, + { + "epoch": 1.6762689977153076, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006659024535611404, + "loss": 0.5886, + "step": 33750 + }, + { + "epoch": 1.6767656700109268, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006658627197774909, + "loss": 0.5974, + "step": 33760 + }, + { + "epoch": 1.677262342306546, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006658229859938413, + "loss": 0.58, + "step": 33770 + }, + { + "epoch": 1.6777590146021655, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006657832522101917, + "loss": 0.5542, + "step": 33780 + }, + { + "epoch": 1.678255686897785, + "grad_norm": 0.0859375, + "learning_rate": 0.0006657435184265423, + "loss": 0.5807, + "step": 33790 + }, + { + "epoch": 1.6787523591934042, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006657037846428927, + "loss": 0.5959, + "step": 33800 + }, + { + "epoch": 1.6792490314890234, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006656640508592431, + "loss": 0.5967, + "step": 33810 + }, + { + "epoch": 1.679745703784643, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006656243170755936, + "loss": 0.6256, + "step": 33820 + }, + { + "epoch": 1.6802423760802623, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006655845832919439, + "loss": 0.5835, + "step": 33830 + }, + { + "epoch": 1.6807390483758815, + "grad_norm": 0.10546875, + "learning_rate": 0.0006655448495082944, + "loss": 0.6092, + "step": 33840 + }, + { + "epoch": 1.681235720671501, + "grad_norm": 0.1220703125, + "learning_rate": 0.000665505115724645, + "loss": 0.6094, + "step": 33850 + }, + { + "epoch": 1.6817323929671204, + "grad_norm": 0.1455078125, + "learning_rate": 0.0006654653819409953, + "loss": 0.6105, + "step": 33860 + }, + { + "epoch": 1.6822290652627396, + "grad_norm": 0.12109375, + "learning_rate": 0.0006654256481573458, + "loss": 0.632, + "step": 33870 + }, + { + "epoch": 1.6827257375583589, + "grad_norm": 0.09765625, + "learning_rate": 0.0006653859143736963, + "loss": 0.6004, + "step": 33880 + }, + { + "epoch": 1.6832224098539783, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006653461805900466, + "loss": 0.5981, + "step": 33890 + }, + { + "epoch": 1.6837190821495978, + "grad_norm": 0.134765625, + "learning_rate": 0.0006653064468063972, + "loss": 0.5944, + "step": 33900 + }, + { + "epoch": 1.684215754445217, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006652667130227476, + "loss": 0.599, + "step": 33910 + }, + { + "epoch": 1.6847124267408364, + "grad_norm": 0.2294921875, + "learning_rate": 0.0006652269792390981, + "loss": 0.5958, + "step": 33920 + }, + { + "epoch": 1.6852090990364559, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006651872454554485, + "loss": 0.5523, + "step": 33930 + }, + { + "epoch": 1.685705771332075, + "grad_norm": 0.140625, + "learning_rate": 0.0006651475116717989, + "loss": 0.5817, + "step": 33940 + }, + { + "epoch": 1.6862024436276943, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006651077778881495, + "loss": 0.579, + "step": 33950 + }, + { + "epoch": 1.6866991159233138, + "grad_norm": 0.1357421875, + "learning_rate": 0.0006650680441044999, + "loss": 0.6026, + "step": 33960 + }, + { + "epoch": 1.6871957882189332, + "grad_norm": 0.08935546875, + "learning_rate": 0.0006650283103208503, + "loss": 0.5954, + "step": 33970 + }, + { + "epoch": 1.6876924605145525, + "grad_norm": 0.12109375, + "learning_rate": 0.0006649885765372008, + "loss": 0.5736, + "step": 33980 + }, + { + "epoch": 1.6881891328101717, + "grad_norm": 0.1728515625, + "learning_rate": 0.0006649488427535512, + "loss": 0.5884, + "step": 33990 + }, + { + "epoch": 1.6886858051057914, + "grad_norm": 0.12060546875, + "learning_rate": 0.0006649091089699017, + "loss": 0.6015, + "step": 34000 + }, + { + "epoch": 1.6891824774014106, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006648693751862522, + "loss": 0.5854, + "step": 34010 + }, + { + "epoch": 1.6896791496970298, + "grad_norm": 0.13671875, + "learning_rate": 0.0006648296414026026, + "loss": 0.6298, + "step": 34020 + }, + { + "epoch": 1.6901758219926493, + "grad_norm": 0.0947265625, + "learning_rate": 0.000664789907618953, + "loss": 0.576, + "step": 34030 + }, + { + "epoch": 1.6906724942882687, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006647501738353036, + "loss": 0.5603, + "step": 34040 + }, + { + "epoch": 1.691169166583888, + "grad_norm": 0.099609375, + "learning_rate": 0.000664710440051654, + "loss": 0.601, + "step": 34050 + }, + { + "epoch": 1.6916658388795072, + "grad_norm": 0.11669921875, + "learning_rate": 0.0006646707062680044, + "loss": 0.6415, + "step": 34060 + }, + { + "epoch": 1.6921625111751266, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006646309724843549, + "loss": 0.6118, + "step": 34070 + }, + { + "epoch": 1.692659183470746, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006645912387007053, + "loss": 0.5767, + "step": 34080 + }, + { + "epoch": 1.6931558557663653, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006645515049170557, + "loss": 0.5706, + "step": 34090 + }, + { + "epoch": 1.6936525280619847, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006645117711334062, + "loss": 0.6036, + "step": 34100 + }, + { + "epoch": 1.6941492003576042, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006644720373497567, + "loss": 0.6352, + "step": 34110 + }, + { + "epoch": 1.6946458726532234, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006644323035661071, + "loss": 0.5675, + "step": 34120 + }, + { + "epoch": 1.6951425449488426, + "grad_norm": 0.125, + "learning_rate": 0.0006643925697824575, + "loss": 0.6123, + "step": 34130 + }, + { + "epoch": 1.695639217244462, + "grad_norm": 0.11328125, + "learning_rate": 0.000664352835998808, + "loss": 0.5847, + "step": 34140 + }, + { + "epoch": 1.6961358895400815, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006643131022151585, + "loss": 0.589, + "step": 34150 + }, + { + "epoch": 1.6966325618357008, + "grad_norm": 0.12060546875, + "learning_rate": 0.0006642733684315089, + "loss": 0.5961, + "step": 34160 + }, + { + "epoch": 1.69712923413132, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006642336346478594, + "loss": 0.6074, + "step": 34170 + }, + { + "epoch": 1.6976259064269397, + "grad_norm": 0.1201171875, + "learning_rate": 0.0006641939008642098, + "loss": 0.5817, + "step": 34180 + }, + { + "epoch": 1.698122578722559, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006641541670805602, + "loss": 0.5915, + "step": 34190 + }, + { + "epoch": 1.6986192510181781, + "grad_norm": 0.1201171875, + "learning_rate": 0.0006641144332969108, + "loss": 0.5905, + "step": 34200 + }, + { + "epoch": 1.6991159233137976, + "grad_norm": 0.134765625, + "learning_rate": 0.0006640746995132612, + "loss": 0.5991, + "step": 34210 + }, + { + "epoch": 1.699612595609417, + "grad_norm": 0.125, + "learning_rate": 0.0006640349657296116, + "loss": 0.5691, + "step": 34220 + }, + { + "epoch": 1.7001092679050362, + "grad_norm": 0.09765625, + "learning_rate": 0.0006639952319459621, + "loss": 0.5869, + "step": 34230 + }, + { + "epoch": 1.7006059402006555, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006639554981623125, + "loss": 0.6391, + "step": 34240 + }, + { + "epoch": 1.701102612496275, + "grad_norm": 0.11767578125, + "learning_rate": 0.000663915764378663, + "loss": 0.583, + "step": 34250 + }, + { + "epoch": 1.7015992847918944, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006638760305950135, + "loss": 0.6054, + "step": 34260 + }, + { + "epoch": 1.7020959570875136, + "grad_norm": 0.138671875, + "learning_rate": 0.0006638362968113639, + "loss": 0.5867, + "step": 34270 + }, + { + "epoch": 1.702592629383133, + "grad_norm": 0.09375, + "learning_rate": 0.0006637965630277143, + "loss": 0.6036, + "step": 34280 + }, + { + "epoch": 1.7030893016787525, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006637568292440648, + "loss": 0.6233, + "step": 34290 + }, + { + "epoch": 1.7035859739743717, + "grad_norm": 0.08642578125, + "learning_rate": 0.0006637170954604153, + "loss": 0.5898, + "step": 34300 + }, + { + "epoch": 1.704082646269991, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006636773616767657, + "loss": 0.6078, + "step": 34310 + }, + { + "epoch": 1.7045793185656104, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006636376278931161, + "loss": 0.6156, + "step": 34320 + }, + { + "epoch": 1.7050759908612299, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006635978941094666, + "loss": 0.5982, + "step": 34330 + }, + { + "epoch": 1.705572663156849, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006635581603258171, + "loss": 0.5923, + "step": 34340 + }, + { + "epoch": 1.7060693354524683, + "grad_norm": 0.0908203125, + "learning_rate": 0.0006635184265421675, + "loss": 0.5583, + "step": 34350 + }, + { + "epoch": 1.706566007748088, + "grad_norm": 0.15625, + "learning_rate": 0.000663478692758518, + "loss": 0.5799, + "step": 34360 + }, + { + "epoch": 1.7070626800437072, + "grad_norm": 0.125, + "learning_rate": 0.0006634389589748684, + "loss": 0.5927, + "step": 34370 + }, + { + "epoch": 1.7075593523393264, + "grad_norm": 0.140625, + "learning_rate": 0.0006633992251912188, + "loss": 0.5996, + "step": 34380 + }, + { + "epoch": 1.7080560246349459, + "grad_norm": 0.103515625, + "learning_rate": 0.0006633594914075693, + "loss": 0.6004, + "step": 34390 + }, + { + "epoch": 1.7085526969305653, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006633197576239198, + "loss": 0.611, + "step": 34400 + }, + { + "epoch": 1.7090493692261846, + "grad_norm": 0.138671875, + "learning_rate": 0.0006632800238402702, + "loss": 0.5754, + "step": 34410 + }, + { + "epoch": 1.7095460415218038, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006632402900566207, + "loss": 0.599, + "step": 34420 + }, + { + "epoch": 1.7100427138174232, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006632005562729711, + "loss": 0.5795, + "step": 34430 + }, + { + "epoch": 1.7105393861130427, + "grad_norm": 0.146484375, + "learning_rate": 0.0006631608224893215, + "loss": 0.5899, + "step": 34440 + }, + { + "epoch": 1.711036058408662, + "grad_norm": 0.16015625, + "learning_rate": 0.0006631210887056721, + "loss": 0.5833, + "step": 34450 + }, + { + "epoch": 1.7115327307042814, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006630813549220225, + "loss": 0.5888, + "step": 34460 + }, + { + "epoch": 1.7120294029999008, + "grad_norm": 0.19140625, + "learning_rate": 0.0006630416211383729, + "loss": 0.5978, + "step": 34470 + }, + { + "epoch": 1.71252607529552, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006630018873547234, + "loss": 0.5993, + "step": 34480 + }, + { + "epoch": 1.7130227475911393, + "grad_norm": 0.1611328125, + "learning_rate": 0.0006629621535710738, + "loss": 0.5815, + "step": 34490 + }, + { + "epoch": 1.7135194198867587, + "grad_norm": 0.1357421875, + "learning_rate": 0.0006629224197874244, + "loss": 0.5956, + "step": 34500 + }, + { + "epoch": 1.7140160921823782, + "grad_norm": 0.09765625, + "learning_rate": 0.0006628826860037747, + "loss": 0.5933, + "step": 34510 + }, + { + "epoch": 1.7145127644779974, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006628429522201252, + "loss": 0.5888, + "step": 34520 + }, + { + "epoch": 1.7150094367736166, + "grad_norm": 0.1640625, + "learning_rate": 0.0006628032184364757, + "loss": 0.5999, + "step": 34530 + }, + { + "epoch": 1.715506109069236, + "grad_norm": 0.162109375, + "learning_rate": 0.000662763484652826, + "loss": 0.6141, + "step": 34540 + }, + { + "epoch": 1.7160027813648555, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006627237508691766, + "loss": 0.5664, + "step": 34550 + }, + { + "epoch": 1.7164994536604747, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006626840170855271, + "loss": 0.5732, + "step": 34560 + }, + { + "epoch": 1.7169961259560942, + "grad_norm": 0.0849609375, + "learning_rate": 0.0006626442833018774, + "loss": 0.5897, + "step": 34570 + }, + { + "epoch": 1.7174927982517136, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006626045495182279, + "loss": 0.6314, + "step": 34580 + }, + { + "epoch": 1.7179894705473329, + "grad_norm": 0.10546875, + "learning_rate": 0.0006625648157345783, + "loss": 0.5981, + "step": 34590 + }, + { + "epoch": 1.718486142842952, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006625250819509288, + "loss": 0.6238, + "step": 34600 + }, + { + "epoch": 1.7189828151385715, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006624853481672793, + "loss": 0.5775, + "step": 34610 + }, + { + "epoch": 1.719479487434191, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006624456143836297, + "loss": 0.5957, + "step": 34620 + }, + { + "epoch": 1.7199761597298102, + "grad_norm": 0.1328125, + "learning_rate": 0.0006624058805999801, + "loss": 0.6131, + "step": 34630 + }, + { + "epoch": 1.7204728320254297, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006623661468163306, + "loss": 0.6022, + "step": 34640 + }, + { + "epoch": 1.7209695043210491, + "grad_norm": 0.109375, + "learning_rate": 0.0006623264130326811, + "loss": 0.601, + "step": 34650 + }, + { + "epoch": 1.7214661766166683, + "grad_norm": 0.1015625, + "learning_rate": 0.0006622866792490316, + "loss": 0.594, + "step": 34660 + }, + { + "epoch": 1.7219628489122876, + "grad_norm": 0.11669921875, + "learning_rate": 0.000662246945465382, + "loss": 0.5892, + "step": 34670 + }, + { + "epoch": 1.722459521207907, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006622072116817324, + "loss": 0.5817, + "step": 34680 + }, + { + "epoch": 1.7229561935035265, + "grad_norm": 0.12060546875, + "learning_rate": 0.0006621674778980829, + "loss": 0.6195, + "step": 34690 + }, + { + "epoch": 1.7234528657991457, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006621277441144333, + "loss": 0.5867, + "step": 34700 + }, + { + "epoch": 1.723949538094765, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006620880103307838, + "loss": 0.6086, + "step": 34710 + }, + { + "epoch": 1.7244462103903844, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006620482765471343, + "loss": 0.5925, + "step": 34720 + }, + { + "epoch": 1.7249428826860038, + "grad_norm": 0.248046875, + "learning_rate": 0.0006620085427634846, + "loss": 0.6208, + "step": 34730 + }, + { + "epoch": 1.725439554981623, + "grad_norm": 0.1416015625, + "learning_rate": 0.0006619688089798351, + "loss": 0.6107, + "step": 34740 + }, + { + "epoch": 1.7259362272772425, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006619290751961857, + "loss": 0.5579, + "step": 34750 + }, + { + "epoch": 1.726432899572862, + "grad_norm": 0.115234375, + "learning_rate": 0.000661889341412536, + "loss": 0.5842, + "step": 34760 + }, + { + "epoch": 1.7269295718684812, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006618496076288865, + "loss": 0.5636, + "step": 34770 + }, + { + "epoch": 1.7274262441641004, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006618098738452369, + "loss": 0.6168, + "step": 34780 + }, + { + "epoch": 1.7279229164597198, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006617701400615874, + "loss": 0.5637, + "step": 34790 + }, + { + "epoch": 1.7284195887553393, + "grad_norm": 0.083984375, + "learning_rate": 0.0006617304062779379, + "loss": 0.5769, + "step": 34800 + }, + { + "epoch": 1.7289162610509585, + "grad_norm": 0.1201171875, + "learning_rate": 0.0006616906724942883, + "loss": 0.6049, + "step": 34810 + }, + { + "epoch": 1.729412933346578, + "grad_norm": 0.130859375, + "learning_rate": 0.0006616509387106388, + "loss": 0.5878, + "step": 34820 + }, + { + "epoch": 1.7299096056421974, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006616112049269892, + "loss": 0.5919, + "step": 34830 + }, + { + "epoch": 1.7304062779378167, + "grad_norm": 0.1640625, + "learning_rate": 0.0006615714711433396, + "loss": 0.5782, + "step": 34840 + }, + { + "epoch": 1.7309029502334359, + "grad_norm": 0.1748046875, + "learning_rate": 0.0006615317373596902, + "loss": 0.634, + "step": 34850 + }, + { + "epoch": 1.7313996225290553, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006614920035760406, + "loss": 0.5835, + "step": 34860 + }, + { + "epoch": 1.7318962948246748, + "grad_norm": 0.10205078125, + "learning_rate": 0.000661452269792391, + "loss": 0.6219, + "step": 34870 + }, + { + "epoch": 1.732392967120294, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006614125360087415, + "loss": 0.5947, + "step": 34880 + }, + { + "epoch": 1.7328896394159132, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006613728022250919, + "loss": 0.59, + "step": 34890 + }, + { + "epoch": 1.7333863117115327, + "grad_norm": 0.283203125, + "learning_rate": 0.0006613330684414424, + "loss": 0.6176, + "step": 34900 + }, + { + "epoch": 1.7338829840071521, + "grad_norm": 0.1201171875, + "learning_rate": 0.0006612933346577929, + "loss": 0.619, + "step": 34910 + }, + { + "epoch": 1.7343796563027714, + "grad_norm": 0.0869140625, + "learning_rate": 0.0006612536008741432, + "loss": 0.5682, + "step": 34920 + }, + { + "epoch": 1.7348763285983908, + "grad_norm": 0.099609375, + "learning_rate": 0.0006612138670904937, + "loss": 0.5862, + "step": 34930 + }, + { + "epoch": 1.7353730008940103, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006611741333068442, + "loss": 0.5935, + "step": 34940 + }, + { + "epoch": 1.7358696731896295, + "grad_norm": 0.1650390625, + "learning_rate": 0.0006611343995231947, + "loss": 0.5896, + "step": 34950 + }, + { + "epoch": 1.7363663454852487, + "grad_norm": 0.1201171875, + "learning_rate": 0.0006610946657395451, + "loss": 0.5667, + "step": 34960 + }, + { + "epoch": 1.7368630177808682, + "grad_norm": 0.16015625, + "learning_rate": 0.0006610549319558955, + "loss": 0.6185, + "step": 34970 + }, + { + "epoch": 1.7373596900764876, + "grad_norm": 0.0947265625, + "learning_rate": 0.000661015198172246, + "loss": 0.5739, + "step": 34980 + }, + { + "epoch": 1.7378563623721068, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006609754643885964, + "loss": 0.5968, + "step": 34990 + }, + { + "epoch": 1.7383530346677263, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006609357306049468, + "loss": 0.6268, + "step": 35000 + }, + { + "epoch": 1.7388497069633457, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006608959968212974, + "loss": 0.6147, + "step": 35010 + }, + { + "epoch": 1.739346379258965, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006608562630376478, + "loss": 0.577, + "step": 35020 + }, + { + "epoch": 1.7398430515545842, + "grad_norm": 0.12890625, + "learning_rate": 0.0006608165292539982, + "loss": 0.601, + "step": 35030 + }, + { + "epoch": 1.7403397238502036, + "grad_norm": 0.08935546875, + "learning_rate": 0.0006607767954703487, + "loss": 0.6041, + "step": 35040 + }, + { + "epoch": 1.740836396145823, + "grad_norm": 0.08642578125, + "learning_rate": 0.0006607370616866992, + "loss": 0.581, + "step": 35050 + }, + { + "epoch": 1.7413330684414423, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006606973279030496, + "loss": 0.5901, + "step": 35060 + }, + { + "epoch": 1.7418297407370615, + "grad_norm": 0.091796875, + "learning_rate": 0.0006606575941194001, + "loss": 0.6036, + "step": 35070 + }, + { + "epoch": 1.742326413032681, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006606178603357505, + "loss": 0.6168, + "step": 35080 + }, + { + "epoch": 1.7428230853283004, + "grad_norm": 0.09033203125, + "learning_rate": 0.0006605781265521009, + "loss": 0.6085, + "step": 35090 + }, + { + "epoch": 1.7433197576239197, + "grad_norm": 0.1015625, + "learning_rate": 0.0006605383927684515, + "loss": 0.5746, + "step": 35100 + }, + { + "epoch": 1.7438164299195391, + "grad_norm": 0.17578125, + "learning_rate": 0.0006604986589848019, + "loss": 0.581, + "step": 35110 + }, + { + "epoch": 1.7443131022151586, + "grad_norm": 0.154296875, + "learning_rate": 0.0006604589252011523, + "loss": 0.5806, + "step": 35120 + }, + { + "epoch": 1.7448097745107778, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006604191914175028, + "loss": 0.5574, + "step": 35130 + }, + { + "epoch": 1.745306446806397, + "grad_norm": 0.142578125, + "learning_rate": 0.0006603794576338532, + "loss": 0.6143, + "step": 35140 + }, + { + "epoch": 1.7458031191020165, + "grad_norm": 0.140625, + "learning_rate": 0.0006603397238502036, + "loss": 0.5985, + "step": 35150 + }, + { + "epoch": 1.746299791397636, + "grad_norm": 0.2177734375, + "learning_rate": 0.0006602999900665542, + "loss": 0.6048, + "step": 35160 + }, + { + "epoch": 1.7467964636932551, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006602602562829046, + "loss": 0.5965, + "step": 35170 + }, + { + "epoch": 1.7472931359888746, + "grad_norm": 0.1044921875, + "learning_rate": 0.000660220522499255, + "loss": 0.5958, + "step": 35180 + }, + { + "epoch": 1.747789808284494, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006601807887156054, + "loss": 0.607, + "step": 35190 + }, + { + "epoch": 1.7482864805801133, + "grad_norm": 0.1123046875, + "learning_rate": 0.000660141054931956, + "loss": 0.5923, + "step": 35200 + }, + { + "epoch": 1.7487831528757325, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006601013211483064, + "loss": 0.6049, + "step": 35210 + }, + { + "epoch": 1.749279825171352, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006600615873646568, + "loss": 0.6206, + "step": 35220 + }, + { + "epoch": 1.7497764974669714, + "grad_norm": 0.1435546875, + "learning_rate": 0.0006600218535810073, + "loss": 0.6048, + "step": 35230 + }, + { + "epoch": 1.7502731697625906, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006599821197973577, + "loss": 0.6114, + "step": 35240 + }, + { + "epoch": 1.7507698420582098, + "grad_norm": 0.16015625, + "learning_rate": 0.0006599423860137081, + "loss": 0.5983, + "step": 35250 + }, + { + "epoch": 1.7512665143538293, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006599026522300587, + "loss": 0.6022, + "step": 35260 + }, + { + "epoch": 1.7517631866494487, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006598629184464091, + "loss": 0.616, + "step": 35270 + }, + { + "epoch": 1.752259858945068, + "grad_norm": 0.099609375, + "learning_rate": 0.0006598231846627595, + "loss": 0.5992, + "step": 35280 + }, + { + "epoch": 1.7527565312406874, + "grad_norm": 0.11865234375, + "learning_rate": 0.00065978345087911, + "loss": 0.5907, + "step": 35290 + }, + { + "epoch": 1.7532532035363069, + "grad_norm": 0.150390625, + "learning_rate": 0.0006597437170954604, + "loss": 0.5911, + "step": 35300 + }, + { + "epoch": 1.753749875831926, + "grad_norm": 0.09375, + "learning_rate": 0.0006597039833118109, + "loss": 0.5878, + "step": 35310 + }, + { + "epoch": 1.7542465481275453, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006596642495281614, + "loss": 0.58, + "step": 35320 + }, + { + "epoch": 1.7547432204231648, + "grad_norm": 0.19921875, + "learning_rate": 0.0006596245157445118, + "loss": 0.6123, + "step": 35330 + }, + { + "epoch": 1.7552398927187842, + "grad_norm": 0.0888671875, + "learning_rate": 0.0006595847819608622, + "loss": 0.5974, + "step": 35340 + }, + { + "epoch": 1.7557365650144034, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006595450481772127, + "loss": 0.5911, + "step": 35350 + }, + { + "epoch": 1.756233237310023, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006595053143935632, + "loss": 0.5947, + "step": 35360 + }, + { + "epoch": 1.7567299096056423, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006594655806099136, + "loss": 0.5844, + "step": 35370 + }, + { + "epoch": 1.7572265819012616, + "grad_norm": 0.09033203125, + "learning_rate": 0.000659425846826264, + "loss": 0.5914, + "step": 35380 + }, + { + "epoch": 1.7577232541968808, + "grad_norm": 0.1015625, + "learning_rate": 0.0006593861130426145, + "loss": 0.5856, + "step": 35390 + }, + { + "epoch": 1.7582199264925003, + "grad_norm": 0.0859375, + "learning_rate": 0.000659346379258965, + "loss": 0.6051, + "step": 35400 + }, + { + "epoch": 1.7587165987881197, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006593066454753154, + "loss": 0.6096, + "step": 35410 + }, + { + "epoch": 1.759213271083739, + "grad_norm": 0.16015625, + "learning_rate": 0.0006592669116916659, + "loss": 0.5855, + "step": 35420 + }, + { + "epoch": 1.7597099433793582, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006592271779080163, + "loss": 0.6032, + "step": 35430 + }, + { + "epoch": 1.7602066156749776, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006591874441243667, + "loss": 0.5703, + "step": 35440 + }, + { + "epoch": 1.760703287970597, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006591477103407172, + "loss": 0.596, + "step": 35450 + }, + { + "epoch": 1.7611999602662163, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006591079765570677, + "loss": 0.5854, + "step": 35460 + }, + { + "epoch": 1.7616966325618357, + "grad_norm": 0.1611328125, + "learning_rate": 0.0006590682427734181, + "loss": 0.5599, + "step": 35470 + }, + { + "epoch": 1.7621933048574552, + "grad_norm": 0.130859375, + "learning_rate": 0.0006590285089897686, + "loss": 0.6016, + "step": 35480 + }, + { + "epoch": 1.7626899771530744, + "grad_norm": 0.0966796875, + "learning_rate": 0.000658988775206119, + "loss": 0.597, + "step": 35490 + }, + { + "epoch": 1.7631866494486936, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006589490414224694, + "loss": 0.6352, + "step": 35500 + }, + { + "epoch": 1.763683321744313, + "grad_norm": 0.09375, + "learning_rate": 0.00065890930763882, + "loss": 0.5891, + "step": 35510 + }, + { + "epoch": 1.7641799940399325, + "grad_norm": 0.08984375, + "learning_rate": 0.0006588695738551704, + "loss": 0.5808, + "step": 35520 + }, + { + "epoch": 1.7646766663355518, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006588298400715208, + "loss": 0.585, + "step": 35530 + }, + { + "epoch": 1.7651733386311712, + "grad_norm": 0.12890625, + "learning_rate": 0.0006587901062878713, + "loss": 0.5931, + "step": 35540 + }, + { + "epoch": 1.7656700109267907, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006587503725042217, + "loss": 0.5712, + "step": 35550 + }, + { + "epoch": 1.7661666832224099, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006587106387205723, + "loss": 0.5875, + "step": 35560 + }, + { + "epoch": 1.766663355518029, + "grad_norm": 0.0888671875, + "learning_rate": 0.0006586709049369226, + "loss": 0.5762, + "step": 35570 + }, + { + "epoch": 1.7671600278136486, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006586311711532731, + "loss": 0.586, + "step": 35580 + }, + { + "epoch": 1.767656700109268, + "grad_norm": 0.1533203125, + "learning_rate": 0.0006585914373696236, + "loss": 0.6036, + "step": 35590 + }, + { + "epoch": 1.7681533724048872, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006585517035859739, + "loss": 0.592, + "step": 35600 + }, + { + "epoch": 1.7686500447005065, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006585119698023245, + "loss": 0.5644, + "step": 35610 + }, + { + "epoch": 1.769146716996126, + "grad_norm": 0.12890625, + "learning_rate": 0.000658472236018675, + "loss": 0.579, + "step": 35620 + }, + { + "epoch": 1.7696433892917454, + "grad_norm": 0.09375, + "learning_rate": 0.0006584325022350253, + "loss": 0.5832, + "step": 35630 + }, + { + "epoch": 1.7701400615873646, + "grad_norm": 0.095703125, + "learning_rate": 0.0006583927684513758, + "loss": 0.5951, + "step": 35640 + }, + { + "epoch": 1.770636733882984, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006583530346677262, + "loss": 0.5847, + "step": 35650 + }, + { + "epoch": 1.7711334061786035, + "grad_norm": 0.1552734375, + "learning_rate": 0.0006583133008840767, + "loss": 0.5947, + "step": 35660 + }, + { + "epoch": 1.7716300784742227, + "grad_norm": 0.1533203125, + "learning_rate": 0.0006582735671004272, + "loss": 0.5862, + "step": 35670 + }, + { + "epoch": 1.772126750769842, + "grad_norm": 0.1376953125, + "learning_rate": 0.0006582338333167776, + "loss": 0.6173, + "step": 35680 + }, + { + "epoch": 1.7726234230654614, + "grad_norm": 0.123046875, + "learning_rate": 0.0006581940995331281, + "loss": 0.5851, + "step": 35690 + }, + { + "epoch": 1.7731200953610808, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006581543657494785, + "loss": 0.5829, + "step": 35700 + }, + { + "epoch": 1.7736167676567, + "grad_norm": 0.1181640625, + "learning_rate": 0.000658114631965829, + "loss": 0.5852, + "step": 35710 + }, + { + "epoch": 1.7741134399523193, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006580748981821795, + "loss": 0.5783, + "step": 35720 + }, + { + "epoch": 1.774610112247939, + "grad_norm": 0.10546875, + "learning_rate": 0.0006580351643985299, + "loss": 0.5801, + "step": 35730 + }, + { + "epoch": 1.7751067845435582, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006579954306148803, + "loss": 0.5973, + "step": 35740 + }, + { + "epoch": 1.7756034568391774, + "grad_norm": 0.1552734375, + "learning_rate": 0.0006579556968312308, + "loss": 0.611, + "step": 35750 + }, + { + "epoch": 1.7761001291347969, + "grad_norm": 0.08740234375, + "learning_rate": 0.0006579159630475812, + "loss": 0.5763, + "step": 35760 + }, + { + "epoch": 1.7765968014304163, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006578762292639317, + "loss": 0.5711, + "step": 35770 + }, + { + "epoch": 1.7770934737260355, + "grad_norm": 0.111328125, + "learning_rate": 0.0006578364954802822, + "loss": 0.6111, + "step": 35780 + }, + { + "epoch": 1.7775901460216548, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006577967616966325, + "loss": 0.5693, + "step": 35790 + }, + { + "epoch": 1.7780868183172742, + "grad_norm": 0.1494140625, + "learning_rate": 0.000657757027912983, + "loss": 0.5779, + "step": 35800 + }, + { + "epoch": 1.7785834906128937, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006577172941293336, + "loss": 0.5906, + "step": 35810 + }, + { + "epoch": 1.779080162908513, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006576775603456839, + "loss": 0.5878, + "step": 35820 + }, + { + "epoch": 1.7795768352041323, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006576378265620344, + "loss": 0.5832, + "step": 35830 + }, + { + "epoch": 1.7800735074997518, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006575980927783848, + "loss": 0.5882, + "step": 35840 + }, + { + "epoch": 1.780570179795371, + "grad_norm": 0.109375, + "learning_rate": 0.0006575583589947353, + "loss": 0.6045, + "step": 35850 + }, + { + "epoch": 1.7810668520909902, + "grad_norm": 0.09765625, + "learning_rate": 0.0006575186252110858, + "loss": 0.5789, + "step": 35860 + }, + { + "epoch": 1.7815635243866097, + "grad_norm": 0.1640625, + "learning_rate": 0.0006574788914274362, + "loss": 0.5812, + "step": 35870 + }, + { + "epoch": 1.7820601966822291, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006574391576437867, + "loss": 0.5801, + "step": 35880 + }, + { + "epoch": 1.7825568689778484, + "grad_norm": 0.103515625, + "learning_rate": 0.0006573994238601371, + "loss": 0.5915, + "step": 35890 + }, + { + "epoch": 1.7830535412734676, + "grad_norm": 0.1611328125, + "learning_rate": 0.0006573596900764875, + "loss": 0.5687, + "step": 35900 + }, + { + "epoch": 1.7835502135690873, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006573199562928381, + "loss": 0.5588, + "step": 35910 + }, + { + "epoch": 1.7840468858647065, + "grad_norm": 0.111328125, + "learning_rate": 0.0006572802225091885, + "loss": 0.579, + "step": 35920 + }, + { + "epoch": 1.7845435581603257, + "grad_norm": 0.119140625, + "learning_rate": 0.0006572404887255389, + "loss": 0.6217, + "step": 35930 + }, + { + "epoch": 1.7850402304559452, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006572007549418894, + "loss": 0.5919, + "step": 35940 + }, + { + "epoch": 1.7855369027515646, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006571610211582398, + "loss": 0.5992, + "step": 35950 + }, + { + "epoch": 1.7860335750471839, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006571212873745903, + "loss": 0.5961, + "step": 35960 + }, + { + "epoch": 1.786530247342803, + "grad_norm": 0.1337890625, + "learning_rate": 0.0006570815535909408, + "loss": 0.6127, + "step": 35970 + }, + { + "epoch": 1.7870269196384225, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006570418198072911, + "loss": 0.5762, + "step": 35980 + }, + { + "epoch": 1.787523591934042, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006570020860236416, + "loss": 0.5988, + "step": 35990 + }, + { + "epoch": 1.7880202642296612, + "grad_norm": 0.0869140625, + "learning_rate": 0.0006569623522399921, + "loss": 0.581, + "step": 36000 + }, + { + "epoch": 1.7885169365252807, + "grad_norm": 0.09765625, + "learning_rate": 0.0006569226184563426, + "loss": 0.5811, + "step": 36010 + }, + { + "epoch": 1.7890136088209, + "grad_norm": 0.11181640625, + "learning_rate": 0.000656882884672693, + "loss": 0.5708, + "step": 36020 + }, + { + "epoch": 1.7895102811165193, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006568431508890434, + "loss": 0.5819, + "step": 36030 + }, + { + "epoch": 1.7900069534121386, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006568034171053939, + "loss": 0.5852, + "step": 36040 + }, + { + "epoch": 1.790503625707758, + "grad_norm": 0.119140625, + "learning_rate": 0.0006567636833217443, + "loss": 0.6267, + "step": 36050 + }, + { + "epoch": 1.7910002980033775, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006567239495380948, + "loss": 0.5895, + "step": 36060 + }, + { + "epoch": 1.7914969702989967, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006566842157544453, + "loss": 0.599, + "step": 36070 + }, + { + "epoch": 1.791993642594616, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006566444819707957, + "loss": 0.5701, + "step": 36080 + }, + { + "epoch": 1.7924903148902356, + "grad_norm": 0.125, + "learning_rate": 0.0006566047481871461, + "loss": 0.5955, + "step": 36090 + }, + { + "epoch": 1.7929869871858548, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006565650144034966, + "loss": 0.5917, + "step": 36100 + }, + { + "epoch": 1.793483659481474, + "grad_norm": 0.130859375, + "learning_rate": 0.0006565252806198471, + "loss": 0.5633, + "step": 36110 + }, + { + "epoch": 1.7939803317770935, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006564855468361975, + "loss": 0.5555, + "step": 36120 + }, + { + "epoch": 1.794477004072713, + "grad_norm": 0.09521484375, + "learning_rate": 0.000656445813052548, + "loss": 0.6074, + "step": 36130 + }, + { + "epoch": 1.7949736763683322, + "grad_norm": 0.111328125, + "learning_rate": 0.0006564060792688984, + "loss": 0.6201, + "step": 36140 + }, + { + "epoch": 1.7954703486639514, + "grad_norm": 0.0888671875, + "learning_rate": 0.0006563663454852488, + "loss": 0.586, + "step": 36150 + }, + { + "epoch": 1.7959670209595708, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006563266117015994, + "loss": 0.5746, + "step": 36160 + }, + { + "epoch": 1.7964636932551903, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006562868779179498, + "loss": 0.5907, + "step": 36170 + }, + { + "epoch": 1.7969603655508095, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006562471441343002, + "loss": 0.5704, + "step": 36180 + }, + { + "epoch": 1.797457037846429, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006562074103506507, + "loss": 0.6184, + "step": 36190 + }, + { + "epoch": 1.7979537101420484, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006561676765670011, + "loss": 0.6149, + "step": 36200 + }, + { + "epoch": 1.7984503824376676, + "grad_norm": 0.12158203125, + "learning_rate": 0.0006561279427833516, + "loss": 0.5642, + "step": 36210 + }, + { + "epoch": 1.7989470547332869, + "grad_norm": 0.123046875, + "learning_rate": 0.0006560882089997021, + "loss": 0.5813, + "step": 36220 + }, + { + "epoch": 1.7994437270289063, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006560484752160525, + "loss": 0.612, + "step": 36230 + }, + { + "epoch": 1.7999403993245258, + "grad_norm": 0.099609375, + "learning_rate": 0.0006560087414324029, + "loss": 0.5886, + "step": 36240 + }, + { + "epoch": 1.800437071620145, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006559690076487533, + "loss": 0.6051, + "step": 36250 + }, + { + "epoch": 1.8009337439157642, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006559292738651039, + "loss": 0.5852, + "step": 36260 + }, + { + "epoch": 1.801430416211384, + "grad_norm": 0.12158203125, + "learning_rate": 0.0006558895400814543, + "loss": 0.5926, + "step": 36270 + }, + { + "epoch": 1.8019270885070031, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006558498062978047, + "loss": 0.604, + "step": 36280 + }, + { + "epoch": 1.8024237608026223, + "grad_norm": 0.091796875, + "learning_rate": 0.0006558100725141552, + "loss": 0.609, + "step": 36290 + }, + { + "epoch": 1.8029204330982418, + "grad_norm": 0.08544921875, + "learning_rate": 0.0006557703387305056, + "loss": 0.5675, + "step": 36300 + }, + { + "epoch": 1.8034171053938612, + "grad_norm": 0.1005859375, + "learning_rate": 0.000655730604946856, + "loss": 0.5771, + "step": 36310 + }, + { + "epoch": 1.8039137776894805, + "grad_norm": 0.123046875, + "learning_rate": 0.0006556908711632066, + "loss": 0.5779, + "step": 36320 + }, + { + "epoch": 1.8044104499850997, + "grad_norm": 0.095703125, + "learning_rate": 0.000655651137379557, + "loss": 0.6223, + "step": 36330 + }, + { + "epoch": 1.8049071222807191, + "grad_norm": 0.109375, + "learning_rate": 0.0006556114035959074, + "loss": 0.5769, + "step": 36340 + }, + { + "epoch": 1.8054037945763386, + "grad_norm": 0.158203125, + "learning_rate": 0.0006555716698122579, + "loss": 0.5798, + "step": 36350 + }, + { + "epoch": 1.8059004668719578, + "grad_norm": 0.08154296875, + "learning_rate": 0.0006555319360286084, + "loss": 0.5818, + "step": 36360 + }, + { + "epoch": 1.8063971391675773, + "grad_norm": 0.1611328125, + "learning_rate": 0.0006554922022449588, + "loss": 0.6004, + "step": 36370 + }, + { + "epoch": 1.8068938114631967, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006554524684613093, + "loss": 0.597, + "step": 36380 + }, + { + "epoch": 1.807390483758816, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006554127346776597, + "loss": 0.5771, + "step": 36390 + }, + { + "epoch": 1.8078871560544352, + "grad_norm": 0.14453125, + "learning_rate": 0.0006553730008940101, + "loss": 0.5779, + "step": 36400 + }, + { + "epoch": 1.8083838283500546, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006553332671103607, + "loss": 0.596, + "step": 36410 + }, + { + "epoch": 1.808880500645674, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006552935333267111, + "loss": 0.6053, + "step": 36420 + }, + { + "epoch": 1.8093771729412933, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006552537995430615, + "loss": 0.5661, + "step": 36430 + }, + { + "epoch": 1.8098738452369125, + "grad_norm": 0.08203125, + "learning_rate": 0.0006552140657594119, + "loss": 0.5906, + "step": 36440 + }, + { + "epoch": 1.8103705175325322, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006551743319757624, + "loss": 0.5939, + "step": 36450 + }, + { + "epoch": 1.8108671898281514, + "grad_norm": 0.146484375, + "learning_rate": 0.000655134598192113, + "loss": 0.5859, + "step": 36460 + }, + { + "epoch": 1.8113638621237707, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006550948644084633, + "loss": 0.5821, + "step": 36470 + }, + { + "epoch": 1.81186053441939, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006550551306248138, + "loss": 0.5904, + "step": 36480 + }, + { + "epoch": 1.8123572067150095, + "grad_norm": 0.087890625, + "learning_rate": 0.0006550153968411643, + "loss": 0.5741, + "step": 36490 + }, + { + "epoch": 1.8128538790106288, + "grad_norm": 0.10546875, + "learning_rate": 0.0006549756630575146, + "loss": 0.6147, + "step": 36500 + }, + { + "epoch": 1.813350551306248, + "grad_norm": 0.16015625, + "learning_rate": 0.0006549359292738652, + "loss": 0.5682, + "step": 36510 + }, + { + "epoch": 1.8138472236018675, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006548961954902156, + "loss": 0.6279, + "step": 36520 + }, + { + "epoch": 1.814343895897487, + "grad_norm": 0.126953125, + "learning_rate": 0.000654856461706566, + "loss": 0.5933, + "step": 36530 + }, + { + "epoch": 1.8148405681931061, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006548167279229165, + "loss": 0.5712, + "step": 36540 + }, + { + "epoch": 1.8153372404887256, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006547769941392669, + "loss": 0.5993, + "step": 36550 + }, + { + "epoch": 1.815833912784345, + "grad_norm": 0.095703125, + "learning_rate": 0.0006547372603556173, + "loss": 0.6267, + "step": 36560 + }, + { + "epoch": 1.8163305850799643, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006546975265719679, + "loss": 0.5799, + "step": 36570 + }, + { + "epoch": 1.8168272573755835, + "grad_norm": 0.130859375, + "learning_rate": 0.0006546577927883183, + "loss": 0.6024, + "step": 36580 + }, + { + "epoch": 1.817323929671203, + "grad_norm": 0.138671875, + "learning_rate": 0.0006546180590046688, + "loss": 0.6024, + "step": 36590 + }, + { + "epoch": 1.8178206019668224, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006545783252210192, + "loss": 0.5906, + "step": 36600 + }, + { + "epoch": 1.8183172742624416, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006545385914373696, + "loss": 0.5762, + "step": 36610 + }, + { + "epoch": 1.8188139465580608, + "grad_norm": 0.1611328125, + "learning_rate": 0.0006544988576537202, + "loss": 0.5811, + "step": 36620 + }, + { + "epoch": 1.8193106188536805, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006544591238700705, + "loss": 0.5927, + "step": 36630 + }, + { + "epoch": 1.8198072911492997, + "grad_norm": 0.08447265625, + "learning_rate": 0.000654419390086421, + "loss": 0.5813, + "step": 36640 + }, + { + "epoch": 1.820303963444919, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006543796563027715, + "loss": 0.5863, + "step": 36650 + }, + { + "epoch": 1.8208006357405384, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006543399225191218, + "loss": 0.5647, + "step": 36660 + }, + { + "epoch": 1.8212973080361579, + "grad_norm": 0.1396484375, + "learning_rate": 0.0006543001887354724, + "loss": 0.5953, + "step": 36670 + }, + { + "epoch": 1.821793980331777, + "grad_norm": 0.12060546875, + "learning_rate": 0.0006542604549518229, + "loss": 0.5822, + "step": 36680 + }, + { + "epoch": 1.8222906526273963, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006542207211681732, + "loss": 0.593, + "step": 36690 + }, + { + "epoch": 1.8227873249230158, + "grad_norm": 0.1572265625, + "learning_rate": 0.0006541809873845237, + "loss": 0.5887, + "step": 36700 + }, + { + "epoch": 1.8232839972186352, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006541412536008741, + "loss": 0.6251, + "step": 36710 + }, + { + "epoch": 1.8237806695142544, + "grad_norm": 0.140625, + "learning_rate": 0.0006541015198172246, + "loss": 0.5714, + "step": 36720 + }, + { + "epoch": 1.8242773418098739, + "grad_norm": 0.1162109375, + "learning_rate": 0.0006540617860335751, + "loss": 0.5708, + "step": 36730 + }, + { + "epoch": 1.8247740141054933, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006540220522499255, + "loss": 0.5896, + "step": 36740 + }, + { + "epoch": 1.8252706864011126, + "grad_norm": 0.09765625, + "learning_rate": 0.000653982318466276, + "loss": 0.578, + "step": 36750 + }, + { + "epoch": 1.8257673586967318, + "grad_norm": 0.1796875, + "learning_rate": 0.0006539425846826264, + "loss": 0.6018, + "step": 36760 + }, + { + "epoch": 1.8262640309923512, + "grad_norm": 0.1435546875, + "learning_rate": 0.0006539028508989769, + "loss": 0.6012, + "step": 36770 + }, + { + "epoch": 1.8267607032879707, + "grad_norm": 0.18359375, + "learning_rate": 0.0006538631171153274, + "loss": 0.6244, + "step": 36780 + }, + { + "epoch": 1.82725737558359, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006538233833316778, + "loss": 0.5579, + "step": 36790 + }, + { + "epoch": 1.8277540478792091, + "grad_norm": 0.111328125, + "learning_rate": 0.0006537836495480282, + "loss": 0.6142, + "step": 36800 + }, + { + "epoch": 1.8282507201748288, + "grad_norm": 0.1396484375, + "learning_rate": 0.0006537439157643787, + "loss": 0.5613, + "step": 36810 + }, + { + "epoch": 1.828747392470448, + "grad_norm": 0.08642578125, + "learning_rate": 0.0006537041819807292, + "loss": 0.6049, + "step": 36820 + }, + { + "epoch": 1.8292440647660673, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006536644481970796, + "loss": 0.5672, + "step": 36830 + }, + { + "epoch": 1.8297407370616867, + "grad_norm": 0.0908203125, + "learning_rate": 0.0006536247144134301, + "loss": 0.5831, + "step": 36840 + }, + { + "epoch": 1.8302374093573062, + "grad_norm": 0.1640625, + "learning_rate": 0.0006535849806297804, + "loss": 0.5675, + "step": 36850 + }, + { + "epoch": 1.8307340816529254, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006535452468461309, + "loss": 0.6041, + "step": 36860 + }, + { + "epoch": 1.8312307539485446, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006535055130624815, + "loss": 0.5579, + "step": 36870 + }, + { + "epoch": 1.831727426244164, + "grad_norm": 0.1591796875, + "learning_rate": 0.0006534657792788319, + "loss": 0.5693, + "step": 36880 + }, + { + "epoch": 1.8322240985397835, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006534260454951823, + "loss": 0.5783, + "step": 36890 + }, + { + "epoch": 1.8327207708354027, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006533863117115327, + "loss": 0.5867, + "step": 36900 + }, + { + "epoch": 1.8332174431310222, + "grad_norm": 0.1328125, + "learning_rate": 0.0006533465779278832, + "loss": 0.5905, + "step": 36910 + }, + { + "epoch": 1.8337141154266416, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006533068441442337, + "loss": 0.6061, + "step": 36920 + }, + { + "epoch": 1.8342107877222609, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006532671103605841, + "loss": 0.6096, + "step": 36930 + }, + { + "epoch": 1.83470746001788, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006532273765769346, + "loss": 0.576, + "step": 36940 + }, + { + "epoch": 1.8352041323134995, + "grad_norm": 0.1005859375, + "learning_rate": 0.000653187642793285, + "loss": 0.6109, + "step": 36950 + }, + { + "epoch": 1.835700804609119, + "grad_norm": 0.134765625, + "learning_rate": 0.0006531479090096354, + "loss": 0.568, + "step": 36960 + }, + { + "epoch": 1.8361974769047382, + "grad_norm": 0.1416015625, + "learning_rate": 0.000653108175225986, + "loss": 0.5775, + "step": 36970 + }, + { + "epoch": 1.8366941492003575, + "grad_norm": 0.08447265625, + "learning_rate": 0.0006530684414423364, + "loss": 0.5969, + "step": 36980 + }, + { + "epoch": 1.837190821495977, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006530287076586868, + "loss": 0.582, + "step": 36990 + }, + { + "epoch": 1.8376874937915963, + "grad_norm": 0.185546875, + "learning_rate": 0.0006529889738750373, + "loss": 0.5714, + "step": 37000 + }, + { + "epoch": 1.8381841660872156, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006529492400913877, + "loss": 0.609, + "step": 37010 + }, + { + "epoch": 1.838680838382835, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006529095063077382, + "loss": 0.6116, + "step": 37020 + }, + { + "epoch": 1.8391775106784545, + "grad_norm": 0.1884765625, + "learning_rate": 0.0006528697725240887, + "loss": 0.6088, + "step": 37030 + }, + { + "epoch": 1.8396741829740737, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006528300387404391, + "loss": 0.6042, + "step": 37040 + }, + { + "epoch": 1.840170855269693, + "grad_norm": 0.1474609375, + "learning_rate": 0.0006527903049567895, + "loss": 0.6014, + "step": 37050 + }, + { + "epoch": 1.8406675275653124, + "grad_norm": 0.1318359375, + "learning_rate": 0.00065275057117314, + "loss": 0.5956, + "step": 37060 + }, + { + "epoch": 1.8411641998609318, + "grad_norm": 0.119140625, + "learning_rate": 0.0006527108373894905, + "loss": 0.5895, + "step": 37070 + }, + { + "epoch": 1.841660872156551, + "grad_norm": 0.154296875, + "learning_rate": 0.0006526711036058409, + "loss": 0.6098, + "step": 37080 + }, + { + "epoch": 1.8421575444521705, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006526313698221914, + "loss": 0.585, + "step": 37090 + }, + { + "epoch": 1.84265421674779, + "grad_norm": 0.1650390625, + "learning_rate": 0.0006525916360385418, + "loss": 0.6035, + "step": 37100 + }, + { + "epoch": 1.8431508890434092, + "grad_norm": 0.15625, + "learning_rate": 0.0006525519022548922, + "loss": 0.5677, + "step": 37110 + }, + { + "epoch": 1.8436475613390284, + "grad_norm": 0.09765625, + "learning_rate": 0.0006525121684712427, + "loss": 0.5505, + "step": 37120 + }, + { + "epoch": 1.8441442336346479, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006524724346875932, + "loss": 0.5833, + "step": 37130 + }, + { + "epoch": 1.8446409059302673, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006524327009039436, + "loss": 0.5779, + "step": 37140 + }, + { + "epoch": 1.8451375782258865, + "grad_norm": 0.1357421875, + "learning_rate": 0.000652392967120294, + "loss": 0.5994, + "step": 37150 + }, + { + "epoch": 1.8456342505215058, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006523532333366445, + "loss": 0.5936, + "step": 37160 + }, + { + "epoch": 1.8461309228171252, + "grad_norm": 0.08935546875, + "learning_rate": 0.000652313499552995, + "loss": 0.5973, + "step": 37170 + }, + { + "epoch": 1.8466275951127447, + "grad_norm": 0.1787109375, + "learning_rate": 0.0006522737657693454, + "loss": 0.5972, + "step": 37180 + }, + { + "epoch": 1.8471242674083639, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006522340319856959, + "loss": 0.6032, + "step": 37190 + }, + { + "epoch": 1.8476209397039833, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006521942982020463, + "loss": 0.607, + "step": 37200 + }, + { + "epoch": 1.8481176119996028, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006521545644183967, + "loss": 0.6153, + "step": 37210 + }, + { + "epoch": 1.848614284295222, + "grad_norm": 0.13671875, + "learning_rate": 0.0006521148306347473, + "loss": 0.5763, + "step": 37220 + }, + { + "epoch": 1.8491109565908412, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006520750968510977, + "loss": 0.5736, + "step": 37230 + }, + { + "epoch": 1.8496076288864607, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006520353630674481, + "loss": 0.6079, + "step": 37240 + }, + { + "epoch": 1.8501043011820801, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006519956292837986, + "loss": 0.5812, + "step": 37250 + }, + { + "epoch": 1.8506009734776994, + "grad_norm": 0.146484375, + "learning_rate": 0.000651955895500149, + "loss": 0.6019, + "step": 37260 + }, + { + "epoch": 1.8510976457733188, + "grad_norm": 0.130859375, + "learning_rate": 0.0006519161617164995, + "loss": 0.6007, + "step": 37270 + }, + { + "epoch": 1.8515943180689383, + "grad_norm": 0.1669921875, + "learning_rate": 0.00065187642793285, + "loss": 0.5725, + "step": 37280 + }, + { + "epoch": 1.8520909903645575, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006518366941492004, + "loss": 0.5952, + "step": 37290 + }, + { + "epoch": 1.8525876626601767, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006517969603655508, + "loss": 0.5955, + "step": 37300 + }, + { + "epoch": 1.8530843349557962, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006517572265819012, + "loss": 0.5889, + "step": 37310 + }, + { + "epoch": 1.8535810072514156, + "grad_norm": 0.1474609375, + "learning_rate": 0.0006517174927982518, + "loss": 0.5997, + "step": 37320 + }, + { + "epoch": 1.8540776795470348, + "grad_norm": 0.0908203125, + "learning_rate": 0.0006516777590146023, + "loss": 0.5642, + "step": 37330 + }, + { + "epoch": 1.854574351842654, + "grad_norm": 0.181640625, + "learning_rate": 0.0006516380252309526, + "loss": 0.595, + "step": 37340 + }, + { + "epoch": 1.8550710241382735, + "grad_norm": 0.1416015625, + "learning_rate": 0.0006515982914473031, + "loss": 0.5922, + "step": 37350 + }, + { + "epoch": 1.855567696433893, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006515585576636536, + "loss": 0.5832, + "step": 37360 + }, + { + "epoch": 1.8560643687295122, + "grad_norm": 0.1494140625, + "learning_rate": 0.000651518823880004, + "loss": 0.5978, + "step": 37370 + }, + { + "epoch": 1.8565610410251316, + "grad_norm": 0.0849609375, + "learning_rate": 0.0006514790900963545, + "loss": 0.573, + "step": 37380 + }, + { + "epoch": 1.857057713320751, + "grad_norm": 0.1494140625, + "learning_rate": 0.0006514393563127049, + "loss": 0.5722, + "step": 37390 + }, + { + "epoch": 1.8575543856163703, + "grad_norm": 0.14453125, + "learning_rate": 0.0006513996225290553, + "loss": 0.6139, + "step": 37400 + }, + { + "epoch": 1.8580510579119895, + "grad_norm": 0.109375, + "learning_rate": 0.0006513598887454058, + "loss": 0.5884, + "step": 37410 + }, + { + "epoch": 1.858547730207609, + "grad_norm": 0.185546875, + "learning_rate": 0.0006513201549617563, + "loss": 0.6201, + "step": 37420 + }, + { + "epoch": 1.8590444025032284, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006512804211781067, + "loss": 0.5874, + "step": 37430 + }, + { + "epoch": 1.8595410747988477, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006512406873944572, + "loss": 0.6012, + "step": 37440 + }, + { + "epoch": 1.8600377470944671, + "grad_norm": 0.091796875, + "learning_rate": 0.0006512009536108076, + "loss": 0.5957, + "step": 37450 + }, + { + "epoch": 1.8605344193900866, + "grad_norm": 0.1171875, + "learning_rate": 0.000651161219827158, + "loss": 0.5916, + "step": 37460 + }, + { + "epoch": 1.8610310916857058, + "grad_norm": 0.13671875, + "learning_rate": 0.0006511214860435086, + "loss": 0.586, + "step": 37470 + }, + { + "epoch": 1.861527763981325, + "grad_norm": 0.10107421875, + "learning_rate": 0.000651081752259859, + "loss": 0.569, + "step": 37480 + }, + { + "epoch": 1.8620244362769445, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006510420184762095, + "loss": 0.5574, + "step": 37490 + }, + { + "epoch": 1.862521108572564, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006510022846925598, + "loss": 0.5899, + "step": 37500 + }, + { + "epoch": 1.8630177808681831, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006509625509089103, + "loss": 0.5996, + "step": 37510 + }, + { + "epoch": 1.8635144531638024, + "grad_norm": 0.091796875, + "learning_rate": 0.0006509228171252609, + "loss": 0.6009, + "step": 37520 + }, + { + "epoch": 1.8640111254594218, + "grad_norm": 0.140625, + "learning_rate": 0.0006508830833416112, + "loss": 0.5804, + "step": 37530 + }, + { + "epoch": 1.8645077977550413, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006508433495579617, + "loss": 0.6056, + "step": 37540 + }, + { + "epoch": 1.8650044700506605, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006508036157743122, + "loss": 0.5867, + "step": 37550 + }, + { + "epoch": 1.86550114234628, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006507638819906625, + "loss": 0.6001, + "step": 37560 + }, + { + "epoch": 1.8659978146418994, + "grad_norm": 0.1357421875, + "learning_rate": 0.000650724148207013, + "loss": 0.5841, + "step": 37570 + }, + { + "epoch": 1.8664944869375186, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006506844144233635, + "loss": 0.5766, + "step": 37580 + }, + { + "epoch": 1.8669911592331379, + "grad_norm": 0.1591796875, + "learning_rate": 0.0006506446806397139, + "loss": 0.5876, + "step": 37590 + }, + { + "epoch": 1.8674878315287573, + "grad_norm": 0.154296875, + "learning_rate": 0.0006506049468560644, + "loss": 0.5907, + "step": 37600 + }, + { + "epoch": 1.8679845038243768, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006505652130724148, + "loss": 0.5754, + "step": 37610 + }, + { + "epoch": 1.868481176119996, + "grad_norm": 0.1357421875, + "learning_rate": 0.0006505254792887652, + "loss": 0.5736, + "step": 37620 + }, + { + "epoch": 1.8689778484156154, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006504857455051158, + "loss": 0.6151, + "step": 37630 + }, + { + "epoch": 1.8694745207112349, + "grad_norm": 0.140625, + "learning_rate": 0.0006504460117214662, + "loss": 0.5984, + "step": 37640 + }, + { + "epoch": 1.869971193006854, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006504062779378167, + "loss": 0.6013, + "step": 37650 + }, + { + "epoch": 1.8704678653024733, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006503665441541671, + "loss": 0.6051, + "step": 37660 + }, + { + "epoch": 1.8709645375980928, + "grad_norm": 0.1396484375, + "learning_rate": 0.0006503268103705176, + "loss": 0.5964, + "step": 37670 + }, + { + "epoch": 1.8714612098937122, + "grad_norm": 0.1171875, + "learning_rate": 0.0006502870765868681, + "loss": 0.5812, + "step": 37680 + }, + { + "epoch": 1.8719578821893315, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006502473428032185, + "loss": 0.5946, + "step": 37690 + }, + { + "epoch": 1.8724545544849507, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006502076090195689, + "loss": 0.5612, + "step": 37700 + }, + { + "epoch": 1.8729512267805701, + "grad_norm": 0.126953125, + "learning_rate": 0.0006501678752359194, + "loss": 0.585, + "step": 37710 + }, + { + "epoch": 1.8734478990761896, + "grad_norm": 0.09033203125, + "learning_rate": 0.0006501281414522697, + "loss": 0.5948, + "step": 37720 + }, + { + "epoch": 1.8739445713718088, + "grad_norm": 0.1455078125, + "learning_rate": 0.0006500884076686203, + "loss": 0.5773, + "step": 37730 + }, + { + "epoch": 1.8744412436674283, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006500486738849708, + "loss": 0.5941, + "step": 37740 + }, + { + "epoch": 1.8749379159630477, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006500089401013211, + "loss": 0.5943, + "step": 37750 + }, + { + "epoch": 1.875434588258667, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006499692063176716, + "loss": 0.5959, + "step": 37760 + }, + { + "epoch": 1.8759312605542862, + "grad_norm": 0.095703125, + "learning_rate": 0.000649929472534022, + "loss": 0.5773, + "step": 37770 + }, + { + "epoch": 1.8764279328499056, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006498897387503726, + "loss": 0.5837, + "step": 37780 + }, + { + "epoch": 1.876924605145525, + "grad_norm": 0.115234375, + "learning_rate": 0.000649850004966723, + "loss": 0.5888, + "step": 37790 + }, + { + "epoch": 1.8774212774411443, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006498102711830734, + "loss": 0.6306, + "step": 37800 + }, + { + "epoch": 1.8779179497367637, + "grad_norm": 0.12158203125, + "learning_rate": 0.0006497705373994239, + "loss": 0.5537, + "step": 37810 + }, + { + "epoch": 1.8784146220323832, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006497308036157743, + "loss": 0.5905, + "step": 37820 + }, + { + "epoch": 1.8789112943280024, + "grad_norm": 0.09765625, + "learning_rate": 0.0006496910698321248, + "loss": 0.5726, + "step": 37830 + }, + { + "epoch": 1.8794079666236216, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006496513360484753, + "loss": 0.6031, + "step": 37840 + }, + { + "epoch": 1.879904638919241, + "grad_norm": 0.1796875, + "learning_rate": 0.0006496116022648257, + "loss": 0.5986, + "step": 37850 + }, + { + "epoch": 1.8804013112148605, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006495718684811761, + "loss": 0.5791, + "step": 37860 + }, + { + "epoch": 1.8808979835104798, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006495321346975267, + "loss": 0.5831, + "step": 37870 + }, + { + "epoch": 1.881394655806099, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006494924009138771, + "loss": 0.5692, + "step": 37880 + }, + { + "epoch": 1.8818913281017184, + "grad_norm": 0.09375, + "learning_rate": 0.0006494526671302275, + "loss": 0.58, + "step": 37890 + }, + { + "epoch": 1.882388000397338, + "grad_norm": 0.1181640625, + "learning_rate": 0.000649412933346578, + "loss": 0.5919, + "step": 37900 + }, + { + "epoch": 1.8828846726929571, + "grad_norm": 0.1396484375, + "learning_rate": 0.0006493731995629283, + "loss": 0.6251, + "step": 37910 + }, + { + "epoch": 1.8833813449885766, + "grad_norm": 0.1669921875, + "learning_rate": 0.0006493334657792788, + "loss": 0.579, + "step": 37920 + }, + { + "epoch": 1.883878017284196, + "grad_norm": 0.091796875, + "learning_rate": 0.0006492937319956294, + "loss": 0.6264, + "step": 37930 + }, + { + "epoch": 1.8843746895798152, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006492539982119798, + "loss": 0.5701, + "step": 37940 + }, + { + "epoch": 1.8848713618754345, + "grad_norm": 0.1494140625, + "learning_rate": 0.0006492142644283302, + "loss": 0.604, + "step": 37950 + }, + { + "epoch": 1.885368034171054, + "grad_norm": 0.09765625, + "learning_rate": 0.0006491745306446807, + "loss": 0.5774, + "step": 37960 + }, + { + "epoch": 1.8858647064666734, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006491347968610311, + "loss": 0.5748, + "step": 37970 + }, + { + "epoch": 1.8863613787622926, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006490950630773816, + "loss": 0.6001, + "step": 37980 + }, + { + "epoch": 1.886858051057912, + "grad_norm": 0.1630859375, + "learning_rate": 0.000649055329293732, + "loss": 0.6058, + "step": 37990 + }, + { + "epoch": 1.8873547233535315, + "grad_norm": 0.1552734375, + "learning_rate": 0.0006490155955100825, + "loss": 0.5758, + "step": 38000 + }, + { + "epoch": 1.8878513956491507, + "grad_norm": 0.1181640625, + "learning_rate": 0.0006489758617264329, + "loss": 0.5878, + "step": 38010 + }, + { + "epoch": 1.88834806794477, + "grad_norm": 0.11328125, + "learning_rate": 0.0006489361279427833, + "loss": 0.5932, + "step": 38020 + }, + { + "epoch": 1.8888447402403894, + "grad_norm": 0.130859375, + "learning_rate": 0.0006488963941591339, + "loss": 0.5892, + "step": 38030 + }, + { + "epoch": 1.8893414125360088, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006488566603754843, + "loss": 0.5921, + "step": 38040 + }, + { + "epoch": 1.889838084831628, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006488169265918347, + "loss": 0.5968, + "step": 38050 + }, + { + "epoch": 1.8903347571272473, + "grad_norm": 0.1474609375, + "learning_rate": 0.0006487771928081852, + "loss": 0.5817, + "step": 38060 + }, + { + "epoch": 1.8908314294228667, + "grad_norm": 0.109375, + "learning_rate": 0.0006487374590245356, + "loss": 0.6205, + "step": 38070 + }, + { + "epoch": 1.8913281017184862, + "grad_norm": 0.1171875, + "learning_rate": 0.0006486977252408861, + "loss": 0.5994, + "step": 38080 + }, + { + "epoch": 1.8918247740141054, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006486579914572366, + "loss": 0.5979, + "step": 38090 + }, + { + "epoch": 1.8923214463097249, + "grad_norm": 0.162109375, + "learning_rate": 0.000648618257673587, + "loss": 0.6016, + "step": 38100 + }, + { + "epoch": 1.8928181186053443, + "grad_norm": 0.091796875, + "learning_rate": 0.0006485785238899374, + "loss": 0.615, + "step": 38110 + }, + { + "epoch": 1.8933147909009636, + "grad_norm": 0.11572265625, + "learning_rate": 0.000648538790106288, + "loss": 0.5877, + "step": 38120 + }, + { + "epoch": 1.8938114631965828, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006484990563226384, + "loss": 0.594, + "step": 38130 + }, + { + "epoch": 1.8943081354922022, + "grad_norm": 0.1455078125, + "learning_rate": 0.0006484593225389888, + "loss": 0.5847, + "step": 38140 + }, + { + "epoch": 1.8948048077878217, + "grad_norm": 0.12158203125, + "learning_rate": 0.0006484195887553393, + "loss": 0.5837, + "step": 38150 + }, + { + "epoch": 1.895301480083441, + "grad_norm": 0.1376953125, + "learning_rate": 0.0006483798549716897, + "loss": 0.569, + "step": 38160 + }, + { + "epoch": 1.8957981523790601, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006483401211880401, + "loss": 0.5629, + "step": 38170 + }, + { + "epoch": 1.8962948246746798, + "grad_norm": 0.16015625, + "learning_rate": 0.0006483003874043906, + "loss": 0.592, + "step": 38180 + }, + { + "epoch": 1.896791496970299, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006482606536207411, + "loss": 0.5981, + "step": 38190 + }, + { + "epoch": 1.8972881692659183, + "grad_norm": 0.12353515625, + "learning_rate": 0.0006482209198370915, + "loss": 0.5692, + "step": 38200 + }, + { + "epoch": 1.8977848415615377, + "grad_norm": 0.154296875, + "learning_rate": 0.0006481811860534419, + "loss": 0.582, + "step": 38210 + }, + { + "epoch": 1.8982815138571572, + "grad_norm": 0.13671875, + "learning_rate": 0.0006481414522697924, + "loss": 0.5871, + "step": 38220 + }, + { + "epoch": 1.8987781861527764, + "grad_norm": 0.1171875, + "learning_rate": 0.000648101718486143, + "loss": 0.5967, + "step": 38230 + }, + { + "epoch": 1.8992748584483956, + "grad_norm": 0.1328125, + "learning_rate": 0.0006480619847024933, + "loss": 0.5846, + "step": 38240 + }, + { + "epoch": 1.899771530744015, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006480222509188438, + "loss": 0.5886, + "step": 38250 + }, + { + "epoch": 1.9002682030396345, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006479825171351942, + "loss": 0.593, + "step": 38260 + }, + { + "epoch": 1.9007648753352537, + "grad_norm": 0.1630859375, + "learning_rate": 0.0006479427833515446, + "loss": 0.5926, + "step": 38270 + }, + { + "epoch": 1.9012615476308732, + "grad_norm": 0.17578125, + "learning_rate": 0.0006479030495678952, + "loss": 0.5937, + "step": 38280 + }, + { + "epoch": 1.9017582199264926, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006478633157842456, + "loss": 0.5817, + "step": 38290 + }, + { + "epoch": 1.9022548922221119, + "grad_norm": 0.09521484375, + "learning_rate": 0.000647823582000596, + "loss": 0.5878, + "step": 38300 + }, + { + "epoch": 1.902751564517731, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006477838482169465, + "loss": 0.6043, + "step": 38310 + }, + { + "epoch": 1.9032482368133505, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006477441144332969, + "loss": 0.5627, + "step": 38320 + }, + { + "epoch": 1.90374490910897, + "grad_norm": 0.09765625, + "learning_rate": 0.0006477043806496474, + "loss": 0.578, + "step": 38330 + }, + { + "epoch": 1.9042415814045892, + "grad_norm": 0.1708984375, + "learning_rate": 0.0006476646468659979, + "loss": 0.5814, + "step": 38340 + }, + { + "epoch": 1.9047382537002084, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006476249130823483, + "loss": 0.6276, + "step": 38350 + }, + { + "epoch": 1.905234925995828, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006475851792986987, + "loss": 0.5758, + "step": 38360 + }, + { + "epoch": 1.9057315982914473, + "grad_norm": 0.1162109375, + "learning_rate": 0.0006475454455150491, + "loss": 0.5622, + "step": 38370 + }, + { + "epoch": 1.9062282705870666, + "grad_norm": 0.103515625, + "learning_rate": 0.0006475057117313997, + "loss": 0.5823, + "step": 38380 + }, + { + "epoch": 1.906724942882686, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006474659779477502, + "loss": 0.5854, + "step": 38390 + }, + { + "epoch": 1.9072216151783055, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006474262441641005, + "loss": 0.6088, + "step": 38400 + }, + { + "epoch": 1.9077182874739247, + "grad_norm": 0.09716796875, + "learning_rate": 0.000647386510380451, + "loss": 0.5652, + "step": 38410 + }, + { + "epoch": 1.908214959769544, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006473467765968015, + "loss": 0.6183, + "step": 38420 + }, + { + "epoch": 1.9087116320651634, + "grad_norm": 0.111328125, + "learning_rate": 0.0006473070428131519, + "loss": 0.6121, + "step": 38430 + }, + { + "epoch": 1.9092083043607828, + "grad_norm": 0.103515625, + "learning_rate": 0.0006472673090295024, + "loss": 0.5846, + "step": 38440 + }, + { + "epoch": 1.909704976656402, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006472275752458528, + "loss": 0.5984, + "step": 38450 + }, + { + "epoch": 1.9102016489520215, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006471878414622032, + "loss": 0.5955, + "step": 38460 + }, + { + "epoch": 1.910698321247641, + "grad_norm": 0.130859375, + "learning_rate": 0.0006471481076785537, + "loss": 0.5847, + "step": 38470 + }, + { + "epoch": 1.9111949935432602, + "grad_norm": 0.10546875, + "learning_rate": 0.0006471083738949042, + "loss": 0.561, + "step": 38480 + }, + { + "epoch": 1.9116916658388794, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006470686401112546, + "loss": 0.5744, + "step": 38490 + }, + { + "epoch": 1.9121883381344988, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006470289063276051, + "loss": 0.581, + "step": 38500 + }, + { + "epoch": 1.9126850104301183, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006469891725439555, + "loss": 0.5776, + "step": 38510 + }, + { + "epoch": 1.9131816827257375, + "grad_norm": 0.2451171875, + "learning_rate": 0.0006469494387603059, + "loss": 0.5793, + "step": 38520 + }, + { + "epoch": 1.9136783550213567, + "grad_norm": 0.1337890625, + "learning_rate": 0.0006469097049766565, + "loss": 0.5699, + "step": 38530 + }, + { + "epoch": 1.9141750273169764, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006468699711930069, + "loss": 0.584, + "step": 38540 + }, + { + "epoch": 1.9146716996125956, + "grad_norm": 0.1376953125, + "learning_rate": 0.0006468302374093574, + "loss": 0.5715, + "step": 38550 + }, + { + "epoch": 1.9151683719082149, + "grad_norm": 0.111328125, + "learning_rate": 0.0006467905036257078, + "loss": 0.5985, + "step": 38560 + }, + { + "epoch": 1.9156650442038343, + "grad_norm": 0.099609375, + "learning_rate": 0.0006467507698420582, + "loss": 0.5708, + "step": 38570 + }, + { + "epoch": 1.9161617164994538, + "grad_norm": 0.142578125, + "learning_rate": 0.0006467110360584088, + "loss": 0.5726, + "step": 38580 + }, + { + "epoch": 1.916658388795073, + "grad_norm": 0.1689453125, + "learning_rate": 0.0006466713022747591, + "loss": 0.5902, + "step": 38590 + }, + { + "epoch": 1.9171550610906922, + "grad_norm": 0.177734375, + "learning_rate": 0.0006466315684911096, + "loss": 0.5852, + "step": 38600 + }, + { + "epoch": 1.9176517333863117, + "grad_norm": 0.1630859375, + "learning_rate": 0.0006465918347074601, + "loss": 0.5814, + "step": 38610 + }, + { + "epoch": 1.9181484056819311, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006465521009238104, + "loss": 0.5683, + "step": 38620 + }, + { + "epoch": 1.9186450779775504, + "grad_norm": 0.09619140625, + "learning_rate": 0.000646512367140161, + "loss": 0.5598, + "step": 38630 + }, + { + "epoch": 1.9191417502731698, + "grad_norm": 0.12890625, + "learning_rate": 0.0006464726333565114, + "loss": 0.5908, + "step": 38640 + }, + { + "epoch": 1.9196384225687892, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006464328995728618, + "loss": 0.5623, + "step": 38650 + }, + { + "epoch": 1.9201350948644085, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006463931657892123, + "loss": 0.5959, + "step": 38660 + }, + { + "epoch": 1.9206317671600277, + "grad_norm": 0.171875, + "learning_rate": 0.0006463534320055627, + "loss": 0.5941, + "step": 38670 + }, + { + "epoch": 1.9211284394556472, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006463136982219133, + "loss": 0.6107, + "step": 38680 + }, + { + "epoch": 1.9216251117512666, + "grad_norm": 0.103515625, + "learning_rate": 0.0006462739644382637, + "loss": 0.5719, + "step": 38690 + }, + { + "epoch": 1.9221217840468858, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006462342306546141, + "loss": 0.5812, + "step": 38700 + }, + { + "epoch": 1.922618456342505, + "grad_norm": 0.12060546875, + "learning_rate": 0.0006461944968709646, + "loss": 0.6058, + "step": 38710 + }, + { + "epoch": 1.9231151286381247, + "grad_norm": 0.09765625, + "learning_rate": 0.000646154763087315, + "loss": 0.5757, + "step": 38720 + }, + { + "epoch": 1.923611800933744, + "grad_norm": 0.11328125, + "learning_rate": 0.0006461150293036655, + "loss": 0.5796, + "step": 38730 + }, + { + "epoch": 1.9241084732293632, + "grad_norm": 0.0908203125, + "learning_rate": 0.000646075295520016, + "loss": 0.5969, + "step": 38740 + }, + { + "epoch": 1.9246051455249826, + "grad_norm": 0.12353515625, + "learning_rate": 0.0006460355617363664, + "loss": 0.6022, + "step": 38750 + }, + { + "epoch": 1.925101817820602, + "grad_norm": 0.109375, + "learning_rate": 0.0006459958279527168, + "loss": 0.5901, + "step": 38760 + }, + { + "epoch": 1.9255984901162213, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006459560941690673, + "loss": 0.5949, + "step": 38770 + }, + { + "epoch": 1.9260951624118405, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006459163603854176, + "loss": 0.565, + "step": 38780 + }, + { + "epoch": 1.92659183470746, + "grad_norm": 0.16015625, + "learning_rate": 0.0006458766266017682, + "loss": 0.5685, + "step": 38790 + }, + { + "epoch": 1.9270885070030794, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006458368928181187, + "loss": 0.5727, + "step": 38800 + }, + { + "epoch": 1.9275851792986987, + "grad_norm": 0.12060546875, + "learning_rate": 0.000645797159034469, + "loss": 0.5729, + "step": 38810 + }, + { + "epoch": 1.928081851594318, + "grad_norm": 0.14453125, + "learning_rate": 0.0006457574252508195, + "loss": 0.5759, + "step": 38820 + }, + { + "epoch": 1.9285785238899376, + "grad_norm": 0.13671875, + "learning_rate": 0.00064571769146717, + "loss": 0.5774, + "step": 38830 + }, + { + "epoch": 1.9290751961855568, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006456779576835205, + "loss": 0.596, + "step": 38840 + }, + { + "epoch": 1.929571868481176, + "grad_norm": 0.087890625, + "learning_rate": 0.0006456382238998709, + "loss": 0.5776, + "step": 38850 + }, + { + "epoch": 1.9300685407767955, + "grad_norm": 0.1416015625, + "learning_rate": 0.0006455984901162213, + "loss": 0.5806, + "step": 38860 + }, + { + "epoch": 1.930565213072415, + "grad_norm": 0.1806640625, + "learning_rate": 0.0006455587563325718, + "loss": 0.5667, + "step": 38870 + }, + { + "epoch": 1.9310618853680341, + "grad_norm": 0.123046875, + "learning_rate": 0.0006455190225489223, + "loss": 0.5859, + "step": 38880 + }, + { + "epoch": 1.9315585576636534, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006454792887652727, + "loss": 0.5778, + "step": 38890 + }, + { + "epoch": 1.932055229959273, + "grad_norm": 0.109375, + "learning_rate": 0.0006454395549816232, + "loss": 0.5954, + "step": 38900 + }, + { + "epoch": 1.9325519022548923, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006453998211979736, + "loss": 0.6011, + "step": 38910 + }, + { + "epoch": 1.9330485745505115, + "grad_norm": 0.0908203125, + "learning_rate": 0.000645360087414324, + "loss": 0.5982, + "step": 38920 + }, + { + "epoch": 1.933545246846131, + "grad_norm": 0.12890625, + "learning_rate": 0.0006453203536306746, + "loss": 0.6026, + "step": 38930 + }, + { + "epoch": 1.9340419191417504, + "grad_norm": 0.09716796875, + "learning_rate": 0.000645280619847025, + "loss": 0.5653, + "step": 38940 + }, + { + "epoch": 1.9345385914373696, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006452408860633754, + "loss": 0.5959, + "step": 38950 + }, + { + "epoch": 1.9350352637329888, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006452011522797259, + "loss": 0.6307, + "step": 38960 + }, + { + "epoch": 1.9355319360286083, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006451614184960763, + "loss": 0.6209, + "step": 38970 + }, + { + "epoch": 1.9360286083242277, + "grad_norm": 0.109375, + "learning_rate": 0.0006451216847124268, + "loss": 0.5748, + "step": 38980 + }, + { + "epoch": 1.936525280619847, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006450819509287773, + "loss": 0.5798, + "step": 38990 + }, + { + "epoch": 1.9370219529154664, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006450422171451277, + "loss": 0.6054, + "step": 39000 + }, + { + "epoch": 1.9375186252110859, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006450024833614781, + "loss": 0.5824, + "step": 39010 + }, + { + "epoch": 1.938015297506705, + "grad_norm": 0.095703125, + "learning_rate": 0.0006449627495778286, + "loss": 0.6086, + "step": 39020 + }, + { + "epoch": 1.9385119698023243, + "grad_norm": 0.11328125, + "learning_rate": 0.000644923015794179, + "loss": 0.6038, + "step": 39030 + }, + { + "epoch": 1.9390086420979438, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006448832820105295, + "loss": 0.5818, + "step": 39040 + }, + { + "epoch": 1.9395053143935632, + "grad_norm": 0.09375, + "learning_rate": 0.0006448435482268799, + "loss": 0.5801, + "step": 39050 + }, + { + "epoch": 1.9400019866891824, + "grad_norm": 0.1669921875, + "learning_rate": 0.0006448038144432304, + "loss": 0.6027, + "step": 39060 + }, + { + "epoch": 1.9404986589848017, + "grad_norm": 0.1572265625, + "learning_rate": 0.0006447640806595808, + "loss": 0.5803, + "step": 39070 + }, + { + "epoch": 1.9409953312804213, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006447243468759312, + "loss": 0.584, + "step": 39080 + }, + { + "epoch": 1.9414920035760406, + "grad_norm": 0.1376953125, + "learning_rate": 0.0006446846130922818, + "loss": 0.5877, + "step": 39090 + }, + { + "epoch": 1.9419886758716598, + "grad_norm": 0.16796875, + "learning_rate": 0.0006446448793086322, + "loss": 0.6003, + "step": 39100 + }, + { + "epoch": 1.9424853481672792, + "grad_norm": 0.134765625, + "learning_rate": 0.0006446051455249826, + "loss": 0.5956, + "step": 39110 + }, + { + "epoch": 1.9429820204628987, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006445654117413331, + "loss": 0.5991, + "step": 39120 + }, + { + "epoch": 1.943478692758518, + "grad_norm": 0.09765625, + "learning_rate": 0.0006445256779576835, + "loss": 0.5997, + "step": 39130 + }, + { + "epoch": 1.9439753650541371, + "grad_norm": 0.14453125, + "learning_rate": 0.000644485944174034, + "loss": 0.5702, + "step": 39140 + }, + { + "epoch": 1.9444720373497566, + "grad_norm": 0.10546875, + "learning_rate": 0.0006444462103903845, + "loss": 0.594, + "step": 39150 + }, + { + "epoch": 1.944968709645376, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006444064766067349, + "loss": 0.5686, + "step": 39160 + }, + { + "epoch": 1.9454653819409953, + "grad_norm": 0.123046875, + "learning_rate": 0.0006443667428230853, + "loss": 0.6089, + "step": 39170 + }, + { + "epoch": 1.9459620542366147, + "grad_norm": 0.1689453125, + "learning_rate": 0.0006443270090394359, + "loss": 0.5801, + "step": 39180 + }, + { + "epoch": 1.9464587265322342, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006442872752557863, + "loss": 0.5689, + "step": 39190 + }, + { + "epoch": 1.9469553988278534, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006442475414721367, + "loss": 0.5747, + "step": 39200 + }, + { + "epoch": 1.9474520711234726, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006442078076884872, + "loss": 0.5758, + "step": 39210 + }, + { + "epoch": 1.947948743419092, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006441680739048376, + "loss": 0.5688, + "step": 39220 + }, + { + "epoch": 1.9484454157147115, + "grad_norm": 0.09912109375, + "learning_rate": 0.000644128340121188, + "loss": 0.5876, + "step": 39230 + }, + { + "epoch": 1.9489420880103308, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006440886063375385, + "loss": 0.565, + "step": 39240 + }, + { + "epoch": 1.94943876030595, + "grad_norm": 0.1708984375, + "learning_rate": 0.000644048872553889, + "loss": 0.6057, + "step": 39250 + }, + { + "epoch": 1.9499354326015694, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006440091387702394, + "loss": 0.5712, + "step": 39260 + }, + { + "epoch": 1.9504321048971889, + "grad_norm": 0.1611328125, + "learning_rate": 0.0006439694049865898, + "loss": 0.6069, + "step": 39270 + }, + { + "epoch": 1.950928777192808, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006439296712029403, + "loss": 0.5796, + "step": 39280 + }, + { + "epoch": 1.9514254494884276, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006438899374192909, + "loss": 0.5689, + "step": 39290 + }, + { + "epoch": 1.951922121784047, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006438502036356412, + "loss": 0.5608, + "step": 39300 + }, + { + "epoch": 1.9524187940796662, + "grad_norm": 0.095703125, + "learning_rate": 0.0006438104698519917, + "loss": 0.5871, + "step": 39310 + }, + { + "epoch": 1.9529154663752855, + "grad_norm": 0.1376953125, + "learning_rate": 0.0006437707360683421, + "loss": 0.6047, + "step": 39320 + }, + { + "epoch": 1.953412138670905, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006437310022846925, + "loss": 0.5833, + "step": 39330 + }, + { + "epoch": 1.9539088109665244, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006436912685010431, + "loss": 0.6137, + "step": 39340 + }, + { + "epoch": 1.9544054832621436, + "grad_norm": 0.1669921875, + "learning_rate": 0.0006436515347173935, + "loss": 0.5939, + "step": 39350 + }, + { + "epoch": 1.954902155557763, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006436118009337439, + "loss": 0.5941, + "step": 39360 + }, + { + "epoch": 1.9553988278533825, + "grad_norm": 0.08544921875, + "learning_rate": 0.0006435720671500944, + "loss": 0.5745, + "step": 39370 + }, + { + "epoch": 1.9558955001490017, + "grad_norm": 0.1591796875, + "learning_rate": 0.0006435323333664448, + "loss": 0.5842, + "step": 39380 + }, + { + "epoch": 1.956392172444621, + "grad_norm": 0.13671875, + "learning_rate": 0.0006434925995827953, + "loss": 0.5685, + "step": 39390 + }, + { + "epoch": 1.9568888447402404, + "grad_norm": 0.12255859375, + "learning_rate": 0.0006434528657991458, + "loss": 0.5829, + "step": 39400 + }, + { + "epoch": 1.9573855170358598, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006434131320154962, + "loss": 0.6016, + "step": 39410 + }, + { + "epoch": 1.957882189331479, + "grad_norm": 0.1162109375, + "learning_rate": 0.0006433733982318467, + "loss": 0.559, + "step": 39420 + }, + { + "epoch": 1.9583788616270983, + "grad_norm": 0.09228515625, + "learning_rate": 0.000643333664448197, + "loss": 0.5896, + "step": 39430 + }, + { + "epoch": 1.9588755339227177, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006432939306645476, + "loss": 0.5995, + "step": 39440 + }, + { + "epoch": 1.9593722062183372, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006432541968808981, + "loss": 0.5651, + "step": 39450 + }, + { + "epoch": 1.9598688785139564, + "grad_norm": 0.1015625, + "learning_rate": 0.0006432144630972484, + "loss": 0.5804, + "step": 39460 + }, + { + "epoch": 1.9603655508095759, + "grad_norm": 0.1796875, + "learning_rate": 0.0006431747293135989, + "loss": 0.5789, + "step": 39470 + }, + { + "epoch": 1.9608622231051953, + "grad_norm": 0.1796875, + "learning_rate": 0.0006431349955299495, + "loss": 0.6015, + "step": 39480 + }, + { + "epoch": 1.9613588954008145, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006430952617462998, + "loss": 0.5799, + "step": 39490 + }, + { + "epoch": 1.9618555676964338, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006430555279626503, + "loss": 0.6069, + "step": 39500 + }, + { + "epoch": 1.9623522399920532, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006430157941790007, + "loss": 0.589, + "step": 39510 + }, + { + "epoch": 1.9628489122876727, + "grad_norm": 0.0908203125, + "learning_rate": 0.0006429760603953511, + "loss": 0.6054, + "step": 39520 + }, + { + "epoch": 1.963345584583292, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006429363266117016, + "loss": 0.5633, + "step": 39530 + }, + { + "epoch": 1.9638422568789113, + "grad_norm": 0.08837890625, + "learning_rate": 0.0006428965928280521, + "loss": 0.5687, + "step": 39540 + }, + { + "epoch": 1.9643389291745308, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006428568590444025, + "loss": 0.6061, + "step": 39550 + }, + { + "epoch": 1.96483560147015, + "grad_norm": 0.109375, + "learning_rate": 0.000642817125260753, + "loss": 0.5849, + "step": 39560 + }, + { + "epoch": 1.9653322737657692, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006427773914771034, + "loss": 0.5754, + "step": 39570 + }, + { + "epoch": 1.9658289460613887, + "grad_norm": 0.1494140625, + "learning_rate": 0.000642737657693454, + "loss": 0.5902, + "step": 39580 + }, + { + "epoch": 1.9663256183570081, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006426979239098044, + "loss": 0.5815, + "step": 39590 + }, + { + "epoch": 1.9668222906526274, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006426581901261548, + "loss": 0.584, + "step": 39600 + }, + { + "epoch": 1.9673189629482466, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006426184563425053, + "loss": 0.5894, + "step": 39610 + }, + { + "epoch": 1.967815635243866, + "grad_norm": 0.1015625, + "learning_rate": 0.0006425787225588557, + "loss": 0.5796, + "step": 39620 + }, + { + "epoch": 1.9683123075394855, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006425389887752061, + "loss": 0.6132, + "step": 39630 + }, + { + "epoch": 1.9688089798351047, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006424992549915567, + "loss": 0.5728, + "step": 39640 + }, + { + "epoch": 1.9693056521307242, + "grad_norm": 0.119140625, + "learning_rate": 0.000642459521207907, + "loss": 0.5985, + "step": 39650 + }, + { + "epoch": 1.9698023244263436, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006424197874242575, + "loss": 0.5922, + "step": 39660 + }, + { + "epoch": 1.9702989967219628, + "grad_norm": 0.10009765625, + "learning_rate": 0.000642380053640608, + "loss": 0.6181, + "step": 39670 + }, + { + "epoch": 1.970795669017582, + "grad_norm": 0.1171875, + "learning_rate": 0.0006423403198569583, + "loss": 0.5975, + "step": 39680 + }, + { + "epoch": 1.9712923413132015, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006423005860733089, + "loss": 0.6012, + "step": 39690 + }, + { + "epoch": 1.971789013608821, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006422608522896593, + "loss": 0.567, + "step": 39700 + }, + { + "epoch": 1.9722856859044402, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006422211185060097, + "loss": 0.5704, + "step": 39710 + }, + { + "epoch": 1.9727823582000596, + "grad_norm": 0.107421875, + "learning_rate": 0.0006421813847223602, + "loss": 0.6102, + "step": 39720 + }, + { + "epoch": 1.973279030495679, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006421416509387106, + "loss": 0.6061, + "step": 39730 + }, + { + "epoch": 1.9737757027912983, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006421019171550612, + "loss": 0.5885, + "step": 39740 + }, + { + "epoch": 1.9742723750869176, + "grad_norm": 0.08642578125, + "learning_rate": 0.0006420621833714116, + "loss": 0.5902, + "step": 39750 + }, + { + "epoch": 1.974769047382537, + "grad_norm": 0.1259765625, + "learning_rate": 0.000642022449587762, + "loss": 0.5694, + "step": 39760 + }, + { + "epoch": 1.9752657196781565, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006419827158041125, + "loss": 0.6038, + "step": 39770 + }, + { + "epoch": 1.9757623919737757, + "grad_norm": 0.16015625, + "learning_rate": 0.0006419429820204629, + "loss": 0.5931, + "step": 39780 + }, + { + "epoch": 1.976259064269395, + "grad_norm": 0.091796875, + "learning_rate": 0.0006419032482368134, + "loss": 0.5722, + "step": 39790 + }, + { + "epoch": 1.9767557365650144, + "grad_norm": 0.09765625, + "learning_rate": 0.0006418635144531639, + "loss": 0.5876, + "step": 39800 + }, + { + "epoch": 1.9772524088606338, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006418237806695143, + "loss": 0.5602, + "step": 39810 + }, + { + "epoch": 1.977749081156253, + "grad_norm": 0.1416015625, + "learning_rate": 0.0006417840468858647, + "loss": 0.5556, + "step": 39820 + }, + { + "epoch": 1.9782457534518725, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006417443131022152, + "loss": 0.5809, + "step": 39830 + }, + { + "epoch": 1.978742425747492, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006417045793185656, + "loss": 0.5805, + "step": 39840 + }, + { + "epoch": 1.9792390980431112, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006416648455349161, + "loss": 0.5848, + "step": 39850 + }, + { + "epoch": 1.9797357703387304, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006416251117512666, + "loss": 0.5994, + "step": 39860 + }, + { + "epoch": 1.9802324426343498, + "grad_norm": 0.1064453125, + "learning_rate": 0.000641585377967617, + "loss": 0.5487, + "step": 39870 + }, + { + "epoch": 1.9807291149299693, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006415456441839674, + "loss": 0.5817, + "step": 39880 + }, + { + "epoch": 1.9812257872255885, + "grad_norm": 0.15625, + "learning_rate": 0.000641505910400318, + "loss": 0.5628, + "step": 39890 + }, + { + "epoch": 1.981722459521208, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006414661766166684, + "loss": 0.5559, + "step": 39900 + }, + { + "epoch": 1.9822191318168274, + "grad_norm": 0.1513671875, + "learning_rate": 0.0006414264428330188, + "loss": 0.579, + "step": 39910 + }, + { + "epoch": 1.9827158041124466, + "grad_norm": 0.150390625, + "learning_rate": 0.0006413867090493692, + "loss": 0.5864, + "step": 39920 + }, + { + "epoch": 1.9832124764080659, + "grad_norm": 0.119140625, + "learning_rate": 0.0006413469752657197, + "loss": 0.5798, + "step": 39930 + }, + { + "epoch": 1.9837091487036853, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006413072414820702, + "loss": 0.589, + "step": 39940 + }, + { + "epoch": 1.9842058209993048, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006412675076984206, + "loss": 0.6079, + "step": 39950 + }, + { + "epoch": 1.984702493294924, + "grad_norm": 0.1630859375, + "learning_rate": 0.0006412277739147711, + "loss": 0.5823, + "step": 39960 + }, + { + "epoch": 1.9851991655905432, + "grad_norm": 0.1181640625, + "learning_rate": 0.0006411880401311215, + "loss": 0.5776, + "step": 39970 + }, + { + "epoch": 1.9856958378861627, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006411483063474719, + "loss": 0.5796, + "step": 39980 + }, + { + "epoch": 1.9861925101817821, + "grad_norm": 0.126953125, + "learning_rate": 0.0006411085725638225, + "loss": 0.5989, + "step": 39990 + }, + { + "epoch": 1.9866891824774013, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006410688387801729, + "loss": 0.5959, + "step": 40000 + }, + { + "epoch": 1.9871858547730208, + "grad_norm": 0.1845703125, + "learning_rate": 0.0006410291049965233, + "loss": 0.5618, + "step": 40010 + }, + { + "epoch": 1.9876825270686402, + "grad_norm": 0.0908203125, + "learning_rate": 0.0006409893712128738, + "loss": 0.5903, + "step": 40020 + }, + { + "epoch": 1.9881791993642595, + "grad_norm": 0.08740234375, + "learning_rate": 0.0006409496374292242, + "loss": 0.5609, + "step": 40030 + }, + { + "epoch": 1.9886758716598787, + "grad_norm": 0.12060546875, + "learning_rate": 0.0006409099036455747, + "loss": 0.5721, + "step": 40040 + }, + { + "epoch": 1.9891725439554981, + "grad_norm": 0.111328125, + "learning_rate": 0.0006408701698619252, + "loss": 0.578, + "step": 40050 + }, + { + "epoch": 1.9896692162511176, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006408304360782756, + "loss": 0.5972, + "step": 40060 + }, + { + "epoch": 1.9901658885467368, + "grad_norm": 0.11181640625, + "learning_rate": 0.000640790702294626, + "loss": 0.5915, + "step": 40070 + }, + { + "epoch": 1.9906625608423563, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006407509685109765, + "loss": 0.597, + "step": 40080 + }, + { + "epoch": 1.9911592331379757, + "grad_norm": 0.173828125, + "learning_rate": 0.000640711234727327, + "loss": 0.5964, + "step": 40090 + }, + { + "epoch": 1.991655905433595, + "grad_norm": 0.12109375, + "learning_rate": 0.0006406715009436774, + "loss": 0.5867, + "step": 40100 + }, + { + "epoch": 1.9921525777292142, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006406317671600278, + "loss": 0.5844, + "step": 40110 + }, + { + "epoch": 1.9926492500248336, + "grad_norm": 0.08935546875, + "learning_rate": 0.0006405920333763783, + "loss": 0.6189, + "step": 40120 + }, + { + "epoch": 1.993145922320453, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006405522995927287, + "loss": 0.5775, + "step": 40130 + }, + { + "epoch": 1.9936425946160723, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006405125658090792, + "loss": 0.5647, + "step": 40140 + }, + { + "epoch": 1.9941392669116915, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006404728320254297, + "loss": 0.5857, + "step": 40150 + }, + { + "epoch": 1.994635939207311, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006404330982417801, + "loss": 0.5828, + "step": 40160 + }, + { + "epoch": 1.9951326115029304, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006403933644581305, + "loss": 0.5727, + "step": 40170 + }, + { + "epoch": 1.9956292837985496, + "grad_norm": 0.1513671875, + "learning_rate": 0.000640353630674481, + "loss": 0.5982, + "step": 40180 + }, + { + "epoch": 1.996125956094169, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006403138968908315, + "loss": 0.5826, + "step": 40190 + }, + { + "epoch": 1.9966226283897885, + "grad_norm": 0.0888671875, + "learning_rate": 0.0006402741631071819, + "loss": 0.5778, + "step": 40200 + }, + { + "epoch": 1.9971193006854078, + "grad_norm": 0.12158203125, + "learning_rate": 0.0006402344293235324, + "loss": 0.5487, + "step": 40210 + }, + { + "epoch": 1.997615972981027, + "grad_norm": 0.203125, + "learning_rate": 0.0006401946955398828, + "loss": 0.5894, + "step": 40220 + }, + { + "epoch": 1.9981126452766464, + "grad_norm": 0.111328125, + "learning_rate": 0.0006401549617562332, + "loss": 0.6065, + "step": 40230 + }, + { + "epoch": 1.998609317572266, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006401152279725838, + "loss": 0.5924, + "step": 40240 + }, + { + "epoch": 1.9991059898678851, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006400754941889342, + "loss": 0.5868, + "step": 40250 + }, + { + "epoch": 1.9996026621635046, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006400357604052846, + "loss": 0.5794, + "step": 40260 + }, + { + "epoch": 2.000099334459124, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006399960266216351, + "loss": 0.5783, + "step": 40270 + }, + { + "epoch": 2.0005960067547433, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006399562928379855, + "loss": 0.5969, + "step": 40280 + }, + { + "epoch": 2.0010926790503625, + "grad_norm": 0.1416015625, + "learning_rate": 0.000639916559054336, + "loss": 0.5477, + "step": 40290 + }, + { + "epoch": 2.0015893513459817, + "grad_norm": 0.08984375, + "learning_rate": 0.0006398768252706864, + "loss": 0.5871, + "step": 40300 + }, + { + "epoch": 2.0020860236416014, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006398370914870369, + "loss": 0.5876, + "step": 40310 + }, + { + "epoch": 2.0025826959372206, + "grad_norm": 0.09033203125, + "learning_rate": 0.0006397973577033874, + "loss": 0.5657, + "step": 40320 + }, + { + "epoch": 2.00307936823284, + "grad_norm": 0.103515625, + "learning_rate": 0.0006397576239197377, + "loss": 0.5332, + "step": 40330 + }, + { + "epoch": 2.0035760405284595, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006397178901360883, + "loss": 0.5807, + "step": 40340 + }, + { + "epoch": 2.0040727128240787, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006396781563524388, + "loss": 0.5744, + "step": 40350 + }, + { + "epoch": 2.004569385119698, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006396384225687891, + "loss": 0.5867, + "step": 40360 + }, + { + "epoch": 2.005066057415317, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006395986887851396, + "loss": 0.583, + "step": 40370 + }, + { + "epoch": 2.005562729710937, + "grad_norm": 0.1591796875, + "learning_rate": 0.00063955895500149, + "loss": 0.5734, + "step": 40380 + }, + { + "epoch": 2.006059402006556, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006395192212178404, + "loss": 0.567, + "step": 40390 + }, + { + "epoch": 2.0065560743021753, + "grad_norm": 0.1865234375, + "learning_rate": 0.000639479487434191, + "loss": 0.5576, + "step": 40400 + }, + { + "epoch": 2.007052746597795, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006394397536505414, + "loss": 0.606, + "step": 40410 + }, + { + "epoch": 2.007549418893414, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006394000198668918, + "loss": 0.5705, + "step": 40420 + }, + { + "epoch": 2.0080460911890334, + "grad_norm": 0.1611328125, + "learning_rate": 0.0006393602860832423, + "loss": 0.5697, + "step": 40430 + }, + { + "epoch": 2.0085427634846527, + "grad_norm": 0.1376953125, + "learning_rate": 0.0006393205522995927, + "loss": 0.5761, + "step": 40440 + }, + { + "epoch": 2.0090394357802723, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006392808185159432, + "loss": 0.5834, + "step": 40450 + }, + { + "epoch": 2.0095361080758916, + "grad_norm": 0.126953125, + "learning_rate": 0.0006392410847322937, + "loss": 0.5585, + "step": 40460 + }, + { + "epoch": 2.010032780371511, + "grad_norm": 0.095703125, + "learning_rate": 0.0006392013509486441, + "loss": 0.5593, + "step": 40470 + }, + { + "epoch": 2.01052945266713, + "grad_norm": 0.099609375, + "learning_rate": 0.0006391616171649946, + "loss": 0.553, + "step": 40480 + }, + { + "epoch": 2.0110261249627497, + "grad_norm": 0.16796875, + "learning_rate": 0.000639121883381345, + "loss": 0.571, + "step": 40490 + }, + { + "epoch": 2.011522797258369, + "grad_norm": 0.1875, + "learning_rate": 0.0006390821495976955, + "loss": 0.5681, + "step": 40500 + }, + { + "epoch": 2.012019469553988, + "grad_norm": 0.1220703125, + "learning_rate": 0.000639042415814046, + "loss": 0.5786, + "step": 40510 + }, + { + "epoch": 2.012516141849608, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006390026820303963, + "loss": 0.5697, + "step": 40520 + }, + { + "epoch": 2.013012814145227, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006389629482467468, + "loss": 0.5741, + "step": 40530 + }, + { + "epoch": 2.0135094864408463, + "grad_norm": 0.095703125, + "learning_rate": 0.0006389232144630974, + "loss": 0.5523, + "step": 40540 + }, + { + "epoch": 2.0140061587364655, + "grad_norm": 0.10546875, + "learning_rate": 0.0006388834806794477, + "loss": 0.5956, + "step": 40550 + }, + { + "epoch": 2.014502831032085, + "grad_norm": 0.125, + "learning_rate": 0.0006388437468957982, + "loss": 0.5698, + "step": 40560 + }, + { + "epoch": 2.0149995033277044, + "grad_norm": 0.126953125, + "learning_rate": 0.0006388040131121486, + "loss": 0.5914, + "step": 40570 + }, + { + "epoch": 2.0154961756233236, + "grad_norm": 0.10205078125, + "learning_rate": 0.000638764279328499, + "loss": 0.5779, + "step": 40580 + }, + { + "epoch": 2.0159928479189433, + "grad_norm": 0.08837890625, + "learning_rate": 0.0006387245455448495, + "loss": 0.5761, + "step": 40590 + }, + { + "epoch": 2.0164895202145625, + "grad_norm": 0.0908203125, + "learning_rate": 0.0006386848117612, + "loss": 0.5418, + "step": 40600 + }, + { + "epoch": 2.0169861925101817, + "grad_norm": 0.12255859375, + "learning_rate": 0.0006386450779775504, + "loss": 0.5969, + "step": 40610 + }, + { + "epoch": 2.017482864805801, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006386053441939009, + "loss": 0.5575, + "step": 40620 + }, + { + "epoch": 2.0179795371014206, + "grad_norm": 0.142578125, + "learning_rate": 0.0006385656104102513, + "loss": 0.5786, + "step": 40630 + }, + { + "epoch": 2.01847620939704, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006385258766266019, + "loss": 0.5354, + "step": 40640 + }, + { + "epoch": 2.018972881692659, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006384861428429523, + "loss": 0.608, + "step": 40650 + }, + { + "epoch": 2.0194695539882783, + "grad_norm": 0.228515625, + "learning_rate": 0.0006384464090593027, + "loss": 0.5686, + "step": 40660 + }, + { + "epoch": 2.019966226283898, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006384066752756532, + "loss": 0.5938, + "step": 40670 + }, + { + "epoch": 2.020462898579517, + "grad_norm": 0.138671875, + "learning_rate": 0.0006383669414920036, + "loss": 0.5727, + "step": 40680 + }, + { + "epoch": 2.0209595708751364, + "grad_norm": 0.1474609375, + "learning_rate": 0.000638327207708354, + "loss": 0.5695, + "step": 40690 + }, + { + "epoch": 2.021456243170756, + "grad_norm": 0.19140625, + "learning_rate": 0.0006382874739247046, + "loss": 0.5923, + "step": 40700 + }, + { + "epoch": 2.0219529154663753, + "grad_norm": 0.099609375, + "learning_rate": 0.0006382477401410549, + "loss": 0.5913, + "step": 40710 + }, + { + "epoch": 2.0224495877619946, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006382080063574054, + "loss": 0.5862, + "step": 40720 + }, + { + "epoch": 2.022946260057614, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006381682725737559, + "loss": 0.6107, + "step": 40730 + }, + { + "epoch": 2.0234429323532335, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006381285387901062, + "loss": 0.6058, + "step": 40740 + }, + { + "epoch": 2.0239396046488527, + "grad_norm": 0.11669921875, + "learning_rate": 0.0006380888050064568, + "loss": 0.5606, + "step": 40750 + }, + { + "epoch": 2.024436276944472, + "grad_norm": 0.08984375, + "learning_rate": 0.0006380490712228073, + "loss": 0.572, + "step": 40760 + }, + { + "epoch": 2.0249329492400916, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006380093374391577, + "loss": 0.5759, + "step": 40770 + }, + { + "epoch": 2.025429621535711, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006379696036555081, + "loss": 0.6079, + "step": 40780 + }, + { + "epoch": 2.02592629383133, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006379298698718585, + "loss": 0.5937, + "step": 40790 + }, + { + "epoch": 2.0264229661269493, + "grad_norm": 0.12890625, + "learning_rate": 0.0006378901360882091, + "loss": 0.5953, + "step": 40800 + }, + { + "epoch": 2.026919638422569, + "grad_norm": 0.12890625, + "learning_rate": 0.0006378504023045595, + "loss": 0.5788, + "step": 40810 + }, + { + "epoch": 2.027416310718188, + "grad_norm": 0.126953125, + "learning_rate": 0.0006378106685209099, + "loss": 0.5634, + "step": 40820 + }, + { + "epoch": 2.0279129830138074, + "grad_norm": 0.259765625, + "learning_rate": 0.0006377709347372604, + "loss": 0.5917, + "step": 40830 + }, + { + "epoch": 2.0284096553094266, + "grad_norm": 0.18359375, + "learning_rate": 0.0006377312009536108, + "loss": 0.5721, + "step": 40840 + }, + { + "epoch": 2.0289063276050463, + "grad_norm": 0.09033203125, + "learning_rate": 0.0006376914671699613, + "loss": 0.5689, + "step": 40850 + }, + { + "epoch": 2.0294029999006655, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006376517333863118, + "loss": 0.5732, + "step": 40860 + }, + { + "epoch": 2.0298996721962848, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006376119996026622, + "loss": 0.5649, + "step": 40870 + }, + { + "epoch": 2.0303963444919044, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006375722658190126, + "loss": 0.5461, + "step": 40880 + }, + { + "epoch": 2.0308930167875237, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006375325320353631, + "loss": 0.5322, + "step": 40890 + }, + { + "epoch": 2.031389689083143, + "grad_norm": 0.1962890625, + "learning_rate": 0.0006374927982517135, + "loss": 0.6145, + "step": 40900 + }, + { + "epoch": 2.031886361378762, + "grad_norm": 0.109375, + "learning_rate": 0.000637453064468064, + "loss": 0.5766, + "step": 40910 + }, + { + "epoch": 2.032383033674382, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006374133306844145, + "loss": 0.563, + "step": 40920 + }, + { + "epoch": 2.032879705970001, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006373735969007649, + "loss": 0.5614, + "step": 40930 + }, + { + "epoch": 2.0333763782656202, + "grad_norm": 0.09765625, + "learning_rate": 0.0006373338631171153, + "loss": 0.5522, + "step": 40940 + }, + { + "epoch": 2.03387305056124, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006372941293334659, + "loss": 0.5635, + "step": 40950 + }, + { + "epoch": 2.034369722856859, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006372543955498163, + "loss": 0.5879, + "step": 40960 + }, + { + "epoch": 2.0348663951524784, + "grad_norm": 0.138671875, + "learning_rate": 0.0006372146617661667, + "loss": 0.5443, + "step": 40970 + }, + { + "epoch": 2.0353630674480976, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006371749279825171, + "loss": 0.5672, + "step": 40980 + }, + { + "epoch": 2.0358597397437173, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006371351941988676, + "loss": 0.5633, + "step": 40990 + }, + { + "epoch": 2.0363564120393365, + "grad_norm": 0.1435546875, + "learning_rate": 0.0006370954604152181, + "loss": 0.588, + "step": 41000 + }, + { + "epoch": 2.0368530843349557, + "grad_norm": 0.115234375, + "learning_rate": 0.0006370557266315685, + "loss": 0.5558, + "step": 41010 + }, + { + "epoch": 2.037349756630575, + "grad_norm": 0.10107421875, + "learning_rate": 0.000637015992847919, + "loss": 0.5913, + "step": 41020 + }, + { + "epoch": 2.0378464289261946, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006369762590642694, + "loss": 0.5494, + "step": 41030 + }, + { + "epoch": 2.038343101221814, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006369365252806198, + "loss": 0.5775, + "step": 41040 + }, + { + "epoch": 2.038839773517433, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006368967914969704, + "loss": 0.5988, + "step": 41050 + }, + { + "epoch": 2.0393364458130527, + "grad_norm": 0.12060546875, + "learning_rate": 0.0006368570577133208, + "loss": 0.5296, + "step": 41060 + }, + { + "epoch": 2.039833118108672, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006368173239296712, + "loss": 0.5901, + "step": 41070 + }, + { + "epoch": 2.040329790404291, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006367775901460217, + "loss": 0.559, + "step": 41080 + }, + { + "epoch": 2.0408264626999104, + "grad_norm": 0.095703125, + "learning_rate": 0.0006367378563623721, + "loss": 0.5933, + "step": 41090 + }, + { + "epoch": 2.04132313499553, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006366981225787226, + "loss": 0.5288, + "step": 41100 + }, + { + "epoch": 2.0418198072911493, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006366583887950731, + "loss": 0.5744, + "step": 41110 + }, + { + "epoch": 2.0423164795867685, + "grad_norm": 0.154296875, + "learning_rate": 0.0006366186550114235, + "loss": 0.5789, + "step": 41120 + }, + { + "epoch": 2.042813151882388, + "grad_norm": 0.1806640625, + "learning_rate": 0.0006365789212277739, + "loss": 0.5652, + "step": 41130 + }, + { + "epoch": 2.0433098241780074, + "grad_norm": 0.146484375, + "learning_rate": 0.0006365391874441244, + "loss": 0.5516, + "step": 41140 + }, + { + "epoch": 2.0438064964736267, + "grad_norm": 0.140625, + "learning_rate": 0.0006364994536604749, + "loss": 0.5759, + "step": 41150 + }, + { + "epoch": 2.044303168769246, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006364597198768253, + "loss": 0.5565, + "step": 41160 + }, + { + "epoch": 2.0447998410648656, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006364199860931757, + "loss": 0.5778, + "step": 41170 + }, + { + "epoch": 2.045296513360485, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006363802523095262, + "loss": 0.5653, + "step": 41180 + }, + { + "epoch": 2.045793185656104, + "grad_norm": 0.119140625, + "learning_rate": 0.0006363405185258766, + "loss": 0.5975, + "step": 41190 + }, + { + "epoch": 2.0462898579517232, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006363007847422271, + "loss": 0.5602, + "step": 41200 + }, + { + "epoch": 2.046786530247343, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006362610509585776, + "loss": 0.5594, + "step": 41210 + }, + { + "epoch": 2.047283202542962, + "grad_norm": 0.1494140625, + "learning_rate": 0.0006362213171749281, + "loss": 0.5369, + "step": 41220 + }, + { + "epoch": 2.0477798748385814, + "grad_norm": 0.162109375, + "learning_rate": 0.0006361815833912784, + "loss": 0.5881, + "step": 41230 + }, + { + "epoch": 2.048276547134201, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006361418496076289, + "loss": 0.5492, + "step": 41240 + }, + { + "epoch": 2.0487732194298203, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006361021158239794, + "loss": 0.5637, + "step": 41250 + }, + { + "epoch": 2.0492698917254395, + "grad_norm": 0.29296875, + "learning_rate": 0.0006360623820403298, + "loss": 0.5624, + "step": 41260 + }, + { + "epoch": 2.0497665640210587, + "grad_norm": 0.1357421875, + "learning_rate": 0.0006360226482566803, + "loss": 0.5966, + "step": 41270 + }, + { + "epoch": 2.0502632363166784, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006359829144730307, + "loss": 0.6032, + "step": 41280 + }, + { + "epoch": 2.0507599086122976, + "grad_norm": 0.169921875, + "learning_rate": 0.0006359431806893811, + "loss": 0.594, + "step": 41290 + }, + { + "epoch": 2.051256580907917, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006359034469057317, + "loss": 0.5837, + "step": 41300 + }, + { + "epoch": 2.0517532532035365, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006358637131220821, + "loss": 0.5417, + "step": 41310 + }, + { + "epoch": 2.0522499254991557, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006358239793384325, + "loss": 0.6023, + "step": 41320 + }, + { + "epoch": 2.052746597794775, + "grad_norm": 0.11767578125, + "learning_rate": 0.000635784245554783, + "loss": 0.5314, + "step": 41330 + }, + { + "epoch": 2.053243270090394, + "grad_norm": 0.140625, + "learning_rate": 0.0006357445117711334, + "loss": 0.5968, + "step": 41340 + }, + { + "epoch": 2.053739942386014, + "grad_norm": 0.1875, + "learning_rate": 0.0006357047779874839, + "loss": 0.558, + "step": 41350 + }, + { + "epoch": 2.054236614681633, + "grad_norm": 0.08837890625, + "learning_rate": 0.0006356650442038344, + "loss": 0.5716, + "step": 41360 + }, + { + "epoch": 2.0547332869772523, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006356253104201848, + "loss": 0.5497, + "step": 41370 + }, + { + "epoch": 2.0552299592728716, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006355855766365353, + "loss": 0.5591, + "step": 41380 + }, + { + "epoch": 2.0557266315684912, + "grad_norm": 0.134765625, + "learning_rate": 0.0006355458428528856, + "loss": 0.5912, + "step": 41390 + }, + { + "epoch": 2.0562233038641105, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006355061090692362, + "loss": 0.5788, + "step": 41400 + }, + { + "epoch": 2.0567199761597297, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006354663752855867, + "loss": 0.5456, + "step": 41410 + }, + { + "epoch": 2.0572166484553494, + "grad_norm": 0.1865234375, + "learning_rate": 0.000635426641501937, + "loss": 0.5691, + "step": 41420 + }, + { + "epoch": 2.0577133207509686, + "grad_norm": 0.1328125, + "learning_rate": 0.0006353869077182875, + "loss": 0.5643, + "step": 41430 + }, + { + "epoch": 2.058209993046588, + "grad_norm": 0.08349609375, + "learning_rate": 0.0006353471739346379, + "loss": 0.5555, + "step": 41440 + }, + { + "epoch": 2.058706665342207, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006353074401509884, + "loss": 0.5833, + "step": 41450 + }, + { + "epoch": 2.0592033376378267, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006352677063673389, + "loss": 0.5867, + "step": 41460 + }, + { + "epoch": 2.059700009933446, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006352279725836893, + "loss": 0.5708, + "step": 41470 + }, + { + "epoch": 2.060196682229065, + "grad_norm": 0.171875, + "learning_rate": 0.0006351882388000397, + "loss": 0.592, + "step": 41480 + }, + { + "epoch": 2.060693354524685, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006351485050163902, + "loss": 0.5738, + "step": 41490 + }, + { + "epoch": 2.061190026820304, + "grad_norm": 0.09375, + "learning_rate": 0.0006351087712327407, + "loss": 0.5879, + "step": 41500 + }, + { + "epoch": 2.0616866991159233, + "grad_norm": 0.126953125, + "learning_rate": 0.0006350690374490912, + "loss": 0.5452, + "step": 41510 + }, + { + "epoch": 2.0621833714115425, + "grad_norm": 0.09765625, + "learning_rate": 0.0006350293036654416, + "loss": 0.5565, + "step": 41520 + }, + { + "epoch": 2.062680043707162, + "grad_norm": 0.11865234375, + "learning_rate": 0.000634989569881792, + "loss": 0.5858, + "step": 41530 + }, + { + "epoch": 2.0631767160027814, + "grad_norm": 0.146484375, + "learning_rate": 0.0006349498360981425, + "loss": 0.5948, + "step": 41540 + }, + { + "epoch": 2.0636733882984006, + "grad_norm": 0.1015625, + "learning_rate": 0.000634910102314493, + "loss": 0.5653, + "step": 41550 + }, + { + "epoch": 2.06417006059402, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006348703685308434, + "loss": 0.5673, + "step": 41560 + }, + { + "epoch": 2.0646667328896395, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006348306347471939, + "loss": 0.5766, + "step": 41570 + }, + { + "epoch": 2.0651634051852588, + "grad_norm": 0.130859375, + "learning_rate": 0.0006347909009635442, + "loss": 0.591, + "step": 41580 + }, + { + "epoch": 2.065660077480878, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006347511671798947, + "loss": 0.5991, + "step": 41590 + }, + { + "epoch": 2.0661567497764977, + "grad_norm": 0.205078125, + "learning_rate": 0.0006347114333962453, + "loss": 0.5761, + "step": 41600 + }, + { + "epoch": 2.066653422072117, + "grad_norm": 0.126953125, + "learning_rate": 0.0006346716996125956, + "loss": 0.5686, + "step": 41610 + }, + { + "epoch": 2.067150094367736, + "grad_norm": 0.091796875, + "learning_rate": 0.0006346319658289461, + "loss": 0.5431, + "step": 41620 + }, + { + "epoch": 2.0676467666633553, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006345922320452966, + "loss": 0.5931, + "step": 41630 + }, + { + "epoch": 2.068143438958975, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006345524982616469, + "loss": 0.5624, + "step": 41640 + }, + { + "epoch": 2.0686401112545942, + "grad_norm": 0.138671875, + "learning_rate": 0.0006345127644779975, + "loss": 0.6122, + "step": 41650 + }, + { + "epoch": 2.0691367835502135, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006344730306943479, + "loss": 0.5739, + "step": 41660 + }, + { + "epoch": 2.0696334558458327, + "grad_norm": 0.189453125, + "learning_rate": 0.0006344332969106984, + "loss": 0.5955, + "step": 41670 + }, + { + "epoch": 2.0701301281414524, + "grad_norm": 0.12255859375, + "learning_rate": 0.0006343935631270488, + "loss": 0.5844, + "step": 41680 + }, + { + "epoch": 2.0706268004370716, + "grad_norm": 0.1943359375, + "learning_rate": 0.0006343538293433992, + "loss": 0.5685, + "step": 41690 + }, + { + "epoch": 2.071123472732691, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006343140955597498, + "loss": 0.5632, + "step": 41700 + }, + { + "epoch": 2.0716201450283105, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006342743617761002, + "loss": 0.5264, + "step": 41710 + }, + { + "epoch": 2.0721168173239297, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006342346279924506, + "loss": 0.5794, + "step": 41720 + }, + { + "epoch": 2.072613489619549, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006341948942088011, + "loss": 0.579, + "step": 41730 + }, + { + "epoch": 2.073110161915168, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006341551604251515, + "loss": 0.6261, + "step": 41740 + }, + { + "epoch": 2.073606834210788, + "grad_norm": 0.1162109375, + "learning_rate": 0.000634115426641502, + "loss": 0.5718, + "step": 41750 + }, + { + "epoch": 2.074103506506407, + "grad_norm": 0.166015625, + "learning_rate": 0.0006340756928578525, + "loss": 0.5665, + "step": 41760 + }, + { + "epoch": 2.0746001788020263, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006340359590742028, + "loss": 0.5789, + "step": 41770 + }, + { + "epoch": 2.075096851097646, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006339962252905533, + "loss": 0.5854, + "step": 41780 + }, + { + "epoch": 2.075593523393265, + "grad_norm": 0.150390625, + "learning_rate": 0.0006339564915069038, + "loss": 0.5833, + "step": 41790 + }, + { + "epoch": 2.0760901956888844, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006339167577232541, + "loss": 0.5486, + "step": 41800 + }, + { + "epoch": 2.0765868679845036, + "grad_norm": 0.134765625, + "learning_rate": 0.0006338770239396047, + "loss": 0.5727, + "step": 41810 + }, + { + "epoch": 2.0770835402801233, + "grad_norm": 0.12060546875, + "learning_rate": 0.0006338372901559552, + "loss": 0.5696, + "step": 41820 + }, + { + "epoch": 2.0775802125757425, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006337975563723056, + "loss": 0.5609, + "step": 41830 + }, + { + "epoch": 2.0780768848713618, + "grad_norm": 0.10888671875, + "learning_rate": 0.000633757822588656, + "loss": 0.5891, + "step": 41840 + }, + { + "epoch": 2.0785735571669814, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006337180888050064, + "loss": 0.5951, + "step": 41850 + }, + { + "epoch": 2.0790702294626007, + "grad_norm": 0.0947265625, + "learning_rate": 0.000633678355021357, + "loss": 0.5704, + "step": 41860 + }, + { + "epoch": 2.07956690175822, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006336386212377074, + "loss": 0.5804, + "step": 41870 + }, + { + "epoch": 2.080063574053839, + "grad_norm": 0.09765625, + "learning_rate": 0.0006335988874540578, + "loss": 0.5651, + "step": 41880 + }, + { + "epoch": 2.080560246349459, + "grad_norm": 0.0869140625, + "learning_rate": 0.0006335591536704083, + "loss": 0.5728, + "step": 41890 + }, + { + "epoch": 2.081056918645078, + "grad_norm": 0.09375, + "learning_rate": 0.0006335194198867587, + "loss": 0.5829, + "step": 41900 + }, + { + "epoch": 2.0815535909406973, + "grad_norm": 0.1181640625, + "learning_rate": 0.0006334796861031092, + "loss": 0.5777, + "step": 41910 + }, + { + "epoch": 2.0820502632363165, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006334399523194597, + "loss": 0.5489, + "step": 41920 + }, + { + "epoch": 2.082546935531936, + "grad_norm": 0.11669921875, + "learning_rate": 0.0006334002185358101, + "loss": 0.5876, + "step": 41930 + }, + { + "epoch": 2.0830436078275554, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006333604847521605, + "loss": 0.5451, + "step": 41940 + }, + { + "epoch": 2.0835402801231746, + "grad_norm": 0.0888671875, + "learning_rate": 0.000633320750968511, + "loss": 0.6034, + "step": 41950 + }, + { + "epoch": 2.0840369524187943, + "grad_norm": 0.1484375, + "learning_rate": 0.0006332810171848615, + "loss": 0.5812, + "step": 41960 + }, + { + "epoch": 2.0845336247144135, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006332412834012119, + "loss": 0.5701, + "step": 41970 + }, + { + "epoch": 2.0850302970100327, + "grad_norm": 0.1474609375, + "learning_rate": 0.0006332015496175624, + "loss": 0.5569, + "step": 41980 + }, + { + "epoch": 2.085526969305652, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006331618158339128, + "loss": 0.6005, + "step": 41990 + }, + { + "epoch": 2.0860236416012716, + "grad_norm": 0.091796875, + "learning_rate": 0.0006331220820502632, + "loss": 0.5837, + "step": 42000 + }, + { + "epoch": 2.086520313896891, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006330823482666138, + "loss": 0.576, + "step": 42010 + }, + { + "epoch": 2.08701698619251, + "grad_norm": 0.146484375, + "learning_rate": 0.0006330426144829642, + "loss": 0.5952, + "step": 42020 + }, + { + "epoch": 2.0875136584881293, + "grad_norm": 0.115234375, + "learning_rate": 0.0006330028806993146, + "loss": 0.6144, + "step": 42030 + }, + { + "epoch": 2.088010330783749, + "grad_norm": 0.1328125, + "learning_rate": 0.000632963146915665, + "loss": 0.5758, + "step": 42040 + }, + { + "epoch": 2.088507003079368, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006329234131320155, + "loss": 0.5794, + "step": 42050 + }, + { + "epoch": 2.0890036753749874, + "grad_norm": 0.09423828125, + "learning_rate": 0.000632883679348366, + "loss": 0.5563, + "step": 42060 + }, + { + "epoch": 2.089500347670607, + "grad_norm": 0.09375, + "learning_rate": 0.0006328439455647164, + "loss": 0.5668, + "step": 42070 + }, + { + "epoch": 2.0899970199662263, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006328042117810669, + "loss": 0.5914, + "step": 42080 + }, + { + "epoch": 2.0904936922618456, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006327644779974173, + "loss": 0.5852, + "step": 42090 + }, + { + "epoch": 2.090990364557465, + "grad_norm": 0.099609375, + "learning_rate": 0.0006327247442137677, + "loss": 0.559, + "step": 42100 + }, + { + "epoch": 2.0914870368530845, + "grad_norm": 0.11328125, + "learning_rate": 0.0006326850104301183, + "loss": 0.5842, + "step": 42110 + }, + { + "epoch": 2.0919837091487037, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006326452766464687, + "loss": 0.5544, + "step": 42120 + }, + { + "epoch": 2.092480381444323, + "grad_norm": 0.1201171875, + "learning_rate": 0.0006326055428628191, + "loss": 0.5769, + "step": 42130 + }, + { + "epoch": 2.0929770537399426, + "grad_norm": 0.099609375, + "learning_rate": 0.0006325658090791696, + "loss": 0.5735, + "step": 42140 + }, + { + "epoch": 2.093473726035562, + "grad_norm": 0.130859375, + "learning_rate": 0.00063252607529552, + "loss": 0.5723, + "step": 42150 + }, + { + "epoch": 2.093970398331181, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006324863415118705, + "loss": 0.5741, + "step": 42160 + }, + { + "epoch": 2.0944670706268003, + "grad_norm": 0.11962890625, + "learning_rate": 0.000632446607728221, + "loss": 0.5562, + "step": 42170 + }, + { + "epoch": 2.09496374292242, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006324068739445714, + "loss": 0.5773, + "step": 42180 + }, + { + "epoch": 2.095460415218039, + "grad_norm": 0.09765625, + "learning_rate": 0.0006323671401609218, + "loss": 0.587, + "step": 42190 + }, + { + "epoch": 2.0959570875136584, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006323274063772723, + "loss": 0.5982, + "step": 42200 + }, + { + "epoch": 2.096453759809278, + "grad_norm": 0.13671875, + "learning_rate": 0.0006322876725936228, + "loss": 0.5631, + "step": 42210 + }, + { + "epoch": 2.0969504321048973, + "grad_norm": 0.103515625, + "learning_rate": 0.0006322479388099732, + "loss": 0.5632, + "step": 42220 + }, + { + "epoch": 2.0974471044005165, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006322082050263236, + "loss": 0.5785, + "step": 42230 + }, + { + "epoch": 2.0979437766961357, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006321684712426741, + "loss": 0.575, + "step": 42240 + }, + { + "epoch": 2.0984404489917554, + "grad_norm": 0.1376953125, + "learning_rate": 0.0006321287374590245, + "loss": 0.5626, + "step": 42250 + }, + { + "epoch": 2.0989371212873746, + "grad_norm": 0.1513671875, + "learning_rate": 0.000632089003675375, + "loss": 0.6085, + "step": 42260 + }, + { + "epoch": 2.099433793582994, + "grad_norm": 0.146484375, + "learning_rate": 0.0006320492698917255, + "loss": 0.5885, + "step": 42270 + }, + { + "epoch": 2.099930465878613, + "grad_norm": 0.1064453125, + "learning_rate": 0.000632009536108076, + "loss": 0.5616, + "step": 42280 + }, + { + "epoch": 2.1004271381742328, + "grad_norm": 0.162109375, + "learning_rate": 0.0006319698023244263, + "loss": 0.5764, + "step": 42290 + }, + { + "epoch": 2.100923810469852, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006319300685407768, + "loss": 0.5754, + "step": 42300 + }, + { + "epoch": 2.101420482765471, + "grad_norm": 0.109375, + "learning_rate": 0.0006318903347571273, + "loss": 0.5673, + "step": 42310 + }, + { + "epoch": 2.101917155061091, + "grad_norm": 0.12353515625, + "learning_rate": 0.0006318506009734777, + "loss": 0.5716, + "step": 42320 + }, + { + "epoch": 2.10241382735671, + "grad_norm": 0.146484375, + "learning_rate": 0.0006318108671898282, + "loss": 0.5877, + "step": 42330 + }, + { + "epoch": 2.1029104996523293, + "grad_norm": 0.1015625, + "learning_rate": 0.0006317711334061786, + "loss": 0.5538, + "step": 42340 + }, + { + "epoch": 2.1034071719479486, + "grad_norm": 0.1181640625, + "learning_rate": 0.000631731399622529, + "loss": 0.5831, + "step": 42350 + }, + { + "epoch": 2.1039038442435682, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006316916658388796, + "loss": 0.5802, + "step": 42360 + }, + { + "epoch": 2.1044005165391875, + "grad_norm": 0.1572265625, + "learning_rate": 0.00063165193205523, + "loss": 0.5466, + "step": 42370 + }, + { + "epoch": 2.1048971888348067, + "grad_norm": 0.11669921875, + "learning_rate": 0.0006316121982715804, + "loss": 0.5852, + "step": 42380 + }, + { + "epoch": 2.105393861130426, + "grad_norm": 0.091796875, + "learning_rate": 0.0006315724644879309, + "loss": 0.5649, + "step": 42390 + }, + { + "epoch": 2.1058905334260456, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006315327307042813, + "loss": 0.608, + "step": 42400 + }, + { + "epoch": 2.106387205721665, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006314929969206319, + "loss": 0.6046, + "step": 42410 + }, + { + "epoch": 2.106883878017284, + "grad_norm": 0.09765625, + "learning_rate": 0.0006314532631369823, + "loss": 0.5915, + "step": 42420 + }, + { + "epoch": 2.1073805503129037, + "grad_norm": 0.12890625, + "learning_rate": 0.0006314135293533327, + "loss": 0.579, + "step": 42430 + }, + { + "epoch": 2.107877222608523, + "grad_norm": 0.08935546875, + "learning_rate": 0.0006313737955696832, + "loss": 0.5746, + "step": 42440 + }, + { + "epoch": 2.108373894904142, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006313340617860335, + "loss": 0.5734, + "step": 42450 + }, + { + "epoch": 2.1088705671997614, + "grad_norm": 0.138671875, + "learning_rate": 0.0006312943280023841, + "loss": 0.5612, + "step": 42460 + }, + { + "epoch": 2.109367239495381, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006312545942187346, + "loss": 0.5787, + "step": 42470 + }, + { + "epoch": 2.1098639117910003, + "grad_norm": 0.1396484375, + "learning_rate": 0.0006312148604350849, + "loss": 0.5601, + "step": 42480 + }, + { + "epoch": 2.1103605840866195, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006311751266514354, + "loss": 0.5422, + "step": 42490 + }, + { + "epoch": 2.110857256382239, + "grad_norm": 0.1630859375, + "learning_rate": 0.0006311353928677858, + "loss": 0.592, + "step": 42500 + }, + { + "epoch": 2.1113539286778584, + "grad_norm": 0.095703125, + "learning_rate": 0.0006310956590841363, + "loss": 0.5701, + "step": 42510 + }, + { + "epoch": 2.1118506009734777, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006310559253004868, + "loss": 0.5951, + "step": 42520 + }, + { + "epoch": 2.112347273269097, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006310161915168372, + "loss": 0.5869, + "step": 42530 + }, + { + "epoch": 2.1128439455647166, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006309764577331876, + "loss": 0.579, + "step": 42540 + }, + { + "epoch": 2.113340617860336, + "grad_norm": 0.091796875, + "learning_rate": 0.0006309367239495381, + "loss": 0.5473, + "step": 42550 + }, + { + "epoch": 2.113837290155955, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006308969901658886, + "loss": 0.6045, + "step": 42560 + }, + { + "epoch": 2.1143339624515747, + "grad_norm": 0.130859375, + "learning_rate": 0.0006308572563822391, + "loss": 0.5556, + "step": 42570 + }, + { + "epoch": 2.114830634747194, + "grad_norm": 0.11669921875, + "learning_rate": 0.0006308175225985895, + "loss": 0.6007, + "step": 42580 + }, + { + "epoch": 2.115327307042813, + "grad_norm": 0.12890625, + "learning_rate": 0.0006307777888149399, + "loss": 0.533, + "step": 42590 + }, + { + "epoch": 2.1158239793384324, + "grad_norm": 0.09375, + "learning_rate": 0.0006307380550312904, + "loss": 0.548, + "step": 42600 + }, + { + "epoch": 2.116320651634052, + "grad_norm": 0.138671875, + "learning_rate": 0.0006306983212476409, + "loss": 0.562, + "step": 42610 + }, + { + "epoch": 2.1168173239296713, + "grad_norm": 0.1396484375, + "learning_rate": 0.0006306585874639913, + "loss": 0.5785, + "step": 42620 + }, + { + "epoch": 2.1173139962252905, + "grad_norm": 0.205078125, + "learning_rate": 0.0006306188536803418, + "loss": 0.5775, + "step": 42630 + }, + { + "epoch": 2.1178106685209097, + "grad_norm": 0.1328125, + "learning_rate": 0.0006305791198966921, + "loss": 0.5739, + "step": 42640 + }, + { + "epoch": 2.1183073408165294, + "grad_norm": 0.11328125, + "learning_rate": 0.0006305393861130426, + "loss": 0.5826, + "step": 42650 + }, + { + "epoch": 2.1188040131121486, + "grad_norm": 0.1328125, + "learning_rate": 0.0006304996523293932, + "loss": 0.5619, + "step": 42660 + }, + { + "epoch": 2.119300685407768, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006304599185457435, + "loss": 0.567, + "step": 42670 + }, + { + "epoch": 2.1197973577033875, + "grad_norm": 0.09765625, + "learning_rate": 0.000630420184762094, + "loss": 0.5476, + "step": 42680 + }, + { + "epoch": 2.1202940299990067, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006303804509784445, + "loss": 0.5572, + "step": 42690 + }, + { + "epoch": 2.120790702294626, + "grad_norm": 0.111328125, + "learning_rate": 0.0006303407171947948, + "loss": 0.5884, + "step": 42700 + }, + { + "epoch": 2.121287374590245, + "grad_norm": 0.216796875, + "learning_rate": 0.0006303009834111454, + "loss": 0.591, + "step": 42710 + }, + { + "epoch": 2.121784046885865, + "grad_norm": 0.158203125, + "learning_rate": 0.0006302612496274958, + "loss": 0.543, + "step": 42720 + }, + { + "epoch": 2.122280719181484, + "grad_norm": 0.111328125, + "learning_rate": 0.0006302215158438463, + "loss": 0.5653, + "step": 42730 + }, + { + "epoch": 2.1227773914771033, + "grad_norm": 0.12158203125, + "learning_rate": 0.0006301817820601967, + "loss": 0.6003, + "step": 42740 + }, + { + "epoch": 2.1232740637727225, + "grad_norm": 0.134765625, + "learning_rate": 0.0006301420482765471, + "loss": 0.58, + "step": 42750 + }, + { + "epoch": 2.123770736068342, + "grad_norm": 0.11328125, + "learning_rate": 0.0006301023144928977, + "loss": 0.5922, + "step": 42760 + }, + { + "epoch": 2.1242674083639614, + "grad_norm": 0.09765625, + "learning_rate": 0.0006300625807092481, + "loss": 0.6117, + "step": 42770 + }, + { + "epoch": 2.1247640806595807, + "grad_norm": 0.099609375, + "learning_rate": 0.0006300228469255985, + "loss": 0.576, + "step": 42780 + }, + { + "epoch": 2.1252607529552003, + "grad_norm": 0.1044921875, + "learning_rate": 0.000629983113141949, + "loss": 0.5785, + "step": 42790 + }, + { + "epoch": 2.1257574252508196, + "grad_norm": 0.1328125, + "learning_rate": 0.0006299433793582994, + "loss": 0.5766, + "step": 42800 + }, + { + "epoch": 2.126254097546439, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006299036455746499, + "loss": 0.5651, + "step": 42810 + }, + { + "epoch": 2.126750769842058, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006298639117910004, + "loss": 0.5559, + "step": 42820 + }, + { + "epoch": 2.1272474421376777, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006298241780073507, + "loss": 0.5722, + "step": 42830 + }, + { + "epoch": 2.127744114433297, + "grad_norm": 0.0869140625, + "learning_rate": 0.0006297844442237012, + "loss": 0.5456, + "step": 42840 + }, + { + "epoch": 2.128240786728916, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006297447104400517, + "loss": 0.5748, + "step": 42850 + }, + { + "epoch": 2.128737459024536, + "grad_norm": 0.1416015625, + "learning_rate": 0.0006297049766564022, + "loss": 0.6068, + "step": 42860 + }, + { + "epoch": 2.129234131320155, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006296652428727526, + "loss": 0.5902, + "step": 42870 + }, + { + "epoch": 2.1297308036157743, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006296255090891031, + "loss": 0.5661, + "step": 42880 + }, + { + "epoch": 2.1302274759113935, + "grad_norm": 0.251953125, + "learning_rate": 0.0006295857753054535, + "loss": 0.5611, + "step": 42890 + }, + { + "epoch": 2.130724148207013, + "grad_norm": 0.1337890625, + "learning_rate": 0.0006295460415218039, + "loss": 0.5864, + "step": 42900 + }, + { + "epoch": 2.1312208205026324, + "grad_norm": 0.181640625, + "learning_rate": 0.0006295063077381543, + "loss": 0.5897, + "step": 42910 + }, + { + "epoch": 2.1317174927982516, + "grad_norm": 0.138671875, + "learning_rate": 0.0006294665739545049, + "loss": 0.5606, + "step": 42920 + }, + { + "epoch": 2.1322141650938713, + "grad_norm": 0.08544921875, + "learning_rate": 0.0006294268401708553, + "loss": 0.5695, + "step": 42930 + }, + { + "epoch": 2.1327108373894905, + "grad_norm": 0.103515625, + "learning_rate": 0.0006293871063872057, + "loss": 0.5523, + "step": 42940 + }, + { + "epoch": 2.1332075096851097, + "grad_norm": 0.11328125, + "learning_rate": 0.0006293473726035562, + "loss": 0.5766, + "step": 42950 + }, + { + "epoch": 2.133704181980729, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006293076388199067, + "loss": 0.5649, + "step": 42960 + }, + { + "epoch": 2.1342008542763486, + "grad_norm": 0.1015625, + "learning_rate": 0.0006292679050362571, + "loss": 0.5848, + "step": 42970 + }, + { + "epoch": 2.134697526571968, + "grad_norm": 0.138671875, + "learning_rate": 0.0006292281712526076, + "loss": 0.5745, + "step": 42980 + }, + { + "epoch": 2.135194198867587, + "grad_norm": 0.1142578125, + "learning_rate": 0.000629188437468958, + "loss": 0.5442, + "step": 42990 + }, + { + "epoch": 2.1356908711632063, + "grad_norm": 0.09765625, + "learning_rate": 0.0006291487036853084, + "loss": 0.5622, + "step": 43000 + }, + { + "epoch": 2.136187543458826, + "grad_norm": 0.1279296875, + "learning_rate": 0.000629108969901659, + "loss": 0.5485, + "step": 43010 + }, + { + "epoch": 2.1366842157544452, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006290692361180094, + "loss": 0.5648, + "step": 43020 + }, + { + "epoch": 2.1371808880500645, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006290295023343598, + "loss": 0.5578, + "step": 43030 + }, + { + "epoch": 2.137677560345684, + "grad_norm": 0.10546875, + "learning_rate": 0.0006289897685507103, + "loss": 0.5757, + "step": 43040 + }, + { + "epoch": 2.1381742326413034, + "grad_norm": 0.10546875, + "learning_rate": 0.0006289500347670607, + "loss": 0.5711, + "step": 43050 + }, + { + "epoch": 2.1386709049369226, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006289103009834111, + "loss": 0.5715, + "step": 43060 + }, + { + "epoch": 2.139167577232542, + "grad_norm": 0.1630859375, + "learning_rate": 0.0006288705671997617, + "loss": 0.5418, + "step": 43070 + }, + { + "epoch": 2.1396642495281615, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006288308334161121, + "loss": 0.5691, + "step": 43080 + }, + { + "epoch": 2.1401609218237807, + "grad_norm": 0.1572265625, + "learning_rate": 0.0006287910996324625, + "loss": 0.5663, + "step": 43090 + }, + { + "epoch": 2.1406575941194, + "grad_norm": 0.099609375, + "learning_rate": 0.0006287513658488129, + "loss": 0.5602, + "step": 43100 + }, + { + "epoch": 2.141154266415019, + "grad_norm": 0.1435546875, + "learning_rate": 0.0006287116320651635, + "loss": 0.5615, + "step": 43110 + }, + { + "epoch": 2.141650938710639, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006286718982815139, + "loss": 0.5414, + "step": 43120 + }, + { + "epoch": 2.142147611006258, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006286321644978643, + "loss": 0.5916, + "step": 43130 + }, + { + "epoch": 2.1426442833018773, + "grad_norm": 0.08740234375, + "learning_rate": 0.0006285924307142148, + "loss": 0.5707, + "step": 43140 + }, + { + "epoch": 2.143140955597497, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006285526969305652, + "loss": 0.5878, + "step": 43150 + }, + { + "epoch": 2.143637627893116, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006285129631469156, + "loss": 0.5597, + "step": 43160 + }, + { + "epoch": 2.1441343001887354, + "grad_norm": 0.109375, + "learning_rate": 0.0006284732293632662, + "loss": 0.5935, + "step": 43170 + }, + { + "epoch": 2.1446309724843546, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006284334955796166, + "loss": 0.5761, + "step": 43180 + }, + { + "epoch": 2.1451276447799743, + "grad_norm": 0.1376953125, + "learning_rate": 0.000628393761795967, + "loss": 0.573, + "step": 43190 + }, + { + "epoch": 2.1456243170755935, + "grad_norm": 0.1787109375, + "learning_rate": 0.0006283540280123175, + "loss": 0.6216, + "step": 43200 + }, + { + "epoch": 2.1461209893712128, + "grad_norm": 0.10400390625, + "learning_rate": 0.000628314294228668, + "loss": 0.5643, + "step": 43210 + }, + { + "epoch": 2.1466176616668324, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006282745604450184, + "loss": 0.5654, + "step": 43220 + }, + { + "epoch": 2.1471143339624517, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006282348266613689, + "loss": 0.586, + "step": 43230 + }, + { + "epoch": 2.147611006258071, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006281950928777193, + "loss": 0.5753, + "step": 43240 + }, + { + "epoch": 2.14810767855369, + "grad_norm": 0.111328125, + "learning_rate": 0.0006281553590940697, + "loss": 0.5611, + "step": 43250 + }, + { + "epoch": 2.14860435084931, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006281156253104203, + "loss": 0.5534, + "step": 43260 + }, + { + "epoch": 2.149101023144929, + "grad_norm": 0.1611328125, + "learning_rate": 0.0006280758915267707, + "loss": 0.5512, + "step": 43270 + }, + { + "epoch": 2.1495976954405482, + "grad_norm": 0.115234375, + "learning_rate": 0.0006280361577431211, + "loss": 0.6057, + "step": 43280 + }, + { + "epoch": 2.150094367736168, + "grad_norm": 0.11669921875, + "learning_rate": 0.0006279964239594716, + "loss": 0.6012, + "step": 43290 + }, + { + "epoch": 2.150591040031787, + "grad_norm": 0.107421875, + "learning_rate": 0.000627956690175822, + "loss": 0.5826, + "step": 43300 + }, + { + "epoch": 2.1510877123274064, + "grad_norm": 0.12890625, + "learning_rate": 0.0006279169563921726, + "loss": 0.5674, + "step": 43310 + }, + { + "epoch": 2.1515843846230256, + "grad_norm": 0.095703125, + "learning_rate": 0.0006278772226085229, + "loss": 0.5706, + "step": 43320 + }, + { + "epoch": 2.1520810569186453, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006278374888248734, + "loss": 0.5826, + "step": 43330 + }, + { + "epoch": 2.1525777292142645, + "grad_norm": 0.1572265625, + "learning_rate": 0.0006277977550412239, + "loss": 0.56, + "step": 43340 + }, + { + "epoch": 2.1530744015098837, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006277580212575742, + "loss": 0.6019, + "step": 43350 + }, + { + "epoch": 2.153571073805503, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006277182874739247, + "loss": 0.5726, + "step": 43360 + }, + { + "epoch": 2.1540677461011226, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006276785536902752, + "loss": 0.5966, + "step": 43370 + }, + { + "epoch": 2.154564418396742, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006276388199066256, + "loss": 0.5368, + "step": 43380 + }, + { + "epoch": 2.155061090692361, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006275990861229761, + "loss": 0.5404, + "step": 43390 + }, + { + "epoch": 2.1555577629879803, + "grad_norm": 0.130859375, + "learning_rate": 0.0006275593523393265, + "loss": 0.5567, + "step": 43400 + }, + { + "epoch": 2.1560544352836, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006275196185556769, + "loss": 0.5695, + "step": 43410 + }, + { + "epoch": 2.156551107579219, + "grad_norm": 0.09765625, + "learning_rate": 0.0006274798847720275, + "loss": 0.5826, + "step": 43420 + }, + { + "epoch": 2.1570477798748384, + "grad_norm": 0.107421875, + "learning_rate": 0.0006274401509883779, + "loss": 0.5905, + "step": 43430 + }, + { + "epoch": 2.157544452170458, + "grad_norm": 0.1015625, + "learning_rate": 0.0006274004172047283, + "loss": 0.572, + "step": 43440 + }, + { + "epoch": 2.1580411244660773, + "grad_norm": 0.142578125, + "learning_rate": 0.0006273606834210788, + "loss": 0.583, + "step": 43450 + }, + { + "epoch": 2.1585377967616965, + "grad_norm": 0.181640625, + "learning_rate": 0.0006273209496374292, + "loss": 0.5851, + "step": 43460 + }, + { + "epoch": 2.1590344690573158, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006272812158537798, + "loss": 0.5956, + "step": 43470 + }, + { + "epoch": 2.1595311413529354, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006272414820701302, + "loss": 0.5583, + "step": 43480 + }, + { + "epoch": 2.1600278136485547, + "grad_norm": 0.111328125, + "learning_rate": 0.0006272017482864806, + "loss": 0.6028, + "step": 43490 + }, + { + "epoch": 2.160524485944174, + "grad_norm": 0.2138671875, + "learning_rate": 0.0006271620145028311, + "loss": 0.5814, + "step": 43500 + }, + { + "epoch": 2.1610211582397936, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006271222807191814, + "loss": 0.553, + "step": 43510 + }, + { + "epoch": 2.161517830535413, + "grad_norm": 0.1337890625, + "learning_rate": 0.000627082546935532, + "loss": 0.5377, + "step": 43520 + }, + { + "epoch": 2.162014502831032, + "grad_norm": 0.1826171875, + "learning_rate": 0.0006270428131518825, + "loss": 0.5696, + "step": 43530 + }, + { + "epoch": 2.1625111751266513, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006270030793682328, + "loss": 0.534, + "step": 43540 + }, + { + "epoch": 2.163007847422271, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006269633455845833, + "loss": 0.5761, + "step": 43550 + }, + { + "epoch": 2.16350451971789, + "grad_norm": 0.12890625, + "learning_rate": 0.0006269236118009338, + "loss": 0.5962, + "step": 43560 + }, + { + "epoch": 2.1640011920135094, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006268838780172842, + "loss": 0.5698, + "step": 43570 + }, + { + "epoch": 2.164497864309129, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006268441442336347, + "loss": 0.5536, + "step": 43580 + }, + { + "epoch": 2.1649945366047483, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006268044104499851, + "loss": 0.5618, + "step": 43590 + }, + { + "epoch": 2.1654912089003675, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006267646766663356, + "loss": 0.5572, + "step": 43600 + }, + { + "epoch": 2.1659878811959867, + "grad_norm": 0.115234375, + "learning_rate": 0.000626724942882686, + "loss": 0.5328, + "step": 43610 + }, + { + "epoch": 2.1664845534916064, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006266852090990365, + "loss": 0.5961, + "step": 43620 + }, + { + "epoch": 2.1669812257872256, + "grad_norm": 0.1025390625, + "learning_rate": 0.000626645475315387, + "loss": 0.5593, + "step": 43630 + }, + { + "epoch": 2.167477898082845, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006266057415317374, + "loss": 0.5712, + "step": 43640 + }, + { + "epoch": 2.167974570378464, + "grad_norm": 0.1416015625, + "learning_rate": 0.0006265660077480878, + "loss": 0.5823, + "step": 43650 + }, + { + "epoch": 2.1684712426740838, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006265262739644383, + "loss": 0.5414, + "step": 43660 + }, + { + "epoch": 2.168967914969703, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006264865401807888, + "loss": 0.5812, + "step": 43670 + }, + { + "epoch": 2.169464587265322, + "grad_norm": 0.19140625, + "learning_rate": 0.0006264468063971392, + "loss": 0.5957, + "step": 43680 + }, + { + "epoch": 2.169961259560942, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006264070726134897, + "loss": 0.5713, + "step": 43690 + }, + { + "epoch": 2.170457931856561, + "grad_norm": 0.11328125, + "learning_rate": 0.00062636733882984, + "loss": 0.5753, + "step": 43700 + }, + { + "epoch": 2.1709546041521803, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006263276050461905, + "loss": 0.5447, + "step": 43710 + }, + { + "epoch": 2.1714512764477996, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006262878712625411, + "loss": 0.5614, + "step": 43720 + }, + { + "epoch": 2.1719479487434192, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006262481374788914, + "loss": 0.5699, + "step": 43730 + }, + { + "epoch": 2.1724446210390385, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006262084036952419, + "loss": 0.5488, + "step": 43740 + }, + { + "epoch": 2.1729412933346577, + "grad_norm": 0.140625, + "learning_rate": 0.0006261686699115924, + "loss": 0.5825, + "step": 43750 + }, + { + "epoch": 2.173437965630277, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006261289361279428, + "loss": 0.5778, + "step": 43760 + }, + { + "epoch": 2.1739346379258966, + "grad_norm": 0.1494140625, + "learning_rate": 0.0006260892023442933, + "loss": 0.5551, + "step": 43770 + }, + { + "epoch": 2.174431310221516, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006260494685606437, + "loss": 0.5708, + "step": 43780 + }, + { + "epoch": 2.174927982517135, + "grad_norm": 0.08837890625, + "learning_rate": 0.0006260097347769942, + "loss": 0.5614, + "step": 43790 + }, + { + "epoch": 2.1754246548127547, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006259700009933446, + "loss": 0.5647, + "step": 43800 + }, + { + "epoch": 2.175921327108374, + "grad_norm": 0.11279296875, + "learning_rate": 0.000625930267209695, + "loss": 0.5616, + "step": 43810 + }, + { + "epoch": 2.176417999403993, + "grad_norm": 0.09765625, + "learning_rate": 0.0006258905334260456, + "loss": 0.5607, + "step": 43820 + }, + { + "epoch": 2.1769146716996124, + "grad_norm": 0.09765625, + "learning_rate": 0.000625850799642396, + "loss": 0.568, + "step": 43830 + }, + { + "epoch": 2.177411343995232, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006258110658587464, + "loss": 0.5598, + "step": 43840 + }, + { + "epoch": 2.1779080162908513, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006257713320750969, + "loss": 0.5484, + "step": 43850 + }, + { + "epoch": 2.1784046885864705, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006257315982914473, + "loss": 0.572, + "step": 43860 + }, + { + "epoch": 2.17890136088209, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006256918645077978, + "loss": 0.5755, + "step": 43870 + }, + { + "epoch": 2.1793980331777094, + "grad_norm": 0.099609375, + "learning_rate": 0.0006256521307241483, + "loss": 0.5544, + "step": 43880 + }, + { + "epoch": 2.1798947054733286, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006256123969404987, + "loss": 0.5519, + "step": 43890 + }, + { + "epoch": 2.180391377768948, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006255726631568491, + "loss": 0.5935, + "step": 43900 + }, + { + "epoch": 2.1808880500645675, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006255329293731996, + "loss": 0.562, + "step": 43910 + }, + { + "epoch": 2.1813847223601868, + "grad_norm": 0.1455078125, + "learning_rate": 0.0006254931955895501, + "loss": 0.562, + "step": 43920 + }, + { + "epoch": 2.181881394655806, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006254534618059005, + "loss": 0.5664, + "step": 43930 + }, + { + "epoch": 2.1823780669514257, + "grad_norm": 0.10498046875, + "learning_rate": 0.000625413728022251, + "loss": 0.5742, + "step": 43940 + }, + { + "epoch": 2.182874739247045, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006253739942386014, + "loss": 0.5476, + "step": 43950 + }, + { + "epoch": 2.183371411542664, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006253342604549518, + "loss": 0.6065, + "step": 43960 + }, + { + "epoch": 2.1838680838382833, + "grad_norm": 0.12451171875, + "learning_rate": 0.0006252945266713023, + "loss": 0.5835, + "step": 43970 + }, + { + "epoch": 2.184364756133903, + "grad_norm": 0.123046875, + "learning_rate": 0.0006252547928876528, + "loss": 0.5412, + "step": 43980 + }, + { + "epoch": 2.1848614284295222, + "grad_norm": 0.185546875, + "learning_rate": 0.0006252150591040032, + "loss": 0.5947, + "step": 43990 + }, + { + "epoch": 2.1853581007251415, + "grad_norm": 0.130859375, + "learning_rate": 0.0006251753253203536, + "loss": 0.57, + "step": 44000 + }, + { + "epoch": 2.1858547730207607, + "grad_norm": 0.13671875, + "learning_rate": 0.0006251355915367041, + "loss": 0.5734, + "step": 44010 + }, + { + "epoch": 2.1863514453163804, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006250958577530546, + "loss": 0.5586, + "step": 44020 + }, + { + "epoch": 2.1868481176119996, + "grad_norm": 0.09521484375, + "learning_rate": 0.000625056123969405, + "loss": 0.5857, + "step": 44030 + }, + { + "epoch": 2.187344789907619, + "grad_norm": 0.09033203125, + "learning_rate": 0.0006250163901857555, + "loss": 0.5902, + "step": 44040 + }, + { + "epoch": 2.1878414622032385, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006249766564021059, + "loss": 0.5731, + "step": 44050 + }, + { + "epoch": 2.1883381344988577, + "grad_norm": 0.111328125, + "learning_rate": 0.0006249369226184563, + "loss": 0.5886, + "step": 44060 + }, + { + "epoch": 2.188834806794477, + "grad_norm": 0.1337890625, + "learning_rate": 0.0006248971888348069, + "loss": 0.5806, + "step": 44070 + }, + { + "epoch": 2.189331479090096, + "grad_norm": 0.12890625, + "learning_rate": 0.0006248574550511573, + "loss": 0.5795, + "step": 44080 + }, + { + "epoch": 2.189828151385716, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006248177212675077, + "loss": 0.5496, + "step": 44090 + }, + { + "epoch": 2.190324823681335, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006247779874838582, + "loss": 0.551, + "step": 44100 + }, + { + "epoch": 2.1908214959769543, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006247382537002086, + "loss": 0.5759, + "step": 44110 + }, + { + "epoch": 2.1913181682725735, + "grad_norm": 0.142578125, + "learning_rate": 0.000624698519916559, + "loss": 0.5475, + "step": 44120 + }, + { + "epoch": 2.191814840568193, + "grad_norm": 0.12109375, + "learning_rate": 0.0006246587861329096, + "loss": 0.542, + "step": 44130 + }, + { + "epoch": 2.1923115128638124, + "grad_norm": 0.10888671875, + "learning_rate": 0.00062461905234926, + "loss": 0.5988, + "step": 44140 + }, + { + "epoch": 2.1928081851594317, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006245793185656104, + "loss": 0.583, + "step": 44150 + }, + { + "epoch": 2.1933048574550513, + "grad_norm": 0.087890625, + "learning_rate": 0.0006245395847819609, + "loss": 0.5564, + "step": 44160 + }, + { + "epoch": 2.1938015297506706, + "grad_norm": 0.11328125, + "learning_rate": 0.0006244998509983114, + "loss": 0.5518, + "step": 44170 + }, + { + "epoch": 2.19429820204629, + "grad_norm": 0.111328125, + "learning_rate": 0.0006244601172146618, + "loss": 0.523, + "step": 44180 + }, + { + "epoch": 2.194794874341909, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006244203834310122, + "loss": 0.5687, + "step": 44190 + }, + { + "epoch": 2.1952915466375287, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006243806496473627, + "loss": 0.5617, + "step": 44200 + }, + { + "epoch": 2.195788218933148, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006243409158637132, + "loss": 0.5695, + "step": 44210 + }, + { + "epoch": 2.196284891228767, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006243011820800635, + "loss": 0.5669, + "step": 44220 + }, + { + "epoch": 2.196781563524387, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006242614482964141, + "loss": 0.5852, + "step": 44230 + }, + { + "epoch": 2.197278235820006, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006242217145127645, + "loss": 0.5893, + "step": 44240 + }, + { + "epoch": 2.1977749081156253, + "grad_norm": 0.1455078125, + "learning_rate": 0.0006241819807291149, + "loss": 0.5775, + "step": 44250 + }, + { + "epoch": 2.1982715804112445, + "grad_norm": 0.1015625, + "learning_rate": 0.0006241422469454654, + "loss": 0.5627, + "step": 44260 + }, + { + "epoch": 2.198768252706864, + "grad_norm": 0.1513671875, + "learning_rate": 0.0006241025131618159, + "loss": 0.5642, + "step": 44270 + }, + { + "epoch": 2.1992649250024834, + "grad_norm": 0.130859375, + "learning_rate": 0.0006240627793781663, + "loss": 0.54, + "step": 44280 + }, + { + "epoch": 2.1997615972981026, + "grad_norm": 0.109375, + "learning_rate": 0.0006240230455945168, + "loss": 0.5426, + "step": 44290 + }, + { + "epoch": 2.2002582695937223, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006239833118108672, + "loss": 0.5992, + "step": 44300 + }, + { + "epoch": 2.2007549418893415, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006239435780272176, + "loss": 0.5477, + "step": 44310 + }, + { + "epoch": 2.2012516141849607, + "grad_norm": 0.119140625, + "learning_rate": 0.0006239038442435682, + "loss": 0.5361, + "step": 44320 + }, + { + "epoch": 2.20174828648058, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006238641104599186, + "loss": 0.5513, + "step": 44330 + }, + { + "epoch": 2.2022449587761996, + "grad_norm": 0.1015625, + "learning_rate": 0.000623824376676269, + "loss": 0.5877, + "step": 44340 + }, + { + "epoch": 2.202741631071819, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006237846428926195, + "loss": 0.5765, + "step": 44350 + }, + { + "epoch": 2.203238303367438, + "grad_norm": 0.09375, + "learning_rate": 0.0006237449091089699, + "loss": 0.5669, + "step": 44360 + }, + { + "epoch": 2.2037349756630573, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006237051753253205, + "loss": 0.5862, + "step": 44370 + }, + { + "epoch": 2.204231647958677, + "grad_norm": 0.150390625, + "learning_rate": 0.0006236654415416708, + "loss": 0.5514, + "step": 44380 + }, + { + "epoch": 2.204728320254296, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006236257077580213, + "loss": 0.572, + "step": 44390 + }, + { + "epoch": 2.2052249925499154, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006235859739743718, + "loss": 0.5707, + "step": 44400 + }, + { + "epoch": 2.205721664845535, + "grad_norm": 0.1728515625, + "learning_rate": 0.0006235462401907221, + "loss": 0.5504, + "step": 44410 + }, + { + "epoch": 2.2062183371411543, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006235065064070727, + "loss": 0.5581, + "step": 44420 + }, + { + "epoch": 2.2067150094367736, + "grad_norm": 0.130859375, + "learning_rate": 0.0006234667726234232, + "loss": 0.5712, + "step": 44430 + }, + { + "epoch": 2.207211681732393, + "grad_norm": 0.125, + "learning_rate": 0.0006234270388397735, + "loss": 0.5782, + "step": 44440 + }, + { + "epoch": 2.2077083540280125, + "grad_norm": 0.1259765625, + "learning_rate": 0.000623387305056124, + "loss": 0.593, + "step": 44450 + }, + { + "epoch": 2.2082050263236317, + "grad_norm": 0.1181640625, + "learning_rate": 0.0006233475712724744, + "loss": 0.5679, + "step": 44460 + }, + { + "epoch": 2.208701698619251, + "grad_norm": 0.17578125, + "learning_rate": 0.0006233078374888248, + "loss": 0.5535, + "step": 44470 + }, + { + "epoch": 2.20919837091487, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006232681037051754, + "loss": 0.5747, + "step": 44480 + }, + { + "epoch": 2.20969504321049, + "grad_norm": 0.125, + "learning_rate": 0.0006232283699215258, + "loss": 0.5539, + "step": 44490 + }, + { + "epoch": 2.210191715506109, + "grad_norm": 0.1640625, + "learning_rate": 0.0006231886361378763, + "loss": 0.5745, + "step": 44500 + }, + { + "epoch": 2.2106883878017283, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006231489023542267, + "loss": 0.5689, + "step": 44510 + }, + { + "epoch": 2.211185060097348, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006231091685705771, + "loss": 0.5573, + "step": 44520 + }, + { + "epoch": 2.211681732392967, + "grad_norm": 0.1552734375, + "learning_rate": 0.0006230694347869277, + "loss": 0.5812, + "step": 44530 + }, + { + "epoch": 2.2121784046885864, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006230297010032781, + "loss": 0.5662, + "step": 44540 + }, + { + "epoch": 2.2126750769842056, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006229899672196285, + "loss": 0.5748, + "step": 44550 + }, + { + "epoch": 2.2131717492798253, + "grad_norm": 0.1259765625, + "learning_rate": 0.000622950233435979, + "loss": 0.5514, + "step": 44560 + }, + { + "epoch": 2.2136684215754445, + "grad_norm": 0.2041015625, + "learning_rate": 0.0006229104996523293, + "loss": 0.5866, + "step": 44570 + }, + { + "epoch": 2.2141650938710637, + "grad_norm": 0.126953125, + "learning_rate": 0.0006228707658686799, + "loss": 0.598, + "step": 44580 + }, + { + "epoch": 2.2146617661666834, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006228310320850304, + "loss": 0.5792, + "step": 44590 + }, + { + "epoch": 2.2151584384623026, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006227912983013807, + "loss": 0.572, + "step": 44600 + }, + { + "epoch": 2.215655110757922, + "grad_norm": 0.142578125, + "learning_rate": 0.0006227515645177312, + "loss": 0.5688, + "step": 44610 + }, + { + "epoch": 2.216151783053541, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006227118307340818, + "loss": 0.5636, + "step": 44620 + }, + { + "epoch": 2.2166484553491608, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006226720969504321, + "loss": 0.5957, + "step": 44630 + }, + { + "epoch": 2.21714512764478, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006226323631667826, + "loss": 0.5581, + "step": 44640 + }, + { + "epoch": 2.2176417999403992, + "grad_norm": 0.125, + "learning_rate": 0.000622592629383133, + "loss": 0.5876, + "step": 44650 + }, + { + "epoch": 2.218138472236019, + "grad_norm": 0.123046875, + "learning_rate": 0.0006225528955994835, + "loss": 0.5619, + "step": 44660 + }, + { + "epoch": 2.218635144531638, + "grad_norm": 0.1005859375, + "learning_rate": 0.000622513161815834, + "loss": 0.567, + "step": 44670 + }, + { + "epoch": 2.2191318168272574, + "grad_norm": 0.1171875, + "learning_rate": 0.0006224734280321844, + "loss": 0.5615, + "step": 44680 + }, + { + "epoch": 2.2196284891228766, + "grad_norm": 0.138671875, + "learning_rate": 0.0006224336942485349, + "loss": 0.5507, + "step": 44690 + }, + { + "epoch": 2.2201251614184963, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006223939604648853, + "loss": 0.5586, + "step": 44700 + }, + { + "epoch": 2.2206218337141155, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006223542266812357, + "loss": 0.5518, + "step": 44710 + }, + { + "epoch": 2.2211185060097347, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006223144928975862, + "loss": 0.5889, + "step": 44720 + }, + { + "epoch": 2.221615178305354, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006222747591139367, + "loss": 0.5527, + "step": 44730 + }, + { + "epoch": 2.2221118506009736, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006222350253302871, + "loss": 0.5946, + "step": 44740 + }, + { + "epoch": 2.222608522896593, + "grad_norm": 0.09375, + "learning_rate": 0.0006221952915466376, + "loss": 0.5737, + "step": 44750 + }, + { + "epoch": 2.223105195192212, + "grad_norm": 0.091796875, + "learning_rate": 0.000622155557762988, + "loss": 0.5618, + "step": 44760 + }, + { + "epoch": 2.2236018674878317, + "grad_norm": 0.1630859375, + "learning_rate": 0.0006221158239793384, + "loss": 0.5826, + "step": 44770 + }, + { + "epoch": 2.224098539783451, + "grad_norm": 0.115234375, + "learning_rate": 0.000622076090195689, + "loss": 0.5718, + "step": 44780 + }, + { + "epoch": 2.22459521207907, + "grad_norm": 0.150390625, + "learning_rate": 0.0006220363564120393, + "loss": 0.5569, + "step": 44790 + }, + { + "epoch": 2.2250918843746894, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006219966226283898, + "loss": 0.5731, + "step": 44800 + }, + { + "epoch": 2.225588556670309, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006219568888447403, + "loss": 0.5907, + "step": 44810 + }, + { + "epoch": 2.2260852289659283, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006219171550610907, + "loss": 0.5298, + "step": 44820 + }, + { + "epoch": 2.2265819012615475, + "grad_norm": 0.10546875, + "learning_rate": 0.0006218774212774412, + "loss": 0.5683, + "step": 44830 + }, + { + "epoch": 2.2270785735571668, + "grad_norm": 0.138671875, + "learning_rate": 0.0006218376874937916, + "loss": 0.5793, + "step": 44840 + }, + { + "epoch": 2.2275752458527864, + "grad_norm": 0.08935546875, + "learning_rate": 0.0006217979537101421, + "loss": 0.5538, + "step": 44850 + }, + { + "epoch": 2.2280719181484057, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006217582199264925, + "loss": 0.5721, + "step": 44860 + }, + { + "epoch": 2.228568590444025, + "grad_norm": 0.119140625, + "learning_rate": 0.0006217184861428429, + "loss": 0.5709, + "step": 44870 + }, + { + "epoch": 2.2290652627396446, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006216787523591935, + "loss": 0.5901, + "step": 44880 + }, + { + "epoch": 2.229561935035264, + "grad_norm": 0.1328125, + "learning_rate": 0.0006216390185755439, + "loss": 0.5396, + "step": 44890 + }, + { + "epoch": 2.230058607330883, + "grad_norm": 0.1015625, + "learning_rate": 0.0006215992847918943, + "loss": 0.5628, + "step": 44900 + }, + { + "epoch": 2.2305552796265022, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006215595510082448, + "loss": 0.5887, + "step": 44910 + }, + { + "epoch": 2.231051951922122, + "grad_norm": 0.099609375, + "learning_rate": 0.0006215198172245952, + "loss": 0.5642, + "step": 44920 + }, + { + "epoch": 2.231548624217741, + "grad_norm": 0.11328125, + "learning_rate": 0.0006214800834409457, + "loss": 0.5751, + "step": 44930 + }, + { + "epoch": 2.2320452965133604, + "grad_norm": 0.1181640625, + "learning_rate": 0.0006214403496572962, + "loss": 0.572, + "step": 44940 + }, + { + "epoch": 2.23254196880898, + "grad_norm": 0.12890625, + "learning_rate": 0.0006214006158736466, + "loss": 0.5656, + "step": 44950 + }, + { + "epoch": 2.2330386411045993, + "grad_norm": 0.1552734375, + "learning_rate": 0.000621360882089997, + "loss": 0.5807, + "step": 44960 + }, + { + "epoch": 2.2335353134002185, + "grad_norm": 0.13671875, + "learning_rate": 0.0006213211483063475, + "loss": 0.5496, + "step": 44970 + }, + { + "epoch": 2.2340319856958377, + "grad_norm": 0.1396484375, + "learning_rate": 0.000621281414522698, + "loss": 0.5722, + "step": 44980 + }, + { + "epoch": 2.2345286579914574, + "grad_norm": 0.11328125, + "learning_rate": 0.0006212416807390484, + "loss": 0.5721, + "step": 44990 + }, + { + "epoch": 2.2350253302870766, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006212019469553989, + "loss": 0.5698, + "step": 45000 + }, + { + "epoch": 2.235522002582696, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006211622131717493, + "loss": 0.5797, + "step": 45010 + }, + { + "epoch": 2.2360186748783155, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006211224793880997, + "loss": 0.5671, + "step": 45020 + }, + { + "epoch": 2.2365153471739347, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006210827456044503, + "loss": 0.5678, + "step": 45030 + }, + { + "epoch": 2.237012019469554, + "grad_norm": 0.13671875, + "learning_rate": 0.0006210430118208007, + "loss": 0.5494, + "step": 45040 + }, + { + "epoch": 2.237508691765173, + "grad_norm": 0.12890625, + "learning_rate": 0.0006210032780371511, + "loss": 0.5662, + "step": 45050 + }, + { + "epoch": 2.238005364060793, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006209635442535015, + "loss": 0.5825, + "step": 45060 + }, + { + "epoch": 2.238502036356412, + "grad_norm": 0.1396484375, + "learning_rate": 0.000620923810469852, + "loss": 0.5997, + "step": 45070 + }, + { + "epoch": 2.2389987086520313, + "grad_norm": 0.09375, + "learning_rate": 0.0006208840766862025, + "loss": 0.5766, + "step": 45080 + }, + { + "epoch": 2.2394953809476505, + "grad_norm": 0.162109375, + "learning_rate": 0.0006208443429025529, + "loss": 0.5571, + "step": 45090 + }, + { + "epoch": 2.23999205324327, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006208046091189034, + "loss": 0.5639, + "step": 45100 + }, + { + "epoch": 2.2404887255388894, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006207648753352538, + "loss": 0.5667, + "step": 45110 + }, + { + "epoch": 2.2409853978345087, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006207251415516042, + "loss": 0.5826, + "step": 45120 + }, + { + "epoch": 2.2414820701301283, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006206854077679548, + "loss": 0.5695, + "step": 45130 + }, + { + "epoch": 2.2419787424257476, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006206456739843052, + "loss": 0.5393, + "step": 45140 + }, + { + "epoch": 2.242475414721367, + "grad_norm": 0.099609375, + "learning_rate": 0.0006206059402006556, + "loss": 0.5668, + "step": 45150 + }, + { + "epoch": 2.242972087016986, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006205662064170061, + "loss": 0.5789, + "step": 45160 + }, + { + "epoch": 2.2434687593126057, + "grad_norm": 0.1162109375, + "learning_rate": 0.0006205264726333565, + "loss": 0.5663, + "step": 45170 + }, + { + "epoch": 2.243965431608225, + "grad_norm": 0.09814453125, + "learning_rate": 0.000620486738849707, + "loss": 0.5535, + "step": 45180 + }, + { + "epoch": 2.244462103903844, + "grad_norm": 0.1533203125, + "learning_rate": 0.0006204470050660575, + "loss": 0.6223, + "step": 45190 + }, + { + "epoch": 2.2449587761994634, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006204072712824079, + "loss": 0.5743, + "step": 45200 + }, + { + "epoch": 2.245455448495083, + "grad_norm": 0.095703125, + "learning_rate": 0.0006203675374987583, + "loss": 0.57, + "step": 45210 + }, + { + "epoch": 2.2459521207907023, + "grad_norm": 0.1591796875, + "learning_rate": 0.0006203278037151088, + "loss": 0.5516, + "step": 45220 + }, + { + "epoch": 2.2464487930863215, + "grad_norm": 0.15625, + "learning_rate": 0.0006202880699314593, + "loss": 0.5767, + "step": 45230 + }, + { + "epoch": 2.246945465381941, + "grad_norm": 0.130859375, + "learning_rate": 0.0006202483361478097, + "loss": 0.558, + "step": 45240 + }, + { + "epoch": 2.2474421376775604, + "grad_norm": 0.08935546875, + "learning_rate": 0.0006202086023641601, + "loss": 0.5542, + "step": 45250 + }, + { + "epoch": 2.2479388099731796, + "grad_norm": 0.109375, + "learning_rate": 0.0006201688685805106, + "loss": 0.5535, + "step": 45260 + }, + { + "epoch": 2.248435482268799, + "grad_norm": 0.140625, + "learning_rate": 0.0006201291347968611, + "loss": 0.5627, + "step": 45270 + }, + { + "epoch": 2.2489321545644185, + "grad_norm": 0.19921875, + "learning_rate": 0.0006200894010132115, + "loss": 0.5727, + "step": 45280 + }, + { + "epoch": 2.2494288268600378, + "grad_norm": 0.11962890625, + "learning_rate": 0.000620049667229562, + "loss": 0.5784, + "step": 45290 + }, + { + "epoch": 2.249925499155657, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006200099334459124, + "loss": 0.5787, + "step": 45300 + }, + { + "epoch": 2.2504221714512767, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006199701996622628, + "loss": 0.5643, + "step": 45310 + }, + { + "epoch": 2.250918843746896, + "grad_norm": 0.154296875, + "learning_rate": 0.0006199304658786133, + "loss": 0.5724, + "step": 45320 + }, + { + "epoch": 2.251415516042515, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006198907320949638, + "loss": 0.5998, + "step": 45330 + }, + { + "epoch": 2.2519121883381343, + "grad_norm": 0.09619140625, + "learning_rate": 0.0006198509983113142, + "loss": 0.5486, + "step": 45340 + }, + { + "epoch": 2.252408860633754, + "grad_norm": 0.130859375, + "learning_rate": 0.0006198112645276647, + "loss": 0.559, + "step": 45350 + }, + { + "epoch": 2.2529055329293732, + "grad_norm": 0.095703125, + "learning_rate": 0.0006197715307440151, + "loss": 0.5696, + "step": 45360 + }, + { + "epoch": 2.2534022052249925, + "grad_norm": 0.095703125, + "learning_rate": 0.0006197317969603655, + "loss": 0.5941, + "step": 45370 + }, + { + "epoch": 2.253898877520612, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006196920631767161, + "loss": 0.5569, + "step": 45380 + }, + { + "epoch": 2.2543955498162314, + "grad_norm": 0.123046875, + "learning_rate": 0.0006196523293930665, + "loss": 0.5771, + "step": 45390 + }, + { + "epoch": 2.2548922221118506, + "grad_norm": 0.0986328125, + "learning_rate": 0.000619612595609417, + "loss": 0.5813, + "step": 45400 + }, + { + "epoch": 2.25538889440747, + "grad_norm": 0.1201171875, + "learning_rate": 0.0006195728618257674, + "loss": 0.5656, + "step": 45410 + }, + { + "epoch": 2.2558855667030895, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006195331280421178, + "loss": 0.5561, + "step": 45420 + }, + { + "epoch": 2.2563822389987087, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006194933942584684, + "loss": 0.5836, + "step": 45430 + }, + { + "epoch": 2.256878911294328, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006194536604748187, + "loss": 0.5693, + "step": 45440 + }, + { + "epoch": 2.257375583589947, + "grad_norm": 0.22265625, + "learning_rate": 0.0006194139266911692, + "loss": 0.5708, + "step": 45450 + }, + { + "epoch": 2.257872255885567, + "grad_norm": 0.109375, + "learning_rate": 0.0006193741929075197, + "loss": 0.5635, + "step": 45460 + }, + { + "epoch": 2.258368928181186, + "grad_norm": 0.09130859375, + "learning_rate": 0.00061933445912387, + "loss": 0.5647, + "step": 45470 + }, + { + "epoch": 2.2588656004768053, + "grad_norm": 0.138671875, + "learning_rate": 0.0006192947253402206, + "loss": 0.5577, + "step": 45480 + }, + { + "epoch": 2.2593622727724245, + "grad_norm": 0.1181640625, + "learning_rate": 0.0006192549915565711, + "loss": 0.5508, + "step": 45490 + }, + { + "epoch": 2.259858945068044, + "grad_norm": 0.095703125, + "learning_rate": 0.0006192152577729214, + "loss": 0.5782, + "step": 45500 + }, + { + "epoch": 2.2603556173636634, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006191755239892719, + "loss": 0.5538, + "step": 45510 + }, + { + "epoch": 2.2608522896592826, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006191357902056223, + "loss": 0.5579, + "step": 45520 + }, + { + "epoch": 2.2613489619549023, + "grad_norm": 0.140625, + "learning_rate": 0.0006190960564219727, + "loss": 0.5628, + "step": 45530 + }, + { + "epoch": 2.2618456342505215, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006190563226383233, + "loss": 0.5516, + "step": 45540 + }, + { + "epoch": 2.2623423065461408, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006190165888546737, + "loss": 0.5491, + "step": 45550 + }, + { + "epoch": 2.26283897884176, + "grad_norm": 0.126953125, + "learning_rate": 0.0006189768550710242, + "loss": 0.5538, + "step": 45560 + }, + { + "epoch": 2.2633356511373797, + "grad_norm": 0.12255859375, + "learning_rate": 0.0006189371212873746, + "loss": 0.5748, + "step": 45570 + }, + { + "epoch": 2.263832323432999, + "grad_norm": 0.130859375, + "learning_rate": 0.000618897387503725, + "loss": 0.5923, + "step": 45580 + }, + { + "epoch": 2.264328995728618, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006188576537200756, + "loss": 0.5539, + "step": 45590 + }, + { + "epoch": 2.264825668024238, + "grad_norm": 0.1298828125, + "learning_rate": 0.000618817919936426, + "loss": 0.5704, + "step": 45600 + }, + { + "epoch": 2.265322340319857, + "grad_norm": 0.11328125, + "learning_rate": 0.0006187781861527764, + "loss": 0.561, + "step": 45610 + }, + { + "epoch": 2.2658190126154762, + "grad_norm": 0.12890625, + "learning_rate": 0.0006187384523691269, + "loss": 0.5745, + "step": 45620 + }, + { + "epoch": 2.2663156849110955, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006186987185854772, + "loss": 0.5669, + "step": 45630 + }, + { + "epoch": 2.266812357206715, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006186589848018278, + "loss": 0.5674, + "step": 45640 + }, + { + "epoch": 2.2673090295023344, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006186192510181783, + "loss": 0.5598, + "step": 45650 + }, + { + "epoch": 2.2678057017979536, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006185795172345286, + "loss": 0.5784, + "step": 45660 + }, + { + "epoch": 2.2683023740935733, + "grad_norm": 0.134765625, + "learning_rate": 0.0006185397834508791, + "loss": 0.6022, + "step": 45670 + }, + { + "epoch": 2.2687990463891925, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006185000496672297, + "loss": 0.567, + "step": 45680 + }, + { + "epoch": 2.2692957186848117, + "grad_norm": 0.10546875, + "learning_rate": 0.00061846031588358, + "loss": 0.5606, + "step": 45690 + }, + { + "epoch": 2.269792390980431, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006184205820999305, + "loss": 0.5446, + "step": 45700 + }, + { + "epoch": 2.2702890632760506, + "grad_norm": 0.119140625, + "learning_rate": 0.0006183808483162809, + "loss": 0.5699, + "step": 45710 + }, + { + "epoch": 2.27078573557167, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006183411145326314, + "loss": 0.5412, + "step": 45720 + }, + { + "epoch": 2.271282407867289, + "grad_norm": 0.1328125, + "learning_rate": 0.0006183013807489819, + "loss": 0.5655, + "step": 45730 + }, + { + "epoch": 2.2717790801629087, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006182616469653323, + "loss": 0.5383, + "step": 45740 + }, + { + "epoch": 2.272275752458528, + "grad_norm": 0.1513671875, + "learning_rate": 0.0006182219131816828, + "loss": 0.565, + "step": 45750 + }, + { + "epoch": 2.272772424754147, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006181821793980332, + "loss": 0.5611, + "step": 45760 + }, + { + "epoch": 2.2732690970497664, + "grad_norm": 0.08837890625, + "learning_rate": 0.0006181424456143836, + "loss": 0.5672, + "step": 45770 + }, + { + "epoch": 2.273765769345386, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006181027118307342, + "loss": 0.6097, + "step": 45780 + }, + { + "epoch": 2.2742624416410053, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006180629780470846, + "loss": 0.5667, + "step": 45790 + }, + { + "epoch": 2.2747591139366246, + "grad_norm": 0.11181640625, + "learning_rate": 0.000618023244263435, + "loss": 0.5503, + "step": 45800 + }, + { + "epoch": 2.275255786232244, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006179835104797855, + "loss": 0.5652, + "step": 45810 + }, + { + "epoch": 2.2757524585278635, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006179437766961359, + "loss": 0.566, + "step": 45820 + }, + { + "epoch": 2.2762491308234827, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006179040429124863, + "loss": 0.5466, + "step": 45830 + }, + { + "epoch": 2.276745803119102, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006178643091288369, + "loss": 0.5661, + "step": 45840 + }, + { + "epoch": 2.277242475414721, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006178245753451873, + "loss": 0.548, + "step": 45850 + }, + { + "epoch": 2.277739147710341, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006177848415615377, + "loss": 0.5579, + "step": 45860 + }, + { + "epoch": 2.27823582000596, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006177451077778882, + "loss": 0.566, + "step": 45870 + }, + { + "epoch": 2.2787324923015793, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006177053739942387, + "loss": 0.5508, + "step": 45880 + }, + { + "epoch": 2.279229164597199, + "grad_norm": 0.126953125, + "learning_rate": 0.0006176656402105891, + "loss": 0.5312, + "step": 45890 + }, + { + "epoch": 2.279725836892818, + "grad_norm": 0.091796875, + "learning_rate": 0.0006176259064269395, + "loss": 0.5519, + "step": 45900 + }, + { + "epoch": 2.2802225091884374, + "grad_norm": 0.154296875, + "learning_rate": 0.00061758617264329, + "loss": 0.5566, + "step": 45910 + }, + { + "epoch": 2.2807191814840566, + "grad_norm": 0.1806640625, + "learning_rate": 0.0006175464388596404, + "loss": 0.5298, + "step": 45920 + }, + { + "epoch": 2.2812158537796763, + "grad_norm": 0.08984375, + "learning_rate": 0.0006175067050759908, + "loss": 0.5844, + "step": 45930 + }, + { + "epoch": 2.2817125260752955, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006174669712923414, + "loss": 0.5758, + "step": 45940 + }, + { + "epoch": 2.2822091983709147, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006174272375086918, + "loss": 0.5557, + "step": 45950 + }, + { + "epoch": 2.2827058706665344, + "grad_norm": 0.2021484375, + "learning_rate": 0.0006173875037250422, + "loss": 0.5717, + "step": 45960 + }, + { + "epoch": 2.2832025429621536, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006173477699413927, + "loss": 0.5456, + "step": 45970 + }, + { + "epoch": 2.283699215257773, + "grad_norm": 0.134765625, + "learning_rate": 0.0006173080361577431, + "loss": 0.5535, + "step": 45980 + }, + { + "epoch": 2.284195887553392, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006172683023740936, + "loss": 0.575, + "step": 45990 + }, + { + "epoch": 2.2846925598490118, + "grad_norm": 0.095703125, + "learning_rate": 0.0006172285685904441, + "loss": 0.53, + "step": 46000 + }, + { + "epoch": 2.285189232144631, + "grad_norm": 0.0908203125, + "learning_rate": 0.0006171888348067945, + "loss": 0.5599, + "step": 46010 + }, + { + "epoch": 2.28568590444025, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006171491010231449, + "loss": 0.5837, + "step": 46020 + }, + { + "epoch": 2.28618257673587, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006171093672394954, + "loss": 0.5603, + "step": 46030 + }, + { + "epoch": 2.286679249031489, + "grad_norm": 0.1591796875, + "learning_rate": 0.0006170696334558459, + "loss": 0.5613, + "step": 46040 + }, + { + "epoch": 2.2871759213271083, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006170298996721963, + "loss": 0.5899, + "step": 46050 + }, + { + "epoch": 2.2876725936227276, + "grad_norm": 0.14453125, + "learning_rate": 0.0006169901658885468, + "loss": 0.6059, + "step": 46060 + }, + { + "epoch": 2.2881692659183472, + "grad_norm": 0.103515625, + "learning_rate": 0.0006169504321048972, + "loss": 0.5521, + "step": 46070 + }, + { + "epoch": 2.2886659382139665, + "grad_norm": 0.08642578125, + "learning_rate": 0.0006169106983212476, + "loss": 0.5426, + "step": 46080 + }, + { + "epoch": 2.2891626105095857, + "grad_norm": 0.154296875, + "learning_rate": 0.0006168709645375982, + "loss": 0.5622, + "step": 46090 + }, + { + "epoch": 2.2896592828052054, + "grad_norm": 0.1484375, + "learning_rate": 0.0006168312307539486, + "loss": 0.5883, + "step": 46100 + }, + { + "epoch": 2.2901559551008246, + "grad_norm": 0.1328125, + "learning_rate": 0.000616791496970299, + "loss": 0.5705, + "step": 46110 + }, + { + "epoch": 2.290652627396444, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006167517631866494, + "loss": 0.5991, + "step": 46120 + }, + { + "epoch": 2.291149299692063, + "grad_norm": 0.10595703125, + "learning_rate": 0.000616712029403, + "loss": 0.5995, + "step": 46130 + }, + { + "epoch": 2.2916459719876827, + "grad_norm": 0.123046875, + "learning_rate": 0.0006166722956193505, + "loss": 0.611, + "step": 46140 + }, + { + "epoch": 2.292142644283302, + "grad_norm": 0.08349609375, + "learning_rate": 0.0006166325618357008, + "loss": 0.5577, + "step": 46150 + }, + { + "epoch": 2.292639316578921, + "grad_norm": 0.138671875, + "learning_rate": 0.0006165928280520513, + "loss": 0.5759, + "step": 46160 + }, + { + "epoch": 2.2931359888745404, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006165530942684017, + "loss": 0.5706, + "step": 46170 + }, + { + "epoch": 2.29363266117016, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006165133604847521, + "loss": 0.5883, + "step": 46180 + }, + { + "epoch": 2.2941293334657793, + "grad_norm": 0.08935546875, + "learning_rate": 0.0006164736267011027, + "loss": 0.5422, + "step": 46190 + }, + { + "epoch": 2.2946260057613985, + "grad_norm": 0.2109375, + "learning_rate": 0.0006164338929174531, + "loss": 0.5701, + "step": 46200 + }, + { + "epoch": 2.2951226780570178, + "grad_norm": 0.154296875, + "learning_rate": 0.0006163941591338035, + "loss": 0.5675, + "step": 46210 + }, + { + "epoch": 2.2956193503526374, + "grad_norm": 0.1279296875, + "learning_rate": 0.000616354425350154, + "loss": 0.5479, + "step": 46220 + }, + { + "epoch": 2.2961160226482566, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006163146915665044, + "loss": 0.607, + "step": 46230 + }, + { + "epoch": 2.296612694943876, + "grad_norm": 0.154296875, + "learning_rate": 0.0006162749577828549, + "loss": 0.5768, + "step": 46240 + }, + { + "epoch": 2.2971093672394955, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006162352239992054, + "loss": 0.5206, + "step": 46250 + }, + { + "epoch": 2.2976060395351148, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006161954902155558, + "loss": 0.5652, + "step": 46260 + }, + { + "epoch": 2.298102711830734, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006161557564319062, + "loss": 0.6084, + "step": 46270 + }, + { + "epoch": 2.2985993841263532, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006161160226482567, + "loss": 0.5947, + "step": 46280 + }, + { + "epoch": 2.299096056421973, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006160762888646072, + "loss": 0.5682, + "step": 46290 + }, + { + "epoch": 2.299592728717592, + "grad_norm": 0.111328125, + "learning_rate": 0.0006160365550809577, + "loss": 0.5654, + "step": 46300 + }, + { + "epoch": 2.3000894010132114, + "grad_norm": 0.1171875, + "learning_rate": 0.000615996821297308, + "loss": 0.6128, + "step": 46310 + }, + { + "epoch": 2.300586073308831, + "grad_norm": 0.1552734375, + "learning_rate": 0.0006159570875136585, + "loss": 0.5572, + "step": 46320 + }, + { + "epoch": 2.3010827456044503, + "grad_norm": 0.1435546875, + "learning_rate": 0.000615917353730009, + "loss": 0.5714, + "step": 46330 + }, + { + "epoch": 2.3015794179000695, + "grad_norm": 0.12890625, + "learning_rate": 0.0006158776199463594, + "loss": 0.5847, + "step": 46340 + }, + { + "epoch": 2.3020760901956887, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006158378861627099, + "loss": 0.5539, + "step": 46350 + }, + { + "epoch": 2.3025727624913084, + "grad_norm": 0.125, + "learning_rate": 0.0006157981523790604, + "loss": 0.5749, + "step": 46360 + }, + { + "epoch": 2.3030694347869276, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006157584185954107, + "loss": 0.5744, + "step": 46370 + }, + { + "epoch": 2.303566107082547, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006157186848117612, + "loss": 0.5616, + "step": 46380 + }, + { + "epoch": 2.3040627793781665, + "grad_norm": 0.10546875, + "learning_rate": 0.0006156789510281117, + "loss": 0.5513, + "step": 46390 + }, + { + "epoch": 2.3045594516737857, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006156392172444621, + "loss": 0.5751, + "step": 46400 + }, + { + "epoch": 2.305056123969405, + "grad_norm": 0.1708984375, + "learning_rate": 0.0006155994834608126, + "loss": 0.5758, + "step": 46410 + }, + { + "epoch": 2.305552796265024, + "grad_norm": 0.158203125, + "learning_rate": 0.000615559749677163, + "loss": 0.5471, + "step": 46420 + }, + { + "epoch": 2.306049468560644, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006155200158935134, + "loss": 0.5547, + "step": 46430 + }, + { + "epoch": 2.306546140856263, + "grad_norm": 0.1025390625, + "learning_rate": 0.000615480282109864, + "loss": 0.5718, + "step": 46440 + }, + { + "epoch": 2.3070428131518823, + "grad_norm": 0.1328125, + "learning_rate": 0.0006154405483262144, + "loss": 0.578, + "step": 46450 + }, + { + "epoch": 2.307539485447502, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006154008145425649, + "loss": 0.5647, + "step": 46460 + }, + { + "epoch": 2.308036157743121, + "grad_norm": 0.111328125, + "learning_rate": 0.0006153610807589153, + "loss": 0.5614, + "step": 46470 + }, + { + "epoch": 2.3085328300387404, + "grad_norm": 0.2041015625, + "learning_rate": 0.0006153213469752657, + "loss": 0.542, + "step": 46480 + }, + { + "epoch": 2.3090295023343597, + "grad_norm": 0.0908203125, + "learning_rate": 0.0006152816131916163, + "loss": 0.5636, + "step": 46490 + }, + { + "epoch": 2.3095261746299793, + "grad_norm": 0.1025390625, + "learning_rate": 0.0006152418794079666, + "loss": 0.5703, + "step": 46500 + }, + { + "epoch": 2.3100228469255986, + "grad_norm": 0.12158203125, + "learning_rate": 0.0006152021456243171, + "loss": 0.5895, + "step": 46510 + }, + { + "epoch": 2.310519519221218, + "grad_norm": 0.103515625, + "learning_rate": 0.0006151624118406676, + "loss": 0.5679, + "step": 46520 + }, + { + "epoch": 2.311016191516837, + "grad_norm": 0.099609375, + "learning_rate": 0.0006151226780570179, + "loss": 0.5894, + "step": 46530 + }, + { + "epoch": 2.3115128638124567, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006150829442733685, + "loss": 0.5703, + "step": 46540 + }, + { + "epoch": 2.312009536108076, + "grad_norm": 0.11181640625, + "learning_rate": 0.000615043210489719, + "loss": 0.563, + "step": 46550 + }, + { + "epoch": 2.312506208403695, + "grad_norm": 0.1201171875, + "learning_rate": 0.0006150034767060693, + "loss": 0.5498, + "step": 46560 + }, + { + "epoch": 2.3130028806993144, + "grad_norm": 0.1416015625, + "learning_rate": 0.0006149637429224198, + "loss": 0.5776, + "step": 46570 + }, + { + "epoch": 2.313499552994934, + "grad_norm": 0.08984375, + "learning_rate": 0.0006149240091387702, + "loss": 0.5543, + "step": 46580 + }, + { + "epoch": 2.3139962252905533, + "grad_norm": 0.08154296875, + "learning_rate": 0.0006148842753551208, + "loss": 0.5573, + "step": 46590 + }, + { + "epoch": 2.3144928975861725, + "grad_norm": 0.09033203125, + "learning_rate": 0.0006148445415714712, + "loss": 0.5547, + "step": 46600 + }, + { + "epoch": 2.314989569881792, + "grad_norm": 0.095703125, + "learning_rate": 0.0006148048077878216, + "loss": 0.5712, + "step": 46610 + }, + { + "epoch": 2.3154862421774114, + "grad_norm": 0.115234375, + "learning_rate": 0.0006147650740041721, + "loss": 0.5789, + "step": 46620 + }, + { + "epoch": 2.3159829144730306, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006147253402205225, + "loss": 0.5589, + "step": 46630 + }, + { + "epoch": 2.31647958676865, + "grad_norm": 0.146484375, + "learning_rate": 0.000614685606436873, + "loss": 0.5565, + "step": 46640 + }, + { + "epoch": 2.3169762590642695, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006146458726532235, + "loss": 0.5665, + "step": 46650 + }, + { + "epoch": 2.3174729313598887, + "grad_norm": 0.099609375, + "learning_rate": 0.0006146061388695739, + "loss": 0.5588, + "step": 46660 + }, + { + "epoch": 2.317969603655508, + "grad_norm": 0.1396484375, + "learning_rate": 0.0006145664050859243, + "loss": 0.556, + "step": 46670 + }, + { + "epoch": 2.3184662759511276, + "grad_norm": 0.2392578125, + "learning_rate": 0.0006145266713022748, + "loss": 0.5475, + "step": 46680 + }, + { + "epoch": 2.318962948246747, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006144869375186253, + "loss": 0.5643, + "step": 46690 + }, + { + "epoch": 2.319459620542366, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006144472037349757, + "loss": 0.5823, + "step": 46700 + }, + { + "epoch": 2.3199562928379853, + "grad_norm": 0.130859375, + "learning_rate": 0.0006144074699513262, + "loss": 0.5479, + "step": 46710 + }, + { + "epoch": 2.320452965133605, + "grad_norm": 0.103515625, + "learning_rate": 0.0006143677361676765, + "loss": 0.5915, + "step": 46720 + }, + { + "epoch": 2.320949637429224, + "grad_norm": 0.1318359375, + "learning_rate": 0.000614328002384027, + "loss": 0.5537, + "step": 46730 + }, + { + "epoch": 2.3214463097248434, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006142882686003776, + "loss": 0.5458, + "step": 46740 + }, + { + "epoch": 2.321942982020463, + "grad_norm": 0.095703125, + "learning_rate": 0.000614248534816728, + "loss": 0.5543, + "step": 46750 + }, + { + "epoch": 2.3224396543160823, + "grad_norm": 0.1171875, + "learning_rate": 0.0006142088010330784, + "loss": 0.5659, + "step": 46760 + }, + { + "epoch": 2.3229363266117016, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006141690672494288, + "loss": 0.5702, + "step": 46770 + }, + { + "epoch": 2.323432998907321, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006141293334657793, + "loss": 0.554, + "step": 46780 + }, + { + "epoch": 2.3239296712029405, + "grad_norm": 0.12158203125, + "learning_rate": 0.0006140895996821298, + "loss": 0.5765, + "step": 46790 + }, + { + "epoch": 2.3244263434985597, + "grad_norm": 0.1181640625, + "learning_rate": 0.0006140498658984802, + "loss": 0.5599, + "step": 46800 + }, + { + "epoch": 2.324923015794179, + "grad_norm": 0.083984375, + "learning_rate": 0.0006140101321148307, + "loss": 0.5532, + "step": 46810 + }, + { + "epoch": 2.3254196880897986, + "grad_norm": 0.1591796875, + "learning_rate": 0.0006139703983311811, + "loss": 0.5837, + "step": 46820 + }, + { + "epoch": 2.325916360385418, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006139306645475315, + "loss": 0.5828, + "step": 46830 + }, + { + "epoch": 2.326413032681037, + "grad_norm": 0.22265625, + "learning_rate": 0.0006138909307638821, + "loss": 0.5436, + "step": 46840 + }, + { + "epoch": 2.3269097049766563, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006138511969802325, + "loss": 0.5453, + "step": 46850 + }, + { + "epoch": 2.3274063772722755, + "grad_norm": 0.1376953125, + "learning_rate": 0.0006138114631965829, + "loss": 0.5653, + "step": 46860 + }, + { + "epoch": 2.327903049567895, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006137717294129334, + "loss": 0.5757, + "step": 46870 + }, + { + "epoch": 2.3283997218635144, + "grad_norm": 0.1396484375, + "learning_rate": 0.0006137319956292838, + "loss": 0.5588, + "step": 46880 + }, + { + "epoch": 2.3288963941591336, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006136922618456343, + "loss": 0.569, + "step": 46890 + }, + { + "epoch": 2.3293930664547533, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006136525280619848, + "loss": 0.5593, + "step": 46900 + }, + { + "epoch": 2.3298897387503725, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006136127942783352, + "loss": 0.5966, + "step": 46910 + }, + { + "epoch": 2.3303864110459918, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006135730604946856, + "loss": 0.5688, + "step": 46920 + }, + { + "epoch": 2.330883083341611, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006135333267110361, + "loss": 0.5858, + "step": 46930 + }, + { + "epoch": 2.3313797556372307, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006134935929273866, + "loss": 0.5659, + "step": 46940 + }, + { + "epoch": 2.33187642793285, + "grad_norm": 0.1611328125, + "learning_rate": 0.000613453859143737, + "loss": 0.5702, + "step": 46950 + }, + { + "epoch": 2.332373100228469, + "grad_norm": 0.1767578125, + "learning_rate": 0.0006134141253600875, + "loss": 0.548, + "step": 46960 + }, + { + "epoch": 2.332869772524089, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006133743915764379, + "loss": 0.5671, + "step": 46970 + }, + { + "epoch": 2.333366444819708, + "grad_norm": 0.115234375, + "learning_rate": 0.0006133346577927883, + "loss": 0.5613, + "step": 46980 + }, + { + "epoch": 2.3338631171153272, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006132949240091387, + "loss": 0.6136, + "step": 46990 + }, + { + "epoch": 2.3343597894109465, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006132551902254893, + "loss": 0.5489, + "step": 47000 + }, + { + "epoch": 2.334856461706566, + "grad_norm": 0.119140625, + "learning_rate": 0.0006132154564418397, + "loss": 0.5452, + "step": 47010 + }, + { + "epoch": 2.3353531340021854, + "grad_norm": 0.1181640625, + "learning_rate": 0.0006131757226581901, + "loss": 0.5752, + "step": 47020 + }, + { + "epoch": 2.3358498062978046, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006131359888745406, + "loss": 0.5623, + "step": 47030 + }, + { + "epoch": 2.3363464785934243, + "grad_norm": 0.107421875, + "learning_rate": 0.000613096255090891, + "loss": 0.5492, + "step": 47040 + }, + { + "epoch": 2.3368431508890435, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006130565213072415, + "loss": 0.5686, + "step": 47050 + }, + { + "epoch": 2.3373398231846627, + "grad_norm": 0.09814453125, + "learning_rate": 0.000613016787523592, + "loss": 0.5749, + "step": 47060 + }, + { + "epoch": 2.337836495480282, + "grad_norm": 0.1962890625, + "learning_rate": 0.0006129770537399424, + "loss": 0.6005, + "step": 47070 + }, + { + "epoch": 2.3383331677759016, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006129373199562928, + "loss": 0.581, + "step": 47080 + }, + { + "epoch": 2.338829840071521, + "grad_norm": 0.09765625, + "learning_rate": 0.0006128975861726434, + "loss": 0.5312, + "step": 47090 + }, + { + "epoch": 2.33932651236714, + "grad_norm": 0.09375, + "learning_rate": 0.0006128578523889938, + "loss": 0.5676, + "step": 47100 + }, + { + "epoch": 2.3398231846627597, + "grad_norm": 0.1181640625, + "learning_rate": 0.0006128181186053442, + "loss": 0.5615, + "step": 47110 + }, + { + "epoch": 2.340319856958379, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006127783848216947, + "loss": 0.5737, + "step": 47120 + }, + { + "epoch": 2.340816529253998, + "grad_norm": 0.12890625, + "learning_rate": 0.0006127386510380451, + "loss": 0.5802, + "step": 47130 + }, + { + "epoch": 2.3413132015496174, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006126989172543955, + "loss": 0.5564, + "step": 47140 + }, + { + "epoch": 2.341809873845237, + "grad_norm": 0.125, + "learning_rate": 0.0006126591834707461, + "loss": 0.563, + "step": 47150 + }, + { + "epoch": 2.3423065461408563, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006126194496870965, + "loss": 0.5529, + "step": 47160 + }, + { + "epoch": 2.3428032184364755, + "grad_norm": 0.12255859375, + "learning_rate": 0.0006125797159034469, + "loss": 0.5879, + "step": 47170 + }, + { + "epoch": 2.3432998907320948, + "grad_norm": 0.1171875, + "learning_rate": 0.0006125399821197973, + "loss": 0.5619, + "step": 47180 + }, + { + "epoch": 2.3437965630277144, + "grad_norm": 0.20703125, + "learning_rate": 0.0006125002483361478, + "loss": 0.5826, + "step": 47190 + }, + { + "epoch": 2.3442932353233337, + "grad_norm": 0.1689453125, + "learning_rate": 0.0006124605145524984, + "loss": 0.5169, + "step": 47200 + }, + { + "epoch": 2.344789907618953, + "grad_norm": 0.083984375, + "learning_rate": 0.0006124207807688487, + "loss": 0.5398, + "step": 47210 + }, + { + "epoch": 2.345286579914572, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006123810469851992, + "loss": 0.5669, + "step": 47220 + }, + { + "epoch": 2.345783252210192, + "grad_norm": 0.162109375, + "learning_rate": 0.0006123413132015497, + "loss": 0.5466, + "step": 47230 + }, + { + "epoch": 2.346279924505811, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006123015794179, + "loss": 0.5729, + "step": 47240 + }, + { + "epoch": 2.3467765968014302, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006122618456342506, + "loss": 0.5874, + "step": 47250 + }, + { + "epoch": 2.34727326909705, + "grad_norm": 0.09130859375, + "learning_rate": 0.000612222111850601, + "loss": 0.5505, + "step": 47260 + }, + { + "epoch": 2.347769941392669, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006121823780669514, + "loss": 0.5806, + "step": 47270 + }, + { + "epoch": 2.3482666136882884, + "grad_norm": 0.21875, + "learning_rate": 0.0006121426442833019, + "loss": 0.558, + "step": 47280 + }, + { + "epoch": 2.3487632859839076, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006121029104996523, + "loss": 0.5507, + "step": 47290 + }, + { + "epoch": 2.3492599582795273, + "grad_norm": 0.1875, + "learning_rate": 0.0006120631767160028, + "loss": 0.5562, + "step": 47300 + }, + { + "epoch": 2.3497566305751465, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006120234429323533, + "loss": 0.5989, + "step": 47310 + }, + { + "epoch": 2.3502533028707657, + "grad_norm": 0.11572265625, + "learning_rate": 0.0006119837091487037, + "loss": 0.577, + "step": 47320 + }, + { + "epoch": 2.3507499751663854, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006119439753650541, + "loss": 0.5982, + "step": 47330 + }, + { + "epoch": 2.3512466474620046, + "grad_norm": 0.1337890625, + "learning_rate": 0.0006119042415814046, + "loss": 0.5664, + "step": 47340 + }, + { + "epoch": 2.351743319757624, + "grad_norm": 0.1572265625, + "learning_rate": 0.0006118645077977551, + "loss": 0.557, + "step": 47350 + }, + { + "epoch": 2.352239992053243, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006118247740141056, + "loss": 0.5568, + "step": 47360 + }, + { + "epoch": 2.3527366643488627, + "grad_norm": 0.1728515625, + "learning_rate": 0.0006117850402304559, + "loss": 0.5472, + "step": 47370 + }, + { + "epoch": 2.353233336644482, + "grad_norm": 0.09375, + "learning_rate": 0.0006117453064468064, + "loss": 0.5609, + "step": 47380 + }, + { + "epoch": 2.353730008940101, + "grad_norm": 0.09814453125, + "learning_rate": 0.000611705572663157, + "loss": 0.5356, + "step": 47390 + }, + { + "epoch": 2.354226681235721, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006116658388795073, + "loss": 0.5859, + "step": 47400 + }, + { + "epoch": 2.35472335353134, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006116261050958578, + "loss": 0.5539, + "step": 47410 + }, + { + "epoch": 2.3552200258269593, + "grad_norm": 0.162109375, + "learning_rate": 0.0006115863713122083, + "loss": 0.5498, + "step": 47420 + }, + { + "epoch": 2.3557166981225786, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006115466375285586, + "loss": 0.5688, + "step": 47430 + }, + { + "epoch": 2.3562133704181982, + "grad_norm": 0.111328125, + "learning_rate": 0.0006115069037449091, + "loss": 0.543, + "step": 47440 + }, + { + "epoch": 2.3567100427138175, + "grad_norm": 0.1572265625, + "learning_rate": 0.0006114671699612596, + "loss": 0.5701, + "step": 47450 + }, + { + "epoch": 2.3572067150094367, + "grad_norm": 0.09375, + "learning_rate": 0.00061142743617761, + "loss": 0.5587, + "step": 47460 + }, + { + "epoch": 2.3577033873050564, + "grad_norm": 0.125, + "learning_rate": 0.0006113877023939605, + "loss": 0.6082, + "step": 47470 + }, + { + "epoch": 2.3582000596006756, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006113479686103109, + "loss": 0.5627, + "step": 47480 + }, + { + "epoch": 2.358696731896295, + "grad_norm": 0.1416015625, + "learning_rate": 0.0006113082348266614, + "loss": 0.5644, + "step": 47490 + }, + { + "epoch": 2.359193404191914, + "grad_norm": 0.08935546875, + "learning_rate": 0.0006112685010430119, + "loss": 0.5654, + "step": 47500 + }, + { + "epoch": 2.3596900764875337, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006112287672593623, + "loss": 0.5713, + "step": 47510 + }, + { + "epoch": 2.360186748783153, + "grad_norm": 0.091796875, + "learning_rate": 0.0006111890334757128, + "loss": 0.5781, + "step": 47520 + }, + { + "epoch": 2.360683421078772, + "grad_norm": 0.1357421875, + "learning_rate": 0.0006111492996920632, + "loss": 0.5249, + "step": 47530 + }, + { + "epoch": 2.3611800933743914, + "grad_norm": 0.1513671875, + "learning_rate": 0.0006111095659084136, + "loss": 0.5465, + "step": 47540 + }, + { + "epoch": 2.361676765670011, + "grad_norm": 0.138671875, + "learning_rate": 0.0006110698321247642, + "loss": 0.5634, + "step": 47550 + }, + { + "epoch": 2.3621734379656303, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006110300983411146, + "loss": 0.5259, + "step": 47560 + }, + { + "epoch": 2.3626701102612495, + "grad_norm": 0.10009765625, + "learning_rate": 0.000610990364557465, + "loss": 0.5567, + "step": 47570 + }, + { + "epoch": 2.3631667825568687, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006109506307738155, + "loss": 0.5519, + "step": 47580 + }, + { + "epoch": 2.3636634548524884, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006109108969901658, + "loss": 0.5779, + "step": 47590 + }, + { + "epoch": 2.3641601271481076, + "grad_norm": 0.103515625, + "learning_rate": 0.0006108711632065164, + "loss": 0.5892, + "step": 47600 + }, + { + "epoch": 2.364656799443727, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006108314294228669, + "loss": 0.5698, + "step": 47610 + }, + { + "epoch": 2.3651534717393465, + "grad_norm": 0.12158203125, + "learning_rate": 0.0006107916956392172, + "loss": 0.5859, + "step": 47620 + }, + { + "epoch": 2.3656501440349658, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006107519618555677, + "loss": 0.5683, + "step": 47630 + }, + { + "epoch": 2.366146816330585, + "grad_norm": 0.126953125, + "learning_rate": 0.0006107122280719181, + "loss": 0.5654, + "step": 47640 + }, + { + "epoch": 2.366643488626204, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006106724942882687, + "loss": 0.5919, + "step": 47650 + }, + { + "epoch": 2.367140160921824, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006106327605046191, + "loss": 0.5796, + "step": 47660 + }, + { + "epoch": 2.367636833217443, + "grad_norm": 0.11181640625, + "learning_rate": 0.0006105930267209695, + "loss": 0.5524, + "step": 47670 + }, + { + "epoch": 2.3681335055130623, + "grad_norm": 0.181640625, + "learning_rate": 0.00061055329293732, + "loss": 0.5327, + "step": 47680 + }, + { + "epoch": 2.368630177808682, + "grad_norm": 0.11328125, + "learning_rate": 0.0006105135591536704, + "loss": 0.5474, + "step": 47690 + }, + { + "epoch": 2.3691268501043012, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006104738253700209, + "loss": 0.5729, + "step": 47700 + }, + { + "epoch": 2.3696235223999205, + "grad_norm": 0.1015625, + "learning_rate": 0.0006104340915863714, + "loss": 0.5572, + "step": 47710 + }, + { + "epoch": 2.3701201946955397, + "grad_norm": 0.09765625, + "learning_rate": 0.0006103943578027218, + "loss": 0.5617, + "step": 47720 + }, + { + "epoch": 2.3706168669911594, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006103546240190722, + "loss": 0.5546, + "step": 47730 + }, + { + "epoch": 2.3711135392867786, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006103148902354227, + "loss": 0.5584, + "step": 47740 + }, + { + "epoch": 2.371610211582398, + "grad_norm": 0.1513671875, + "learning_rate": 0.0006102751564517732, + "loss": 0.5744, + "step": 47750 + }, + { + "epoch": 2.3721068838780175, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006102354226681236, + "loss": 0.5729, + "step": 47760 + }, + { + "epoch": 2.3726035561736367, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006101956888844741, + "loss": 0.5538, + "step": 47770 + }, + { + "epoch": 2.373100228469256, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006101559551008244, + "loss": 0.5581, + "step": 47780 + }, + { + "epoch": 2.373596900764875, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006101162213171749, + "loss": 0.5644, + "step": 47790 + }, + { + "epoch": 2.374093573060495, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006100764875335255, + "loss": 0.5738, + "step": 47800 + }, + { + "epoch": 2.374590245356114, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006100367537498759, + "loss": 0.5686, + "step": 47810 + }, + { + "epoch": 2.3750869176517333, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006099970199662263, + "loss": 0.5489, + "step": 47820 + }, + { + "epoch": 2.375583589947353, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006099572861825768, + "loss": 0.5604, + "step": 47830 + }, + { + "epoch": 2.376080262242972, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006099175523989272, + "loss": 0.5671, + "step": 47840 + }, + { + "epoch": 2.3765769345385914, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006098778186152777, + "loss": 0.5772, + "step": 47850 + }, + { + "epoch": 2.3770736068342107, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006098380848316281, + "loss": 0.592, + "step": 47860 + }, + { + "epoch": 2.3775702791298303, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006097983510479786, + "loss": 0.555, + "step": 47870 + }, + { + "epoch": 2.3780669514254495, + "grad_norm": 0.12890625, + "learning_rate": 0.000609758617264329, + "loss": 0.5402, + "step": 47880 + }, + { + "epoch": 2.3785636237210688, + "grad_norm": 0.115234375, + "learning_rate": 0.0006097188834806794, + "loss": 0.5615, + "step": 47890 + }, + { + "epoch": 2.379060296016688, + "grad_norm": 0.2177734375, + "learning_rate": 0.00060967914969703, + "loss": 0.5759, + "step": 47900 + }, + { + "epoch": 2.3795569683123077, + "grad_norm": 0.1298828125, + "learning_rate": 0.0006096394159133804, + "loss": 0.5377, + "step": 47910 + }, + { + "epoch": 2.380053640607927, + "grad_norm": 0.162109375, + "learning_rate": 0.0006095996821297308, + "loss": 0.5576, + "step": 47920 + }, + { + "epoch": 2.380550312903546, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006095599483460813, + "loss": 0.5403, + "step": 47930 + }, + { + "epoch": 2.3810469851991654, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006095202145624317, + "loss": 0.5741, + "step": 47940 + }, + { + "epoch": 2.381543657494785, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006094804807787822, + "loss": 0.5711, + "step": 47950 + }, + { + "epoch": 2.3820403297904043, + "grad_norm": 0.12353515625, + "learning_rate": 0.0006094407469951327, + "loss": 0.5971, + "step": 47960 + }, + { + "epoch": 2.3825370020860235, + "grad_norm": 0.099609375, + "learning_rate": 0.0006094010132114831, + "loss": 0.5602, + "step": 47970 + }, + { + "epoch": 2.383033674381643, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006093612794278335, + "loss": 0.5781, + "step": 47980 + }, + { + "epoch": 2.3835303466772624, + "grad_norm": 0.10205078125, + "learning_rate": 0.000609321545644184, + "loss": 0.5557, + "step": 47990 + }, + { + "epoch": 2.3840270189728816, + "grad_norm": 0.099609375, + "learning_rate": 0.0006092818118605345, + "loss": 0.5475, + "step": 48000 + }, + { + "epoch": 2.384523691268501, + "grad_norm": 0.130859375, + "learning_rate": 0.0006092420780768849, + "loss": 0.5776, + "step": 48010 + }, + { + "epoch": 2.3850203635641205, + "grad_norm": 0.142578125, + "learning_rate": 0.0006092023442932354, + "loss": 0.5688, + "step": 48020 + }, + { + "epoch": 2.3855170358597397, + "grad_norm": 0.123046875, + "learning_rate": 0.0006091626105095858, + "loss": 0.5748, + "step": 48030 + }, + { + "epoch": 2.386013708155359, + "grad_norm": 0.08984375, + "learning_rate": 0.0006091228767259362, + "loss": 0.5595, + "step": 48040 + }, + { + "epoch": 2.3865103804509786, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006090831429422867, + "loss": 0.55, + "step": 48050 + }, + { + "epoch": 2.387007052746598, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006090434091586372, + "loss": 0.5507, + "step": 48060 + }, + { + "epoch": 2.387503725042217, + "grad_norm": 0.1455078125, + "learning_rate": 0.0006090036753749876, + "loss": 0.5414, + "step": 48070 + }, + { + "epoch": 2.3880003973378363, + "grad_norm": 0.099609375, + "learning_rate": 0.000608963941591338, + "loss": 0.5573, + "step": 48080 + }, + { + "epoch": 2.388497069633456, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006089242078076885, + "loss": 0.5717, + "step": 48090 + }, + { + "epoch": 2.388993741929075, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006088844740240391, + "loss": 0.5798, + "step": 48100 + }, + { + "epoch": 2.3894904142246944, + "grad_norm": 0.126953125, + "learning_rate": 0.0006088447402403894, + "loss": 0.5437, + "step": 48110 + }, + { + "epoch": 2.389987086520314, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006088050064567399, + "loss": 0.5658, + "step": 48120 + }, + { + "epoch": 2.3904837588159333, + "grad_norm": 0.2431640625, + "learning_rate": 0.0006087652726730903, + "loss": 0.5929, + "step": 48130 + }, + { + "epoch": 2.3909804311115526, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006087255388894407, + "loss": 0.5686, + "step": 48140 + }, + { + "epoch": 2.391477103407172, + "grad_norm": 0.1240234375, + "learning_rate": 0.0006086858051057913, + "loss": 0.5695, + "step": 48150 + }, + { + "epoch": 2.3919737757027915, + "grad_norm": 0.10009765625, + "learning_rate": 0.0006086460713221417, + "loss": 0.5974, + "step": 48160 + }, + { + "epoch": 2.3924704479984107, + "grad_norm": 0.09765625, + "learning_rate": 0.0006086063375384921, + "loss": 0.5581, + "step": 48170 + }, + { + "epoch": 2.39296712029403, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006085666037548426, + "loss": 0.5586, + "step": 48180 + }, + { + "epoch": 2.3934637925896496, + "grad_norm": 0.11474609375, + "learning_rate": 0.000608526869971193, + "loss": 0.5454, + "step": 48190 + }, + { + "epoch": 2.393960464885269, + "grad_norm": 0.099609375, + "learning_rate": 0.0006084871361875435, + "loss": 0.5464, + "step": 48200 + }, + { + "epoch": 2.394457137180888, + "grad_norm": 0.12451171875, + "learning_rate": 0.000608447402403894, + "loss": 0.5685, + "step": 48210 + }, + { + "epoch": 2.3949538094765073, + "grad_norm": 0.130859375, + "learning_rate": 0.0006084076686202444, + "loss": 0.545, + "step": 48220 + }, + { + "epoch": 2.395450481772127, + "grad_norm": 0.1513671875, + "learning_rate": 0.0006083679348365949, + "loss": 0.5746, + "step": 48230 + }, + { + "epoch": 2.395947154067746, + "grad_norm": 0.103515625, + "learning_rate": 0.0006083282010529452, + "loss": 0.5444, + "step": 48240 + }, + { + "epoch": 2.3964438263633654, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006082884672692958, + "loss": 0.5705, + "step": 48250 + }, + { + "epoch": 2.3969404986589846, + "grad_norm": 0.15234375, + "learning_rate": 0.0006082487334856463, + "loss": 0.5899, + "step": 48260 + }, + { + "epoch": 2.3974371709546043, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006082089997019966, + "loss": 0.546, + "step": 48270 + }, + { + "epoch": 2.3979338432502235, + "grad_norm": 0.162109375, + "learning_rate": 0.0006081692659183471, + "loss": 0.5731, + "step": 48280 + }, + { + "epoch": 2.3984305155458427, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006081295321346976, + "loss": 0.5569, + "step": 48290 + }, + { + "epoch": 2.398927187841462, + "grad_norm": 0.10888671875, + "learning_rate": 0.000608089798351048, + "loss": 0.5787, + "step": 48300 + }, + { + "epoch": 2.3994238601370816, + "grad_norm": 0.08837890625, + "learning_rate": 0.0006080500645673985, + "loss": 0.565, + "step": 48310 + }, + { + "epoch": 2.399920532432701, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006080103307837489, + "loss": 0.5364, + "step": 48320 + }, + { + "epoch": 2.40041720472832, + "grad_norm": 0.10791015625, + "learning_rate": 0.0006079705970000993, + "loss": 0.5913, + "step": 48330 + }, + { + "epoch": 2.4009138770239398, + "grad_norm": 0.11376953125, + "learning_rate": 0.0006079308632164498, + "loss": 0.5611, + "step": 48340 + }, + { + "epoch": 2.401410549319559, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006078911294328003, + "loss": 0.5501, + "step": 48350 + }, + { + "epoch": 2.4019072216151782, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006078513956491507, + "loss": 0.5486, + "step": 48360 + }, + { + "epoch": 2.4024038939107974, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006078116618655012, + "loss": 0.5591, + "step": 48370 + }, + { + "epoch": 2.402900566206417, + "grad_norm": 0.1015625, + "learning_rate": 0.0006077719280818516, + "loss": 0.5502, + "step": 48380 + }, + { + "epoch": 2.4033972385020363, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006077321942982021, + "loss": 0.554, + "step": 48390 + }, + { + "epoch": 2.4038939107976556, + "grad_norm": 0.16796875, + "learning_rate": 0.0006076924605145526, + "loss": 0.608, + "step": 48400 + }, + { + "epoch": 2.4043905830932752, + "grad_norm": 0.19140625, + "learning_rate": 0.000607652726730903, + "loss": 0.5643, + "step": 48410 + }, + { + "epoch": 2.4048872553888945, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006076129929472535, + "loss": 0.5602, + "step": 48420 + }, + { + "epoch": 2.4053839276845137, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006075732591636039, + "loss": 0.5781, + "step": 48430 + }, + { + "epoch": 2.405880599980133, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006075335253799543, + "loss": 0.5666, + "step": 48440 + }, + { + "epoch": 2.4063772722757526, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006074937915963049, + "loss": 0.5493, + "step": 48450 + }, + { + "epoch": 2.406873944571372, + "grad_norm": 0.1572265625, + "learning_rate": 0.0006074540578126552, + "loss": 0.5756, + "step": 48460 + }, + { + "epoch": 2.407370616866991, + "grad_norm": 0.1826171875, + "learning_rate": 0.0006074143240290057, + "loss": 0.548, + "step": 48470 + }, + { + "epoch": 2.4078672891626107, + "grad_norm": 0.1494140625, + "learning_rate": 0.0006073745902453562, + "loss": 0.5627, + "step": 48480 + }, + { + "epoch": 2.40836396145823, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006073348564617065, + "loss": 0.5822, + "step": 48490 + }, + { + "epoch": 2.408860633753849, + "grad_norm": 0.109375, + "learning_rate": 0.000607295122678057, + "loss": 0.5511, + "step": 48500 + }, + { + "epoch": 2.4093573060494684, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006072553888944075, + "loss": 0.5496, + "step": 48510 + }, + { + "epoch": 2.409853978345088, + "grad_norm": 0.12109375, + "learning_rate": 0.0006072156551107579, + "loss": 0.5663, + "step": 48520 + }, + { + "epoch": 2.4103506506407073, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006071759213271084, + "loss": 0.5617, + "step": 48530 + }, + { + "epoch": 2.4108473229363265, + "grad_norm": 0.1435546875, + "learning_rate": 0.0006071361875434588, + "loss": 0.5533, + "step": 48540 + }, + { + "epoch": 2.411343995231946, + "grad_norm": 0.1337890625, + "learning_rate": 0.0006070964537598094, + "loss": 0.5751, + "step": 48550 + }, + { + "epoch": 2.4118406675275654, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006070567199761598, + "loss": 0.5347, + "step": 48560 + }, + { + "epoch": 2.4123373398231847, + "grad_norm": 0.099609375, + "learning_rate": 0.0006070169861925102, + "loss": 0.5762, + "step": 48570 + }, + { + "epoch": 2.412834012118804, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006069772524088607, + "loss": 0.5724, + "step": 48580 + }, + { + "epoch": 2.4133306844144236, + "grad_norm": 0.0888671875, + "learning_rate": 0.0006069375186252111, + "loss": 0.5843, + "step": 48590 + }, + { + "epoch": 2.413827356710043, + "grad_norm": 0.140625, + "learning_rate": 0.0006068977848415615, + "loss": 0.5724, + "step": 48600 + }, + { + "epoch": 2.414324029005662, + "grad_norm": 0.1474609375, + "learning_rate": 0.0006068580510579121, + "loss": 0.5487, + "step": 48610 + }, + { + "epoch": 2.4148207013012812, + "grad_norm": 0.08935546875, + "learning_rate": 0.0006068183172742625, + "loss": 0.547, + "step": 48620 + }, + { + "epoch": 2.415317373596901, + "grad_norm": 0.115234375, + "learning_rate": 0.0006067785834906129, + "loss": 0.5484, + "step": 48630 + }, + { + "epoch": 2.41581404589252, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006067388497069634, + "loss": 0.565, + "step": 48640 + }, + { + "epoch": 2.4163107181881394, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006066991159233137, + "loss": 0.5757, + "step": 48650 + }, + { + "epoch": 2.4168073904837586, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006066593821396643, + "loss": 0.5451, + "step": 48660 + }, + { + "epoch": 2.4173040627793783, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006066196483560148, + "loss": 0.5837, + "step": 48670 + }, + { + "epoch": 2.4178007350749975, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006065799145723652, + "loss": 0.5708, + "step": 48680 + }, + { + "epoch": 2.4182974073706167, + "grad_norm": 0.126953125, + "learning_rate": 0.0006065401807887156, + "loss": 0.565, + "step": 48690 + }, + { + "epoch": 2.4187940796662364, + "grad_norm": 0.09716796875, + "learning_rate": 0.000606500447005066, + "loss": 0.5489, + "step": 48700 + }, + { + "epoch": 2.4192907519618556, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006064607132214166, + "loss": 0.5792, + "step": 48710 + }, + { + "epoch": 2.419787424257475, + "grad_norm": 0.1162109375, + "learning_rate": 0.000606420979437767, + "loss": 0.5766, + "step": 48720 + }, + { + "epoch": 2.420284096553094, + "grad_norm": 0.1669921875, + "learning_rate": 0.0006063812456541174, + "loss": 0.5633, + "step": 48730 + }, + { + "epoch": 2.4207807688487137, + "grad_norm": 0.1181640625, + "learning_rate": 0.0006063415118704679, + "loss": 0.5718, + "step": 48740 + }, + { + "epoch": 2.421277441144333, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006063017780868183, + "loss": 0.573, + "step": 48750 + }, + { + "epoch": 2.421774113439952, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006062620443031688, + "loss": 0.5696, + "step": 48760 + }, + { + "epoch": 2.422270785735572, + "grad_norm": 0.08984375, + "learning_rate": 0.0006062223105195193, + "loss": 0.5642, + "step": 48770 + }, + { + "epoch": 2.422767458031191, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006061825767358697, + "loss": 0.5697, + "step": 48780 + }, + { + "epoch": 2.4232641303268103, + "grad_norm": 0.1162109375, + "learning_rate": 0.0006061428429522201, + "loss": 0.5572, + "step": 48790 + }, + { + "epoch": 2.4237608026224295, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006061031091685706, + "loss": 0.5505, + "step": 48800 + }, + { + "epoch": 2.424257474918049, + "grad_norm": 0.0888671875, + "learning_rate": 0.0006060633753849211, + "loss": 0.5675, + "step": 48810 + }, + { + "epoch": 2.4247541472136684, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006060236416012715, + "loss": 0.5403, + "step": 48820 + }, + { + "epoch": 2.4252508195092877, + "grad_norm": 0.10595703125, + "learning_rate": 0.000605983907817622, + "loss": 0.5562, + "step": 48830 + }, + { + "epoch": 2.4257474918049073, + "grad_norm": 0.099609375, + "learning_rate": 0.0006059441740339724, + "loss": 0.5669, + "step": 48840 + }, + { + "epoch": 2.4262441641005266, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006059044402503228, + "loss": 0.5864, + "step": 48850 + }, + { + "epoch": 2.426740836396146, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006058647064666734, + "loss": 0.553, + "step": 48860 + }, + { + "epoch": 2.427237508691765, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006058249726830238, + "loss": 0.5676, + "step": 48870 + }, + { + "epoch": 2.4277341809873847, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006057852388993742, + "loss": 0.543, + "step": 48880 + }, + { + "epoch": 2.428230853283004, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006057455051157247, + "loss": 0.5484, + "step": 48890 + }, + { + "epoch": 2.428727525578623, + "grad_norm": 0.12890625, + "learning_rate": 0.0006057057713320751, + "loss": 0.5772, + "step": 48900 + }, + { + "epoch": 2.429224197874243, + "grad_norm": 0.1162109375, + "learning_rate": 0.0006056660375484256, + "loss": 0.5984, + "step": 48910 + }, + { + "epoch": 2.429720870169862, + "grad_norm": 0.103515625, + "learning_rate": 0.000605626303764776, + "loss": 0.574, + "step": 48920 + }, + { + "epoch": 2.4302175424654813, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006055865699811265, + "loss": 0.5479, + "step": 48930 + }, + { + "epoch": 2.4307142147611005, + "grad_norm": 0.103515625, + "learning_rate": 0.0006055468361974769, + "loss": 0.5385, + "step": 48940 + }, + { + "epoch": 2.4312108870567197, + "grad_norm": 0.103515625, + "learning_rate": 0.0006055071024138273, + "loss": 0.5857, + "step": 48950 + }, + { + "epoch": 2.4317075593523394, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006054673686301779, + "loss": 0.5407, + "step": 48960 + }, + { + "epoch": 2.4322042316479586, + "grad_norm": 0.1533203125, + "learning_rate": 0.0006054276348465283, + "loss": 0.577, + "step": 48970 + }, + { + "epoch": 2.432700903943578, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006053879010628787, + "loss": 0.5924, + "step": 48980 + }, + { + "epoch": 2.4331975762391975, + "grad_norm": 0.134765625, + "learning_rate": 0.0006053481672792292, + "loss": 0.5802, + "step": 48990 + }, + { + "epoch": 2.4336942485348168, + "grad_norm": 0.1494140625, + "learning_rate": 0.0006053084334955796, + "loss": 0.5521, + "step": 49000 + }, + { + "epoch": 2.434190920830436, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006052686997119301, + "loss": 0.5648, + "step": 49010 + }, + { + "epoch": 2.434687593126055, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006052289659282806, + "loss": 0.58, + "step": 49020 + }, + { + "epoch": 2.435184265421675, + "grad_norm": 0.09619140625, + "learning_rate": 0.000605189232144631, + "loss": 0.5418, + "step": 49030 + }, + { + "epoch": 2.435680937717294, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006051494983609814, + "loss": 0.5535, + "step": 49040 + }, + { + "epoch": 2.4361776100129133, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006051097645773319, + "loss": 0.5718, + "step": 49050 + }, + { + "epoch": 2.436674282308533, + "grad_norm": 0.1083984375, + "learning_rate": 0.0006050700307936824, + "loss": 0.5783, + "step": 49060 + }, + { + "epoch": 2.4371709546041522, + "grad_norm": 0.119140625, + "learning_rate": 0.0006050302970100328, + "loss": 0.5711, + "step": 49070 + }, + { + "epoch": 2.4376676268997715, + "grad_norm": 0.107421875, + "learning_rate": 0.0006049905632263833, + "loss": 0.5727, + "step": 49080 + }, + { + "epoch": 2.4381642991953907, + "grad_norm": 0.1474609375, + "learning_rate": 0.0006049508294427337, + "loss": 0.567, + "step": 49090 + }, + { + "epoch": 2.4386609714910104, + "grad_norm": 0.140625, + "learning_rate": 0.0006049110956590841, + "loss": 0.573, + "step": 49100 + }, + { + "epoch": 2.4391576437866296, + "grad_norm": 0.11474609375, + "learning_rate": 0.0006048713618754346, + "loss": 0.5678, + "step": 49110 + }, + { + "epoch": 2.439654316082249, + "grad_norm": 0.126953125, + "learning_rate": 0.0006048316280917851, + "loss": 0.5699, + "step": 49120 + }, + { + "epoch": 2.4401509883778685, + "grad_norm": 0.10107421875, + "learning_rate": 0.0006047918943081356, + "loss": 0.5472, + "step": 49130 + }, + { + "epoch": 2.4406476606734877, + "grad_norm": 0.1337890625, + "learning_rate": 0.0006047521605244859, + "loss": 0.5707, + "step": 49140 + }, + { + "epoch": 2.441144332969107, + "grad_norm": 0.1162109375, + "learning_rate": 0.0006047124267408364, + "loss": 0.5915, + "step": 49150 + }, + { + "epoch": 2.441641005264726, + "grad_norm": 0.09912109375, + "learning_rate": 0.000604672692957187, + "loss": 0.5795, + "step": 49160 + }, + { + "epoch": 2.442137677560346, + "grad_norm": 0.1728515625, + "learning_rate": 0.0006046329591735373, + "loss": 0.5591, + "step": 49170 + }, + { + "epoch": 2.442634349855965, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006045932253898878, + "loss": 0.5705, + "step": 49180 + }, + { + "epoch": 2.4431310221515843, + "grad_norm": 0.12109375, + "learning_rate": 0.0006045534916062382, + "loss": 0.5731, + "step": 49190 + }, + { + "epoch": 2.443627694447204, + "grad_norm": 0.1396484375, + "learning_rate": 0.0006045137578225886, + "loss": 0.5761, + "step": 49200 + }, + { + "epoch": 2.444124366742823, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006044740240389392, + "loss": 0.5612, + "step": 49210 + }, + { + "epoch": 2.4446210390384424, + "grad_norm": 0.1591796875, + "learning_rate": 0.0006044342902552896, + "loss": 0.5573, + "step": 49220 + }, + { + "epoch": 2.4451177113340616, + "grad_norm": 0.126953125, + "learning_rate": 0.00060439455647164, + "loss": 0.5884, + "step": 49230 + }, + { + "epoch": 2.4456143836296813, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006043548226879905, + "loss": 0.5599, + "step": 49240 + }, + { + "epoch": 2.4461110559253005, + "grad_norm": 0.09375, + "learning_rate": 0.0006043150889043409, + "loss": 0.5611, + "step": 49250 + }, + { + "epoch": 2.4466077282209198, + "grad_norm": 0.10888671875, + "learning_rate": 0.0006042753551206914, + "loss": 0.5909, + "step": 49260 + }, + { + "epoch": 2.4471044005165394, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006042356213370419, + "loss": 0.5518, + "step": 49270 + }, + { + "epoch": 2.4476010728121587, + "grad_norm": 0.1005859375, + "learning_rate": 0.0006041958875533923, + "loss": 0.5707, + "step": 49280 + }, + { + "epoch": 2.448097745107778, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006041561537697428, + "loss": 0.5541, + "step": 49290 + }, + { + "epoch": 2.448594417403397, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006041164199860931, + "loss": 0.5475, + "step": 49300 + }, + { + "epoch": 2.4490910896990163, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006040766862024437, + "loss": 0.5545, + "step": 49310 + }, + { + "epoch": 2.449587761994636, + "grad_norm": 0.1630859375, + "learning_rate": 0.0006040369524187942, + "loss": 0.554, + "step": 49320 + }, + { + "epoch": 2.4500844342902552, + "grad_norm": 0.10546875, + "learning_rate": 0.0006039972186351445, + "loss": 0.553, + "step": 49330 + }, + { + "epoch": 2.4505811065858745, + "grad_norm": 0.09814453125, + "learning_rate": 0.000603957484851495, + "loss": 0.578, + "step": 49340 + }, + { + "epoch": 2.451077778881494, + "grad_norm": 0.099609375, + "learning_rate": 0.0006039177510678455, + "loss": 0.5772, + "step": 49350 + }, + { + "epoch": 2.4515744511771134, + "grad_norm": 0.099609375, + "learning_rate": 0.0006038780172841959, + "loss": 0.5496, + "step": 49360 + }, + { + "epoch": 2.4520711234727326, + "grad_norm": 0.0888671875, + "learning_rate": 0.0006038382835005464, + "loss": 0.5625, + "step": 49370 + }, + { + "epoch": 2.452567795768352, + "grad_norm": 0.109375, + "learning_rate": 0.0006037985497168968, + "loss": 0.5499, + "step": 49380 + }, + { + "epoch": 2.4530644680639715, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006037588159332472, + "loss": 0.589, + "step": 49390 + }, + { + "epoch": 2.4535611403595907, + "grad_norm": 0.0966796875, + "learning_rate": 0.0006037190821495977, + "loss": 0.5583, + "step": 49400 + }, + { + "epoch": 2.45405781265521, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006036793483659482, + "loss": 0.5581, + "step": 49410 + }, + { + "epoch": 2.4545544849508296, + "grad_norm": 0.103515625, + "learning_rate": 0.0006036396145822986, + "loss": 0.5714, + "step": 49420 + }, + { + "epoch": 2.455051157246449, + "grad_norm": 0.1171875, + "learning_rate": 0.0006035998807986491, + "loss": 0.5595, + "step": 49430 + }, + { + "epoch": 2.455547829542068, + "grad_norm": 0.1474609375, + "learning_rate": 0.0006035601470149995, + "loss": 0.5451, + "step": 49440 + }, + { + "epoch": 2.4560445018376873, + "grad_norm": 0.125, + "learning_rate": 0.00060352041323135, + "loss": 0.5666, + "step": 49450 + }, + { + "epoch": 2.456541174133307, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006034806794477005, + "loss": 0.6094, + "step": 49460 + }, + { + "epoch": 2.457037846428926, + "grad_norm": 0.125, + "learning_rate": 0.0006034409456640509, + "loss": 0.5374, + "step": 49470 + }, + { + "epoch": 2.4575345187245454, + "grad_norm": 0.11669921875, + "learning_rate": 0.0006034012118804014, + "loss": 0.5574, + "step": 49480 + }, + { + "epoch": 2.458031191020165, + "grad_norm": 0.146484375, + "learning_rate": 0.0006033614780967518, + "loss": 0.5301, + "step": 49490 + }, + { + "epoch": 2.4585278633157843, + "grad_norm": 0.12109375, + "learning_rate": 0.0006033217443131022, + "loss": 0.5728, + "step": 49500 + }, + { + "epoch": 2.4590245356114036, + "grad_norm": 0.1455078125, + "learning_rate": 0.0006032820105294528, + "loss": 0.5635, + "step": 49510 + }, + { + "epoch": 2.4595212079070228, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006032422767458031, + "loss": 0.5471, + "step": 49520 + }, + { + "epoch": 2.4600178802026424, + "grad_norm": 0.146484375, + "learning_rate": 0.0006032025429621536, + "loss": 0.5895, + "step": 49530 + }, + { + "epoch": 2.4605145524982617, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006031628091785041, + "loss": 0.5626, + "step": 49540 + }, + { + "epoch": 2.461011224793881, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006031230753948544, + "loss": 0.5692, + "step": 49550 + }, + { + "epoch": 2.4615078970895006, + "grad_norm": 0.1279296875, + "learning_rate": 0.000603083341611205, + "loss": 0.5512, + "step": 49560 + }, + { + "epoch": 2.46200456938512, + "grad_norm": 0.1328125, + "learning_rate": 0.0006030436078275554, + "loss": 0.5326, + "step": 49570 + }, + { + "epoch": 2.462501241680739, + "grad_norm": 0.11767578125, + "learning_rate": 0.0006030038740439059, + "loss": 0.5489, + "step": 49580 + }, + { + "epoch": 2.4629979139763583, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006029641402602563, + "loss": 0.5411, + "step": 49590 + }, + { + "epoch": 2.463494586271978, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006029244064766067, + "loss": 0.571, + "step": 49600 + }, + { + "epoch": 2.463991258567597, + "grad_norm": 0.10205078125, + "learning_rate": 0.0006028846726929573, + "loss": 0.5845, + "step": 49610 + }, + { + "epoch": 2.4644879308632164, + "grad_norm": 0.1318359375, + "learning_rate": 0.0006028449389093077, + "loss": 0.5634, + "step": 49620 + }, + { + "epoch": 2.4649846031588356, + "grad_norm": 0.11865234375, + "learning_rate": 0.0006028052051256581, + "loss": 0.5268, + "step": 49630 + }, + { + "epoch": 2.4654812754544553, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006027654713420086, + "loss": 0.5525, + "step": 49640 + }, + { + "epoch": 2.4659779477500745, + "grad_norm": 0.12451171875, + "learning_rate": 0.000602725737558359, + "loss": 0.5734, + "step": 49650 + }, + { + "epoch": 2.4664746200456937, + "grad_norm": 0.115234375, + "learning_rate": 0.0006026860037747094, + "loss": 0.581, + "step": 49660 + }, + { + "epoch": 2.466971292341313, + "grad_norm": 0.10302734375, + "learning_rate": 0.00060264626999106, + "loss": 0.5424, + "step": 49670 + }, + { + "epoch": 2.4674679646369326, + "grad_norm": 0.109375, + "learning_rate": 0.0006026065362074104, + "loss": 0.5855, + "step": 49680 + }, + { + "epoch": 2.467964636932552, + "grad_norm": 0.109375, + "learning_rate": 0.0006025668024237608, + "loss": 0.5456, + "step": 49690 + }, + { + "epoch": 2.468461309228171, + "grad_norm": 0.1708984375, + "learning_rate": 0.0006025270686401113, + "loss": 0.5672, + "step": 49700 + }, + { + "epoch": 2.4689579815237908, + "grad_norm": 0.19921875, + "learning_rate": 0.0006024873348564616, + "loss": 0.5702, + "step": 49710 + }, + { + "epoch": 2.46945465381941, + "grad_norm": 0.09716796875, + "learning_rate": 0.0006024476010728122, + "loss": 0.5975, + "step": 49720 + }, + { + "epoch": 2.469951326115029, + "grad_norm": 0.109375, + "learning_rate": 0.0006024078672891627, + "loss": 0.5456, + "step": 49730 + }, + { + "epoch": 2.4704479984106484, + "grad_norm": 0.126953125, + "learning_rate": 0.0006023681335055131, + "loss": 0.5342, + "step": 49740 + }, + { + "epoch": 2.470944670706268, + "grad_norm": 0.115234375, + "learning_rate": 0.0006023283997218635, + "loss": 0.5671, + "step": 49750 + }, + { + "epoch": 2.4714413430018873, + "grad_norm": 0.1064453125, + "learning_rate": 0.0006022886659382141, + "loss": 0.5535, + "step": 49760 + }, + { + "epoch": 2.4719380152975066, + "grad_norm": 0.134765625, + "learning_rate": 0.0006022489321545645, + "loss": 0.5873, + "step": 49770 + }, + { + "epoch": 2.4724346875931262, + "grad_norm": 0.09375, + "learning_rate": 0.0006022091983709149, + "loss": 0.5657, + "step": 49780 + }, + { + "epoch": 2.4729313598887455, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006021694645872653, + "loss": 0.5449, + "step": 49790 + }, + { + "epoch": 2.4734280321843647, + "grad_norm": 0.1162109375, + "learning_rate": 0.0006021297308036158, + "loss": 0.5494, + "step": 49800 + }, + { + "epoch": 2.473924704479984, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006020899970199662, + "loss": 0.5704, + "step": 49810 + }, + { + "epoch": 2.4744213767756036, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006020502632363167, + "loss": 0.5526, + "step": 49820 + }, + { + "epoch": 2.474918049071223, + "grad_norm": 0.09130859375, + "learning_rate": 0.0006020105294526672, + "loss": 0.5606, + "step": 49830 + }, + { + "epoch": 2.475414721366842, + "grad_norm": 0.1142578125, + "learning_rate": 0.0006019707956690176, + "loss": 0.588, + "step": 49840 + }, + { + "epoch": 2.4759113936624617, + "grad_norm": 0.1181640625, + "learning_rate": 0.000601931061885368, + "loss": 0.5424, + "step": 49850 + }, + { + "epoch": 2.476408065958081, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006018913281017186, + "loss": 0.5562, + "step": 49860 + }, + { + "epoch": 2.4769047382537, + "grad_norm": 0.1376953125, + "learning_rate": 0.000601851594318069, + "loss": 0.545, + "step": 49870 + }, + { + "epoch": 2.4774014105493194, + "grad_norm": 0.220703125, + "learning_rate": 0.0006018118605344194, + "loss": 0.5647, + "step": 49880 + }, + { + "epoch": 2.477898082844939, + "grad_norm": 0.1220703125, + "learning_rate": 0.0006017721267507699, + "loss": 0.5427, + "step": 49890 + }, + { + "epoch": 2.4783947551405583, + "grad_norm": 0.08837890625, + "learning_rate": 0.0006017323929671203, + "loss": 0.5381, + "step": 49900 + }, + { + "epoch": 2.4788914274361775, + "grad_norm": 0.138671875, + "learning_rate": 0.0006016926591834707, + "loss": 0.555, + "step": 49910 + }, + { + "epoch": 2.479388099731797, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006016529253998213, + "loss": 0.5809, + "step": 49920 + }, + { + "epoch": 2.4798847720274164, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006016131916161717, + "loss": 0.5339, + "step": 49930 + }, + { + "epoch": 2.4803814443230356, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006015734578325221, + "loss": 0.5717, + "step": 49940 + }, + { + "epoch": 2.480878116618655, + "grad_norm": 0.10693359375, + "learning_rate": 0.0006015337240488726, + "loss": 0.5455, + "step": 49950 + }, + { + "epoch": 2.4813747889142745, + "grad_norm": 0.11962890625, + "learning_rate": 0.000601493990265223, + "loss": 0.5582, + "step": 49960 + }, + { + "epoch": 2.4818714612098938, + "grad_norm": 0.162109375, + "learning_rate": 0.0006014542564815735, + "loss": 0.588, + "step": 49970 + }, + { + "epoch": 2.482368133505513, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006014145226979239, + "loss": 0.5391, + "step": 49980 + }, + { + "epoch": 2.4828648058011322, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006013747889142744, + "loss": 0.5665, + "step": 49990 + }, + { + "epoch": 2.483361478096752, + "grad_norm": 0.12158203125, + "learning_rate": 0.0006013350551306248, + "loss": 0.5499, + "step": 50000 + }, + { + "epoch": 2.483858150392371, + "grad_norm": 0.1376953125, + "learning_rate": 0.0006012953213469752, + "loss": 0.5816, + "step": 50010 + }, + { + "epoch": 2.4843548226879903, + "grad_norm": 0.095703125, + "learning_rate": 0.0006012555875633258, + "loss": 0.5779, + "step": 50020 + }, + { + "epoch": 2.4848514949836096, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006012158537796763, + "loss": 0.526, + "step": 50030 + }, + { + "epoch": 2.4853481672792292, + "grad_norm": 0.1416015625, + "learning_rate": 0.0006011761199960266, + "loss": 0.5614, + "step": 50040 + }, + { + "epoch": 2.4858448395748485, + "grad_norm": 0.0986328125, + "learning_rate": 0.0006011363862123771, + "loss": 0.5772, + "step": 50050 + }, + { + "epoch": 2.4863415118704677, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006010966524287275, + "loss": 0.5888, + "step": 50060 + }, + { + "epoch": 2.4868381841660874, + "grad_norm": 0.09521484375, + "learning_rate": 0.000601056918645078, + "loss": 0.5385, + "step": 50070 + }, + { + "epoch": 2.4873348564617066, + "grad_norm": 0.10498046875, + "learning_rate": 0.0006010171848614285, + "loss": 0.6059, + "step": 50080 + }, + { + "epoch": 2.487831528757326, + "grad_norm": 0.11962890625, + "learning_rate": 0.0006009774510777789, + "loss": 0.5604, + "step": 50090 + }, + { + "epoch": 2.488328201052945, + "grad_norm": 0.10302734375, + "learning_rate": 0.0006009377172941293, + "loss": 0.5387, + "step": 50100 + }, + { + "epoch": 2.4888248733485647, + "grad_norm": 0.12890625, + "learning_rate": 0.0006008979835104798, + "loss": 0.5603, + "step": 50110 + }, + { + "epoch": 2.489321545644184, + "grad_norm": 0.1494140625, + "learning_rate": 0.0006008582497268303, + "loss": 0.5659, + "step": 50120 + }, + { + "epoch": 2.489818217939803, + "grad_norm": 0.1279296875, + "learning_rate": 0.0006008185159431807, + "loss": 0.5607, + "step": 50130 + }, + { + "epoch": 2.490314890235423, + "grad_norm": 0.09912109375, + "learning_rate": 0.0006007787821595312, + "loss": 0.5308, + "step": 50140 + }, + { + "epoch": 2.490811562531042, + "grad_norm": 0.150390625, + "learning_rate": 0.0006007390483758816, + "loss": 0.5595, + "step": 50150 + }, + { + "epoch": 2.4913082348266613, + "grad_norm": 0.12060546875, + "learning_rate": 0.000600699314592232, + "loss": 0.5742, + "step": 50160 + }, + { + "epoch": 2.4918049071222805, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006006595808085825, + "loss": 0.5578, + "step": 50170 + }, + { + "epoch": 2.4923015794179, + "grad_norm": 0.09130859375, + "learning_rate": 0.000600619847024933, + "loss": 0.5511, + "step": 50180 + }, + { + "epoch": 2.4927982517135194, + "grad_norm": 0.1748046875, + "learning_rate": 0.0006005801132412835, + "loss": 0.5548, + "step": 50190 + }, + { + "epoch": 2.4932949240091387, + "grad_norm": 0.10546875, + "learning_rate": 0.0006005403794576338, + "loss": 0.5776, + "step": 50200 + }, + { + "epoch": 2.4937915963047583, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006005006456739843, + "loss": 0.5829, + "step": 50210 + }, + { + "epoch": 2.4942882686003776, + "grad_norm": 0.11279296875, + "learning_rate": 0.0006004609118903349, + "loss": 0.5768, + "step": 50220 + }, + { + "epoch": 2.494784940895997, + "grad_norm": 0.15234375, + "learning_rate": 0.0006004211781066852, + "loss": 0.5617, + "step": 50230 + }, + { + "epoch": 2.495281613191616, + "grad_norm": 0.16796875, + "learning_rate": 0.0006003814443230357, + "loss": 0.5578, + "step": 50240 + }, + { + "epoch": 2.4957782854872357, + "grad_norm": 0.1259765625, + "learning_rate": 0.0006003417105393861, + "loss": 0.5624, + "step": 50250 + }, + { + "epoch": 2.496274957782855, + "grad_norm": 0.1044921875, + "learning_rate": 0.0006003019767557365, + "loss": 0.5755, + "step": 50260 + }, + { + "epoch": 2.496771630078474, + "grad_norm": 0.10986328125, + "learning_rate": 0.0006002622429720871, + "loss": 0.5617, + "step": 50270 + }, + { + "epoch": 2.497268302374094, + "grad_norm": 0.22265625, + "learning_rate": 0.0006002225091884375, + "loss": 0.538, + "step": 50280 + }, + { + "epoch": 2.497764974669713, + "grad_norm": 0.1103515625, + "learning_rate": 0.0006001827754047879, + "loss": 0.5541, + "step": 50290 + }, + { + "epoch": 2.4982616469653323, + "grad_norm": 0.1162109375, + "learning_rate": 0.0006001430416211384, + "loss": 0.5606, + "step": 50300 + }, + { + "epoch": 2.4987583192609515, + "grad_norm": 0.1123046875, + "learning_rate": 0.0006001033078374888, + "loss": 0.5533, + "step": 50310 + }, + { + "epoch": 2.499254991556571, + "grad_norm": 0.1015625, + "learning_rate": 0.0006000635740538393, + "loss": 0.5404, + "step": 50320 + }, + { + "epoch": 2.4997516638521904, + "grad_norm": 0.095703125, + "learning_rate": 0.0006000238402701898, + "loss": 0.5438, + "step": 50330 + }, + { + "epoch": 2.5002483361478096, + "grad_norm": 0.126953125, + "learning_rate": 0.0005999841064865402, + "loss": 0.556, + "step": 50340 + }, + { + "epoch": 2.5007450084434293, + "grad_norm": 0.125, + "learning_rate": 0.0005999443727028907, + "loss": 0.589, + "step": 50350 + }, + { + "epoch": 2.5012416807390485, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005999046389192411, + "loss": 0.5908, + "step": 50360 + }, + { + "epoch": 2.5017383530346677, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005998649051355916, + "loss": 0.5258, + "step": 50370 + }, + { + "epoch": 2.502235025330287, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005998251713519421, + "loss": 0.5685, + "step": 50380 + }, + { + "epoch": 2.502731697625906, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005997854375682924, + "loss": 0.5652, + "step": 50390 + }, + { + "epoch": 2.503228369921526, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005997457037846429, + "loss": 0.5472, + "step": 50400 + }, + { + "epoch": 2.503725042217145, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005997059700009934, + "loss": 0.5512, + "step": 50410 + }, + { + "epoch": 2.5042217145127643, + "grad_norm": 0.095703125, + "learning_rate": 0.0005996662362173438, + "loss": 0.5654, + "step": 50420 + }, + { + "epoch": 2.504718386808384, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005996265024336943, + "loss": 0.5999, + "step": 50430 + }, + { + "epoch": 2.505215059104003, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005995867686500447, + "loss": 0.555, + "step": 50440 + }, + { + "epoch": 2.5057117313996224, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005995470348663951, + "loss": 0.5657, + "step": 50450 + }, + { + "epoch": 2.5062084036952417, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005995073010827456, + "loss": 0.5458, + "step": 50460 + }, + { + "epoch": 2.5067050759908613, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005994675672990961, + "loss": 0.565, + "step": 50470 + }, + { + "epoch": 2.5072017482864806, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005994278335154466, + "loss": 0.566, + "step": 50480 + }, + { + "epoch": 2.5076984205821, + "grad_norm": 0.13671875, + "learning_rate": 0.000599388099731797, + "loss": 0.5603, + "step": 50490 + }, + { + "epoch": 2.5081950928777195, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005993483659481474, + "loss": 0.544, + "step": 50500 + }, + { + "epoch": 2.5086917651733387, + "grad_norm": 0.150390625, + "learning_rate": 0.0005993086321644979, + "loss": 0.5736, + "step": 50510 + }, + { + "epoch": 2.509188437468958, + "grad_norm": 0.126953125, + "learning_rate": 0.0005992688983808484, + "loss": 0.5321, + "step": 50520 + }, + { + "epoch": 2.509685109764577, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005992291645971988, + "loss": 0.5619, + "step": 50530 + }, + { + "epoch": 2.510181782060197, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005991894308135493, + "loss": 0.5561, + "step": 50540 + }, + { + "epoch": 2.510678454355816, + "grad_norm": 0.15234375, + "learning_rate": 0.0005991496970298997, + "loss": 0.5537, + "step": 50550 + }, + { + "epoch": 2.5111751266514353, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005991099632462501, + "loss": 0.5722, + "step": 50560 + }, + { + "epoch": 2.511671798947055, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005990702294626007, + "loss": 0.5708, + "step": 50570 + }, + { + "epoch": 2.512168471242674, + "grad_norm": 0.11376953125, + "learning_rate": 0.000599030495678951, + "loss": 0.5803, + "step": 50580 + }, + { + "epoch": 2.5126651435382934, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005989907618953015, + "loss": 0.5337, + "step": 50590 + }, + { + "epoch": 2.5131618158339126, + "grad_norm": 0.09521484375, + "learning_rate": 0.000598951028111652, + "loss": 0.5547, + "step": 50600 + }, + { + "epoch": 2.5136584881295323, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005989112943280023, + "loss": 0.5613, + "step": 50610 + }, + { + "epoch": 2.5141551604251515, + "grad_norm": 0.115234375, + "learning_rate": 0.0005988715605443529, + "loss": 0.5465, + "step": 50620 + }, + { + "epoch": 2.5146518327207708, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005988318267607034, + "loss": 0.5655, + "step": 50630 + }, + { + "epoch": 2.5151485050163904, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005987920929770538, + "loss": 0.5754, + "step": 50640 + }, + { + "epoch": 2.5156451773120097, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005987523591934042, + "loss": 0.5651, + "step": 50650 + }, + { + "epoch": 2.516141849607629, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005987126254097546, + "loss": 0.5524, + "step": 50660 + }, + { + "epoch": 2.516638521903248, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005986728916261052, + "loss": 0.5469, + "step": 50670 + }, + { + "epoch": 2.5171351941988673, + "grad_norm": 0.119140625, + "learning_rate": 0.0005986331578424556, + "loss": 0.5544, + "step": 50680 + }, + { + "epoch": 2.517631866494487, + "grad_norm": 0.09765625, + "learning_rate": 0.000598593424058806, + "loss": 0.5957, + "step": 50690 + }, + { + "epoch": 2.5181285387901062, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005985536902751565, + "loss": 0.5565, + "step": 50700 + }, + { + "epoch": 2.518625211085726, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005985139564915069, + "loss": 0.574, + "step": 50710 + }, + { + "epoch": 2.519121883381345, + "grad_norm": 0.154296875, + "learning_rate": 0.0005984742227078574, + "loss": 0.5723, + "step": 50720 + }, + { + "epoch": 2.5196185556769644, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005984344889242079, + "loss": 0.5467, + "step": 50730 + }, + { + "epoch": 2.5201152279725836, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005983947551405583, + "loss": 0.5421, + "step": 50740 + }, + { + "epoch": 2.520611900268203, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005983550213569087, + "loss": 0.5228, + "step": 50750 + }, + { + "epoch": 2.5211085725638225, + "grad_norm": 0.103515625, + "learning_rate": 0.0005983152875732592, + "loss": 0.5707, + "step": 50760 + }, + { + "epoch": 2.5216052448594417, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005982755537896097, + "loss": 0.5645, + "step": 50770 + }, + { + "epoch": 2.522101917155061, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005982358200059601, + "loss": 0.563, + "step": 50780 + }, + { + "epoch": 2.5225985894506806, + "grad_norm": 0.140625, + "learning_rate": 0.0005981960862223106, + "loss": 0.5781, + "step": 50790 + }, + { + "epoch": 2.5230952617463, + "grad_norm": 0.1396484375, + "learning_rate": 0.000598156352438661, + "loss": 0.5269, + "step": 50800 + }, + { + "epoch": 2.523591934041919, + "grad_norm": 0.126953125, + "learning_rate": 0.0005981166186550114, + "loss": 0.55, + "step": 50810 + }, + { + "epoch": 2.5240886063375383, + "grad_norm": 0.1083984375, + "learning_rate": 0.000598076884871362, + "loss": 0.5676, + "step": 50820 + }, + { + "epoch": 2.524585278633158, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005980371510877124, + "loss": 0.5254, + "step": 50830 + }, + { + "epoch": 2.525081950928777, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005979974173040628, + "loss": 0.5673, + "step": 50840 + }, + { + "epoch": 2.5255786232243964, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005979576835204132, + "loss": 0.5848, + "step": 50850 + }, + { + "epoch": 2.526075295520016, + "grad_norm": 0.177734375, + "learning_rate": 0.0005979179497367637, + "loss": 0.5343, + "step": 50860 + }, + { + "epoch": 2.5265719678156353, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005978782159531142, + "loss": 0.5898, + "step": 50870 + }, + { + "epoch": 2.5270686401112545, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005978384821694646, + "loss": 0.5488, + "step": 50880 + }, + { + "epoch": 2.5275653124068738, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005977987483858151, + "loss": 0.5552, + "step": 50890 + }, + { + "epoch": 2.5280619847024934, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005977590146021655, + "loss": 0.5616, + "step": 50900 + }, + { + "epoch": 2.5285586569981127, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005977192808185159, + "loss": 0.5752, + "step": 50910 + }, + { + "epoch": 2.529055329293732, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005976795470348665, + "loss": 0.5545, + "step": 50920 + }, + { + "epoch": 2.5295520015893516, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005976398132512169, + "loss": 0.5562, + "step": 50930 + }, + { + "epoch": 2.530048673884971, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005976000794675673, + "loss": 0.56, + "step": 50940 + }, + { + "epoch": 2.53054534618059, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005975603456839178, + "loss": 0.5614, + "step": 50950 + }, + { + "epoch": 2.5310420184762092, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005975206119002682, + "loss": 0.5647, + "step": 50960 + }, + { + "epoch": 2.5315386907718285, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005974808781166186, + "loss": 0.5495, + "step": 50970 + }, + { + "epoch": 2.532035363067448, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005974411443329692, + "loss": 0.5878, + "step": 50980 + }, + { + "epoch": 2.5325320353630674, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005974014105493196, + "loss": 0.5465, + "step": 50990 + }, + { + "epoch": 2.533028707658687, + "grad_norm": 0.11083984375, + "learning_rate": 0.00059736167676567, + "loss": 0.5815, + "step": 51000 + }, + { + "epoch": 2.5335253799543063, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005973219429820205, + "loss": 0.5638, + "step": 51010 + }, + { + "epoch": 2.5340220522499255, + "grad_norm": 0.11865234375, + "learning_rate": 0.000597282209198371, + "loss": 0.5659, + "step": 51020 + }, + { + "epoch": 2.5345187245455447, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005972424754147214, + "loss": 0.5737, + "step": 51030 + }, + { + "epoch": 2.535015396841164, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005972027416310718, + "loss": 0.5632, + "step": 51040 + }, + { + "epoch": 2.5355120691367836, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005971630078474223, + "loss": 0.568, + "step": 51050 + }, + { + "epoch": 2.536008741432403, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005971232740637727, + "loss": 0.5503, + "step": 51060 + }, + { + "epoch": 2.5365054137280225, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005970835402801231, + "loss": 0.5756, + "step": 51070 + }, + { + "epoch": 2.5370020860236417, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005970438064964737, + "loss": 0.5414, + "step": 51080 + }, + { + "epoch": 2.537498758319261, + "grad_norm": 0.173828125, + "learning_rate": 0.0005970040727128242, + "loss": 0.5714, + "step": 51090 + }, + { + "epoch": 2.53799543061488, + "grad_norm": 0.103515625, + "learning_rate": 0.0005969643389291745, + "loss": 0.5618, + "step": 51100 + }, + { + "epoch": 2.5384921029104994, + "grad_norm": 0.12158203125, + "learning_rate": 0.000596924605145525, + "loss": 0.551, + "step": 51110 + }, + { + "epoch": 2.538988775206119, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005968848713618754, + "loss": 0.5455, + "step": 51120 + }, + { + "epoch": 2.5394854475017383, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005968451375782259, + "loss": 0.5642, + "step": 51130 + }, + { + "epoch": 2.5399821197973576, + "grad_norm": 0.150390625, + "learning_rate": 0.0005968054037945764, + "loss": 0.5395, + "step": 51140 + }, + { + "epoch": 2.5404787920929772, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005967656700109268, + "loss": 0.5503, + "step": 51150 + }, + { + "epoch": 2.5409754643885964, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005967259362272772, + "loss": 0.5227, + "step": 51160 + }, + { + "epoch": 2.5414721366842157, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005966862024436278, + "loss": 0.5776, + "step": 51170 + }, + { + "epoch": 2.541968808979835, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005966464686599782, + "loss": 0.5362, + "step": 51180 + }, + { + "epoch": 2.5424654812754546, + "grad_norm": 0.12109375, + "learning_rate": 0.0005966067348763286, + "loss": 0.5623, + "step": 51190 + }, + { + "epoch": 2.542962153571074, + "grad_norm": 0.091796875, + "learning_rate": 0.0005965670010926791, + "loss": 0.5747, + "step": 51200 + }, + { + "epoch": 2.543458825866693, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005965272673090295, + "loss": 0.5386, + "step": 51210 + }, + { + "epoch": 2.5439554981623127, + "grad_norm": 0.09814453125, + "learning_rate": 0.00059648753352538, + "loss": 0.5459, + "step": 51220 + }, + { + "epoch": 2.544452170457932, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005964477997417305, + "loss": 0.5909, + "step": 51230 + }, + { + "epoch": 2.544948842753551, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005964080659580809, + "loss": 0.5465, + "step": 51240 + }, + { + "epoch": 2.5454455150491704, + "grad_norm": 0.1328125, + "learning_rate": 0.0005963683321744314, + "loss": 0.5811, + "step": 51250 + }, + { + "epoch": 2.54594218734479, + "grad_norm": 0.109375, + "learning_rate": 0.0005963285983907817, + "loss": 0.5755, + "step": 51260 + }, + { + "epoch": 2.5464388596404093, + "grad_norm": 0.142578125, + "learning_rate": 0.0005962888646071322, + "loss": 0.5701, + "step": 51270 + }, + { + "epoch": 2.5469355319360285, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005962491308234828, + "loss": 0.5748, + "step": 51280 + }, + { + "epoch": 2.547432204231648, + "grad_norm": 0.10546875, + "learning_rate": 0.0005962093970398331, + "loss": 0.5806, + "step": 51290 + }, + { + "epoch": 2.5479288765272674, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005961696632561836, + "loss": 0.5596, + "step": 51300 + }, + { + "epoch": 2.5484255488228866, + "grad_norm": 0.103515625, + "learning_rate": 0.000596129929472534, + "loss": 0.5824, + "step": 51310 + }, + { + "epoch": 2.548922221118506, + "grad_norm": 0.14453125, + "learning_rate": 0.0005960901956888844, + "loss": 0.5651, + "step": 51320 + }, + { + "epoch": 2.549418893414125, + "grad_norm": 0.193359375, + "learning_rate": 0.000596050461905235, + "loss": 0.5594, + "step": 51330 + }, + { + "epoch": 2.5499155657097448, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005960107281215854, + "loss": 0.5409, + "step": 51340 + }, + { + "epoch": 2.550412238005364, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005959709943379358, + "loss": 0.5479, + "step": 51350 + }, + { + "epoch": 2.5509089103009837, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005959312605542863, + "loss": 0.5584, + "step": 51360 + }, + { + "epoch": 2.551405582596603, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005958915267706367, + "loss": 0.568, + "step": 51370 + }, + { + "epoch": 2.551902254892222, + "grad_norm": 0.123046875, + "learning_rate": 0.0005958517929869873, + "loss": 0.5619, + "step": 51380 + }, + { + "epoch": 2.5523989271878413, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005958120592033377, + "loss": 0.5573, + "step": 51390 + }, + { + "epoch": 2.5528955994834606, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005957723254196881, + "loss": 0.561, + "step": 51400 + }, + { + "epoch": 2.5533922717790802, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005957325916360386, + "loss": 0.5522, + "step": 51410 + }, + { + "epoch": 2.5538889440746995, + "grad_norm": 0.10400390625, + "learning_rate": 0.000595692857852389, + "loss": 0.5504, + "step": 51420 + }, + { + "epoch": 2.554385616370319, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005956531240687395, + "loss": 0.5398, + "step": 51430 + }, + { + "epoch": 2.5548822886659384, + "grad_norm": 0.10302734375, + "learning_rate": 0.00059561339028509, + "loss": 0.5599, + "step": 51440 + }, + { + "epoch": 2.5553789609615576, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005955736565014403, + "loss": 0.5635, + "step": 51450 + }, + { + "epoch": 2.555875633257177, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005955339227177908, + "loss": 0.5605, + "step": 51460 + }, + { + "epoch": 2.556372305552796, + "grad_norm": 0.109375, + "learning_rate": 0.0005954941889341414, + "loss": 0.5695, + "step": 51470 + }, + { + "epoch": 2.5568689778484157, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005954544551504917, + "loss": 0.576, + "step": 51480 + }, + { + "epoch": 2.557365650144035, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005954147213668422, + "loss": 0.5342, + "step": 51490 + }, + { + "epoch": 2.557862322439654, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005953749875831927, + "loss": 0.5262, + "step": 51500 + }, + { + "epoch": 2.558358994735274, + "grad_norm": 0.09765625, + "learning_rate": 0.000595335253799543, + "loss": 0.5752, + "step": 51510 + }, + { + "epoch": 2.558855667030893, + "grad_norm": 0.0859375, + "learning_rate": 0.0005952955200158935, + "loss": 0.5404, + "step": 51520 + }, + { + "epoch": 2.5593523393265123, + "grad_norm": 0.10546875, + "learning_rate": 0.000595255786232244, + "loss": 0.583, + "step": 51530 + }, + { + "epoch": 2.5598490116221315, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005952160524485945, + "loss": 0.5767, + "step": 51540 + }, + { + "epoch": 2.560345683917751, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005951763186649449, + "loss": 0.5468, + "step": 51550 + }, + { + "epoch": 2.5608423562133704, + "grad_norm": 0.109375, + "learning_rate": 0.0005951365848812953, + "loss": 0.5662, + "step": 51560 + }, + { + "epoch": 2.5613390285089896, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005950968510976458, + "loss": 0.5559, + "step": 51570 + }, + { + "epoch": 2.5618357008046093, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005950571173139963, + "loss": 0.5359, + "step": 51580 + }, + { + "epoch": 2.5623323731002285, + "grad_norm": 0.14453125, + "learning_rate": 0.0005950173835303467, + "loss": 0.5524, + "step": 51590 + }, + { + "epoch": 2.5628290453958478, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005949776497466972, + "loss": 0.5748, + "step": 51600 + }, + { + "epoch": 2.563325717691467, + "grad_norm": 0.134765625, + "learning_rate": 0.0005949379159630476, + "loss": 0.589, + "step": 51610 + }, + { + "epoch": 2.5638223899870867, + "grad_norm": 0.12060546875, + "learning_rate": 0.000594898182179398, + "loss": 0.5767, + "step": 51620 + }, + { + "epoch": 2.564319062282706, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005948584483957486, + "loss": 0.5685, + "step": 51630 + }, + { + "epoch": 2.564815734578325, + "grad_norm": 0.09375, + "learning_rate": 0.0005948187146120989, + "loss": 0.5551, + "step": 51640 + }, + { + "epoch": 2.565312406873945, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005947789808284494, + "loss": 0.5697, + "step": 51650 + }, + { + "epoch": 2.565809079169564, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005947392470447999, + "loss": 0.5754, + "step": 51660 + }, + { + "epoch": 2.5663057514651832, + "grad_norm": 0.1875, + "learning_rate": 0.0005946995132611503, + "loss": 0.5471, + "step": 51670 + }, + { + "epoch": 2.5668024237608025, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005946597794775008, + "loss": 0.5898, + "step": 51680 + }, + { + "epoch": 2.5672990960564217, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005946200456938513, + "loss": 0.5595, + "step": 51690 + }, + { + "epoch": 2.5677957683520414, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005945803119102017, + "loss": 0.5411, + "step": 51700 + }, + { + "epoch": 2.5682924406476606, + "grad_norm": 0.10546875, + "learning_rate": 0.0005945405781265521, + "loss": 0.5632, + "step": 51710 + }, + { + "epoch": 2.5687891129432803, + "grad_norm": 0.091796875, + "learning_rate": 0.0005945008443429025, + "loss": 0.5642, + "step": 51720 + }, + { + "epoch": 2.5692857852388995, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005944611105592531, + "loss": 0.5499, + "step": 51730 + }, + { + "epoch": 2.5697824575345187, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005944213767756035, + "loss": 0.5764, + "step": 51740 + }, + { + "epoch": 2.570279129830138, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005943816429919539, + "loss": 0.53, + "step": 51750 + }, + { + "epoch": 2.570775802125757, + "grad_norm": 0.11328125, + "learning_rate": 0.0005943419092083044, + "loss": 0.6109, + "step": 51760 + }, + { + "epoch": 2.571272474421377, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005943021754246548, + "loss": 0.5751, + "step": 51770 + }, + { + "epoch": 2.571769146716996, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005942624416410053, + "loss": 0.5906, + "step": 51780 + }, + { + "epoch": 2.5722658190126158, + "grad_norm": 0.15625, + "learning_rate": 0.0005942227078573558, + "loss": 0.5624, + "step": 51790 + }, + { + "epoch": 2.572762491308235, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005941829740737062, + "loss": 0.5324, + "step": 51800 + }, + { + "epoch": 2.573259163603854, + "grad_norm": 0.09375, + "learning_rate": 0.0005941432402900566, + "loss": 0.5542, + "step": 51810 + }, + { + "epoch": 2.5737558358994734, + "grad_norm": 0.08984375, + "learning_rate": 0.0005941035065064071, + "loss": 0.5472, + "step": 51820 + }, + { + "epoch": 2.5742525081950927, + "grad_norm": 0.12109375, + "learning_rate": 0.0005940637727227576, + "loss": 0.5527, + "step": 51830 + }, + { + "epoch": 2.5747491804907123, + "grad_norm": 0.1025390625, + "learning_rate": 0.000594024038939108, + "loss": 0.5812, + "step": 51840 + }, + { + "epoch": 2.5752458527863316, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005939843051554585, + "loss": 0.5717, + "step": 51850 + }, + { + "epoch": 2.575742525081951, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005939445713718089, + "loss": 0.5504, + "step": 51860 + }, + { + "epoch": 2.5762391973775705, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005939048375881593, + "loss": 0.5747, + "step": 51870 + }, + { + "epoch": 2.5767358696731897, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005938651038045099, + "loss": 0.5129, + "step": 51880 + }, + { + "epoch": 2.577232541968809, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005938253700208603, + "loss": 0.5677, + "step": 51890 + }, + { + "epoch": 2.577729214264428, + "grad_norm": 0.169921875, + "learning_rate": 0.0005937856362372107, + "loss": 0.5708, + "step": 51900 + }, + { + "epoch": 2.578225886560048, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005937459024535611, + "loss": 0.5443, + "step": 51910 + }, + { + "epoch": 2.578722558855667, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005937061686699116, + "loss": 0.5586, + "step": 51920 + }, + { + "epoch": 2.5792192311512863, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005936664348862621, + "loss": 0.5766, + "step": 51930 + }, + { + "epoch": 2.579715903446906, + "grad_norm": 0.12890625, + "learning_rate": 0.0005936267011026125, + "loss": 0.579, + "step": 51940 + }, + { + "epoch": 2.580212575742525, + "grad_norm": 0.095703125, + "learning_rate": 0.000593586967318963, + "loss": 0.5805, + "step": 51950 + }, + { + "epoch": 2.5807092480381444, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005935472335353134, + "loss": 0.5551, + "step": 51960 + }, + { + "epoch": 2.5812059203337636, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005935074997516638, + "loss": 0.5564, + "step": 51970 + }, + { + "epoch": 2.5817025926293833, + "grad_norm": 0.12890625, + "learning_rate": 0.0005934677659680144, + "loss": 0.5701, + "step": 51980 + }, + { + "epoch": 2.5821992649250025, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005934280321843648, + "loss": 0.5799, + "step": 51990 + }, + { + "epoch": 2.5826959372206217, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005933882984007152, + "loss": 0.5812, + "step": 52000 + }, + { + "epoch": 2.5831926095162414, + "grad_norm": 0.154296875, + "learning_rate": 0.0005933485646170657, + "loss": 0.5707, + "step": 52010 + }, + { + "epoch": 2.5836892818118606, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005933088308334161, + "loss": 0.5802, + "step": 52020 + }, + { + "epoch": 2.58418595410748, + "grad_norm": 0.115234375, + "learning_rate": 0.0005932690970497666, + "loss": 0.5655, + "step": 52030 + }, + { + "epoch": 2.584682626403099, + "grad_norm": 0.130859375, + "learning_rate": 0.0005932293632661171, + "loss": 0.5621, + "step": 52040 + }, + { + "epoch": 2.5851792986987183, + "grad_norm": 0.140625, + "learning_rate": 0.0005931896294824675, + "loss": 0.5851, + "step": 52050 + }, + { + "epoch": 2.585675970994338, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005931498956988179, + "loss": 0.5652, + "step": 52060 + }, + { + "epoch": 2.586172643289957, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005931101619151684, + "loss": 0.5645, + "step": 52070 + }, + { + "epoch": 2.586669315585577, + "grad_norm": 0.107421875, + "learning_rate": 0.0005930704281315189, + "loss": 0.5482, + "step": 52080 + }, + { + "epoch": 2.587165987881196, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005930306943478693, + "loss": 0.5524, + "step": 52090 + }, + { + "epoch": 2.5876626601768153, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005929909605642197, + "loss": 0.5543, + "step": 52100 + }, + { + "epoch": 2.5881593324724346, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005929512267805702, + "loss": 0.5657, + "step": 52110 + }, + { + "epoch": 2.588656004768054, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005929114929969207, + "loss": 0.5542, + "step": 52120 + }, + { + "epoch": 2.5891526770636735, + "grad_norm": 0.12890625, + "learning_rate": 0.000592871759213271, + "loss": 0.571, + "step": 52130 + }, + { + "epoch": 2.5896493493592927, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005928320254296216, + "loss": 0.5935, + "step": 52140 + }, + { + "epoch": 2.590146021654912, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005927922916459721, + "loss": 0.5556, + "step": 52150 + }, + { + "epoch": 2.5906426939505316, + "grad_norm": 0.107421875, + "learning_rate": 0.0005927525578623224, + "loss": 0.5633, + "step": 52160 + }, + { + "epoch": 2.591139366246151, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005927128240786729, + "loss": 0.5721, + "step": 52170 + }, + { + "epoch": 2.59163603854177, + "grad_norm": 0.150390625, + "learning_rate": 0.0005926730902950234, + "loss": 0.5669, + "step": 52180 + }, + { + "epoch": 2.5921327108373893, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005926333565113738, + "loss": 0.5699, + "step": 52190 + }, + { + "epoch": 2.592629383133009, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005925936227277243, + "loss": 0.5165, + "step": 52200 + }, + { + "epoch": 2.593126055428628, + "grad_norm": 0.119140625, + "learning_rate": 0.0005925538889440747, + "loss": 0.5432, + "step": 52210 + }, + { + "epoch": 2.5936227277242474, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005925141551604251, + "loss": 0.5869, + "step": 52220 + }, + { + "epoch": 2.594119400019867, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005924744213767757, + "loss": 0.5908, + "step": 52230 + }, + { + "epoch": 2.5946160723154863, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005924346875931261, + "loss": 0.5426, + "step": 52240 + }, + { + "epoch": 2.5951127446111055, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005923949538094765, + "loss": 0.5939, + "step": 52250 + }, + { + "epoch": 2.5956094169067248, + "grad_norm": 0.09716796875, + "learning_rate": 0.000592355220025827, + "loss": 0.5533, + "step": 52260 + }, + { + "epoch": 2.5961060892023444, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005923154862421774, + "loss": 0.5361, + "step": 52270 + }, + { + "epoch": 2.5966027614979637, + "grad_norm": 0.1484375, + "learning_rate": 0.000592275752458528, + "loss": 0.5498, + "step": 52280 + }, + { + "epoch": 2.597099433793583, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005922360186748784, + "loss": 0.5545, + "step": 52290 + }, + { + "epoch": 2.5975961060892026, + "grad_norm": 0.154296875, + "learning_rate": 0.0005921962848912288, + "loss": 0.5817, + "step": 52300 + }, + { + "epoch": 2.5980927783848218, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005921565511075793, + "loss": 0.5931, + "step": 52310 + }, + { + "epoch": 2.598589450680441, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005921168173239296, + "loss": 0.5344, + "step": 52320 + }, + { + "epoch": 2.5990861229760602, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005920770835402802, + "loss": 0.5631, + "step": 52330 + }, + { + "epoch": 2.59958279527168, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005920373497566307, + "loss": 0.5554, + "step": 52340 + }, + { + "epoch": 2.600079467567299, + "grad_norm": 0.1484375, + "learning_rate": 0.000591997615972981, + "loss": 0.5733, + "step": 52350 + }, + { + "epoch": 2.6005761398629184, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005919578821893315, + "loss": 0.5849, + "step": 52360 + }, + { + "epoch": 2.601072812158538, + "grad_norm": 0.13671875, + "learning_rate": 0.0005919181484056819, + "loss": 0.5489, + "step": 52370 + }, + { + "epoch": 2.6015694844541573, + "grad_norm": 0.087890625, + "learning_rate": 0.0005918784146220323, + "loss": 0.5192, + "step": 52380 + }, + { + "epoch": 2.6020661567497765, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005918386808383829, + "loss": 0.5245, + "step": 52390 + }, + { + "epoch": 2.6025628290453957, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005917989470547333, + "loss": 0.5205, + "step": 52400 + }, + { + "epoch": 2.603059501341015, + "grad_norm": 0.103515625, + "learning_rate": 0.0005917592132710837, + "loss": 0.5696, + "step": 52410 + }, + { + "epoch": 2.6035561736366346, + "grad_norm": 0.126953125, + "learning_rate": 0.0005917194794874342, + "loss": 0.5737, + "step": 52420 + }, + { + "epoch": 2.604052845932254, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005916797457037846, + "loss": 0.5474, + "step": 52430 + }, + { + "epoch": 2.6045495182278735, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005916400119201352, + "loss": 0.5651, + "step": 52440 + }, + { + "epoch": 2.6050461905234927, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005916002781364856, + "loss": 0.5621, + "step": 52450 + }, + { + "epoch": 2.605542862819112, + "grad_norm": 0.1484375, + "learning_rate": 0.000591560544352836, + "loss": 0.5554, + "step": 52460 + }, + { + "epoch": 2.606039535114731, + "grad_norm": 0.130859375, + "learning_rate": 0.0005915208105691865, + "loss": 0.5747, + "step": 52470 + }, + { + "epoch": 2.6065362074103504, + "grad_norm": 0.10693359375, + "learning_rate": 0.000591481076785537, + "loss": 0.5467, + "step": 52480 + }, + { + "epoch": 2.60703287970597, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005914413430018874, + "loss": 0.5722, + "step": 52490 + }, + { + "epoch": 2.6075295520015893, + "grad_norm": 0.1640625, + "learning_rate": 0.0005914016092182379, + "loss": 0.5646, + "step": 52500 + }, + { + "epoch": 2.6080262242972085, + "grad_norm": 0.111328125, + "learning_rate": 0.0005913618754345882, + "loss": 0.6004, + "step": 52510 + }, + { + "epoch": 2.608522896592828, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005913221416509387, + "loss": 0.5948, + "step": 52520 + }, + { + "epoch": 2.6090195688884474, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005912824078672893, + "loss": 0.5654, + "step": 52530 + }, + { + "epoch": 2.6095162411840667, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005912426740836396, + "loss": 0.5787, + "step": 52540 + }, + { + "epoch": 2.610012913479686, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005912029402999901, + "loss": 0.5808, + "step": 52550 + }, + { + "epoch": 2.6105095857753056, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005911632065163406, + "loss": 0.5478, + "step": 52560 + }, + { + "epoch": 2.611006258070925, + "grad_norm": 0.095703125, + "learning_rate": 0.000591123472732691, + "loss": 0.5723, + "step": 52570 + }, + { + "epoch": 2.611502930366544, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005910837389490414, + "loss": 0.541, + "step": 52580 + }, + { + "epoch": 2.6119996026621637, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005910440051653919, + "loss": 0.5594, + "step": 52590 + }, + { + "epoch": 2.612496274957783, + "grad_norm": 0.125, + "learning_rate": 0.0005910042713817424, + "loss": 0.5633, + "step": 52600 + }, + { + "epoch": 2.612992947253402, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005909645375980928, + "loss": 0.5304, + "step": 52610 + }, + { + "epoch": 2.6134896195490214, + "grad_norm": 0.111328125, + "learning_rate": 0.0005909248038144432, + "loss": 0.5781, + "step": 52620 + }, + { + "epoch": 2.613986291844641, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005908850700307938, + "loss": 0.525, + "step": 52630 + }, + { + "epoch": 2.6144829641402603, + "grad_norm": 0.1171875, + "learning_rate": 0.0005908453362471442, + "loss": 0.551, + "step": 52640 + }, + { + "epoch": 2.6149796364358795, + "grad_norm": 0.146484375, + "learning_rate": 0.0005908056024634946, + "loss": 0.5546, + "step": 52650 + }, + { + "epoch": 2.615476308731499, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005907658686798451, + "loss": 0.5552, + "step": 52660 + }, + { + "epoch": 2.6159729810271184, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005907261348961955, + "loss": 0.5338, + "step": 52670 + }, + { + "epoch": 2.6164696533227376, + "grad_norm": 0.146484375, + "learning_rate": 0.0005906864011125459, + "loss": 0.5431, + "step": 52680 + }, + { + "epoch": 2.616966325618357, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005906466673288965, + "loss": 0.5516, + "step": 52690 + }, + { + "epoch": 2.6174629979139765, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005906069335452468, + "loss": 0.5614, + "step": 52700 + }, + { + "epoch": 2.6179596702095957, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005905671997615973, + "loss": 0.5881, + "step": 52710 + }, + { + "epoch": 2.618456342505215, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005905274659779478, + "loss": 0.5574, + "step": 52720 + }, + { + "epoch": 2.6189530148008346, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005904877321942982, + "loss": 0.5601, + "step": 52730 + }, + { + "epoch": 2.619449687096454, + "grad_norm": 0.150390625, + "learning_rate": 0.0005904479984106487, + "loss": 0.5525, + "step": 52740 + }, + { + "epoch": 2.619946359392073, + "grad_norm": 0.12109375, + "learning_rate": 0.0005904082646269992, + "loss": 0.5587, + "step": 52750 + }, + { + "epoch": 2.6204430316876923, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005903685308433496, + "loss": 0.5613, + "step": 52760 + }, + { + "epoch": 2.6209397039833116, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005903287970597, + "loss": 0.53, + "step": 52770 + }, + { + "epoch": 2.6214363762789312, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005902890632760504, + "loss": 0.571, + "step": 52780 + }, + { + "epoch": 2.6219330485745505, + "grad_norm": 0.126953125, + "learning_rate": 0.000590249329492401, + "loss": 0.5459, + "step": 52790 + }, + { + "epoch": 2.62242972087017, + "grad_norm": 0.095703125, + "learning_rate": 0.0005902095957087514, + "loss": 0.5785, + "step": 52800 + }, + { + "epoch": 2.6229263931657893, + "grad_norm": 0.10546875, + "learning_rate": 0.0005901698619251018, + "loss": 0.5447, + "step": 52810 + }, + { + "epoch": 2.6234230654614086, + "grad_norm": 0.103515625, + "learning_rate": 0.0005901301281414523, + "loss": 0.5534, + "step": 52820 + }, + { + "epoch": 2.623919737757028, + "grad_norm": 0.111328125, + "learning_rate": 0.0005900903943578027, + "loss": 0.5597, + "step": 52830 + }, + { + "epoch": 2.624416410052647, + "grad_norm": 0.22265625, + "learning_rate": 0.0005900506605741532, + "loss": 0.5841, + "step": 52840 + }, + { + "epoch": 2.6249130823482667, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005900109267905037, + "loss": 0.5437, + "step": 52850 + }, + { + "epoch": 2.625409754643886, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005899711930068541, + "loss": 0.5652, + "step": 52860 + }, + { + "epoch": 2.625906426939505, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005899314592232045, + "loss": 0.563, + "step": 52870 + }, + { + "epoch": 2.626403099235125, + "grad_norm": 0.1357421875, + "learning_rate": 0.000589891725439555, + "loss": 0.5704, + "step": 52880 + }, + { + "epoch": 2.626899771530744, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005898519916559055, + "loss": 0.5478, + "step": 52890 + }, + { + "epoch": 2.6273964438263633, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005898122578722559, + "loss": 0.5751, + "step": 52900 + }, + { + "epoch": 2.6278931161219825, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005897725240886064, + "loss": 0.5358, + "step": 52910 + }, + { + "epoch": 2.628389788417602, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005897327903049568, + "loss": 0.5574, + "step": 52920 + }, + { + "epoch": 2.6288864607132214, + "grad_norm": 0.19140625, + "learning_rate": 0.0005896930565213072, + "loss": 0.5408, + "step": 52930 + }, + { + "epoch": 2.6293831330088406, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005896533227376578, + "loss": 0.5505, + "step": 52940 + }, + { + "epoch": 2.6298798053044603, + "grad_norm": 0.1640625, + "learning_rate": 0.0005896135889540082, + "loss": 0.5553, + "step": 52950 + }, + { + "epoch": 2.6303764776000795, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005895738551703586, + "loss": 0.5837, + "step": 52960 + }, + { + "epoch": 2.6308731498956988, + "grad_norm": 0.125, + "learning_rate": 0.000589534121386709, + "loss": 0.5853, + "step": 52970 + }, + { + "epoch": 2.631369822191318, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005894943876030595, + "loss": 0.5668, + "step": 52980 + }, + { + "epoch": 2.6318664944869377, + "grad_norm": 0.10498046875, + "learning_rate": 0.00058945465381941, + "loss": 0.5911, + "step": 52990 + }, + { + "epoch": 2.632363166782557, + "grad_norm": 0.115234375, + "learning_rate": 0.0005894149200357604, + "loss": 0.5381, + "step": 53000 + }, + { + "epoch": 2.632859839078176, + "grad_norm": 0.09375, + "learning_rate": 0.0005893751862521109, + "loss": 0.543, + "step": 53010 + }, + { + "epoch": 2.633356511373796, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005893354524684614, + "loss": 0.5565, + "step": 53020 + }, + { + "epoch": 2.633853183669415, + "grad_norm": 0.1328125, + "learning_rate": 0.0005892957186848117, + "loss": 0.5544, + "step": 53030 + }, + { + "epoch": 2.6343498559650342, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005892559849011623, + "loss": 0.5533, + "step": 53040 + }, + { + "epoch": 2.6348465282606535, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005892162511175127, + "loss": 0.5711, + "step": 53050 + }, + { + "epoch": 2.635343200556273, + "grad_norm": 0.103515625, + "learning_rate": 0.0005891765173338631, + "loss": 0.5413, + "step": 53060 + }, + { + "epoch": 2.6358398728518924, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005891367835502136, + "loss": 0.5541, + "step": 53070 + }, + { + "epoch": 2.6363365451475116, + "grad_norm": 0.0927734375, + "learning_rate": 0.000589097049766564, + "loss": 0.6096, + "step": 53080 + }, + { + "epoch": 2.6368332174431313, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005890573159829145, + "loss": 0.5926, + "step": 53090 + }, + { + "epoch": 2.6373298897387505, + "grad_norm": 0.1494140625, + "learning_rate": 0.000589017582199265, + "loss": 0.5598, + "step": 53100 + }, + { + "epoch": 2.6378265620343697, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005889778484156154, + "loss": 0.5967, + "step": 53110 + }, + { + "epoch": 2.638323234329989, + "grad_norm": 0.111328125, + "learning_rate": 0.0005889381146319658, + "loss": 0.5706, + "step": 53120 + }, + { + "epoch": 2.638819906625608, + "grad_norm": 0.125, + "learning_rate": 0.0005888983808483163, + "loss": 0.5699, + "step": 53130 + }, + { + "epoch": 2.639316578921228, + "grad_norm": 0.21484375, + "learning_rate": 0.0005888586470646668, + "loss": 0.5778, + "step": 53140 + }, + { + "epoch": 2.639813251216847, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005888189132810172, + "loss": 0.5371, + "step": 53150 + }, + { + "epoch": 2.6403099235124667, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005887791794973677, + "loss": 0.5268, + "step": 53160 + }, + { + "epoch": 2.640806595808086, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005887394457137181, + "loss": 0.5741, + "step": 53170 + }, + { + "epoch": 2.641303268103705, + "grad_norm": 0.09765625, + "learning_rate": 0.0005886997119300686, + "loss": 0.5917, + "step": 53180 + }, + { + "epoch": 2.6417999403993244, + "grad_norm": 0.150390625, + "learning_rate": 0.000588659978146419, + "loss": 0.5463, + "step": 53190 + }, + { + "epoch": 2.6422966126949436, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005886202443627695, + "loss": 0.5711, + "step": 53200 + }, + { + "epoch": 2.6427932849905633, + "grad_norm": 0.11328125, + "learning_rate": 0.00058858051057912, + "loss": 0.5452, + "step": 53210 + }, + { + "epoch": 2.6432899572861825, + "grad_norm": 0.16796875, + "learning_rate": 0.0005885407767954703, + "loss": 0.5595, + "step": 53220 + }, + { + "epoch": 2.6437866295818018, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005885010430118208, + "loss": 0.5474, + "step": 53230 + }, + { + "epoch": 2.6442833018774214, + "grad_norm": 0.134765625, + "learning_rate": 0.0005884613092281713, + "loss": 0.5801, + "step": 53240 + }, + { + "epoch": 2.6447799741730407, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005884215754445217, + "loss": 0.5728, + "step": 53250 + }, + { + "epoch": 2.64527664646866, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005883818416608722, + "loss": 0.5775, + "step": 53260 + }, + { + "epoch": 2.645773318764279, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005883421078772226, + "loss": 0.5507, + "step": 53270 + }, + { + "epoch": 2.646269991059899, + "grad_norm": 0.09033203125, + "learning_rate": 0.000588302374093573, + "loss": 0.5794, + "step": 53280 + }, + { + "epoch": 2.646766663355518, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005882626403099236, + "loss": 0.5759, + "step": 53290 + }, + { + "epoch": 2.6472633356511373, + "grad_norm": 0.11865234375, + "learning_rate": 0.000588222906526274, + "loss": 0.5717, + "step": 53300 + }, + { + "epoch": 2.647760007946757, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005881831727426245, + "loss": 0.5721, + "step": 53310 + }, + { + "epoch": 2.648256680242376, + "grad_norm": 0.095703125, + "learning_rate": 0.0005881434389589749, + "loss": 0.5629, + "step": 53320 + }, + { + "epoch": 2.6487533525379954, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005881037051753253, + "loss": 0.5881, + "step": 53330 + }, + { + "epoch": 2.6492500248336146, + "grad_norm": 0.11328125, + "learning_rate": 0.0005880639713916759, + "loss": 0.5568, + "step": 53340 + }, + { + "epoch": 2.6497466971292343, + "grad_norm": 0.123046875, + "learning_rate": 0.0005880242376080263, + "loss": 0.5692, + "step": 53350 + }, + { + "epoch": 2.6502433694248535, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005879845038243767, + "loss": 0.5416, + "step": 53360 + }, + { + "epoch": 2.6507400417204727, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005879447700407272, + "loss": 0.5644, + "step": 53370 + }, + { + "epoch": 2.6512367140160924, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005879050362570775, + "loss": 0.5573, + "step": 53380 + }, + { + "epoch": 2.6517333863117116, + "grad_norm": 0.1328125, + "learning_rate": 0.0005878653024734281, + "loss": 0.5575, + "step": 53390 + }, + { + "epoch": 2.652230058607331, + "grad_norm": 0.134765625, + "learning_rate": 0.0005878255686897786, + "loss": 0.5463, + "step": 53400 + }, + { + "epoch": 2.65272673090295, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005877858349061289, + "loss": 0.5543, + "step": 53410 + }, + { + "epoch": 2.6532234031985693, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005877461011224794, + "loss": 0.5464, + "step": 53420 + }, + { + "epoch": 2.653720075494189, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005877063673388299, + "loss": 0.5422, + "step": 53430 + }, + { + "epoch": 2.654216747789808, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005876666335551802, + "loss": 0.5569, + "step": 53440 + }, + { + "epoch": 2.654713420085428, + "grad_norm": 0.103515625, + "learning_rate": 0.0005876268997715308, + "loss": 0.5385, + "step": 53450 + }, + { + "epoch": 2.655210092381047, + "grad_norm": 0.1640625, + "learning_rate": 0.0005875871659878812, + "loss": 0.5348, + "step": 53460 + }, + { + "epoch": 2.6557067646766663, + "grad_norm": 0.09765625, + "learning_rate": 0.0005875474322042317, + "loss": 0.5475, + "step": 53470 + }, + { + "epoch": 2.6562034369722856, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005875076984205821, + "loss": 0.5525, + "step": 53480 + }, + { + "epoch": 2.656700109267905, + "grad_norm": 0.14453125, + "learning_rate": 0.0005874679646369326, + "loss": 0.5494, + "step": 53490 + }, + { + "epoch": 2.6571967815635245, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005874282308532831, + "loss": 0.5446, + "step": 53500 + }, + { + "epoch": 2.6576934538591437, + "grad_norm": 0.099609375, + "learning_rate": 0.0005873884970696335, + "loss": 0.5354, + "step": 53510 + }, + { + "epoch": 2.6581901261547634, + "grad_norm": 0.11328125, + "learning_rate": 0.0005873487632859839, + "loss": 0.5875, + "step": 53520 + }, + { + "epoch": 2.6586867984503826, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005873090295023344, + "loss": 0.5827, + "step": 53530 + }, + { + "epoch": 2.659183470746002, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005872692957186849, + "loss": 0.5811, + "step": 53540 + }, + { + "epoch": 2.659680143041621, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005872295619350353, + "loss": 0.5338, + "step": 53550 + }, + { + "epoch": 2.6601768153372403, + "grad_norm": 0.099609375, + "learning_rate": 0.0005871898281513858, + "loss": 0.5841, + "step": 53560 + }, + { + "epoch": 2.66067348763286, + "grad_norm": 0.111328125, + "learning_rate": 0.0005871500943677361, + "loss": 0.5524, + "step": 53570 + }, + { + "epoch": 2.661170159928479, + "grad_norm": 0.12109375, + "learning_rate": 0.0005871103605840866, + "loss": 0.5401, + "step": 53580 + }, + { + "epoch": 2.6616668322240984, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005870706268004372, + "loss": 0.5596, + "step": 53590 + }, + { + "epoch": 2.662163504519718, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005870308930167875, + "loss": 0.5294, + "step": 53600 + }, + { + "epoch": 2.6626601768153373, + "grad_norm": 0.09423828125, + "learning_rate": 0.000586991159233138, + "loss": 0.5505, + "step": 53610 + }, + { + "epoch": 2.6631568491109565, + "grad_norm": 0.103515625, + "learning_rate": 0.0005869514254494885, + "loss": 0.5808, + "step": 53620 + }, + { + "epoch": 2.6636535214065757, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005869116916658389, + "loss": 0.559, + "step": 53630 + }, + { + "epoch": 2.6641501937021954, + "grad_norm": 0.103515625, + "learning_rate": 0.0005868719578821894, + "loss": 0.5737, + "step": 53640 + }, + { + "epoch": 2.6646468659978146, + "grad_norm": 0.140625, + "learning_rate": 0.0005868322240985398, + "loss": 0.5706, + "step": 53650 + }, + { + "epoch": 2.665143538293434, + "grad_norm": 0.111328125, + "learning_rate": 0.0005867924903148903, + "loss": 0.5827, + "step": 53660 + }, + { + "epoch": 2.6656402105890535, + "grad_norm": 0.09765625, + "learning_rate": 0.0005867527565312407, + "loss": 0.5591, + "step": 53670 + }, + { + "epoch": 2.6661368828846728, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005867130227475911, + "loss": 0.5814, + "step": 53680 + }, + { + "epoch": 2.666633555180292, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005866732889639417, + "loss": 0.5339, + "step": 53690 + }, + { + "epoch": 2.667130227475911, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005866335551802921, + "loss": 0.5854, + "step": 53700 + }, + { + "epoch": 2.667626899771531, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005865938213966425, + "loss": 0.5793, + "step": 53710 + }, + { + "epoch": 2.66812357206715, + "grad_norm": 0.10400390625, + "learning_rate": 0.000586554087612993, + "loss": 0.5616, + "step": 53720 + }, + { + "epoch": 2.6686202443627693, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005865143538293434, + "loss": 0.5383, + "step": 53730 + }, + { + "epoch": 2.669116916658389, + "grad_norm": 0.125, + "learning_rate": 0.0005864746200456938, + "loss": 0.5441, + "step": 53740 + }, + { + "epoch": 2.6696135889540082, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005864348862620444, + "loss": 0.554, + "step": 53750 + }, + { + "epoch": 2.6701102612496275, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005863951524783948, + "loss": 0.5615, + "step": 53760 + }, + { + "epoch": 2.6706069335452467, + "grad_norm": 0.158203125, + "learning_rate": 0.0005863554186947452, + "loss": 0.5313, + "step": 53770 + }, + { + "epoch": 2.671103605840866, + "grad_norm": 0.1171875, + "learning_rate": 0.0005863156849110957, + "loss": 0.5787, + "step": 53780 + }, + { + "epoch": 2.6716002781364856, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005862759511274462, + "loss": 0.5651, + "step": 53790 + }, + { + "epoch": 2.672096950432105, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005862362173437966, + "loss": 0.5787, + "step": 53800 + }, + { + "epoch": 2.6725936227277245, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005861964835601471, + "loss": 0.554, + "step": 53810 + }, + { + "epoch": 2.6730902950233437, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005861567497764975, + "loss": 0.5671, + "step": 53820 + }, + { + "epoch": 2.673586967318963, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005861170159928479, + "loss": 0.5346, + "step": 53830 + }, + { + "epoch": 2.674083639614582, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005860772822091983, + "loss": 0.5534, + "step": 53840 + }, + { + "epoch": 2.6745803119102014, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005860375484255489, + "loss": 0.5411, + "step": 53850 + }, + { + "epoch": 2.675076984205821, + "grad_norm": 0.1015625, + "learning_rate": 0.0005859978146418993, + "loss": 0.5475, + "step": 53860 + }, + { + "epoch": 2.6755736565014403, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005859580808582497, + "loss": 0.5515, + "step": 53870 + }, + { + "epoch": 2.67607032879706, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005859183470746002, + "loss": 0.575, + "step": 53880 + }, + { + "epoch": 2.676567001092679, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005858786132909506, + "loss": 0.531, + "step": 53890 + }, + { + "epoch": 2.6770636733882984, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005858388795073011, + "loss": 0.5366, + "step": 53900 + }, + { + "epoch": 2.6775603456839177, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005857991457236516, + "loss": 0.5634, + "step": 53910 + }, + { + "epoch": 2.678057017979537, + "grad_norm": 0.12353515625, + "learning_rate": 0.000585759411940002, + "loss": 0.542, + "step": 53920 + }, + { + "epoch": 2.6785536902751566, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005857196781563524, + "loss": 0.5662, + "step": 53930 + }, + { + "epoch": 2.679050362570776, + "grad_norm": 0.11572265625, + "learning_rate": 0.000585679944372703, + "loss": 0.5822, + "step": 53940 + }, + { + "epoch": 2.679547034866395, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005856402105890534, + "loss": 0.5319, + "step": 53950 + }, + { + "epoch": 2.6800437071620147, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005856004768054038, + "loss": 0.5512, + "step": 53960 + }, + { + "epoch": 2.680540379457634, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005855607430217543, + "loss": 0.5579, + "step": 53970 + }, + { + "epoch": 2.681037051753253, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005855210092381047, + "loss": 0.5863, + "step": 53980 + }, + { + "epoch": 2.6815337240488724, + "grad_norm": 0.142578125, + "learning_rate": 0.0005854812754544551, + "loss": 0.5027, + "step": 53990 + }, + { + "epoch": 2.682030396344492, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005854415416708057, + "loss": 0.5308, + "step": 54000 + }, + { + "epoch": 2.6825270686401113, + "grad_norm": 0.09375, + "learning_rate": 0.0005854018078871561, + "loss": 0.5463, + "step": 54010 + }, + { + "epoch": 2.6830237409357305, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005853620741035065, + "loss": 0.5609, + "step": 54020 + }, + { + "epoch": 2.68352041323135, + "grad_norm": 0.10400390625, + "learning_rate": 0.000585322340319857, + "loss": 0.5414, + "step": 54030 + }, + { + "epoch": 2.6840170855269694, + "grad_norm": 0.107421875, + "learning_rate": 0.0005852826065362074, + "loss": 0.556, + "step": 54040 + }, + { + "epoch": 2.6845137578225886, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005852428727525579, + "loss": 0.5399, + "step": 54050 + }, + { + "epoch": 2.685010430118208, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005852031389689083, + "loss": 0.5864, + "step": 54060 + }, + { + "epoch": 2.6855071024138275, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005851634051852588, + "loss": 0.5751, + "step": 54070 + }, + { + "epoch": 2.6860037747094467, + "grad_norm": 0.099609375, + "learning_rate": 0.0005851236714016093, + "loss": 0.5782, + "step": 54080 + }, + { + "epoch": 2.686500447005066, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005850839376179596, + "loss": 0.5686, + "step": 54090 + }, + { + "epoch": 2.6869971193006856, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005850442038343102, + "loss": 0.5474, + "step": 54100 + }, + { + "epoch": 2.687493791596305, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005850044700506606, + "loss": 0.5418, + "step": 54110 + }, + { + "epoch": 2.687990463891924, + "grad_norm": 0.0947265625, + "learning_rate": 0.000584964736267011, + "loss": 0.541, + "step": 54120 + }, + { + "epoch": 2.6884871361875433, + "grad_norm": 0.12890625, + "learning_rate": 0.0005849250024833615, + "loss": 0.5756, + "step": 54130 + }, + { + "epoch": 2.6889838084831625, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005848852686997119, + "loss": 0.5503, + "step": 54140 + }, + { + "epoch": 2.689480480778782, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005848455349160624, + "loss": 0.551, + "step": 54150 + }, + { + "epoch": 2.6899771530744014, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005848058011324129, + "loss": 0.5432, + "step": 54160 + }, + { + "epoch": 2.690473825370021, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005847660673487633, + "loss": 0.5528, + "step": 54170 + }, + { + "epoch": 2.6909704976656403, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005847263335651137, + "loss": 0.5809, + "step": 54180 + }, + { + "epoch": 2.6914671699612596, + "grad_norm": 0.103515625, + "learning_rate": 0.0005846865997814642, + "loss": 0.5633, + "step": 54190 + }, + { + "epoch": 2.691963842256879, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005846468659978147, + "loss": 0.5667, + "step": 54200 + }, + { + "epoch": 2.692460514552498, + "grad_norm": 0.099609375, + "learning_rate": 0.0005846071322141652, + "loss": 0.5571, + "step": 54210 + }, + { + "epoch": 2.6929571868481177, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005845673984305156, + "loss": 0.5512, + "step": 54220 + }, + { + "epoch": 2.693453859143737, + "grad_norm": 0.103515625, + "learning_rate": 0.000584527664646866, + "loss": 0.5544, + "step": 54230 + }, + { + "epoch": 2.6939505314393566, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005844879308632165, + "loss": 0.5588, + "step": 54240 + }, + { + "epoch": 2.694447203734976, + "grad_norm": 0.298828125, + "learning_rate": 0.0005844481970795669, + "loss": 0.5605, + "step": 54250 + }, + { + "epoch": 2.694943876030595, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005844084632959174, + "loss": 0.5892, + "step": 54260 + }, + { + "epoch": 2.6954405483262143, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005843687295122679, + "loss": 0.5496, + "step": 54270 + }, + { + "epoch": 2.6959372206218335, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005843289957286182, + "loss": 0.5694, + "step": 54280 + }, + { + "epoch": 2.696433892917453, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005842892619449687, + "loss": 0.5944, + "step": 54290 + }, + { + "epoch": 2.6969305652130724, + "grad_norm": 0.158203125, + "learning_rate": 0.0005842495281613193, + "loss": 0.5402, + "step": 54300 + }, + { + "epoch": 2.6974272375086916, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005842097943776696, + "loss": 0.5805, + "step": 54310 + }, + { + "epoch": 2.6979239098043113, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005841700605940201, + "loss": 0.5464, + "step": 54320 + }, + { + "epoch": 2.6984205820999305, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005841303268103705, + "loss": 0.5548, + "step": 54330 + }, + { + "epoch": 2.6989172543955497, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005840905930267209, + "loss": 0.5549, + "step": 54340 + }, + { + "epoch": 2.699413926691169, + "grad_norm": 0.109375, + "learning_rate": 0.0005840508592430715, + "loss": 0.5741, + "step": 54350 + }, + { + "epoch": 2.6999105989867886, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005840111254594219, + "loss": 0.582, + "step": 54360 + }, + { + "epoch": 2.700407271282408, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005839713916757724, + "loss": 0.5709, + "step": 54370 + }, + { + "epoch": 2.700903943578027, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005839316578921228, + "loss": 0.5393, + "step": 54380 + }, + { + "epoch": 2.7014006158736468, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005838919241084732, + "loss": 0.5663, + "step": 54390 + }, + { + "epoch": 2.701897288169266, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005838521903248238, + "loss": 0.5497, + "step": 54400 + }, + { + "epoch": 2.7023939604648852, + "grad_norm": 0.134765625, + "learning_rate": 0.0005838124565411742, + "loss": 0.5285, + "step": 54410 + }, + { + "epoch": 2.7028906327605045, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005837727227575246, + "loss": 0.5538, + "step": 54420 + }, + { + "epoch": 2.703387305056124, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005837329889738751, + "loss": 0.5732, + "step": 54430 + }, + { + "epoch": 2.7038839773517434, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005836932551902254, + "loss": 0.5686, + "step": 54440 + }, + { + "epoch": 2.7043806496473626, + "grad_norm": 0.0947265625, + "learning_rate": 0.000583653521406576, + "loss": 0.5449, + "step": 54450 + }, + { + "epoch": 2.7048773219429822, + "grad_norm": 0.107421875, + "learning_rate": 0.0005836137876229265, + "loss": 0.5793, + "step": 54460 + }, + { + "epoch": 2.7053739942386015, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005835740538392768, + "loss": 0.5356, + "step": 54470 + }, + { + "epoch": 2.7058706665342207, + "grad_norm": 0.125, + "learning_rate": 0.0005835343200556273, + "loss": 0.5417, + "step": 54480 + }, + { + "epoch": 2.70636733882984, + "grad_norm": 0.125, + "learning_rate": 0.0005834945862719778, + "loss": 0.5341, + "step": 54490 + }, + { + "epoch": 2.706864011125459, + "grad_norm": 0.130859375, + "learning_rate": 0.0005834548524883282, + "loss": 0.5406, + "step": 54500 + }, + { + "epoch": 2.707360683421079, + "grad_norm": 0.154296875, + "learning_rate": 0.0005834151187046787, + "loss": 0.586, + "step": 54510 + }, + { + "epoch": 2.707857355716698, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005833753849210291, + "loss": 0.5395, + "step": 54520 + }, + { + "epoch": 2.7083540280123177, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005833356511373796, + "loss": 0.5561, + "step": 54530 + }, + { + "epoch": 2.708850700307937, + "grad_norm": 0.103515625, + "learning_rate": 0.00058329591735373, + "loss": 0.5365, + "step": 54540 + }, + { + "epoch": 2.709347372603556, + "grad_norm": 0.115234375, + "learning_rate": 0.0005832561835700805, + "loss": 0.5619, + "step": 54550 + }, + { + "epoch": 2.7098440448991754, + "grad_norm": 0.1337890625, + "learning_rate": 0.000583216449786431, + "loss": 0.5714, + "step": 54560 + }, + { + "epoch": 2.7103407171947946, + "grad_norm": 0.091796875, + "learning_rate": 0.0005831767160027814, + "loss": 0.5312, + "step": 54570 + }, + { + "epoch": 2.7108373894904143, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005831369822191318, + "loss": 0.5659, + "step": 54580 + }, + { + "epoch": 2.7113340617860335, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005830972484354823, + "loss": 0.5533, + "step": 54590 + }, + { + "epoch": 2.7118307340816528, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005830575146518328, + "loss": 0.5633, + "step": 54600 + }, + { + "epoch": 2.7123274063772724, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005830177808681832, + "loss": 0.5577, + "step": 54610 + }, + { + "epoch": 2.7128240786728917, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005829780470845337, + "loss": 0.572, + "step": 54620 + }, + { + "epoch": 2.713320750968511, + "grad_norm": 0.10546875, + "learning_rate": 0.0005829383133008841, + "loss": 0.5672, + "step": 54630 + }, + { + "epoch": 2.71381742326413, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005828985795172345, + "loss": 0.5401, + "step": 54640 + }, + { + "epoch": 2.71431409555975, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005828588457335851, + "loss": 0.5604, + "step": 54650 + }, + { + "epoch": 2.714810767855369, + "grad_norm": 0.126953125, + "learning_rate": 0.0005828191119499355, + "loss": 0.5605, + "step": 54660 + }, + { + "epoch": 2.7153074401509882, + "grad_norm": 0.095703125, + "learning_rate": 0.0005827793781662859, + "loss": 0.5624, + "step": 54670 + }, + { + "epoch": 2.715804112446608, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005827396443826364, + "loss": 0.5725, + "step": 54680 + }, + { + "epoch": 2.716300784742227, + "grad_norm": 0.123046875, + "learning_rate": 0.0005826999105989868, + "loss": 0.5552, + "step": 54690 + }, + { + "epoch": 2.7167974570378464, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005826601768153373, + "loss": 0.5755, + "step": 54700 + }, + { + "epoch": 2.7172941293334656, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005826204430316877, + "loss": 0.5701, + "step": 54710 + }, + { + "epoch": 2.7177908016290853, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005825807092480382, + "loss": 0.5731, + "step": 54720 + }, + { + "epoch": 2.7182874739247045, + "grad_norm": 0.140625, + "learning_rate": 0.0005825409754643886, + "loss": 0.5941, + "step": 54730 + }, + { + "epoch": 2.7187841462203237, + "grad_norm": 0.1123046875, + "learning_rate": 0.000582501241680739, + "loss": 0.5641, + "step": 54740 + }, + { + "epoch": 2.7192808185159434, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005824615078970896, + "loss": 0.5417, + "step": 54750 + }, + { + "epoch": 2.7197774908115626, + "grad_norm": 0.1728515625, + "learning_rate": 0.00058242177411344, + "loss": 0.5744, + "step": 54760 + }, + { + "epoch": 2.720274163107182, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005823820403297904, + "loss": 0.571, + "step": 54770 + }, + { + "epoch": 2.720770835402801, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005823423065461409, + "loss": 0.5549, + "step": 54780 + }, + { + "epoch": 2.7212675076984207, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005823025727624913, + "loss": 0.5888, + "step": 54790 + }, + { + "epoch": 2.72176417999404, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005822628389788418, + "loss": 0.5551, + "step": 54800 + }, + { + "epoch": 2.722260852289659, + "grad_norm": 0.197265625, + "learning_rate": 0.0005822231051951923, + "loss": 0.5548, + "step": 54810 + }, + { + "epoch": 2.722757524585279, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005821833714115427, + "loss": 0.5363, + "step": 54820 + }, + { + "epoch": 2.723254196880898, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005821436376278931, + "loss": 0.5589, + "step": 54830 + }, + { + "epoch": 2.7237508691765173, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005821039038442436, + "loss": 0.5889, + "step": 54840 + }, + { + "epoch": 2.7242475414721365, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005820641700605941, + "loss": 0.5374, + "step": 54850 + }, + { + "epoch": 2.7247442137677558, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005820244362769445, + "loss": 0.5606, + "step": 54860 + }, + { + "epoch": 2.7252408860633754, + "grad_norm": 0.10302734375, + "learning_rate": 0.000581984702493295, + "loss": 0.5569, + "step": 54870 + }, + { + "epoch": 2.7257375583589947, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005819449687096454, + "loss": 0.5761, + "step": 54880 + }, + { + "epoch": 2.7262342306546143, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005819052349259958, + "loss": 0.5551, + "step": 54890 + }, + { + "epoch": 2.7267309029502336, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005818655011423464, + "loss": 0.5721, + "step": 54900 + }, + { + "epoch": 2.727227575245853, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005818257673586968, + "loss": 0.5725, + "step": 54910 + }, + { + "epoch": 2.727724247541472, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005817860335750472, + "loss": 0.5593, + "step": 54920 + }, + { + "epoch": 2.7282209198370913, + "grad_norm": 0.109375, + "learning_rate": 0.0005817462997913976, + "loss": 0.5551, + "step": 54930 + }, + { + "epoch": 2.728717592132711, + "grad_norm": 0.09765625, + "learning_rate": 0.0005817065660077481, + "loss": 0.5658, + "step": 54940 + }, + { + "epoch": 2.72921426442833, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005816668322240986, + "loss": 0.5853, + "step": 54950 + }, + { + "epoch": 2.7297109367239494, + "grad_norm": 0.154296875, + "learning_rate": 0.000581627098440449, + "loss": 0.5695, + "step": 54960 + }, + { + "epoch": 2.730207609019569, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005815873646567995, + "loss": 0.5309, + "step": 54970 + }, + { + "epoch": 2.7307042813151883, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005815476308731499, + "loss": 0.5484, + "step": 54980 + }, + { + "epoch": 2.7312009536108075, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005815078970895003, + "loss": 0.5488, + "step": 54990 + }, + { + "epoch": 2.7316976259064267, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005814681633058509, + "loss": 0.5306, + "step": 55000 + }, + { + "epoch": 2.7321942982020464, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005814284295222013, + "loss": 0.5647, + "step": 55010 + }, + { + "epoch": 2.7326909704976656, + "grad_norm": 0.111328125, + "learning_rate": 0.0005813886957385517, + "loss": 0.5721, + "step": 55020 + }, + { + "epoch": 2.733187642793285, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005813489619549022, + "loss": 0.5647, + "step": 55030 + }, + { + "epoch": 2.7336843150889045, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005813092281712526, + "loss": 0.5479, + "step": 55040 + }, + { + "epoch": 2.7341809873845238, + "grad_norm": 0.2578125, + "learning_rate": 0.000581269494387603, + "loss": 0.5689, + "step": 55050 + }, + { + "epoch": 2.734677659680143, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005812297606039536, + "loss": 0.535, + "step": 55060 + }, + { + "epoch": 2.735174331975762, + "grad_norm": 0.140625, + "learning_rate": 0.000581190026820304, + "loss": 0.5574, + "step": 55070 + }, + { + "epoch": 2.735671004271382, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005811502930366544, + "loss": 0.5488, + "step": 55080 + }, + { + "epoch": 2.736167676567001, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005811105592530049, + "loss": 0.5373, + "step": 55090 + }, + { + "epoch": 2.7366643488626203, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005810708254693554, + "loss": 0.5725, + "step": 55100 + }, + { + "epoch": 2.73716102115824, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005810310916857059, + "loss": 0.5674, + "step": 55110 + }, + { + "epoch": 2.7376576934538592, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005809913579020562, + "loss": 0.5623, + "step": 55120 + }, + { + "epoch": 2.7381543657494785, + "grad_norm": 0.15234375, + "learning_rate": 0.0005809516241184067, + "loss": 0.5248, + "step": 55130 + }, + { + "epoch": 2.7386510380450977, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005809118903347572, + "loss": 0.5304, + "step": 55140 + }, + { + "epoch": 2.7391477103407174, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005808721565511075, + "loss": 0.5724, + "step": 55150 + }, + { + "epoch": 2.7396443826363366, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005808324227674581, + "loss": 0.5655, + "step": 55160 + }, + { + "epoch": 2.740141054931956, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005807926889838085, + "loss": 0.5197, + "step": 55170 + }, + { + "epoch": 2.7406377272275755, + "grad_norm": 0.126953125, + "learning_rate": 0.0005807529552001589, + "loss": 0.551, + "step": 55180 + }, + { + "epoch": 2.7411343995231947, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005807132214165094, + "loss": 0.5737, + "step": 55190 + }, + { + "epoch": 2.741631071818814, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005806734876328598, + "loss": 0.5701, + "step": 55200 + }, + { + "epoch": 2.742127744114433, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005806337538492103, + "loss": 0.5167, + "step": 55210 + }, + { + "epoch": 2.7426244164100524, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005805940200655608, + "loss": 0.5687, + "step": 55220 + }, + { + "epoch": 2.743121088705672, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005805542862819112, + "loss": 0.5623, + "step": 55230 + }, + { + "epoch": 2.7436177610012913, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005805145524982616, + "loss": 0.5348, + "step": 55240 + }, + { + "epoch": 2.744114433296911, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005804748187146121, + "loss": 0.549, + "step": 55250 + }, + { + "epoch": 2.74461110559253, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005804350849309626, + "loss": 0.5745, + "step": 55260 + }, + { + "epoch": 2.7451077778881494, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005803953511473131, + "loss": 0.5593, + "step": 55270 + }, + { + "epoch": 2.7456044501837686, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005803556173636635, + "loss": 0.5544, + "step": 55280 + }, + { + "epoch": 2.746101122479388, + "grad_norm": 0.142578125, + "learning_rate": 0.0005803158835800139, + "loss": 0.5419, + "step": 55290 + }, + { + "epoch": 2.7465977947750075, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005802761497963645, + "loss": 0.6107, + "step": 55300 + }, + { + "epoch": 2.7470944670706268, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005802364160127148, + "loss": 0.5794, + "step": 55310 + }, + { + "epoch": 2.747591139366246, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005801966822290653, + "loss": 0.5429, + "step": 55320 + }, + { + "epoch": 2.7480878116618657, + "grad_norm": 0.125, + "learning_rate": 0.0005801569484454158, + "loss": 0.5616, + "step": 55330 + }, + { + "epoch": 2.748584483957485, + "grad_norm": 0.1640625, + "learning_rate": 0.0005801172146617661, + "loss": 0.5419, + "step": 55340 + }, + { + "epoch": 2.749081156253104, + "grad_norm": 0.140625, + "learning_rate": 0.0005800774808781166, + "loss": 0.5558, + "step": 55350 + }, + { + "epoch": 2.7495778285487233, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005800377470944672, + "loss": 0.5258, + "step": 55360 + }, + { + "epoch": 2.750074500844343, + "grad_norm": 0.09375, + "learning_rate": 0.0005799980133108175, + "loss": 0.5429, + "step": 55370 + }, + { + "epoch": 2.7505711731399622, + "grad_norm": 0.1298828125, + "learning_rate": 0.000579958279527168, + "loss": 0.5463, + "step": 55380 + }, + { + "epoch": 2.7510678454355815, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005799185457435184, + "loss": 0.5475, + "step": 55390 + }, + { + "epoch": 2.751564517731201, + "grad_norm": 0.130859375, + "learning_rate": 0.000579878811959869, + "loss": 0.5423, + "step": 55400 + }, + { + "epoch": 2.7520611900268204, + "grad_norm": 0.158203125, + "learning_rate": 0.0005798390781762194, + "loss": 0.5488, + "step": 55410 + }, + { + "epoch": 2.7525578623224396, + "grad_norm": 0.171875, + "learning_rate": 0.0005797993443925698, + "loss": 0.5434, + "step": 55420 + }, + { + "epoch": 2.753054534618059, + "grad_norm": 0.130859375, + "learning_rate": 0.0005797596106089203, + "loss": 0.5555, + "step": 55430 + }, + { + "epoch": 2.7535512069136785, + "grad_norm": 0.11328125, + "learning_rate": 0.0005797198768252707, + "loss": 0.5677, + "step": 55440 + }, + { + "epoch": 2.7540478792092977, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005796801430416211, + "loss": 0.5583, + "step": 55450 + }, + { + "epoch": 2.754544551504917, + "grad_norm": 0.111328125, + "learning_rate": 0.0005796404092579717, + "loss": 0.5538, + "step": 55460 + }, + { + "epoch": 2.7550412238005366, + "grad_norm": 0.091796875, + "learning_rate": 0.0005796006754743221, + "loss": 0.5444, + "step": 55470 + }, + { + "epoch": 2.755537896096156, + "grad_norm": 0.13671875, + "learning_rate": 0.0005795609416906725, + "loss": 0.5492, + "step": 55480 + }, + { + "epoch": 2.756034568391775, + "grad_norm": 0.1083984375, + "learning_rate": 0.000579521207907023, + "loss": 0.5969, + "step": 55490 + }, + { + "epoch": 2.7565312406873943, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005794814741233733, + "loss": 0.5599, + "step": 55500 + }, + { + "epoch": 2.757027912983014, + "grad_norm": 0.107421875, + "learning_rate": 0.0005794417403397239, + "loss": 0.5281, + "step": 55510 + }, + { + "epoch": 2.757524585278633, + "grad_norm": 0.10546875, + "learning_rate": 0.0005794020065560744, + "loss": 0.565, + "step": 55520 + }, + { + "epoch": 2.7580212575742524, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005793622727724247, + "loss": 0.5562, + "step": 55530 + }, + { + "epoch": 2.758517929869872, + "grad_norm": 0.169921875, + "learning_rate": 0.0005793225389887752, + "loss": 0.5884, + "step": 55540 + }, + { + "epoch": 2.7590146021654913, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005792828052051257, + "loss": 0.5439, + "step": 55550 + }, + { + "epoch": 2.7595112744611106, + "grad_norm": 0.107421875, + "learning_rate": 0.0005792430714214762, + "loss": 0.5555, + "step": 55560 + }, + { + "epoch": 2.76000794675673, + "grad_norm": 0.16015625, + "learning_rate": 0.0005792033376378266, + "loss": 0.5547, + "step": 55570 + }, + { + "epoch": 2.760504619052349, + "grad_norm": 0.1162109375, + "learning_rate": 0.000579163603854177, + "loss": 0.5592, + "step": 55580 + }, + { + "epoch": 2.7610012913479687, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005791238700705275, + "loss": 0.5555, + "step": 55590 + }, + { + "epoch": 2.761497963643588, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005790841362868779, + "loss": 0.5415, + "step": 55600 + }, + { + "epoch": 2.7619946359392076, + "grad_norm": 0.09765625, + "learning_rate": 0.0005790444025032284, + "loss": 0.5891, + "step": 55610 + }, + { + "epoch": 2.762491308234827, + "grad_norm": 0.103515625, + "learning_rate": 0.0005790046687195789, + "loss": 0.5656, + "step": 55620 + }, + { + "epoch": 2.762987980530446, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005789649349359293, + "loss": 0.5647, + "step": 55630 + }, + { + "epoch": 2.7634846528260653, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005789252011522797, + "loss": 0.5689, + "step": 55640 + }, + { + "epoch": 2.7639813251216845, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005788854673686302, + "loss": 0.5426, + "step": 55650 + }, + { + "epoch": 2.764477997417304, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005788457335849807, + "loss": 0.5417, + "step": 55660 + }, + { + "epoch": 2.7649746697129234, + "grad_norm": 0.169921875, + "learning_rate": 0.0005788059998013311, + "loss": 0.5529, + "step": 55670 + }, + { + "epoch": 2.7654713420085426, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005787662660176816, + "loss": 0.5507, + "step": 55680 + }, + { + "epoch": 2.7659680143041623, + "grad_norm": 0.11865234375, + "learning_rate": 0.000578726532234032, + "loss": 0.5881, + "step": 55690 + }, + { + "epoch": 2.7664646865997815, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005786867984503824, + "loss": 0.5548, + "step": 55700 + }, + { + "epoch": 2.7669613588954007, + "grad_norm": 0.21484375, + "learning_rate": 0.000578647064666733, + "loss": 0.5443, + "step": 55710 + }, + { + "epoch": 2.76745803119102, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005786073308830834, + "loss": 0.5477, + "step": 55720 + }, + { + "epoch": 2.7679547034866396, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005785675970994338, + "loss": 0.5643, + "step": 55730 + }, + { + "epoch": 2.768451375782259, + "grad_norm": 0.17578125, + "learning_rate": 0.0005785278633157843, + "loss": 0.5501, + "step": 55740 + }, + { + "epoch": 2.768948048077878, + "grad_norm": 0.09375, + "learning_rate": 0.0005784881295321347, + "loss": 0.5595, + "step": 55750 + }, + { + "epoch": 2.7694447203734978, + "grad_norm": 0.11328125, + "learning_rate": 0.0005784483957484852, + "loss": 0.5723, + "step": 55760 + }, + { + "epoch": 2.769941392669117, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005784086619648356, + "loss": 0.5765, + "step": 55770 + }, + { + "epoch": 2.770438064964736, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005783689281811861, + "loss": 0.5515, + "step": 55780 + }, + { + "epoch": 2.7709347372603554, + "grad_norm": 0.1328125, + "learning_rate": 0.0005783291943975365, + "loss": 0.5348, + "step": 55790 + }, + { + "epoch": 2.771431409555975, + "grad_norm": 0.115234375, + "learning_rate": 0.0005782894606138869, + "loss": 0.5507, + "step": 55800 + }, + { + "epoch": 2.7719280818515943, + "grad_norm": 0.099609375, + "learning_rate": 0.0005782497268302375, + "loss": 0.5576, + "step": 55810 + }, + { + "epoch": 2.7724247541472136, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005782099930465879, + "loss": 0.5549, + "step": 55820 + }, + { + "epoch": 2.7729214264428332, + "grad_norm": 0.09375, + "learning_rate": 0.0005781702592629383, + "loss": 0.5687, + "step": 55830 + }, + { + "epoch": 2.7734180987384525, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005781305254792888, + "loss": 0.5809, + "step": 55840 + }, + { + "epoch": 2.7739147710340717, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005780907916956392, + "loss": 0.5701, + "step": 55850 + }, + { + "epoch": 2.774411443329691, + "grad_norm": 0.2890625, + "learning_rate": 0.0005780510579119897, + "loss": 0.568, + "step": 55860 + }, + { + "epoch": 2.77490811562531, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005780113241283402, + "loss": 0.5715, + "step": 55870 + }, + { + "epoch": 2.77540478792093, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005779715903446906, + "loss": 0.5823, + "step": 55880 + }, + { + "epoch": 2.775901460216549, + "grad_norm": 0.1357421875, + "learning_rate": 0.000577931856561041, + "loss": 0.5197, + "step": 55890 + }, + { + "epoch": 2.7763981325121687, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005778921227773915, + "loss": 0.6089, + "step": 55900 + }, + { + "epoch": 2.776894804807788, + "grad_norm": 0.087890625, + "learning_rate": 0.000577852388993742, + "loss": 0.5394, + "step": 55910 + }, + { + "epoch": 2.777391477103407, + "grad_norm": 0.150390625, + "learning_rate": 0.0005778126552100924, + "loss": 0.5576, + "step": 55920 + }, + { + "epoch": 2.7778881493990264, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005777729214264429, + "loss": 0.5577, + "step": 55930 + }, + { + "epoch": 2.7783848216946456, + "grad_norm": 0.146484375, + "learning_rate": 0.0005777331876427933, + "loss": 0.5627, + "step": 55940 + }, + { + "epoch": 2.7788814939902653, + "grad_norm": 0.146484375, + "learning_rate": 0.0005776934538591437, + "loss": 0.5685, + "step": 55950 + }, + { + "epoch": 2.7793781662858845, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005776537200754943, + "loss": 0.5755, + "step": 55960 + }, + { + "epoch": 2.779874838581504, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005776139862918447, + "loss": 0.5418, + "step": 55970 + }, + { + "epoch": 2.7803715108771234, + "grad_norm": 0.11328125, + "learning_rate": 0.0005775742525081951, + "loss": 0.5429, + "step": 55980 + }, + { + "epoch": 2.7808681831727426, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005775345187245455, + "loss": 0.5879, + "step": 55990 + }, + { + "epoch": 2.781364855468362, + "grad_norm": 0.0947265625, + "learning_rate": 0.000577494784940896, + "loss": 0.5522, + "step": 56000 + }, + { + "epoch": 2.781861527763981, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005774550511572466, + "loss": 0.5791, + "step": 56010 + }, + { + "epoch": 2.7823582000596008, + "grad_norm": 0.103515625, + "learning_rate": 0.0005774153173735969, + "loss": 0.5753, + "step": 56020 + }, + { + "epoch": 2.78285487235522, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005773755835899474, + "loss": 0.5485, + "step": 56030 + }, + { + "epoch": 2.7833515446508392, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005773358498062978, + "loss": 0.5378, + "step": 56040 + }, + { + "epoch": 2.783848216946459, + "grad_norm": 0.126953125, + "learning_rate": 0.0005772961160226482, + "loss": 0.5962, + "step": 56050 + }, + { + "epoch": 2.784344889242078, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005772563822389988, + "loss": 0.5633, + "step": 56060 + }, + { + "epoch": 2.7848415615376974, + "grad_norm": 0.091796875, + "learning_rate": 0.0005772166484553492, + "loss": 0.5915, + "step": 56070 + }, + { + "epoch": 2.7853382338333166, + "grad_norm": 0.13671875, + "learning_rate": 0.0005771769146716996, + "loss": 0.565, + "step": 56080 + }, + { + "epoch": 2.7858349061289363, + "grad_norm": 0.1171875, + "learning_rate": 0.0005771371808880501, + "loss": 0.544, + "step": 56090 + }, + { + "epoch": 2.7863315784245555, + "grad_norm": 0.11328125, + "learning_rate": 0.0005770974471044005, + "loss": 0.5503, + "step": 56100 + }, + { + "epoch": 2.7868282507201747, + "grad_norm": 0.0966796875, + "learning_rate": 0.000577057713320751, + "loss": 0.5926, + "step": 56110 + }, + { + "epoch": 2.7873249230157944, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005770179795371015, + "loss": 0.5907, + "step": 56120 + }, + { + "epoch": 2.7878215953114136, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005769782457534519, + "loss": 0.542, + "step": 56130 + }, + { + "epoch": 2.788318267607033, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005769385119698023, + "loss": 0.5577, + "step": 56140 + }, + { + "epoch": 2.788814939902652, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005768987781861528, + "loss": 0.5581, + "step": 56150 + }, + { + "epoch": 2.7893116121982717, + "grad_norm": 0.091796875, + "learning_rate": 0.0005768590444025033, + "loss": 0.5403, + "step": 56160 + }, + { + "epoch": 2.789808284493891, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005768193106188538, + "loss": 0.5795, + "step": 56170 + }, + { + "epoch": 2.79030495678951, + "grad_norm": 0.12890625, + "learning_rate": 0.0005767795768352041, + "loss": 0.529, + "step": 56180 + }, + { + "epoch": 2.79080162908513, + "grad_norm": 0.12109375, + "learning_rate": 0.0005767398430515546, + "loss": 0.5354, + "step": 56190 + }, + { + "epoch": 2.791298301380749, + "grad_norm": 0.1328125, + "learning_rate": 0.0005767001092679051, + "loss": 0.5647, + "step": 56200 + }, + { + "epoch": 2.7917949736763683, + "grad_norm": 0.1171875, + "learning_rate": 0.0005766603754842554, + "loss": 0.5636, + "step": 56210 + }, + { + "epoch": 2.7922916459719875, + "grad_norm": 0.08544921875, + "learning_rate": 0.000576620641700606, + "loss": 0.5823, + "step": 56220 + }, + { + "epoch": 2.7927883182676068, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005765809079169565, + "loss": 0.5592, + "step": 56230 + }, + { + "epoch": 2.7932849905632264, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005765411741333068, + "loss": 0.5779, + "step": 56240 + }, + { + "epoch": 2.7937816628588457, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005765014403496573, + "loss": 0.5916, + "step": 56250 + }, + { + "epoch": 2.7942783351544653, + "grad_norm": 0.20703125, + "learning_rate": 0.0005764617065660078, + "loss": 0.567, + "step": 56260 + }, + { + "epoch": 2.7947750074500846, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005764219727823582, + "loss": 0.5717, + "step": 56270 + }, + { + "epoch": 2.795271679745704, + "grad_norm": 0.115234375, + "learning_rate": 0.0005763822389987087, + "loss": 0.5514, + "step": 56280 + }, + { + "epoch": 2.795768352041323, + "grad_norm": 0.09765625, + "learning_rate": 0.0005763425052150591, + "loss": 0.5704, + "step": 56290 + }, + { + "epoch": 2.7962650243369422, + "grad_norm": 0.166015625, + "learning_rate": 0.0005763027714314096, + "loss": 0.5569, + "step": 56300 + }, + { + "epoch": 2.796761696632562, + "grad_norm": 0.11669921875, + "learning_rate": 0.00057626303764776, + "loss": 0.5818, + "step": 56310 + }, + { + "epoch": 2.797258368928181, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005762233038641105, + "loss": 0.5351, + "step": 56320 + }, + { + "epoch": 2.797755041223801, + "grad_norm": 0.126953125, + "learning_rate": 0.000576183570080461, + "loss": 0.5564, + "step": 56330 + }, + { + "epoch": 2.79825171351942, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005761438362968114, + "loss": 0.577, + "step": 56340 + }, + { + "epoch": 2.7987483858150393, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005761041025131618, + "loss": 0.5136, + "step": 56350 + }, + { + "epoch": 2.7992450581106585, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005760643687295124, + "loss": 0.5576, + "step": 56360 + }, + { + "epoch": 2.7997417304062777, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005760246349458627, + "loss": 0.5479, + "step": 56370 + }, + { + "epoch": 2.8002384027018974, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005759849011622132, + "loss": 0.5534, + "step": 56380 + }, + { + "epoch": 2.8007350749975166, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005759451673785637, + "loss": 0.5432, + "step": 56390 + }, + { + "epoch": 2.801231747293136, + "grad_norm": 0.1025390625, + "learning_rate": 0.000575905433594914, + "loss": 0.5506, + "step": 56400 + }, + { + "epoch": 2.8017284195887555, + "grad_norm": 0.107421875, + "learning_rate": 0.0005758656998112646, + "loss": 0.5494, + "step": 56410 + }, + { + "epoch": 2.8022250918843747, + "grad_norm": 0.15625, + "learning_rate": 0.0005758259660276151, + "loss": 0.5985, + "step": 56420 + }, + { + "epoch": 2.802721764179994, + "grad_norm": 0.103515625, + "learning_rate": 0.0005757862322439654, + "loss": 0.5345, + "step": 56430 + }, + { + "epoch": 2.803218436475613, + "grad_norm": 0.12890625, + "learning_rate": 0.0005757464984603159, + "loss": 0.5927, + "step": 56440 + }, + { + "epoch": 2.803715108771233, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005757067646766663, + "loss": 0.5388, + "step": 56450 + }, + { + "epoch": 2.804211781066852, + "grad_norm": 0.13671875, + "learning_rate": 0.0005756670308930169, + "loss": 0.5468, + "step": 56460 + }, + { + "epoch": 2.8047084533624713, + "grad_norm": 0.111328125, + "learning_rate": 0.0005756272971093673, + "loss": 0.592, + "step": 56470 + }, + { + "epoch": 2.805205125658091, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005755875633257177, + "loss": 0.5579, + "step": 56480 + }, + { + "epoch": 2.80570179795371, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005755478295420682, + "loss": 0.556, + "step": 56490 + }, + { + "epoch": 2.8061984702493294, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005755080957584186, + "loss": 0.5605, + "step": 56500 + }, + { + "epoch": 2.8066951425449487, + "grad_norm": 0.10009765625, + "learning_rate": 0.000575468361974769, + "loss": 0.5704, + "step": 56510 + }, + { + "epoch": 2.8071918148405683, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005754286281911196, + "loss": 0.538, + "step": 56520 + }, + { + "epoch": 2.8076884871361876, + "grad_norm": 0.16015625, + "learning_rate": 0.00057538889440747, + "loss": 0.5666, + "step": 56530 + }, + { + "epoch": 2.808185159431807, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005753491606238204, + "loss": 0.579, + "step": 56540 + }, + { + "epoch": 2.8086818317274265, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005753094268401709, + "loss": 0.5407, + "step": 56550 + }, + { + "epoch": 2.8091785040230457, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005752696930565213, + "loss": 0.55, + "step": 56560 + }, + { + "epoch": 2.809675176318665, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005752299592728718, + "loss": 0.5505, + "step": 56570 + }, + { + "epoch": 2.810171848614284, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005751902254892223, + "loss": 0.5341, + "step": 56580 + }, + { + "epoch": 2.8106685209099034, + "grad_norm": 0.134765625, + "learning_rate": 0.0005751504917055726, + "loss": 0.5854, + "step": 56590 + }, + { + "epoch": 2.811165193205523, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005751107579219231, + "loss": 0.6014, + "step": 56600 + }, + { + "epoch": 2.8116618655011423, + "grad_norm": 0.146484375, + "learning_rate": 0.0005750710241382737, + "loss": 0.5923, + "step": 56610 + }, + { + "epoch": 2.812158537796762, + "grad_norm": 0.130859375, + "learning_rate": 0.0005750312903546241, + "loss": 0.5599, + "step": 56620 + }, + { + "epoch": 2.812655210092381, + "grad_norm": 0.09375, + "learning_rate": 0.0005749915565709745, + "loss": 0.5852, + "step": 56630 + }, + { + "epoch": 2.8131518823880004, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005749518227873249, + "loss": 0.5262, + "step": 56640 + }, + { + "epoch": 2.8136485546836196, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005749120890036754, + "loss": 0.5398, + "step": 56650 + }, + { + "epoch": 2.814145226979239, + "grad_norm": 0.1171875, + "learning_rate": 0.0005748723552200258, + "loss": 0.5394, + "step": 56660 + }, + { + "epoch": 2.8146418992748585, + "grad_norm": 0.119140625, + "learning_rate": 0.0005748326214363763, + "loss": 0.5383, + "step": 56670 + }, + { + "epoch": 2.8151385715704778, + "grad_norm": 0.1328125, + "learning_rate": 0.0005747928876527268, + "loss": 0.5352, + "step": 56680 + }, + { + "epoch": 2.8156352438660974, + "grad_norm": 0.19140625, + "learning_rate": 0.0005747531538690772, + "loss": 0.5325, + "step": 56690 + }, + { + "epoch": 2.8161319161617167, + "grad_norm": 0.1875, + "learning_rate": 0.0005747134200854276, + "loss": 0.5311, + "step": 56700 + }, + { + "epoch": 2.816628588457336, + "grad_norm": 0.169921875, + "learning_rate": 0.0005746736863017781, + "loss": 0.5797, + "step": 56710 + }, + { + "epoch": 2.817125260752955, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005746339525181286, + "loss": 0.5722, + "step": 56720 + }, + { + "epoch": 2.8176219330485743, + "grad_norm": 0.130859375, + "learning_rate": 0.000574594218734479, + "loss": 0.5606, + "step": 56730 + }, + { + "epoch": 2.818118605344194, + "grad_norm": 0.111328125, + "learning_rate": 0.0005745544849508295, + "loss": 0.5685, + "step": 56740 + }, + { + "epoch": 2.8186152776398132, + "grad_norm": 0.1484375, + "learning_rate": 0.0005745147511671799, + "loss": 0.5442, + "step": 56750 + }, + { + "epoch": 2.8191119499354325, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005744750173835303, + "loss": 0.5349, + "step": 56760 + }, + { + "epoch": 2.819608622231052, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005744352835998809, + "loss": 0.5479, + "step": 56770 + }, + { + "epoch": 2.8201052945266714, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005743955498162313, + "loss": 0.5517, + "step": 56780 + }, + { + "epoch": 2.8206019668222906, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005743558160325817, + "loss": 0.5598, + "step": 56790 + }, + { + "epoch": 2.82109863911791, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005743160822489322, + "loss": 0.546, + "step": 56800 + }, + { + "epoch": 2.8215953114135295, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005742763484652826, + "loss": 0.5627, + "step": 56810 + }, + { + "epoch": 2.8220919837091487, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005742366146816331, + "loss": 0.5589, + "step": 56820 + }, + { + "epoch": 2.822588656004768, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005741968808979836, + "loss": 0.5491, + "step": 56830 + }, + { + "epoch": 2.8230853283003876, + "grad_norm": 0.09765625, + "learning_rate": 0.000574157147114334, + "loss": 0.5563, + "step": 56840 + }, + { + "epoch": 2.823582000596007, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005741174133306844, + "loss": 0.5788, + "step": 56850 + }, + { + "epoch": 2.824078672891626, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005740776795470348, + "loss": 0.5798, + "step": 56860 + }, + { + "epoch": 2.8245753451872453, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005740379457633854, + "loss": 0.5597, + "step": 56870 + }, + { + "epoch": 2.825072017482865, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005739982119797358, + "loss": 0.5476, + "step": 56880 + }, + { + "epoch": 2.825568689778484, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005739584781960862, + "loss": 0.5343, + "step": 56890 + }, + { + "epoch": 2.8260653620741034, + "grad_norm": 0.11328125, + "learning_rate": 0.0005739187444124367, + "loss": 0.5601, + "step": 56900 + }, + { + "epoch": 2.826562034369723, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005738790106287871, + "loss": 0.5578, + "step": 56910 + }, + { + "epoch": 2.8270587066653423, + "grad_norm": 0.1015625, + "learning_rate": 0.0005738392768451376, + "loss": 0.5794, + "step": 56920 + }, + { + "epoch": 2.8275553789609615, + "grad_norm": 0.1484375, + "learning_rate": 0.0005737995430614881, + "loss": 0.5499, + "step": 56930 + }, + { + "epoch": 2.8280520512565808, + "grad_norm": 0.158203125, + "learning_rate": 0.0005737598092778385, + "loss": 0.5601, + "step": 56940 + }, + { + "epoch": 2.8285487235522, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005737200754941889, + "loss": 0.5375, + "step": 56950 + }, + { + "epoch": 2.8290453958478197, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005736803417105394, + "loss": 0.5793, + "step": 56960 + }, + { + "epoch": 2.829542068143439, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005736406079268899, + "loss": 0.5278, + "step": 56970 + }, + { + "epoch": 2.8300387404390586, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005736008741432403, + "loss": 0.5501, + "step": 56980 + }, + { + "epoch": 2.830535412734678, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005735611403595908, + "loss": 0.5422, + "step": 56990 + }, + { + "epoch": 2.831032085030297, + "grad_norm": 0.08984375, + "learning_rate": 0.0005735214065759412, + "loss": 0.5629, + "step": 57000 + }, + { + "epoch": 2.8315287573259162, + "grad_norm": 0.09375, + "learning_rate": 0.0005734816727922916, + "loss": 0.5634, + "step": 57010 + }, + { + "epoch": 2.8320254296215355, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005734419390086422, + "loss": 0.5477, + "step": 57020 + }, + { + "epoch": 2.832522101917155, + "grad_norm": 0.134765625, + "learning_rate": 0.0005734022052249926, + "loss": 0.5744, + "step": 57030 + }, + { + "epoch": 2.8330187742127744, + "grad_norm": 0.10888671875, + "learning_rate": 0.000573362471441343, + "loss": 0.5712, + "step": 57040 + }, + { + "epoch": 2.8335154465083936, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005733227376576934, + "loss": 0.513, + "step": 57050 + }, + { + "epoch": 2.8340121188040133, + "grad_norm": 0.119140625, + "learning_rate": 0.0005732830038740439, + "loss": 0.5702, + "step": 57060 + }, + { + "epoch": 2.8345087910996325, + "grad_norm": 0.09765625, + "learning_rate": 0.0005732432700903945, + "loss": 0.5636, + "step": 57070 + }, + { + "epoch": 2.8350054633952517, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005732035363067448, + "loss": 0.5351, + "step": 57080 + }, + { + "epoch": 2.835502135690871, + "grad_norm": 0.1484375, + "learning_rate": 0.0005731638025230953, + "loss": 0.5575, + "step": 57090 + }, + { + "epoch": 2.8359988079864906, + "grad_norm": 0.119140625, + "learning_rate": 0.0005731240687394458, + "loss": 0.5511, + "step": 57100 + }, + { + "epoch": 2.83649548028211, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005730843349557961, + "loss": 0.564, + "step": 57110 + }, + { + "epoch": 2.836992152577729, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005730446011721467, + "loss": 0.5408, + "step": 57120 + }, + { + "epoch": 2.8374888248733487, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005730048673884971, + "loss": 0.5589, + "step": 57130 + }, + { + "epoch": 2.837985497168968, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005729651336048475, + "loss": 0.5682, + "step": 57140 + }, + { + "epoch": 2.838482169464587, + "grad_norm": 0.12060546875, + "learning_rate": 0.000572925399821198, + "loss": 0.5336, + "step": 57150 + }, + { + "epoch": 2.8389788417602064, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005728856660375484, + "loss": 0.5409, + "step": 57160 + }, + { + "epoch": 2.839475514055826, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005728459322538989, + "loss": 0.5335, + "step": 57170 + }, + { + "epoch": 2.8399721863514453, + "grad_norm": 0.134765625, + "learning_rate": 0.0005728061984702494, + "loss": 0.5436, + "step": 57180 + }, + { + "epoch": 2.8404688586470646, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005727664646865998, + "loss": 0.5297, + "step": 57190 + }, + { + "epoch": 2.8409655309426842, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005727267309029503, + "loss": 0.5597, + "step": 57200 + }, + { + "epoch": 2.8414622032383035, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005726869971193007, + "loss": 0.5425, + "step": 57210 + }, + { + "epoch": 2.8419588755339227, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005726472633356512, + "loss": 0.5408, + "step": 57220 + }, + { + "epoch": 2.842455547829542, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005726075295520017, + "loss": 0.5352, + "step": 57230 + }, + { + "epoch": 2.8429522201251616, + "grad_norm": 0.11181640625, + "learning_rate": 0.000572567795768352, + "loss": 0.574, + "step": 57240 + }, + { + "epoch": 2.843448892420781, + "grad_norm": 0.109375, + "learning_rate": 0.0005725280619847025, + "loss": 0.5742, + "step": 57250 + }, + { + "epoch": 2.8439455647164, + "grad_norm": 0.1484375, + "learning_rate": 0.000572488328201053, + "loss": 0.5515, + "step": 57260 + }, + { + "epoch": 2.8444422370120197, + "grad_norm": 0.107421875, + "learning_rate": 0.0005724485944174034, + "loss": 0.593, + "step": 57270 + }, + { + "epoch": 2.844938909307639, + "grad_norm": 0.150390625, + "learning_rate": 0.0005724088606337539, + "loss": 0.528, + "step": 57280 + }, + { + "epoch": 2.845435581603258, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005723691268501044, + "loss": 0.551, + "step": 57290 + }, + { + "epoch": 2.8459322538988774, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005723293930664547, + "loss": 0.5678, + "step": 57300 + }, + { + "epoch": 2.8464289261944966, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005722896592828052, + "loss": 0.5736, + "step": 57310 + }, + { + "epoch": 2.8469255984901163, + "grad_norm": 0.169921875, + "learning_rate": 0.0005722499254991557, + "loss": 0.5452, + "step": 57320 + }, + { + "epoch": 2.8474222707857355, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005722101917155061, + "loss": 0.5354, + "step": 57330 + }, + { + "epoch": 2.847918943081355, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005721704579318566, + "loss": 0.5659, + "step": 57340 + }, + { + "epoch": 2.8484156153769744, + "grad_norm": 0.0908203125, + "learning_rate": 0.000572130724148207, + "loss": 0.532, + "step": 57350 + }, + { + "epoch": 2.8489122876725936, + "grad_norm": 0.12890625, + "learning_rate": 0.0005720909903645575, + "loss": 0.5466, + "step": 57360 + }, + { + "epoch": 2.849408959968213, + "grad_norm": 0.154296875, + "learning_rate": 0.000572051256580908, + "loss": 0.5825, + "step": 57370 + }, + { + "epoch": 2.849905632263832, + "grad_norm": 0.171875, + "learning_rate": 0.0005720115227972584, + "loss": 0.5741, + "step": 57380 + }, + { + "epoch": 2.8504023045594518, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005719717890136089, + "loss": 0.5571, + "step": 57390 + }, + { + "epoch": 2.850898976855071, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005719320552299593, + "loss": 0.5709, + "step": 57400 + }, + { + "epoch": 2.85139564915069, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005718923214463097, + "loss": 0.569, + "step": 57410 + }, + { + "epoch": 2.85189232144631, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005718525876626603, + "loss": 0.5289, + "step": 57420 + }, + { + "epoch": 2.852388993741929, + "grad_norm": 0.1015625, + "learning_rate": 0.0005718128538790107, + "loss": 0.5553, + "step": 57430 + }, + { + "epoch": 2.8528856660375483, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005717731200953611, + "loss": 0.539, + "step": 57440 + }, + { + "epoch": 2.8533823383331676, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005717333863117116, + "loss": 0.5497, + "step": 57450 + }, + { + "epoch": 2.8538790106287872, + "grad_norm": 0.091796875, + "learning_rate": 0.0005716936525280619, + "loss": 0.5593, + "step": 57460 + }, + { + "epoch": 2.8543756829244065, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005716539187444125, + "loss": 0.5529, + "step": 57470 + }, + { + "epoch": 2.8548723552200257, + "grad_norm": 0.1396484375, + "learning_rate": 0.000571614184960763, + "loss": 0.557, + "step": 57480 + }, + { + "epoch": 2.8553690275156454, + "grad_norm": 0.1015625, + "learning_rate": 0.0005715744511771133, + "loss": 0.5768, + "step": 57490 + }, + { + "epoch": 2.8558656998112646, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005715347173934638, + "loss": 0.5636, + "step": 57500 + }, + { + "epoch": 2.856362372106884, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005714949836098142, + "loss": 0.5743, + "step": 57510 + }, + { + "epoch": 2.856859044402503, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005714552498261648, + "loss": 0.559, + "step": 57520 + }, + { + "epoch": 2.8573557166981227, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005714155160425152, + "loss": 0.5599, + "step": 57530 + }, + { + "epoch": 2.857852388993742, + "grad_norm": 0.1015625, + "learning_rate": 0.0005713757822588656, + "loss": 0.5598, + "step": 57540 + }, + { + "epoch": 2.858349061289361, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005713360484752161, + "loss": 0.5347, + "step": 57550 + }, + { + "epoch": 2.858845733584981, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005712963146915665, + "loss": 0.5707, + "step": 57560 + }, + { + "epoch": 2.8593424058806, + "grad_norm": 0.115234375, + "learning_rate": 0.000571256580907917, + "loss": 0.528, + "step": 57570 + }, + { + "epoch": 2.8598390781762193, + "grad_norm": 0.12890625, + "learning_rate": 0.0005712168471242675, + "loss": 0.5671, + "step": 57580 + }, + { + "epoch": 2.8603357504718385, + "grad_norm": 0.10546875, + "learning_rate": 0.0005711771133406179, + "loss": 0.5585, + "step": 57590 + }, + { + "epoch": 2.860832422767458, + "grad_norm": 0.10546875, + "learning_rate": 0.0005711373795569683, + "loss": 0.5669, + "step": 57600 + }, + { + "epoch": 2.8613290950630774, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005710976457733188, + "loss": 0.579, + "step": 57610 + }, + { + "epoch": 2.8618257673586966, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005710579119896693, + "loss": 0.5508, + "step": 57620 + }, + { + "epoch": 2.8623224396543163, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005710181782060197, + "loss": 0.5277, + "step": 57630 + }, + { + "epoch": 2.8628191119499355, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005709784444223702, + "loss": 0.5405, + "step": 57640 + }, + { + "epoch": 2.8633157842455548, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005709387106387206, + "loss": 0.5483, + "step": 57650 + }, + { + "epoch": 2.863812456541174, + "grad_norm": 0.1455078125, + "learning_rate": 0.000570898976855071, + "loss": 0.5332, + "step": 57660 + }, + { + "epoch": 2.8643091288367932, + "grad_norm": 0.119140625, + "learning_rate": 0.0005708592430714216, + "loss": 0.5571, + "step": 57670 + }, + { + "epoch": 2.864805801132413, + "grad_norm": 0.09228515625, + "learning_rate": 0.000570819509287772, + "loss": 0.5255, + "step": 57680 + }, + { + "epoch": 2.865302473428032, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005707797755041224, + "loss": 0.5287, + "step": 57690 + }, + { + "epoch": 2.865799145723652, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005707400417204729, + "loss": 0.5412, + "step": 57700 + }, + { + "epoch": 2.866295818019271, + "grad_norm": 0.16015625, + "learning_rate": 0.0005707003079368233, + "loss": 0.5548, + "step": 57710 + }, + { + "epoch": 2.8667924903148903, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005706605741531738, + "loss": 0.5363, + "step": 57720 + }, + { + "epoch": 2.8672891626105095, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005706208403695242, + "loss": 0.5702, + "step": 57730 + }, + { + "epoch": 2.8677858349061287, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005705811065858747, + "loss": 0.5496, + "step": 57740 + }, + { + "epoch": 2.8682825072017484, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005705413728022251, + "loss": 0.5126, + "step": 57750 + }, + { + "epoch": 2.8687791794973676, + "grad_norm": 0.154296875, + "learning_rate": 0.0005705016390185755, + "loss": 0.5608, + "step": 57760 + }, + { + "epoch": 2.869275851792987, + "grad_norm": 0.09521484375, + "learning_rate": 0.000570461905234926, + "loss": 0.5755, + "step": 57770 + }, + { + "epoch": 2.8697725240886065, + "grad_norm": 0.140625, + "learning_rate": 0.0005704221714512765, + "loss": 0.5264, + "step": 57780 + }, + { + "epoch": 2.8702691963842257, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005703824376676269, + "loss": 0.5669, + "step": 57790 + }, + { + "epoch": 2.870765868679845, + "grad_norm": 0.12890625, + "learning_rate": 0.0005703427038839774, + "loss": 0.564, + "step": 57800 + }, + { + "epoch": 2.871262540975464, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005703029701003278, + "loss": 0.5497, + "step": 57810 + }, + { + "epoch": 2.871759213271084, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005702632363166782, + "loss": 0.5965, + "step": 57820 + }, + { + "epoch": 2.872255885566703, + "grad_norm": 0.134765625, + "learning_rate": 0.0005702235025330288, + "loss": 0.5384, + "step": 57830 + }, + { + "epoch": 2.8727525578623223, + "grad_norm": 0.162109375, + "learning_rate": 0.0005701837687493792, + "loss": 0.5137, + "step": 57840 + }, + { + "epoch": 2.873249230157942, + "grad_norm": 0.091796875, + "learning_rate": 0.0005701440349657296, + "loss": 0.562, + "step": 57850 + }, + { + "epoch": 2.873745902453561, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005701043011820801, + "loss": 0.5425, + "step": 57860 + }, + { + "epoch": 2.8742425747491804, + "grad_norm": 0.10546875, + "learning_rate": 0.0005700645673984305, + "loss": 0.5806, + "step": 57870 + }, + { + "epoch": 2.8747392470447997, + "grad_norm": 0.1376953125, + "learning_rate": 0.000570024833614781, + "loss": 0.5594, + "step": 57880 + }, + { + "epoch": 2.8752359193404193, + "grad_norm": 0.12109375, + "learning_rate": 0.0005699850998311315, + "loss": 0.5513, + "step": 57890 + }, + { + "epoch": 2.8757325916360386, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005699453660474819, + "loss": 0.5623, + "step": 57900 + }, + { + "epoch": 2.876229263931658, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005699056322638323, + "loss": 0.5909, + "step": 57910 + }, + { + "epoch": 2.8767259362272775, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005698658984801827, + "loss": 0.5394, + "step": 57920 + }, + { + "epoch": 2.8772226085228967, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005698261646965333, + "loss": 0.556, + "step": 57930 + }, + { + "epoch": 2.877719280818516, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005697864309128838, + "loss": 0.576, + "step": 57940 + }, + { + "epoch": 2.878215953114135, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005697466971292341, + "loss": 0.555, + "step": 57950 + }, + { + "epoch": 2.8787126254097544, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005697069633455846, + "loss": 0.5515, + "step": 57960 + }, + { + "epoch": 2.879209297705374, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005696672295619352, + "loss": 0.568, + "step": 57970 + }, + { + "epoch": 2.8797059700009933, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005696274957782855, + "loss": 0.5729, + "step": 57980 + }, + { + "epoch": 2.880202642296613, + "grad_norm": 0.158203125, + "learning_rate": 0.000569587761994636, + "loss": 0.5552, + "step": 57990 + }, + { + "epoch": 2.880699314592232, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005695480282109864, + "loss": 0.5485, + "step": 58000 + }, + { + "epoch": 2.8811959868878514, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005695082944273368, + "loss": 0.545, + "step": 58010 + }, + { + "epoch": 2.8816926591834706, + "grad_norm": 0.166015625, + "learning_rate": 0.0005694685606436873, + "loss": 0.5598, + "step": 58020 + }, + { + "epoch": 2.88218933147909, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005694288268600378, + "loss": 0.5485, + "step": 58030 + }, + { + "epoch": 2.8826860037747095, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005693890930763882, + "loss": 0.5435, + "step": 58040 + }, + { + "epoch": 2.8831826760703287, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005693493592927387, + "loss": 0.5659, + "step": 58050 + }, + { + "epoch": 2.8836793483659484, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005693096255090891, + "loss": 0.5266, + "step": 58060 + }, + { + "epoch": 2.8841760206615676, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005692698917254395, + "loss": 0.5522, + "step": 58070 + }, + { + "epoch": 2.884672692957187, + "grad_norm": 0.13671875, + "learning_rate": 0.0005692301579417901, + "loss": 0.5434, + "step": 58080 + }, + { + "epoch": 2.885169365252806, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005691904241581405, + "loss": 0.5619, + "step": 58090 + }, + { + "epoch": 2.8856660375484253, + "grad_norm": 0.099609375, + "learning_rate": 0.000569150690374491, + "loss": 0.569, + "step": 58100 + }, + { + "epoch": 2.886162709844045, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005691109565908413, + "loss": 0.5799, + "step": 58110 + }, + { + "epoch": 2.886659382139664, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005690712228071918, + "loss": 0.572, + "step": 58120 + }, + { + "epoch": 2.8871560544352834, + "grad_norm": 0.10546875, + "learning_rate": 0.0005690314890235424, + "loss": 0.5379, + "step": 58130 + }, + { + "epoch": 2.887652726730903, + "grad_norm": 0.15625, + "learning_rate": 0.0005689917552398927, + "loss": 0.5252, + "step": 58140 + }, + { + "epoch": 2.8881493990265223, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005689520214562432, + "loss": 0.5574, + "step": 58150 + }, + { + "epoch": 2.8886460713221416, + "grad_norm": 0.099609375, + "learning_rate": 0.0005689122876725937, + "loss": 0.551, + "step": 58160 + }, + { + "epoch": 2.889142743617761, + "grad_norm": 0.1171875, + "learning_rate": 0.000568872553888944, + "loss": 0.5691, + "step": 58170 + }, + { + "epoch": 2.8896394159133805, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005688328201052946, + "loss": 0.5442, + "step": 58180 + }, + { + "epoch": 2.8901360882089997, + "grad_norm": 0.11279296875, + "learning_rate": 0.000568793086321645, + "loss": 0.5618, + "step": 58190 + }, + { + "epoch": 2.890632760504619, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005687533525379954, + "loss": 0.5685, + "step": 58200 + }, + { + "epoch": 2.8911294328002386, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005687136187543459, + "loss": 0.5804, + "step": 58210 + }, + { + "epoch": 2.891626105095858, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005686738849706963, + "loss": 0.5615, + "step": 58220 + }, + { + "epoch": 2.892122777391477, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005686341511870468, + "loss": 0.5725, + "step": 58230 + }, + { + "epoch": 2.8926194496870963, + "grad_norm": 0.166015625, + "learning_rate": 0.0005685944174033973, + "loss": 0.5545, + "step": 58240 + }, + { + "epoch": 2.893116121982716, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005685546836197477, + "loss": 0.5463, + "step": 58250 + }, + { + "epoch": 2.893612794278335, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005685149498360982, + "loss": 0.5872, + "step": 58260 + }, + { + "epoch": 2.8941094665739544, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005684752160524486, + "loss": 0.5727, + "step": 58270 + }, + { + "epoch": 2.894606138869574, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005684354822687991, + "loss": 0.5308, + "step": 58280 + }, + { + "epoch": 2.8951028111651933, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005683957484851496, + "loss": 0.5595, + "step": 58290 + }, + { + "epoch": 2.8955994834608125, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005683560147015, + "loss": 0.5706, + "step": 58300 + }, + { + "epoch": 2.8960961557564318, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005683162809178504, + "loss": 0.5375, + "step": 58310 + }, + { + "epoch": 2.896592828052051, + "grad_norm": 0.10888671875, + "learning_rate": 0.000568276547134201, + "loss": 0.543, + "step": 58320 + }, + { + "epoch": 2.8970895003476707, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005682368133505513, + "loss": 0.5418, + "step": 58330 + }, + { + "epoch": 2.89758617264329, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005681970795669018, + "loss": 0.5714, + "step": 58340 + }, + { + "epoch": 2.8980828449389096, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005681573457832523, + "loss": 0.5453, + "step": 58350 + }, + { + "epoch": 2.898579517234529, + "grad_norm": 0.09765625, + "learning_rate": 0.0005681176119996026, + "loss": 0.5533, + "step": 58360 + }, + { + "epoch": 2.899076189530148, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005680778782159531, + "loss": 0.5524, + "step": 58370 + }, + { + "epoch": 2.8995728618257672, + "grad_norm": 0.09375, + "learning_rate": 0.0005680381444323036, + "loss": 0.5252, + "step": 58380 + }, + { + "epoch": 2.9000695341213865, + "grad_norm": 0.138671875, + "learning_rate": 0.0005679984106486541, + "loss": 0.5223, + "step": 58390 + }, + { + "epoch": 2.900566206417006, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005679586768650045, + "loss": 0.553, + "step": 58400 + }, + { + "epoch": 2.9010628787126254, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005679189430813549, + "loss": 0.5507, + "step": 58410 + }, + { + "epoch": 2.901559551008245, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005678792092977054, + "loss": 0.5367, + "step": 58420 + }, + { + "epoch": 2.9020562233038643, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005678394755140559, + "loss": 0.5669, + "step": 58430 + }, + { + "epoch": 2.9025528955994835, + "grad_norm": 0.087890625, + "learning_rate": 0.0005677997417304063, + "loss": 0.5491, + "step": 58440 + }, + { + "epoch": 2.9030495678951027, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005677600079467568, + "loss": 0.5746, + "step": 58450 + }, + { + "epoch": 2.903546240190722, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005677202741631072, + "loss": 0.5686, + "step": 58460 + }, + { + "epoch": 2.9040429124863416, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005676805403794576, + "loss": 0.5631, + "step": 58470 + }, + { + "epoch": 2.904539584781961, + "grad_norm": 0.09375, + "learning_rate": 0.0005676408065958082, + "loss": 0.5275, + "step": 58480 + }, + { + "epoch": 2.90503625707758, + "grad_norm": 0.1015625, + "learning_rate": 0.0005676010728121586, + "loss": 0.5573, + "step": 58490 + }, + { + "epoch": 2.9055329293731997, + "grad_norm": 0.11181640625, + "learning_rate": 0.000567561339028509, + "loss": 0.5584, + "step": 58500 + }, + { + "epoch": 2.906029601668819, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005675216052448595, + "loss": 0.5452, + "step": 58510 + }, + { + "epoch": 2.906526273964438, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005674818714612098, + "loss": 0.5653, + "step": 58520 + }, + { + "epoch": 2.9070229462600574, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005674421376775604, + "loss": 0.5529, + "step": 58530 + }, + { + "epoch": 2.907519618555677, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005674024038939109, + "loss": 0.5543, + "step": 58540 + }, + { + "epoch": 2.9080162908512963, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005673626701102613, + "loss": 0.5662, + "step": 58550 + }, + { + "epoch": 2.9085129631469155, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005673229363266117, + "loss": 0.5571, + "step": 58560 + }, + { + "epoch": 2.909009635442535, + "grad_norm": 0.119140625, + "learning_rate": 0.0005672832025429621, + "loss": 0.5496, + "step": 58570 + }, + { + "epoch": 2.9095063077381544, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005672434687593127, + "loss": 0.5188, + "step": 58580 + }, + { + "epoch": 2.9100029800337737, + "grad_norm": 0.087890625, + "learning_rate": 0.0005672037349756631, + "loss": 0.5523, + "step": 58590 + }, + { + "epoch": 2.910499652329393, + "grad_norm": 0.10546875, + "learning_rate": 0.0005671640011920135, + "loss": 0.5485, + "step": 58600 + }, + { + "epoch": 2.9109963246250126, + "grad_norm": 0.0986328125, + "learning_rate": 0.000567124267408364, + "loss": 0.5503, + "step": 58610 + }, + { + "epoch": 2.911492996920632, + "grad_norm": 0.10546875, + "learning_rate": 0.0005670845336247144, + "loss": 0.562, + "step": 58620 + }, + { + "epoch": 2.911989669216251, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005670447998410649, + "loss": 0.5759, + "step": 58630 + }, + { + "epoch": 2.9124863415118707, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005670050660574154, + "loss": 0.5494, + "step": 58640 + }, + { + "epoch": 2.91298301380749, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005669653322737658, + "loss": 0.5299, + "step": 58650 + }, + { + "epoch": 2.913479686103109, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005669255984901162, + "loss": 0.5516, + "step": 58660 + }, + { + "epoch": 2.9139763583987284, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005668858647064667, + "loss": 0.5365, + "step": 58670 + }, + { + "epoch": 2.9144730306943476, + "grad_norm": 0.125, + "learning_rate": 0.0005668461309228172, + "loss": 0.5389, + "step": 58680 + }, + { + "epoch": 2.9149697029899673, + "grad_norm": 0.099609375, + "learning_rate": 0.0005668063971391676, + "loss": 0.5402, + "step": 58690 + }, + { + "epoch": 2.9154663752855865, + "grad_norm": 0.111328125, + "learning_rate": 0.0005667666633555181, + "loss": 0.5457, + "step": 58700 + }, + { + "epoch": 2.915963047581206, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005667269295718685, + "loss": 0.5668, + "step": 58710 + }, + { + "epoch": 2.9164597198768254, + "grad_norm": 0.18359375, + "learning_rate": 0.0005666871957882189, + "loss": 0.5632, + "step": 58720 + }, + { + "epoch": 2.9169563921724446, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005666474620045695, + "loss": 0.5504, + "step": 58730 + }, + { + "epoch": 2.917453064468064, + "grad_norm": 0.166015625, + "learning_rate": 0.0005666077282209199, + "loss": 0.5523, + "step": 58740 + }, + { + "epoch": 2.917949736763683, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005665679944372703, + "loss": 0.5607, + "step": 58750 + }, + { + "epoch": 2.9184464090593027, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005665282606536208, + "loss": 0.5626, + "step": 58760 + }, + { + "epoch": 2.918943081354922, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005664885268699712, + "loss": 0.5397, + "step": 58770 + }, + { + "epoch": 2.9194397536505416, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005664487930863217, + "loss": 0.5625, + "step": 58780 + }, + { + "epoch": 2.919936425946161, + "grad_norm": 0.12890625, + "learning_rate": 0.0005664090593026721, + "loss": 0.5383, + "step": 58790 + }, + { + "epoch": 2.92043309824178, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005663693255190226, + "loss": 0.5734, + "step": 58800 + }, + { + "epoch": 2.9209297705373993, + "grad_norm": 0.09521484375, + "learning_rate": 0.000566329591735373, + "loss": 0.5363, + "step": 58810 + }, + { + "epoch": 2.9214264428330186, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005662898579517234, + "loss": 0.5437, + "step": 58820 + }, + { + "epoch": 2.9219231151286382, + "grad_norm": 0.138671875, + "learning_rate": 0.000566250124168074, + "loss": 0.5454, + "step": 58830 + }, + { + "epoch": 2.9224197874242575, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005662103903844244, + "loss": 0.5451, + "step": 58840 + }, + { + "epoch": 2.9229164597198767, + "grad_norm": 0.103515625, + "learning_rate": 0.0005661706566007748, + "loss": 0.5632, + "step": 58850 + }, + { + "epoch": 2.9234131320154964, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005661309228171253, + "loss": 0.5585, + "step": 58860 + }, + { + "epoch": 2.9239098043111156, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005660911890334757, + "loss": 0.557, + "step": 58870 + }, + { + "epoch": 2.924406476606735, + "grad_norm": 0.091796875, + "learning_rate": 0.0005660514552498262, + "loss": 0.5664, + "step": 58880 + }, + { + "epoch": 2.924903148902354, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005660117214661767, + "loss": 0.5604, + "step": 58890 + }, + { + "epoch": 2.9253998211979737, + "grad_norm": 0.103515625, + "learning_rate": 0.0005659719876825271, + "loss": 0.6002, + "step": 58900 + }, + { + "epoch": 2.925896493493593, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005659322538988775, + "loss": 0.5526, + "step": 58910 + }, + { + "epoch": 2.926393165789212, + "grad_norm": 0.095703125, + "learning_rate": 0.000565892520115228, + "loss": 0.5556, + "step": 58920 + }, + { + "epoch": 2.926889838084832, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005658527863315785, + "loss": 0.5418, + "step": 58930 + }, + { + "epoch": 2.927386510380451, + "grad_norm": 0.138671875, + "learning_rate": 0.0005658130525479289, + "loss": 0.5495, + "step": 58940 + }, + { + "epoch": 2.9278831826760703, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005657733187642794, + "loss": 0.5517, + "step": 58950 + }, + { + "epoch": 2.9283798549716895, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005657335849806298, + "loss": 0.5698, + "step": 58960 + }, + { + "epoch": 2.928876527267309, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005656938511969802, + "loss": 0.5611, + "step": 58970 + }, + { + "epoch": 2.9293731995629284, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005656541174133306, + "loss": 0.5241, + "step": 58980 + }, + { + "epoch": 2.9298698718585476, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005656143836296812, + "loss": 0.5577, + "step": 58990 + }, + { + "epoch": 2.9303665441541673, + "grad_norm": 0.138671875, + "learning_rate": 0.0005655746498460317, + "loss": 0.5671, + "step": 59000 + }, + { + "epoch": 2.9308632164497865, + "grad_norm": 0.1181640625, + "learning_rate": 0.000565534916062382, + "loss": 0.5119, + "step": 59010 + }, + { + "epoch": 2.9313598887454058, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005654951822787325, + "loss": 0.5553, + "step": 59020 + }, + { + "epoch": 2.931856561041025, + "grad_norm": 0.099609375, + "learning_rate": 0.0005654554484950831, + "loss": 0.5578, + "step": 59030 + }, + { + "epoch": 2.932353233336644, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005654157147114334, + "loss": 0.5429, + "step": 59040 + }, + { + "epoch": 2.932849905632264, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005653759809277839, + "loss": 0.5359, + "step": 59050 + }, + { + "epoch": 2.933346577927883, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005653362471441343, + "loss": 0.5667, + "step": 59060 + }, + { + "epoch": 2.933843250223503, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005652965133604847, + "loss": 0.5741, + "step": 59070 + }, + { + "epoch": 2.934339922519122, + "grad_norm": 0.162109375, + "learning_rate": 0.0005652567795768353, + "loss": 0.5537, + "step": 59080 + }, + { + "epoch": 2.9348365948147412, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005652170457931857, + "loss": 0.5458, + "step": 59090 + }, + { + "epoch": 2.9353332671103605, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005651773120095361, + "loss": 0.597, + "step": 59100 + }, + { + "epoch": 2.9358299394059797, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005651375782258866, + "loss": 0.5725, + "step": 59110 + }, + { + "epoch": 2.9363266117015994, + "grad_norm": 0.1279296875, + "learning_rate": 0.000565097844442237, + "loss": 0.5645, + "step": 59120 + }, + { + "epoch": 2.9368232839972186, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005650581106585874, + "loss": 0.5217, + "step": 59130 + }, + { + "epoch": 2.937319956292838, + "grad_norm": 0.09814453125, + "learning_rate": 0.000565018376874938, + "loss": 0.5433, + "step": 59140 + }, + { + "epoch": 2.9378166285884575, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005649786430912884, + "loss": 0.5658, + "step": 59150 + }, + { + "epoch": 2.9383133008840767, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005649389093076389, + "loss": 0.5592, + "step": 59160 + }, + { + "epoch": 2.938809973179696, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005648991755239892, + "loss": 0.5329, + "step": 59170 + }, + { + "epoch": 2.939306645475315, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005648594417403397, + "loss": 0.5381, + "step": 59180 + }, + { + "epoch": 2.939803317770935, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005648197079566903, + "loss": 0.5439, + "step": 59190 + }, + { + "epoch": 2.940299990066554, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005647799741730406, + "loss": 0.5555, + "step": 59200 + }, + { + "epoch": 2.9407966623621733, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005647402403893911, + "loss": 0.5537, + "step": 59210 + }, + { + "epoch": 2.941293334657793, + "grad_norm": 0.10546875, + "learning_rate": 0.0005647005066057416, + "loss": 0.5486, + "step": 59220 + }, + { + "epoch": 2.941790006953412, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005646607728220919, + "loss": 0.5506, + "step": 59230 + }, + { + "epoch": 2.9422866792490314, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005646210390384425, + "loss": 0.5734, + "step": 59240 + }, + { + "epoch": 2.9427833515446506, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005645813052547929, + "loss": 0.5873, + "step": 59250 + }, + { + "epoch": 2.9432800238402703, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005645415714711433, + "loss": 0.5386, + "step": 59260 + }, + { + "epoch": 2.9437766961358895, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005645018376874938, + "loss": 0.542, + "step": 59270 + }, + { + "epoch": 2.9442733684315088, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005644621039038442, + "loss": 0.5314, + "step": 59280 + }, + { + "epoch": 2.9447700407271284, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005644223701201948, + "loss": 0.5418, + "step": 59290 + }, + { + "epoch": 2.9452667130227477, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005643826363365452, + "loss": 0.5572, + "step": 59300 + }, + { + "epoch": 2.945763385318367, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005643429025528956, + "loss": 0.5328, + "step": 59310 + }, + { + "epoch": 2.946260057613986, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005643031687692461, + "loss": 0.5504, + "step": 59320 + }, + { + "epoch": 2.946756729909606, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005642634349855965, + "loss": 0.5546, + "step": 59330 + }, + { + "epoch": 2.947253402205225, + "grad_norm": 0.09619140625, + "learning_rate": 0.000564223701201947, + "loss": 0.5449, + "step": 59340 + }, + { + "epoch": 2.9477500745008443, + "grad_norm": 0.12890625, + "learning_rate": 0.0005641839674182975, + "loss": 0.5657, + "step": 59350 + }, + { + "epoch": 2.948246746796464, + "grad_norm": 0.107421875, + "learning_rate": 0.0005641442336346479, + "loss": 0.5305, + "step": 59360 + }, + { + "epoch": 2.948743419092083, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005641044998509983, + "loss": 0.5326, + "step": 59370 + }, + { + "epoch": 2.9492400913877024, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005640647660673489, + "loss": 0.5455, + "step": 59380 + }, + { + "epoch": 2.9497367636833216, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005640250322836992, + "loss": 0.5653, + "step": 59390 + }, + { + "epoch": 2.950233435978941, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005639852985000497, + "loss": 0.5481, + "step": 59400 + }, + { + "epoch": 2.9507301082745605, + "grad_norm": 0.111328125, + "learning_rate": 0.0005639455647164002, + "loss": 0.5329, + "step": 59410 + }, + { + "epoch": 2.9512267805701797, + "grad_norm": 0.1328125, + "learning_rate": 0.0005639058309327505, + "loss": 0.5738, + "step": 59420 + }, + { + "epoch": 2.9517234528657994, + "grad_norm": 0.0986328125, + "learning_rate": 0.000563866097149101, + "loss": 0.5695, + "step": 59430 + }, + { + "epoch": 2.9522201251614186, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005638263633654515, + "loss": 0.5291, + "step": 59440 + }, + { + "epoch": 2.952716797457038, + "grad_norm": 0.1298828125, + "learning_rate": 0.000563786629581802, + "loss": 0.5616, + "step": 59450 + }, + { + "epoch": 2.953213469752657, + "grad_norm": 0.12890625, + "learning_rate": 0.0005637468957981524, + "loss": 0.5414, + "step": 59460 + }, + { + "epoch": 2.9537101420482763, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005637071620145028, + "loss": 0.5406, + "step": 59470 + }, + { + "epoch": 2.954206814343896, + "grad_norm": 0.15234375, + "learning_rate": 0.0005636674282308533, + "loss": 0.5475, + "step": 59480 + }, + { + "epoch": 2.954703486639515, + "grad_norm": 0.15234375, + "learning_rate": 0.0005636276944472038, + "loss": 0.541, + "step": 59490 + }, + { + "epoch": 2.9552001589351344, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005635879606635542, + "loss": 0.5627, + "step": 59500 + }, + { + "epoch": 2.955696831230754, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005635482268799047, + "loss": 0.5566, + "step": 59510 + }, + { + "epoch": 2.9561935035263733, + "grad_norm": 0.111328125, + "learning_rate": 0.0005635084930962551, + "loss": 0.5491, + "step": 59520 + }, + { + "epoch": 2.9566901758219926, + "grad_norm": 0.119140625, + "learning_rate": 0.0005634687593126055, + "loss": 0.5696, + "step": 59530 + }, + { + "epoch": 2.957186848117612, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005634290255289561, + "loss": 0.5736, + "step": 59540 + }, + { + "epoch": 2.9576835204132315, + "grad_norm": 0.14453125, + "learning_rate": 0.0005633892917453065, + "loss": 0.5498, + "step": 59550 + }, + { + "epoch": 2.9581801927088507, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005633495579616569, + "loss": 0.571, + "step": 59560 + }, + { + "epoch": 2.95867686500447, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005633098241780074, + "loss": 0.5617, + "step": 59570 + }, + { + "epoch": 2.9591735373000896, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005632700903943577, + "loss": 0.563, + "step": 59580 + }, + { + "epoch": 2.959670209595709, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005632303566107083, + "loss": 0.5737, + "step": 59590 + }, + { + "epoch": 2.960166881891328, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005631906228270588, + "loss": 0.5386, + "step": 59600 + }, + { + "epoch": 2.9606635541869473, + "grad_norm": 0.107421875, + "learning_rate": 0.0005631508890434092, + "loss": 0.561, + "step": 59610 + }, + { + "epoch": 2.961160226482567, + "grad_norm": 0.099609375, + "learning_rate": 0.0005631111552597596, + "loss": 0.5493, + "step": 59620 + }, + { + "epoch": 2.961656898778186, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005630714214761101, + "loss": 0.5367, + "step": 59630 + }, + { + "epoch": 2.9621535710738054, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005630316876924606, + "loss": 0.5491, + "step": 59640 + }, + { + "epoch": 2.962650243369425, + "grad_norm": 0.0966796875, + "learning_rate": 0.000562991953908811, + "loss": 0.536, + "step": 59650 + }, + { + "epoch": 2.9631469156650443, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005629522201251614, + "loss": 0.5557, + "step": 59660 + }, + { + "epoch": 2.9636435879606635, + "grad_norm": 0.125, + "learning_rate": 0.0005629124863415119, + "loss": 0.57, + "step": 59670 + }, + { + "epoch": 2.9641402602562827, + "grad_norm": 0.150390625, + "learning_rate": 0.0005628727525578623, + "loss": 0.5497, + "step": 59680 + }, + { + "epoch": 2.9646369325519024, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005628330187742128, + "loss": 0.5465, + "step": 59690 + }, + { + "epoch": 2.9651336048475216, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005627932849905633, + "loss": 0.5469, + "step": 59700 + }, + { + "epoch": 2.965630277143141, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005627535512069137, + "loss": 0.5711, + "step": 59710 + }, + { + "epoch": 2.9661269494387605, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005627138174232641, + "loss": 0.5855, + "step": 59720 + }, + { + "epoch": 2.9666236217343798, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005626740836396146, + "loss": 0.5443, + "step": 59730 + }, + { + "epoch": 2.967120294029999, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005626343498559651, + "loss": 0.593, + "step": 59740 + }, + { + "epoch": 2.967616966325618, + "grad_norm": 0.126953125, + "learning_rate": 0.0005625946160723155, + "loss": 0.5584, + "step": 59750 + }, + { + "epoch": 2.9681136386212374, + "grad_norm": 0.1240234375, + "learning_rate": 0.000562554882288666, + "loss": 0.5535, + "step": 59760 + }, + { + "epoch": 2.968610310916857, + "grad_norm": 0.09765625, + "learning_rate": 0.0005625151485050164, + "loss": 0.5354, + "step": 59770 + }, + { + "epoch": 2.9691069832124763, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005624754147213668, + "loss": 0.5449, + "step": 59780 + }, + { + "epoch": 2.969603655508096, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005624356809377174, + "loss": 0.5572, + "step": 59790 + }, + { + "epoch": 2.9701003278037152, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005623959471540678, + "loss": 0.5555, + "step": 59800 + }, + { + "epoch": 2.9705970000993345, + "grad_norm": 0.10546875, + "learning_rate": 0.0005623562133704182, + "loss": 0.5341, + "step": 59810 + }, + { + "epoch": 2.9710936723949537, + "grad_norm": 0.1484375, + "learning_rate": 0.0005623164795867687, + "loss": 0.5561, + "step": 59820 + }, + { + "epoch": 2.971590344690573, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005622767458031191, + "loss": 0.5475, + "step": 59830 + }, + { + "epoch": 2.9720870169861926, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005622370120194696, + "loss": 0.5521, + "step": 59840 + }, + { + "epoch": 2.972583689281812, + "grad_norm": 0.1455078125, + "learning_rate": 0.00056219727823582, + "loss": 0.5514, + "step": 59850 + }, + { + "epoch": 2.973080361577431, + "grad_norm": 0.10546875, + "learning_rate": 0.0005621575444521705, + "loss": 0.5566, + "step": 59860 + }, + { + "epoch": 2.9735770338730507, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005621178106685209, + "loss": 0.5545, + "step": 59870 + }, + { + "epoch": 2.97407370616867, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005620780768848713, + "loss": 0.5559, + "step": 59880 + }, + { + "epoch": 2.974570378464289, + "grad_norm": 0.1875, + "learning_rate": 0.0005620383431012219, + "loss": 0.5307, + "step": 59890 + }, + { + "epoch": 2.9750670507599084, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005619986093175724, + "loss": 0.5339, + "step": 59900 + }, + { + "epoch": 2.975563723055528, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005619588755339227, + "loss": 0.5683, + "step": 59910 + }, + { + "epoch": 2.9760603953511473, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005619191417502732, + "loss": 0.5408, + "step": 59920 + }, + { + "epoch": 2.9765570676467665, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005618794079666236, + "loss": 0.5533, + "step": 59930 + }, + { + "epoch": 2.977053739942386, + "grad_norm": 0.1513671875, + "learning_rate": 0.000561839674182974, + "loss": 0.5609, + "step": 59940 + }, + { + "epoch": 2.9775504122380054, + "grad_norm": 0.103515625, + "learning_rate": 0.0005617999403993246, + "loss": 0.5143, + "step": 59950 + }, + { + "epoch": 2.9780470845336247, + "grad_norm": 0.11376953125, + "learning_rate": 0.000561760206615675, + "loss": 0.5777, + "step": 59960 + }, + { + "epoch": 2.978543756829244, + "grad_norm": 0.11328125, + "learning_rate": 0.0005617204728320254, + "loss": 0.5602, + "step": 59970 + }, + { + "epoch": 2.9790404291248636, + "grad_norm": 0.099609375, + "learning_rate": 0.0005616807390483759, + "loss": 0.5576, + "step": 59980 + }, + { + "epoch": 2.979537101420483, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005616410052647264, + "loss": 0.5405, + "step": 59990 + }, + { + "epoch": 2.980033773716102, + "grad_norm": 0.1171875, + "learning_rate": 0.0005616012714810768, + "loss": 0.5464, + "step": 60000 + }, + { + "epoch": 2.9805304460117217, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005615615376974273, + "loss": 0.557, + "step": 60010 + }, + { + "epoch": 2.981027118307341, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005615218039137777, + "loss": 0.5384, + "step": 60020 + }, + { + "epoch": 2.98152379060296, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005614820701301282, + "loss": 0.5597, + "step": 60030 + }, + { + "epoch": 2.9820204628985794, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005614423363464786, + "loss": 0.5859, + "step": 60040 + }, + { + "epoch": 2.982517135194199, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005614026025628291, + "loss": 0.5451, + "step": 60050 + }, + { + "epoch": 2.9830138074898183, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005613628687791796, + "loss": 0.5357, + "step": 60060 + }, + { + "epoch": 2.9835104797854375, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005613231349955299, + "loss": 0.5503, + "step": 60070 + }, + { + "epoch": 2.984007152081057, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005612834012118804, + "loss": 0.5272, + "step": 60080 + }, + { + "epoch": 2.9845038243766764, + "grad_norm": 0.10986328125, + "learning_rate": 0.000561243667428231, + "loss": 0.5474, + "step": 60090 + }, + { + "epoch": 2.9850004966722956, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005612039336445813, + "loss": 0.5673, + "step": 60100 + }, + { + "epoch": 2.985497168967915, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005611641998609318, + "loss": 0.5401, + "step": 60110 + }, + { + "epoch": 2.985993841263534, + "grad_norm": 0.1015625, + "learning_rate": 0.0005611244660772822, + "loss": 0.5523, + "step": 60120 + }, + { + "epoch": 2.9864905135591537, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005610847322936326, + "loss": 0.5665, + "step": 60130 + }, + { + "epoch": 2.986987185854773, + "grad_norm": 0.146484375, + "learning_rate": 0.0005610449985099832, + "loss": 0.517, + "step": 60140 + }, + { + "epoch": 2.9874838581503926, + "grad_norm": 0.095703125, + "learning_rate": 0.0005610052647263336, + "loss": 0.552, + "step": 60150 + }, + { + "epoch": 2.987980530446012, + "grad_norm": 0.09521484375, + "learning_rate": 0.000560965530942684, + "loss": 0.5065, + "step": 60160 + }, + { + "epoch": 2.988477202741631, + "grad_norm": 0.10546875, + "learning_rate": 0.0005609257971590345, + "loss": 0.542, + "step": 60170 + }, + { + "epoch": 2.9889738750372503, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005608860633753849, + "loss": 0.5447, + "step": 60180 + }, + { + "epoch": 2.9894705473328695, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005608463295917355, + "loss": 0.5466, + "step": 60190 + }, + { + "epoch": 2.989967219628489, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005608065958080859, + "loss": 0.5523, + "step": 60200 + }, + { + "epoch": 2.9904638919241084, + "grad_norm": 0.099609375, + "learning_rate": 0.0005607668620244363, + "loss": 0.5566, + "step": 60210 + }, + { + "epoch": 2.9909605642197277, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005607271282407868, + "loss": 0.5332, + "step": 60220 + }, + { + "epoch": 2.9914572365153473, + "grad_norm": 0.09375, + "learning_rate": 0.0005606873944571372, + "loss": 0.5049, + "step": 60230 + }, + { + "epoch": 2.9919539088109666, + "grad_norm": 0.123046875, + "learning_rate": 0.0005606476606734877, + "loss": 0.5662, + "step": 60240 + }, + { + "epoch": 2.992450581106586, + "grad_norm": 0.115234375, + "learning_rate": 0.0005606079268898382, + "loss": 0.5306, + "step": 60250 + }, + { + "epoch": 2.992947253402205, + "grad_norm": 0.1171875, + "learning_rate": 0.0005605681931061885, + "loss": 0.5561, + "step": 60260 + }, + { + "epoch": 2.9934439256978247, + "grad_norm": 0.1025390625, + "learning_rate": 0.000560528459322539, + "loss": 0.5283, + "step": 60270 + }, + { + "epoch": 2.993940597993444, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005604887255388895, + "loss": 0.5596, + "step": 60280 + }, + { + "epoch": 2.994437270289063, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005604489917552398, + "loss": 0.5754, + "step": 60290 + }, + { + "epoch": 2.994933942584683, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005604092579715904, + "loss": 0.5628, + "step": 60300 + }, + { + "epoch": 2.995430614880302, + "grad_norm": 0.134765625, + "learning_rate": 0.0005603695241879408, + "loss": 0.5484, + "step": 60310 + }, + { + "epoch": 2.9959272871759213, + "grad_norm": 0.12890625, + "learning_rate": 0.0005603297904042912, + "loss": 0.5422, + "step": 60320 + }, + { + "epoch": 2.9964239594715405, + "grad_norm": 0.111328125, + "learning_rate": 0.0005602900566206417, + "loss": 0.5361, + "step": 60330 + }, + { + "epoch": 2.99692063176716, + "grad_norm": 0.130859375, + "learning_rate": 0.0005602503228369921, + "loss": 0.5161, + "step": 60340 + }, + { + "epoch": 2.9974173040627794, + "grad_norm": 0.109375, + "learning_rate": 0.0005602105890533427, + "loss": 0.5492, + "step": 60350 + }, + { + "epoch": 2.9979139763583986, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005601708552696931, + "loss": 0.5375, + "step": 60360 + }, + { + "epoch": 2.9984106486540183, + "grad_norm": 0.099609375, + "learning_rate": 0.0005601311214860435, + "loss": 0.5327, + "step": 60370 + }, + { + "epoch": 2.9989073209496375, + "grad_norm": 0.17578125, + "learning_rate": 0.000560091387702394, + "loss": 0.5367, + "step": 60380 + }, + { + "epoch": 2.9994039932452567, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005600516539187445, + "loss": 0.5471, + "step": 60390 + }, + { + "epoch": 2.999900665540876, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005600119201350949, + "loss": 0.5716, + "step": 60400 + }, + { + "epoch": 3.0003973378364956, + "grad_norm": 0.1328125, + "learning_rate": 0.0005599721863514454, + "loss": 0.5505, + "step": 60410 + }, + { + "epoch": 3.000894010132115, + "grad_norm": 0.111328125, + "learning_rate": 0.0005599324525677958, + "loss": 0.5274, + "step": 60420 + }, + { + "epoch": 3.001390682427734, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005598927187841462, + "loss": 0.5332, + "step": 60430 + }, + { + "epoch": 3.0018873547233533, + "grad_norm": 0.08984375, + "learning_rate": 0.0005598529850004968, + "loss": 0.5394, + "step": 60440 + }, + { + "epoch": 3.002384027018973, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005598132512168471, + "loss": 0.5643, + "step": 60450 + }, + { + "epoch": 3.0028806993145922, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005597735174331976, + "loss": 0.5487, + "step": 60460 + }, + { + "epoch": 3.0033773716102115, + "grad_norm": 0.1015625, + "learning_rate": 0.0005597337836495481, + "loss": 0.5516, + "step": 60470 + }, + { + "epoch": 3.003874043905831, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005596940498658985, + "loss": 0.5343, + "step": 60480 + }, + { + "epoch": 3.0043707162014504, + "grad_norm": 0.1064453125, + "learning_rate": 0.000559654316082249, + "loss": 0.5349, + "step": 60490 + }, + { + "epoch": 3.0048673884970696, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005596145822985995, + "loss": 0.5361, + "step": 60500 + }, + { + "epoch": 3.005364060792689, + "grad_norm": 0.09375, + "learning_rate": 0.0005595748485149499, + "loss": 0.5158, + "step": 60510 + }, + { + "epoch": 3.0058607330883085, + "grad_norm": 0.109375, + "learning_rate": 0.0005595351147313003, + "loss": 0.5549, + "step": 60520 + }, + { + "epoch": 3.0063574053839277, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005594953809476507, + "loss": 0.5027, + "step": 60530 + }, + { + "epoch": 3.006854077679547, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005594556471640013, + "loss": 0.5512, + "step": 60540 + }, + { + "epoch": 3.0073507499751666, + "grad_norm": 0.11328125, + "learning_rate": 0.0005594159133803517, + "loss": 0.5563, + "step": 60550 + }, + { + "epoch": 3.007847422270786, + "grad_norm": 0.10546875, + "learning_rate": 0.0005593761795967021, + "loss": 0.551, + "step": 60560 + }, + { + "epoch": 3.008344094566405, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005593364458130526, + "loss": 0.5456, + "step": 60570 + }, + { + "epoch": 3.0088407668620243, + "grad_norm": 0.162109375, + "learning_rate": 0.000559296712029403, + "loss": 0.5606, + "step": 60580 + }, + { + "epoch": 3.009337439157644, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005592569782457534, + "loss": 0.5347, + "step": 60590 + }, + { + "epoch": 3.009834111453263, + "grad_norm": 0.138671875, + "learning_rate": 0.000559217244462104, + "loss": 0.5382, + "step": 60600 + }, + { + "epoch": 3.0103307837488824, + "grad_norm": 0.140625, + "learning_rate": 0.0005591775106784544, + "loss": 0.5571, + "step": 60610 + }, + { + "epoch": 3.0108274560445016, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005591377768948048, + "loss": 0.5239, + "step": 60620 + }, + { + "epoch": 3.0113241283401213, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005590980431111553, + "loss": 0.5453, + "step": 60630 + }, + { + "epoch": 3.0118208006357405, + "grad_norm": 0.09765625, + "learning_rate": 0.0005590583093275057, + "loss": 0.5146, + "step": 60640 + }, + { + "epoch": 3.0123174729313598, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005590185755438562, + "loss": 0.506, + "step": 60650 + }, + { + "epoch": 3.0128141452269794, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005589788417602067, + "loss": 0.5441, + "step": 60660 + }, + { + "epoch": 3.0133108175225987, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005589391079765571, + "loss": 0.5373, + "step": 60670 + }, + { + "epoch": 3.013807489818218, + "grad_norm": 0.169921875, + "learning_rate": 0.0005588993741929075, + "loss": 0.5262, + "step": 60680 + }, + { + "epoch": 3.014304162113837, + "grad_norm": 0.10009765625, + "learning_rate": 0.000558859640409258, + "loss": 0.5653, + "step": 60690 + }, + { + "epoch": 3.014800834409457, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005588199066256085, + "loss": 0.5368, + "step": 60700 + }, + { + "epoch": 3.015297506705076, + "grad_norm": 0.1484375, + "learning_rate": 0.0005587801728419589, + "loss": 0.5373, + "step": 60710 + }, + { + "epoch": 3.0157941790006952, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005587404390583093, + "loss": 0.5432, + "step": 60720 + }, + { + "epoch": 3.016290851296315, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005587007052746598, + "loss": 0.5181, + "step": 60730 + }, + { + "epoch": 3.016787523591934, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005586609714910102, + "loss": 0.5347, + "step": 60740 + }, + { + "epoch": 3.0172841958875534, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005586212377073607, + "loss": 0.5342, + "step": 60750 + }, + { + "epoch": 3.0177808681831726, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005585815039237112, + "loss": 0.5126, + "step": 60760 + }, + { + "epoch": 3.0182775404787923, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005585417701400616, + "loss": 0.5561, + "step": 60770 + }, + { + "epoch": 3.0187742127744115, + "grad_norm": 0.1083984375, + "learning_rate": 0.000558502036356412, + "loss": 0.5455, + "step": 60780 + }, + { + "epoch": 3.0192708850700307, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005584623025727625, + "loss": 0.5802, + "step": 60790 + }, + { + "epoch": 3.01976755736565, + "grad_norm": 0.1328125, + "learning_rate": 0.000558422568789113, + "loss": 0.5601, + "step": 60800 + }, + { + "epoch": 3.0202642296612696, + "grad_norm": 0.169921875, + "learning_rate": 0.0005583828350054634, + "loss": 0.5845, + "step": 60810 + }, + { + "epoch": 3.020760901956889, + "grad_norm": 0.095703125, + "learning_rate": 0.0005583431012218139, + "loss": 0.5392, + "step": 60820 + }, + { + "epoch": 3.021257574252508, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005583033674381643, + "loss": 0.5334, + "step": 60830 + }, + { + "epoch": 3.0217542465481277, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005582636336545147, + "loss": 0.5474, + "step": 60840 + }, + { + "epoch": 3.022250918843747, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005582238998708653, + "loss": 0.5282, + "step": 60850 + }, + { + "epoch": 3.022747591139366, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005581841660872157, + "loss": 0.547, + "step": 60860 + }, + { + "epoch": 3.0232442634349854, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005581444323035661, + "loss": 0.5394, + "step": 60870 + }, + { + "epoch": 3.023740935730605, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005581046985199166, + "loss": 0.5465, + "step": 60880 + }, + { + "epoch": 3.0242376080262243, + "grad_norm": 0.10595703125, + "learning_rate": 0.000558064964736267, + "loss": 0.5242, + "step": 60890 + }, + { + "epoch": 3.0247342803218435, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005580252309526175, + "loss": 0.5338, + "step": 60900 + }, + { + "epoch": 3.0252309526174628, + "grad_norm": 0.09375, + "learning_rate": 0.0005579854971689679, + "loss": 0.5479, + "step": 60910 + }, + { + "epoch": 3.0257276249130824, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005579457633853184, + "loss": 0.5434, + "step": 60920 + }, + { + "epoch": 3.0262242972087017, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005579060296016689, + "loss": 0.5436, + "step": 60930 + }, + { + "epoch": 3.026720969504321, + "grad_norm": 0.109375, + "learning_rate": 0.0005578662958180192, + "loss": 0.5053, + "step": 60940 + }, + { + "epoch": 3.0272176417999406, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005578265620343698, + "loss": 0.52, + "step": 60950 + }, + { + "epoch": 3.02771431409556, + "grad_norm": 0.1015625, + "learning_rate": 0.0005577868282507203, + "loss": 0.5536, + "step": 60960 + }, + { + "epoch": 3.028210986391179, + "grad_norm": 0.10546875, + "learning_rate": 0.0005577470944670706, + "loss": 0.54, + "step": 60970 + }, + { + "epoch": 3.0287076586867983, + "grad_norm": 0.134765625, + "learning_rate": 0.0005577073606834211, + "loss": 0.5429, + "step": 60980 + }, + { + "epoch": 3.029204330982418, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005576676268997715, + "loss": 0.4954, + "step": 60990 + }, + { + "epoch": 3.029701003278037, + "grad_norm": 0.1044921875, + "learning_rate": 0.000557627893116122, + "loss": 0.5265, + "step": 61000 + }, + { + "epoch": 3.0301976755736564, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005575881593324725, + "loss": 0.554, + "step": 61010 + }, + { + "epoch": 3.030694347869276, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005575484255488229, + "loss": 0.595, + "step": 61020 + }, + { + "epoch": 3.0311910201648953, + "grad_norm": 0.14453125, + "learning_rate": 0.0005575086917651733, + "loss": 0.5713, + "step": 61030 + }, + { + "epoch": 3.0316876924605145, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005574689579815238, + "loss": 0.5409, + "step": 61040 + }, + { + "epoch": 3.0321843647561337, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005574292241978743, + "loss": 0.529, + "step": 61050 + }, + { + "epoch": 3.0326810370517534, + "grad_norm": 0.109375, + "learning_rate": 0.0005573894904142247, + "loss": 0.5127, + "step": 61060 + }, + { + "epoch": 3.0331777093473726, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005573497566305752, + "loss": 0.5463, + "step": 61070 + }, + { + "epoch": 3.033674381642992, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005573100228469256, + "loss": 0.5276, + "step": 61080 + }, + { + "epoch": 3.0341710539386115, + "grad_norm": 0.125, + "learning_rate": 0.0005572702890632761, + "loss": 0.5473, + "step": 61090 + }, + { + "epoch": 3.0346677262342308, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005572305552796266, + "loss": 0.5491, + "step": 61100 + }, + { + "epoch": 3.03516439852985, + "grad_norm": 0.126953125, + "learning_rate": 0.000557190821495977, + "loss": 0.5376, + "step": 61110 + }, + { + "epoch": 3.035661070825469, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005571510877123275, + "loss": 0.5374, + "step": 61120 + }, + { + "epoch": 3.036157743121089, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005571113539286778, + "loss": 0.549, + "step": 61130 + }, + { + "epoch": 3.036654415416708, + "grad_norm": 0.14453125, + "learning_rate": 0.0005570716201450283, + "loss": 0.5694, + "step": 61140 + }, + { + "epoch": 3.0371510877123273, + "grad_norm": 0.16015625, + "learning_rate": 0.0005570318863613789, + "loss": 0.5125, + "step": 61150 + }, + { + "epoch": 3.0376477600079466, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005569921525777292, + "loss": 0.5519, + "step": 61160 + }, + { + "epoch": 3.0381444323035662, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005569524187940797, + "loss": 0.5451, + "step": 61170 + }, + { + "epoch": 3.0386411045991855, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005569126850104301, + "loss": 0.5599, + "step": 61180 + }, + { + "epoch": 3.0391377768948047, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005568729512267805, + "loss": 0.4976, + "step": 61190 + }, + { + "epoch": 3.0396344491904244, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005568332174431311, + "loss": 0.5451, + "step": 61200 + }, + { + "epoch": 3.0401311214860436, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005567934836594815, + "loss": 0.5469, + "step": 61210 + }, + { + "epoch": 3.040627793781663, + "grad_norm": 0.126953125, + "learning_rate": 0.0005567537498758319, + "loss": 0.5476, + "step": 61220 + }, + { + "epoch": 3.041124466077282, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005567140160921824, + "loss": 0.5501, + "step": 61230 + }, + { + "epoch": 3.0416211383729017, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005566742823085328, + "loss": 0.5679, + "step": 61240 + }, + { + "epoch": 3.042117810668521, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005566345485248834, + "loss": 0.5431, + "step": 61250 + }, + { + "epoch": 3.04261448296414, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005565948147412338, + "loss": 0.5192, + "step": 61260 + }, + { + "epoch": 3.0431111552597594, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005565550809575842, + "loss": 0.5309, + "step": 61270 + }, + { + "epoch": 3.043607827555379, + "grad_norm": 0.091796875, + "learning_rate": 0.0005565153471739347, + "loss": 0.5166, + "step": 61280 + }, + { + "epoch": 3.0441044998509983, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005564756133902851, + "loss": 0.5208, + "step": 61290 + }, + { + "epoch": 3.0446011721466175, + "grad_norm": 0.130859375, + "learning_rate": 0.0005564358796066356, + "loss": 0.5551, + "step": 61300 + }, + { + "epoch": 3.045097844442237, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005563961458229861, + "loss": 0.5407, + "step": 61310 + }, + { + "epoch": 3.0455945167378564, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005563564120393364, + "loss": 0.5215, + "step": 61320 + }, + { + "epoch": 3.0460911890334756, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005563166782556869, + "loss": 0.5664, + "step": 61330 + }, + { + "epoch": 3.046587861329095, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005562769444720374, + "loss": 0.5375, + "step": 61340 + }, + { + "epoch": 3.0470845336247145, + "grad_norm": 0.09765625, + "learning_rate": 0.0005562372106883878, + "loss": 0.5631, + "step": 61350 + }, + { + "epoch": 3.0475812059203338, + "grad_norm": 0.099609375, + "learning_rate": 0.0005561974769047383, + "loss": 0.5521, + "step": 61360 + }, + { + "epoch": 3.048077878215953, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005561577431210888, + "loss": 0.5416, + "step": 61370 + }, + { + "epoch": 3.0485745505115727, + "grad_norm": 0.16015625, + "learning_rate": 0.0005561180093374392, + "loss": 0.5347, + "step": 61380 + }, + { + "epoch": 3.049071222807192, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005560782755537896, + "loss": 0.5525, + "step": 61390 + }, + { + "epoch": 3.049567895102811, + "grad_norm": 0.1337890625, + "learning_rate": 0.00055603854177014, + "loss": 0.5232, + "step": 61400 + }, + { + "epoch": 3.0500645673984303, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005559988079864906, + "loss": 0.5682, + "step": 61410 + }, + { + "epoch": 3.05056123969405, + "grad_norm": 0.0966796875, + "learning_rate": 0.000555959074202841, + "loss": 0.5561, + "step": 61420 + }, + { + "epoch": 3.0510579119896692, + "grad_norm": 0.09375, + "learning_rate": 0.0005559193404191914, + "loss": 0.547, + "step": 61430 + }, + { + "epoch": 3.0515545842852885, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005558796066355419, + "loss": 0.5549, + "step": 61440 + }, + { + "epoch": 3.0520512565809077, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005558398728518924, + "loss": 0.534, + "step": 61450 + }, + { + "epoch": 3.0525479288765274, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005558001390682428, + "loss": 0.5269, + "step": 61460 + }, + { + "epoch": 3.0530446011721466, + "grad_norm": 0.107421875, + "learning_rate": 0.0005557604052845933, + "loss": 0.554, + "step": 61470 + }, + { + "epoch": 3.053541273467766, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005557206715009437, + "loss": 0.5262, + "step": 61480 + }, + { + "epoch": 3.0540379457633855, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005556809377172941, + "loss": 0.5439, + "step": 61490 + }, + { + "epoch": 3.0545346180590047, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005556412039336447, + "loss": 0.5188, + "step": 61500 + }, + { + "epoch": 3.055031290354624, + "grad_norm": 0.10107421875, + "learning_rate": 0.000555601470149995, + "loss": 0.5862, + "step": 61510 + }, + { + "epoch": 3.055527962650243, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005555617363663455, + "loss": 0.5218, + "step": 61520 + }, + { + "epoch": 3.056024634945863, + "grad_norm": 0.09326171875, + "learning_rate": 0.000555522002582696, + "loss": 0.5514, + "step": 61530 + }, + { + "epoch": 3.056521307241482, + "grad_norm": 0.177734375, + "learning_rate": 0.0005554822687990464, + "loss": 0.5412, + "step": 61540 + }, + { + "epoch": 3.0570179795371013, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005554425350153969, + "loss": 0.5377, + "step": 61550 + }, + { + "epoch": 3.057514651832721, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005554028012317474, + "loss": 0.519, + "step": 61560 + }, + { + "epoch": 3.05801132412834, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005553630674480978, + "loss": 0.5234, + "step": 61570 + }, + { + "epoch": 3.0585079964239594, + "grad_norm": 0.115234375, + "learning_rate": 0.0005553233336644482, + "loss": 0.5131, + "step": 61580 + }, + { + "epoch": 3.0590046687195787, + "grad_norm": 0.138671875, + "learning_rate": 0.0005552835998807986, + "loss": 0.5561, + "step": 61590 + }, + { + "epoch": 3.0595013410151983, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005552438660971492, + "loss": 0.5365, + "step": 61600 + }, + { + "epoch": 3.0599980133108176, + "grad_norm": 0.109375, + "learning_rate": 0.0005552041323134996, + "loss": 0.5246, + "step": 61610 + }, + { + "epoch": 3.060494685606437, + "grad_norm": 0.142578125, + "learning_rate": 0.00055516439852985, + "loss": 0.5475, + "step": 61620 + }, + { + "epoch": 3.060991357902056, + "grad_norm": 0.11328125, + "learning_rate": 0.0005551246647462005, + "loss": 0.5341, + "step": 61630 + }, + { + "epoch": 3.0614880301976757, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005550849309625509, + "loss": 0.529, + "step": 61640 + }, + { + "epoch": 3.061984702493295, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005550451971789013, + "loss": 0.5492, + "step": 61650 + }, + { + "epoch": 3.062481374788914, + "grad_norm": 0.119140625, + "learning_rate": 0.0005550054633952519, + "loss": 0.5384, + "step": 61660 + }, + { + "epoch": 3.062978047084534, + "grad_norm": 0.099609375, + "learning_rate": 0.0005549657296116023, + "loss": 0.5352, + "step": 61670 + }, + { + "epoch": 3.063474719380153, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005549259958279527, + "loss": 0.5321, + "step": 61680 + }, + { + "epoch": 3.0639713916757723, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005548862620443032, + "loss": 0.5378, + "step": 61690 + }, + { + "epoch": 3.0644680639713915, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005548465282606537, + "loss": 0.5709, + "step": 61700 + }, + { + "epoch": 3.064964736267011, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005548067944770041, + "loss": 0.5592, + "step": 61710 + }, + { + "epoch": 3.0654614085626304, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005547670606933546, + "loss": 0.5601, + "step": 61720 + }, + { + "epoch": 3.0659580808582496, + "grad_norm": 0.1005859375, + "learning_rate": 0.000554727326909705, + "loss": 0.5452, + "step": 61730 + }, + { + "epoch": 3.0664547531538693, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005546875931260554, + "loss": 0.554, + "step": 61740 + }, + { + "epoch": 3.0669514254494885, + "grad_norm": 0.10693359375, + "learning_rate": 0.000554647859342406, + "loss": 0.5515, + "step": 61750 + }, + { + "epoch": 3.0674480977451077, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005546081255587564, + "loss": 0.537, + "step": 61760 + }, + { + "epoch": 3.067944770040727, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005545683917751068, + "loss": 0.5719, + "step": 61770 + }, + { + "epoch": 3.0684414423363466, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005545286579914572, + "loss": 0.4947, + "step": 61780 + }, + { + "epoch": 3.068938114631966, + "grad_norm": 0.103515625, + "learning_rate": 0.0005544889242078077, + "loss": 0.5461, + "step": 61790 + }, + { + "epoch": 3.069434786927585, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005544491904241581, + "loss": 0.5729, + "step": 61800 + }, + { + "epoch": 3.0699314592232043, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005544094566405086, + "loss": 0.5313, + "step": 61810 + }, + { + "epoch": 3.070428131518824, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005543697228568591, + "loss": 0.5505, + "step": 61820 + }, + { + "epoch": 3.070924803814443, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005543299890732096, + "loss": 0.5322, + "step": 61830 + }, + { + "epoch": 3.0714214761100624, + "grad_norm": 0.12890625, + "learning_rate": 0.0005542902552895599, + "loss": 0.5548, + "step": 61840 + }, + { + "epoch": 3.071918148405682, + "grad_norm": 0.1328125, + "learning_rate": 0.0005542505215059105, + "loss": 0.5299, + "step": 61850 + }, + { + "epoch": 3.0724148207013013, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005542107877222609, + "loss": 0.5366, + "step": 61860 + }, + { + "epoch": 3.0729114929969206, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005541710539386113, + "loss": 0.5327, + "step": 61870 + }, + { + "epoch": 3.07340816529254, + "grad_norm": 0.138671875, + "learning_rate": 0.0005541313201549618, + "loss": 0.5327, + "step": 61880 + }, + { + "epoch": 3.0739048375881595, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005540915863713122, + "loss": 0.563, + "step": 61890 + }, + { + "epoch": 3.0744015098837787, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005540518525876626, + "loss": 0.5256, + "step": 61900 + }, + { + "epoch": 3.074898182179398, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005540121188040132, + "loss": 0.5415, + "step": 61910 + }, + { + "epoch": 3.0753948544750176, + "grad_norm": 0.123046875, + "learning_rate": 0.0005539723850203636, + "loss": 0.5438, + "step": 61920 + }, + { + "epoch": 3.075891526770637, + "grad_norm": 0.1279296875, + "learning_rate": 0.000553932651236714, + "loss": 0.5567, + "step": 61930 + }, + { + "epoch": 3.076388199066256, + "grad_norm": 0.09765625, + "learning_rate": 0.0005538929174530645, + "loss": 0.5373, + "step": 61940 + }, + { + "epoch": 3.0768848713618753, + "grad_norm": 0.11376953125, + "learning_rate": 0.000553853183669415, + "loss": 0.5203, + "step": 61950 + }, + { + "epoch": 3.077381543657495, + "grad_norm": 0.154296875, + "learning_rate": 0.0005538134498857654, + "loss": 0.5388, + "step": 61960 + }, + { + "epoch": 3.077878215953114, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005537737161021158, + "loss": 0.5174, + "step": 61970 + }, + { + "epoch": 3.0783748882487334, + "grad_norm": 0.109375, + "learning_rate": 0.0005537339823184663, + "loss": 0.5629, + "step": 61980 + }, + { + "epoch": 3.0788715605443526, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005536942485348168, + "loss": 0.5578, + "step": 61990 + }, + { + "epoch": 3.0793682328399723, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005536545147511671, + "loss": 0.5396, + "step": 62000 + }, + { + "epoch": 3.0798649051355915, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005536147809675177, + "loss": 0.5275, + "step": 62010 + }, + { + "epoch": 3.0803615774312108, + "grad_norm": 0.109375, + "learning_rate": 0.0005535750471838682, + "loss": 0.5266, + "step": 62020 + }, + { + "epoch": 3.0808582497268304, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005535353134002185, + "loss": 0.5339, + "step": 62030 + }, + { + "epoch": 3.0813549220224496, + "grad_norm": 0.1279296875, + "learning_rate": 0.000553495579616569, + "loss": 0.5053, + "step": 62040 + }, + { + "epoch": 3.081851594318069, + "grad_norm": 0.095703125, + "learning_rate": 0.0005534558458329194, + "loss": 0.5093, + "step": 62050 + }, + { + "epoch": 3.082348266613688, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005534161120492699, + "loss": 0.5779, + "step": 62060 + }, + { + "epoch": 3.0828449389093078, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005533763782656204, + "loss": 0.5301, + "step": 62070 + }, + { + "epoch": 3.083341611204927, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005533366444819708, + "loss": 0.5422, + "step": 62080 + }, + { + "epoch": 3.0838382835005462, + "grad_norm": 0.115234375, + "learning_rate": 0.0005532969106983212, + "loss": 0.5463, + "step": 62090 + }, + { + "epoch": 3.084334955796166, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005532571769146717, + "loss": 0.5811, + "step": 62100 + }, + { + "epoch": 3.084831628091785, + "grad_norm": 0.1171875, + "learning_rate": 0.0005532174431310222, + "loss": 0.5423, + "step": 62110 + }, + { + "epoch": 3.0853283003874044, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005531777093473726, + "loss": 0.5375, + "step": 62120 + }, + { + "epoch": 3.0858249726830236, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005531379755637231, + "loss": 0.5421, + "step": 62130 + }, + { + "epoch": 3.0863216449786433, + "grad_norm": 0.08984375, + "learning_rate": 0.0005530982417800735, + "loss": 0.5549, + "step": 62140 + }, + { + "epoch": 3.0868183172742625, + "grad_norm": 0.1005859375, + "learning_rate": 0.000553058507996424, + "loss": 0.5281, + "step": 62150 + }, + { + "epoch": 3.0873149895698817, + "grad_norm": 0.1171875, + "learning_rate": 0.0005530187742127745, + "loss": 0.5695, + "step": 62160 + }, + { + "epoch": 3.087811661865501, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005529790404291249, + "loss": 0.5456, + "step": 62170 + }, + { + "epoch": 3.0883083341611206, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005529393066454754, + "loss": 0.5652, + "step": 62180 + }, + { + "epoch": 3.08880500645674, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005528995728618257, + "loss": 0.5569, + "step": 62190 + }, + { + "epoch": 3.089301678752359, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005528598390781762, + "loss": 0.5453, + "step": 62200 + }, + { + "epoch": 3.0897983510479787, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005528201052945268, + "loss": 0.5413, + "step": 62210 + }, + { + "epoch": 3.090295023343598, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005527803715108771, + "loss": 0.545, + "step": 62220 + }, + { + "epoch": 3.090791695639217, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005527406377272276, + "loss": 0.5355, + "step": 62230 + }, + { + "epoch": 3.0912883679348364, + "grad_norm": 0.10302734375, + "learning_rate": 0.000552700903943578, + "loss": 0.5564, + "step": 62240 + }, + { + "epoch": 3.091785040230456, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005526611701599284, + "loss": 0.5451, + "step": 62250 + }, + { + "epoch": 3.0922817125260753, + "grad_norm": 0.1298828125, + "learning_rate": 0.000552621436376279, + "loss": 0.5192, + "step": 62260 + }, + { + "epoch": 3.0927783848216945, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005525817025926294, + "loss": 0.521, + "step": 62270 + }, + { + "epoch": 3.093275057117314, + "grad_norm": 0.119140625, + "learning_rate": 0.0005525419688089799, + "loss": 0.5656, + "step": 62280 + }, + { + "epoch": 3.0937717294129334, + "grad_norm": 0.103515625, + "learning_rate": 0.0005525022350253303, + "loss": 0.5135, + "step": 62290 + }, + { + "epoch": 3.0942684017085527, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005524625012416807, + "loss": 0.5452, + "step": 62300 + }, + { + "epoch": 3.094765074004172, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005524227674580313, + "loss": 0.5428, + "step": 62310 + }, + { + "epoch": 3.0952617462997916, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005523830336743817, + "loss": 0.5487, + "step": 62320 + }, + { + "epoch": 3.095758418595411, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005523432998907321, + "loss": 0.5499, + "step": 62330 + }, + { + "epoch": 3.09625509089103, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005523035661070826, + "loss": 0.55, + "step": 62340 + }, + { + "epoch": 3.0967517631866492, + "grad_norm": 0.12158203125, + "learning_rate": 0.000552263832323433, + "loss": 0.5263, + "step": 62350 + }, + { + "epoch": 3.097248435482269, + "grad_norm": 0.15625, + "learning_rate": 0.0005522240985397835, + "loss": 0.5364, + "step": 62360 + }, + { + "epoch": 3.097745107777888, + "grad_norm": 0.11181640625, + "learning_rate": 0.000552184364756134, + "loss": 0.5405, + "step": 62370 + }, + { + "epoch": 3.0982417800735074, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005521446309724843, + "loss": 0.5298, + "step": 62380 + }, + { + "epoch": 3.098738452369127, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005521048971888348, + "loss": 0.5596, + "step": 62390 + }, + { + "epoch": 3.0992351246647463, + "grad_norm": 0.095703125, + "learning_rate": 0.0005520651634051853, + "loss": 0.5401, + "step": 62400 + }, + { + "epoch": 3.0997317969603655, + "grad_norm": 0.1015625, + "learning_rate": 0.0005520254296215357, + "loss": 0.5401, + "step": 62410 + }, + { + "epoch": 3.1002284692559847, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005519856958378862, + "loss": 0.5443, + "step": 62420 + }, + { + "epoch": 3.1007251415516044, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005519459620542367, + "loss": 0.538, + "step": 62430 + }, + { + "epoch": 3.1012218138472236, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005519062282705871, + "loss": 0.524, + "step": 62440 + }, + { + "epoch": 3.101718486142843, + "grad_norm": 0.15234375, + "learning_rate": 0.0005518664944869375, + "loss": 0.5423, + "step": 62450 + }, + { + "epoch": 3.1022151584384625, + "grad_norm": 0.1298828125, + "learning_rate": 0.000551826760703288, + "loss": 0.5312, + "step": 62460 + }, + { + "epoch": 3.1027118307340817, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005517870269196385, + "loss": 0.5245, + "step": 62470 + }, + { + "epoch": 3.103208503029701, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005517472931359889, + "loss": 0.5601, + "step": 62480 + }, + { + "epoch": 3.10370517532532, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005517075593523393, + "loss": 0.5267, + "step": 62490 + }, + { + "epoch": 3.10420184762094, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005516678255686898, + "loss": 0.569, + "step": 62500 + }, + { + "epoch": 3.104698519916559, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005516280917850403, + "loss": 0.5465, + "step": 62510 + }, + { + "epoch": 3.1051951922121783, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005515883580013907, + "loss": 0.5337, + "step": 62520 + }, + { + "epoch": 3.1056918645077976, + "grad_norm": 0.15625, + "learning_rate": 0.0005515486242177412, + "loss": 0.5241, + "step": 62530 + }, + { + "epoch": 3.106188536803417, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005515088904340916, + "loss": 0.5715, + "step": 62540 + }, + { + "epoch": 3.1066852090990364, + "grad_norm": 0.12890625, + "learning_rate": 0.000551469156650442, + "loss": 0.5474, + "step": 62550 + }, + { + "epoch": 3.1071818813946557, + "grad_norm": 0.111328125, + "learning_rate": 0.0005514294228667926, + "loss": 0.5265, + "step": 62560 + }, + { + "epoch": 3.1076785536902753, + "grad_norm": 0.0966796875, + "learning_rate": 0.000551389689083143, + "loss": 0.5357, + "step": 62570 + }, + { + "epoch": 3.1081752259858946, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005513499552994934, + "loss": 0.5471, + "step": 62580 + }, + { + "epoch": 3.108671898281514, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005513102215158439, + "loss": 0.5478, + "step": 62590 + }, + { + "epoch": 3.109168570577133, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005512704877321943, + "loss": 0.5785, + "step": 62600 + }, + { + "epoch": 3.1096652428727527, + "grad_norm": 0.109375, + "learning_rate": 0.0005512307539485448, + "loss": 0.5711, + "step": 62610 + }, + { + "epoch": 3.110161915168372, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005511910201648953, + "loss": 0.533, + "step": 62620 + }, + { + "epoch": 3.110658587463991, + "grad_norm": 0.09375, + "learning_rate": 0.0005511512863812457, + "loss": 0.5451, + "step": 62630 + }, + { + "epoch": 3.1111552597596104, + "grad_norm": 0.109375, + "learning_rate": 0.0005511115525975961, + "loss": 0.544, + "step": 62640 + }, + { + "epoch": 3.11165193205523, + "grad_norm": 0.10546875, + "learning_rate": 0.0005510718188139465, + "loss": 0.529, + "step": 62650 + }, + { + "epoch": 3.1121486043508493, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005510320850302971, + "loss": 0.5538, + "step": 62660 + }, + { + "epoch": 3.1126452766464685, + "grad_norm": 0.103515625, + "learning_rate": 0.0005509923512466475, + "loss": 0.5443, + "step": 62670 + }, + { + "epoch": 3.113141948942088, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005509526174629979, + "loss": 0.5412, + "step": 62680 + }, + { + "epoch": 3.1136386212377074, + "grad_norm": 0.10546875, + "learning_rate": 0.0005509128836793484, + "loss": 0.5474, + "step": 62690 + }, + { + "epoch": 3.1141352935333266, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005508731498956988, + "loss": 0.5529, + "step": 62700 + }, + { + "epoch": 3.114631965828946, + "grad_norm": 0.119140625, + "learning_rate": 0.0005508334161120493, + "loss": 0.5555, + "step": 62710 + }, + { + "epoch": 3.1151286381245655, + "grad_norm": 0.17578125, + "learning_rate": 0.0005507936823283998, + "loss": 0.5342, + "step": 62720 + }, + { + "epoch": 3.1156253104201848, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005507539485447502, + "loss": 0.5442, + "step": 62730 + }, + { + "epoch": 3.116121982715804, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005507142147611006, + "loss": 0.5198, + "step": 62740 + }, + { + "epoch": 3.1166186550114237, + "grad_norm": 0.162109375, + "learning_rate": 0.0005506744809774511, + "loss": 0.5384, + "step": 62750 + }, + { + "epoch": 3.117115327307043, + "grad_norm": 0.10546875, + "learning_rate": 0.0005506347471938016, + "loss": 0.5622, + "step": 62760 + }, + { + "epoch": 3.117611999602662, + "grad_norm": 0.1572265625, + "learning_rate": 0.000550595013410152, + "loss": 0.5312, + "step": 62770 + }, + { + "epoch": 3.1181086718982813, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005505552796265025, + "loss": 0.5747, + "step": 62780 + }, + { + "epoch": 3.118605344193901, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005505155458428529, + "loss": 0.5556, + "step": 62790 + }, + { + "epoch": 3.1191020164895202, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005504758120592033, + "loss": 0.5464, + "step": 62800 + }, + { + "epoch": 3.1195986887851395, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005504360782755539, + "loss": 0.5591, + "step": 62810 + }, + { + "epoch": 3.120095361080759, + "grad_norm": 0.095703125, + "learning_rate": 0.0005503963444919043, + "loss": 0.5534, + "step": 62820 + }, + { + "epoch": 3.1205920333763784, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005503566107082547, + "loss": 0.5294, + "step": 62830 + }, + { + "epoch": 3.1210887056719976, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005503168769246051, + "loss": 0.5288, + "step": 62840 + }, + { + "epoch": 3.121585377967617, + "grad_norm": 0.10546875, + "learning_rate": 0.0005502771431409556, + "loss": 0.5457, + "step": 62850 + }, + { + "epoch": 3.1220820502632365, + "grad_norm": 0.10693359375, + "learning_rate": 0.000550237409357306, + "loss": 0.5185, + "step": 62860 + }, + { + "epoch": 3.1225787225588557, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005501976755736565, + "loss": 0.5331, + "step": 62870 + }, + { + "epoch": 3.123075394854475, + "grad_norm": 0.10302734375, + "learning_rate": 0.000550157941790007, + "loss": 0.5471, + "step": 62880 + }, + { + "epoch": 3.123572067150094, + "grad_norm": 0.11328125, + "learning_rate": 0.0005501182080063575, + "loss": 0.5435, + "step": 62890 + }, + { + "epoch": 3.124068739445714, + "grad_norm": 0.150390625, + "learning_rate": 0.0005500784742227078, + "loss": 0.5571, + "step": 62900 + }, + { + "epoch": 3.124565411741333, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005500387404390584, + "loss": 0.5406, + "step": 62910 + }, + { + "epoch": 3.1250620840369523, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005499990066554088, + "loss": 0.5539, + "step": 62920 + }, + { + "epoch": 3.125558756332572, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005499592728717592, + "loss": 0.5627, + "step": 62930 + }, + { + "epoch": 3.126055428628191, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005499195390881097, + "loss": 0.5457, + "step": 62940 + }, + { + "epoch": 3.1265521009238104, + "grad_norm": 0.109375, + "learning_rate": 0.0005498798053044601, + "loss": 0.5293, + "step": 62950 + }, + { + "epoch": 3.1270487732194296, + "grad_norm": 0.099609375, + "learning_rate": 0.0005498400715208105, + "loss": 0.5657, + "step": 62960 + }, + { + "epoch": 3.1275454455150493, + "grad_norm": 0.099609375, + "learning_rate": 0.0005498003377371611, + "loss": 0.5352, + "step": 62970 + }, + { + "epoch": 3.1280421178106685, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005497606039535115, + "loss": 0.5144, + "step": 62980 + }, + { + "epoch": 3.1285387901062878, + "grad_norm": 0.126953125, + "learning_rate": 0.0005497208701698619, + "loss": 0.5005, + "step": 62990 + }, + { + "epoch": 3.129035462401907, + "grad_norm": 0.1171875, + "learning_rate": 0.0005496811363862124, + "loss": 0.5394, + "step": 63000 + }, + { + "epoch": 3.1295321346975267, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005496414026025629, + "loss": 0.5577, + "step": 63010 + }, + { + "epoch": 3.130028806993146, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005496016688189134, + "loss": 0.5583, + "step": 63020 + }, + { + "epoch": 3.130525479288765, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005495619350352638, + "loss": 0.5588, + "step": 63030 + }, + { + "epoch": 3.131022151584385, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005495222012516142, + "loss": 0.5226, + "step": 63040 + }, + { + "epoch": 3.131518823880004, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005494824674679647, + "loss": 0.5239, + "step": 63050 + }, + { + "epoch": 3.1320154961756232, + "grad_norm": 0.09130859375, + "learning_rate": 0.000549442733684315, + "loss": 0.5049, + "step": 63060 + }, + { + "epoch": 3.1325121684712425, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005494029999006656, + "loss": 0.5572, + "step": 63070 + }, + { + "epoch": 3.133008840766862, + "grad_norm": 0.134765625, + "learning_rate": 0.0005493632661170161, + "loss": 0.5536, + "step": 63080 + }, + { + "epoch": 3.1335055130624814, + "grad_norm": 0.115234375, + "learning_rate": 0.0005493235323333664, + "loss": 0.5535, + "step": 63090 + }, + { + "epoch": 3.1340021853581006, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005492837985497169, + "loss": 0.5422, + "step": 63100 + }, + { + "epoch": 3.1344988576537203, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005492440647660673, + "loss": 0.569, + "step": 63110 + }, + { + "epoch": 3.1349955299493395, + "grad_norm": 0.111328125, + "learning_rate": 0.0005492043309824178, + "loss": 0.5438, + "step": 63120 + }, + { + "epoch": 3.1354922022449587, + "grad_norm": 0.158203125, + "learning_rate": 0.0005491645971987683, + "loss": 0.5527, + "step": 63130 + }, + { + "epoch": 3.135988874540578, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005491248634151187, + "loss": 0.5441, + "step": 63140 + }, + { + "epoch": 3.1364855468361976, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005490851296314691, + "loss": 0.5532, + "step": 63150 + }, + { + "epoch": 3.136982219131817, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005490453958478197, + "loss": 0.5299, + "step": 63160 + }, + { + "epoch": 3.137478891427436, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005490056620641701, + "loss": 0.5217, + "step": 63170 + }, + { + "epoch": 3.1379755637230557, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005489659282805206, + "loss": 0.5396, + "step": 63180 + }, + { + "epoch": 3.138472236018675, + "grad_norm": 0.109375, + "learning_rate": 0.000548926194496871, + "loss": 0.5253, + "step": 63190 + }, + { + "epoch": 3.138968908314294, + "grad_norm": 0.166015625, + "learning_rate": 0.0005488864607132214, + "loss": 0.5396, + "step": 63200 + }, + { + "epoch": 3.1394655806099134, + "grad_norm": 0.10595703125, + "learning_rate": 0.000548846726929572, + "loss": 0.5891, + "step": 63210 + }, + { + "epoch": 3.139962252905533, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005488069931459224, + "loss": 0.5388, + "step": 63220 + }, + { + "epoch": 3.1404589252011523, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005487672593622728, + "loss": 0.5178, + "step": 63230 + }, + { + "epoch": 3.1409555974967716, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005487275255786233, + "loss": 0.5149, + "step": 63240 + }, + { + "epoch": 3.141452269792391, + "grad_norm": 0.09765625, + "learning_rate": 0.0005486877917949736, + "loss": 0.5216, + "step": 63250 + }, + { + "epoch": 3.1419489420880105, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005486480580113241, + "loss": 0.5156, + "step": 63260 + }, + { + "epoch": 3.1424456143836297, + "grad_norm": 0.115234375, + "learning_rate": 0.0005486083242276747, + "loss": 0.5361, + "step": 63270 + }, + { + "epoch": 3.142942286679249, + "grad_norm": 0.197265625, + "learning_rate": 0.000548568590444025, + "loss": 0.5553, + "step": 63280 + }, + { + "epoch": 3.1434389589748686, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005485288566603755, + "loss": 0.5313, + "step": 63290 + }, + { + "epoch": 3.143935631270488, + "grad_norm": 0.11962890625, + "learning_rate": 0.000548489122876726, + "loss": 0.5264, + "step": 63300 + }, + { + "epoch": 3.144432303566107, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005484493890930763, + "loss": 0.5595, + "step": 63310 + }, + { + "epoch": 3.1449289758617263, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005484096553094269, + "loss": 0.526, + "step": 63320 + }, + { + "epoch": 3.145425648157346, + "grad_norm": 0.134765625, + "learning_rate": 0.0005483699215257773, + "loss": 0.5358, + "step": 63330 + }, + { + "epoch": 3.145922320452965, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005483301877421278, + "loss": 0.5369, + "step": 63340 + }, + { + "epoch": 3.1464189927485844, + "grad_norm": 0.140625, + "learning_rate": 0.0005482904539584782, + "loss": 0.537, + "step": 63350 + }, + { + "epoch": 3.1469156650442036, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005482507201748286, + "loss": 0.5195, + "step": 63360 + }, + { + "epoch": 3.1474123373398233, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005482109863911792, + "loss": 0.5355, + "step": 63370 + }, + { + "epoch": 3.1479090096354425, + "grad_norm": 0.138671875, + "learning_rate": 0.0005481712526075296, + "loss": 0.5421, + "step": 63380 + }, + { + "epoch": 3.1484056819310617, + "grad_norm": 0.125, + "learning_rate": 0.00054813151882388, + "loss": 0.5275, + "step": 63390 + }, + { + "epoch": 3.1489023542266814, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005480917850402305, + "loss": 0.5508, + "step": 63400 + }, + { + "epoch": 3.1493990265223006, + "grad_norm": 0.10986328125, + "learning_rate": 0.000548052051256581, + "loss": 0.5531, + "step": 63410 + }, + { + "epoch": 3.14989569881792, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005480123174729314, + "loss": 0.5266, + "step": 63420 + }, + { + "epoch": 3.150392371113539, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005479725836892819, + "loss": 0.5375, + "step": 63430 + }, + { + "epoch": 3.1508890434091588, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005479328499056322, + "loss": 0.5275, + "step": 63440 + }, + { + "epoch": 3.151385715704778, + "grad_norm": 0.142578125, + "learning_rate": 0.0005478931161219827, + "loss": 0.5394, + "step": 63450 + }, + { + "epoch": 3.151882388000397, + "grad_norm": 0.125, + "learning_rate": 0.0005478533823383332, + "loss": 0.5159, + "step": 63460 + }, + { + "epoch": 3.152379060296017, + "grad_norm": 0.158203125, + "learning_rate": 0.0005478136485546837, + "loss": 0.5586, + "step": 63470 + }, + { + "epoch": 3.152875732591636, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005477739147710341, + "loss": 0.5405, + "step": 63480 + }, + { + "epoch": 3.1533724048872553, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005477341809873846, + "loss": 0.5514, + "step": 63490 + }, + { + "epoch": 3.1538690771828746, + "grad_norm": 0.1015625, + "learning_rate": 0.000547694447203735, + "loss": 0.5254, + "step": 63500 + }, + { + "epoch": 3.1543657494784942, + "grad_norm": 0.115234375, + "learning_rate": 0.0005476547134200854, + "loss": 0.5192, + "step": 63510 + }, + { + "epoch": 3.1548624217741135, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005476149796364359, + "loss": 0.5334, + "step": 63520 + }, + { + "epoch": 3.1553590940697327, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005475752458527864, + "loss": 0.5305, + "step": 63530 + }, + { + "epoch": 3.1558557663653524, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005475355120691368, + "loss": 0.5516, + "step": 63540 + }, + { + "epoch": 3.1563524386609716, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005474957782854872, + "loss": 0.5457, + "step": 63550 + }, + { + "epoch": 3.156849110956591, + "grad_norm": 0.095703125, + "learning_rate": 0.0005474560445018377, + "loss": 0.5303, + "step": 63560 + }, + { + "epoch": 3.15734578325221, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005474163107181882, + "loss": 0.5292, + "step": 63570 + }, + { + "epoch": 3.1578424555478297, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005473765769345386, + "loss": 0.5164, + "step": 63580 + }, + { + "epoch": 3.158339127843449, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005473368431508891, + "loss": 0.5831, + "step": 63590 + }, + { + "epoch": 3.158835800139068, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005472971093672395, + "loss": 0.5374, + "step": 63600 + }, + { + "epoch": 3.1593324724346874, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005472573755835899, + "loss": 0.5068, + "step": 63610 + }, + { + "epoch": 3.159829144730307, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005472176417999405, + "loss": 0.5486, + "step": 63620 + }, + { + "epoch": 3.1603258170259263, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005471779080162909, + "loss": 0.5547, + "step": 63630 + }, + { + "epoch": 3.1608224893215455, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005471381742326413, + "loss": 0.5387, + "step": 63640 + }, + { + "epoch": 3.161319161617165, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005470984404489918, + "loss": 0.5184, + "step": 63650 + }, + { + "epoch": 3.1618158339127844, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005470587066653422, + "loss": 0.5584, + "step": 63660 + }, + { + "epoch": 3.1623125062084037, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005470189728816927, + "loss": 0.5666, + "step": 63670 + }, + { + "epoch": 3.162809178504023, + "grad_norm": 0.111328125, + "learning_rate": 0.0005469792390980432, + "loss": 0.552, + "step": 63680 + }, + { + "epoch": 3.1633058507996425, + "grad_norm": 0.099609375, + "learning_rate": 0.0005469395053143936, + "loss": 0.564, + "step": 63690 + }, + { + "epoch": 3.1638025230952618, + "grad_norm": 0.11669921875, + "learning_rate": 0.000546899771530744, + "loss": 0.5487, + "step": 63700 + }, + { + "epoch": 3.164299195390881, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005468600377470944, + "loss": 0.5383, + "step": 63710 + }, + { + "epoch": 3.1647958676865002, + "grad_norm": 0.11962890625, + "learning_rate": 0.000546820303963445, + "loss": 0.5467, + "step": 63720 + }, + { + "epoch": 3.16529253998212, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005467805701797954, + "loss": 0.573, + "step": 63730 + }, + { + "epoch": 3.165789212277739, + "grad_norm": 0.1953125, + "learning_rate": 0.0005467408363961458, + "loss": 0.5421, + "step": 63740 + }, + { + "epoch": 3.1662858845733584, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005467011026124963, + "loss": 0.5125, + "step": 63750 + }, + { + "epoch": 3.166782556868978, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005466613688288467, + "loss": 0.5599, + "step": 63760 + }, + { + "epoch": 3.1672792291645973, + "grad_norm": 0.11328125, + "learning_rate": 0.0005466216350451972, + "loss": 0.5077, + "step": 63770 + }, + { + "epoch": 3.1677759014602165, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005465819012615477, + "loss": 0.5331, + "step": 63780 + }, + { + "epoch": 3.1682725737558357, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005465421674778981, + "loss": 0.5454, + "step": 63790 + }, + { + "epoch": 3.1687692460514554, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005465024336942485, + "loss": 0.5786, + "step": 63800 + }, + { + "epoch": 3.1692659183470746, + "grad_norm": 0.091796875, + "learning_rate": 0.000546462699910599, + "loss": 0.5214, + "step": 63810 + }, + { + "epoch": 3.169762590642694, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005464229661269495, + "loss": 0.5239, + "step": 63820 + }, + { + "epoch": 3.1702592629383135, + "grad_norm": 0.119140625, + "learning_rate": 0.0005463832323432999, + "loss": 0.5442, + "step": 63830 + }, + { + "epoch": 3.1707559352339327, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005463434985596504, + "loss": 0.5585, + "step": 63840 + }, + { + "epoch": 3.171252607529552, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005463037647760008, + "loss": 0.5462, + "step": 63850 + }, + { + "epoch": 3.171749279825171, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005462640309923512, + "loss": 0.5352, + "step": 63860 + }, + { + "epoch": 3.172245952120791, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005462242972087018, + "loss": 0.5169, + "step": 63870 + }, + { + "epoch": 3.17274262441641, + "grad_norm": 0.11328125, + "learning_rate": 0.0005461845634250522, + "loss": 0.5455, + "step": 63880 + }, + { + "epoch": 3.1732392967120293, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005461448296414026, + "loss": 0.5168, + "step": 63890 + }, + { + "epoch": 3.173735969007649, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005461050958577531, + "loss": 0.5427, + "step": 63900 + }, + { + "epoch": 3.174232641303268, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005460653620741035, + "loss": 0.5546, + "step": 63910 + }, + { + "epoch": 3.1747293135988874, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005460256282904541, + "loss": 0.5372, + "step": 63920 + }, + { + "epoch": 3.1752259858945067, + "grad_norm": 0.125, + "learning_rate": 0.0005459858945068044, + "loss": 0.5347, + "step": 63930 + }, + { + "epoch": 3.1757226581901263, + "grad_norm": 0.115234375, + "learning_rate": 0.0005459461607231549, + "loss": 0.5307, + "step": 63940 + }, + { + "epoch": 3.1762193304857456, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005459064269395054, + "loss": 0.5004, + "step": 63950 + }, + { + "epoch": 3.176716002781365, + "grad_norm": 0.1015625, + "learning_rate": 0.0005458666931558557, + "loss": 0.5764, + "step": 63960 + }, + { + "epoch": 3.177212675076984, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005458269593722063, + "loss": 0.5565, + "step": 63970 + }, + { + "epoch": 3.1777093473726037, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005457872255885567, + "loss": 0.5638, + "step": 63980 + }, + { + "epoch": 3.178206019668223, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005457474918049071, + "loss": 0.5408, + "step": 63990 + }, + { + "epoch": 3.178702691963842, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005457077580212576, + "loss": 0.5744, + "step": 64000 + }, + { + "epoch": 3.179199364259462, + "grad_norm": 0.1298828125, + "learning_rate": 0.000545668024237608, + "loss": 0.5473, + "step": 64010 + }, + { + "epoch": 3.179696036555081, + "grad_norm": 0.146484375, + "learning_rate": 0.0005456282904539585, + "loss": 0.5372, + "step": 64020 + }, + { + "epoch": 3.1801927088507003, + "grad_norm": 0.1337890625, + "learning_rate": 0.000545588556670309, + "loss": 0.5091, + "step": 64030 + }, + { + "epoch": 3.1806893811463195, + "grad_norm": 0.1171875, + "learning_rate": 0.0005455488228866594, + "loss": 0.5632, + "step": 64040 + }, + { + "epoch": 3.181186053441939, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005455090891030098, + "loss": 0.5374, + "step": 64050 + }, + { + "epoch": 3.1816827257375584, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005454693553193603, + "loss": 0.5519, + "step": 64060 + }, + { + "epoch": 3.1821793980331776, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005454296215357108, + "loss": 0.5184, + "step": 64070 + }, + { + "epoch": 3.182676070328797, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005453898877520613, + "loss": 0.5471, + "step": 64080 + }, + { + "epoch": 3.1831727426244165, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005453501539684117, + "loss": 0.5274, + "step": 64090 + }, + { + "epoch": 3.1836694149200357, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005453104201847621, + "loss": 0.5229, + "step": 64100 + }, + { + "epoch": 3.184166087215655, + "grad_norm": 0.10546875, + "learning_rate": 0.0005452706864011126, + "loss": 0.5344, + "step": 64110 + }, + { + "epoch": 3.1846627595112746, + "grad_norm": 0.1416015625, + "learning_rate": 0.000545230952617463, + "loss": 0.5281, + "step": 64120 + }, + { + "epoch": 3.185159431806894, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005451912188338135, + "loss": 0.5332, + "step": 64130 + }, + { + "epoch": 3.185656104102513, + "grad_norm": 0.1884765625, + "learning_rate": 0.000545151485050164, + "loss": 0.5792, + "step": 64140 + }, + { + "epoch": 3.1861527763981323, + "grad_norm": 0.109375, + "learning_rate": 0.0005451117512665143, + "loss": 0.5684, + "step": 64150 + }, + { + "epoch": 3.186649448693752, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005450720174828648, + "loss": 0.5583, + "step": 64160 + }, + { + "epoch": 3.1871461209893712, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005450322836992154, + "loss": 0.5471, + "step": 64170 + }, + { + "epoch": 3.1876427932849905, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005449925499155657, + "loss": 0.5191, + "step": 64180 + }, + { + "epoch": 3.18813946558061, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005449528161319162, + "loss": 0.5254, + "step": 64190 + }, + { + "epoch": 3.1886361378762293, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005449130823482666, + "loss": 0.5393, + "step": 64200 + }, + { + "epoch": 3.1891328101718486, + "grad_norm": 0.11669921875, + "learning_rate": 0.000544873348564617, + "loss": 0.5409, + "step": 64210 + }, + { + "epoch": 3.189629482467468, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005448336147809676, + "loss": 0.5351, + "step": 64220 + }, + { + "epoch": 3.1901261547630875, + "grad_norm": 0.142578125, + "learning_rate": 0.000544793880997318, + "loss": 0.5417, + "step": 64230 + }, + { + "epoch": 3.1906228270587067, + "grad_norm": 0.109375, + "learning_rate": 0.0005447541472136685, + "loss": 0.5297, + "step": 64240 + }, + { + "epoch": 3.191119499354326, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005447144134300189, + "loss": 0.5272, + "step": 64250 + }, + { + "epoch": 3.1916161716499456, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005446746796463693, + "loss": 0.5497, + "step": 64260 + }, + { + "epoch": 3.192112843945565, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005446349458627199, + "loss": 0.5246, + "step": 64270 + }, + { + "epoch": 3.192609516241184, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005445952120790703, + "loss": 0.5257, + "step": 64280 + }, + { + "epoch": 3.1931061885368033, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005445554782954207, + "loss": 0.5019, + "step": 64290 + }, + { + "epoch": 3.193602860832423, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005445157445117712, + "loss": 0.5745, + "step": 64300 + }, + { + "epoch": 3.194099533128042, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005444760107281215, + "loss": 0.5407, + "step": 64310 + }, + { + "epoch": 3.1945962054236614, + "grad_norm": 0.1513671875, + "learning_rate": 0.000544436276944472, + "loss": 0.5295, + "step": 64320 + }, + { + "epoch": 3.1950928777192806, + "grad_norm": 0.099609375, + "learning_rate": 0.0005443965431608226, + "loss": 0.5248, + "step": 64330 + }, + { + "epoch": 3.1955895500149003, + "grad_norm": 0.12109375, + "learning_rate": 0.0005443568093771729, + "loss": 0.5247, + "step": 64340 + }, + { + "epoch": 3.1960862223105195, + "grad_norm": 0.107421875, + "learning_rate": 0.0005443170755935234, + "loss": 0.5307, + "step": 64350 + }, + { + "epoch": 3.1965828946061388, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005442773418098739, + "loss": 0.5546, + "step": 64360 + }, + { + "epoch": 3.197079566901758, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005442376080262244, + "loss": 0.5415, + "step": 64370 + }, + { + "epoch": 3.1975762391973777, + "grad_norm": 0.12890625, + "learning_rate": 0.0005441978742425748, + "loss": 0.5384, + "step": 64380 + }, + { + "epoch": 3.198072911492997, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005441581404589252, + "loss": 0.5277, + "step": 64390 + }, + { + "epoch": 3.198569583788616, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005441184066752757, + "loss": 0.5105, + "step": 64400 + }, + { + "epoch": 3.199066256084236, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005440786728916261, + "loss": 0.526, + "step": 64410 + }, + { + "epoch": 3.199562928379855, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005440389391079765, + "loss": 0.5225, + "step": 64420 + }, + { + "epoch": 3.2000596006754742, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005439992053243271, + "loss": 0.5477, + "step": 64430 + }, + { + "epoch": 3.2005562729710935, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005439594715406775, + "loss": 0.5536, + "step": 64440 + }, + { + "epoch": 3.201052945266713, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005439197377570279, + "loss": 0.5606, + "step": 64450 + }, + { + "epoch": 3.2015496175623324, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005438800039733784, + "loss": 0.5213, + "step": 64460 + }, + { + "epoch": 3.2020462898579516, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005438402701897289, + "loss": 0.5197, + "step": 64470 + }, + { + "epoch": 3.2025429621535713, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005438005364060793, + "loss": 0.5368, + "step": 64480 + }, + { + "epoch": 3.2030396344491905, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005437608026224298, + "loss": 0.5127, + "step": 64490 + }, + { + "epoch": 3.2035363067448097, + "grad_norm": 0.12109375, + "learning_rate": 0.0005437210688387802, + "loss": 0.5114, + "step": 64500 + }, + { + "epoch": 3.204032979040429, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005436813350551306, + "loss": 0.5162, + "step": 64510 + }, + { + "epoch": 3.2045296513360486, + "grad_norm": 0.09375, + "learning_rate": 0.0005436416012714812, + "loss": 0.5653, + "step": 64520 + }, + { + "epoch": 3.205026323631668, + "grad_norm": 0.173828125, + "learning_rate": 0.0005436018674878316, + "loss": 0.56, + "step": 64530 + }, + { + "epoch": 3.205522995927287, + "grad_norm": 0.103515625, + "learning_rate": 0.000543562133704182, + "loss": 0.5731, + "step": 64540 + }, + { + "epoch": 3.2060196682229067, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005435223999205325, + "loss": 0.5533, + "step": 64550 + }, + { + "epoch": 3.206516340518526, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005434826661368829, + "loss": 0.5644, + "step": 64560 + }, + { + "epoch": 3.207013012814145, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005434429323532333, + "loss": 0.5294, + "step": 64570 + }, + { + "epoch": 3.2075096851097644, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005434031985695838, + "loss": 0.5623, + "step": 64580 + }, + { + "epoch": 3.208006357405384, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005433634647859343, + "loss": 0.5674, + "step": 64590 + }, + { + "epoch": 3.2085030297010033, + "grad_norm": 0.11328125, + "learning_rate": 0.0005433237310022847, + "loss": 0.5413, + "step": 64600 + }, + { + "epoch": 3.2089997019966225, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005432839972186351, + "loss": 0.517, + "step": 64610 + }, + { + "epoch": 3.2094963742922418, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005432442634349856, + "loss": 0.5401, + "step": 64620 + }, + { + "epoch": 3.2099930465878614, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005432045296513361, + "loss": 0.5611, + "step": 64630 + }, + { + "epoch": 3.2104897188834807, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005431647958676865, + "loss": 0.5207, + "step": 64640 + }, + { + "epoch": 3.2109863911791, + "grad_norm": 0.0966796875, + "learning_rate": 0.000543125062084037, + "loss": 0.5277, + "step": 64650 + }, + { + "epoch": 3.2114830634747196, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005430853283003874, + "loss": 0.5313, + "step": 64660 + }, + { + "epoch": 3.211979735770339, + "grad_norm": 0.171875, + "learning_rate": 0.0005430455945167378, + "loss": 0.5264, + "step": 64670 + }, + { + "epoch": 3.212476408065958, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005430058607330884, + "loss": 0.5481, + "step": 64680 + }, + { + "epoch": 3.2129730803615772, + "grad_norm": 0.111328125, + "learning_rate": 0.0005429661269494388, + "loss": 0.5533, + "step": 64690 + }, + { + "epoch": 3.213469752657197, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005429263931657892, + "loss": 0.5321, + "step": 64700 + }, + { + "epoch": 3.213966424952816, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005428866593821397, + "loss": 0.5544, + "step": 64710 + }, + { + "epoch": 3.2144630972484354, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005428469255984901, + "loss": 0.5066, + "step": 64720 + }, + { + "epoch": 3.2149597695440546, + "grad_norm": 0.1015625, + "learning_rate": 0.0005428071918148406, + "loss": 0.5334, + "step": 64730 + }, + { + "epoch": 3.2154564418396743, + "grad_norm": 0.18359375, + "learning_rate": 0.0005427674580311911, + "loss": 0.5496, + "step": 64740 + }, + { + "epoch": 3.2159531141352935, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005427277242475415, + "loss": 0.5501, + "step": 64750 + }, + { + "epoch": 3.2164497864309127, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005426879904638919, + "loss": 0.5431, + "step": 64760 + }, + { + "epoch": 3.2169464587265324, + "grad_norm": 0.12109375, + "learning_rate": 0.0005426482566802424, + "loss": 0.5598, + "step": 64770 + }, + { + "epoch": 3.2174431310221516, + "grad_norm": 0.16796875, + "learning_rate": 0.0005426085228965929, + "loss": 0.5536, + "step": 64780 + }, + { + "epoch": 3.217939803317771, + "grad_norm": 0.095703125, + "learning_rate": 0.0005425687891129433, + "loss": 0.5343, + "step": 64790 + }, + { + "epoch": 3.21843647561339, + "grad_norm": 0.099609375, + "learning_rate": 0.0005425290553292937, + "loss": 0.5431, + "step": 64800 + }, + { + "epoch": 3.2189331479090098, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005424893215456442, + "loss": 0.5393, + "step": 64810 + }, + { + "epoch": 3.219429820204629, + "grad_norm": 0.150390625, + "learning_rate": 0.0005424495877619948, + "loss": 0.5208, + "step": 64820 + }, + { + "epoch": 3.219926492500248, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005424098539783451, + "loss": 0.5303, + "step": 64830 + }, + { + "epoch": 3.220423164795868, + "grad_norm": 0.1875, + "learning_rate": 0.0005423701201946956, + "loss": 0.5394, + "step": 64840 + }, + { + "epoch": 3.220919837091487, + "grad_norm": 0.1396484375, + "learning_rate": 0.000542330386411046, + "loss": 0.5353, + "step": 64850 + }, + { + "epoch": 3.2214165093871063, + "grad_norm": 0.13671875, + "learning_rate": 0.0005422906526273964, + "loss": 0.5584, + "step": 64860 + }, + { + "epoch": 3.2219131816827256, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005422509188437469, + "loss": 0.5353, + "step": 64870 + }, + { + "epoch": 3.2224098539783452, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005422111850600974, + "loss": 0.524, + "step": 64880 + }, + { + "epoch": 3.2229065262739645, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005421714512764478, + "loss": 0.5158, + "step": 64890 + }, + { + "epoch": 3.2234031985695837, + "grad_norm": 0.115234375, + "learning_rate": 0.0005421317174927983, + "loss": 0.537, + "step": 64900 + }, + { + "epoch": 3.2238998708652034, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005420919837091487, + "loss": 0.511, + "step": 64910 + }, + { + "epoch": 3.2243965431608226, + "grad_norm": 0.126953125, + "learning_rate": 0.0005420522499254991, + "loss": 0.5867, + "step": 64920 + }, + { + "epoch": 3.224893215456442, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005420125161418497, + "loss": 0.5472, + "step": 64930 + }, + { + "epoch": 3.225389887752061, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005419727823582001, + "loss": 0.5355, + "step": 64940 + }, + { + "epoch": 3.2258865600476807, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005419330485745505, + "loss": 0.5414, + "step": 64950 + }, + { + "epoch": 3.2263832323433, + "grad_norm": 0.1787109375, + "learning_rate": 0.000541893314790901, + "loss": 0.5407, + "step": 64960 + }, + { + "epoch": 3.226879904638919, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005418535810072514, + "loss": 0.5396, + "step": 64970 + }, + { + "epoch": 3.2273765769345384, + "grad_norm": 0.09375, + "learning_rate": 0.000541813847223602, + "loss": 0.5581, + "step": 64980 + }, + { + "epoch": 3.227873249230158, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005417741134399523, + "loss": 0.5349, + "step": 64990 + }, + { + "epoch": 3.2283699215257773, + "grad_norm": 0.091796875, + "learning_rate": 0.0005417343796563028, + "loss": 0.5391, + "step": 65000 + }, + { + "epoch": 3.2288665938213965, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005416946458726533, + "loss": 0.5329, + "step": 65010 + }, + { + "epoch": 3.229363266117016, + "grad_norm": 0.111328125, + "learning_rate": 0.0005416549120890036, + "loss": 0.5257, + "step": 65020 + }, + { + "epoch": 3.2298599384126354, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005416151783053542, + "loss": 0.5322, + "step": 65030 + }, + { + "epoch": 3.2303566107082546, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005415754445217047, + "loss": 0.5467, + "step": 65040 + }, + { + "epoch": 3.230853283003874, + "grad_norm": 0.10498046875, + "learning_rate": 0.000541535710738055, + "loss": 0.5571, + "step": 65050 + }, + { + "epoch": 3.2313499552994935, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005414959769544055, + "loss": 0.5597, + "step": 65060 + }, + { + "epoch": 3.2318466275951128, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005414562431707559, + "loss": 0.5388, + "step": 65070 + }, + { + "epoch": 3.232343299890732, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005414165093871064, + "loss": 0.5341, + "step": 65080 + }, + { + "epoch": 3.232839972186351, + "grad_norm": 0.1015625, + "learning_rate": 0.0005413767756034569, + "loss": 0.5413, + "step": 65090 + }, + { + "epoch": 3.233336644481971, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005413370418198073, + "loss": 0.5701, + "step": 65100 + }, + { + "epoch": 3.23383331677759, + "grad_norm": 0.109375, + "learning_rate": 0.0005412973080361578, + "loss": 0.5333, + "step": 65110 + }, + { + "epoch": 3.2343299890732093, + "grad_norm": 0.099609375, + "learning_rate": 0.0005412575742525082, + "loss": 0.5367, + "step": 65120 + }, + { + "epoch": 3.234826661368829, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005412178404688587, + "loss": 0.5415, + "step": 65130 + }, + { + "epoch": 3.2353233336644482, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005411781066852092, + "loss": 0.5212, + "step": 65140 + }, + { + "epoch": 3.2358200059600675, + "grad_norm": 0.130859375, + "learning_rate": 0.0005411383729015596, + "loss": 0.529, + "step": 65150 + }, + { + "epoch": 3.2363166782556867, + "grad_norm": 0.1005859375, + "learning_rate": 0.00054109863911791, + "loss": 0.5667, + "step": 65160 + }, + { + "epoch": 3.2368133505513064, + "grad_norm": 0.08984375, + "learning_rate": 0.0005410589053342605, + "loss": 0.5229, + "step": 65170 + }, + { + "epoch": 3.2373100228469256, + "grad_norm": 0.1171875, + "learning_rate": 0.0005410191715506109, + "loss": 0.5514, + "step": 65180 + }, + { + "epoch": 3.237806695142545, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005409794377669614, + "loss": 0.5677, + "step": 65190 + }, + { + "epoch": 3.2383033674381645, + "grad_norm": 0.103515625, + "learning_rate": 0.0005409397039833119, + "loss": 0.5259, + "step": 65200 + }, + { + "epoch": 3.2388000397337837, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005408999701996622, + "loss": 0.5209, + "step": 65210 + }, + { + "epoch": 3.239296712029403, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005408602364160127, + "loss": 0.532, + "step": 65220 + }, + { + "epoch": 3.239793384325022, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005408205026323633, + "loss": 0.5373, + "step": 65230 + }, + { + "epoch": 3.240290056620642, + "grad_norm": 0.123046875, + "learning_rate": 0.0005407807688487136, + "loss": 0.5476, + "step": 65240 + }, + { + "epoch": 3.240786728916261, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005407410350650641, + "loss": 0.5462, + "step": 65250 + }, + { + "epoch": 3.2412834012118803, + "grad_norm": 0.109375, + "learning_rate": 0.0005407013012814145, + "loss": 0.5377, + "step": 65260 + }, + { + "epoch": 3.2417800735075, + "grad_norm": 0.12060546875, + "learning_rate": 0.000540661567497765, + "loss": 0.557, + "step": 65270 + }, + { + "epoch": 3.242276745803119, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005406218337141155, + "loss": 0.5376, + "step": 65280 + }, + { + "epoch": 3.2427734180987384, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005405820999304659, + "loss": 0.5319, + "step": 65290 + }, + { + "epoch": 3.2432700903943577, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005405423661468164, + "loss": 0.536, + "step": 65300 + }, + { + "epoch": 3.2437667626899773, + "grad_norm": 0.11328125, + "learning_rate": 0.0005405026323631668, + "loss": 0.5568, + "step": 65310 + }, + { + "epoch": 3.2442634349855966, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005404628985795172, + "loss": 0.5463, + "step": 65320 + }, + { + "epoch": 3.2447601072812158, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005404231647958678, + "loss": 0.5409, + "step": 65330 + }, + { + "epoch": 3.245256779576835, + "grad_norm": 0.14453125, + "learning_rate": 0.0005403834310122182, + "loss": 0.5473, + "step": 65340 + }, + { + "epoch": 3.2457534518724547, + "grad_norm": 0.09765625, + "learning_rate": 0.0005403436972285686, + "loss": 0.5728, + "step": 65350 + }, + { + "epoch": 3.246250124168074, + "grad_norm": 0.10546875, + "learning_rate": 0.0005403039634449191, + "loss": 0.5339, + "step": 65360 + }, + { + "epoch": 3.246746796463693, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005402642296612694, + "loss": 0.57, + "step": 65370 + }, + { + "epoch": 3.247243468759313, + "grad_norm": 0.10888671875, + "learning_rate": 0.00054022449587762, + "loss": 0.5259, + "step": 65380 + }, + { + "epoch": 3.247740141054932, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005401847620939705, + "loss": 0.5278, + "step": 65390 + }, + { + "epoch": 3.2482368133505513, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005401450283103208, + "loss": 0.5341, + "step": 65400 + }, + { + "epoch": 3.2487334856461705, + "grad_norm": 0.1015625, + "learning_rate": 0.0005401052945266713, + "loss": 0.5482, + "step": 65410 + }, + { + "epoch": 3.24923015794179, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005400655607430218, + "loss": 0.5626, + "step": 65420 + }, + { + "epoch": 3.2497268302374094, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005400258269593723, + "loss": 0.5739, + "step": 65430 + }, + { + "epoch": 3.2502235025330286, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005399860931757227, + "loss": 0.5266, + "step": 65440 + }, + { + "epoch": 3.250720174828648, + "grad_norm": 0.158203125, + "learning_rate": 0.0005399463593920731, + "loss": 0.541, + "step": 65450 + }, + { + "epoch": 3.2512168471242675, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005399066256084236, + "loss": 0.5476, + "step": 65460 + }, + { + "epoch": 3.2517135194198867, + "grad_norm": 0.0986328125, + "learning_rate": 0.000539866891824774, + "loss": 0.5393, + "step": 65470 + }, + { + "epoch": 3.252210191715506, + "grad_norm": 0.125, + "learning_rate": 0.0005398271580411245, + "loss": 0.5299, + "step": 65480 + }, + { + "epoch": 3.2527068640111256, + "grad_norm": 0.1259765625, + "learning_rate": 0.000539787424257475, + "loss": 0.5222, + "step": 65490 + }, + { + "epoch": 3.253203536306745, + "grad_norm": 0.13671875, + "learning_rate": 0.0005397476904738254, + "loss": 0.544, + "step": 65500 + }, + { + "epoch": 3.253700208602364, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005397079566901758, + "loss": 0.547, + "step": 65510 + }, + { + "epoch": 3.2541968808979833, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005396682229065263, + "loss": 0.5586, + "step": 65520 + }, + { + "epoch": 3.254693553193603, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005396284891228768, + "loss": 0.55, + "step": 65530 + }, + { + "epoch": 3.255190225489222, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005395887553392272, + "loss": 0.5558, + "step": 65540 + }, + { + "epoch": 3.2556868977848414, + "grad_norm": 0.185546875, + "learning_rate": 0.0005395490215555777, + "loss": 0.5421, + "step": 65550 + }, + { + "epoch": 3.256183570080461, + "grad_norm": 0.130859375, + "learning_rate": 0.0005395092877719281, + "loss": 0.5712, + "step": 65560 + }, + { + "epoch": 3.2566802423760803, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005394695539882785, + "loss": 0.5159, + "step": 65570 + }, + { + "epoch": 3.2571769146716996, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005394298202046291, + "loss": 0.545, + "step": 65580 + }, + { + "epoch": 3.257673586967319, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005393900864209795, + "loss": 0.535, + "step": 65590 + }, + { + "epoch": 3.2581702592629385, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005393503526373299, + "loss": 0.5352, + "step": 65600 + }, + { + "epoch": 3.2586669315585577, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005393106188536804, + "loss": 0.5546, + "step": 65610 + }, + { + "epoch": 3.259163603854177, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005392708850700308, + "loss": 0.5228, + "step": 65620 + }, + { + "epoch": 3.2596602761497966, + "grad_norm": 0.109375, + "learning_rate": 0.0005392311512863813, + "loss": 0.5123, + "step": 65630 + }, + { + "epoch": 3.260156948445416, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005391914175027317, + "loss": 0.5049, + "step": 65640 + }, + { + "epoch": 3.260653620741035, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005391516837190822, + "loss": 0.5455, + "step": 65650 + }, + { + "epoch": 3.2611502930366543, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005391119499354326, + "loss": 0.5342, + "step": 65660 + }, + { + "epoch": 3.261646965332274, + "grad_norm": 0.171875, + "learning_rate": 0.000539072216151783, + "loss": 0.5565, + "step": 65670 + }, + { + "epoch": 3.262143637627893, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005390324823681336, + "loss": 0.5447, + "step": 65680 + }, + { + "epoch": 3.2626403099235124, + "grad_norm": 0.130859375, + "learning_rate": 0.000538992748584484, + "loss": 0.5521, + "step": 65690 + }, + { + "epoch": 3.263136982219132, + "grad_norm": 0.103515625, + "learning_rate": 0.0005389530148008344, + "loss": 0.5454, + "step": 65700 + }, + { + "epoch": 3.2636336545147513, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005389132810171849, + "loss": 0.5164, + "step": 65710 + }, + { + "epoch": 3.2641303268103705, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005388735472335353, + "loss": 0.5159, + "step": 65720 + }, + { + "epoch": 3.2646269991059897, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005388338134498857, + "loss": 0.5367, + "step": 65730 + }, + { + "epoch": 3.265123671401609, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005387940796662363, + "loss": 0.5297, + "step": 65740 + }, + { + "epoch": 3.2656203436972286, + "grad_norm": 0.13671875, + "learning_rate": 0.0005387543458825867, + "loss": 0.5748, + "step": 65750 + }, + { + "epoch": 3.266117015992848, + "grad_norm": 0.1328125, + "learning_rate": 0.0005387146120989371, + "loss": 0.5504, + "step": 65760 + }, + { + "epoch": 3.266613688288467, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005386748783152876, + "loss": 0.4994, + "step": 65770 + }, + { + "epoch": 3.2671103605840868, + "grad_norm": 0.1845703125, + "learning_rate": 0.000538635144531638, + "loss": 0.5118, + "step": 65780 + }, + { + "epoch": 3.267607032879706, + "grad_norm": 0.1328125, + "learning_rate": 0.0005385954107479885, + "loss": 0.529, + "step": 65790 + }, + { + "epoch": 3.2681037051753252, + "grad_norm": 0.1318359375, + "learning_rate": 0.000538555676964339, + "loss": 0.5371, + "step": 65800 + }, + { + "epoch": 3.2686003774709445, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005385159431806894, + "loss": 0.5454, + "step": 65810 + }, + { + "epoch": 3.269097049766564, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005384762093970398, + "loss": 0.5322, + "step": 65820 + }, + { + "epoch": 3.2695937220621833, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005384364756133904, + "loss": 0.561, + "step": 65830 + }, + { + "epoch": 3.2700903943578026, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005383967418297408, + "loss": 0.5195, + "step": 65840 + }, + { + "epoch": 3.2705870666534222, + "grad_norm": 0.146484375, + "learning_rate": 0.0005383570080460912, + "loss": 0.5318, + "step": 65850 + }, + { + "epoch": 3.2710837389490415, + "grad_norm": 0.09765625, + "learning_rate": 0.0005383172742624416, + "loss": 0.5518, + "step": 65860 + }, + { + "epoch": 3.2715804112446607, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005382775404787921, + "loss": 0.5417, + "step": 65870 + }, + { + "epoch": 3.27207708354028, + "grad_norm": 0.123046875, + "learning_rate": 0.0005382378066951427, + "loss": 0.5254, + "step": 65880 + }, + { + "epoch": 3.2725737558358996, + "grad_norm": 0.1025390625, + "learning_rate": 0.000538198072911493, + "loss": 0.5294, + "step": 65890 + }, + { + "epoch": 3.273070428131519, + "grad_norm": 0.142578125, + "learning_rate": 0.0005381583391278435, + "loss": 0.5571, + "step": 65900 + }, + { + "epoch": 3.273567100427138, + "grad_norm": 0.138671875, + "learning_rate": 0.0005381186053441939, + "loss": 0.5311, + "step": 65910 + }, + { + "epoch": 3.2740637727227577, + "grad_norm": 0.109375, + "learning_rate": 0.0005380788715605443, + "loss": 0.5349, + "step": 65920 + }, + { + "epoch": 3.274560445018377, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005380391377768948, + "loss": 0.5339, + "step": 65930 + }, + { + "epoch": 3.275057117313996, + "grad_norm": 0.125, + "learning_rate": 0.0005379994039932453, + "loss": 0.5454, + "step": 65940 + }, + { + "epoch": 3.2755537896096154, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005379596702095957, + "loss": 0.5472, + "step": 65950 + }, + { + "epoch": 3.276050461905235, + "grad_norm": 0.158203125, + "learning_rate": 0.0005379199364259462, + "loss": 0.5065, + "step": 65960 + }, + { + "epoch": 3.2765471342008543, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005378802026422966, + "loss": 0.5536, + "step": 65970 + }, + { + "epoch": 3.2770438064964735, + "grad_norm": 0.16796875, + "learning_rate": 0.000537840468858647, + "loss": 0.5362, + "step": 65980 + }, + { + "epoch": 3.277540478792093, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005378007350749976, + "loss": 0.5506, + "step": 65990 + }, + { + "epoch": 3.2780371510877124, + "grad_norm": 0.1259765625, + "learning_rate": 0.000537761001291348, + "loss": 0.5296, + "step": 66000 + }, + { + "epoch": 3.2785338233833317, + "grad_norm": 0.162109375, + "learning_rate": 0.0005377212675076985, + "loss": 0.5581, + "step": 66010 + }, + { + "epoch": 3.279030495678951, + "grad_norm": 0.10546875, + "learning_rate": 0.0005376815337240489, + "loss": 0.5496, + "step": 66020 + }, + { + "epoch": 3.2795271679745706, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005376417999403993, + "loss": 0.5502, + "step": 66030 + }, + { + "epoch": 3.28002384027019, + "grad_norm": 0.19140625, + "learning_rate": 0.0005376020661567499, + "loss": 0.5442, + "step": 66040 + }, + { + "epoch": 3.280520512565809, + "grad_norm": 0.10546875, + "learning_rate": 0.0005375623323731002, + "loss": 0.5387, + "step": 66050 + }, + { + "epoch": 3.2810171848614282, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005375225985894507, + "loss": 0.5739, + "step": 66060 + }, + { + "epoch": 3.281513857157048, + "grad_norm": 0.099609375, + "learning_rate": 0.0005374828648058012, + "loss": 0.5438, + "step": 66070 + }, + { + "epoch": 3.282010529452667, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005374431310221515, + "loss": 0.5437, + "step": 66080 + }, + { + "epoch": 3.2825072017482864, + "grad_norm": 0.111328125, + "learning_rate": 0.0005374033972385021, + "loss": 0.5223, + "step": 66090 + }, + { + "epoch": 3.2830038740439056, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005373636634548526, + "loss": 0.5179, + "step": 66100 + }, + { + "epoch": 3.2835005463395253, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005373239296712029, + "loss": 0.5414, + "step": 66110 + }, + { + "epoch": 3.2839972186351445, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005372841958875534, + "loss": 0.5291, + "step": 66120 + }, + { + "epoch": 3.2844938909307637, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005372444621039038, + "loss": 0.5523, + "step": 66130 + }, + { + "epoch": 3.2849905632263834, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005372047283202543, + "loss": 0.5257, + "step": 66140 + }, + { + "epoch": 3.2854872355220026, + "grad_norm": 0.19921875, + "learning_rate": 0.0005371649945366048, + "loss": 0.5845, + "step": 66150 + }, + { + "epoch": 3.285983907817622, + "grad_norm": 0.150390625, + "learning_rate": 0.0005371252607529552, + "loss": 0.5372, + "step": 66160 + }, + { + "epoch": 3.286480580113241, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005370855269693057, + "loss": 0.5626, + "step": 66170 + }, + { + "epoch": 3.2869772524088607, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005370457931856561, + "loss": 0.5303, + "step": 66180 + }, + { + "epoch": 3.28747392470448, + "grad_norm": 0.103515625, + "learning_rate": 0.0005370060594020066, + "loss": 0.5337, + "step": 66190 + }, + { + "epoch": 3.287970597000099, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005369663256183571, + "loss": 0.5547, + "step": 66200 + }, + { + "epoch": 3.288467269295719, + "grad_norm": 0.146484375, + "learning_rate": 0.0005369265918347075, + "loss": 0.5274, + "step": 66210 + }, + { + "epoch": 3.288963941591338, + "grad_norm": 0.09765625, + "learning_rate": 0.0005368868580510579, + "loss": 0.5411, + "step": 66220 + }, + { + "epoch": 3.2894606138869573, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005368471242674084, + "loss": 0.5588, + "step": 66230 + }, + { + "epoch": 3.2899572861825765, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005368073904837588, + "loss": 0.5181, + "step": 66240 + }, + { + "epoch": 3.290453958478196, + "grad_norm": 0.14453125, + "learning_rate": 0.0005367676567001093, + "loss": 0.5378, + "step": 66250 + }, + { + "epoch": 3.2909506307738154, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005367279229164598, + "loss": 0.5639, + "step": 66260 + }, + { + "epoch": 3.2914473030694347, + "grad_norm": 0.12109375, + "learning_rate": 0.0005366881891328101, + "loss": 0.5596, + "step": 66270 + }, + { + "epoch": 3.2919439753650543, + "grad_norm": 0.119140625, + "learning_rate": 0.0005366484553491606, + "loss": 0.557, + "step": 66280 + }, + { + "epoch": 3.2924406476606736, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005366087215655112, + "loss": 0.55, + "step": 66290 + }, + { + "epoch": 3.292937319956293, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005365689877818615, + "loss": 0.5637, + "step": 66300 + }, + { + "epoch": 3.293433992251912, + "grad_norm": 0.134765625, + "learning_rate": 0.000536529253998212, + "loss": 0.5482, + "step": 66310 + }, + { + "epoch": 3.2939306645475317, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005364895202145624, + "loss": 0.5404, + "step": 66320 + }, + { + "epoch": 3.294427336843151, + "grad_norm": 0.17578125, + "learning_rate": 0.0005364497864309129, + "loss": 0.5382, + "step": 66330 + }, + { + "epoch": 3.29492400913877, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005364100526472634, + "loss": 0.5647, + "step": 66340 + }, + { + "epoch": 3.29542068143439, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005363703188636138, + "loss": 0.5365, + "step": 66350 + }, + { + "epoch": 3.295917353730009, + "grad_norm": 0.138671875, + "learning_rate": 0.0005363305850799643, + "loss": 0.5564, + "step": 66360 + }, + { + "epoch": 3.2964140260256283, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005362908512963147, + "loss": 0.531, + "step": 66370 + }, + { + "epoch": 3.2969106983212475, + "grad_norm": 0.08984375, + "learning_rate": 0.0005362511175126651, + "loss": 0.5241, + "step": 66380 + }, + { + "epoch": 3.297407370616867, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005362113837290157, + "loss": 0.528, + "step": 66390 + }, + { + "epoch": 3.2979040429124864, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005361716499453661, + "loss": 0.5528, + "step": 66400 + }, + { + "epoch": 3.2984007152081056, + "grad_norm": 0.125, + "learning_rate": 0.0005361319161617165, + "loss": 0.5735, + "step": 66410 + }, + { + "epoch": 3.298897387503725, + "grad_norm": 0.10595703125, + "learning_rate": 0.000536092182378067, + "loss": 0.5501, + "step": 66420 + }, + { + "epoch": 3.2993940597993445, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005360524485944174, + "loss": 0.5369, + "step": 66430 + }, + { + "epoch": 3.2998907320949638, + "grad_norm": 0.10546875, + "learning_rate": 0.0005360127148107679, + "loss": 0.5435, + "step": 66440 + }, + { + "epoch": 3.300387404390583, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005359729810271184, + "loss": 0.5274, + "step": 66450 + }, + { + "epoch": 3.300884076686202, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005359332472434688, + "loss": 0.5446, + "step": 66460 + }, + { + "epoch": 3.301380748981822, + "grad_norm": 0.15625, + "learning_rate": 0.0005358935134598192, + "loss": 0.5579, + "step": 66470 + }, + { + "epoch": 3.301877421277441, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005358537796761697, + "loss": 0.5127, + "step": 66480 + }, + { + "epoch": 3.3023740935730603, + "grad_norm": 0.10546875, + "learning_rate": 0.0005358140458925202, + "loss": 0.5156, + "step": 66490 + }, + { + "epoch": 3.30287076586868, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005357743121088706, + "loss": 0.5647, + "step": 66500 + }, + { + "epoch": 3.3033674381642992, + "grad_norm": 0.1298828125, + "learning_rate": 0.000535734578325221, + "loss": 0.5426, + "step": 66510 + }, + { + "epoch": 3.3038641104599185, + "grad_norm": 0.099609375, + "learning_rate": 0.0005356948445415715, + "loss": 0.5492, + "step": 66520 + }, + { + "epoch": 3.3043607827555377, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005356551107579219, + "loss": 0.5311, + "step": 66530 + }, + { + "epoch": 3.3048574550511574, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005356153769742724, + "loss": 0.5401, + "step": 66540 + }, + { + "epoch": 3.3053541273467766, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005355756431906229, + "loss": 0.5331, + "step": 66550 + }, + { + "epoch": 3.305850799642396, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005355359094069733, + "loss": 0.5468, + "step": 66560 + }, + { + "epoch": 3.3063474719380155, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005354961756233237, + "loss": 0.5447, + "step": 66570 + }, + { + "epoch": 3.3068441442336347, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005354564418396742, + "loss": 0.5458, + "step": 66580 + }, + { + "epoch": 3.307340816529254, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005354167080560247, + "loss": 0.5301, + "step": 66590 + }, + { + "epoch": 3.307837488824873, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005353769742723751, + "loss": 0.5473, + "step": 66600 + }, + { + "epoch": 3.308334161120493, + "grad_norm": 0.08984375, + "learning_rate": 0.0005353372404887256, + "loss": 0.5619, + "step": 66610 + }, + { + "epoch": 3.308830833416112, + "grad_norm": 0.10205078125, + "learning_rate": 0.000535297506705076, + "loss": 0.5796, + "step": 66620 + }, + { + "epoch": 3.3093275057117313, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005352577729214264, + "loss": 0.5296, + "step": 66630 + }, + { + "epoch": 3.309824178007351, + "grad_norm": 0.1279296875, + "learning_rate": 0.000535218039137777, + "loss": 0.5319, + "step": 66640 + }, + { + "epoch": 3.31032085030297, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005351783053541274, + "loss": 0.5339, + "step": 66650 + }, + { + "epoch": 3.3108175225985894, + "grad_norm": 0.109375, + "learning_rate": 0.0005351385715704778, + "loss": 0.535, + "step": 66660 + }, + { + "epoch": 3.3113141948942086, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005350988377868283, + "loss": 0.5615, + "step": 66670 + }, + { + "epoch": 3.3118108671898283, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005350591040031787, + "loss": 0.5534, + "step": 66680 + }, + { + "epoch": 3.3123075394854475, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005350193702195292, + "loss": 0.5532, + "step": 66690 + }, + { + "epoch": 3.3128042117810668, + "grad_norm": 0.125, + "learning_rate": 0.0005349796364358797, + "loss": 0.5475, + "step": 66700 + }, + { + "epoch": 3.3133008840766864, + "grad_norm": 0.126953125, + "learning_rate": 0.0005349399026522301, + "loss": 0.5217, + "step": 66710 + }, + { + "epoch": 3.3137975563723057, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005349001688685805, + "loss": 0.5623, + "step": 66720 + }, + { + "epoch": 3.314294228667925, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005348604350849309, + "loss": 0.5609, + "step": 66730 + }, + { + "epoch": 3.314790900963544, + "grad_norm": 0.1015625, + "learning_rate": 0.0005348207013012815, + "loss": 0.5325, + "step": 66740 + }, + { + "epoch": 3.315287573259164, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005347809675176319, + "loss": 0.516, + "step": 66750 + }, + { + "epoch": 3.315784245554783, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005347412337339823, + "loss": 0.5524, + "step": 66760 + }, + { + "epoch": 3.3162809178504022, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005347014999503328, + "loss": 0.549, + "step": 66770 + }, + { + "epoch": 3.3167775901460215, + "grad_norm": 0.099609375, + "learning_rate": 0.0005346617661666832, + "loss": 0.5395, + "step": 66780 + }, + { + "epoch": 3.317274262441641, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005346220323830337, + "loss": 0.5433, + "step": 66790 + }, + { + "epoch": 3.3177709347372604, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005345822985993842, + "loss": 0.526, + "step": 66800 + }, + { + "epoch": 3.3182676070328796, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005345425648157346, + "loss": 0.5638, + "step": 66810 + }, + { + "epoch": 3.318764279328499, + "grad_norm": 0.162109375, + "learning_rate": 0.000534502831032085, + "loss": 0.5345, + "step": 66820 + }, + { + "epoch": 3.3192609516241185, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005344630972484355, + "loss": 0.5446, + "step": 66830 + }, + { + "epoch": 3.3197576239197377, + "grad_norm": 0.14453125, + "learning_rate": 0.000534423363464786, + "loss": 0.5389, + "step": 66840 + }, + { + "epoch": 3.320254296215357, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005343836296811364, + "loss": 0.5606, + "step": 66850 + }, + { + "epoch": 3.3207509685109766, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005343438958974869, + "loss": 0.528, + "step": 66860 + }, + { + "epoch": 3.321247640806596, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005343041621138373, + "loss": 0.556, + "step": 66870 + }, + { + "epoch": 3.321744313102215, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005342644283301877, + "loss": 0.5319, + "step": 66880 + }, + { + "epoch": 3.3222409853978343, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005342246945465383, + "loss": 0.523, + "step": 66890 + }, + { + "epoch": 3.322737657693454, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005341849607628887, + "loss": 0.5343, + "step": 66900 + }, + { + "epoch": 3.323234329989073, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005341452269792392, + "loss": 0.5325, + "step": 66910 + }, + { + "epoch": 3.3237310022846924, + "grad_norm": 0.138671875, + "learning_rate": 0.0005341054931955895, + "loss": 0.5168, + "step": 66920 + }, + { + "epoch": 3.324227674580312, + "grad_norm": 0.1435546875, + "learning_rate": 0.00053406575941194, + "loss": 0.5139, + "step": 66930 + }, + { + "epoch": 3.3247243468759313, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005340260256282906, + "loss": 0.5396, + "step": 66940 + }, + { + "epoch": 3.3252210191715506, + "grad_norm": 0.115234375, + "learning_rate": 0.0005339862918446409, + "loss": 0.5139, + "step": 66950 + }, + { + "epoch": 3.32571769146717, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005339465580609914, + "loss": 0.5107, + "step": 66960 + }, + { + "epoch": 3.3262143637627894, + "grad_norm": 0.14453125, + "learning_rate": 0.0005339068242773419, + "loss": 0.5205, + "step": 66970 + }, + { + "epoch": 3.3267110360584087, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005338670904936922, + "loss": 0.5306, + "step": 66980 + }, + { + "epoch": 3.327207708354028, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005338273567100428, + "loss": 0.5516, + "step": 66990 + }, + { + "epoch": 3.3277043806496476, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005337876229263932, + "loss": 0.5457, + "step": 67000 + }, + { + "epoch": 3.328201052945267, + "grad_norm": 0.150390625, + "learning_rate": 0.0005337478891427436, + "loss": 0.5139, + "step": 67010 + }, + { + "epoch": 3.328697725240886, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005337081553590941, + "loss": 0.5391, + "step": 67020 + }, + { + "epoch": 3.3291943975365053, + "grad_norm": 0.14453125, + "learning_rate": 0.0005336684215754445, + "loss": 0.5683, + "step": 67030 + }, + { + "epoch": 3.329691069832125, + "grad_norm": 0.1416015625, + "learning_rate": 0.000533628687791795, + "loss": 0.5214, + "step": 67040 + }, + { + "epoch": 3.330187742127744, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005335889540081455, + "loss": 0.5251, + "step": 67050 + }, + { + "epoch": 3.3306844144233634, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005335492202244959, + "loss": 0.5435, + "step": 67060 + }, + { + "epoch": 3.331181086718983, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005335094864408464, + "loss": 0.5237, + "step": 67070 + }, + { + "epoch": 3.3316777590146023, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005334697526571968, + "loss": 0.5006, + "step": 67080 + }, + { + "epoch": 3.3321744313102215, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005334300188735472, + "loss": 0.5426, + "step": 67090 + }, + { + "epoch": 3.3326711036058407, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005333902850898978, + "loss": 0.5197, + "step": 67100 + }, + { + "epoch": 3.3331677759014604, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005333505513062481, + "loss": 0.5393, + "step": 67110 + }, + { + "epoch": 3.3336644481970796, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005333108175225986, + "loss": 0.5338, + "step": 67120 + }, + { + "epoch": 3.334161120492699, + "grad_norm": 0.13671875, + "learning_rate": 0.0005332710837389491, + "loss": 0.554, + "step": 67130 + }, + { + "epoch": 3.334657792788318, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005332313499552994, + "loss": 0.536, + "step": 67140 + }, + { + "epoch": 3.3351544650839378, + "grad_norm": 0.10986328125, + "learning_rate": 0.00053319161617165, + "loss": 0.5317, + "step": 67150 + }, + { + "epoch": 3.335651137379557, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005331518823880005, + "loss": 0.5181, + "step": 67160 + }, + { + "epoch": 3.336147809675176, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005331121486043508, + "loss": 0.5273, + "step": 67170 + }, + { + "epoch": 3.3366444819707954, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005330724148207013, + "loss": 0.5372, + "step": 67180 + }, + { + "epoch": 3.337141154266415, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005330326810370517, + "loss": 0.5223, + "step": 67190 + }, + { + "epoch": 3.3376378265620343, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005329929472534023, + "loss": 0.5198, + "step": 67200 + }, + { + "epoch": 3.3381344988576536, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005329532134697527, + "loss": 0.5214, + "step": 67210 + }, + { + "epoch": 3.3386311711532732, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005329134796861031, + "loss": 0.5321, + "step": 67220 + }, + { + "epoch": 3.3391278434488925, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005328737459024536, + "loss": 0.553, + "step": 67230 + }, + { + "epoch": 3.3396245157445117, + "grad_norm": 0.109375, + "learning_rate": 0.000532834012118804, + "loss": 0.5799, + "step": 67240 + }, + { + "epoch": 3.340121188040131, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005327942783351545, + "loss": 0.5631, + "step": 67250 + }, + { + "epoch": 3.3406178603357506, + "grad_norm": 0.10009765625, + "learning_rate": 0.000532754544551505, + "loss": 0.5336, + "step": 67260 + }, + { + "epoch": 3.34111453263137, + "grad_norm": 0.11328125, + "learning_rate": 0.0005327148107678554, + "loss": 0.527, + "step": 67270 + }, + { + "epoch": 3.341611204926989, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005326750769842058, + "loss": 0.5445, + "step": 67280 + }, + { + "epoch": 3.3421078772226087, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005326353432005564, + "loss": 0.5429, + "step": 67290 + }, + { + "epoch": 3.342604549518228, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005325956094169068, + "loss": 0.5717, + "step": 67300 + }, + { + "epoch": 3.343101221813847, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005325558756332572, + "loss": 0.5346, + "step": 67310 + }, + { + "epoch": 3.3435978941094664, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005325161418496077, + "loss": 0.518, + "step": 67320 + }, + { + "epoch": 3.344094566405086, + "grad_norm": 0.169921875, + "learning_rate": 0.000532476408065958, + "loss": 0.5257, + "step": 67330 + }, + { + "epoch": 3.3445912387007053, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005324366742823085, + "loss": 0.5612, + "step": 67340 + }, + { + "epoch": 3.3450879109963245, + "grad_norm": 0.12890625, + "learning_rate": 0.0005323969404986591, + "loss": 0.5445, + "step": 67350 + }, + { + "epoch": 3.345584583291944, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005323572067150095, + "loss": 0.5257, + "step": 67360 + }, + { + "epoch": 3.3460812555875634, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005323174729313599, + "loss": 0.5268, + "step": 67370 + }, + { + "epoch": 3.3465779278831826, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005322777391477103, + "loss": 0.5465, + "step": 67380 + }, + { + "epoch": 3.347074600178802, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005322380053640608, + "loss": 0.5473, + "step": 67390 + }, + { + "epoch": 3.3475712724744215, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005321982715804113, + "loss": 0.5397, + "step": 67400 + }, + { + "epoch": 3.3480679447700408, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005321585377967617, + "loss": 0.5467, + "step": 67410 + }, + { + "epoch": 3.34856461706566, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005321188040131122, + "loss": 0.5278, + "step": 67420 + }, + { + "epoch": 3.3490612893612797, + "grad_norm": 0.107421875, + "learning_rate": 0.0005320790702294626, + "loss": 0.5759, + "step": 67430 + }, + { + "epoch": 3.349557961656899, + "grad_norm": 0.1015625, + "learning_rate": 0.000532039336445813, + "loss": 0.5251, + "step": 67440 + }, + { + "epoch": 3.350054633952518, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005319996026621636, + "loss": 0.5322, + "step": 67450 + }, + { + "epoch": 3.3505513062481374, + "grad_norm": 0.09326171875, + "learning_rate": 0.000531959868878514, + "loss": 0.5063, + "step": 67460 + }, + { + "epoch": 3.351047978543757, + "grad_norm": 0.15625, + "learning_rate": 0.0005319201350948644, + "loss": 0.5257, + "step": 67470 + }, + { + "epoch": 3.3515446508393762, + "grad_norm": 0.1953125, + "learning_rate": 0.0005318804013112149, + "loss": 0.5474, + "step": 67480 + }, + { + "epoch": 3.3520413231349955, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005318406675275653, + "loss": 0.5393, + "step": 67490 + }, + { + "epoch": 3.3525379954306147, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005318009337439158, + "loss": 0.5335, + "step": 67500 + }, + { + "epoch": 3.3530346677262344, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005317611999602663, + "loss": 0.5153, + "step": 67510 + }, + { + "epoch": 3.3535313400218536, + "grad_norm": 0.119140625, + "learning_rate": 0.0005317214661766167, + "loss": 0.528, + "step": 67520 + }, + { + "epoch": 3.354028012317473, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005316817323929671, + "loss": 0.5356, + "step": 67530 + }, + { + "epoch": 3.354524684613092, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005316419986093176, + "loss": 0.5734, + "step": 67540 + }, + { + "epoch": 3.3550213569087117, + "grad_norm": 0.1015625, + "learning_rate": 0.0005316022648256681, + "loss": 0.5431, + "step": 67550 + }, + { + "epoch": 3.355518029204331, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005315625310420185, + "loss": 0.5309, + "step": 67560 + }, + { + "epoch": 3.35601470149995, + "grad_norm": 0.1376953125, + "learning_rate": 0.000531522797258369, + "loss": 0.5435, + "step": 67570 + }, + { + "epoch": 3.35651137379557, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005314830634747194, + "loss": 0.552, + "step": 67580 + }, + { + "epoch": 3.357008046091189, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005314433296910698, + "loss": 0.5352, + "step": 67590 + }, + { + "epoch": 3.3575047183868083, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005314035959074203, + "loss": 0.5418, + "step": 67600 + }, + { + "epoch": 3.3580013906824275, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005313638621237708, + "loss": 0.5286, + "step": 67610 + }, + { + "epoch": 3.358498062978047, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005313241283401212, + "loss": 0.5356, + "step": 67620 + }, + { + "epoch": 3.3589947352736664, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005312843945564716, + "loss": 0.5358, + "step": 67630 + }, + { + "epoch": 3.3594914075692857, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005312446607728221, + "loss": 0.5482, + "step": 67640 + }, + { + "epoch": 3.3599880798649053, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005312049269891726, + "loss": 0.5531, + "step": 67650 + }, + { + "epoch": 3.3604847521605246, + "grad_norm": 0.12060546875, + "learning_rate": 0.000531165193205523, + "loss": 0.5543, + "step": 67660 + }, + { + "epoch": 3.360981424456144, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005311254594218735, + "loss": 0.5504, + "step": 67670 + }, + { + "epoch": 3.361478096751763, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005310857256382239, + "loss": 0.5481, + "step": 67680 + }, + { + "epoch": 3.3619747690473827, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005310459918545743, + "loss": 0.5184, + "step": 67690 + }, + { + "epoch": 3.362471441343002, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005310062580709249, + "loss": 0.5619, + "step": 67700 + }, + { + "epoch": 3.362968113638621, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005309665242872753, + "loss": 0.5395, + "step": 67710 + }, + { + "epoch": 3.363464785934241, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005309267905036257, + "loss": 0.5428, + "step": 67720 + }, + { + "epoch": 3.36396145822986, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005308870567199762, + "loss": 0.538, + "step": 67730 + }, + { + "epoch": 3.3644581305254793, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005308473229363266, + "loss": 0.5436, + "step": 67740 + }, + { + "epoch": 3.3649548028210985, + "grad_norm": 0.16015625, + "learning_rate": 0.0005308075891526771, + "loss": 0.5553, + "step": 67750 + }, + { + "epoch": 3.365451475116718, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005307678553690276, + "loss": 0.5249, + "step": 67760 + }, + { + "epoch": 3.3659481474123374, + "grad_norm": 0.11279296875, + "learning_rate": 0.000530728121585378, + "loss": 0.5369, + "step": 67770 + }, + { + "epoch": 3.3664448197079566, + "grad_norm": 0.142578125, + "learning_rate": 0.0005306883878017284, + "loss": 0.5338, + "step": 67780 + }, + { + "epoch": 3.3669414920035763, + "grad_norm": 0.130859375, + "learning_rate": 0.0005306486540180788, + "loss": 0.5266, + "step": 67790 + }, + { + "epoch": 3.3674381642991955, + "grad_norm": 0.107421875, + "learning_rate": 0.0005306089202344294, + "loss": 0.5549, + "step": 67800 + }, + { + "epoch": 3.3679348365948147, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005305691864507799, + "loss": 0.5414, + "step": 67810 + }, + { + "epoch": 3.368431508890434, + "grad_norm": 0.162109375, + "learning_rate": 0.0005305294526671302, + "loss": 0.5242, + "step": 67820 + }, + { + "epoch": 3.368928181186053, + "grad_norm": 0.107421875, + "learning_rate": 0.0005304897188834807, + "loss": 0.5392, + "step": 67830 + }, + { + "epoch": 3.369424853481673, + "grad_norm": 0.1484375, + "learning_rate": 0.0005304499850998312, + "loss": 0.5174, + "step": 67840 + }, + { + "epoch": 3.369921525777292, + "grad_norm": 0.115234375, + "learning_rate": 0.0005304102513161816, + "loss": 0.5748, + "step": 67850 + }, + { + "epoch": 3.3704181980729113, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005303705175325321, + "loss": 0.5387, + "step": 67860 + }, + { + "epoch": 3.370914870368531, + "grad_norm": 0.099609375, + "learning_rate": 0.0005303307837488825, + "loss": 0.5072, + "step": 67870 + }, + { + "epoch": 3.37141154266415, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005302910499652329, + "loss": 0.5303, + "step": 67880 + }, + { + "epoch": 3.3719082149597694, + "grad_norm": 0.09765625, + "learning_rate": 0.0005302513161815834, + "loss": 0.5631, + "step": 67890 + }, + { + "epoch": 3.3724048872553887, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005302115823979339, + "loss": 0.5304, + "step": 67900 + }, + { + "epoch": 3.3729015595510083, + "grad_norm": 0.169921875, + "learning_rate": 0.0005301718486142843, + "loss": 0.5107, + "step": 67910 + }, + { + "epoch": 3.3733982318466276, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005301321148306348, + "loss": 0.5199, + "step": 67920 + }, + { + "epoch": 3.373894904142247, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005300923810469852, + "loss": 0.5309, + "step": 67930 + }, + { + "epoch": 3.3743915764378665, + "grad_norm": 0.10546875, + "learning_rate": 0.0005300526472633356, + "loss": 0.5479, + "step": 67940 + }, + { + "epoch": 3.3748882487334857, + "grad_norm": 0.107421875, + "learning_rate": 0.0005300129134796862, + "loss": 0.513, + "step": 67950 + }, + { + "epoch": 3.375384921029105, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005299731796960366, + "loss": 0.5582, + "step": 67960 + }, + { + "epoch": 3.375881593324724, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005299334459123871, + "loss": 0.5594, + "step": 67970 + }, + { + "epoch": 3.376378265620344, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005298937121287374, + "loss": 0.5227, + "step": 67980 + }, + { + "epoch": 3.376874937915963, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005298539783450879, + "loss": 0.5474, + "step": 67990 + }, + { + "epoch": 3.3773716102115823, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005298142445614385, + "loss": 0.5357, + "step": 68000 + }, + { + "epoch": 3.377868282507202, + "grad_norm": 0.134765625, + "learning_rate": 0.0005297745107777888, + "loss": 0.5372, + "step": 68010 + }, + { + "epoch": 3.378364954802821, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005297347769941393, + "loss": 0.5519, + "step": 68020 + }, + { + "epoch": 3.3788616270984404, + "grad_norm": 0.126953125, + "learning_rate": 0.0005296950432104898, + "loss": 0.5776, + "step": 68030 + }, + { + "epoch": 3.3793582993940596, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005296553094268401, + "loss": 0.537, + "step": 68040 + }, + { + "epoch": 3.3798549716896793, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005296155756431907, + "loss": 0.5529, + "step": 68050 + }, + { + "epoch": 3.3803516439852985, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005295758418595411, + "loss": 0.5543, + "step": 68060 + }, + { + "epoch": 3.3808483162809178, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005295361080758915, + "loss": 0.5728, + "step": 68070 + }, + { + "epoch": 3.3813449885765374, + "grad_norm": 0.1318359375, + "learning_rate": 0.000529496374292242, + "loss": 0.525, + "step": 68080 + }, + { + "epoch": 3.3818416608721567, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005294566405085924, + "loss": 0.5424, + "step": 68090 + }, + { + "epoch": 3.382338333167776, + "grad_norm": 0.1259765625, + "learning_rate": 0.000529416906724943, + "loss": 0.5249, + "step": 68100 + }, + { + "epoch": 3.382835005463395, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005293771729412934, + "loss": 0.559, + "step": 68110 + }, + { + "epoch": 3.3833316777590148, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005293374391576438, + "loss": 0.52, + "step": 68120 + }, + { + "epoch": 3.383828350054634, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005292977053739943, + "loss": 0.5467, + "step": 68130 + }, + { + "epoch": 3.3843250223502532, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005292579715903447, + "loss": 0.5558, + "step": 68140 + }, + { + "epoch": 3.384821694645873, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005292182378066952, + "loss": 0.554, + "step": 68150 + }, + { + "epoch": 3.385318366941492, + "grad_norm": 0.134765625, + "learning_rate": 0.0005291785040230457, + "loss": 0.5432, + "step": 68160 + }, + { + "epoch": 3.3858150392371114, + "grad_norm": 0.095703125, + "learning_rate": 0.0005291387702393961, + "loss": 0.5391, + "step": 68170 + }, + { + "epoch": 3.3863117115327306, + "grad_norm": 0.10546875, + "learning_rate": 0.0005290990364557465, + "loss": 0.5326, + "step": 68180 + }, + { + "epoch": 3.38680838382835, + "grad_norm": 0.09765625, + "learning_rate": 0.000529059302672097, + "loss": 0.5463, + "step": 68190 + }, + { + "epoch": 3.3873050561239695, + "grad_norm": 0.111328125, + "learning_rate": 0.0005290195688884473, + "loss": 0.5734, + "step": 68200 + }, + { + "epoch": 3.3878017284195887, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005289798351047979, + "loss": 0.521, + "step": 68210 + }, + { + "epoch": 3.388298400715208, + "grad_norm": 0.11328125, + "learning_rate": 0.0005289401013211484, + "loss": 0.5451, + "step": 68220 + }, + { + "epoch": 3.3887950730108276, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005289003675374987, + "loss": 0.5402, + "step": 68230 + }, + { + "epoch": 3.389291745306447, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005288606337538492, + "loss": 0.5204, + "step": 68240 + }, + { + "epoch": 3.389788417602066, + "grad_norm": 0.103515625, + "learning_rate": 0.0005288208999701997, + "loss": 0.5358, + "step": 68250 + }, + { + "epoch": 3.3902850898976853, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005287811661865502, + "loss": 0.5163, + "step": 68260 + }, + { + "epoch": 3.390781762193305, + "grad_norm": 0.12890625, + "learning_rate": 0.0005287414324029006, + "loss": 0.5371, + "step": 68270 + }, + { + "epoch": 3.391278434488924, + "grad_norm": 0.1044921875, + "learning_rate": 0.000528701698619251, + "loss": 0.5256, + "step": 68280 + }, + { + "epoch": 3.3917751067845434, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005286619648356015, + "loss": 0.5205, + "step": 68290 + }, + { + "epoch": 3.392271779080163, + "grad_norm": 0.189453125, + "learning_rate": 0.000528622231051952, + "loss": 0.496, + "step": 68300 + }, + { + "epoch": 3.3927684513757823, + "grad_norm": 0.16015625, + "learning_rate": 0.0005285824972683024, + "loss": 0.5303, + "step": 68310 + }, + { + "epoch": 3.3932651236714015, + "grad_norm": 0.095703125, + "learning_rate": 0.0005285427634846529, + "loss": 0.5417, + "step": 68320 + }, + { + "epoch": 3.3937617959670208, + "grad_norm": 0.109375, + "learning_rate": 0.0005285030297010033, + "loss": 0.5569, + "step": 68330 + }, + { + "epoch": 3.3942584682626404, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005284632959173537, + "loss": 0.5545, + "step": 68340 + }, + { + "epoch": 3.3947551405582597, + "grad_norm": 0.095703125, + "learning_rate": 0.0005284235621337043, + "loss": 0.5759, + "step": 68350 + }, + { + "epoch": 3.395251812853879, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005283838283500547, + "loss": 0.5187, + "step": 68360 + }, + { + "epoch": 3.3957484851494986, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005283440945664051, + "loss": 0.5299, + "step": 68370 + }, + { + "epoch": 3.396245157445118, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005283043607827556, + "loss": 0.5582, + "step": 68380 + }, + { + "epoch": 3.396741829740737, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005282646269991059, + "loss": 0.5287, + "step": 68390 + }, + { + "epoch": 3.3972385020363562, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005282248932154564, + "loss": 0.5188, + "step": 68400 + }, + { + "epoch": 3.397735174331976, + "grad_norm": 0.10205078125, + "learning_rate": 0.000528185159431807, + "loss": 0.5522, + "step": 68410 + }, + { + "epoch": 3.398231846627595, + "grad_norm": 0.173828125, + "learning_rate": 0.0005281454256481574, + "loss": 0.5268, + "step": 68420 + }, + { + "epoch": 3.3987285189232144, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005281056918645078, + "loss": 0.5343, + "step": 68430 + }, + { + "epoch": 3.399225191218834, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005280659580808582, + "loss": 0.5571, + "step": 68440 + }, + { + "epoch": 3.3997218635144533, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005280262242972088, + "loss": 0.5443, + "step": 68450 + }, + { + "epoch": 3.4002185358100725, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005279864905135592, + "loss": 0.5339, + "step": 68460 + }, + { + "epoch": 3.4007152081056917, + "grad_norm": 0.15234375, + "learning_rate": 0.0005279467567299096, + "loss": 0.5359, + "step": 68470 + }, + { + "epoch": 3.4012118804013114, + "grad_norm": 0.150390625, + "learning_rate": 0.0005279070229462601, + "loss": 0.5478, + "step": 68480 + }, + { + "epoch": 3.4017085526969306, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005278672891626105, + "loss": 0.5385, + "step": 68490 + }, + { + "epoch": 3.40220522499255, + "grad_norm": 0.09912109375, + "learning_rate": 0.000527827555378961, + "loss": 0.5467, + "step": 68500 + }, + { + "epoch": 3.402701897288169, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005277878215953115, + "loss": 0.5592, + "step": 68510 + }, + { + "epoch": 3.4031985695837887, + "grad_norm": 0.10546875, + "learning_rate": 0.0005277480878116619, + "loss": 0.5589, + "step": 68520 + }, + { + "epoch": 3.403695241879408, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005277083540280123, + "loss": 0.5371, + "step": 68530 + }, + { + "epoch": 3.404191914175027, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005276686202443628, + "loss": 0.5365, + "step": 68540 + }, + { + "epoch": 3.4046885864706464, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005276288864607132, + "loss": 0.5139, + "step": 68550 + }, + { + "epoch": 3.405185258766266, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005275891526770637, + "loss": 0.546, + "step": 68560 + }, + { + "epoch": 3.4056819310618853, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005275494188934142, + "loss": 0.4999, + "step": 68570 + }, + { + "epoch": 3.4061786033575046, + "grad_norm": 0.14453125, + "learning_rate": 0.0005275096851097646, + "loss": 0.5094, + "step": 68580 + }, + { + "epoch": 3.4066752756531242, + "grad_norm": 0.11181640625, + "learning_rate": 0.000527469951326115, + "loss": 0.5602, + "step": 68590 + }, + { + "epoch": 3.4071719479487435, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005274302175424656, + "loss": 0.5446, + "step": 68600 + }, + { + "epoch": 3.4076686202443627, + "grad_norm": 0.10400390625, + "learning_rate": 0.000527390483758816, + "loss": 0.5177, + "step": 68610 + }, + { + "epoch": 3.408165292539982, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005273507499751664, + "loss": 0.5689, + "step": 68620 + }, + { + "epoch": 3.4086619648356016, + "grad_norm": 0.1015625, + "learning_rate": 0.0005273110161915169, + "loss": 0.5237, + "step": 68630 + }, + { + "epoch": 3.409158637131221, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005272712824078673, + "loss": 0.5226, + "step": 68640 + }, + { + "epoch": 3.40965530942684, + "grad_norm": 0.09375, + "learning_rate": 0.0005272315486242177, + "loss": 0.5478, + "step": 68650 + }, + { + "epoch": 3.4101519817224597, + "grad_norm": 0.16015625, + "learning_rate": 0.0005271918148405682, + "loss": 0.5422, + "step": 68660 + }, + { + "epoch": 3.410648654018079, + "grad_norm": 0.130859375, + "learning_rate": 0.0005271520810569187, + "loss": 0.5278, + "step": 68670 + }, + { + "epoch": 3.411145326313698, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005271123472732691, + "loss": 0.5128, + "step": 68680 + }, + { + "epoch": 3.4116419986093174, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005270726134896195, + "loss": 0.5396, + "step": 68690 + }, + { + "epoch": 3.412138670904937, + "grad_norm": 0.111328125, + "learning_rate": 0.00052703287970597, + "loss": 0.523, + "step": 68700 + }, + { + "epoch": 3.4126353432005563, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005269931459223205, + "loss": 0.5256, + "step": 68710 + }, + { + "epoch": 3.4131320154961755, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005269534121386709, + "loss": 0.5363, + "step": 68720 + }, + { + "epoch": 3.413628687791795, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005269136783550214, + "loss": 0.5413, + "step": 68730 + }, + { + "epoch": 3.4141253600874144, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005268739445713718, + "loss": 0.544, + "step": 68740 + }, + { + "epoch": 3.4146220323830336, + "grad_norm": 0.134765625, + "learning_rate": 0.0005268342107877222, + "loss": 0.541, + "step": 68750 + }, + { + "epoch": 3.415118704678653, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005267944770040728, + "loss": 0.5474, + "step": 68760 + }, + { + "epoch": 3.4156153769742725, + "grad_norm": 0.1484375, + "learning_rate": 0.0005267547432204232, + "loss": 0.5393, + "step": 68770 + }, + { + "epoch": 3.4161120492698918, + "grad_norm": 0.103515625, + "learning_rate": 0.0005267150094367736, + "loss": 0.5744, + "step": 68780 + }, + { + "epoch": 3.416608721565511, + "grad_norm": 0.16015625, + "learning_rate": 0.0005266752756531241, + "loss": 0.5633, + "step": 68790 + }, + { + "epoch": 3.4171053938611307, + "grad_norm": 0.126953125, + "learning_rate": 0.0005266355418694745, + "loss": 0.5428, + "step": 68800 + }, + { + "epoch": 3.41760206615675, + "grad_norm": 0.1171875, + "learning_rate": 0.000526595808085825, + "loss": 0.5468, + "step": 68810 + }, + { + "epoch": 3.418098738452369, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005265560743021755, + "loss": 0.5306, + "step": 68820 + }, + { + "epoch": 3.4185954107479883, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005265163405185259, + "loss": 0.5672, + "step": 68830 + }, + { + "epoch": 3.419092083043608, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005264766067348763, + "loss": 0.5292, + "step": 68840 + }, + { + "epoch": 3.4195887553392272, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005264368729512267, + "loss": 0.5451, + "step": 68850 + }, + { + "epoch": 3.4200854276348465, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005263971391675773, + "loss": 0.5447, + "step": 68860 + }, + { + "epoch": 3.4205820999304657, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005263574053839278, + "loss": 0.5387, + "step": 68870 + }, + { + "epoch": 3.4210787722260854, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005263176716002781, + "loss": 0.5183, + "step": 68880 + }, + { + "epoch": 3.4215754445217046, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005262779378166286, + "loss": 0.5336, + "step": 68890 + }, + { + "epoch": 3.422072116817324, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005262382040329792, + "loss": 0.5533, + "step": 68900 + }, + { + "epoch": 3.422568789112943, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005261984702493295, + "loss": 0.5835, + "step": 68910 + }, + { + "epoch": 3.4230654614085627, + "grad_norm": 0.10498046875, + "learning_rate": 0.00052615873646568, + "loss": 0.5446, + "step": 68920 + }, + { + "epoch": 3.423562133704182, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005261190026820304, + "loss": 0.548, + "step": 68930 + }, + { + "epoch": 3.424058805999801, + "grad_norm": 0.1328125, + "learning_rate": 0.0005260792688983808, + "loss": 0.5148, + "step": 68940 + }, + { + "epoch": 3.424555478295421, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005260395351147313, + "loss": 0.5478, + "step": 68950 + }, + { + "epoch": 3.42505215059104, + "grad_norm": 0.1171875, + "learning_rate": 0.0005259998013310818, + "loss": 0.541, + "step": 68960 + }, + { + "epoch": 3.4255488228866593, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005259600675474322, + "loss": 0.5277, + "step": 68970 + }, + { + "epoch": 3.4260454951822785, + "grad_norm": 0.107421875, + "learning_rate": 0.0005259203337637827, + "loss": 0.537, + "step": 68980 + }, + { + "epoch": 3.426542167477898, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005258805999801331, + "loss": 0.551, + "step": 68990 + }, + { + "epoch": 3.4270388397735174, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005258408661964836, + "loss": 0.5412, + "step": 69000 + }, + { + "epoch": 3.4275355120691366, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005258011324128341, + "loss": 0.5622, + "step": 69010 + }, + { + "epoch": 3.4280321843647563, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005257613986291845, + "loss": 0.5339, + "step": 69020 + }, + { + "epoch": 3.4285288566603755, + "grad_norm": 0.1494140625, + "learning_rate": 0.000525721664845535, + "loss": 0.5559, + "step": 69030 + }, + { + "epoch": 3.4290255289559948, + "grad_norm": 0.125, + "learning_rate": 0.0005256819310618853, + "loss": 0.5526, + "step": 69040 + }, + { + "epoch": 3.429522201251614, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005256421972782358, + "loss": 0.5512, + "step": 69050 + }, + { + "epoch": 3.4300188735472337, + "grad_norm": 0.134765625, + "learning_rate": 0.0005256024634945864, + "loss": 0.5645, + "step": 69060 + }, + { + "epoch": 3.430515545842853, + "grad_norm": 0.201171875, + "learning_rate": 0.0005255627297109367, + "loss": 0.538, + "step": 69070 + }, + { + "epoch": 3.431012218138472, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005255229959272872, + "loss": 0.5065, + "step": 69080 + }, + { + "epoch": 3.431508890434092, + "grad_norm": 0.140625, + "learning_rate": 0.0005254832621436377, + "loss": 0.5269, + "step": 69090 + }, + { + "epoch": 3.432005562729711, + "grad_norm": 0.1025390625, + "learning_rate": 0.000525443528359988, + "loss": 0.5125, + "step": 69100 + }, + { + "epoch": 3.4325022350253303, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005254037945763386, + "loss": 0.5389, + "step": 69110 + }, + { + "epoch": 3.4329989073209495, + "grad_norm": 0.11474609375, + "learning_rate": 0.000525364060792689, + "loss": 0.5265, + "step": 69120 + }, + { + "epoch": 3.433495579616569, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005253243270090394, + "loss": 0.5164, + "step": 69130 + }, + { + "epoch": 3.4339922519121884, + "grad_norm": 0.1484375, + "learning_rate": 0.0005252845932253899, + "loss": 0.5255, + "step": 69140 + }, + { + "epoch": 3.4344889242078076, + "grad_norm": 0.1484375, + "learning_rate": 0.0005252448594417403, + "loss": 0.5521, + "step": 69150 + }, + { + "epoch": 3.4349855965034273, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005252051256580909, + "loss": 0.5428, + "step": 69160 + }, + { + "epoch": 3.4354822687990465, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005251653918744413, + "loss": 0.5408, + "step": 69170 + }, + { + "epoch": 3.4359789410946657, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005251256580907917, + "loss": 0.547, + "step": 69180 + }, + { + "epoch": 3.436475613390285, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005250859243071422, + "loss": 0.5285, + "step": 69190 + }, + { + "epoch": 3.4369722856859046, + "grad_norm": 0.173828125, + "learning_rate": 0.0005250461905234926, + "loss": 0.5264, + "step": 69200 + }, + { + "epoch": 3.437468957981524, + "grad_norm": 0.109375, + "learning_rate": 0.0005250064567398431, + "loss": 0.512, + "step": 69210 + }, + { + "epoch": 3.437965630277143, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005249667229561936, + "loss": 0.535, + "step": 69220 + }, + { + "epoch": 3.4384623025727623, + "grad_norm": 0.126953125, + "learning_rate": 0.000524926989172544, + "loss": 0.5482, + "step": 69230 + }, + { + "epoch": 3.438958974868382, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005248872553888944, + "loss": 0.5437, + "step": 69240 + }, + { + "epoch": 3.439455647164001, + "grad_norm": 0.09765625, + "learning_rate": 0.0005248475216052449, + "loss": 0.543, + "step": 69250 + }, + { + "epoch": 3.4399523194596204, + "grad_norm": 0.13671875, + "learning_rate": 0.0005248077878215953, + "loss": 0.5419, + "step": 69260 + }, + { + "epoch": 3.4404489917552397, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005247680540379458, + "loss": 0.5374, + "step": 69270 + }, + { + "epoch": 3.4409456640508593, + "grad_norm": 0.115234375, + "learning_rate": 0.0005247283202542963, + "loss": 0.5166, + "step": 69280 + }, + { + "epoch": 3.4414423363464786, + "grad_norm": 0.095703125, + "learning_rate": 0.0005246885864706466, + "loss": 0.5189, + "step": 69290 + }, + { + "epoch": 3.441939008642098, + "grad_norm": 0.115234375, + "learning_rate": 0.0005246488526869971, + "loss": 0.5521, + "step": 69300 + }, + { + "epoch": 3.4424356809377175, + "grad_norm": 0.12109375, + "learning_rate": 0.0005246091189033476, + "loss": 0.5179, + "step": 69310 + }, + { + "epoch": 3.4429323532333367, + "grad_norm": 0.10546875, + "learning_rate": 0.0005245693851196981, + "loss": 0.5458, + "step": 69320 + }, + { + "epoch": 3.443429025528956, + "grad_norm": 0.1015625, + "learning_rate": 0.0005245296513360485, + "loss": 0.5593, + "step": 69330 + }, + { + "epoch": 3.443925697824575, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005244899175523989, + "loss": 0.5407, + "step": 69340 + }, + { + "epoch": 3.444422370120195, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005244501837687494, + "loss": 0.5337, + "step": 69350 + }, + { + "epoch": 3.444919042415814, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005244104499850999, + "loss": 0.5273, + "step": 69360 + }, + { + "epoch": 3.4454157147114333, + "grad_norm": 0.119140625, + "learning_rate": 0.0005243707162014503, + "loss": 0.5145, + "step": 69370 + }, + { + "epoch": 3.445912387007053, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005243309824178008, + "loss": 0.5227, + "step": 69380 + }, + { + "epoch": 3.446409059302672, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005242912486341512, + "loss": 0.5793, + "step": 69390 + }, + { + "epoch": 3.4469057315982914, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005242515148505016, + "loss": 0.5365, + "step": 69400 + }, + { + "epoch": 3.4474024038939106, + "grad_norm": 0.146484375, + "learning_rate": 0.0005242117810668522, + "loss": 0.5376, + "step": 69410 + }, + { + "epoch": 3.4478990761895303, + "grad_norm": 0.103515625, + "learning_rate": 0.0005241720472832026, + "loss": 0.5581, + "step": 69420 + }, + { + "epoch": 3.4483957484851495, + "grad_norm": 0.1357421875, + "learning_rate": 0.000524132313499553, + "loss": 0.5289, + "step": 69430 + }, + { + "epoch": 3.4488924207807687, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005240925797159035, + "loss": 0.5134, + "step": 69440 + }, + { + "epoch": 3.4493890930763884, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005240528459322539, + "loss": 0.5416, + "step": 69450 + }, + { + "epoch": 3.4498857653720076, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005240131121486044, + "loss": 0.5462, + "step": 69460 + }, + { + "epoch": 3.450382437667627, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005239733783649549, + "loss": 0.5286, + "step": 69470 + }, + { + "epoch": 3.450879109963246, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005239336445813053, + "loss": 0.5301, + "step": 69480 + }, + { + "epoch": 3.4513757822588658, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005238939107976557, + "loss": 0.5386, + "step": 69490 + }, + { + "epoch": 3.451872454554485, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005238541770140062, + "loss": 0.5514, + "step": 69500 + }, + { + "epoch": 3.452369126850104, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005238144432303567, + "loss": 0.5283, + "step": 69510 + }, + { + "epoch": 3.452865799145724, + "grad_norm": 0.111328125, + "learning_rate": 0.0005237747094467071, + "loss": 0.5174, + "step": 69520 + }, + { + "epoch": 3.453362471441343, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005237349756630575, + "loss": 0.53, + "step": 69530 + }, + { + "epoch": 3.4538591437369623, + "grad_norm": 0.12158203125, + "learning_rate": 0.000523695241879408, + "loss": 0.5549, + "step": 69540 + }, + { + "epoch": 3.4543558160325816, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005236555080957584, + "loss": 0.5244, + "step": 69550 + }, + { + "epoch": 3.4548524883282012, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005236157743121089, + "loss": 0.5421, + "step": 69560 + }, + { + "epoch": 3.4553491606238205, + "grad_norm": 0.166015625, + "learning_rate": 0.0005235760405284594, + "loss": 0.5856, + "step": 69570 + }, + { + "epoch": 3.4558458329194397, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005235363067448098, + "loss": 0.5223, + "step": 69580 + }, + { + "epoch": 3.456342505215059, + "grad_norm": 0.142578125, + "learning_rate": 0.0005234965729611602, + "loss": 0.5458, + "step": 69590 + }, + { + "epoch": 3.4568391775106786, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005234568391775107, + "loss": 0.5358, + "step": 69600 + }, + { + "epoch": 3.457335849806298, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005234171053938612, + "loss": 0.5253, + "step": 69610 + }, + { + "epoch": 3.457832522101917, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005233773716102116, + "loss": 0.5263, + "step": 69620 + }, + { + "epoch": 3.4583291943975363, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005233376378265621, + "loss": 0.5653, + "step": 69630 + }, + { + "epoch": 3.458825866693156, + "grad_norm": 0.169921875, + "learning_rate": 0.0005232979040429125, + "loss": 0.5114, + "step": 69640 + }, + { + "epoch": 3.459322538988775, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005232581702592629, + "loss": 0.5446, + "step": 69650 + }, + { + "epoch": 3.4598192112843944, + "grad_norm": 0.13671875, + "learning_rate": 0.0005232184364756135, + "loss": 0.5211, + "step": 69660 + }, + { + "epoch": 3.460315883580014, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005231787026919639, + "loss": 0.5337, + "step": 69670 + }, + { + "epoch": 3.4608125558756333, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005231389689083143, + "loss": 0.5658, + "step": 69680 + }, + { + "epoch": 3.4613092281712525, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005230992351246648, + "loss": 0.5337, + "step": 69690 + }, + { + "epoch": 3.4618059004668718, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005230595013410152, + "loss": 0.5631, + "step": 69700 + }, + { + "epoch": 3.4623025727624914, + "grad_norm": 0.134765625, + "learning_rate": 0.0005230197675573656, + "loss": 0.5428, + "step": 69710 + }, + { + "epoch": 3.4627992450581107, + "grad_norm": 0.18359375, + "learning_rate": 0.0005229800337737161, + "loss": 0.5287, + "step": 69720 + }, + { + "epoch": 3.46329591735373, + "grad_norm": 0.177734375, + "learning_rate": 0.0005229402999900666, + "loss": 0.5648, + "step": 69730 + }, + { + "epoch": 3.4637925896493496, + "grad_norm": 0.146484375, + "learning_rate": 0.0005229005662064171, + "loss": 0.5414, + "step": 69740 + }, + { + "epoch": 3.464289261944969, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005228608324227674, + "loss": 0.5192, + "step": 69750 + }, + { + "epoch": 3.464785934240588, + "grad_norm": 0.1396484375, + "learning_rate": 0.000522821098639118, + "loss": 0.538, + "step": 69760 + }, + { + "epoch": 3.4652826065362072, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005227813648554685, + "loss": 0.5175, + "step": 69770 + }, + { + "epoch": 3.465779278831827, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005227416310718188, + "loss": 0.548, + "step": 69780 + }, + { + "epoch": 3.466275951127446, + "grad_norm": 0.15234375, + "learning_rate": 0.0005227018972881693, + "loss": 0.5178, + "step": 69790 + }, + { + "epoch": 3.4667726234230654, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005226621635045197, + "loss": 0.5473, + "step": 69800 + }, + { + "epoch": 3.467269295718685, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005226224297208701, + "loss": 0.5624, + "step": 69810 + }, + { + "epoch": 3.4677659680143043, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005225826959372207, + "loss": 0.5316, + "step": 69820 + }, + { + "epoch": 3.4682626403099235, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005225429621535711, + "loss": 0.5235, + "step": 69830 + }, + { + "epoch": 3.4687593126055427, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005225032283699215, + "loss": 0.5579, + "step": 69840 + }, + { + "epoch": 3.4692559849011624, + "grad_norm": 0.10302734375, + "learning_rate": 0.000522463494586272, + "loss": 0.525, + "step": 69850 + }, + { + "epoch": 3.4697526571967816, + "grad_norm": 0.1015625, + "learning_rate": 0.0005224237608026224, + "loss": 0.5302, + "step": 69860 + }, + { + "epoch": 3.470249329492401, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005223840270189729, + "loss": 0.5348, + "step": 69870 + }, + { + "epoch": 3.4707460017880205, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005223442932353234, + "loss": 0.5421, + "step": 69880 + }, + { + "epoch": 3.4712426740836397, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005223045594516738, + "loss": 0.5356, + "step": 69890 + }, + { + "epoch": 3.471739346379259, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005222648256680243, + "loss": 0.5201, + "step": 69900 + }, + { + "epoch": 3.472236018674878, + "grad_norm": 0.177734375, + "learning_rate": 0.0005222250918843746, + "loss": 0.5484, + "step": 69910 + }, + { + "epoch": 3.472732690970498, + "grad_norm": 0.154296875, + "learning_rate": 0.0005221853581007252, + "loss": 0.5577, + "step": 69920 + }, + { + "epoch": 3.473229363266117, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005221456243170757, + "loss": 0.5364, + "step": 69930 + }, + { + "epoch": 3.4737260355617363, + "grad_norm": 0.125, + "learning_rate": 0.000522105890533426, + "loss": 0.5182, + "step": 69940 + }, + { + "epoch": 3.4742227078573555, + "grad_norm": 0.142578125, + "learning_rate": 0.0005220661567497765, + "loss": 0.5141, + "step": 69950 + }, + { + "epoch": 3.474719380152975, + "grad_norm": 0.18359375, + "learning_rate": 0.000522026422966127, + "loss": 0.5548, + "step": 69960 + }, + { + "epoch": 3.4752160524485944, + "grad_norm": 0.107421875, + "learning_rate": 0.0005219866891824774, + "loss": 0.5866, + "step": 69970 + }, + { + "epoch": 3.4757127247442137, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005219469553988279, + "loss": 0.503, + "step": 69980 + }, + { + "epoch": 3.476209397039833, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005219072216151783, + "loss": 0.5192, + "step": 69990 + }, + { + "epoch": 3.4767060693354526, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005218674878315287, + "loss": 0.5409, + "step": 70000 + }, + { + "epoch": 3.477202741631072, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005218277540478792, + "loss": 0.5133, + "step": 70010 + }, + { + "epoch": 3.477699413926691, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005217880202642297, + "loss": 0.5512, + "step": 70020 + }, + { + "epoch": 3.4781960862223107, + "grad_norm": 0.10546875, + "learning_rate": 0.0005217482864805801, + "loss": 0.5461, + "step": 70030 + }, + { + "epoch": 3.47869275851793, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005217085526969306, + "loss": 0.5825, + "step": 70040 + }, + { + "epoch": 3.479189430813549, + "grad_norm": 0.1240234375, + "learning_rate": 0.000521668818913281, + "loss": 0.5304, + "step": 70050 + }, + { + "epoch": 3.4796861031091684, + "grad_norm": 0.1328125, + "learning_rate": 0.0005216290851296316, + "loss": 0.5333, + "step": 70060 + }, + { + "epoch": 3.480182775404788, + "grad_norm": 0.1484375, + "learning_rate": 0.000521589351345982, + "loss": 0.5195, + "step": 70070 + }, + { + "epoch": 3.4806794477004073, + "grad_norm": 0.1015625, + "learning_rate": 0.0005215496175623324, + "loss": 0.5215, + "step": 70080 + }, + { + "epoch": 3.4811761199960265, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005215098837786829, + "loss": 0.5377, + "step": 70090 + }, + { + "epoch": 3.481672792291646, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005214701499950333, + "loss": 0.5759, + "step": 70100 + }, + { + "epoch": 3.4821694645872654, + "grad_norm": 0.98046875, + "learning_rate": 0.0005214304162113837, + "loss": 0.5493, + "step": 70110 + }, + { + "epoch": 3.4826661368828846, + "grad_norm": 0.125, + "learning_rate": 0.0005213906824277343, + "loss": 0.5331, + "step": 70120 + }, + { + "epoch": 3.483162809178504, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005213509486440846, + "loss": 0.5143, + "step": 70130 + }, + { + "epoch": 3.4836594814741235, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005213112148604351, + "loss": 0.5208, + "step": 70140 + }, + { + "epoch": 3.4841561537697427, + "grad_norm": 0.12890625, + "learning_rate": 0.0005212714810767856, + "loss": 0.5189, + "step": 70150 + }, + { + "epoch": 3.484652826065362, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005212317472931359, + "loss": 0.5287, + "step": 70160 + }, + { + "epoch": 3.4851494983609816, + "grad_norm": 0.15625, + "learning_rate": 0.0005211920135094865, + "loss": 0.5186, + "step": 70170 + }, + { + "epoch": 3.485646170656601, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005211522797258369, + "loss": 0.5472, + "step": 70180 + }, + { + "epoch": 3.48614284295222, + "grad_norm": 0.099609375, + "learning_rate": 0.0005211125459421874, + "loss": 0.5225, + "step": 70190 + }, + { + "epoch": 3.4866395152478393, + "grad_norm": 0.140625, + "learning_rate": 0.0005210728121585378, + "loss": 0.5379, + "step": 70200 + }, + { + "epoch": 3.487136187543459, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005210330783748882, + "loss": 0.5436, + "step": 70210 + }, + { + "epoch": 3.4876328598390782, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005209933445912388, + "loss": 0.5352, + "step": 70220 + }, + { + "epoch": 3.4881295321346975, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005209536108075892, + "loss": 0.5112, + "step": 70230 + }, + { + "epoch": 3.488626204430317, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005209138770239396, + "loss": 0.5121, + "step": 70240 + }, + { + "epoch": 3.4891228767259364, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005208741432402901, + "loss": 0.5324, + "step": 70250 + }, + { + "epoch": 3.4896195490215556, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005208344094566405, + "loss": 0.5386, + "step": 70260 + }, + { + "epoch": 3.490116221317175, + "grad_norm": 0.11328125, + "learning_rate": 0.000520794675672991, + "loss": 0.5745, + "step": 70270 + }, + { + "epoch": 3.490612893612794, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005207549418893415, + "loss": 0.5429, + "step": 70280 + }, + { + "epoch": 3.4911095659084137, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005207152081056919, + "loss": 0.567, + "step": 70290 + }, + { + "epoch": 3.491606238204033, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005206754743220423, + "loss": 0.5386, + "step": 70300 + }, + { + "epoch": 3.492102910499652, + "grad_norm": 0.103515625, + "learning_rate": 0.0005206357405383928, + "loss": 0.5078, + "step": 70310 + }, + { + "epoch": 3.492599582795272, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005205960067547432, + "loss": 0.5088, + "step": 70320 + }, + { + "epoch": 3.493096255090891, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005205562729710937, + "loss": 0.5491, + "step": 70330 + }, + { + "epoch": 3.4935929273865103, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005205165391874442, + "loss": 0.5201, + "step": 70340 + }, + { + "epoch": 3.4940895996821295, + "grad_norm": 0.099609375, + "learning_rate": 0.0005204768054037946, + "loss": 0.5218, + "step": 70350 + }, + { + "epoch": 3.494586271977749, + "grad_norm": 0.1640625, + "learning_rate": 0.000520437071620145, + "loss": 0.5051, + "step": 70360 + }, + { + "epoch": 3.4950829442733684, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005203973378364956, + "loss": 0.5346, + "step": 70370 + }, + { + "epoch": 3.4955796165689876, + "grad_norm": 0.107421875, + "learning_rate": 0.000520357604052846, + "loss": 0.5233, + "step": 70380 + }, + { + "epoch": 3.4960762888646073, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005203178702691964, + "loss": 0.526, + "step": 70390 + }, + { + "epoch": 3.4965729611602265, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005202781364855468, + "loss": 0.5111, + "step": 70400 + }, + { + "epoch": 3.4970696334558458, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005202384027018973, + "loss": 0.5377, + "step": 70410 + }, + { + "epoch": 3.497566305751465, + "grad_norm": 0.16015625, + "learning_rate": 0.0005201986689182478, + "loss": 0.5488, + "step": 70420 + }, + { + "epoch": 3.4980629780470847, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005201589351345982, + "loss": 0.5356, + "step": 70430 + }, + { + "epoch": 3.498559650342704, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005201192013509487, + "loss": 0.5158, + "step": 70440 + }, + { + "epoch": 3.499056322638323, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005200794675672991, + "loss": 0.545, + "step": 70450 + }, + { + "epoch": 3.499552994933943, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005200397337836495, + "loss": 0.5401, + "step": 70460 + }, + { + "epoch": 3.500049667229562, + "grad_norm": 0.173828125, + "learning_rate": 0.0005200000000000001, + "loss": 0.519, + "step": 70470 + }, + { + "epoch": 3.5005463395251812, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005199602662163505, + "loss": 0.5401, + "step": 70480 + }, + { + "epoch": 3.5010430118208005, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005199205324327009, + "loss": 0.5255, + "step": 70490 + }, + { + "epoch": 3.50153968411642, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005198807986490514, + "loss": 0.5162, + "step": 70500 + }, + { + "epoch": 3.5020363564120394, + "grad_norm": 0.18359375, + "learning_rate": 0.0005198410648654018, + "loss": 0.5138, + "step": 70510 + }, + { + "epoch": 3.5025330287076586, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005198013310817523, + "loss": 0.5377, + "step": 70520 + }, + { + "epoch": 3.5030297010032783, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005197615972981028, + "loss": 0.534, + "step": 70530 + }, + { + "epoch": 3.5035263732988975, + "grad_norm": 0.1015625, + "learning_rate": 0.0005197218635144532, + "loss": 0.557, + "step": 70540 + }, + { + "epoch": 3.5040230455945167, + "grad_norm": 0.123046875, + "learning_rate": 0.0005196821297308036, + "loss": 0.5508, + "step": 70550 + }, + { + "epoch": 3.504519717890136, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005196423959471541, + "loss": 0.5263, + "step": 70560 + }, + { + "epoch": 3.505016390185755, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005196026621635046, + "loss": 0.5132, + "step": 70570 + }, + { + "epoch": 3.505513062481375, + "grad_norm": 0.11083984375, + "learning_rate": 0.000519562928379855, + "loss": 0.5644, + "step": 70580 + }, + { + "epoch": 3.506009734776994, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005195231945962054, + "loss": 0.501, + "step": 70590 + }, + { + "epoch": 3.5065064070726137, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005194834608125559, + "loss": 0.5329, + "step": 70600 + }, + { + "epoch": 3.507003079368233, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005194437270289063, + "loss": 0.5443, + "step": 70610 + }, + { + "epoch": 3.507499751663852, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005194039932452568, + "loss": 0.5276, + "step": 70620 + }, + { + "epoch": 3.5079964239594714, + "grad_norm": 0.126953125, + "learning_rate": 0.0005193642594616073, + "loss": 0.5405, + "step": 70630 + }, + { + "epoch": 3.5084930962550906, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005193245256779578, + "loss": 0.5529, + "step": 70640 + }, + { + "epoch": 3.5089897685507103, + "grad_norm": 0.13671875, + "learning_rate": 0.0005192847918943081, + "loss": 0.5094, + "step": 70650 + }, + { + "epoch": 3.5094864408463295, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005192450581106586, + "loss": 0.5102, + "step": 70660 + }, + { + "epoch": 3.509983113141949, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005192053243270091, + "loss": 0.5402, + "step": 70670 + }, + { + "epoch": 3.5104797854375684, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005191655905433595, + "loss": 0.5365, + "step": 70680 + }, + { + "epoch": 3.5109764577331877, + "grad_norm": 0.1328125, + "learning_rate": 0.00051912585675971, + "loss": 0.554, + "step": 70690 + }, + { + "epoch": 3.511473130028807, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005190861229760604, + "loss": 0.5333, + "step": 70700 + }, + { + "epoch": 3.511969802324426, + "grad_norm": 0.107421875, + "learning_rate": 0.0005190463891924108, + "loss": 0.5351, + "step": 70710 + }, + { + "epoch": 3.512466474620046, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005190066554087614, + "loss": 0.5524, + "step": 70720 + }, + { + "epoch": 3.512963146915665, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005189669216251118, + "loss": 0.5221, + "step": 70730 + }, + { + "epoch": 3.5134598192112843, + "grad_norm": 0.11328125, + "learning_rate": 0.0005189271878414622, + "loss": 0.5439, + "step": 70740 + }, + { + "epoch": 3.513956491506904, + "grad_norm": 0.11328125, + "learning_rate": 0.0005188874540578127, + "loss": 0.534, + "step": 70750 + }, + { + "epoch": 3.514453163802523, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005188477202741631, + "loss": 0.5449, + "step": 70760 + }, + { + "epoch": 3.5149498360981424, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005188079864905136, + "loss": 0.5213, + "step": 70770 + }, + { + "epoch": 3.5154465083937616, + "grad_norm": 0.1123046875, + "learning_rate": 0.000518768252706864, + "loss": 0.5238, + "step": 70780 + }, + { + "epoch": 3.5159431806893813, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005187285189232145, + "loss": 0.5503, + "step": 70790 + }, + { + "epoch": 3.5164398529850005, + "grad_norm": 0.1572265625, + "learning_rate": 0.000518688785139565, + "loss": 0.5157, + "step": 70800 + }, + { + "epoch": 3.5169365252806197, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005186490513559153, + "loss": 0.5357, + "step": 70810 + }, + { + "epoch": 3.5174331975762394, + "grad_norm": 0.140625, + "learning_rate": 0.0005186093175722659, + "loss": 0.5286, + "step": 70820 + }, + { + "epoch": 3.5179298698718586, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005185695837886164, + "loss": 0.556, + "step": 70830 + }, + { + "epoch": 3.518426542167478, + "grad_norm": 0.154296875, + "learning_rate": 0.0005185298500049667, + "loss": 0.5277, + "step": 70840 + }, + { + "epoch": 3.518923214463097, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005184901162213172, + "loss": 0.5423, + "step": 70850 + }, + { + "epoch": 3.5194198867587168, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005184503824376676, + "loss": 0.5051, + "step": 70860 + }, + { + "epoch": 3.519916559054336, + "grad_norm": 0.1962890625, + "learning_rate": 0.000518410648654018, + "loss": 0.5264, + "step": 70870 + }, + { + "epoch": 3.520413231349955, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005183709148703686, + "loss": 0.5475, + "step": 70880 + }, + { + "epoch": 3.520909903645575, + "grad_norm": 0.12890625, + "learning_rate": 0.000518331181086719, + "loss": 0.5207, + "step": 70890 + }, + { + "epoch": 3.521406575941194, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005182914473030694, + "loss": 0.532, + "step": 70900 + }, + { + "epoch": 3.5219032482368133, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005182517135194199, + "loss": 0.5313, + "step": 70910 + }, + { + "epoch": 3.5223999205324326, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005182119797357704, + "loss": 0.5218, + "step": 70920 + }, + { + "epoch": 3.522896592828052, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005181722459521208, + "loss": 0.5331, + "step": 70930 + }, + { + "epoch": 3.5233932651236715, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005181325121684713, + "loss": 0.5286, + "step": 70940 + }, + { + "epoch": 3.5238899374192907, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005180927783848217, + "loss": 0.534, + "step": 70950 + }, + { + "epoch": 3.5243866097149104, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005180530446011722, + "loss": 0.5566, + "step": 70960 + }, + { + "epoch": 3.5248832820105296, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005180133108175227, + "loss": 0.5477, + "step": 70970 + }, + { + "epoch": 3.525379954306149, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005179735770338731, + "loss": 0.512, + "step": 70980 + }, + { + "epoch": 3.525876626601768, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005179338432502236, + "loss": 0.5373, + "step": 70990 + }, + { + "epoch": 3.5263732988973873, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005178941094665739, + "loss": 0.5523, + "step": 71000 + }, + { + "epoch": 3.526869971193007, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005178543756829244, + "loss": 0.5356, + "step": 71010 + }, + { + "epoch": 3.527366643488626, + "grad_norm": 0.1064453125, + "learning_rate": 0.000517814641899275, + "loss": 0.4895, + "step": 71020 + }, + { + "epoch": 3.5278633157842454, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005177749081156253, + "loss": 0.5625, + "step": 71030 + }, + { + "epoch": 3.528359988079865, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005177351743319758, + "loss": 0.5389, + "step": 71040 + }, + { + "epoch": 3.5288566603754843, + "grad_norm": 0.177734375, + "learning_rate": 0.0005176954405483262, + "loss": 0.5461, + "step": 71050 + }, + { + "epoch": 3.5293533326711035, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005176557067646766, + "loss": 0.5489, + "step": 71060 + }, + { + "epoch": 3.5298500049667227, + "grad_norm": 0.109375, + "learning_rate": 0.0005176159729810272, + "loss": 0.5162, + "step": 71070 + }, + { + "epoch": 3.5303466772623424, + "grad_norm": 0.109375, + "learning_rate": 0.0005175762391973776, + "loss": 0.5644, + "step": 71080 + }, + { + "epoch": 3.5308433495579616, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005175365054137281, + "loss": 0.5453, + "step": 71090 + }, + { + "epoch": 3.531340021853581, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005174967716300785, + "loss": 0.5447, + "step": 71100 + }, + { + "epoch": 3.5318366941492005, + "grad_norm": 0.134765625, + "learning_rate": 0.0005174570378464289, + "loss": 0.5287, + "step": 71110 + }, + { + "epoch": 3.5323333664448198, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005174173040627795, + "loss": 0.5363, + "step": 71120 + }, + { + "epoch": 3.532830038740439, + "grad_norm": 0.103515625, + "learning_rate": 0.0005173775702791299, + "loss": 0.5309, + "step": 71130 + }, + { + "epoch": 3.533326711036058, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005173378364954803, + "loss": 0.5407, + "step": 71140 + }, + { + "epoch": 3.533823383331678, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005172981027118308, + "loss": 0.5334, + "step": 71150 + }, + { + "epoch": 3.534320055627297, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005172583689281812, + "loss": 0.5472, + "step": 71160 + }, + { + "epoch": 3.5348167279229163, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005172186351445316, + "loss": 0.5391, + "step": 71170 + }, + { + "epoch": 3.535313400218536, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005171789013608822, + "loss": 0.5256, + "step": 71180 + }, + { + "epoch": 3.5358100725141552, + "grad_norm": 0.09765625, + "learning_rate": 0.0005171391675772325, + "loss": 0.5316, + "step": 71190 + }, + { + "epoch": 3.5363067448097745, + "grad_norm": 0.11376953125, + "learning_rate": 0.000517099433793583, + "loss": 0.5215, + "step": 71200 + }, + { + "epoch": 3.5368034171053937, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005170597000099335, + "loss": 0.5602, + "step": 71210 + }, + { + "epoch": 3.5373000894010134, + "grad_norm": 0.115234375, + "learning_rate": 0.0005170199662262838, + "loss": 0.5639, + "step": 71220 + }, + { + "epoch": 3.5377967616966326, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005169802324426344, + "loss": 0.5334, + "step": 71230 + }, + { + "epoch": 3.538293433992252, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005169404986589849, + "loss": 0.5257, + "step": 71240 + }, + { + "epoch": 3.5387901062878715, + "grad_norm": 0.130859375, + "learning_rate": 0.0005169007648753353, + "loss": 0.5557, + "step": 71250 + }, + { + "epoch": 3.5392867785834907, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005168610310916857, + "loss": 0.526, + "step": 71260 + }, + { + "epoch": 3.53978345087911, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005168212973080361, + "loss": 0.5291, + "step": 71270 + }, + { + "epoch": 3.540280123174729, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005167815635243867, + "loss": 0.5306, + "step": 71280 + }, + { + "epoch": 3.5407767954703484, + "grad_norm": 0.14453125, + "learning_rate": 0.0005167418297407371, + "loss": 0.5884, + "step": 71290 + }, + { + "epoch": 3.541273467765968, + "grad_norm": 0.09765625, + "learning_rate": 0.0005167020959570875, + "loss": 0.54, + "step": 71300 + }, + { + "epoch": 3.5417701400615873, + "grad_norm": 0.10595703125, + "learning_rate": 0.000516662362173438, + "loss": 0.5544, + "step": 71310 + }, + { + "epoch": 3.542266812357207, + "grad_norm": 0.09765625, + "learning_rate": 0.0005166226283897884, + "loss": 0.5319, + "step": 71320 + }, + { + "epoch": 3.542763484652826, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005165828946061389, + "loss": 0.5144, + "step": 71330 + }, + { + "epoch": 3.5432601569484454, + "grad_norm": 0.1015625, + "learning_rate": 0.0005165431608224894, + "loss": 0.5351, + "step": 71340 + }, + { + "epoch": 3.5437568292440647, + "grad_norm": 0.1015625, + "learning_rate": 0.0005165034270388398, + "loss": 0.528, + "step": 71350 + }, + { + "epoch": 3.544253501539684, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005164636932551902, + "loss": 0.5571, + "step": 71360 + }, + { + "epoch": 3.5447501738353036, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005164239594715408, + "loss": 0.5561, + "step": 71370 + }, + { + "epoch": 3.545246846130923, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005163842256878911, + "loss": 0.5146, + "step": 71380 + }, + { + "epoch": 3.545743518426542, + "grad_norm": 0.115234375, + "learning_rate": 0.0005163444919042416, + "loss": 0.5854, + "step": 71390 + }, + { + "epoch": 3.5462401907221617, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005163047581205921, + "loss": 0.5365, + "step": 71400 + }, + { + "epoch": 3.546736863017781, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005162650243369425, + "loss": 0.5505, + "step": 71410 + }, + { + "epoch": 3.5472335353134, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005162252905532929, + "loss": 0.5291, + "step": 71420 + }, + { + "epoch": 3.5477302076090194, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005161855567696435, + "loss": 0.5225, + "step": 71430 + }, + { + "epoch": 3.548226879904639, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005161458229859939, + "loss": 0.5633, + "step": 71440 + }, + { + "epoch": 3.5487235522002583, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005161060892023443, + "loss": 0.5341, + "step": 71450 + }, + { + "epoch": 3.5492202244958775, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005160663554186947, + "loss": 0.5525, + "step": 71460 + }, + { + "epoch": 3.549716896791497, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005160266216350452, + "loss": 0.5279, + "step": 71470 + }, + { + "epoch": 3.5502135690871164, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005159868878513957, + "loss": 0.5233, + "step": 71480 + }, + { + "epoch": 3.5507102413827356, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005159471540677461, + "loss": 0.5602, + "step": 71490 + }, + { + "epoch": 3.551206913678355, + "grad_norm": 0.177734375, + "learning_rate": 0.0005159074202840966, + "loss": 0.5283, + "step": 71500 + }, + { + "epoch": 3.5517035859739745, + "grad_norm": 0.1376953125, + "learning_rate": 0.000515867686500447, + "loss": 0.5276, + "step": 71510 + }, + { + "epoch": 3.5522002582695937, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005158279527167974, + "loss": 0.5337, + "step": 71520 + }, + { + "epoch": 3.552696930565213, + "grad_norm": 0.123046875, + "learning_rate": 0.000515788218933148, + "loss": 0.5452, + "step": 71530 + }, + { + "epoch": 3.5531936028608326, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005157484851494984, + "loss": 0.5402, + "step": 71540 + }, + { + "epoch": 3.553690275156452, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005157087513658488, + "loss": 0.5368, + "step": 71550 + }, + { + "epoch": 3.554186947452071, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005156690175821993, + "loss": 0.5163, + "step": 71560 + }, + { + "epoch": 3.5546836197476903, + "grad_norm": 0.11328125, + "learning_rate": 0.0005156292837985497, + "loss": 0.5423, + "step": 71570 + }, + { + "epoch": 3.55518029204331, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005155895500149002, + "loss": 0.5379, + "step": 71580 + }, + { + "epoch": 3.555676964338929, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005155498162312507, + "loss": 0.5347, + "step": 71590 + }, + { + "epoch": 3.5561736366345484, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005155100824476011, + "loss": 0.5431, + "step": 71600 + }, + { + "epoch": 3.556670308930168, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005154703486639515, + "loss": 0.5104, + "step": 71610 + }, + { + "epoch": 3.5571669812257873, + "grad_norm": 0.1640625, + "learning_rate": 0.000515430614880302, + "loss": 0.5474, + "step": 71620 + }, + { + "epoch": 3.5576636535214066, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005153908810966525, + "loss": 0.552, + "step": 71630 + }, + { + "epoch": 3.558160325817026, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005153511473130029, + "loss": 0.534, + "step": 71640 + }, + { + "epoch": 3.558656998112645, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005153114135293533, + "loss": 0.5133, + "step": 71650 + }, + { + "epoch": 3.5591536704082647, + "grad_norm": 0.146484375, + "learning_rate": 0.0005152716797457038, + "loss": 0.5526, + "step": 71660 + }, + { + "epoch": 3.559650342703884, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005152319459620542, + "loss": 0.5265, + "step": 71670 + }, + { + "epoch": 3.5601470149995036, + "grad_norm": 0.107421875, + "learning_rate": 0.0005151922121784047, + "loss": 0.5408, + "step": 71680 + }, + { + "epoch": 3.560643687295123, + "grad_norm": 0.12109375, + "learning_rate": 0.0005151524783947552, + "loss": 0.5153, + "step": 71690 + }, + { + "epoch": 3.561140359590742, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005151127446111057, + "loss": 0.5274, + "step": 71700 + }, + { + "epoch": 3.5616370318863613, + "grad_norm": 0.09619140625, + "learning_rate": 0.000515073010827456, + "loss": 0.5479, + "step": 71710 + }, + { + "epoch": 3.5621337041819805, + "grad_norm": 0.146484375, + "learning_rate": 0.0005150332770438065, + "loss": 0.5359, + "step": 71720 + }, + { + "epoch": 3.5626303764776, + "grad_norm": 0.1005859375, + "learning_rate": 0.000514993543260157, + "loss": 0.5266, + "step": 71730 + }, + { + "epoch": 3.5631270487732194, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005149538094765074, + "loss": 0.5389, + "step": 71740 + }, + { + "epoch": 3.5636237210688386, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005149140756928579, + "loss": 0.5269, + "step": 71750 + }, + { + "epoch": 3.5641203933644583, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005148743419092083, + "loss": 0.5298, + "step": 71760 + }, + { + "epoch": 3.5646170656600775, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005148346081255587, + "loss": 0.5324, + "step": 71770 + }, + { + "epoch": 3.5651137379556967, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005147948743419093, + "loss": 0.5094, + "step": 71780 + }, + { + "epoch": 3.565610410251316, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005147551405582597, + "loss": 0.541, + "step": 71790 + }, + { + "epoch": 3.5661070825469356, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005147154067746101, + "loss": 0.5141, + "step": 71800 + }, + { + "epoch": 3.566603754842555, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005146756729909606, + "loss": 0.5217, + "step": 71810 + }, + { + "epoch": 3.567100427138174, + "grad_norm": 0.1669921875, + "learning_rate": 0.000514635939207311, + "loss": 0.5366, + "step": 71820 + }, + { + "epoch": 3.5675970994337938, + "grad_norm": 0.130859375, + "learning_rate": 0.0005145962054236616, + "loss": 0.542, + "step": 71830 + }, + { + "epoch": 3.568093771729413, + "grad_norm": 0.115234375, + "learning_rate": 0.0005145564716400119, + "loss": 0.5197, + "step": 71840 + }, + { + "epoch": 3.5685904440250322, + "grad_norm": 0.169921875, + "learning_rate": 0.0005145167378563624, + "loss": 0.5421, + "step": 71850 + }, + { + "epoch": 3.5690871163206515, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005144770040727129, + "loss": 0.5638, + "step": 71860 + }, + { + "epoch": 3.569583788616271, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005144372702890632, + "loss": 0.501, + "step": 71870 + }, + { + "epoch": 3.5700804609118904, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005143975365054138, + "loss": 0.5251, + "step": 71880 + }, + { + "epoch": 3.5705771332075096, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005143578027217643, + "loss": 0.5545, + "step": 71890 + }, + { + "epoch": 3.5710738055031293, + "grad_norm": 0.11328125, + "learning_rate": 0.0005143180689381146, + "loss": 0.5231, + "step": 71900 + }, + { + "epoch": 3.5715704777987485, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005142783351544651, + "loss": 0.5064, + "step": 71910 + }, + { + "epoch": 3.5720671500943677, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005142386013708155, + "loss": 0.5371, + "step": 71920 + }, + { + "epoch": 3.572563822389987, + "grad_norm": 0.1328125, + "learning_rate": 0.000514198867587166, + "loss": 0.5559, + "step": 71930 + }, + { + "epoch": 3.5730604946856066, + "grad_norm": 0.142578125, + "learning_rate": 0.0005141591338035165, + "loss": 0.5532, + "step": 71940 + }, + { + "epoch": 3.573557166981226, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005141194000198669, + "loss": 0.5337, + "step": 71950 + }, + { + "epoch": 3.574053839276845, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005140796662362173, + "loss": 0.5438, + "step": 71960 + }, + { + "epoch": 3.5745505115724647, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005140399324525678, + "loss": 0.507, + "step": 71970 + }, + { + "epoch": 3.575047183868084, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005140001986689183, + "loss": 0.5545, + "step": 71980 + }, + { + "epoch": 3.575543856163703, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005139604648852688, + "loss": 0.5334, + "step": 71990 + }, + { + "epoch": 3.5760405284593224, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005139207311016192, + "loss": 0.5445, + "step": 72000 + }, + { + "epoch": 3.5765372007549416, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005138809973179696, + "loss": 0.5251, + "step": 72010 + }, + { + "epoch": 3.5770338730505613, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005138412635343201, + "loss": 0.5447, + "step": 72020 + }, + { + "epoch": 3.5775305453461805, + "grad_norm": 0.09765625, + "learning_rate": 0.0005138015297506706, + "loss": 0.5429, + "step": 72030 + }, + { + "epoch": 3.5780272176418, + "grad_norm": 0.111328125, + "learning_rate": 0.000513761795967021, + "loss": 0.5255, + "step": 72040 + }, + { + "epoch": 3.5785238899374194, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005137220621833715, + "loss": 0.5466, + "step": 72050 + }, + { + "epoch": 3.5790205622330387, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005136823283997218, + "loss": 0.5773, + "step": 72060 + }, + { + "epoch": 3.579517234528658, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005136425946160723, + "loss": 0.5379, + "step": 72070 + }, + { + "epoch": 3.580013906824277, + "grad_norm": 0.111328125, + "learning_rate": 0.0005136028608324229, + "loss": 0.5583, + "step": 72080 + }, + { + "epoch": 3.580510579119897, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005135631270487732, + "loss": 0.5559, + "step": 72090 + }, + { + "epoch": 3.581007251415516, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005135233932651237, + "loss": 0.5188, + "step": 72100 + }, + { + "epoch": 3.5815039237111352, + "grad_norm": 0.099609375, + "learning_rate": 0.0005134836594814741, + "loss": 0.5299, + "step": 72110 + }, + { + "epoch": 3.582000596006755, + "grad_norm": 0.107421875, + "learning_rate": 0.0005134439256978245, + "loss": 0.522, + "step": 72120 + }, + { + "epoch": 3.582497268302374, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005134041919141751, + "loss": 0.5496, + "step": 72130 + }, + { + "epoch": 3.5829939405979934, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005133644581305255, + "loss": 0.5553, + "step": 72140 + }, + { + "epoch": 3.5834906128936126, + "grad_norm": 0.1337890625, + "learning_rate": 0.000513324724346876, + "loss": 0.5767, + "step": 72150 + }, + { + "epoch": 3.5839872851892323, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005132849905632264, + "loss": 0.5259, + "step": 72160 + }, + { + "epoch": 3.5844839574848515, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005132452567795768, + "loss": 0.546, + "step": 72170 + }, + { + "epoch": 3.5849806297804707, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005132055229959274, + "loss": 0.5524, + "step": 72180 + }, + { + "epoch": 3.5854773020760904, + "grad_norm": 0.146484375, + "learning_rate": 0.0005131657892122778, + "loss": 0.5292, + "step": 72190 + }, + { + "epoch": 3.5859739743717096, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005131260554286282, + "loss": 0.5343, + "step": 72200 + }, + { + "epoch": 3.586470646667329, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005130863216449787, + "loss": 0.5208, + "step": 72210 + }, + { + "epoch": 3.586967318962948, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005130465878613291, + "loss": 0.5311, + "step": 72220 + }, + { + "epoch": 3.5874639912585677, + "grad_norm": 0.203125, + "learning_rate": 0.0005130068540776796, + "loss": 0.5472, + "step": 72230 + }, + { + "epoch": 3.587960663554187, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005129671202940301, + "loss": 0.5617, + "step": 72240 + }, + { + "epoch": 3.588457335849806, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005129273865103804, + "loss": 0.5349, + "step": 72250 + }, + { + "epoch": 3.588954008145426, + "grad_norm": 0.099609375, + "learning_rate": 0.0005128876527267309, + "loss": 0.5385, + "step": 72260 + }, + { + "epoch": 3.589450680441045, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005128479189430814, + "loss": 0.5568, + "step": 72270 + }, + { + "epoch": 3.5899473527366643, + "grad_norm": 0.115234375, + "learning_rate": 0.0005128081851594319, + "loss": 0.5277, + "step": 72280 + }, + { + "epoch": 3.5904440250322835, + "grad_norm": 0.103515625, + "learning_rate": 0.0005127684513757823, + "loss": 0.5236, + "step": 72290 + }, + { + "epoch": 3.5909406973279028, + "grad_norm": 0.12890625, + "learning_rate": 0.0005127287175921328, + "loss": 0.5263, + "step": 72300 + }, + { + "epoch": 3.5914373696235224, + "grad_norm": 0.099609375, + "learning_rate": 0.0005126889838084832, + "loss": 0.5068, + "step": 72310 + }, + { + "epoch": 3.5919340419191417, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005126492500248336, + "loss": 0.5226, + "step": 72320 + }, + { + "epoch": 3.5924307142147613, + "grad_norm": 0.1162109375, + "learning_rate": 0.000512609516241184, + "loss": 0.526, + "step": 72330 + }, + { + "epoch": 3.5929273865103806, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005125697824575346, + "loss": 0.5258, + "step": 72340 + }, + { + "epoch": 3.593424058806, + "grad_norm": 0.14453125, + "learning_rate": 0.000512530048673885, + "loss": 0.5363, + "step": 72350 + }, + { + "epoch": 3.593920731101619, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005124903148902354, + "loss": 0.5559, + "step": 72360 + }, + { + "epoch": 3.5944174033972383, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005124505811065859, + "loss": 0.5435, + "step": 72370 + }, + { + "epoch": 3.594914075692858, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005124108473229364, + "loss": 0.5365, + "step": 72380 + }, + { + "epoch": 3.595410747988477, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005123711135392868, + "loss": 0.5493, + "step": 72390 + }, + { + "epoch": 3.595907420284097, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005123313797556373, + "loss": 0.4943, + "step": 72400 + }, + { + "epoch": 3.596404092579716, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005122916459719877, + "loss": 0.5471, + "step": 72410 + }, + { + "epoch": 3.5969007648753353, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005122519121883381, + "loss": 0.5154, + "step": 72420 + }, + { + "epoch": 3.5973974371709545, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005122121784046887, + "loss": 0.5237, + "step": 72430 + }, + { + "epoch": 3.5978941094665737, + "grad_norm": 0.099609375, + "learning_rate": 0.0005121724446210391, + "loss": 0.5129, + "step": 72440 + }, + { + "epoch": 3.5983907817621934, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005121327108373895, + "loss": 0.5398, + "step": 72450 + }, + { + "epoch": 3.5988874540578126, + "grad_norm": 0.09423828125, + "learning_rate": 0.00051209297705374, + "loss": 0.5368, + "step": 72460 + }, + { + "epoch": 3.599384126353432, + "grad_norm": 0.130859375, + "learning_rate": 0.0005120532432700904, + "loss": 0.5295, + "step": 72470 + }, + { + "epoch": 3.5998807986490515, + "grad_norm": 0.11328125, + "learning_rate": 0.0005120135094864408, + "loss": 0.5352, + "step": 72480 + }, + { + "epoch": 3.6003774709446708, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005119737757027914, + "loss": 0.5417, + "step": 72490 + }, + { + "epoch": 3.60087414324029, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005119340419191418, + "loss": 0.5551, + "step": 72500 + }, + { + "epoch": 3.601370815535909, + "grad_norm": 0.099609375, + "learning_rate": 0.0005118943081354922, + "loss": 0.5224, + "step": 72510 + }, + { + "epoch": 3.601867487831529, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005118545743518426, + "loss": 0.5196, + "step": 72520 + }, + { + "epoch": 3.602364160127148, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005118148405681932, + "loss": 0.5537, + "step": 72530 + }, + { + "epoch": 3.6028608324227673, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005117751067845436, + "loss": 0.5389, + "step": 72540 + }, + { + "epoch": 3.603357504718387, + "grad_norm": 0.10205078125, + "learning_rate": 0.000511735373000894, + "loss": 0.5216, + "step": 72550 + }, + { + "epoch": 3.6038541770140062, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005116956392172445, + "loss": 0.5322, + "step": 72560 + }, + { + "epoch": 3.6043508493096255, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005116559054335949, + "loss": 0.5167, + "step": 72570 + }, + { + "epoch": 3.6048475216052447, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005116161716499453, + "loss": 0.5366, + "step": 72580 + }, + { + "epoch": 3.6053441939008644, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005115764378662959, + "loss": 0.5253, + "step": 72590 + }, + { + "epoch": 3.6058408661964836, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005115367040826463, + "loss": 0.5323, + "step": 72600 + }, + { + "epoch": 3.606337538492103, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005114969702989967, + "loss": 0.5361, + "step": 72610 + }, + { + "epoch": 3.6068342107877225, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005114572365153472, + "loss": 0.5282, + "step": 72620 + }, + { + "epoch": 3.6073308830833417, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005114175027316976, + "loss": 0.5132, + "step": 72630 + }, + { + "epoch": 3.607827555378961, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005113777689480481, + "loss": 0.5314, + "step": 72640 + }, + { + "epoch": 3.60832422767458, + "grad_norm": 0.126953125, + "learning_rate": 0.0005113380351643986, + "loss": 0.5729, + "step": 72650 + }, + { + "epoch": 3.6088208999701994, + "grad_norm": 0.09765625, + "learning_rate": 0.000511298301380749, + "loss": 0.5214, + "step": 72660 + }, + { + "epoch": 3.609317572265819, + "grad_norm": 0.138671875, + "learning_rate": 0.0005112585675970994, + "loss": 0.5506, + "step": 72670 + }, + { + "epoch": 3.6098142445614383, + "grad_norm": 0.10205078125, + "learning_rate": 0.00051121883381345, + "loss": 0.5271, + "step": 72680 + }, + { + "epoch": 3.610310916857058, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005111791000298004, + "loss": 0.5279, + "step": 72690 + }, + { + "epoch": 3.610807589152677, + "grad_norm": 0.11328125, + "learning_rate": 0.0005111393662461508, + "loss": 0.5279, + "step": 72700 + }, + { + "epoch": 3.6113042614482964, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005110996324625012, + "loss": 0.5447, + "step": 72710 + }, + { + "epoch": 3.6118009337439156, + "grad_norm": 0.12890625, + "learning_rate": 0.0005110598986788517, + "loss": 0.5431, + "step": 72720 + }, + { + "epoch": 3.612297606039535, + "grad_norm": 0.119140625, + "learning_rate": 0.0005110201648952023, + "loss": 0.5097, + "step": 72730 + }, + { + "epoch": 3.6127942783351545, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005109804311115526, + "loss": 0.5035, + "step": 72740 + }, + { + "epoch": 3.6132909506307738, + "grad_norm": 0.15234375, + "learning_rate": 0.0005109406973279031, + "loss": 0.5387, + "step": 72750 + }, + { + "epoch": 3.6137876229263934, + "grad_norm": 0.109375, + "learning_rate": 0.0005109009635442536, + "loss": 0.5378, + "step": 72760 + }, + { + "epoch": 3.6142842952220127, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005108612297606039, + "loss": 0.5338, + "step": 72770 + }, + { + "epoch": 3.614780967517632, + "grad_norm": 0.099609375, + "learning_rate": 0.0005108214959769544, + "loss": 0.5323, + "step": 72780 + }, + { + "epoch": 3.615277639813251, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005107817621933049, + "loss": 0.54, + "step": 72790 + }, + { + "epoch": 3.6157743121088703, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005107420284096553, + "loss": 0.5103, + "step": 72800 + }, + { + "epoch": 3.61627098440449, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005107022946260058, + "loss": 0.5156, + "step": 72810 + }, + { + "epoch": 3.6167676567001092, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005106625608423562, + "loss": 0.5502, + "step": 72820 + }, + { + "epoch": 3.6172643289957285, + "grad_norm": 0.09765625, + "learning_rate": 0.0005106228270587066, + "loss": 0.5266, + "step": 72830 + }, + { + "epoch": 3.617761001291348, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005105830932750572, + "loss": 0.564, + "step": 72840 + }, + { + "epoch": 3.6182576735869674, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005105433594914076, + "loss": 0.55, + "step": 72850 + }, + { + "epoch": 3.6187543458825866, + "grad_norm": 0.115234375, + "learning_rate": 0.000510503625707758, + "loss": 0.5557, + "step": 72860 + }, + { + "epoch": 3.619251018178206, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005104638919241085, + "loss": 0.5395, + "step": 72870 + }, + { + "epoch": 3.6197476904738255, + "grad_norm": 0.130859375, + "learning_rate": 0.0005104241581404589, + "loss": 0.5113, + "step": 72880 + }, + { + "epoch": 3.6202443627694447, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005103844243568095, + "loss": 0.5233, + "step": 72890 + }, + { + "epoch": 3.620741035065064, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005103446905731599, + "loss": 0.5474, + "step": 72900 + }, + { + "epoch": 3.6212377073606836, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005103049567895103, + "loss": 0.5339, + "step": 72910 + }, + { + "epoch": 3.621734379656303, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005102652230058608, + "loss": 0.5733, + "step": 72920 + }, + { + "epoch": 3.622231051951922, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005102254892222111, + "loss": 0.5275, + "step": 72930 + }, + { + "epoch": 3.6227277242475413, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005101857554385617, + "loss": 0.5442, + "step": 72940 + }, + { + "epoch": 3.623224396543161, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005101460216549122, + "loss": 0.5314, + "step": 72950 + }, + { + "epoch": 3.62372106883878, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005101062878712625, + "loss": 0.5269, + "step": 72960 + }, + { + "epoch": 3.6242177411343994, + "grad_norm": 0.1025390625, + "learning_rate": 0.000510066554087613, + "loss": 0.5378, + "step": 72970 + }, + { + "epoch": 3.624714413430019, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005100268203039634, + "loss": 0.5608, + "step": 72980 + }, + { + "epoch": 3.6252110857256383, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005099870865203139, + "loss": 0.5212, + "step": 72990 + }, + { + "epoch": 3.6257077580212576, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005099473527366644, + "loss": 0.5278, + "step": 73000 + }, + { + "epoch": 3.626204430316877, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005099076189530148, + "loss": 0.5407, + "step": 73010 + }, + { + "epoch": 3.626701102612496, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005098678851693652, + "loss": 0.5108, + "step": 73020 + }, + { + "epoch": 3.6271977749081157, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005098281513857157, + "loss": 0.553, + "step": 73030 + }, + { + "epoch": 3.627694447203735, + "grad_norm": 0.11328125, + "learning_rate": 0.0005097884176020662, + "loss": 0.5371, + "step": 73040 + }, + { + "epoch": 3.6281911194993546, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005097486838184167, + "loss": 0.5203, + "step": 73050 + }, + { + "epoch": 3.628687791794974, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005097089500347671, + "loss": 0.5108, + "step": 73060 + }, + { + "epoch": 3.629184464090593, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005096692162511175, + "loss": 0.5054, + "step": 73070 + }, + { + "epoch": 3.6296811363862123, + "grad_norm": 0.1103515625, + "learning_rate": 0.000509629482467468, + "loss": 0.5246, + "step": 73080 + }, + { + "epoch": 3.6301778086818315, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005095897486838185, + "loss": 0.5111, + "step": 73090 + }, + { + "epoch": 3.630674480977451, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005095500149001689, + "loss": 0.5068, + "step": 73100 + }, + { + "epoch": 3.6311711532730704, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005095102811165194, + "loss": 0.5182, + "step": 73110 + }, + { + "epoch": 3.63166782556869, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005094705473328697, + "loss": 0.5182, + "step": 73120 + }, + { + "epoch": 3.6321644978643093, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005094308135492202, + "loss": 0.5534, + "step": 73130 + }, + { + "epoch": 3.6326611701599285, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005093910797655708, + "loss": 0.553, + "step": 73140 + }, + { + "epoch": 3.6331578424555477, + "grad_norm": 0.125, + "learning_rate": 0.0005093513459819211, + "loss": 0.5526, + "step": 73150 + }, + { + "epoch": 3.633654514751167, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005093116121982716, + "loss": 0.5402, + "step": 73160 + }, + { + "epoch": 3.6341511870467866, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005092718784146221, + "loss": 0.515, + "step": 73170 + }, + { + "epoch": 3.634647859342406, + "grad_norm": 0.126953125, + "learning_rate": 0.0005092321446309725, + "loss": 0.5568, + "step": 73180 + }, + { + "epoch": 3.635144531638025, + "grad_norm": 0.1171875, + "learning_rate": 0.000509192410847323, + "loss": 0.5495, + "step": 73190 + }, + { + "epoch": 3.6356412039336448, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005091526770636734, + "loss": 0.5464, + "step": 73200 + }, + { + "epoch": 3.636137876229264, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005091129432800239, + "loss": 0.5332, + "step": 73210 + }, + { + "epoch": 3.636634548524883, + "grad_norm": 0.23828125, + "learning_rate": 0.0005090732094963743, + "loss": 0.5506, + "step": 73220 + }, + { + "epoch": 3.6371312208205024, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005090334757127247, + "loss": 0.5535, + "step": 73230 + }, + { + "epoch": 3.637627893116122, + "grad_norm": 0.109375, + "learning_rate": 0.0005089937419290753, + "loss": 0.5442, + "step": 73240 + }, + { + "epoch": 3.6381245654117413, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005089540081454257, + "loss": 0.5157, + "step": 73250 + }, + { + "epoch": 3.6386212377073606, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005089142743617761, + "loss": 0.5133, + "step": 73260 + }, + { + "epoch": 3.6391179100029802, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005088745405781266, + "loss": 0.5598, + "step": 73270 + }, + { + "epoch": 3.6396145822985995, + "grad_norm": 0.11279296875, + "learning_rate": 0.000508834806794477, + "loss": 0.5106, + "step": 73280 + }, + { + "epoch": 3.6401112545942187, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005087950730108275, + "loss": 0.5276, + "step": 73290 + }, + { + "epoch": 3.640607926889838, + "grad_norm": 0.138671875, + "learning_rate": 0.000508755339227178, + "loss": 0.5294, + "step": 73300 + }, + { + "epoch": 3.6411045991854576, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005087156054435283, + "loss": 0.5231, + "step": 73310 + }, + { + "epoch": 3.641601271481077, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005086758716598788, + "loss": 0.5216, + "step": 73320 + }, + { + "epoch": 3.642097943776696, + "grad_norm": 0.138671875, + "learning_rate": 0.0005086361378762293, + "loss": 0.536, + "step": 73330 + }, + { + "epoch": 3.6425946160723157, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005085964040925798, + "loss": 0.5273, + "step": 73340 + }, + { + "epoch": 3.643091288367935, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005085566703089302, + "loss": 0.5331, + "step": 73350 + }, + { + "epoch": 3.643587960663554, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005085169365252807, + "loss": 0.5148, + "step": 73360 + }, + { + "epoch": 3.6440846329591734, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005084772027416311, + "loss": 0.5313, + "step": 73370 + }, + { + "epoch": 3.6445813052547926, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005084374689579815, + "loss": 0.5294, + "step": 73380 + }, + { + "epoch": 3.6450779775504123, + "grad_norm": 0.0927734375, + "learning_rate": 0.000508397735174332, + "loss": 0.5306, + "step": 73390 + }, + { + "epoch": 3.6455746498460315, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005083580013906825, + "loss": 0.5003, + "step": 73400 + }, + { + "epoch": 3.646071322141651, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005083182676070329, + "loss": 0.5395, + "step": 73410 + }, + { + "epoch": 3.6465679944372704, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005082785338233833, + "loss": 0.5017, + "step": 73420 + }, + { + "epoch": 3.6470646667328896, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005082388000397338, + "loss": 0.5296, + "step": 73430 + }, + { + "epoch": 3.647561339028509, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005081990662560843, + "loss": 0.5262, + "step": 73440 + }, + { + "epoch": 3.648058011324128, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005081593324724347, + "loss": 0.5517, + "step": 73450 + }, + { + "epoch": 3.6485546836197478, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005081195986887852, + "loss": 0.5157, + "step": 73460 + }, + { + "epoch": 3.649051355915367, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005080798649051356, + "loss": 0.5272, + "step": 73470 + }, + { + "epoch": 3.6495480282109862, + "grad_norm": 0.0947265625, + "learning_rate": 0.000508040131121486, + "loss": 0.5168, + "step": 73480 + }, + { + "epoch": 3.650044700506606, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005080003973378366, + "loss": 0.5559, + "step": 73490 + }, + { + "epoch": 3.650541372802225, + "grad_norm": 0.09912109375, + "learning_rate": 0.000507960663554187, + "loss": 0.4941, + "step": 73500 + }, + { + "epoch": 3.6510380450978444, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005079209297705374, + "loss": 0.5256, + "step": 73510 + }, + { + "epoch": 3.6515347173934636, + "grad_norm": 0.11328125, + "learning_rate": 0.0005078811959868879, + "loss": 0.5269, + "step": 73520 + }, + { + "epoch": 3.6520313896890833, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005078414622032383, + "loss": 0.5148, + "step": 73530 + }, + { + "epoch": 3.6525280619847025, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005078017284195888, + "loss": 0.5282, + "step": 73540 + }, + { + "epoch": 3.6530247342803217, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005077619946359393, + "loss": 0.5345, + "step": 73550 + }, + { + "epoch": 3.6535214065759414, + "grad_norm": 0.103515625, + "learning_rate": 0.0005077222608522897, + "loss": 0.5298, + "step": 73560 + }, + { + "epoch": 3.6540180788715606, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005076825270686401, + "loss": 0.5444, + "step": 73570 + }, + { + "epoch": 3.65451475116718, + "grad_norm": 0.189453125, + "learning_rate": 0.0005076427932849905, + "loss": 0.5165, + "step": 73580 + }, + { + "epoch": 3.655011423462799, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005076030595013411, + "loss": 0.5404, + "step": 73590 + }, + { + "epoch": 3.6555080957584187, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005075633257176915, + "loss": 0.5279, + "step": 73600 + }, + { + "epoch": 3.656004768054038, + "grad_norm": 0.16796875, + "learning_rate": 0.0005075235919340419, + "loss": 0.5495, + "step": 73610 + }, + { + "epoch": 3.656501440349657, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005074838581503924, + "loss": 0.5339, + "step": 73620 + }, + { + "epoch": 3.656998112645277, + "grad_norm": 0.140625, + "learning_rate": 0.0005074441243667429, + "loss": 0.4827, + "step": 73630 + }, + { + "epoch": 3.657494784940896, + "grad_norm": 0.1484375, + "learning_rate": 0.0005074043905830932, + "loss": 0.5288, + "step": 73640 + }, + { + "epoch": 3.6579914572365153, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005073646567994438, + "loss": 0.5439, + "step": 73650 + }, + { + "epoch": 3.6584881295321345, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005073249230157942, + "loss": 0.5207, + "step": 73660 + }, + { + "epoch": 3.658984801827754, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005072851892321446, + "loss": 0.5458, + "step": 73670 + }, + { + "epoch": 3.6594814741233734, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005072454554484951, + "loss": 0.5007, + "step": 73680 + }, + { + "epoch": 3.6599781464189927, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005072057216648456, + "loss": 0.5203, + "step": 73690 + }, + { + "epoch": 3.6604748187146123, + "grad_norm": 0.1376953125, + "learning_rate": 0.000507165987881196, + "loss": 0.5408, + "step": 73700 + }, + { + "epoch": 3.6609714910102316, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005071262540975465, + "loss": 0.5383, + "step": 73710 + }, + { + "epoch": 3.661468163305851, + "grad_norm": 0.103515625, + "learning_rate": 0.0005070865203138969, + "loss": 0.5536, + "step": 73720 + }, + { + "epoch": 3.66196483560147, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005070467865302473, + "loss": 0.5546, + "step": 73730 + }, + { + "epoch": 3.6624615078970892, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005070070527465979, + "loss": 0.5166, + "step": 73740 + }, + { + "epoch": 3.662958180192709, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005069673189629483, + "loss": 0.5149, + "step": 73750 + }, + { + "epoch": 3.663454852488328, + "grad_norm": 0.099609375, + "learning_rate": 0.0005069275851792987, + "loss": 0.5278, + "step": 73760 + }, + { + "epoch": 3.663951524783948, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005068878513956492, + "loss": 0.5433, + "step": 73770 + }, + { + "epoch": 3.664448197079567, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005068481176119996, + "loss": 0.515, + "step": 73780 + }, + { + "epoch": 3.6649448693751863, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005068083838283502, + "loss": 0.5293, + "step": 73790 + }, + { + "epoch": 3.6654415416708055, + "grad_norm": 0.13671875, + "learning_rate": 0.0005067686500447005, + "loss": 0.5491, + "step": 73800 + }, + { + "epoch": 3.6659382139664247, + "grad_norm": 0.1083984375, + "learning_rate": 0.000506728916261051, + "loss": 0.5319, + "step": 73810 + }, + { + "epoch": 3.6664348862620444, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005066891824774015, + "loss": 0.5304, + "step": 73820 + }, + { + "epoch": 3.6669315585576636, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005066494486937518, + "loss": 0.5555, + "step": 73830 + }, + { + "epoch": 3.667428230853283, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005066097149101024, + "loss": 0.5552, + "step": 73840 + }, + { + "epoch": 3.6679249031489025, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005065699811264528, + "loss": 0.531, + "step": 73850 + }, + { + "epoch": 3.6684215754445217, + "grad_norm": 0.130859375, + "learning_rate": 0.0005065302473428032, + "loss": 0.5315, + "step": 73860 + }, + { + "epoch": 3.668918247740141, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005064905135591537, + "loss": 0.5336, + "step": 73870 + }, + { + "epoch": 3.66941492003576, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005064507797755041, + "loss": 0.5378, + "step": 73880 + }, + { + "epoch": 3.66991159233138, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005064110459918545, + "loss": 0.5245, + "step": 73890 + }, + { + "epoch": 3.670408264626999, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005063713122082051, + "loss": 0.5215, + "step": 73900 + }, + { + "epoch": 3.6709049369226183, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005063315784245555, + "loss": 0.526, + "step": 73910 + }, + { + "epoch": 3.671401609218238, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005062918446409059, + "loss": 0.5362, + "step": 73920 + }, + { + "epoch": 3.671898281513857, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005062521108572564, + "loss": 0.5355, + "step": 73930 + }, + { + "epoch": 3.6723949538094764, + "grad_norm": 0.115234375, + "learning_rate": 0.0005062123770736068, + "loss": 0.5274, + "step": 73940 + }, + { + "epoch": 3.6728916261050957, + "grad_norm": 0.1171875, + "learning_rate": 0.0005061726432899574, + "loss": 0.5348, + "step": 73950 + }, + { + "epoch": 3.6733882984007153, + "grad_norm": 0.18359375, + "learning_rate": 0.0005061329095063078, + "loss": 0.4846, + "step": 73960 + }, + { + "epoch": 3.6738849706963346, + "grad_norm": 0.126953125, + "learning_rate": 0.0005060931757226582, + "loss": 0.5432, + "step": 73970 + }, + { + "epoch": 3.674381642991954, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005060534419390087, + "loss": 0.5665, + "step": 73980 + }, + { + "epoch": 3.6748783152875735, + "grad_norm": 0.09814453125, + "learning_rate": 0.000506013708155359, + "loss": 0.5178, + "step": 73990 + }, + { + "epoch": 3.6753749875831927, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005059739743717096, + "loss": 0.5502, + "step": 74000 + }, + { + "epoch": 3.675871659878812, + "grad_norm": 0.150390625, + "learning_rate": 0.0005059342405880601, + "loss": 0.5316, + "step": 74010 + }, + { + "epoch": 3.676368332174431, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005058945068044104, + "loss": 0.521, + "step": 74020 + }, + { + "epoch": 3.676865004470051, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005058547730207609, + "loss": 0.5134, + "step": 74030 + }, + { + "epoch": 3.67736167676567, + "grad_norm": 0.142578125, + "learning_rate": 0.0005058150392371115, + "loss": 0.5453, + "step": 74040 + }, + { + "epoch": 3.6778583490612893, + "grad_norm": 0.115234375, + "learning_rate": 0.0005057753054534618, + "loss": 0.5439, + "step": 74050 + }, + { + "epoch": 3.678355021356909, + "grad_norm": 0.150390625, + "learning_rate": 0.0005057355716698123, + "loss": 0.5378, + "step": 74060 + }, + { + "epoch": 3.678851693652528, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005056958378861627, + "loss": 0.5359, + "step": 74070 + }, + { + "epoch": 3.6793483659481474, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005056561041025132, + "loss": 0.5216, + "step": 74080 + }, + { + "epoch": 3.6798450382437666, + "grad_norm": 0.09765625, + "learning_rate": 0.0005056163703188636, + "loss": 0.5172, + "step": 74090 + }, + { + "epoch": 3.680341710539386, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005055766365352141, + "loss": 0.5365, + "step": 74100 + }, + { + "epoch": 3.6808383828350055, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005055369027515646, + "loss": 0.5234, + "step": 74110 + }, + { + "epoch": 3.6813350551306248, + "grad_norm": 0.11279296875, + "learning_rate": 0.000505497168967915, + "loss": 0.5581, + "step": 74120 + }, + { + "epoch": 3.6818317274262444, + "grad_norm": 0.10546875, + "learning_rate": 0.0005054574351842654, + "loss": 0.5291, + "step": 74130 + }, + { + "epoch": 3.6823283997218637, + "grad_norm": 0.10498046875, + "learning_rate": 0.000505417701400616, + "loss": 0.5433, + "step": 74140 + }, + { + "epoch": 3.682825072017483, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005053779676169664, + "loss": 0.5088, + "step": 74150 + }, + { + "epoch": 3.683321744313102, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005053382338333168, + "loss": 0.552, + "step": 74160 + }, + { + "epoch": 3.6838184166087213, + "grad_norm": 0.1015625, + "learning_rate": 0.0005052985000496673, + "loss": 0.5277, + "step": 74170 + }, + { + "epoch": 3.684315088904341, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005052587662660176, + "loss": 0.5309, + "step": 74180 + }, + { + "epoch": 3.6848117611999602, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005052190324823681, + "loss": 0.5292, + "step": 74190 + }, + { + "epoch": 3.6853084334955795, + "grad_norm": 0.1875, + "learning_rate": 0.0005051792986987187, + "loss": 0.5, + "step": 74200 + }, + { + "epoch": 3.685805105791199, + "grad_norm": 0.09814453125, + "learning_rate": 0.000505139564915069, + "loss": 0.5177, + "step": 74210 + }, + { + "epoch": 3.6863017780868184, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005050998311314195, + "loss": 0.5462, + "step": 74220 + }, + { + "epoch": 3.6867984503824376, + "grad_norm": 0.12353515625, + "learning_rate": 0.00050506009734777, + "loss": 0.5618, + "step": 74230 + }, + { + "epoch": 3.687295122678057, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005050203635641204, + "loss": 0.5204, + "step": 74240 + }, + { + "epoch": 3.6877917949736765, + "grad_norm": 0.107421875, + "learning_rate": 0.0005049806297804709, + "loss": 0.5339, + "step": 74250 + }, + { + "epoch": 3.6882884672692957, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005049408959968213, + "loss": 0.5595, + "step": 74260 + }, + { + "epoch": 3.688785139564915, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005049011622131718, + "loss": 0.5333, + "step": 74270 + }, + { + "epoch": 3.6892818118605346, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005048614284295222, + "loss": 0.5175, + "step": 74280 + }, + { + "epoch": 3.689778484156154, + "grad_norm": 0.109375, + "learning_rate": 0.0005048216946458726, + "loss": 0.5102, + "step": 74290 + }, + { + "epoch": 3.690275156451773, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005047819608622232, + "loss": 0.5456, + "step": 74300 + }, + { + "epoch": 3.6907718287473923, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005047422270785736, + "loss": 0.5277, + "step": 74310 + }, + { + "epoch": 3.691268501043012, + "grad_norm": 0.10400390625, + "learning_rate": 0.000504702493294924, + "loss": 0.5348, + "step": 74320 + }, + { + "epoch": 3.691765173338631, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005046627595112745, + "loss": 0.5225, + "step": 74330 + }, + { + "epoch": 3.6922618456342504, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005046230257276249, + "loss": 0.5521, + "step": 74340 + }, + { + "epoch": 3.69275851792987, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005045832919439754, + "loss": 0.5586, + "step": 74350 + }, + { + "epoch": 3.6932551902254893, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005045435581603259, + "loss": 0.524, + "step": 74360 + }, + { + "epoch": 3.6937518625211085, + "grad_norm": 0.10546875, + "learning_rate": 0.0005045038243766763, + "loss": 0.5222, + "step": 74370 + }, + { + "epoch": 3.6942485348167278, + "grad_norm": 0.150390625, + "learning_rate": 0.0005044640905930267, + "loss": 0.5279, + "step": 74380 + }, + { + "epoch": 3.694745207112347, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005044243568093772, + "loss": 0.5513, + "step": 74390 + }, + { + "epoch": 3.6952418794079667, + "grad_norm": 0.130859375, + "learning_rate": 0.0005043846230257277, + "loss": 0.5075, + "step": 74400 + }, + { + "epoch": 3.695738551703586, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005043448892420781, + "loss": 0.5501, + "step": 74410 + }, + { + "epoch": 3.6962352239992056, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005043051554584286, + "loss": 0.5157, + "step": 74420 + }, + { + "epoch": 3.696731896294825, + "grad_norm": 0.0966796875, + "learning_rate": 0.000504265421674779, + "loss": 0.5792, + "step": 74430 + }, + { + "epoch": 3.697228568590444, + "grad_norm": 0.109375, + "learning_rate": 0.0005042256878911294, + "loss": 0.5353, + "step": 74440 + }, + { + "epoch": 3.6977252408860632, + "grad_norm": 0.103515625, + "learning_rate": 0.0005041859541074799, + "loss": 0.5202, + "step": 74450 + }, + { + "epoch": 3.6982219131816825, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005041462203238304, + "loss": 0.5369, + "step": 74460 + }, + { + "epoch": 3.698718585477302, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005041064865401808, + "loss": 0.5329, + "step": 74470 + }, + { + "epoch": 3.6992152577729214, + "grad_norm": 0.142578125, + "learning_rate": 0.0005040667527565312, + "loss": 0.5314, + "step": 74480 + }, + { + "epoch": 3.699711930068541, + "grad_norm": 0.154296875, + "learning_rate": 0.0005040270189728817, + "loss": 0.5389, + "step": 74490 + }, + { + "epoch": 3.7002086023641603, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005039872851892322, + "loss": 0.55, + "step": 74500 + }, + { + "epoch": 3.7007052746597795, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005039475514055826, + "loss": 0.5334, + "step": 74510 + }, + { + "epoch": 3.7012019469553987, + "grad_norm": 0.13671875, + "learning_rate": 0.0005039078176219331, + "loss": 0.5203, + "step": 74520 + }, + { + "epoch": 3.701698619251018, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005038680838382835, + "loss": 0.5108, + "step": 74530 + }, + { + "epoch": 3.7021952915466376, + "grad_norm": 0.1328125, + "learning_rate": 0.0005038283500546339, + "loss": 0.5133, + "step": 74540 + }, + { + "epoch": 3.702691963842257, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005037886162709845, + "loss": 0.5255, + "step": 74550 + }, + { + "epoch": 3.703188636137876, + "grad_norm": 0.115234375, + "learning_rate": 0.0005037488824873349, + "loss": 0.4999, + "step": 74560 + }, + { + "epoch": 3.7036853084334957, + "grad_norm": 0.130859375, + "learning_rate": 0.0005037091487036853, + "loss": 0.531, + "step": 74570 + }, + { + "epoch": 3.704181980729115, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005036694149200358, + "loss": 0.5445, + "step": 74580 + }, + { + "epoch": 3.704678653024734, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005036296811363862, + "loss": 0.5408, + "step": 74590 + }, + { + "epoch": 3.7051753253203534, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005035899473527367, + "loss": 0.5694, + "step": 74600 + }, + { + "epoch": 3.705671997615973, + "grad_norm": 0.11328125, + "learning_rate": 0.0005035502135690872, + "loss": 0.5388, + "step": 74610 + }, + { + "epoch": 3.7061686699115923, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005035104797854376, + "loss": 0.5405, + "step": 74620 + }, + { + "epoch": 3.7066653422072116, + "grad_norm": 0.140625, + "learning_rate": 0.000503470746001788, + "loss": 0.5164, + "step": 74630 + }, + { + "epoch": 3.7071620145028312, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005034310122181385, + "loss": 0.5371, + "step": 74640 + }, + { + "epoch": 3.7076586867984505, + "grad_norm": 0.11572265625, + "learning_rate": 0.000503391278434489, + "loss": 0.536, + "step": 74650 + }, + { + "epoch": 3.7081553590940697, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005033515446508394, + "loss": 0.5447, + "step": 74660 + }, + { + "epoch": 3.708652031389689, + "grad_norm": 0.1484375, + "learning_rate": 0.0005033118108671898, + "loss": 0.5733, + "step": 74670 + }, + { + "epoch": 3.7091487036853086, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005032720770835403, + "loss": 0.5171, + "step": 74680 + }, + { + "epoch": 3.709645375980928, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005032323432998908, + "loss": 0.5562, + "step": 74690 + }, + { + "epoch": 3.710142048276547, + "grad_norm": 0.13671875, + "learning_rate": 0.0005031926095162412, + "loss": 0.5388, + "step": 74700 + }, + { + "epoch": 3.7106387205721667, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005031528757325917, + "loss": 0.5385, + "step": 74710 + }, + { + "epoch": 3.711135392867786, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005031131419489421, + "loss": 0.5191, + "step": 74720 + }, + { + "epoch": 3.711632065163405, + "grad_norm": 0.111328125, + "learning_rate": 0.0005030734081652925, + "loss": 0.5243, + "step": 74730 + }, + { + "epoch": 3.7121287374590244, + "grad_norm": 0.09765625, + "learning_rate": 0.000503033674381643, + "loss": 0.5402, + "step": 74740 + }, + { + "epoch": 3.7126254097546436, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005029939405979935, + "loss": 0.5371, + "step": 74750 + }, + { + "epoch": 3.7131220820502633, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005029542068143439, + "loss": 0.5397, + "step": 74760 + }, + { + "epoch": 3.7136187543458825, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005029144730306944, + "loss": 0.5406, + "step": 74770 + }, + { + "epoch": 3.714115426641502, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005028747392470448, + "loss": 0.5853, + "step": 74780 + }, + { + "epoch": 3.7146120989371214, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005028350054633952, + "loss": 0.5466, + "step": 74790 + }, + { + "epoch": 3.7151087712327406, + "grad_norm": 0.126953125, + "learning_rate": 0.0005027952716797458, + "loss": 0.5634, + "step": 74800 + }, + { + "epoch": 3.71560544352836, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005027555378960962, + "loss": 0.526, + "step": 74810 + }, + { + "epoch": 3.716102115823979, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005027158041124467, + "loss": 0.5252, + "step": 74820 + }, + { + "epoch": 3.7165987881195988, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005026760703287971, + "loss": 0.5454, + "step": 74830 + }, + { + "epoch": 3.717095460415218, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005026363365451475, + "loss": 0.5045, + "step": 74840 + }, + { + "epoch": 3.7175921327108377, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005025966027614981, + "loss": 0.5203, + "step": 74850 + }, + { + "epoch": 3.718088805006457, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005025568689778484, + "loss": 0.5681, + "step": 74860 + }, + { + "epoch": 3.718585477302076, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005025171351941989, + "loss": 0.5234, + "step": 74870 + }, + { + "epoch": 3.7190821495976953, + "grad_norm": 0.123046875, + "learning_rate": 0.0005024774014105494, + "loss": 0.5626, + "step": 74880 + }, + { + "epoch": 3.7195788218933146, + "grad_norm": 0.173828125, + "learning_rate": 0.0005024376676268997, + "loss": 0.5342, + "step": 74890 + }, + { + "epoch": 3.7200754941889342, + "grad_norm": 0.099609375, + "learning_rate": 0.0005023979338432503, + "loss": 0.5172, + "step": 74900 + }, + { + "epoch": 3.7205721664845535, + "grad_norm": 0.125, + "learning_rate": 0.0005023582000596008, + "loss": 0.5401, + "step": 74910 + }, + { + "epoch": 3.7210688387801727, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005023184662759511, + "loss": 0.5451, + "step": 74920 + }, + { + "epoch": 3.7215655110757924, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005022787324923016, + "loss": 0.534, + "step": 74930 + }, + { + "epoch": 3.7220621833714116, + "grad_norm": 0.10009765625, + "learning_rate": 0.000502238998708652, + "loss": 0.5271, + "step": 74940 + }, + { + "epoch": 3.722558855667031, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005021992649250024, + "loss": 0.5351, + "step": 74950 + }, + { + "epoch": 3.72305552796265, + "grad_norm": 0.10595703125, + "learning_rate": 0.000502159531141353, + "loss": 0.5425, + "step": 74960 + }, + { + "epoch": 3.7235522002582697, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005021197973577034, + "loss": 0.5562, + "step": 74970 + }, + { + "epoch": 3.724048872553889, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005020800635740539, + "loss": 0.5429, + "step": 74980 + }, + { + "epoch": 3.724545544849508, + "grad_norm": 0.134765625, + "learning_rate": 0.0005020403297904043, + "loss": 0.5185, + "step": 74990 + }, + { + "epoch": 3.725042217145128, + "grad_norm": 0.109375, + "learning_rate": 0.0005020005960067548, + "loss": 0.5465, + "step": 75000 + }, + { + "epoch": 3.725538889440747, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005019608622231053, + "loss": 0.5251, + "step": 75010 + }, + { + "epoch": 3.7260355617363663, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005019211284394557, + "loss": 0.5146, + "step": 75020 + }, + { + "epoch": 3.7265322340319855, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005018813946558061, + "loss": 0.499, + "step": 75030 + }, + { + "epoch": 3.727028906327605, + "grad_norm": 0.123046875, + "learning_rate": 0.0005018416608721566, + "loss": 0.5298, + "step": 75040 + }, + { + "epoch": 3.7275255786232244, + "grad_norm": 0.123046875, + "learning_rate": 0.0005018019270885069, + "loss": 0.5222, + "step": 75050 + }, + { + "epoch": 3.7280222509188436, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005017621933048575, + "loss": 0.5474, + "step": 75060 + }, + { + "epoch": 3.7285189232144633, + "grad_norm": 0.10888671875, + "learning_rate": 0.000501722459521208, + "loss": 0.5366, + "step": 75070 + }, + { + "epoch": 3.7290155955100825, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005016827257375583, + "loss": 0.5366, + "step": 75080 + }, + { + "epoch": 3.7295122678057018, + "grad_norm": 0.103515625, + "learning_rate": 0.0005016429919539088, + "loss": 0.5356, + "step": 75090 + }, + { + "epoch": 3.730008940101321, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005016032581702594, + "loss": 0.5207, + "step": 75100 + }, + { + "epoch": 3.7305056123969402, + "grad_norm": 0.15625, + "learning_rate": 0.0005015635243866097, + "loss": 0.5548, + "step": 75110 + }, + { + "epoch": 3.73100228469256, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005015237906029602, + "loss": 0.558, + "step": 75120 + }, + { + "epoch": 3.731498956988179, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005014840568193106, + "loss": 0.5615, + "step": 75130 + }, + { + "epoch": 3.731995629283799, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005014443230356611, + "loss": 0.5319, + "step": 75140 + }, + { + "epoch": 3.732492301579418, + "grad_norm": 0.134765625, + "learning_rate": 0.0005014045892520116, + "loss": 0.5452, + "step": 75150 + }, + { + "epoch": 3.7329889738750373, + "grad_norm": 0.0947265625, + "learning_rate": 0.000501364855468362, + "loss": 0.5418, + "step": 75160 + }, + { + "epoch": 3.7334856461706565, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005013251216847125, + "loss": 0.5408, + "step": 75170 + }, + { + "epoch": 3.7339823184662757, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005012853879010629, + "loss": 0.5412, + "step": 75180 + }, + { + "epoch": 3.7344789907618954, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005012456541174133, + "loss": 0.5283, + "step": 75190 + }, + { + "epoch": 3.7349756630575146, + "grad_norm": 0.140625, + "learning_rate": 0.0005012059203337639, + "loss": 0.5539, + "step": 75200 + }, + { + "epoch": 3.7354723353531343, + "grad_norm": 0.12109375, + "learning_rate": 0.0005011661865501143, + "loss": 0.5243, + "step": 75210 + }, + { + "epoch": 3.7359690076487535, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005011264527664647, + "loss": 0.5207, + "step": 75220 + }, + { + "epoch": 3.7364656799443727, + "grad_norm": 0.1015625, + "learning_rate": 0.0005010867189828152, + "loss": 0.5092, + "step": 75230 + }, + { + "epoch": 3.736962352239992, + "grad_norm": 0.134765625, + "learning_rate": 0.0005010469851991655, + "loss": 0.5549, + "step": 75240 + }, + { + "epoch": 3.737459024535611, + "grad_norm": 0.10107421875, + "learning_rate": 0.000501007251415516, + "loss": 0.5267, + "step": 75250 + }, + { + "epoch": 3.737955696831231, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005009675176318666, + "loss": 0.504, + "step": 75260 + }, + { + "epoch": 3.73845236912685, + "grad_norm": 0.109375, + "learning_rate": 0.000500927783848217, + "loss": 0.5222, + "step": 75270 + }, + { + "epoch": 3.7389490414224693, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005008880500645674, + "loss": 0.5661, + "step": 75280 + }, + { + "epoch": 3.739445713718089, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005008483162809179, + "loss": 0.5216, + "step": 75290 + }, + { + "epoch": 3.739942386013708, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005008085824972683, + "loss": 0.5375, + "step": 75300 + }, + { + "epoch": 3.7404390583093274, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005007688487136188, + "loss": 0.5243, + "step": 75310 + }, + { + "epoch": 3.7409357306049467, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005007291149299692, + "loss": 0.5113, + "step": 75320 + }, + { + "epoch": 3.7414324029005663, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005006893811463197, + "loss": 0.5147, + "step": 75330 + }, + { + "epoch": 3.7419290751961856, + "grad_norm": 0.181640625, + "learning_rate": 0.0005006496473626701, + "loss": 0.535, + "step": 75340 + }, + { + "epoch": 3.742425747491805, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005006099135790205, + "loss": 0.5412, + "step": 75350 + }, + { + "epoch": 3.7429224197874245, + "grad_norm": 0.111328125, + "learning_rate": 0.0005005701797953711, + "loss": 0.5162, + "step": 75360 + }, + { + "epoch": 3.7434190920830437, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005005304460117215, + "loss": 0.5369, + "step": 75370 + }, + { + "epoch": 3.743915764378663, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005004907122280719, + "loss": 0.5315, + "step": 75380 + }, + { + "epoch": 3.744412436674282, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005004509784444224, + "loss": 0.538, + "step": 75390 + }, + { + "epoch": 3.744909108969902, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005004112446607728, + "loss": 0.5474, + "step": 75400 + }, + { + "epoch": 3.745405781265521, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005003715108771233, + "loss": 0.5329, + "step": 75410 + }, + { + "epoch": 3.7459024535611403, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005003317770934738, + "loss": 0.5152, + "step": 75420 + }, + { + "epoch": 3.74639912585676, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005002920433098242, + "loss": 0.5356, + "step": 75430 + }, + { + "epoch": 3.746895798152379, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005002523095261746, + "loss": 0.5321, + "step": 75440 + }, + { + "epoch": 3.7473924704479984, + "grad_norm": 0.09765625, + "learning_rate": 0.0005002125757425251, + "loss": 0.5199, + "step": 75450 + }, + { + "epoch": 3.7478891427436176, + "grad_norm": 0.107421875, + "learning_rate": 0.0005001728419588756, + "loss": 0.5606, + "step": 75460 + }, + { + "epoch": 3.748385815039237, + "grad_norm": 0.12060546875, + "learning_rate": 0.000500133108175226, + "loss": 0.5587, + "step": 75470 + }, + { + "epoch": 3.7488824873348565, + "grad_norm": 0.12109375, + "learning_rate": 0.0005000933743915765, + "loss": 0.5451, + "step": 75480 + }, + { + "epoch": 3.7493791596304757, + "grad_norm": 0.12109375, + "learning_rate": 0.0005000536406079269, + "loss": 0.5322, + "step": 75490 + }, + { + "epoch": 3.7498758319260954, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005000139068242773, + "loss": 0.5317, + "step": 75500 + }, + { + "epoch": 3.7503725042217146, + "grad_norm": 0.1455078125, + "learning_rate": 0.0004999741730406278, + "loss": 0.5438, + "step": 75510 + }, + { + "epoch": 3.750869176517334, + "grad_norm": 0.1630859375, + "learning_rate": 0.0004999344392569783, + "loss": 0.5394, + "step": 75520 + }, + { + "epoch": 3.751365848812953, + "grad_norm": 0.103515625, + "learning_rate": 0.0004998947054733287, + "loss": 0.5357, + "step": 75530 + }, + { + "epoch": 3.7518625211085723, + "grad_norm": 0.13671875, + "learning_rate": 0.0004998549716896791, + "loss": 0.524, + "step": 75540 + }, + { + "epoch": 3.752359193404192, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004998152379060296, + "loss": 0.5782, + "step": 75550 + }, + { + "epoch": 3.752855865699811, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004997755041223801, + "loss": 0.523, + "step": 75560 + }, + { + "epoch": 3.7533525379954304, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004997357703387305, + "loss": 0.5524, + "step": 75570 + }, + { + "epoch": 3.75384921029105, + "grad_norm": 0.10498046875, + "learning_rate": 0.000499696036555081, + "loss": 0.5031, + "step": 75580 + }, + { + "epoch": 3.7543458825866693, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004996563027714314, + "loss": 0.5556, + "step": 75590 + }, + { + "epoch": 3.7548425548822886, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004996165689877818, + "loss": 0.5152, + "step": 75600 + }, + { + "epoch": 3.755339227177908, + "grad_norm": 0.11865234375, + "learning_rate": 0.0004995768352041324, + "loss": 0.5316, + "step": 75610 + }, + { + "epoch": 3.7558358994735275, + "grad_norm": 0.15625, + "learning_rate": 0.0004995371014204828, + "loss": 0.5134, + "step": 75620 + }, + { + "epoch": 3.7563325717691467, + "grad_norm": 0.1181640625, + "learning_rate": 0.0004994973676368332, + "loss": 0.5617, + "step": 75630 + }, + { + "epoch": 3.756829244064766, + "grad_norm": 0.134765625, + "learning_rate": 0.0004994576338531837, + "loss": 0.528, + "step": 75640 + }, + { + "epoch": 3.7573259163603856, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004994179000695341, + "loss": 0.5361, + "step": 75650 + }, + { + "epoch": 3.757822588656005, + "grad_norm": 0.15625, + "learning_rate": 0.0004993781662858846, + "loss": 0.5295, + "step": 75660 + }, + { + "epoch": 3.758319260951624, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004993384325022351, + "loss": 0.5404, + "step": 75670 + }, + { + "epoch": 3.7588159332472433, + "grad_norm": 0.1181640625, + "learning_rate": 0.0004992986987185855, + "loss": 0.5221, + "step": 75680 + }, + { + "epoch": 3.759312605542863, + "grad_norm": 0.162109375, + "learning_rate": 0.0004992589649349359, + "loss": 0.5524, + "step": 75690 + }, + { + "epoch": 3.759809277838482, + "grad_norm": 0.185546875, + "learning_rate": 0.0004992192311512864, + "loss": 0.5441, + "step": 75700 + }, + { + "epoch": 3.7603059501341014, + "grad_norm": 0.10009765625, + "learning_rate": 0.0004991794973676369, + "loss": 0.5298, + "step": 75710 + }, + { + "epoch": 3.760802622429721, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004991397635839874, + "loss": 0.5343, + "step": 75720 + }, + { + "epoch": 3.7612992947253403, + "grad_norm": 0.12451171875, + "learning_rate": 0.0004991000298003377, + "loss": 0.5252, + "step": 75730 + }, + { + "epoch": 3.7617959670209595, + "grad_norm": 0.10546875, + "learning_rate": 0.0004990602960166882, + "loss": 0.5356, + "step": 75740 + }, + { + "epoch": 3.7622926393165788, + "grad_norm": 0.115234375, + "learning_rate": 0.0004990205622330387, + "loss": 0.5323, + "step": 75750 + }, + { + "epoch": 3.7627893116121984, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004989808284493891, + "loss": 0.5198, + "step": 75760 + }, + { + "epoch": 3.7632859839078177, + "grad_norm": 0.1533203125, + "learning_rate": 0.0004989410946657396, + "loss": 0.5332, + "step": 75770 + }, + { + "epoch": 3.763782656203437, + "grad_norm": 0.11376953125, + "learning_rate": 0.00049890136088209, + "loss": 0.5421, + "step": 75780 + }, + { + "epoch": 3.7642793284990566, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004988616270984404, + "loss": 0.4969, + "step": 75790 + }, + { + "epoch": 3.764776000794676, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004988218933147909, + "loss": 0.5289, + "step": 75800 + }, + { + "epoch": 3.765272673090295, + "grad_norm": 0.134765625, + "learning_rate": 0.0004987821595311414, + "loss": 0.5358, + "step": 75810 + }, + { + "epoch": 3.7657693453859142, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004987424257474918, + "loss": 0.5283, + "step": 75820 + }, + { + "epoch": 3.7662660176815335, + "grad_norm": 0.10205078125, + "learning_rate": 0.0004987026919638423, + "loss": 0.5582, + "step": 75830 + }, + { + "epoch": 3.766762689977153, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004986629581801927, + "loss": 0.526, + "step": 75840 + }, + { + "epoch": 3.7672593622727724, + "grad_norm": 0.10107421875, + "learning_rate": 0.0004986232243965431, + "loss": 0.5216, + "step": 75850 + }, + { + "epoch": 3.767756034568392, + "grad_norm": 0.0947265625, + "learning_rate": 0.0004985834906128937, + "loss": 0.5394, + "step": 75860 + }, + { + "epoch": 3.7682527068640113, + "grad_norm": 0.11669921875, + "learning_rate": 0.0004985437568292441, + "loss": 0.5392, + "step": 75870 + }, + { + "epoch": 3.7687493791596305, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004985040230455946, + "loss": 0.5583, + "step": 75880 + }, + { + "epoch": 3.7692460514552497, + "grad_norm": 0.134765625, + "learning_rate": 0.000498464289261945, + "loss": 0.5372, + "step": 75890 + }, + { + "epoch": 3.769742723750869, + "grad_norm": 0.1435546875, + "learning_rate": 0.0004984245554782954, + "loss": 0.5437, + "step": 75900 + }, + { + "epoch": 3.7702393960464886, + "grad_norm": 0.1005859375, + "learning_rate": 0.000498384821694646, + "loss": 0.5374, + "step": 75910 + }, + { + "epoch": 3.770736068342108, + "grad_norm": 0.1240234375, + "learning_rate": 0.0004983450879109963, + "loss": 0.5581, + "step": 75920 + }, + { + "epoch": 3.771232740637727, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004983053541273468, + "loss": 0.5358, + "step": 75930 + }, + { + "epoch": 3.7717294129333467, + "grad_norm": 0.1015625, + "learning_rate": 0.0004982656203436973, + "loss": 0.5459, + "step": 75940 + }, + { + "epoch": 3.772226085228966, + "grad_norm": 0.10107421875, + "learning_rate": 0.0004982258865600476, + "loss": 0.5242, + "step": 75950 + }, + { + "epoch": 3.772722757524585, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004981861527763982, + "loss": 0.5467, + "step": 75960 + }, + { + "epoch": 3.7732194298202044, + "grad_norm": 0.12890625, + "learning_rate": 0.0004981464189927487, + "loss": 0.5221, + "step": 75970 + }, + { + "epoch": 3.773716102115824, + "grad_norm": 0.11962890625, + "learning_rate": 0.000498106685209099, + "loss": 0.5415, + "step": 75980 + }, + { + "epoch": 3.7742127744114433, + "grad_norm": 0.09814453125, + "learning_rate": 0.0004980669514254495, + "loss": 0.5212, + "step": 75990 + }, + { + "epoch": 3.7747094467070625, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004980272176417999, + "loss": 0.5402, + "step": 76000 + }, + { + "epoch": 3.775206119002682, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004979874838581504, + "loss": 0.5187, + "step": 76010 + }, + { + "epoch": 3.7757027912983014, + "grad_norm": 0.12060546875, + "learning_rate": 0.0004979477500745009, + "loss": 0.5301, + "step": 76020 + }, + { + "epoch": 3.7761994635939207, + "grad_norm": 0.12890625, + "learning_rate": 0.0004979080162908513, + "loss": 0.5241, + "step": 76030 + }, + { + "epoch": 3.77669613588954, + "grad_norm": 0.1171875, + "learning_rate": 0.0004978682825072018, + "loss": 0.5217, + "step": 76040 + }, + { + "epoch": 3.7771928081851596, + "grad_norm": 0.1474609375, + "learning_rate": 0.0004978285487235522, + "loss": 0.5234, + "step": 76050 + }, + { + "epoch": 3.777689480480779, + "grad_norm": 0.11328125, + "learning_rate": 0.0004977888149399027, + "loss": 0.5231, + "step": 76060 + }, + { + "epoch": 3.778186152776398, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004977490811562532, + "loss": 0.5186, + "step": 76070 + }, + { + "epoch": 3.7786828250720177, + "grad_norm": 0.1318359375, + "learning_rate": 0.0004977093473726036, + "loss": 0.5255, + "step": 76080 + }, + { + "epoch": 3.779179497367637, + "grad_norm": 0.12890625, + "learning_rate": 0.000497669613588954, + "loss": 0.5878, + "step": 76090 + }, + { + "epoch": 3.779676169663256, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004976298798053045, + "loss": 0.5512, + "step": 76100 + }, + { + "epoch": 3.7801728419588754, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004975901460216548, + "loss": 0.5578, + "step": 76110 + }, + { + "epoch": 3.780669514254495, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004975504122380054, + "loss": 0.5282, + "step": 76120 + }, + { + "epoch": 3.7811661865501143, + "grad_norm": 0.09130859375, + "learning_rate": 0.0004975106784543559, + "loss": 0.5399, + "step": 76130 + }, + { + "epoch": 3.7816628588457335, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004974709446707062, + "loss": 0.5211, + "step": 76140 + }, + { + "epoch": 3.782159531141353, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004974312108870567, + "loss": 0.4971, + "step": 76150 + }, + { + "epoch": 3.7826562034369724, + "grad_norm": 0.126953125, + "learning_rate": 0.0004973914771034073, + "loss": 0.5228, + "step": 76160 + }, + { + "epoch": 3.7831528757325916, + "grad_norm": 0.119140625, + "learning_rate": 0.0004973517433197577, + "loss": 0.5268, + "step": 76170 + }, + { + "epoch": 3.783649548028211, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004973120095361081, + "loss": 0.5358, + "step": 76180 + }, + { + "epoch": 3.78414622032383, + "grad_norm": 0.10546875, + "learning_rate": 0.0004972722757524585, + "loss": 0.5051, + "step": 76190 + }, + { + "epoch": 3.7846428926194497, + "grad_norm": 0.10400390625, + "learning_rate": 0.000497232541968809, + "loss": 0.552, + "step": 76200 + }, + { + "epoch": 3.785139564915069, + "grad_norm": 0.111328125, + "learning_rate": 0.0004971928081851595, + "loss": 0.5174, + "step": 76210 + }, + { + "epoch": 3.7856362372106886, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004971530744015099, + "loss": 0.5515, + "step": 76220 + }, + { + "epoch": 3.786132909506308, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004971133406178604, + "loss": 0.4994, + "step": 76230 + }, + { + "epoch": 3.786629581801927, + "grad_norm": 0.1650390625, + "learning_rate": 0.0004970736068342108, + "loss": 0.5371, + "step": 76240 + }, + { + "epoch": 3.7871262540975463, + "grad_norm": 0.208984375, + "learning_rate": 0.0004970338730505612, + "loss": 0.5338, + "step": 76250 + }, + { + "epoch": 3.7876229263931656, + "grad_norm": 0.09423828125, + "learning_rate": 0.0004969941392669118, + "loss": 0.536, + "step": 76260 + }, + { + "epoch": 3.7881195986887852, + "grad_norm": 0.12109375, + "learning_rate": 0.0004969544054832622, + "loss": 0.5196, + "step": 76270 + }, + { + "epoch": 3.7886162709844045, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004969146716996126, + "loss": 0.5345, + "step": 76280 + }, + { + "epoch": 3.7891129432800237, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004968749379159631, + "loss": 0.517, + "step": 76290 + }, + { + "epoch": 3.7896096155756434, + "grad_norm": 0.103515625, + "learning_rate": 0.0004968352041323135, + "loss": 0.5606, + "step": 76300 + }, + { + "epoch": 3.7901062878712626, + "grad_norm": 0.1064453125, + "learning_rate": 0.000496795470348664, + "loss": 0.5667, + "step": 76310 + }, + { + "epoch": 3.790602960166882, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004967557365650145, + "loss": 0.5437, + "step": 76320 + }, + { + "epoch": 3.791099632462501, + "grad_norm": 0.11865234375, + "learning_rate": 0.0004967160027813649, + "loss": 0.5455, + "step": 76330 + }, + { + "epoch": 3.7915963047581207, + "grad_norm": 0.1484375, + "learning_rate": 0.0004966762689977153, + "loss": 0.5301, + "step": 76340 + }, + { + "epoch": 3.79209297705374, + "grad_norm": 0.1435546875, + "learning_rate": 0.0004966365352140658, + "loss": 0.5488, + "step": 76350 + }, + { + "epoch": 3.792589649349359, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004965968014304163, + "loss": 0.5309, + "step": 76360 + }, + { + "epoch": 3.793086321644979, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004965570676467667, + "loss": 0.5257, + "step": 76370 + }, + { + "epoch": 3.793582993940598, + "grad_norm": 0.09912109375, + "learning_rate": 0.0004965173338631171, + "loss": 0.5463, + "step": 76380 + }, + { + "epoch": 3.7940796662362173, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004964776000794676, + "loss": 0.5615, + "step": 76390 + }, + { + "epoch": 3.7945763385318365, + "grad_norm": 0.1015625, + "learning_rate": 0.000496437866295818, + "loss": 0.5293, + "step": 76400 + }, + { + "epoch": 3.795073010827456, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004963981325121684, + "loss": 0.5131, + "step": 76410 + }, + { + "epoch": 3.7955696831230754, + "grad_norm": 0.09716796875, + "learning_rate": 0.000496358398728519, + "loss": 0.5213, + "step": 76420 + }, + { + "epoch": 3.7960663554186946, + "grad_norm": 0.12255859375, + "learning_rate": 0.0004963186649448694, + "loss": 0.5412, + "step": 76430 + }, + { + "epoch": 3.7965630277143143, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004962789311612198, + "loss": 0.5338, + "step": 76440 + }, + { + "epoch": 3.7970597000099335, + "grad_norm": 0.10009765625, + "learning_rate": 0.0004962391973775703, + "loss": 0.5537, + "step": 76450 + }, + { + "epoch": 3.7975563723055528, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004961994635939207, + "loss": 0.5441, + "step": 76460 + }, + { + "epoch": 3.798053044601172, + "grad_norm": 0.123046875, + "learning_rate": 0.0004961597298102712, + "loss": 0.5135, + "step": 76470 + }, + { + "epoch": 3.7985497168967917, + "grad_norm": 0.1083984375, + "learning_rate": 0.0004961199960266217, + "loss": 0.5597, + "step": 76480 + }, + { + "epoch": 3.799046389192411, + "grad_norm": 0.109375, + "learning_rate": 0.0004960802622429721, + "loss": 0.5418, + "step": 76490 + }, + { + "epoch": 3.79954306148803, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004960405284593225, + "loss": 0.5386, + "step": 76500 + }, + { + "epoch": 3.80003973378365, + "grad_norm": 0.130859375, + "learning_rate": 0.000496000794675673, + "loss": 0.5257, + "step": 76510 + }, + { + "epoch": 3.800536406079269, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004959610608920235, + "loss": 0.5425, + "step": 76520 + }, + { + "epoch": 3.8010330783748882, + "grad_norm": 0.10205078125, + "learning_rate": 0.0004959213271083739, + "loss": 0.5369, + "step": 76530 + }, + { + "epoch": 3.8015297506705075, + "grad_norm": 0.20703125, + "learning_rate": 0.0004958815933247244, + "loss": 0.5424, + "step": 76540 + }, + { + "epoch": 3.8020264229661267, + "grad_norm": 0.09375, + "learning_rate": 0.0004958418595410748, + "loss": 0.5137, + "step": 76550 + }, + { + "epoch": 3.8025230952617464, + "grad_norm": 0.1171875, + "learning_rate": 0.0004958021257574252, + "loss": 0.5386, + "step": 76560 + }, + { + "epoch": 3.8030197675573656, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004957623919737758, + "loss": 0.514, + "step": 76570 + }, + { + "epoch": 3.8035164398529853, + "grad_norm": 0.1015625, + "learning_rate": 0.0004957226581901262, + "loss": 0.535, + "step": 76580 + }, + { + "epoch": 3.8040131121486045, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004956829244064766, + "loss": 0.5503, + "step": 76590 + }, + { + "epoch": 3.8045097844442237, + "grad_norm": 0.1279296875, + "learning_rate": 0.000495643190622827, + "loss": 0.5156, + "step": 76600 + }, + { + "epoch": 3.805006456739843, + "grad_norm": 0.126953125, + "learning_rate": 0.0004956034568391775, + "loss": 0.5681, + "step": 76610 + }, + { + "epoch": 3.805503129035462, + "grad_norm": 0.150390625, + "learning_rate": 0.0004955637230555281, + "loss": 0.5347, + "step": 76620 + }, + { + "epoch": 3.805999801331082, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004955239892718784, + "loss": 0.5676, + "step": 76630 + }, + { + "epoch": 3.806496473626701, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004954842554882289, + "loss": 0.5308, + "step": 76640 + }, + { + "epoch": 3.8069931459223203, + "grad_norm": 0.111328125, + "learning_rate": 0.0004954445217045793, + "loss": 0.5314, + "step": 76650 + }, + { + "epoch": 3.80748981821794, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004954047879209297, + "loss": 0.5216, + "step": 76660 + }, + { + "epoch": 3.807986490513559, + "grad_norm": 0.1806640625, + "learning_rate": 0.0004953650541372803, + "loss": 0.5433, + "step": 76670 + }, + { + "epoch": 3.8084831628091784, + "grad_norm": 0.1904296875, + "learning_rate": 0.0004953253203536307, + "loss": 0.5521, + "step": 76680 + }, + { + "epoch": 3.8089798351047977, + "grad_norm": 0.126953125, + "learning_rate": 0.0004952855865699811, + "loss": 0.5234, + "step": 76690 + }, + { + "epoch": 3.8094765074004173, + "grad_norm": 0.1494140625, + "learning_rate": 0.0004952458527863316, + "loss": 0.4972, + "step": 76700 + }, + { + "epoch": 3.8099731796960365, + "grad_norm": 0.16796875, + "learning_rate": 0.000495206119002682, + "loss": 0.5387, + "step": 76710 + }, + { + "epoch": 3.8104698519916558, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004951663852190325, + "loss": 0.5169, + "step": 76720 + }, + { + "epoch": 3.8109665242872754, + "grad_norm": 0.15625, + "learning_rate": 0.000495126651435383, + "loss": 0.5121, + "step": 76730 + }, + { + "epoch": 3.8114631965828947, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004950869176517334, + "loss": 0.5538, + "step": 76740 + }, + { + "epoch": 3.811959868878514, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004950471838680838, + "loss": 0.529, + "step": 76750 + }, + { + "epoch": 3.812456541174133, + "grad_norm": 0.10107421875, + "learning_rate": 0.0004950074500844343, + "loss": 0.5366, + "step": 76760 + }, + { + "epoch": 3.812953213469753, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004949677163007848, + "loss": 0.5343, + "step": 76770 + }, + { + "epoch": 3.813449885765372, + "grad_norm": 0.1552734375, + "learning_rate": 0.0004949279825171353, + "loss": 0.5588, + "step": 76780 + }, + { + "epoch": 3.8139465580609913, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004948882487334856, + "loss": 0.5383, + "step": 76790 + }, + { + "epoch": 3.814443230356611, + "grad_norm": 0.095703125, + "learning_rate": 0.0004948485149498361, + "loss": 0.5263, + "step": 76800 + }, + { + "epoch": 3.81493990265223, + "grad_norm": 0.12255859375, + "learning_rate": 0.0004948087811661867, + "loss": 0.5515, + "step": 76810 + }, + { + "epoch": 3.8154365749478494, + "grad_norm": 0.142578125, + "learning_rate": 0.000494769047382537, + "loss": 0.515, + "step": 76820 + }, + { + "epoch": 3.8159332472434686, + "grad_norm": 0.11474609375, + "learning_rate": 0.0004947293135988875, + "loss": 0.5327, + "step": 76830 + }, + { + "epoch": 3.816429919539088, + "grad_norm": 0.15234375, + "learning_rate": 0.000494689579815238, + "loss": 0.5308, + "step": 76840 + }, + { + "epoch": 3.8169265918347075, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004946498460315883, + "loss": 0.5392, + "step": 76850 + }, + { + "epoch": 3.8174232641303267, + "grad_norm": 0.10546875, + "learning_rate": 0.0004946101122479388, + "loss": 0.5223, + "step": 76860 + }, + { + "epoch": 3.8179199364259464, + "grad_norm": 0.095703125, + "learning_rate": 0.0004945703784642893, + "loss": 0.5221, + "step": 76870 + }, + { + "epoch": 3.8184166087215656, + "grad_norm": 0.09375, + "learning_rate": 0.0004945306446806397, + "loss": 0.5134, + "step": 76880 + }, + { + "epoch": 3.818913281017185, + "grad_norm": 0.20703125, + "learning_rate": 0.0004944909108969902, + "loss": 0.5067, + "step": 76890 + }, + { + "epoch": 3.819409953312804, + "grad_norm": 0.1474609375, + "learning_rate": 0.0004944511771133406, + "loss": 0.5321, + "step": 76900 + }, + { + "epoch": 3.8199066256084233, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004944114433296911, + "loss": 0.5426, + "step": 76910 + }, + { + "epoch": 3.820403297904043, + "grad_norm": 0.09619140625, + "learning_rate": 0.0004943717095460416, + "loss": 0.515, + "step": 76920 + }, + { + "epoch": 3.820899970199662, + "grad_norm": 0.11328125, + "learning_rate": 0.000494331975762392, + "loss": 0.5162, + "step": 76930 + }, + { + "epoch": 3.821396642495282, + "grad_norm": 0.1455078125, + "learning_rate": 0.0004942922419787425, + "loss": 0.5413, + "step": 76940 + }, + { + "epoch": 3.821893314790901, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004942525081950929, + "loss": 0.5547, + "step": 76950 + }, + { + "epoch": 3.8223899870865203, + "grad_norm": 0.189453125, + "learning_rate": 0.0004942127744114433, + "loss": 0.5419, + "step": 76960 + }, + { + "epoch": 3.8228866593821396, + "grad_norm": 0.126953125, + "learning_rate": 0.0004941730406277939, + "loss": 0.5224, + "step": 76970 + }, + { + "epoch": 3.823383331677759, + "grad_norm": 0.1494140625, + "learning_rate": 0.0004941333068441442, + "loss": 0.5259, + "step": 76980 + }, + { + "epoch": 3.8238800039733785, + "grad_norm": 0.1015625, + "learning_rate": 0.0004940935730604947, + "loss": 0.5216, + "step": 76990 + }, + { + "epoch": 3.8243766762689977, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004940538392768452, + "loss": 0.5639, + "step": 77000 + }, + { + "epoch": 3.824873348564617, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004940141054931955, + "loss": 0.5427, + "step": 77010 + }, + { + "epoch": 3.8253700208602366, + "grad_norm": 0.12109375, + "learning_rate": 0.0004939743717095461, + "loss": 0.5237, + "step": 77020 + }, + { + "epoch": 3.825866693155856, + "grad_norm": 0.10546875, + "learning_rate": 0.0004939346379258966, + "loss": 0.5508, + "step": 77030 + }, + { + "epoch": 3.826363365451475, + "grad_norm": 0.177734375, + "learning_rate": 0.0004938949041422469, + "loss": 0.5257, + "step": 77040 + }, + { + "epoch": 3.8268600377470943, + "grad_norm": 0.10107421875, + "learning_rate": 0.0004938551703585974, + "loss": 0.5381, + "step": 77050 + }, + { + "epoch": 3.827356710042714, + "grad_norm": 0.1650390625, + "learning_rate": 0.0004938154365749478, + "loss": 0.5421, + "step": 77060 + }, + { + "epoch": 3.827853382338333, + "grad_norm": 0.103515625, + "learning_rate": 0.0004937757027912984, + "loss": 0.525, + "step": 77070 + }, + { + "epoch": 3.8283500546339524, + "grad_norm": 0.109375, + "learning_rate": 0.0004937359690076488, + "loss": 0.5108, + "step": 77080 + }, + { + "epoch": 3.828846726929572, + "grad_norm": 0.103515625, + "learning_rate": 0.0004936962352239992, + "loss": 0.5251, + "step": 77090 + }, + { + "epoch": 3.8293433992251913, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004936565014403497, + "loss": 0.5566, + "step": 77100 + }, + { + "epoch": 3.8298400715208105, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004936167676567001, + "loss": 0.5185, + "step": 77110 + }, + { + "epoch": 3.8303367438164297, + "grad_norm": 0.099609375, + "learning_rate": 0.0004935770338730506, + "loss": 0.5233, + "step": 77120 + }, + { + "epoch": 3.8308334161120494, + "grad_norm": 0.11669921875, + "learning_rate": 0.0004935373000894011, + "loss": 0.5356, + "step": 77130 + }, + { + "epoch": 3.8313300884076686, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004934975663057515, + "loss": 0.5334, + "step": 77140 + }, + { + "epoch": 3.831826760703288, + "grad_norm": 0.09912109375, + "learning_rate": 0.0004934578325221019, + "loss": 0.5327, + "step": 77150 + }, + { + "epoch": 3.8323234329989075, + "grad_norm": 0.10107421875, + "learning_rate": 0.0004934180987384524, + "loss": 0.5342, + "step": 77160 + }, + { + "epoch": 3.8328201052945268, + "grad_norm": 0.12158203125, + "learning_rate": 0.0004933783649548029, + "loss": 0.5361, + "step": 77170 + }, + { + "epoch": 3.833316777590146, + "grad_norm": 0.103515625, + "learning_rate": 0.0004933386311711533, + "loss": 0.5545, + "step": 77180 + }, + { + "epoch": 3.8338134498857652, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004932988973875038, + "loss": 0.5256, + "step": 77190 + }, + { + "epoch": 3.8343101221813845, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004932591636038541, + "loss": 0.5355, + "step": 77200 + }, + { + "epoch": 3.834806794477004, + "grad_norm": 0.0966796875, + "learning_rate": 0.0004932194298202046, + "loss": 0.5366, + "step": 77210 + }, + { + "epoch": 3.8353034667726233, + "grad_norm": 0.1318359375, + "learning_rate": 0.0004931796960365552, + "loss": 0.5469, + "step": 77220 + }, + { + "epoch": 3.835800139068243, + "grad_norm": 0.1533203125, + "learning_rate": 0.0004931399622529056, + "loss": 0.5382, + "step": 77230 + }, + { + "epoch": 3.8362968113638622, + "grad_norm": 0.11669921875, + "learning_rate": 0.000493100228469256, + "loss": 0.5042, + "step": 77240 + }, + { + "epoch": 3.8367934836594815, + "grad_norm": 0.1396484375, + "learning_rate": 0.0004930604946856064, + "loss": 0.5104, + "step": 77250 + }, + { + "epoch": 3.8372901559551007, + "grad_norm": 0.1640625, + "learning_rate": 0.0004930207609019569, + "loss": 0.5509, + "step": 77260 + }, + { + "epoch": 3.83778682825072, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004929810271183074, + "loss": 0.5261, + "step": 77270 + }, + { + "epoch": 3.8382835005463396, + "grad_norm": 0.10791015625, + "learning_rate": 0.0004929412933346578, + "loss": 0.5002, + "step": 77280 + }, + { + "epoch": 3.838780172841959, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004929015595510083, + "loss": 0.5322, + "step": 77290 + }, + { + "epoch": 3.8392768451375785, + "grad_norm": 0.11865234375, + "learning_rate": 0.0004928618257673587, + "loss": 0.5219, + "step": 77300 + }, + { + "epoch": 3.8397735174331977, + "grad_norm": 0.13671875, + "learning_rate": 0.0004928220919837091, + "loss": 0.5465, + "step": 77310 + }, + { + "epoch": 3.840270189728817, + "grad_norm": 0.1015625, + "learning_rate": 0.0004927823582000597, + "loss": 0.5752, + "step": 77320 + }, + { + "epoch": 3.840766862024436, + "grad_norm": 0.1650390625, + "learning_rate": 0.0004927426244164101, + "loss": 0.5246, + "step": 77330 + }, + { + "epoch": 3.8412635343200554, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004927028906327605, + "loss": 0.5423, + "step": 77340 + }, + { + "epoch": 3.841760206615675, + "grad_norm": 0.0947265625, + "learning_rate": 0.000492663156849111, + "loss": 0.5225, + "step": 77350 + }, + { + "epoch": 3.8422568789112943, + "grad_norm": 0.099609375, + "learning_rate": 0.0004926234230654614, + "loss": 0.5334, + "step": 77360 + }, + { + "epoch": 3.8427535512069135, + "grad_norm": 0.09130859375, + "learning_rate": 0.0004925836892818119, + "loss": 0.505, + "step": 77370 + }, + { + "epoch": 3.843250223502533, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004925439554981624, + "loss": 0.5224, + "step": 77380 + }, + { + "epoch": 3.8437468957981524, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004925042217145128, + "loss": 0.5149, + "step": 77390 + }, + { + "epoch": 3.8442435680937717, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004924644879308632, + "loss": 0.5327, + "step": 77400 + }, + { + "epoch": 3.844740240389391, + "grad_norm": 0.0966796875, + "learning_rate": 0.0004924247541472137, + "loss": 0.5212, + "step": 77410 + }, + { + "epoch": 3.8452369126850106, + "grad_norm": 0.0966796875, + "learning_rate": 0.0004923850203635642, + "loss": 0.5236, + "step": 77420 + }, + { + "epoch": 3.84573358498063, + "grad_norm": 0.11865234375, + "learning_rate": 0.0004923452865799146, + "loss": 0.5518, + "step": 77430 + }, + { + "epoch": 3.846230257276249, + "grad_norm": 0.111328125, + "learning_rate": 0.0004923055527962651, + "loss": 0.5539, + "step": 77440 + }, + { + "epoch": 3.8467269295718687, + "grad_norm": 0.12890625, + "learning_rate": 0.0004922658190126155, + "loss": 0.5373, + "step": 77450 + }, + { + "epoch": 3.847223601867488, + "grad_norm": 0.10888671875, + "learning_rate": 0.0004922260852289659, + "loss": 0.5529, + "step": 77460 + }, + { + "epoch": 3.847720274163107, + "grad_norm": 0.09716796875, + "learning_rate": 0.0004921863514453164, + "loss": 0.5682, + "step": 77470 + }, + { + "epoch": 3.8482169464587264, + "grad_norm": 0.09912109375, + "learning_rate": 0.0004921466176616669, + "loss": 0.5279, + "step": 77480 + }, + { + "epoch": 3.848713618754346, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004921068838780173, + "loss": 0.5535, + "step": 77490 + }, + { + "epoch": 3.8492102910499653, + "grad_norm": 0.12890625, + "learning_rate": 0.0004920671500943677, + "loss": 0.5249, + "step": 77500 + }, + { + "epoch": 3.8497069633455845, + "grad_norm": 0.138671875, + "learning_rate": 0.0004920274163107182, + "loss": 0.5058, + "step": 77510 + }, + { + "epoch": 3.850203635641204, + "grad_norm": 0.2138671875, + "learning_rate": 0.0004919876825270687, + "loss": 0.5522, + "step": 77520 + }, + { + "epoch": 3.8507003079368234, + "grad_norm": 0.1044921875, + "learning_rate": 0.0004919479487434191, + "loss": 0.5331, + "step": 77530 + }, + { + "epoch": 3.8511969802324426, + "grad_norm": 0.138671875, + "learning_rate": 0.0004919082149597696, + "loss": 0.5452, + "step": 77540 + }, + { + "epoch": 3.851693652528062, + "grad_norm": 0.09912109375, + "learning_rate": 0.00049186848117612, + "loss": 0.5229, + "step": 77550 + }, + { + "epoch": 3.852190324823681, + "grad_norm": 0.1376953125, + "learning_rate": 0.0004918287473924704, + "loss": 0.5111, + "step": 77560 + }, + { + "epoch": 3.8526869971193007, + "grad_norm": 0.11669921875, + "learning_rate": 0.000491789013608821, + "loss": 0.5245, + "step": 77570 + }, + { + "epoch": 3.85318366941492, + "grad_norm": 0.12451171875, + "learning_rate": 0.0004917492798251714, + "loss": 0.5325, + "step": 77580 + }, + { + "epoch": 3.8536803417105396, + "grad_norm": 0.177734375, + "learning_rate": 0.0004917095460415218, + "loss": 0.5324, + "step": 77590 + }, + { + "epoch": 3.854177014006159, + "grad_norm": 0.12060546875, + "learning_rate": 0.0004916698122578723, + "loss": 0.523, + "step": 77600 + }, + { + "epoch": 3.854673686301778, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004916300784742227, + "loss": 0.5764, + "step": 77610 + }, + { + "epoch": 3.8551703585973973, + "grad_norm": 0.1494140625, + "learning_rate": 0.0004915903446905732, + "loss": 0.5273, + "step": 77620 + }, + { + "epoch": 3.8556670308930165, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004915506109069237, + "loss": 0.5281, + "step": 77630 + }, + { + "epoch": 3.856163703188636, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004915108771232741, + "loss": 0.5493, + "step": 77640 + }, + { + "epoch": 3.8566603754842554, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004914711433396245, + "loss": 0.53, + "step": 77650 + }, + { + "epoch": 3.857157047779875, + "grad_norm": 0.09716796875, + "learning_rate": 0.0004914314095559749, + "loss": 0.5514, + "step": 77660 + }, + { + "epoch": 3.8576537200754943, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004913916757723255, + "loss": 0.5638, + "step": 77670 + }, + { + "epoch": 3.8581503923711136, + "grad_norm": 0.12060546875, + "learning_rate": 0.000491351941988676, + "loss": 0.5153, + "step": 77680 + }, + { + "epoch": 3.858647064666733, + "grad_norm": 0.11328125, + "learning_rate": 0.0004913122082050263, + "loss": 0.5267, + "step": 77690 + }, + { + "epoch": 3.859143736962352, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004912724744213768, + "loss": 0.563, + "step": 77700 + }, + { + "epoch": 3.8596404092579717, + "grad_norm": 0.12890625, + "learning_rate": 0.0004912327406377273, + "loss": 0.5209, + "step": 77710 + }, + { + "epoch": 3.860137081553591, + "grad_norm": 0.125, + "learning_rate": 0.0004911930068540776, + "loss": 0.5381, + "step": 77720 + }, + { + "epoch": 3.86063375384921, + "grad_norm": 0.1376953125, + "learning_rate": 0.0004911532730704282, + "loss": 0.5184, + "step": 77730 + }, + { + "epoch": 3.86113042614483, + "grad_norm": 0.142578125, + "learning_rate": 0.0004911135392867786, + "loss": 0.5145, + "step": 77740 + }, + { + "epoch": 3.861627098440449, + "grad_norm": 0.1123046875, + "learning_rate": 0.000491073805503129, + "loss": 0.5214, + "step": 77750 + }, + { + "epoch": 3.8621237707360683, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004910340717194795, + "loss": 0.5344, + "step": 77760 + }, + { + "epoch": 3.8626204430316875, + "grad_norm": 0.1201171875, + "learning_rate": 0.00049099433793583, + "loss": 0.5234, + "step": 77770 + }, + { + "epoch": 3.863117115327307, + "grad_norm": 0.18359375, + "learning_rate": 0.0004909546041521804, + "loss": 0.5339, + "step": 77780 + }, + { + "epoch": 3.8636137876229264, + "grad_norm": 0.1015625, + "learning_rate": 0.0004909148703685309, + "loss": 0.5405, + "step": 77790 + }, + { + "epoch": 3.8641104599185456, + "grad_norm": 0.12060546875, + "learning_rate": 0.0004908751365848813, + "loss": 0.5536, + "step": 77800 + }, + { + "epoch": 3.8646071322141653, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004908354028012318, + "loss": 0.5374, + "step": 77810 + }, + { + "epoch": 3.8651038045097845, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004907956690175823, + "loss": 0.5438, + "step": 77820 + }, + { + "epoch": 3.8656004768054038, + "grad_norm": 0.119140625, + "learning_rate": 0.0004907559352339327, + "loss": 0.5263, + "step": 77830 + }, + { + "epoch": 3.866097149101023, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004907162014502832, + "loss": 0.5228, + "step": 77840 + }, + { + "epoch": 3.8665938213966426, + "grad_norm": 0.205078125, + "learning_rate": 0.0004906764676666335, + "loss": 0.5644, + "step": 77850 + }, + { + "epoch": 3.867090493692262, + "grad_norm": 0.1259765625, + "learning_rate": 0.000490636733882984, + "loss": 0.5252, + "step": 77860 + }, + { + "epoch": 3.867587165987881, + "grad_norm": 0.11474609375, + "learning_rate": 0.0004905970000993346, + "loss": 0.5538, + "step": 77870 + }, + { + "epoch": 3.8680838382835008, + "grad_norm": 0.1318359375, + "learning_rate": 0.0004905572663156849, + "loss": 0.5115, + "step": 77880 + }, + { + "epoch": 3.86858051057912, + "grad_norm": 0.1171875, + "learning_rate": 0.0004905175325320354, + "loss": 0.5239, + "step": 77890 + }, + { + "epoch": 3.8690771828747392, + "grad_norm": 0.111328125, + "learning_rate": 0.0004904777987483859, + "loss": 0.4906, + "step": 77900 + }, + { + "epoch": 3.8695738551703585, + "grad_norm": 0.11669921875, + "learning_rate": 0.0004904380649647362, + "loss": 0.551, + "step": 77910 + }, + { + "epoch": 3.8700705274659777, + "grad_norm": 0.14453125, + "learning_rate": 0.0004903983311810867, + "loss": 0.5334, + "step": 77920 + }, + { + "epoch": 3.8705671997615974, + "grad_norm": 0.0947265625, + "learning_rate": 0.0004903585973974372, + "loss": 0.5479, + "step": 77930 + }, + { + "epoch": 3.8710638720572166, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004903188636137876, + "loss": 0.5058, + "step": 77940 + }, + { + "epoch": 3.8715605443528363, + "grad_norm": 0.1171875, + "learning_rate": 0.0004902791298301381, + "loss": 0.54, + "step": 77950 + }, + { + "epoch": 3.8720572166484555, + "grad_norm": 0.111328125, + "learning_rate": 0.0004902393960464885, + "loss": 0.533, + "step": 77960 + }, + { + "epoch": 3.8725538889440747, + "grad_norm": 0.109375, + "learning_rate": 0.000490199662262839, + "loss": 0.5098, + "step": 77970 + }, + { + "epoch": 3.873050561239694, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004901599284791895, + "loss": 0.5435, + "step": 77980 + }, + { + "epoch": 3.873547233535313, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004901201946955399, + "loss": 0.5652, + "step": 77990 + }, + { + "epoch": 3.874043905830933, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004900804609118904, + "loss": 0.5213, + "step": 78000 + }, + { + "epoch": 3.874540578126552, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004900407271282408, + "loss": 0.5487, + "step": 78010 + }, + { + "epoch": 3.8750372504221713, + "grad_norm": 0.1328125, + "learning_rate": 0.0004900009933445912, + "loss": 0.5225, + "step": 78020 + }, + { + "epoch": 3.875533922717791, + "grad_norm": 0.11328125, + "learning_rate": 0.0004899612595609418, + "loss": 0.5417, + "step": 78030 + }, + { + "epoch": 3.87603059501341, + "grad_norm": 0.1015625, + "learning_rate": 0.0004899215257772922, + "loss": 0.5715, + "step": 78040 + }, + { + "epoch": 3.8765272673090294, + "grad_norm": 0.1318359375, + "learning_rate": 0.0004898817919936426, + "loss": 0.5074, + "step": 78050 + }, + { + "epoch": 3.8770239396046486, + "grad_norm": 0.126953125, + "learning_rate": 0.0004898420582099931, + "loss": 0.5138, + "step": 78060 + }, + { + "epoch": 3.8775206119002683, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004898023244263434, + "loss": 0.513, + "step": 78070 + }, + { + "epoch": 3.8780172841958875, + "grad_norm": 0.1318359375, + "learning_rate": 0.000489762590642694, + "loss": 0.5218, + "step": 78080 + }, + { + "epoch": 3.8785139564915068, + "grad_norm": 0.10888671875, + "learning_rate": 0.0004897228568590445, + "loss": 0.5516, + "step": 78090 + }, + { + "epoch": 3.8790106287871264, + "grad_norm": 0.1689453125, + "learning_rate": 0.0004896831230753948, + "loss": 0.5456, + "step": 78100 + }, + { + "epoch": 3.8795073010827457, + "grad_norm": 0.10546875, + "learning_rate": 0.0004896433892917453, + "loss": 0.5356, + "step": 78110 + }, + { + "epoch": 3.880003973378365, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004896036555080957, + "loss": 0.5203, + "step": 78120 + }, + { + "epoch": 3.880500645673984, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004895639217244463, + "loss": 0.5382, + "step": 78130 + }, + { + "epoch": 3.880997317969604, + "grad_norm": 0.0966796875, + "learning_rate": 0.0004895241879407967, + "loss": 0.5274, + "step": 78140 + }, + { + "epoch": 3.881493990265223, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004894844541571471, + "loss": 0.5278, + "step": 78150 + }, + { + "epoch": 3.8819906625608422, + "grad_norm": 0.130859375, + "learning_rate": 0.0004894447203734976, + "loss": 0.5564, + "step": 78160 + }, + { + "epoch": 3.882487334856462, + "grad_norm": 0.11181640625, + "learning_rate": 0.000489404986589848, + "loss": 0.5444, + "step": 78170 + }, + { + "epoch": 3.882984007152081, + "grad_norm": 0.099609375, + "learning_rate": 0.0004893652528061985, + "loss": 0.5267, + "step": 78180 + }, + { + "epoch": 3.8834806794477004, + "grad_norm": 0.10400390625, + "learning_rate": 0.000489325519022549, + "loss": 0.5116, + "step": 78190 + }, + { + "epoch": 3.8839773517433196, + "grad_norm": 0.10791015625, + "learning_rate": 0.0004892857852388994, + "loss": 0.5024, + "step": 78200 + }, + { + "epoch": 3.8844740240389393, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004892460514552498, + "loss": 0.5304, + "step": 78210 + }, + { + "epoch": 3.8849706963345585, + "grad_norm": 0.12890625, + "learning_rate": 0.0004892063176716003, + "loss": 0.5474, + "step": 78220 + }, + { + "epoch": 3.8854673686301777, + "grad_norm": 0.158203125, + "learning_rate": 0.0004891665838879508, + "loss": 0.5243, + "step": 78230 + }, + { + "epoch": 3.8859640409257974, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004891268501043012, + "loss": 0.5531, + "step": 78240 + }, + { + "epoch": 3.8864607132214166, + "grad_norm": 0.109375, + "learning_rate": 0.0004890871163206517, + "loss": 0.5401, + "step": 78250 + }, + { + "epoch": 3.886957385517036, + "grad_norm": 0.14453125, + "learning_rate": 0.0004890473825370021, + "loss": 0.5397, + "step": 78260 + }, + { + "epoch": 3.887454057812655, + "grad_norm": 0.10302734375, + "learning_rate": 0.0004890076487533525, + "loss": 0.5461, + "step": 78270 + }, + { + "epoch": 3.8879507301082743, + "grad_norm": 0.09912109375, + "learning_rate": 0.0004889679149697031, + "loss": 0.5115, + "step": 78280 + }, + { + "epoch": 3.888447402403894, + "grad_norm": 0.12353515625, + "learning_rate": 0.0004889281811860535, + "loss": 0.519, + "step": 78290 + }, + { + "epoch": 3.888944074699513, + "grad_norm": 0.1083984375, + "learning_rate": 0.0004888884474024039, + "loss": 0.5557, + "step": 78300 + }, + { + "epoch": 3.889440746995133, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004888487136187543, + "loss": 0.5311, + "step": 78310 + }, + { + "epoch": 3.889937419290752, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004888089798351048, + "loss": 0.5394, + "step": 78320 + }, + { + "epoch": 3.8904340915863713, + "grad_norm": 0.10546875, + "learning_rate": 0.0004887692460514553, + "loss": 0.5154, + "step": 78330 + }, + { + "epoch": 3.8909307638819906, + "grad_norm": 0.09912109375, + "learning_rate": 0.0004887295122678057, + "loss": 0.5444, + "step": 78340 + }, + { + "epoch": 3.8914274361776098, + "grad_norm": 0.1328125, + "learning_rate": 0.0004886897784841562, + "loss": 0.5237, + "step": 78350 + }, + { + "epoch": 3.8919241084732294, + "grad_norm": 0.111328125, + "learning_rate": 0.0004886500447005066, + "loss": 0.5175, + "step": 78360 + }, + { + "epoch": 3.8924207807688487, + "grad_norm": 0.10986328125, + "learning_rate": 0.000488610310916857, + "loss": 0.5257, + "step": 78370 + }, + { + "epoch": 3.892917453064468, + "grad_norm": 0.10107421875, + "learning_rate": 0.0004885705771332076, + "loss": 0.5148, + "step": 78380 + }, + { + "epoch": 3.8934141253600876, + "grad_norm": 0.1083984375, + "learning_rate": 0.000488530843349558, + "loss": 0.5445, + "step": 78390 + }, + { + "epoch": 3.893910797655707, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004884911095659084, + "loss": 0.5255, + "step": 78400 + }, + { + "epoch": 3.894407469951326, + "grad_norm": 0.123046875, + "learning_rate": 0.0004884513757822589, + "loss": 0.538, + "step": 78410 + }, + { + "epoch": 3.8949041422469453, + "grad_norm": 0.12255859375, + "learning_rate": 0.0004884116419986093, + "loss": 0.5602, + "step": 78420 + }, + { + "epoch": 3.895400814542565, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004883719082149598, + "loss": 0.5426, + "step": 78430 + }, + { + "epoch": 3.895897486838184, + "grad_norm": 0.10009765625, + "learning_rate": 0.0004883321744313103, + "loss": 0.5264, + "step": 78440 + }, + { + "epoch": 3.8963941591338034, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004882924406476607, + "loss": 0.5425, + "step": 78450 + }, + { + "epoch": 3.896890831429423, + "grad_norm": 0.1767578125, + "learning_rate": 0.0004882527068640111, + "loss": 0.5515, + "step": 78460 + }, + { + "epoch": 3.8973875037250423, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004882129730803616, + "loss": 0.5495, + "step": 78470 + }, + { + "epoch": 3.8978841760206615, + "grad_norm": 0.10693359375, + "learning_rate": 0.00048817323929671206, + "loss": 0.5271, + "step": 78480 + }, + { + "epoch": 3.8983808483162807, + "grad_norm": 0.166015625, + "learning_rate": 0.0004881335055130625, + "loss": 0.5404, + "step": 78490 + }, + { + "epoch": 3.8988775206119004, + "grad_norm": 0.1142578125, + "learning_rate": 0.00048809377172941295, + "loss": 0.5227, + "step": 78500 + }, + { + "epoch": 3.8993741929075196, + "grad_norm": 0.099609375, + "learning_rate": 0.0004880540379457634, + "loss": 0.5285, + "step": 78510 + }, + { + "epoch": 3.899870865203139, + "grad_norm": 0.09521484375, + "learning_rate": 0.00048801430416211384, + "loss": 0.5331, + "step": 78520 + }, + { + "epoch": 3.9003675374987585, + "grad_norm": 0.1455078125, + "learning_rate": 0.0004879745703784643, + "loss": 0.5292, + "step": 78530 + }, + { + "epoch": 3.9008642097943778, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004879348365948148, + "loss": 0.4952, + "step": 78540 + }, + { + "epoch": 3.901360882089997, + "grad_norm": 0.12353515625, + "learning_rate": 0.0004878951028111652, + "loss": 0.5172, + "step": 78550 + }, + { + "epoch": 3.901857554385616, + "grad_norm": 0.11572265625, + "learning_rate": 0.00048785536902751567, + "loss": 0.5235, + "step": 78560 + }, + { + "epoch": 3.902354226681236, + "grad_norm": 0.11328125, + "learning_rate": 0.00048781563524386614, + "loss": 0.5048, + "step": 78570 + }, + { + "epoch": 3.902850898976855, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004877759014602166, + "loss": 0.5437, + "step": 78580 + }, + { + "epoch": 3.9033475712724743, + "grad_norm": 0.1259765625, + "learning_rate": 0.00048773616767656703, + "loss": 0.5178, + "step": 78590 + }, + { + "epoch": 3.903844243568094, + "grad_norm": 0.10302734375, + "learning_rate": 0.0004876964338929175, + "loss": 0.5071, + "step": 78600 + }, + { + "epoch": 3.9043409158637132, + "grad_norm": 0.10205078125, + "learning_rate": 0.00048765670010926797, + "loss": 0.5203, + "step": 78610 + }, + { + "epoch": 3.9048375881593325, + "grad_norm": 0.1103515625, + "learning_rate": 0.00048761696632561833, + "loss": 0.5194, + "step": 78620 + }, + { + "epoch": 3.9053342604549517, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004875772325419688, + "loss": 0.515, + "step": 78630 + }, + { + "epoch": 3.905830932750571, + "grad_norm": 0.1337890625, + "learning_rate": 0.00048753749875831933, + "loss": 0.533, + "step": 78640 + }, + { + "epoch": 3.9063276050461906, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004874977649746697, + "loss": 0.5588, + "step": 78650 + }, + { + "epoch": 3.90682427734181, + "grad_norm": 0.10888671875, + "learning_rate": 0.00048745803119102017, + "loss": 0.5296, + "step": 78660 + }, + { + "epoch": 3.9073209496374295, + "grad_norm": 0.1396484375, + "learning_rate": 0.00048741829740737064, + "loss": 0.5109, + "step": 78670 + }, + { + "epoch": 3.9078176219330487, + "grad_norm": 0.12158203125, + "learning_rate": 0.00048737856362372105, + "loss": 0.5399, + "step": 78680 + }, + { + "epoch": 3.908314294228668, + "grad_norm": 0.10302734375, + "learning_rate": 0.0004873388298400715, + "loss": 0.509, + "step": 78690 + }, + { + "epoch": 3.908810966524287, + "grad_norm": 0.115234375, + "learning_rate": 0.000487299096056422, + "loss": 0.52, + "step": 78700 + }, + { + "epoch": 3.9093076388199064, + "grad_norm": 0.11328125, + "learning_rate": 0.00048725936227277247, + "loss": 0.5361, + "step": 78710 + }, + { + "epoch": 3.909804311115526, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004872196284891229, + "loss": 0.5012, + "step": 78720 + }, + { + "epoch": 3.9103009834111453, + "grad_norm": 0.1220703125, + "learning_rate": 0.00048717989470547336, + "loss": 0.5238, + "step": 78730 + }, + { + "epoch": 3.9107976557067645, + "grad_norm": 0.11865234375, + "learning_rate": 0.00048714016092182383, + "loss": 0.5273, + "step": 78740 + }, + { + "epoch": 3.911294328002384, + "grad_norm": 0.10546875, + "learning_rate": 0.00048710042713817424, + "loss": 0.532, + "step": 78750 + }, + { + "epoch": 3.9117910002980034, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004870606933545247, + "loss": 0.5236, + "step": 78760 + }, + { + "epoch": 3.9122876725936226, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004870209595708752, + "loss": 0.5262, + "step": 78770 + }, + { + "epoch": 3.912784344889242, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004869812257872256, + "loss": 0.5073, + "step": 78780 + }, + { + "epoch": 3.9132810171848615, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004869414920035761, + "loss": 0.5161, + "step": 78790 + }, + { + "epoch": 3.9137776894804808, + "grad_norm": 0.10400390625, + "learning_rate": 0.00048690175821992655, + "loss": 0.5226, + "step": 78800 + }, + { + "epoch": 3.9142743617761, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004868620244362769, + "loss": 0.5043, + "step": 78810 + }, + { + "epoch": 3.9147710340717197, + "grad_norm": 0.1064453125, + "learning_rate": 0.00048682229065262744, + "loss": 0.5119, + "step": 78820 + }, + { + "epoch": 3.915267706367339, + "grad_norm": 0.201171875, + "learning_rate": 0.0004867825568689779, + "loss": 0.5354, + "step": 78830 + }, + { + "epoch": 3.915764378662958, + "grad_norm": 0.09765625, + "learning_rate": 0.00048674282308532827, + "loss": 0.5004, + "step": 78840 + }, + { + "epoch": 3.9162610509585773, + "grad_norm": 0.1455078125, + "learning_rate": 0.00048670308930167874, + "loss": 0.5131, + "step": 78850 + }, + { + "epoch": 3.916757723254197, + "grad_norm": 0.09912109375, + "learning_rate": 0.00048666335551802927, + "loss": 0.5342, + "step": 78860 + }, + { + "epoch": 3.9172543955498162, + "grad_norm": 0.1376953125, + "learning_rate": 0.00048662362173437974, + "loss": 0.5517, + "step": 78870 + }, + { + "epoch": 3.9177510678454355, + "grad_norm": 0.10009765625, + "learning_rate": 0.0004865838879507301, + "loss": 0.5119, + "step": 78880 + }, + { + "epoch": 3.918247740141055, + "grad_norm": 0.08935546875, + "learning_rate": 0.00048654415416708057, + "loss": 0.5285, + "step": 78890 + }, + { + "epoch": 3.9187444124366744, + "grad_norm": 0.11181640625, + "learning_rate": 0.00048650442038343104, + "loss": 0.5466, + "step": 78900 + }, + { + "epoch": 3.9192410847322936, + "grad_norm": 0.095703125, + "learning_rate": 0.00048646468659978146, + "loss": 0.5246, + "step": 78910 + }, + { + "epoch": 3.919737757027913, + "grad_norm": 0.1337890625, + "learning_rate": 0.00048642495281613193, + "loss": 0.5565, + "step": 78920 + }, + { + "epoch": 3.9202344293235325, + "grad_norm": 0.10888671875, + "learning_rate": 0.0004863852190324824, + "loss": 0.5123, + "step": 78930 + }, + { + "epoch": 3.9207311016191517, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004863454852488328, + "loss": 0.522, + "step": 78940 + }, + { + "epoch": 3.921227773914771, + "grad_norm": 0.10107421875, + "learning_rate": 0.0004863057514651833, + "loss": 0.5093, + "step": 78950 + }, + { + "epoch": 3.9217244462103906, + "grad_norm": 0.11572265625, + "learning_rate": 0.00048626601768153376, + "loss": 0.511, + "step": 78960 + }, + { + "epoch": 3.92222111850601, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004862262838978842, + "loss": 0.5285, + "step": 78970 + }, + { + "epoch": 3.922717790801629, + "grad_norm": 0.1162109375, + "learning_rate": 0.00048618655011423465, + "loss": 0.5343, + "step": 78980 + }, + { + "epoch": 3.9232144630972483, + "grad_norm": 0.107421875, + "learning_rate": 0.0004861468163305851, + "loss": 0.5646, + "step": 78990 + }, + { + "epoch": 3.9237111353928675, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004861070825469356, + "loss": 0.5277, + "step": 79000 + }, + { + "epoch": 3.924207807688487, + "grad_norm": 0.1650390625, + "learning_rate": 0.000486067348763286, + "loss": 0.5393, + "step": 79010 + }, + { + "epoch": 3.9247044799841064, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004860276149796365, + "loss": 0.5405, + "step": 79020 + }, + { + "epoch": 3.925201152279726, + "grad_norm": 0.1005859375, + "learning_rate": 0.00048598788119598696, + "loss": 0.5444, + "step": 79030 + }, + { + "epoch": 3.9256978245753453, + "grad_norm": 0.115234375, + "learning_rate": 0.0004859481474123373, + "loss": 0.5462, + "step": 79040 + }, + { + "epoch": 3.9261944968709646, + "grad_norm": 0.0966796875, + "learning_rate": 0.00048590841362868784, + "loss": 0.5064, + "step": 79050 + }, + { + "epoch": 3.926691169166584, + "grad_norm": 0.09375, + "learning_rate": 0.0004858686798450383, + "loss": 0.519, + "step": 79060 + }, + { + "epoch": 3.927187841462203, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004858289460613887, + "loss": 0.5443, + "step": 79070 + }, + { + "epoch": 3.9276845137578227, + "grad_norm": 0.1630859375, + "learning_rate": 0.00048578921227773915, + "loss": 0.511, + "step": 79080 + }, + { + "epoch": 3.928181186053442, + "grad_norm": 0.10546875, + "learning_rate": 0.0004857494784940897, + "loss": 0.5439, + "step": 79090 + }, + { + "epoch": 3.928677858349061, + "grad_norm": 0.10302734375, + "learning_rate": 0.00048570974471044004, + "loss": 0.5368, + "step": 79100 + }, + { + "epoch": 3.929174530644681, + "grad_norm": 0.140625, + "learning_rate": 0.0004856700109267905, + "loss": 0.5425, + "step": 79110 + }, + { + "epoch": 3.9296712029403, + "grad_norm": 0.0986328125, + "learning_rate": 0.000485630277143141, + "loss": 0.5535, + "step": 79120 + }, + { + "epoch": 3.9301678752359193, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004855905433594914, + "loss": 0.5268, + "step": 79130 + }, + { + "epoch": 3.9306645475315385, + "grad_norm": 0.10009765625, + "learning_rate": 0.00048555080957584187, + "loss": 0.523, + "step": 79140 + }, + { + "epoch": 3.931161219827158, + "grad_norm": 0.1318359375, + "learning_rate": 0.00048551107579219234, + "loss": 0.537, + "step": 79150 + }, + { + "epoch": 3.9316578921227774, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004854713420085428, + "loss": 0.5496, + "step": 79160 + }, + { + "epoch": 3.9321545644183966, + "grad_norm": 0.1201171875, + "learning_rate": 0.00048543160822489323, + "loss": 0.5504, + "step": 79170 + }, + { + "epoch": 3.9326512367140163, + "grad_norm": 0.1630859375, + "learning_rate": 0.0004853918744412437, + "loss": 0.5735, + "step": 79180 + }, + { + "epoch": 3.9331479090096355, + "grad_norm": 0.1083984375, + "learning_rate": 0.00048535214065759417, + "loss": 0.5313, + "step": 79190 + }, + { + "epoch": 3.9336445813052547, + "grad_norm": 0.09814453125, + "learning_rate": 0.0004853124068739446, + "loss": 0.5333, + "step": 79200 + }, + { + "epoch": 3.934141253600874, + "grad_norm": 0.1337890625, + "learning_rate": 0.00048527267309029506, + "loss": 0.5218, + "step": 79210 + }, + { + "epoch": 3.9346379258964936, + "grad_norm": 0.1328125, + "learning_rate": 0.00048523293930664553, + "loss": 0.5266, + "step": 79220 + }, + { + "epoch": 3.935134598192113, + "grad_norm": 0.0966796875, + "learning_rate": 0.0004851932055229959, + "loss": 0.5356, + "step": 79230 + }, + { + "epoch": 3.935631270487732, + "grad_norm": 0.125, + "learning_rate": 0.0004851534717393464, + "loss": 0.5371, + "step": 79240 + }, + { + "epoch": 3.9361279427833518, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004851137379556969, + "loss": 0.509, + "step": 79250 + }, + { + "epoch": 3.936624615078971, + "grad_norm": 0.109375, + "learning_rate": 0.00048507400417204725, + "loss": 0.5251, + "step": 79260 + }, + { + "epoch": 3.93712128737459, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004850342703883977, + "loss": 0.5085, + "step": 79270 + }, + { + "epoch": 3.9376179596702094, + "grad_norm": 0.125, + "learning_rate": 0.00048499453660474825, + "loss": 0.5571, + "step": 79280 + }, + { + "epoch": 3.9381146319658287, + "grad_norm": 0.138671875, + "learning_rate": 0.0004849548028210986, + "loss": 0.5364, + "step": 79290 + }, + { + "epoch": 3.9386113042614483, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004849150690374491, + "loss": 0.5451, + "step": 79300 + }, + { + "epoch": 3.9391079765570676, + "grad_norm": 0.1025390625, + "learning_rate": 0.00048487533525379956, + "loss": 0.5003, + "step": 79310 + }, + { + "epoch": 3.9396046488526872, + "grad_norm": 0.1484375, + "learning_rate": 0.0004848356014701501, + "loss": 0.5263, + "step": 79320 + }, + { + "epoch": 3.9401013211483065, + "grad_norm": 0.10546875, + "learning_rate": 0.00048479586768650045, + "loss": 0.523, + "step": 79330 + }, + { + "epoch": 3.9405979934439257, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004847561339028509, + "loss": 0.5346, + "step": 79340 + }, + { + "epoch": 3.941094665739545, + "grad_norm": 0.11474609375, + "learning_rate": 0.0004847164001192014, + "loss": 0.5233, + "step": 79350 + }, + { + "epoch": 3.941591338035164, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004846766663355518, + "loss": 0.5381, + "step": 79360 + }, + { + "epoch": 3.942088010330784, + "grad_norm": 0.11328125, + "learning_rate": 0.0004846369325519023, + "loss": 0.5275, + "step": 79370 + }, + { + "epoch": 3.942584682626403, + "grad_norm": 0.109375, + "learning_rate": 0.00048459719876825275, + "loss": 0.5336, + "step": 79380 + }, + { + "epoch": 3.9430813549220227, + "grad_norm": 0.12353515625, + "learning_rate": 0.00048455746498460316, + "loss": 0.5401, + "step": 79390 + }, + { + "epoch": 3.943578027217642, + "grad_norm": 0.11083984375, + "learning_rate": 0.00048451773120095364, + "loss": 0.524, + "step": 79400 + }, + { + "epoch": 3.944074699513261, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004844779974173041, + "loss": 0.514, + "step": 79410 + }, + { + "epoch": 3.9445713718088804, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004844382636336545, + "loss": 0.5181, + "step": 79420 + }, + { + "epoch": 3.9450680441044996, + "grad_norm": 0.12353515625, + "learning_rate": 0.000484398529850005, + "loss": 0.5247, + "step": 79430 + }, + { + "epoch": 3.9455647164001193, + "grad_norm": 0.09716796875, + "learning_rate": 0.00048435879606635547, + "loss": 0.5204, + "step": 79440 + }, + { + "epoch": 3.9460613886957385, + "grad_norm": 0.10400390625, + "learning_rate": 0.00048431906228270594, + "loss": 0.4897, + "step": 79450 + }, + { + "epoch": 3.9465580609913578, + "grad_norm": 0.1748046875, + "learning_rate": 0.0004842793284990563, + "loss": 0.5227, + "step": 79460 + }, + { + "epoch": 3.9470547332869774, + "grad_norm": 0.130859375, + "learning_rate": 0.00048423959471540683, + "loss": 0.5644, + "step": 79470 + }, + { + "epoch": 3.9475514055825967, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004841998609317573, + "loss": 0.5391, + "step": 79480 + }, + { + "epoch": 3.948048077878216, + "grad_norm": 0.1083984375, + "learning_rate": 0.00048416012714810766, + "loss": 0.5092, + "step": 79490 + }, + { + "epoch": 3.948544750173835, + "grad_norm": 0.1044921875, + "learning_rate": 0.00048412039336445813, + "loss": 0.5233, + "step": 79500 + }, + { + "epoch": 3.9490414224694548, + "grad_norm": 0.1083984375, + "learning_rate": 0.00048408065958080866, + "loss": 0.5487, + "step": 79510 + }, + { + "epoch": 3.949538094765074, + "grad_norm": 0.1240234375, + "learning_rate": 0.000484040925797159, + "loss": 0.5287, + "step": 79520 + }, + { + "epoch": 3.9500347670606932, + "grad_norm": 0.107421875, + "learning_rate": 0.0004840011920135095, + "loss": 0.5458, + "step": 79530 + }, + { + "epoch": 3.950531439356313, + "grad_norm": 0.138671875, + "learning_rate": 0.00048396145822985996, + "loss": 0.522, + "step": 79540 + }, + { + "epoch": 3.951028111651932, + "grad_norm": 0.115234375, + "learning_rate": 0.0004839217244462104, + "loss": 0.5134, + "step": 79550 + }, + { + "epoch": 3.9515247839475514, + "grad_norm": 0.1083984375, + "learning_rate": 0.00048388199066256085, + "loss": 0.5682, + "step": 79560 + }, + { + "epoch": 3.9520214562431706, + "grad_norm": 0.099609375, + "learning_rate": 0.0004838422568789113, + "loss": 0.4989, + "step": 79570 + }, + { + "epoch": 3.9525181285387903, + "grad_norm": 0.1572265625, + "learning_rate": 0.00048380252309526174, + "loss": 0.5383, + "step": 79580 + }, + { + "epoch": 3.9530148008344095, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004837627893116122, + "loss": 0.5253, + "step": 79590 + }, + { + "epoch": 3.9535114731300287, + "grad_norm": 0.095703125, + "learning_rate": 0.0004837230555279627, + "loss": 0.5204, + "step": 79600 + }, + { + "epoch": 3.9540081454256484, + "grad_norm": 0.1318359375, + "learning_rate": 0.00048368332174431316, + "loss": 0.5299, + "step": 79610 + }, + { + "epoch": 3.9545048177212676, + "grad_norm": 0.1416015625, + "learning_rate": 0.00048364358796066357, + "loss": 0.5158, + "step": 79620 + }, + { + "epoch": 3.955001490016887, + "grad_norm": 0.1435546875, + "learning_rate": 0.00048360385417701404, + "loss": 0.5451, + "step": 79630 + }, + { + "epoch": 3.955498162312506, + "grad_norm": 0.12158203125, + "learning_rate": 0.0004835641203933645, + "loss": 0.5338, + "step": 79640 + }, + { + "epoch": 3.9559948346081253, + "grad_norm": 0.1318359375, + "learning_rate": 0.00048352438660971493, + "loss": 0.5416, + "step": 79650 + }, + { + "epoch": 3.956491506903745, + "grad_norm": 0.109375, + "learning_rate": 0.0004834846528260654, + "loss": 0.5345, + "step": 79660 + }, + { + "epoch": 3.956988179199364, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004834449190424159, + "loss": 0.549, + "step": 79670 + }, + { + "epoch": 3.957484851494984, + "grad_norm": 0.11962890625, + "learning_rate": 0.00048340518525876624, + "loss": 0.5493, + "step": 79680 + }, + { + "epoch": 3.957981523790603, + "grad_norm": 0.10546875, + "learning_rate": 0.00048336545147511676, + "loss": 0.5484, + "step": 79690 + }, + { + "epoch": 3.9584781960862223, + "grad_norm": 0.11669921875, + "learning_rate": 0.00048332571769146723, + "loss": 0.5111, + "step": 79700 + }, + { + "epoch": 3.9589748683818415, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004832859839078176, + "loss": 0.5525, + "step": 79710 + }, + { + "epoch": 3.9594715406774608, + "grad_norm": 0.12060546875, + "learning_rate": 0.00048324625012416807, + "loss": 0.516, + "step": 79720 + }, + { + "epoch": 3.9599682129730804, + "grad_norm": 0.1376953125, + "learning_rate": 0.00048320651634051854, + "loss": 0.5382, + "step": 79730 + }, + { + "epoch": 3.9604648852686997, + "grad_norm": 0.10546875, + "learning_rate": 0.00048316678255686896, + "loss": 0.5643, + "step": 79740 + }, + { + "epoch": 3.9609615575643193, + "grad_norm": 0.1123046875, + "learning_rate": 0.00048312704877321943, + "loss": 0.5073, + "step": 79750 + }, + { + "epoch": 3.9614582298599386, + "grad_norm": 0.1171875, + "learning_rate": 0.0004830873149895699, + "loss": 0.5077, + "step": 79760 + }, + { + "epoch": 3.961954902155558, + "grad_norm": 0.123046875, + "learning_rate": 0.00048304758120592037, + "loss": 0.5649, + "step": 79770 + }, + { + "epoch": 3.962451574451177, + "grad_norm": 0.0966796875, + "learning_rate": 0.0004830078474222708, + "loss": 0.5168, + "step": 79780 + }, + { + "epoch": 3.9629482467467962, + "grad_norm": 0.09912109375, + "learning_rate": 0.00048296811363862126, + "loss": 0.5375, + "step": 79790 + }, + { + "epoch": 3.963444919042416, + "grad_norm": 0.1494140625, + "learning_rate": 0.00048292837985497173, + "loss": 0.5512, + "step": 79800 + }, + { + "epoch": 3.963941591338035, + "grad_norm": 0.10009765625, + "learning_rate": 0.00048288864607132215, + "loss": 0.5251, + "step": 79810 + }, + { + "epoch": 3.9644382636336544, + "grad_norm": 0.1865234375, + "learning_rate": 0.0004828489122876726, + "loss": 0.5405, + "step": 79820 + }, + { + "epoch": 3.964934935929274, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004828091785040231, + "loss": 0.5521, + "step": 79830 + }, + { + "epoch": 3.9654316082248933, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004827694447203735, + "loss": 0.5328, + "step": 79840 + }, + { + "epoch": 3.9659282805205125, + "grad_norm": 0.12060546875, + "learning_rate": 0.000482729710936724, + "loss": 0.5432, + "step": 79850 + }, + { + "epoch": 3.9664249528161317, + "grad_norm": 0.14453125, + "learning_rate": 0.00048268997715307445, + "loss": 0.542, + "step": 79860 + }, + { + "epoch": 3.9669216251117514, + "grad_norm": 0.138671875, + "learning_rate": 0.0004826502433694248, + "loss": 0.5276, + "step": 79870 + }, + { + "epoch": 3.9674182974073706, + "grad_norm": 0.0986328125, + "learning_rate": 0.00048261050958577534, + "loss": 0.5499, + "step": 79880 + }, + { + "epoch": 3.96791496970299, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004825707758021258, + "loss": 0.5069, + "step": 79890 + }, + { + "epoch": 3.9684116419986095, + "grad_norm": 0.10302734375, + "learning_rate": 0.0004825310420184763, + "loss": 0.5312, + "step": 79900 + }, + { + "epoch": 3.9689083142942287, + "grad_norm": 0.125, + "learning_rate": 0.00048249130823482665, + "loss": 0.5259, + "step": 79910 + }, + { + "epoch": 3.969404986589848, + "grad_norm": 0.11181640625, + "learning_rate": 0.00048245157445117717, + "loss": 0.5166, + "step": 79920 + }, + { + "epoch": 3.969901658885467, + "grad_norm": 0.107421875, + "learning_rate": 0.00048241184066752764, + "loss": 0.5314, + "step": 79930 + }, + { + "epoch": 3.970398331181087, + "grad_norm": 0.1455078125, + "learning_rate": 0.000482372106883878, + "loss": 0.5214, + "step": 79940 + }, + { + "epoch": 3.970895003476706, + "grad_norm": 0.0888671875, + "learning_rate": 0.0004823323731002285, + "loss": 0.5146, + "step": 79950 + }, + { + "epoch": 3.9713916757723253, + "grad_norm": 0.11279296875, + "learning_rate": 0.000482292639316579, + "loss": 0.538, + "step": 79960 + }, + { + "epoch": 3.971888348067945, + "grad_norm": 0.0986328125, + "learning_rate": 0.00048225290553292937, + "loss": 0.5395, + "step": 79970 + }, + { + "epoch": 3.9723850203635642, + "grad_norm": 0.09912109375, + "learning_rate": 0.00048221317174927984, + "loss": 0.5594, + "step": 79980 + }, + { + "epoch": 3.9728816926591835, + "grad_norm": 0.158203125, + "learning_rate": 0.0004821734379656303, + "loss": 0.5511, + "step": 79990 + }, + { + "epoch": 3.9733783649548027, + "grad_norm": 0.1416015625, + "learning_rate": 0.0004821337041819807, + "loss": 0.5274, + "step": 80000 + }, + { + "epoch": 3.973875037250422, + "grad_norm": 0.0966796875, + "learning_rate": 0.0004820939703983312, + "loss": 0.5179, + "step": 80010 + }, + { + "epoch": 3.9743717095460416, + "grad_norm": 0.162109375, + "learning_rate": 0.00048205423661468167, + "loss": 0.5516, + "step": 80020 + }, + { + "epoch": 3.974868381841661, + "grad_norm": 0.1806640625, + "learning_rate": 0.0004820145028310321, + "loss": 0.5319, + "step": 80030 + }, + { + "epoch": 3.9753650541372805, + "grad_norm": 0.10546875, + "learning_rate": 0.00048197476904738256, + "loss": 0.5173, + "step": 80040 + }, + { + "epoch": 3.9758617264328997, + "grad_norm": 0.1201171875, + "learning_rate": 0.00048193503526373303, + "loss": 0.5147, + "step": 80050 + }, + { + "epoch": 3.976358398728519, + "grad_norm": 0.1689453125, + "learning_rate": 0.0004818953014800835, + "loss": 0.5243, + "step": 80060 + }, + { + "epoch": 3.976855071024138, + "grad_norm": 0.1416015625, + "learning_rate": 0.0004818555676964339, + "loss": 0.4932, + "step": 80070 + }, + { + "epoch": 3.9773517433197574, + "grad_norm": 0.107421875, + "learning_rate": 0.0004818158339127844, + "loss": 0.5149, + "step": 80080 + }, + { + "epoch": 3.977848415615377, + "grad_norm": 0.13671875, + "learning_rate": 0.00048177610012913486, + "loss": 0.5575, + "step": 80090 + }, + { + "epoch": 3.9783450879109963, + "grad_norm": 0.13671875, + "learning_rate": 0.0004817363663454852, + "loss": 0.5325, + "step": 80100 + }, + { + "epoch": 3.978841760206616, + "grad_norm": 0.1181640625, + "learning_rate": 0.00048169663256183575, + "loss": 0.5211, + "step": 80110 + }, + { + "epoch": 3.979338432502235, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004816568987781862, + "loss": 0.4939, + "step": 80120 + }, + { + "epoch": 3.9798351047978544, + "grad_norm": 0.11328125, + "learning_rate": 0.0004816171649945366, + "loss": 0.5304, + "step": 80130 + }, + { + "epoch": 3.9803317770934736, + "grad_norm": 0.111328125, + "learning_rate": 0.00048157743121088705, + "loss": 0.5244, + "step": 80140 + }, + { + "epoch": 3.980828449389093, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004815376974272376, + "loss": 0.5435, + "step": 80150 + }, + { + "epoch": 3.9813251216847125, + "grad_norm": 0.12890625, + "learning_rate": 0.00048149796364358794, + "loss": 0.5387, + "step": 80160 + }, + { + "epoch": 3.9818217939803318, + "grad_norm": 0.12451171875, + "learning_rate": 0.0004814582298599384, + "loss": 0.5371, + "step": 80170 + }, + { + "epoch": 3.982318466275951, + "grad_norm": 0.1181640625, + "learning_rate": 0.0004814184960762889, + "loss": 0.5091, + "step": 80180 + }, + { + "epoch": 3.9828151385715707, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004813787622926393, + "loss": 0.552, + "step": 80190 + }, + { + "epoch": 3.98331181086719, + "grad_norm": 0.142578125, + "learning_rate": 0.00048133902850898977, + "loss": 0.4959, + "step": 80200 + }, + { + "epoch": 3.983808483162809, + "grad_norm": 0.15625, + "learning_rate": 0.00048129929472534024, + "loss": 0.5101, + "step": 80210 + }, + { + "epoch": 3.9843051554584283, + "grad_norm": 0.1240234375, + "learning_rate": 0.0004812595609416907, + "loss": 0.5362, + "step": 80220 + }, + { + "epoch": 3.984801827754048, + "grad_norm": 0.10791015625, + "learning_rate": 0.00048121982715804113, + "loss": 0.5268, + "step": 80230 + }, + { + "epoch": 3.9852985000496672, + "grad_norm": 0.169921875, + "learning_rate": 0.0004811800933743916, + "loss": 0.5353, + "step": 80240 + }, + { + "epoch": 3.9857951723452865, + "grad_norm": 0.10546875, + "learning_rate": 0.0004811403595907421, + "loss": 0.5479, + "step": 80250 + }, + { + "epoch": 3.986291844640906, + "grad_norm": 0.1044921875, + "learning_rate": 0.0004811006258070925, + "loss": 0.5581, + "step": 80260 + }, + { + "epoch": 3.9867885169365254, + "grad_norm": 0.125, + "learning_rate": 0.00048106089202344296, + "loss": 0.5255, + "step": 80270 + }, + { + "epoch": 3.9872851892321446, + "grad_norm": 0.1181640625, + "learning_rate": 0.00048102115823979344, + "loss": 0.5291, + "step": 80280 + }, + { + "epoch": 3.987781861527764, + "grad_norm": 0.10693359375, + "learning_rate": 0.00048098142445614385, + "loss": 0.5187, + "step": 80290 + }, + { + "epoch": 3.9882785338233835, + "grad_norm": 0.142578125, + "learning_rate": 0.0004809416906724943, + "loss": 0.5122, + "step": 80300 + }, + { + "epoch": 3.9887752061190027, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004809019568888448, + "loss": 0.512, + "step": 80310 + }, + { + "epoch": 3.989271878414622, + "grad_norm": 0.107421875, + "learning_rate": 0.00048086222310519516, + "loss": 0.5211, + "step": 80320 + }, + { + "epoch": 3.9897685507102416, + "grad_norm": 0.1259765625, + "learning_rate": 0.00048082248932154563, + "loss": 0.5828, + "step": 80330 + }, + { + "epoch": 3.990265223005861, + "grad_norm": 0.146484375, + "learning_rate": 0.00048078275553789615, + "loss": 0.5629, + "step": 80340 + }, + { + "epoch": 3.99076189530148, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004807430217542466, + "loss": 0.544, + "step": 80350 + }, + { + "epoch": 3.9912585675970993, + "grad_norm": 0.1064453125, + "learning_rate": 0.000480703287970597, + "loss": 0.5288, + "step": 80360 + }, + { + "epoch": 3.9917552398927185, + "grad_norm": 0.1083984375, + "learning_rate": 0.00048066355418694746, + "loss": 0.5364, + "step": 80370 + }, + { + "epoch": 3.992251912188338, + "grad_norm": 0.1259765625, + "learning_rate": 0.000480623820403298, + "loss": 0.5254, + "step": 80380 + }, + { + "epoch": 3.9927485844839574, + "grad_norm": 0.09912109375, + "learning_rate": 0.00048058408661964835, + "loss": 0.5298, + "step": 80390 + }, + { + "epoch": 3.993245256779577, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004805443528359988, + "loss": 0.5143, + "step": 80400 + }, + { + "epoch": 3.9937419290751963, + "grad_norm": 0.2021484375, + "learning_rate": 0.0004805046190523493, + "loss": 0.5279, + "step": 80410 + }, + { + "epoch": 3.9942386013708155, + "grad_norm": 0.109375, + "learning_rate": 0.0004804648852686997, + "loss": 0.543, + "step": 80420 + }, + { + "epoch": 3.9947352736664348, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004804251514850502, + "loss": 0.5356, + "step": 80430 + }, + { + "epoch": 3.995231945962054, + "grad_norm": 0.1044921875, + "learning_rate": 0.00048038541770140065, + "loss": 0.5297, + "step": 80440 + }, + { + "epoch": 3.9957286182576737, + "grad_norm": 0.11376953125, + "learning_rate": 0.00048034568391775107, + "loss": 0.5392, + "step": 80450 + }, + { + "epoch": 3.996225290553293, + "grad_norm": 0.1123046875, + "learning_rate": 0.00048030595013410154, + "loss": 0.5283, + "step": 80460 + }, + { + "epoch": 3.996721962848912, + "grad_norm": 0.1064453125, + "learning_rate": 0.000480266216350452, + "loss": 0.5134, + "step": 80470 + }, + { + "epoch": 3.997218635144532, + "grad_norm": 0.166015625, + "learning_rate": 0.00048022648256680243, + "loss": 0.5283, + "step": 80480 + }, + { + "epoch": 3.997715307440151, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004801867487831529, + "loss": 0.4861, + "step": 80490 + }, + { + "epoch": 3.9982119797357702, + "grad_norm": 0.10205078125, + "learning_rate": 0.00048014701499950337, + "loss": 0.488, + "step": 80500 + }, + { + "epoch": 3.9987086520313895, + "grad_norm": 0.09765625, + "learning_rate": 0.00048010728121585384, + "loss": 0.5418, + "step": 80510 + }, + { + "epoch": 3.999205324327009, + "grad_norm": 0.09814453125, + "learning_rate": 0.00048006754743220426, + "loss": 0.5178, + "step": 80520 + }, + { + "epoch": 3.9997019966226284, + "grad_norm": 0.134765625, + "learning_rate": 0.00048002781364855473, + "loss": 0.563, + "step": 80530 + }, + { + "epoch": 4.000198668918248, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004799880798649052, + "loss": 0.5175, + "step": 80540 + }, + { + "epoch": 4.000695341213867, + "grad_norm": 0.12158203125, + "learning_rate": 0.00047994834608125557, + "loss": 0.5267, + "step": 80550 + }, + { + "epoch": 4.0011920135094865, + "grad_norm": 0.146484375, + "learning_rate": 0.0004799086122976061, + "loss": 0.5212, + "step": 80560 + }, + { + "epoch": 4.001688685805106, + "grad_norm": 0.1435546875, + "learning_rate": 0.00047986887851395656, + "loss": 0.5415, + "step": 80570 + }, + { + "epoch": 4.002185358100725, + "grad_norm": 0.1015625, + "learning_rate": 0.0004798291447303069, + "loss": 0.5447, + "step": 80580 + }, + { + "epoch": 4.002682030396344, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004797894109466574, + "loss": 0.5089, + "step": 80590 + }, + { + "epoch": 4.003178702691963, + "grad_norm": 0.123046875, + "learning_rate": 0.00047974967716300787, + "loss": 0.5216, + "step": 80600 + }, + { + "epoch": 4.0036753749875835, + "grad_norm": 0.09912109375, + "learning_rate": 0.0004797099433793583, + "loss": 0.5156, + "step": 80610 + }, + { + "epoch": 4.004172047283203, + "grad_norm": 0.1171875, + "learning_rate": 0.00047967020959570876, + "loss": 0.5018, + "step": 80620 + }, + { + "epoch": 4.004668719578822, + "grad_norm": 0.1015625, + "learning_rate": 0.00047963047581205923, + "loss": 0.5122, + "step": 80630 + }, + { + "epoch": 4.005165391874441, + "grad_norm": 0.1083984375, + "learning_rate": 0.00047959074202840964, + "loss": 0.5377, + "step": 80640 + }, + { + "epoch": 4.00566206417006, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004795510082447601, + "loss": 0.5088, + "step": 80650 + }, + { + "epoch": 4.00615873646568, + "grad_norm": 0.11328125, + "learning_rate": 0.0004795112744611106, + "loss": 0.5172, + "step": 80660 + }, + { + "epoch": 4.006655408761299, + "grad_norm": 0.1494140625, + "learning_rate": 0.00047947154067746106, + "loss": 0.5396, + "step": 80670 + }, + { + "epoch": 4.007152081056919, + "grad_norm": 0.1328125, + "learning_rate": 0.0004794318068938115, + "loss": 0.5247, + "step": 80680 + }, + { + "epoch": 4.007648753352538, + "grad_norm": 0.10888671875, + "learning_rate": 0.00047939207311016195, + "loss": 0.5022, + "step": 80690 + }, + { + "epoch": 4.0081454256481575, + "grad_norm": 0.2138671875, + "learning_rate": 0.0004793523393265124, + "loss": 0.499, + "step": 80700 + }, + { + "epoch": 4.008642097943777, + "grad_norm": 0.09765625, + "learning_rate": 0.00047931260554286284, + "loss": 0.4814, + "step": 80710 + }, + { + "epoch": 4.009138770239396, + "grad_norm": 0.1396484375, + "learning_rate": 0.0004792728717592133, + "loss": 0.5266, + "step": 80720 + }, + { + "epoch": 4.009635442535015, + "grad_norm": 0.1328125, + "learning_rate": 0.0004792331379755638, + "loss": 0.5145, + "step": 80730 + }, + { + "epoch": 4.010132114830634, + "grad_norm": 0.1162109375, + "learning_rate": 0.00047919340419191414, + "loss": 0.4964, + "step": 80740 + }, + { + "epoch": 4.0106287871262545, + "grad_norm": 0.1630859375, + "learning_rate": 0.00047915367040826467, + "loss": 0.5007, + "step": 80750 + }, + { + "epoch": 4.011125459421874, + "grad_norm": 0.11474609375, + "learning_rate": 0.00047911393662461514, + "loss": 0.5183, + "step": 80760 + }, + { + "epoch": 4.011622131717493, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004790742028409655, + "loss": 0.5162, + "step": 80770 + }, + { + "epoch": 4.012118804013112, + "grad_norm": 0.10400390625, + "learning_rate": 0.00047903446905731597, + "loss": 0.5259, + "step": 80780 + }, + { + "epoch": 4.012615476308731, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004789947352736665, + "loss": 0.5069, + "step": 80790 + }, + { + "epoch": 4.013112148604351, + "grad_norm": 0.1142578125, + "learning_rate": 0.00047895500149001697, + "loss": 0.4948, + "step": 80800 + }, + { + "epoch": 4.01360882089997, + "grad_norm": 0.146484375, + "learning_rate": 0.00047891526770636733, + "loss": 0.5593, + "step": 80810 + }, + { + "epoch": 4.01410549319559, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004788755339227178, + "loss": 0.5431, + "step": 80820 + }, + { + "epoch": 4.014602165491209, + "grad_norm": 0.130859375, + "learning_rate": 0.0004788358001390683, + "loss": 0.5352, + "step": 80830 + }, + { + "epoch": 4.015098837786828, + "grad_norm": 0.150390625, + "learning_rate": 0.0004787960663554187, + "loss": 0.5245, + "step": 80840 + }, + { + "epoch": 4.015595510082448, + "grad_norm": 0.1162109375, + "learning_rate": 0.00047875633257176916, + "loss": 0.5241, + "step": 80850 + }, + { + "epoch": 4.016092182378067, + "grad_norm": 0.10888671875, + "learning_rate": 0.00047871659878811964, + "loss": 0.5128, + "step": 80860 + }, + { + "epoch": 4.016588854673686, + "grad_norm": 0.12353515625, + "learning_rate": 0.00047867686500447005, + "loss": 0.5192, + "step": 80870 + }, + { + "epoch": 4.017085526969305, + "grad_norm": 0.099609375, + "learning_rate": 0.0004786371312208205, + "loss": 0.5221, + "step": 80880 + }, + { + "epoch": 4.017582199264925, + "grad_norm": 0.1474609375, + "learning_rate": 0.000478597397437171, + "loss": 0.5048, + "step": 80890 + }, + { + "epoch": 4.018078871560545, + "grad_norm": 0.10546875, + "learning_rate": 0.0004785576636535214, + "loss": 0.5098, + "step": 80900 + }, + { + "epoch": 4.018575543856164, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004785179298698719, + "loss": 0.5253, + "step": 80910 + }, + { + "epoch": 4.019072216151783, + "grad_norm": 0.09912109375, + "learning_rate": 0.00047847819608622235, + "loss": 0.5029, + "step": 80920 + }, + { + "epoch": 4.019568888447402, + "grad_norm": 0.12890625, + "learning_rate": 0.0004784384623025727, + "loss": 0.5064, + "step": 80930 + }, + { + "epoch": 4.020065560743022, + "grad_norm": 0.10205078125, + "learning_rate": 0.00047839872851892324, + "loss": 0.5097, + "step": 80940 + }, + { + "epoch": 4.020562233038641, + "grad_norm": 0.0986328125, + "learning_rate": 0.0004783589947352737, + "loss": 0.4999, + "step": 80950 + }, + { + "epoch": 4.02105890533426, + "grad_norm": 0.115234375, + "learning_rate": 0.0004783192609516242, + "loss": 0.5068, + "step": 80960 + }, + { + "epoch": 4.02155557762988, + "grad_norm": 0.1298828125, + "learning_rate": 0.00047827952716797455, + "loss": 0.4947, + "step": 80970 + }, + { + "epoch": 4.022052249925499, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004782397933843251, + "loss": 0.5178, + "step": 80980 + }, + { + "epoch": 4.022548922221119, + "grad_norm": 0.2177734375, + "learning_rate": 0.00047820005960067555, + "loss": 0.5396, + "step": 80990 + }, + { + "epoch": 4.023045594516738, + "grad_norm": 0.10302734375, + "learning_rate": 0.0004781603258170259, + "loss": 0.561, + "step": 81000 + }, + { + "epoch": 4.023542266812357, + "grad_norm": 0.123046875, + "learning_rate": 0.0004781205920333764, + "loss": 0.5209, + "step": 81010 + }, + { + "epoch": 4.024038939107976, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004780808582497269, + "loss": 0.5166, + "step": 81020 + }, + { + "epoch": 4.0245356114035955, + "grad_norm": 0.11376953125, + "learning_rate": 0.00047804112446607727, + "loss": 0.5519, + "step": 81030 + }, + { + "epoch": 4.025032283699216, + "grad_norm": 0.125, + "learning_rate": 0.00047800139068242774, + "loss": 0.5737, + "step": 81040 + }, + { + "epoch": 4.025528955994835, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004779616568987782, + "loss": 0.5398, + "step": 81050 + }, + { + "epoch": 4.026025628290454, + "grad_norm": 0.1015625, + "learning_rate": 0.00047792192311512863, + "loss": 0.5005, + "step": 81060 + }, + { + "epoch": 4.026522300586073, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004778821893314791, + "loss": 0.536, + "step": 81070 + }, + { + "epoch": 4.0270189728816925, + "grad_norm": 0.1591796875, + "learning_rate": 0.00047784245554782957, + "loss": 0.5223, + "step": 81080 + }, + { + "epoch": 4.027515645177312, + "grad_norm": 0.1591796875, + "learning_rate": 0.00047780272176418004, + "loss": 0.505, + "step": 81090 + }, + { + "epoch": 4.028012317472931, + "grad_norm": 0.119140625, + "learning_rate": 0.00047776298798053046, + "loss": 0.499, + "step": 81100 + }, + { + "epoch": 4.028508989768551, + "grad_norm": 0.11181640625, + "learning_rate": 0.00047772325419688093, + "loss": 0.5178, + "step": 81110 + }, + { + "epoch": 4.02900566206417, + "grad_norm": 0.12255859375, + "learning_rate": 0.0004776835204132314, + "loss": 0.5239, + "step": 81120 + }, + { + "epoch": 4.0295023343597896, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004776437866295818, + "loss": 0.5163, + "step": 81130 + }, + { + "epoch": 4.029999006655409, + "grad_norm": 0.154296875, + "learning_rate": 0.0004776040528459323, + "loss": 0.5398, + "step": 81140 + }, + { + "epoch": 4.030495678951028, + "grad_norm": 0.11279296875, + "learning_rate": 0.00047756431906228276, + "loss": 0.5174, + "step": 81150 + }, + { + "epoch": 4.030992351246647, + "grad_norm": 0.09423828125, + "learning_rate": 0.0004775245852786331, + "loss": 0.5257, + "step": 81160 + }, + { + "epoch": 4.0314890235422665, + "grad_norm": 0.142578125, + "learning_rate": 0.00047748485149498365, + "loss": 0.493, + "step": 81170 + }, + { + "epoch": 4.031985695837887, + "grad_norm": 0.1240234375, + "learning_rate": 0.0004774451177113341, + "loss": 0.5354, + "step": 81180 + }, + { + "epoch": 4.032482368133506, + "grad_norm": 0.126953125, + "learning_rate": 0.0004774053839276845, + "loss": 0.5447, + "step": 81190 + }, + { + "epoch": 4.032979040429125, + "grad_norm": 0.09326171875, + "learning_rate": 0.00047736565014403496, + "loss": 0.5177, + "step": 81200 + }, + { + "epoch": 4.033475712724744, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004773259163603855, + "loss": 0.5483, + "step": 81210 + }, + { + "epoch": 4.0339723850203635, + "grad_norm": 0.1044921875, + "learning_rate": 0.00047728618257673585, + "loss": 0.5108, + "step": 81220 + }, + { + "epoch": 4.034469057315983, + "grad_norm": 0.1513671875, + "learning_rate": 0.0004772464487930863, + "loss": 0.528, + "step": 81230 + }, + { + "epoch": 4.034965729611602, + "grad_norm": 0.138671875, + "learning_rate": 0.0004772067150094368, + "loss": 0.5305, + "step": 81240 + }, + { + "epoch": 4.035462401907222, + "grad_norm": 0.10107421875, + "learning_rate": 0.0004771669812257873, + "loss": 0.5205, + "step": 81250 + }, + { + "epoch": 4.035959074202841, + "grad_norm": 0.1650390625, + "learning_rate": 0.0004771272474421377, + "loss": 0.5339, + "step": 81260 + }, + { + "epoch": 4.0364557464984605, + "grad_norm": 0.1435546875, + "learning_rate": 0.00047708751365848815, + "loss": 0.4839, + "step": 81270 + }, + { + "epoch": 4.03695241879408, + "grad_norm": 0.134765625, + "learning_rate": 0.0004770477798748386, + "loss": 0.525, + "step": 81280 + }, + { + "epoch": 4.037449091089699, + "grad_norm": 0.107421875, + "learning_rate": 0.00047700804609118904, + "loss": 0.5113, + "step": 81290 + }, + { + "epoch": 4.037945763385318, + "grad_norm": 0.10302734375, + "learning_rate": 0.0004769683123075395, + "loss": 0.5052, + "step": 81300 + }, + { + "epoch": 4.038442435680937, + "grad_norm": 0.1328125, + "learning_rate": 0.00047692857852389, + "loss": 0.522, + "step": 81310 + }, + { + "epoch": 4.038939107976557, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004768888447402404, + "loss": 0.5277, + "step": 81320 + }, + { + "epoch": 4.039435780272177, + "grad_norm": 0.123046875, + "learning_rate": 0.00047684911095659087, + "loss": 0.5198, + "step": 81330 + }, + { + "epoch": 4.039932452567796, + "grad_norm": 0.11767578125, + "learning_rate": 0.00047680937717294134, + "loss": 0.4995, + "step": 81340 + }, + { + "epoch": 4.040429124863415, + "grad_norm": 0.1005859375, + "learning_rate": 0.00047676964338929176, + "loss": 0.4999, + "step": 81350 + }, + { + "epoch": 4.040925797159034, + "grad_norm": 0.125, + "learning_rate": 0.00047672990960564223, + "loss": 0.5285, + "step": 81360 + }, + { + "epoch": 4.041422469454654, + "grad_norm": 0.11474609375, + "learning_rate": 0.0004766901758219927, + "loss": 0.5227, + "step": 81370 + }, + { + "epoch": 4.041919141750273, + "grad_norm": 0.1416015625, + "learning_rate": 0.00047665044203834306, + "loss": 0.5269, + "step": 81380 + }, + { + "epoch": 4.042415814045892, + "grad_norm": 0.09716796875, + "learning_rate": 0.0004766107082546936, + "loss": 0.4916, + "step": 81390 + }, + { + "epoch": 4.042912486341512, + "grad_norm": 0.09521484375, + "learning_rate": 0.00047657097447104406, + "loss": 0.5073, + "step": 81400 + }, + { + "epoch": 4.0434091586371315, + "grad_norm": 0.10498046875, + "learning_rate": 0.00047653124068739453, + "loss": 0.5292, + "step": 81410 + }, + { + "epoch": 4.043905830932751, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004764915069037449, + "loss": 0.5153, + "step": 81420 + }, + { + "epoch": 4.04440250322837, + "grad_norm": 0.1591796875, + "learning_rate": 0.00047645177312009536, + "loss": 0.5215, + "step": 81430 + }, + { + "epoch": 4.044899175523989, + "grad_norm": 0.1181640625, + "learning_rate": 0.0004764120393364459, + "loss": 0.5353, + "step": 81440 + }, + { + "epoch": 4.045395847819608, + "grad_norm": 0.1259765625, + "learning_rate": 0.00047637230555279625, + "loss": 0.5035, + "step": 81450 + }, + { + "epoch": 4.045892520115228, + "grad_norm": 0.10107421875, + "learning_rate": 0.0004763325717691467, + "loss": 0.5241, + "step": 81460 + }, + { + "epoch": 4.046389192410848, + "grad_norm": 0.115234375, + "learning_rate": 0.0004762928379854972, + "loss": 0.4885, + "step": 81470 + }, + { + "epoch": 4.046885864706467, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004762531042018476, + "loss": 0.5404, + "step": 81480 + }, + { + "epoch": 4.047382537002086, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004762133704181981, + "loss": 0.5116, + "step": 81490 + }, + { + "epoch": 4.047879209297705, + "grad_norm": 0.10546875, + "learning_rate": 0.00047617363663454856, + "loss": 0.5134, + "step": 81500 + }, + { + "epoch": 4.048375881593325, + "grad_norm": 0.10009765625, + "learning_rate": 0.00047613390285089897, + "loss": 0.511, + "step": 81510 + }, + { + "epoch": 4.048872553888944, + "grad_norm": 0.13671875, + "learning_rate": 0.00047609416906724944, + "loss": 0.5603, + "step": 81520 + }, + { + "epoch": 4.049369226184563, + "grad_norm": 0.12890625, + "learning_rate": 0.0004760544352835999, + "loss": 0.5238, + "step": 81530 + }, + { + "epoch": 4.049865898480183, + "grad_norm": 0.12255859375, + "learning_rate": 0.0004760147014999504, + "loss": 0.5093, + "step": 81540 + }, + { + "epoch": 4.050362570775802, + "grad_norm": 0.123046875, + "learning_rate": 0.0004759749677163008, + "loss": 0.5349, + "step": 81550 + }, + { + "epoch": 4.050859243071422, + "grad_norm": 0.13671875, + "learning_rate": 0.0004759352339326513, + "loss": 0.5305, + "step": 81560 + }, + { + "epoch": 4.051355915367041, + "grad_norm": 0.10205078125, + "learning_rate": 0.00047589550014900175, + "loss": 0.5223, + "step": 81570 + }, + { + "epoch": 4.05185258766266, + "grad_norm": 0.1591796875, + "learning_rate": 0.00047585576636535216, + "loss": 0.5055, + "step": 81580 + }, + { + "epoch": 4.052349259958279, + "grad_norm": 0.099609375, + "learning_rate": 0.00047581603258170263, + "loss": 0.5241, + "step": 81590 + }, + { + "epoch": 4.0528459322538986, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004757762987980531, + "loss": 0.5139, + "step": 81600 + }, + { + "epoch": 4.053342604549519, + "grad_norm": 0.10693359375, + "learning_rate": 0.00047573656501440347, + "loss": 0.5414, + "step": 81610 + }, + { + "epoch": 4.053839276845138, + "grad_norm": 0.1171875, + "learning_rate": 0.000475696831230754, + "loss": 0.5034, + "step": 81620 + }, + { + "epoch": 4.054335949140757, + "grad_norm": 0.10498046875, + "learning_rate": 0.00047565709744710447, + "loss": 0.513, + "step": 81630 + }, + { + "epoch": 4.054832621436376, + "grad_norm": 0.10791015625, + "learning_rate": 0.00047561736366345483, + "loss": 0.5004, + "step": 81640 + }, + { + "epoch": 4.055329293731996, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004755776298798053, + "loss": 0.5158, + "step": 81650 + }, + { + "epoch": 4.055825966027615, + "grad_norm": 0.142578125, + "learning_rate": 0.0004755378960961558, + "loss": 0.5057, + "step": 81660 + }, + { + "epoch": 4.056322638323234, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004754981623125062, + "loss": 0.503, + "step": 81670 + }, + { + "epoch": 4.056819310618853, + "grad_norm": 0.10986328125, + "learning_rate": 0.00047545842852885666, + "loss": 0.5453, + "step": 81680 + }, + { + "epoch": 4.057315982914473, + "grad_norm": 0.15625, + "learning_rate": 0.00047541869474520713, + "loss": 0.5194, + "step": 81690 + }, + { + "epoch": 4.057812655210093, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004753789609615576, + "loss": 0.5501, + "step": 81700 + }, + { + "epoch": 4.058309327505712, + "grad_norm": 0.162109375, + "learning_rate": 0.000475339227177908, + "loss": 0.5369, + "step": 81710 + }, + { + "epoch": 4.058805999801331, + "grad_norm": 0.10546875, + "learning_rate": 0.0004752994933942585, + "loss": 0.5098, + "step": 81720 + }, + { + "epoch": 4.05930267209695, + "grad_norm": 0.1162109375, + "learning_rate": 0.00047525975961060896, + "loss": 0.5112, + "step": 81730 + }, + { + "epoch": 4.0597993443925695, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004752200258269594, + "loss": 0.5086, + "step": 81740 + }, + { + "epoch": 4.060296016688189, + "grad_norm": 0.11474609375, + "learning_rate": 0.00047518029204330985, + "loss": 0.5346, + "step": 81750 + }, + { + "epoch": 4.060792688983809, + "grad_norm": 0.1015625, + "learning_rate": 0.0004751405582596603, + "loss": 0.5093, + "step": 81760 + }, + { + "epoch": 4.061289361279428, + "grad_norm": 0.1171875, + "learning_rate": 0.00047510082447601074, + "loss": 0.4806, + "step": 81770 + }, + { + "epoch": 4.061786033575047, + "grad_norm": 0.0986328125, + "learning_rate": 0.0004750610906923612, + "loss": 0.5136, + "step": 81780 + }, + { + "epoch": 4.0622827058706665, + "grad_norm": 0.1240234375, + "learning_rate": 0.0004750213569087117, + "loss": 0.4983, + "step": 81790 + }, + { + "epoch": 4.062779378166286, + "grad_norm": 0.107421875, + "learning_rate": 0.00047498162312506205, + "loss": 0.5402, + "step": 81800 + }, + { + "epoch": 4.063276050461905, + "grad_norm": 0.1474609375, + "learning_rate": 0.00047494188934141257, + "loss": 0.4821, + "step": 81810 + }, + { + "epoch": 4.063772722757524, + "grad_norm": 0.1396484375, + "learning_rate": 0.00047490215555776304, + "loss": 0.521, + "step": 81820 + }, + { + "epoch": 4.064269395053144, + "grad_norm": 0.10546875, + "learning_rate": 0.0004748624217741134, + "loss": 0.4949, + "step": 81830 + }, + { + "epoch": 4.064766067348764, + "grad_norm": 0.1611328125, + "learning_rate": 0.0004748226879904639, + "loss": 0.5308, + "step": 81840 + }, + { + "epoch": 4.065262739644383, + "grad_norm": 0.111328125, + "learning_rate": 0.0004747829542068144, + "loss": 0.5349, + "step": 81850 + }, + { + "epoch": 4.065759411940002, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004747432204231649, + "loss": 0.5077, + "step": 81860 + }, + { + "epoch": 4.066256084235621, + "grad_norm": 0.11767578125, + "learning_rate": 0.00047470348663951524, + "loss": 0.5036, + "step": 81870 + }, + { + "epoch": 4.0667527565312405, + "grad_norm": 0.10791015625, + "learning_rate": 0.0004746637528558657, + "loss": 0.5353, + "step": 81880 + }, + { + "epoch": 4.06724942882686, + "grad_norm": 0.11669921875, + "learning_rate": 0.00047462401907221623, + "loss": 0.5128, + "step": 81890 + }, + { + "epoch": 4.06774610112248, + "grad_norm": 0.13671875, + "learning_rate": 0.0004745842852885666, + "loss": 0.5283, + "step": 81900 + }, + { + "epoch": 4.068242773418099, + "grad_norm": 0.1328125, + "learning_rate": 0.00047454455150491707, + "loss": 0.532, + "step": 81910 + }, + { + "epoch": 4.068739445713718, + "grad_norm": 0.1376953125, + "learning_rate": 0.00047450481772126754, + "loss": 0.5092, + "step": 81920 + }, + { + "epoch": 4.0692361180093375, + "grad_norm": 0.10986328125, + "learning_rate": 0.00047446508393761796, + "loss": 0.5129, + "step": 81930 + }, + { + "epoch": 4.069732790304957, + "grad_norm": 0.11279296875, + "learning_rate": 0.00047442535015396843, + "loss": 0.5507, + "step": 81940 + }, + { + "epoch": 4.070229462600576, + "grad_norm": 0.10009765625, + "learning_rate": 0.0004743856163703189, + "loss": 0.5067, + "step": 81950 + }, + { + "epoch": 4.070726134896195, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004743458825866693, + "loss": 0.5173, + "step": 81960 + }, + { + "epoch": 4.071222807191814, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004743061488030198, + "loss": 0.5237, + "step": 81970 + }, + { + "epoch": 4.0717194794874345, + "grad_norm": 0.123046875, + "learning_rate": 0.00047426641501937026, + "loss": 0.529, + "step": 81980 + }, + { + "epoch": 4.072216151783054, + "grad_norm": 0.10693359375, + "learning_rate": 0.00047422668123572073, + "loss": 0.497, + "step": 81990 + }, + { + "epoch": 4.072712824078673, + "grad_norm": 0.11279296875, + "learning_rate": 0.00047418694745207115, + "loss": 0.4832, + "step": 82000 + }, + { + "epoch": 4.073209496374292, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004741472136684216, + "loss": 0.5452, + "step": 82010 + }, + { + "epoch": 4.073706168669911, + "grad_norm": 0.1650390625, + "learning_rate": 0.0004741074798847721, + "loss": 0.501, + "step": 82020 + }, + { + "epoch": 4.074202840965531, + "grad_norm": 0.13671875, + "learning_rate": 0.00047406774610112245, + "loss": 0.5312, + "step": 82030 + }, + { + "epoch": 4.07469951326115, + "grad_norm": 0.11083984375, + "learning_rate": 0.000474028012317473, + "loss": 0.521, + "step": 82040 + }, + { + "epoch": 4.07519618555677, + "grad_norm": 0.12060546875, + "learning_rate": 0.00047398827853382345, + "loss": 0.5259, + "step": 82050 + }, + { + "epoch": 4.075692857852389, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004739485447501738, + "loss": 0.5127, + "step": 82060 + }, + { + "epoch": 4.0761895301480084, + "grad_norm": 0.111328125, + "learning_rate": 0.0004739088109665243, + "loss": 0.5266, + "step": 82070 + }, + { + "epoch": 4.076686202443628, + "grad_norm": 0.12109375, + "learning_rate": 0.0004738690771828748, + "loss": 0.5267, + "step": 82080 + }, + { + "epoch": 4.077182874739247, + "grad_norm": 0.10546875, + "learning_rate": 0.00047382934339922517, + "loss": 0.5366, + "step": 82090 + }, + { + "epoch": 4.077679547034866, + "grad_norm": 0.1171875, + "learning_rate": 0.00047378960961557564, + "loss": 0.505, + "step": 82100 + }, + { + "epoch": 4.078176219330485, + "grad_norm": 0.1572265625, + "learning_rate": 0.0004737498758319261, + "loss": 0.5062, + "step": 82110 + }, + { + "epoch": 4.0786728916261055, + "grad_norm": 0.1103515625, + "learning_rate": 0.00047371014204827653, + "loss": 0.5252, + "step": 82120 + }, + { + "epoch": 4.079169563921725, + "grad_norm": 0.1015625, + "learning_rate": 0.000473670408264627, + "loss": 0.5392, + "step": 82130 + }, + { + "epoch": 4.079666236217344, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004736306744809775, + "loss": 0.5166, + "step": 82140 + }, + { + "epoch": 4.080162908512963, + "grad_norm": 0.177734375, + "learning_rate": 0.00047359094069732795, + "loss": 0.5286, + "step": 82150 + }, + { + "epoch": 4.080659580808582, + "grad_norm": 0.130859375, + "learning_rate": 0.00047355120691367836, + "loss": 0.4974, + "step": 82160 + }, + { + "epoch": 4.081156253104202, + "grad_norm": 0.1162109375, + "learning_rate": 0.00047351147313002883, + "loss": 0.5004, + "step": 82170 + }, + { + "epoch": 4.081652925399821, + "grad_norm": 0.1328125, + "learning_rate": 0.0004734717393463793, + "loss": 0.5137, + "step": 82180 + }, + { + "epoch": 4.082149597695441, + "grad_norm": 0.09912109375, + "learning_rate": 0.0004734320055627297, + "loss": 0.4974, + "step": 82190 + }, + { + "epoch": 4.08264626999106, + "grad_norm": 0.140625, + "learning_rate": 0.0004733922717790802, + "loss": 0.5247, + "step": 82200 + }, + { + "epoch": 4.083142942286679, + "grad_norm": 0.10107421875, + "learning_rate": 0.00047335253799543067, + "loss": 0.5155, + "step": 82210 + }, + { + "epoch": 4.083639614582299, + "grad_norm": 0.1171875, + "learning_rate": 0.0004733128042117811, + "loss": 0.4963, + "step": 82220 + }, + { + "epoch": 4.084136286877918, + "grad_norm": 0.142578125, + "learning_rate": 0.00047327307042813155, + "loss": 0.5233, + "step": 82230 + }, + { + "epoch": 4.084632959173537, + "grad_norm": 0.109375, + "learning_rate": 0.000473233336644482, + "loss": 0.5229, + "step": 82240 + }, + { + "epoch": 4.085129631469156, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004731936028608324, + "loss": 0.491, + "step": 82250 + }, + { + "epoch": 4.085626303764776, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004731538690771829, + "loss": 0.5364, + "step": 82260 + }, + { + "epoch": 4.086122976060396, + "grad_norm": 0.119140625, + "learning_rate": 0.0004731141352935334, + "loss": 0.4996, + "step": 82270 + }, + { + "epoch": 4.086619648356015, + "grad_norm": 0.11669921875, + "learning_rate": 0.00047307440150988375, + "loss": 0.5088, + "step": 82280 + }, + { + "epoch": 4.087116320651634, + "grad_norm": 0.11328125, + "learning_rate": 0.0004730346677262342, + "loss": 0.5234, + "step": 82290 + }, + { + "epoch": 4.087612992947253, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004729949339425847, + "loss": 0.5018, + "step": 82300 + }, + { + "epoch": 4.088109665242873, + "grad_norm": 0.166015625, + "learning_rate": 0.0004729552001589352, + "loss": 0.5146, + "step": 82310 + }, + { + "epoch": 4.088606337538492, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004729154663752856, + "loss": 0.5191, + "step": 82320 + }, + { + "epoch": 4.089103009834112, + "grad_norm": 0.11767578125, + "learning_rate": 0.00047287573259163605, + "loss": 0.5376, + "step": 82330 + }, + { + "epoch": 4.089599682129731, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004728359988079865, + "loss": 0.5175, + "step": 82340 + }, + { + "epoch": 4.09009635442535, + "grad_norm": 0.10400390625, + "learning_rate": 0.00047279626502433694, + "loss": 0.5092, + "step": 82350 + }, + { + "epoch": 4.09059302672097, + "grad_norm": 0.12353515625, + "learning_rate": 0.0004727565312406874, + "loss": 0.5021, + "step": 82360 + }, + { + "epoch": 4.091089699016589, + "grad_norm": 0.111328125, + "learning_rate": 0.0004727167974570379, + "loss": 0.5344, + "step": 82370 + }, + { + "epoch": 4.091586371312208, + "grad_norm": 0.10791015625, + "learning_rate": 0.0004726770636733883, + "loss": 0.4935, + "step": 82380 + }, + { + "epoch": 4.092083043607827, + "grad_norm": 0.1376953125, + "learning_rate": 0.00047263732988973877, + "loss": 0.491, + "step": 82390 + }, + { + "epoch": 4.0925797159034465, + "grad_norm": 0.162109375, + "learning_rate": 0.00047259759610608924, + "loss": 0.5281, + "step": 82400 + }, + { + "epoch": 4.093076388199067, + "grad_norm": 0.11865234375, + "learning_rate": 0.00047255786232243966, + "loss": 0.4877, + "step": 82410 + }, + { + "epoch": 4.093573060494686, + "grad_norm": 0.10498046875, + "learning_rate": 0.00047251812853879013, + "loss": 0.5048, + "step": 82420 + }, + { + "epoch": 4.094069732790305, + "grad_norm": 0.1376953125, + "learning_rate": 0.0004724783947551406, + "loss": 0.5371, + "step": 82430 + }, + { + "epoch": 4.094566405085924, + "grad_norm": 0.1533203125, + "learning_rate": 0.0004724386609714911, + "loss": 0.5233, + "step": 82440 + }, + { + "epoch": 4.0950630773815435, + "grad_norm": 0.134765625, + "learning_rate": 0.0004723989271878415, + "loss": 0.5133, + "step": 82450 + }, + { + "epoch": 4.095559749677163, + "grad_norm": 0.1337890625, + "learning_rate": 0.00047235919340419196, + "loss": 0.5158, + "step": 82460 + }, + { + "epoch": 4.096056421972782, + "grad_norm": 0.1279296875, + "learning_rate": 0.00047231945962054243, + "loss": 0.517, + "step": 82470 + }, + { + "epoch": 4.096553094268402, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004722797258368928, + "loss": 0.4954, + "step": 82480 + }, + { + "epoch": 4.097049766564021, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004722399920532433, + "loss": 0.5403, + "step": 82490 + }, + { + "epoch": 4.0975464388596405, + "grad_norm": 0.474609375, + "learning_rate": 0.0004722002582695938, + "loss": 0.5425, + "step": 82500 + }, + { + "epoch": 4.09804311115526, + "grad_norm": 0.09716796875, + "learning_rate": 0.00047216052448594416, + "loss": 0.524, + "step": 82510 + }, + { + "epoch": 4.098539783450879, + "grad_norm": 0.12158203125, + "learning_rate": 0.00047212079070229463, + "loss": 0.4932, + "step": 82520 + }, + { + "epoch": 4.099036455746498, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004720810569186451, + "loss": 0.5, + "step": 82530 + }, + { + "epoch": 4.0995331280421174, + "grad_norm": 0.10888671875, + "learning_rate": 0.0004720413231349955, + "loss": 0.5181, + "step": 82540 + }, + { + "epoch": 4.100029800337738, + "grad_norm": 0.1083984375, + "learning_rate": 0.000472001589351346, + "loss": 0.502, + "step": 82550 + }, + { + "epoch": 4.100526472633357, + "grad_norm": 0.12158203125, + "learning_rate": 0.00047196185556769646, + "loss": 0.5251, + "step": 82560 + }, + { + "epoch": 4.101023144928976, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004719221217840469, + "loss": 0.5354, + "step": 82570 + }, + { + "epoch": 4.101519817224595, + "grad_norm": 0.1083984375, + "learning_rate": 0.00047188238800039735, + "loss": 0.5215, + "step": 82580 + }, + { + "epoch": 4.1020164895202145, + "grad_norm": 0.1767578125, + "learning_rate": 0.0004718426542167478, + "loss": 0.5062, + "step": 82590 + }, + { + "epoch": 4.102513161815834, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004718029204330983, + "loss": 0.5057, + "step": 82600 + }, + { + "epoch": 4.103009834111453, + "grad_norm": 0.1171875, + "learning_rate": 0.0004717631866494487, + "loss": 0.5163, + "step": 82610 + }, + { + "epoch": 4.103506506407073, + "grad_norm": 0.12109375, + "learning_rate": 0.0004717234528657992, + "loss": 0.5181, + "step": 82620 + }, + { + "epoch": 4.104003178702692, + "grad_norm": 0.166015625, + "learning_rate": 0.00047168371908214965, + "loss": 0.4976, + "step": 82630 + }, + { + "epoch": 4.1044998509983115, + "grad_norm": 0.11865234375, + "learning_rate": 0.00047164398529850007, + "loss": 0.5093, + "step": 82640 + }, + { + "epoch": 4.104996523293931, + "grad_norm": 0.11767578125, + "learning_rate": 0.00047160425151485054, + "loss": 0.5185, + "step": 82650 + }, + { + "epoch": 4.10549319558955, + "grad_norm": 0.12158203125, + "learning_rate": 0.000471564517731201, + "loss": 0.4915, + "step": 82660 + }, + { + "epoch": 4.105989867885169, + "grad_norm": 0.1064453125, + "learning_rate": 0.00047152478394755137, + "loss": 0.5473, + "step": 82670 + }, + { + "epoch": 4.106486540180788, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004714850501639019, + "loss": 0.5313, + "step": 82680 + }, + { + "epoch": 4.106983212476408, + "grad_norm": 0.1357421875, + "learning_rate": 0.00047144531638025237, + "loss": 0.5496, + "step": 82690 + }, + { + "epoch": 4.107479884772028, + "grad_norm": 0.11328125, + "learning_rate": 0.00047140558259660273, + "loss": 0.5321, + "step": 82700 + }, + { + "epoch": 4.107976557067647, + "grad_norm": 0.099609375, + "learning_rate": 0.0004713658488129532, + "loss": 0.5213, + "step": 82710 + }, + { + "epoch": 4.108473229363266, + "grad_norm": 0.130859375, + "learning_rate": 0.00047132611502930373, + "loss": 0.5279, + "step": 82720 + }, + { + "epoch": 4.108969901658885, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004712863812456541, + "loss": 0.4876, + "step": 82730 + }, + { + "epoch": 4.109466573954505, + "grad_norm": 0.1015625, + "learning_rate": 0.00047124664746200456, + "loss": 0.5223, + "step": 82740 + }, + { + "epoch": 4.109963246250124, + "grad_norm": 0.11669921875, + "learning_rate": 0.00047120691367835504, + "loss": 0.5293, + "step": 82750 + }, + { + "epoch": 4.110459918545743, + "grad_norm": 0.1611328125, + "learning_rate": 0.00047116717989470556, + "loss": 0.5532, + "step": 82760 + }, + { + "epoch": 4.110956590841363, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004711274461110559, + "loss": 0.5227, + "step": 82770 + }, + { + "epoch": 4.1114532631369825, + "grad_norm": 0.107421875, + "learning_rate": 0.0004710877123274064, + "loss": 0.5163, + "step": 82780 + }, + { + "epoch": 4.111949935432602, + "grad_norm": 0.119140625, + "learning_rate": 0.00047104797854375687, + "loss": 0.4817, + "step": 82790 + }, + { + "epoch": 4.112446607728221, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004710082447601073, + "loss": 0.5271, + "step": 82800 + }, + { + "epoch": 4.11294328002384, + "grad_norm": 0.1201171875, + "learning_rate": 0.00047096851097645775, + "loss": 0.5263, + "step": 82810 + }, + { + "epoch": 4.113439952319459, + "grad_norm": 0.09765625, + "learning_rate": 0.0004709287771928082, + "loss": 0.489, + "step": 82820 + }, + { + "epoch": 4.113936624615079, + "grad_norm": 0.10693359375, + "learning_rate": 0.00047088904340915864, + "loss": 0.5237, + "step": 82830 + }, + { + "epoch": 4.114433296910699, + "grad_norm": 0.1328125, + "learning_rate": 0.0004708493096255091, + "loss": 0.5245, + "step": 82840 + }, + { + "epoch": 4.114929969206318, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004708095758418596, + "loss": 0.5509, + "step": 82850 + }, + { + "epoch": 4.115426641501937, + "grad_norm": 0.10400390625, + "learning_rate": 0.00047076984205820995, + "loss": 0.5182, + "step": 82860 + }, + { + "epoch": 4.115923313797556, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004707301082745605, + "loss": 0.5243, + "step": 82870 + }, + { + "epoch": 4.116419986093176, + "grad_norm": 0.1103515625, + "learning_rate": 0.00047069037449091095, + "loss": 0.497, + "step": 82880 + }, + { + "epoch": 4.116916658388795, + "grad_norm": 0.10546875, + "learning_rate": 0.0004706506407072614, + "loss": 0.532, + "step": 82890 + }, + { + "epoch": 4.117413330684414, + "grad_norm": 0.142578125, + "learning_rate": 0.0004706109069236118, + "loss": 0.5535, + "step": 82900 + }, + { + "epoch": 4.117910002980034, + "grad_norm": 0.0947265625, + "learning_rate": 0.0004705711731399623, + "loss": 0.522, + "step": 82910 + }, + { + "epoch": 4.118406675275653, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004705314393563128, + "loss": 0.4981, + "step": 82920 + }, + { + "epoch": 4.118903347571273, + "grad_norm": 0.1044921875, + "learning_rate": 0.00047049170557266314, + "loss": 0.5042, + "step": 82930 + }, + { + "epoch": 4.119400019866892, + "grad_norm": 0.111328125, + "learning_rate": 0.0004704519717890136, + "loss": 0.5407, + "step": 82940 + }, + { + "epoch": 4.119896692162511, + "grad_norm": 0.1279296875, + "learning_rate": 0.00047041223800536414, + "loss": 0.495, + "step": 82950 + }, + { + "epoch": 4.12039336445813, + "grad_norm": 0.1181640625, + "learning_rate": 0.0004703725042217145, + "loss": 0.4886, + "step": 82960 + }, + { + "epoch": 4.1208900367537495, + "grad_norm": 0.134765625, + "learning_rate": 0.00047033277043806497, + "loss": 0.5248, + "step": 82970 + }, + { + "epoch": 4.12138670904937, + "grad_norm": 0.1416015625, + "learning_rate": 0.00047029303665441544, + "loss": 0.5313, + "step": 82980 + }, + { + "epoch": 4.121883381344989, + "grad_norm": 0.10107421875, + "learning_rate": 0.00047025330287076586, + "loss": 0.4949, + "step": 82990 + }, + { + "epoch": 4.122380053640608, + "grad_norm": 0.0966796875, + "learning_rate": 0.00047021356908711633, + "loss": 0.4992, + "step": 83000 + }, + { + "epoch": 4.122876725936227, + "grad_norm": 0.12890625, + "learning_rate": 0.0004701738353034668, + "loss": 0.541, + "step": 83010 + }, + { + "epoch": 4.123373398231847, + "grad_norm": 0.146484375, + "learning_rate": 0.0004701341015198172, + "loss": 0.5312, + "step": 83020 + }, + { + "epoch": 4.123870070527466, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004700943677361677, + "loss": 0.513, + "step": 83030 + }, + { + "epoch": 4.124366742823085, + "grad_norm": 0.1640625, + "learning_rate": 0.00047005463395251816, + "loss": 0.5379, + "step": 83040 + }, + { + "epoch": 4.124863415118704, + "grad_norm": 0.1796875, + "learning_rate": 0.00047001490016886863, + "loss": 0.508, + "step": 83050 + }, + { + "epoch": 4.125360087414324, + "grad_norm": 0.1337890625, + "learning_rate": 0.00046997516638521905, + "loss": 0.5071, + "step": 83060 + }, + { + "epoch": 4.125856759709944, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004699354326015695, + "loss": 0.4966, + "step": 83070 + }, + { + "epoch": 4.126353432005563, + "grad_norm": 0.1103515625, + "learning_rate": 0.00046989569881792, + "loss": 0.5498, + "step": 83080 + }, + { + "epoch": 4.126850104301182, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004698559650342704, + "loss": 0.5128, + "step": 83090 + }, + { + "epoch": 4.127346776596801, + "grad_norm": 0.126953125, + "learning_rate": 0.0004698162312506209, + "loss": 0.534, + "step": 83100 + }, + { + "epoch": 4.1278434488924205, + "grad_norm": 0.11962890625, + "learning_rate": 0.00046977649746697135, + "loss": 0.5198, + "step": 83110 + }, + { + "epoch": 4.12834012118804, + "grad_norm": 0.109375, + "learning_rate": 0.0004697367636833217, + "loss": 0.5052, + "step": 83120 + }, + { + "epoch": 4.12883679348366, + "grad_norm": 0.111328125, + "learning_rate": 0.0004696970298996722, + "loss": 0.528, + "step": 83130 + }, + { + "epoch": 4.129333465779279, + "grad_norm": 0.10107421875, + "learning_rate": 0.0004696572961160227, + "loss": 0.5558, + "step": 83140 + }, + { + "epoch": 4.129830138074898, + "grad_norm": 0.12890625, + "learning_rate": 0.0004696175623323731, + "loss": 0.5222, + "step": 83150 + }, + { + "epoch": 4.1303268103705175, + "grad_norm": 0.1044921875, + "learning_rate": 0.00046957782854872355, + "loss": 0.5299, + "step": 83160 + }, + { + "epoch": 4.130823482666137, + "grad_norm": 0.1533203125, + "learning_rate": 0.000469538094765074, + "loss": 0.5209, + "step": 83170 + }, + { + "epoch": 4.131320154961756, + "grad_norm": 0.11865234375, + "learning_rate": 0.00046949836098142444, + "loss": 0.528, + "step": 83180 + }, + { + "epoch": 4.131816827257375, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004694586271977749, + "loss": 0.5078, + "step": 83190 + }, + { + "epoch": 4.132313499552995, + "grad_norm": 0.142578125, + "learning_rate": 0.0004694188934141254, + "loss": 0.5392, + "step": 83200 + }, + { + "epoch": 4.1328101718486145, + "grad_norm": 0.158203125, + "learning_rate": 0.00046937915963047585, + "loss": 0.4914, + "step": 83210 + }, + { + "epoch": 4.133306844144234, + "grad_norm": 0.10693359375, + "learning_rate": 0.00046933942584682627, + "loss": 0.5129, + "step": 83220 + }, + { + "epoch": 4.133803516439853, + "grad_norm": 0.1044921875, + "learning_rate": 0.00046929969206317674, + "loss": 0.531, + "step": 83230 + }, + { + "epoch": 4.134300188735472, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004692599582795272, + "loss": 0.5036, + "step": 83240 + }, + { + "epoch": 4.1347968610310915, + "grad_norm": 0.099609375, + "learning_rate": 0.00046922022449587763, + "loss": 0.5092, + "step": 83250 + }, + { + "epoch": 4.135293533326711, + "grad_norm": 0.1435546875, + "learning_rate": 0.0004691804907122281, + "loss": 0.5096, + "step": 83260 + }, + { + "epoch": 4.135790205622331, + "grad_norm": 0.126953125, + "learning_rate": 0.00046914075692857857, + "loss": 0.5065, + "step": 83270 + }, + { + "epoch": 4.13628687791795, + "grad_norm": 0.11474609375, + "learning_rate": 0.000469101023144929, + "loss": 0.5523, + "step": 83280 + }, + { + "epoch": 4.136783550213569, + "grad_norm": 0.10986328125, + "learning_rate": 0.00046906128936127946, + "loss": 0.5279, + "step": 83290 + }, + { + "epoch": 4.1372802225091885, + "grad_norm": 0.10009765625, + "learning_rate": 0.00046902155557762993, + "loss": 0.5352, + "step": 83300 + }, + { + "epoch": 4.137776894804808, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004689818217939803, + "loss": 0.5355, + "step": 83310 + }, + { + "epoch": 4.138273567100427, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004689420880103308, + "loss": 0.563, + "step": 83320 + }, + { + "epoch": 4.138770239396046, + "grad_norm": 0.1044921875, + "learning_rate": 0.0004689023542266813, + "loss": 0.5084, + "step": 83330 + }, + { + "epoch": 4.139266911691665, + "grad_norm": 0.107421875, + "learning_rate": 0.00046886262044303176, + "loss": 0.5208, + "step": 83340 + }, + { + "epoch": 4.1397635839872855, + "grad_norm": 0.115234375, + "learning_rate": 0.0004688228866593821, + "loss": 0.5138, + "step": 83350 + }, + { + "epoch": 4.140260256282905, + "grad_norm": 0.1259765625, + "learning_rate": 0.00046878315287573265, + "loss": 0.5155, + "step": 83360 + }, + { + "epoch": 4.140756928578524, + "grad_norm": 0.11474609375, + "learning_rate": 0.0004687434190920831, + "loss": 0.5194, + "step": 83370 + }, + { + "epoch": 4.141253600874143, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004687036853084335, + "loss": 0.5232, + "step": 83380 + }, + { + "epoch": 4.141750273169762, + "grad_norm": 0.11279296875, + "learning_rate": 0.00046866395152478396, + "loss": 0.5192, + "step": 83390 + }, + { + "epoch": 4.142246945465382, + "grad_norm": 0.109375, + "learning_rate": 0.0004686242177411344, + "loss": 0.532, + "step": 83400 + }, + { + "epoch": 4.142743617761001, + "grad_norm": 0.13671875, + "learning_rate": 0.00046858448395748484, + "loss": 0.5369, + "step": 83410 + }, + { + "epoch": 4.143240290056621, + "grad_norm": 0.103515625, + "learning_rate": 0.0004685447501738353, + "loss": 0.5194, + "step": 83420 + }, + { + "epoch": 4.14373696235224, + "grad_norm": 0.11669921875, + "learning_rate": 0.0004685050163901858, + "loss": 0.5188, + "step": 83430 + }, + { + "epoch": 4.144233634647859, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004684652826065362, + "loss": 0.4746, + "step": 83440 + }, + { + "epoch": 4.144730306943479, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004684255488228867, + "loss": 0.5332, + "step": 83450 + }, + { + "epoch": 4.145226979239098, + "grad_norm": 0.1162109375, + "learning_rate": 0.00046838581503923715, + "loss": 0.4914, + "step": 83460 + }, + { + "epoch": 4.145723651534717, + "grad_norm": 0.10791015625, + "learning_rate": 0.00046834608125558756, + "loss": 0.5398, + "step": 83470 + }, + { + "epoch": 4.146220323830336, + "grad_norm": 0.11279296875, + "learning_rate": 0.00046830634747193803, + "loss": 0.5198, + "step": 83480 + }, + { + "epoch": 4.1467169961259565, + "grad_norm": 0.10791015625, + "learning_rate": 0.0004682666136882885, + "loss": 0.5076, + "step": 83490 + }, + { + "epoch": 4.147213668421576, + "grad_norm": 0.1103515625, + "learning_rate": 0.000468226879904639, + "loss": 0.5263, + "step": 83500 + }, + { + "epoch": 4.147710340717195, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004681871461209894, + "loss": 0.5078, + "step": 83510 + }, + { + "epoch": 4.148207013012814, + "grad_norm": 0.1162109375, + "learning_rate": 0.00046814741233733987, + "loss": 0.5411, + "step": 83520 + }, + { + "epoch": 4.148703685308433, + "grad_norm": 0.11572265625, + "learning_rate": 0.00046810767855369034, + "loss": 0.5165, + "step": 83530 + }, + { + "epoch": 4.149200357604053, + "grad_norm": 0.142578125, + "learning_rate": 0.0004680679447700407, + "loss": 0.5239, + "step": 83540 + }, + { + "epoch": 4.149697029899672, + "grad_norm": 0.0986328125, + "learning_rate": 0.0004680282109863912, + "loss": 0.5162, + "step": 83550 + }, + { + "epoch": 4.150193702195292, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004679884772027417, + "loss": 0.5268, + "step": 83560 + }, + { + "epoch": 4.150690374490911, + "grad_norm": 0.1103515625, + "learning_rate": 0.00046794874341909206, + "loss": 0.4695, + "step": 83570 + }, + { + "epoch": 4.15118704678653, + "grad_norm": 0.1181640625, + "learning_rate": 0.00046790900963544253, + "loss": 0.5221, + "step": 83580 + }, + { + "epoch": 4.15168371908215, + "grad_norm": 0.11767578125, + "learning_rate": 0.00046786927585179306, + "loss": 0.5106, + "step": 83590 + }, + { + "epoch": 4.152180391377769, + "grad_norm": 0.12255859375, + "learning_rate": 0.0004678295420681434, + "loss": 0.5034, + "step": 83600 + }, + { + "epoch": 4.152677063673388, + "grad_norm": 0.111328125, + "learning_rate": 0.0004677898082844939, + "loss": 0.5582, + "step": 83610 + }, + { + "epoch": 4.153173735969007, + "grad_norm": 0.11181640625, + "learning_rate": 0.00046775007450084436, + "loss": 0.5301, + "step": 83620 + }, + { + "epoch": 4.153670408264627, + "grad_norm": 0.1318359375, + "learning_rate": 0.0004677103407171949, + "loss": 0.5215, + "step": 83630 + }, + { + "epoch": 4.154167080560247, + "grad_norm": 0.130859375, + "learning_rate": 0.00046767060693354525, + "loss": 0.512, + "step": 83640 + }, + { + "epoch": 4.154663752855866, + "grad_norm": 0.095703125, + "learning_rate": 0.0004676308731498957, + "loss": 0.5255, + "step": 83650 + }, + { + "epoch": 4.155160425151485, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004675911393662462, + "loss": 0.5147, + "step": 83660 + }, + { + "epoch": 4.155657097447104, + "grad_norm": 0.1494140625, + "learning_rate": 0.0004675514055825966, + "loss": 0.5296, + "step": 83670 + }, + { + "epoch": 4.1561537697427235, + "grad_norm": 0.09912109375, + "learning_rate": 0.0004675116717989471, + "loss": 0.5289, + "step": 83680 + }, + { + "epoch": 4.156650442038343, + "grad_norm": 0.1064453125, + "learning_rate": 0.00046747193801529755, + "loss": 0.5453, + "step": 83690 + }, + { + "epoch": 4.157147114333963, + "grad_norm": 0.15625, + "learning_rate": 0.00046743220423164797, + "loss": 0.5377, + "step": 83700 + }, + { + "epoch": 4.157643786629582, + "grad_norm": 0.0986328125, + "learning_rate": 0.00046739247044799844, + "loss": 0.4831, + "step": 83710 + }, + { + "epoch": 4.158140458925201, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004673527366643489, + "loss": 0.5165, + "step": 83720 + }, + { + "epoch": 4.158637131220821, + "grad_norm": 0.150390625, + "learning_rate": 0.0004673130028806993, + "loss": 0.532, + "step": 83730 + }, + { + "epoch": 4.15913380351644, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004672732690970498, + "loss": 0.5285, + "step": 83740 + }, + { + "epoch": 4.159630475812059, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004672335353134003, + "loss": 0.5117, + "step": 83750 + }, + { + "epoch": 4.160127148107678, + "grad_norm": 0.1083984375, + "learning_rate": 0.00046719380152975064, + "loss": 0.5011, + "step": 83760 + }, + { + "epoch": 4.1606238204032975, + "grad_norm": 0.146484375, + "learning_rate": 0.0004671540677461011, + "loss": 0.541, + "step": 83770 + }, + { + "epoch": 4.161120492698918, + "grad_norm": 0.12158203125, + "learning_rate": 0.00046711433396245163, + "loss": 0.5228, + "step": 83780 + }, + { + "epoch": 4.161617164994537, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004670746001788021, + "loss": 0.4957, + "step": 83790 + }, + { + "epoch": 4.162113837290156, + "grad_norm": 0.11865234375, + "learning_rate": 0.00046703486639515247, + "loss": 0.4934, + "step": 83800 + }, + { + "epoch": 4.162610509585775, + "grad_norm": 0.1572265625, + "learning_rate": 0.00046699513261150294, + "loss": 0.5274, + "step": 83810 + }, + { + "epoch": 4.1631071818813945, + "grad_norm": 0.1279296875, + "learning_rate": 0.00046695539882785346, + "loss": 0.5108, + "step": 83820 + }, + { + "epoch": 4.163603854177014, + "grad_norm": 0.150390625, + "learning_rate": 0.00046691566504420383, + "loss": 0.5514, + "step": 83830 + }, + { + "epoch": 4.164100526472633, + "grad_norm": 0.1044921875, + "learning_rate": 0.0004668759312605543, + "loss": 0.5133, + "step": 83840 + }, + { + "epoch": 4.164597198768253, + "grad_norm": 0.11962890625, + "learning_rate": 0.00046683619747690477, + "loss": 0.5386, + "step": 83850 + }, + { + "epoch": 4.165093871063872, + "grad_norm": 0.1083984375, + "learning_rate": 0.0004667964636932552, + "loss": 0.5187, + "step": 83860 + }, + { + "epoch": 4.1655905433594915, + "grad_norm": 0.1484375, + "learning_rate": 0.00046675672990960566, + "loss": 0.5123, + "step": 83870 + }, + { + "epoch": 4.166087215655111, + "grad_norm": 0.11962890625, + "learning_rate": 0.00046671699612595613, + "loss": 0.5165, + "step": 83880 + }, + { + "epoch": 4.16658388795073, + "grad_norm": 0.166015625, + "learning_rate": 0.00046667726234230655, + "loss": 0.5761, + "step": 83890 + }, + { + "epoch": 4.167080560246349, + "grad_norm": 0.1044921875, + "learning_rate": 0.000466637528558657, + "loss": 0.5479, + "step": 83900 + }, + { + "epoch": 4.167577232541968, + "grad_norm": 0.10205078125, + "learning_rate": 0.0004665977947750075, + "loss": 0.5462, + "step": 83910 + }, + { + "epoch": 4.1680739048375886, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004665580609913579, + "loss": 0.5091, + "step": 83920 + }, + { + "epoch": 4.168570577133208, + "grad_norm": 0.099609375, + "learning_rate": 0.0004665183272077084, + "loss": 0.5157, + "step": 83930 + }, + { + "epoch": 4.169067249428827, + "grad_norm": 0.115234375, + "learning_rate": 0.00046647859342405885, + "loss": 0.5151, + "step": 83940 + }, + { + "epoch": 4.169563921724446, + "grad_norm": 0.126953125, + "learning_rate": 0.0004664388596404093, + "loss": 0.5114, + "step": 83950 + }, + { + "epoch": 4.1700605940200655, + "grad_norm": 0.11083984375, + "learning_rate": 0.00046639912585675974, + "loss": 0.5499, + "step": 83960 + }, + { + "epoch": 4.170557266315685, + "grad_norm": 0.130859375, + "learning_rate": 0.0004663593920731102, + "loss": 0.5134, + "step": 83970 + }, + { + "epoch": 4.171053938611304, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004663196582894607, + "loss": 0.53, + "step": 83980 + }, + { + "epoch": 4.171550610906924, + "grad_norm": 0.10986328125, + "learning_rate": 0.00046627992450581104, + "loss": 0.5353, + "step": 83990 + }, + { + "epoch": 4.172047283202543, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004662401907221615, + "loss": 0.5044, + "step": 84000 + }, + { + "epoch": 4.1725439554981625, + "grad_norm": 0.115234375, + "learning_rate": 0.00046620045693851204, + "loss": 0.5158, + "step": 84010 + }, + { + "epoch": 4.173040627793782, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004661607231548624, + "loss": 0.5087, + "step": 84020 + }, + { + "epoch": 4.173537300089401, + "grad_norm": 0.1669921875, + "learning_rate": 0.0004661209893712129, + "loss": 0.5213, + "step": 84030 + }, + { + "epoch": 4.17403397238502, + "grad_norm": 0.10888671875, + "learning_rate": 0.00046608125558756335, + "loss": 0.5164, + "step": 84040 + }, + { + "epoch": 4.174530644680639, + "grad_norm": 0.10302734375, + "learning_rate": 0.00046604152180391376, + "loss": 0.5075, + "step": 84050 + }, + { + "epoch": 4.175027316976259, + "grad_norm": 0.11083984375, + "learning_rate": 0.00046600178802026423, + "loss": 0.4927, + "step": 84060 + }, + { + "epoch": 4.175523989271879, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004659620542366147, + "loss": 0.5201, + "step": 84070 + }, + { + "epoch": 4.176020661567498, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004659223204529652, + "loss": 0.5072, + "step": 84080 + }, + { + "epoch": 4.176517333863117, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004658825866693156, + "loss": 0.5163, + "step": 84090 + }, + { + "epoch": 4.177014006158736, + "grad_norm": 0.1162109375, + "learning_rate": 0.00046584285288566607, + "loss": 0.5076, + "step": 84100 + }, + { + "epoch": 4.177510678454356, + "grad_norm": 0.0986328125, + "learning_rate": 0.00046580311910201654, + "loss": 0.4845, + "step": 84110 + }, + { + "epoch": 4.178007350749975, + "grad_norm": 0.107421875, + "learning_rate": 0.00046576338531836695, + "loss": 0.5261, + "step": 84120 + }, + { + "epoch": 4.178504023045594, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004657236515347174, + "loss": 0.5091, + "step": 84130 + }, + { + "epoch": 4.179000695341214, + "grad_norm": 0.09814453125, + "learning_rate": 0.0004656839177510679, + "loss": 0.5522, + "step": 84140 + }, + { + "epoch": 4.179497367636833, + "grad_norm": 0.0986328125, + "learning_rate": 0.0004656441839674183, + "loss": 0.5152, + "step": 84150 + }, + { + "epoch": 4.179994039932453, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004656044501837688, + "loss": 0.5298, + "step": 84160 + }, + { + "epoch": 4.180490712228072, + "grad_norm": 0.1103515625, + "learning_rate": 0.00046556471640011926, + "loss": 0.514, + "step": 84170 + }, + { + "epoch": 4.180987384523691, + "grad_norm": 0.09912109375, + "learning_rate": 0.0004655249826164696, + "loss": 0.5026, + "step": 84180 + }, + { + "epoch": 4.18148405681931, + "grad_norm": 0.12353515625, + "learning_rate": 0.00046548524883282015, + "loss": 0.5257, + "step": 84190 + }, + { + "epoch": 4.18198072911493, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004654455150491706, + "loss": 0.5155, + "step": 84200 + }, + { + "epoch": 4.18247740141055, + "grad_norm": 0.11572265625, + "learning_rate": 0.000465405781265521, + "loss": 0.5175, + "step": 84210 + }, + { + "epoch": 4.182974073706169, + "grad_norm": 0.109375, + "learning_rate": 0.00046536604748187145, + "loss": 0.5337, + "step": 84220 + }, + { + "epoch": 4.183470746001788, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004653263136982219, + "loss": 0.5357, + "step": 84230 + }, + { + "epoch": 4.183967418297407, + "grad_norm": 0.1044921875, + "learning_rate": 0.00046528657991457245, + "loss": 0.5226, + "step": 84240 + }, + { + "epoch": 4.184464090593027, + "grad_norm": 0.16015625, + "learning_rate": 0.0004652468461309228, + "loss": 0.5126, + "step": 84250 + }, + { + "epoch": 4.184960762888646, + "grad_norm": 0.10205078125, + "learning_rate": 0.0004652071123472733, + "loss": 0.5091, + "step": 84260 + }, + { + "epoch": 4.185457435184265, + "grad_norm": 0.1259765625, + "learning_rate": 0.00046516737856362375, + "loss": 0.5189, + "step": 84270 + }, + { + "epoch": 4.185954107479885, + "grad_norm": 0.10986328125, + "learning_rate": 0.00046512764477997417, + "loss": 0.5327, + "step": 84280 + }, + { + "epoch": 4.186450779775504, + "grad_norm": 0.10791015625, + "learning_rate": 0.00046508791099632464, + "loss": 0.5098, + "step": 84290 + }, + { + "epoch": 4.186947452071124, + "grad_norm": 0.1455078125, + "learning_rate": 0.0004650481772126751, + "loss": 0.5159, + "step": 84300 + }, + { + "epoch": 4.187444124366743, + "grad_norm": 0.1044921875, + "learning_rate": 0.00046500844342902553, + "loss": 0.5371, + "step": 84310 + }, + { + "epoch": 4.187940796662362, + "grad_norm": 0.115234375, + "learning_rate": 0.000464968709645376, + "loss": 0.5184, + "step": 84320 + }, + { + "epoch": 4.188437468957981, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004649289758617265, + "loss": 0.4931, + "step": 84330 + }, + { + "epoch": 4.1889341412536005, + "grad_norm": 0.10205078125, + "learning_rate": 0.0004648892420780769, + "loss": 0.5019, + "step": 84340 + }, + { + "epoch": 4.189430813549221, + "grad_norm": 0.126953125, + "learning_rate": 0.00046484950829442736, + "loss": 0.5593, + "step": 84350 + }, + { + "epoch": 4.18992748584484, + "grad_norm": 0.10791015625, + "learning_rate": 0.00046480977451077783, + "loss": 0.5058, + "step": 84360 + }, + { + "epoch": 4.190424158140459, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004647700407271282, + "loss": 0.526, + "step": 84370 + }, + { + "epoch": 4.190920830436078, + "grad_norm": 0.130859375, + "learning_rate": 0.0004647303069434787, + "loss": 0.5303, + "step": 84380 + }, + { + "epoch": 4.1914175027316976, + "grad_norm": 0.1591796875, + "learning_rate": 0.0004646905731598292, + "loss": 0.5006, + "step": 84390 + }, + { + "epoch": 4.191914175027317, + "grad_norm": 0.1064453125, + "learning_rate": 0.00046465083937617966, + "loss": 0.5384, + "step": 84400 + }, + { + "epoch": 4.192410847322936, + "grad_norm": 0.10107421875, + "learning_rate": 0.00046461110559253003, + "loss": 0.539, + "step": 84410 + }, + { + "epoch": 4.192907519618556, + "grad_norm": 0.138671875, + "learning_rate": 0.00046457137180888055, + "loss": 0.5146, + "step": 84420 + }, + { + "epoch": 4.193404191914175, + "grad_norm": 0.130859375, + "learning_rate": 0.000464531638025231, + "loss": 0.5103, + "step": 84430 + }, + { + "epoch": 4.193900864209795, + "grad_norm": 0.111328125, + "learning_rate": 0.0004644919042415814, + "loss": 0.5426, + "step": 84440 + }, + { + "epoch": 4.194397536505414, + "grad_norm": 0.1083984375, + "learning_rate": 0.00046445217045793186, + "loss": 0.5138, + "step": 84450 + }, + { + "epoch": 4.194894208801033, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004644124366742824, + "loss": 0.521, + "step": 84460 + }, + { + "epoch": 4.195390881096652, + "grad_norm": 0.11474609375, + "learning_rate": 0.00046437270289063275, + "loss": 0.4961, + "step": 84470 + }, + { + "epoch": 4.1958875533922715, + "grad_norm": 0.12890625, + "learning_rate": 0.0004643329691069832, + "loss": 0.5285, + "step": 84480 + }, + { + "epoch": 4.196384225687891, + "grad_norm": 0.12255859375, + "learning_rate": 0.0004642932353233337, + "loss": 0.5304, + "step": 84490 + }, + { + "epoch": 4.196880897983511, + "grad_norm": 0.158203125, + "learning_rate": 0.0004642535015396841, + "loss": 0.5183, + "step": 84500 + }, + { + "epoch": 4.19737757027913, + "grad_norm": 0.125, + "learning_rate": 0.0004642137677560346, + "loss": 0.5008, + "step": 84510 + }, + { + "epoch": 4.197874242574749, + "grad_norm": 0.1298828125, + "learning_rate": 0.00046417403397238505, + "loss": 0.527, + "step": 84520 + }, + { + "epoch": 4.1983709148703685, + "grad_norm": 0.1767578125, + "learning_rate": 0.0004641343001887355, + "loss": 0.5472, + "step": 84530 + }, + { + "epoch": 4.198867587165988, + "grad_norm": 0.10498046875, + "learning_rate": 0.00046409456640508594, + "loss": 0.5314, + "step": 84540 + }, + { + "epoch": 4.199364259461607, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004640548326214364, + "loss": 0.5252, + "step": 84550 + }, + { + "epoch": 4.199860931757226, + "grad_norm": 0.099609375, + "learning_rate": 0.0004640150988377869, + "loss": 0.5077, + "step": 84560 + }, + { + "epoch": 4.200357604052846, + "grad_norm": 0.119140625, + "learning_rate": 0.0004639753650541373, + "loss": 0.5551, + "step": 84570 + }, + { + "epoch": 4.2008542763484655, + "grad_norm": 0.1123046875, + "learning_rate": 0.00046393563127048777, + "loss": 0.5278, + "step": 84580 + }, + { + "epoch": 4.201350948644085, + "grad_norm": 0.11865234375, + "learning_rate": 0.00046389589748683824, + "loss": 0.5052, + "step": 84590 + }, + { + "epoch": 4.201847620939704, + "grad_norm": 0.154296875, + "learning_rate": 0.0004638561637031886, + "loss": 0.5159, + "step": 84600 + }, + { + "epoch": 4.202344293235323, + "grad_norm": 0.10888671875, + "learning_rate": 0.00046381642991953913, + "loss": 0.536, + "step": 84610 + }, + { + "epoch": 4.202840965530942, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004637766961358896, + "loss": 0.5078, + "step": 84620 + }, + { + "epoch": 4.203337637826562, + "grad_norm": 0.11376953125, + "learning_rate": 0.00046373696235223996, + "loss": 0.5414, + "step": 84630 + }, + { + "epoch": 4.203834310122182, + "grad_norm": 0.12109375, + "learning_rate": 0.00046369722856859044, + "loss": 0.5545, + "step": 84640 + }, + { + "epoch": 4.204330982417801, + "grad_norm": 0.1416015625, + "learning_rate": 0.00046365749478494096, + "loss": 0.5656, + "step": 84650 + }, + { + "epoch": 4.20482765471342, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004636177610012913, + "loss": 0.5252, + "step": 84660 + }, + { + "epoch": 4.2053243270090395, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004635780272176418, + "loss": 0.5175, + "step": 84670 + }, + { + "epoch": 4.205820999304659, + "grad_norm": 0.12060546875, + "learning_rate": 0.00046353829343399227, + "loss": 0.5261, + "step": 84680 + }, + { + "epoch": 4.206317671600278, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004634985596503428, + "loss": 0.5269, + "step": 84690 + }, + { + "epoch": 4.206814343895897, + "grad_norm": 0.1220703125, + "learning_rate": 0.00046345882586669315, + "loss": 0.5254, + "step": 84700 + }, + { + "epoch": 4.207311016191517, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004634190920830436, + "loss": 0.5144, + "step": 84710 + }, + { + "epoch": 4.2078076884871365, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004633793582993941, + "loss": 0.5286, + "step": 84720 + }, + { + "epoch": 4.208304360782756, + "grad_norm": 0.12353515625, + "learning_rate": 0.0004633396245157445, + "loss": 0.5184, + "step": 84730 + }, + { + "epoch": 4.208801033078375, + "grad_norm": 0.1484375, + "learning_rate": 0.000463299890732095, + "loss": 0.5175, + "step": 84740 + }, + { + "epoch": 4.209297705373994, + "grad_norm": 0.0986328125, + "learning_rate": 0.00046326015694844546, + "loss": 0.4948, + "step": 84750 + }, + { + "epoch": 4.209794377669613, + "grad_norm": 0.10546875, + "learning_rate": 0.0004632204231647959, + "loss": 0.5161, + "step": 84760 + }, + { + "epoch": 4.210291049965233, + "grad_norm": 0.09375, + "learning_rate": 0.00046318068938114635, + "loss": 0.5169, + "step": 84770 + }, + { + "epoch": 4.210787722260852, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004631409555974968, + "loss": 0.5402, + "step": 84780 + }, + { + "epoch": 4.211284394556472, + "grad_norm": 0.1259765625, + "learning_rate": 0.00046310122181384723, + "loss": 0.5178, + "step": 84790 + }, + { + "epoch": 4.211781066852091, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004630614880301977, + "loss": 0.5004, + "step": 84800 + }, + { + "epoch": 4.21227773914771, + "grad_norm": 0.158203125, + "learning_rate": 0.0004630217542465482, + "loss": 0.5539, + "step": 84810 + }, + { + "epoch": 4.21277441144333, + "grad_norm": 0.11572265625, + "learning_rate": 0.00046298202046289854, + "loss": 0.5342, + "step": 84820 + }, + { + "epoch": 4.213271083738949, + "grad_norm": 0.0986328125, + "learning_rate": 0.000462942286679249, + "loss": 0.4905, + "step": 84830 + }, + { + "epoch": 4.213767756034568, + "grad_norm": 0.115234375, + "learning_rate": 0.00046290255289559954, + "loss": 0.5354, + "step": 84840 + }, + { + "epoch": 4.214264428330187, + "grad_norm": 0.1982421875, + "learning_rate": 0.00046286281911195, + "loss": 0.522, + "step": 84850 + }, + { + "epoch": 4.214761100625807, + "grad_norm": 0.1220703125, + "learning_rate": 0.00046282308532830037, + "loss": 0.5274, + "step": 84860 + }, + { + "epoch": 4.215257772921427, + "grad_norm": 0.1005859375, + "learning_rate": 0.00046278335154465084, + "loss": 0.4981, + "step": 84870 + }, + { + "epoch": 4.215754445217046, + "grad_norm": 0.1494140625, + "learning_rate": 0.00046274361776100137, + "loss": 0.4926, + "step": 84880 + }, + { + "epoch": 4.216251117512665, + "grad_norm": 0.10595703125, + "learning_rate": 0.00046270388397735173, + "loss": 0.5332, + "step": 84890 + }, + { + "epoch": 4.216747789808284, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004626641501937022, + "loss": 0.5076, + "step": 84900 + }, + { + "epoch": 4.217244462103904, + "grad_norm": 0.0986328125, + "learning_rate": 0.0004626244164100527, + "loss": 0.514, + "step": 84910 + }, + { + "epoch": 4.217741134399523, + "grad_norm": 0.1240234375, + "learning_rate": 0.0004625846826264031, + "loss": 0.5241, + "step": 84920 + }, + { + "epoch": 4.218237806695143, + "grad_norm": 0.11962890625, + "learning_rate": 0.00046254494884275356, + "loss": 0.5204, + "step": 84930 + }, + { + "epoch": 4.218734478990762, + "grad_norm": 0.1435546875, + "learning_rate": 0.00046250521505910403, + "loss": 0.5244, + "step": 84940 + }, + { + "epoch": 4.219231151286381, + "grad_norm": 0.1201171875, + "learning_rate": 0.00046246548127545445, + "loss": 0.5049, + "step": 84950 + }, + { + "epoch": 4.219727823582001, + "grad_norm": 0.09619140625, + "learning_rate": 0.0004624257474918049, + "loss": 0.5101, + "step": 84960 + }, + { + "epoch": 4.22022449587762, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004623860137081554, + "loss": 0.5321, + "step": 84970 + }, + { + "epoch": 4.220721168173239, + "grad_norm": 0.1435546875, + "learning_rate": 0.00046234627992450586, + "loss": 0.5455, + "step": 84980 + }, + { + "epoch": 4.221217840468858, + "grad_norm": 0.12158203125, + "learning_rate": 0.0004623065461408563, + "loss": 0.4932, + "step": 84990 + }, + { + "epoch": 4.221714512764478, + "grad_norm": 0.1484375, + "learning_rate": 0.00046226681235720675, + "loss": 0.5315, + "step": 85000 + }, + { + "epoch": 4.222211185060098, + "grad_norm": 0.130859375, + "learning_rate": 0.0004622270785735572, + "loss": 0.5236, + "step": 85010 + }, + { + "epoch": 4.222707857355717, + "grad_norm": 0.10302734375, + "learning_rate": 0.00046218734478990764, + "loss": 0.5112, + "step": 85020 + }, + { + "epoch": 4.223204529651336, + "grad_norm": 0.12255859375, + "learning_rate": 0.0004621476110062581, + "loss": 0.513, + "step": 85030 + }, + { + "epoch": 4.223701201946955, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004621078772226086, + "loss": 0.4923, + "step": 85040 + }, + { + "epoch": 4.2241978742425745, + "grad_norm": 0.10888671875, + "learning_rate": 0.00046206814343895895, + "loss": 0.5019, + "step": 85050 + }, + { + "epoch": 4.224694546538194, + "grad_norm": 0.13671875, + "learning_rate": 0.0004620284096553095, + "loss": 0.4975, + "step": 85060 + }, + { + "epoch": 4.225191218833814, + "grad_norm": 0.11279296875, + "learning_rate": 0.00046198867587165994, + "loss": 0.5384, + "step": 85070 + }, + { + "epoch": 4.225687891129433, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004619489420880103, + "loss": 0.5359, + "step": 85080 + }, + { + "epoch": 4.226184563425052, + "grad_norm": 0.1328125, + "learning_rate": 0.0004619092083043608, + "loss": 0.5235, + "step": 85090 + }, + { + "epoch": 4.226681235720672, + "grad_norm": 0.091796875, + "learning_rate": 0.00046186947452071125, + "loss": 0.5044, + "step": 85100 + }, + { + "epoch": 4.227177908016291, + "grad_norm": 0.10546875, + "learning_rate": 0.00046182974073706167, + "loss": 0.5421, + "step": 85110 + }, + { + "epoch": 4.22767458031191, + "grad_norm": 0.12109375, + "learning_rate": 0.00046179000695341214, + "loss": 0.5294, + "step": 85120 + }, + { + "epoch": 4.228171252607529, + "grad_norm": 0.12109375, + "learning_rate": 0.0004617502731697626, + "loss": 0.5175, + "step": 85130 + }, + { + "epoch": 4.228667924903149, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004617105393861131, + "loss": 0.5488, + "step": 85140 + }, + { + "epoch": 4.229164597198769, + "grad_norm": 0.1611328125, + "learning_rate": 0.0004616708056024635, + "loss": 0.4906, + "step": 85150 + }, + { + "epoch": 4.229661269494388, + "grad_norm": 0.10595703125, + "learning_rate": 0.00046163107181881397, + "loss": 0.5223, + "step": 85160 + }, + { + "epoch": 4.230157941790007, + "grad_norm": 0.11865234375, + "learning_rate": 0.00046159133803516444, + "loss": 0.5071, + "step": 85170 + }, + { + "epoch": 4.230654614085626, + "grad_norm": 0.10546875, + "learning_rate": 0.00046155160425151486, + "loss": 0.5137, + "step": 85180 + }, + { + "epoch": 4.2311512863812455, + "grad_norm": 0.11083984375, + "learning_rate": 0.00046151187046786533, + "loss": 0.5056, + "step": 85190 + }, + { + "epoch": 4.231647958676865, + "grad_norm": 0.10107421875, + "learning_rate": 0.0004614721366842158, + "loss": 0.505, + "step": 85200 + }, + { + "epoch": 4.232144630972484, + "grad_norm": 0.12060546875, + "learning_rate": 0.0004614324029005662, + "loss": 0.5275, + "step": 85210 + }, + { + "epoch": 4.232641303268104, + "grad_norm": 0.169921875, + "learning_rate": 0.0004613926691169167, + "loss": 0.5189, + "step": 85220 + }, + { + "epoch": 4.233137975563723, + "grad_norm": 0.12255859375, + "learning_rate": 0.00046135293533326716, + "loss": 0.5391, + "step": 85230 + }, + { + "epoch": 4.2336346478593425, + "grad_norm": 0.1484375, + "learning_rate": 0.0004613132015496175, + "loss": 0.5331, + "step": 85240 + }, + { + "epoch": 4.234131320154962, + "grad_norm": 0.1083984375, + "learning_rate": 0.00046127346776596805, + "loss": 0.5216, + "step": 85250 + }, + { + "epoch": 4.234627992450581, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004612337339823185, + "loss": 0.5352, + "step": 85260 + }, + { + "epoch": 4.2351246647462, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004611940001986689, + "loss": 0.5044, + "step": 85270 + }, + { + "epoch": 4.235621337041819, + "grad_norm": 0.0966796875, + "learning_rate": 0.00046115426641501936, + "loss": 0.5219, + "step": 85280 + }, + { + "epoch": 4.2361180093374395, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004611145326313699, + "loss": 0.5064, + "step": 85290 + }, + { + "epoch": 4.236614681633059, + "grad_norm": 0.10498046875, + "learning_rate": 0.00046107479884772035, + "loss": 0.5503, + "step": 85300 + }, + { + "epoch": 4.237111353928678, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004610350650640707, + "loss": 0.5075, + "step": 85310 + }, + { + "epoch": 4.237608026224297, + "grad_norm": 0.166015625, + "learning_rate": 0.0004609953312804212, + "loss": 0.5491, + "step": 85320 + }, + { + "epoch": 4.2381046985199164, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004609555974967717, + "loss": 0.5304, + "step": 85330 + }, + { + "epoch": 4.238601370815536, + "grad_norm": 0.09912109375, + "learning_rate": 0.0004609158637131221, + "loss": 0.5417, + "step": 85340 + }, + { + "epoch": 4.239098043111155, + "grad_norm": 0.10205078125, + "learning_rate": 0.00046087612992947255, + "loss": 0.5318, + "step": 85350 + }, + { + "epoch": 4.239594715406775, + "grad_norm": 0.1376953125, + "learning_rate": 0.000460836396145823, + "loss": 0.5274, + "step": 85360 + }, + { + "epoch": 4.240091387702394, + "grad_norm": 0.11572265625, + "learning_rate": 0.00046079666236217343, + "loss": 0.5192, + "step": 85370 + }, + { + "epoch": 4.2405880599980135, + "grad_norm": 0.099609375, + "learning_rate": 0.0004607569285785239, + "loss": 0.5049, + "step": 85380 + }, + { + "epoch": 4.241084732293633, + "grad_norm": 0.11865234375, + "learning_rate": 0.0004607171947948744, + "loss": 0.5139, + "step": 85390 + }, + { + "epoch": 4.241581404589252, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004606774610112248, + "loss": 0.5182, + "step": 85400 + }, + { + "epoch": 4.242078076884871, + "grad_norm": 0.109375, + "learning_rate": 0.00046063772722757527, + "loss": 0.5096, + "step": 85410 + }, + { + "epoch": 4.24257474918049, + "grad_norm": 0.1376953125, + "learning_rate": 0.00046059799344392574, + "loss": 0.5349, + "step": 85420 + }, + { + "epoch": 4.24307142147611, + "grad_norm": 0.134765625, + "learning_rate": 0.0004605582596602762, + "loss": 0.5377, + "step": 85430 + }, + { + "epoch": 4.24356809377173, + "grad_norm": 0.134765625, + "learning_rate": 0.0004605185258766266, + "loss": 0.5205, + "step": 85440 + }, + { + "epoch": 4.244064766067349, + "grad_norm": 0.107421875, + "learning_rate": 0.0004604787920929771, + "loss": 0.5251, + "step": 85450 + }, + { + "epoch": 4.244561438362968, + "grad_norm": 0.1201171875, + "learning_rate": 0.00046043905830932757, + "loss": 0.5031, + "step": 85460 + }, + { + "epoch": 4.245058110658587, + "grad_norm": 0.1240234375, + "learning_rate": 0.00046039932452567793, + "loss": 0.5041, + "step": 85470 + }, + { + "epoch": 4.245554782954207, + "grad_norm": 0.0986328125, + "learning_rate": 0.00046035959074202846, + "loss": 0.5095, + "step": 85480 + }, + { + "epoch": 4.246051455249826, + "grad_norm": 0.11767578125, + "learning_rate": 0.00046031985695837893, + "loss": 0.5163, + "step": 85490 + }, + { + "epoch": 4.246548127545445, + "grad_norm": 0.12353515625, + "learning_rate": 0.0004602801231747293, + "loss": 0.5149, + "step": 85500 + }, + { + "epoch": 4.247044799841065, + "grad_norm": 0.123046875, + "learning_rate": 0.00046024038939107976, + "loss": 0.523, + "step": 85510 + }, + { + "epoch": 4.247541472136684, + "grad_norm": 0.11328125, + "learning_rate": 0.0004602006556074303, + "loss": 0.5251, + "step": 85520 + }, + { + "epoch": 4.248038144432304, + "grad_norm": 0.09716796875, + "learning_rate": 0.00046016092182378065, + "loss": 0.4988, + "step": 85530 + }, + { + "epoch": 4.248534816727923, + "grad_norm": 0.126953125, + "learning_rate": 0.0004601211880401311, + "loss": 0.4956, + "step": 85540 + }, + { + "epoch": 4.249031489023542, + "grad_norm": 0.12060546875, + "learning_rate": 0.0004600814542564816, + "loss": 0.5315, + "step": 85550 + }, + { + "epoch": 4.249528161319161, + "grad_norm": 0.11181640625, + "learning_rate": 0.000460041720472832, + "loss": 0.5234, + "step": 85560 + }, + { + "epoch": 4.250024833614781, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004600019866891825, + "loss": 0.5087, + "step": 85570 + }, + { + "epoch": 4.250521505910401, + "grad_norm": 0.1083984375, + "learning_rate": 0.00045996225290553295, + "loss": 0.4927, + "step": 85580 + }, + { + "epoch": 4.25101817820602, + "grad_norm": 0.1513671875, + "learning_rate": 0.0004599225191218834, + "loss": 0.5326, + "step": 85590 + }, + { + "epoch": 4.251514850501639, + "grad_norm": 0.10302734375, + "learning_rate": 0.00045988278533823384, + "loss": 0.4948, + "step": 85600 + }, + { + "epoch": 4.252011522797258, + "grad_norm": 0.1171875, + "learning_rate": 0.0004598430515545843, + "loss": 0.521, + "step": 85610 + }, + { + "epoch": 4.252508195092878, + "grad_norm": 0.115234375, + "learning_rate": 0.0004598033177709348, + "loss": 0.4986, + "step": 85620 + }, + { + "epoch": 4.253004867388497, + "grad_norm": 0.1044921875, + "learning_rate": 0.0004597635839872852, + "loss": 0.512, + "step": 85630 + }, + { + "epoch": 4.253501539684116, + "grad_norm": 0.15234375, + "learning_rate": 0.0004597238502036357, + "loss": 0.5244, + "step": 85640 + }, + { + "epoch": 4.253998211979736, + "grad_norm": 0.107421875, + "learning_rate": 0.00045968411641998614, + "loss": 0.4869, + "step": 85650 + }, + { + "epoch": 4.254494884275355, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004596443826363365, + "loss": 0.5067, + "step": 85660 + }, + { + "epoch": 4.254991556570975, + "grad_norm": 0.12060546875, + "learning_rate": 0.00045960464885268703, + "loss": 0.5391, + "step": 85670 + }, + { + "epoch": 4.255488228866594, + "grad_norm": 0.15625, + "learning_rate": 0.0004595649150690375, + "loss": 0.4976, + "step": 85680 + }, + { + "epoch": 4.255984901162213, + "grad_norm": 0.12451171875, + "learning_rate": 0.00045952518128538787, + "loss": 0.5479, + "step": 85690 + }, + { + "epoch": 4.256481573457832, + "grad_norm": 0.0986328125, + "learning_rate": 0.00045948544750173834, + "loss": 0.5283, + "step": 85700 + }, + { + "epoch": 4.2569782457534515, + "grad_norm": 0.09228515625, + "learning_rate": 0.00045944571371808886, + "loss": 0.5109, + "step": 85710 + }, + { + "epoch": 4.257474918049072, + "grad_norm": 0.1005859375, + "learning_rate": 0.00045940597993443934, + "loss": 0.5107, + "step": 85720 + }, + { + "epoch": 4.257971590344691, + "grad_norm": 0.216796875, + "learning_rate": 0.0004593662461507897, + "loss": 0.5134, + "step": 85730 + }, + { + "epoch": 4.25846826264031, + "grad_norm": 0.12451171875, + "learning_rate": 0.00045932651236714017, + "loss": 0.5107, + "step": 85740 + }, + { + "epoch": 4.258964934935929, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004592867785834907, + "loss": 0.5183, + "step": 85750 + }, + { + "epoch": 4.2594616072315485, + "grad_norm": 0.11767578125, + "learning_rate": 0.00045924704479984106, + "loss": 0.54, + "step": 85760 + }, + { + "epoch": 4.259958279527168, + "grad_norm": 0.10986328125, + "learning_rate": 0.00045920731101619153, + "loss": 0.5028, + "step": 85770 + }, + { + "epoch": 4.260454951822787, + "grad_norm": 0.1640625, + "learning_rate": 0.000459167577232542, + "loss": 0.5201, + "step": 85780 + }, + { + "epoch": 4.260951624118407, + "grad_norm": 0.1083984375, + "learning_rate": 0.0004591278434488924, + "loss": 0.4954, + "step": 85790 + }, + { + "epoch": 4.261448296414026, + "grad_norm": 0.111328125, + "learning_rate": 0.0004590881096652429, + "loss": 0.5304, + "step": 85800 + }, + { + "epoch": 4.261944968709646, + "grad_norm": 0.1416015625, + "learning_rate": 0.00045904837588159336, + "loss": 0.5535, + "step": 85810 + }, + { + "epoch": 4.262441641005265, + "grad_norm": 0.12255859375, + "learning_rate": 0.0004590086420979438, + "loss": 0.5143, + "step": 85820 + }, + { + "epoch": 4.262938313300884, + "grad_norm": 0.1015625, + "learning_rate": 0.00045896890831429425, + "loss": 0.5171, + "step": 85830 + }, + { + "epoch": 4.263434985596503, + "grad_norm": 0.10302734375, + "learning_rate": 0.0004589291745306447, + "loss": 0.5081, + "step": 85840 + }, + { + "epoch": 4.2639316578921225, + "grad_norm": 0.1064453125, + "learning_rate": 0.00045888944074699514, + "loss": 0.5469, + "step": 85850 + }, + { + "epoch": 4.264428330187743, + "grad_norm": 0.1328125, + "learning_rate": 0.0004588497069633456, + "loss": 0.5263, + "step": 85860 + }, + { + "epoch": 4.264925002483362, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004588099731796961, + "loss": 0.5449, + "step": 85870 + }, + { + "epoch": 4.265421674778981, + "grad_norm": 0.12158203125, + "learning_rate": 0.00045877023939604655, + "loss": 0.5172, + "step": 85880 + }, + { + "epoch": 4.2659183470746, + "grad_norm": 0.140625, + "learning_rate": 0.00045873050561239697, + "loss": 0.5357, + "step": 85890 + }, + { + "epoch": 4.2664150193702195, + "grad_norm": 0.1640625, + "learning_rate": 0.00045869077182874744, + "loss": 0.5344, + "step": 85900 + }, + { + "epoch": 4.266911691665839, + "grad_norm": 0.115234375, + "learning_rate": 0.0004586510380450979, + "loss": 0.5317, + "step": 85910 + }, + { + "epoch": 4.267408363961458, + "grad_norm": 0.126953125, + "learning_rate": 0.0004586113042614483, + "loss": 0.5079, + "step": 85920 + }, + { + "epoch": 4.267905036257077, + "grad_norm": 0.10791015625, + "learning_rate": 0.00045857157047779875, + "loss": 0.4963, + "step": 85930 + }, + { + "epoch": 4.268401708552697, + "grad_norm": 0.1416015625, + "learning_rate": 0.00045853183669414927, + "loss": 0.5111, + "step": 85940 + }, + { + "epoch": 4.2688983808483165, + "grad_norm": 0.1689453125, + "learning_rate": 0.00045849210291049963, + "loss": 0.5146, + "step": 85950 + }, + { + "epoch": 4.269395053143936, + "grad_norm": 0.1552734375, + "learning_rate": 0.0004584523691268501, + "loss": 0.5579, + "step": 85960 + }, + { + "epoch": 4.269891725439555, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004584126353432006, + "loss": 0.5281, + "step": 85970 + }, + { + "epoch": 4.270388397735174, + "grad_norm": 0.107421875, + "learning_rate": 0.000458372901559551, + "loss": 0.5497, + "step": 85980 + }, + { + "epoch": 4.270885070030793, + "grad_norm": 0.103515625, + "learning_rate": 0.00045833316777590147, + "loss": 0.4916, + "step": 85990 + }, + { + "epoch": 4.271381742326413, + "grad_norm": 0.1171875, + "learning_rate": 0.00045829343399225194, + "loss": 0.5393, + "step": 86000 + }, + { + "epoch": 4.271878414622033, + "grad_norm": 0.11181640625, + "learning_rate": 0.00045825370020860235, + "loss": 0.545, + "step": 86010 + }, + { + "epoch": 4.272375086917652, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004582139664249528, + "loss": 0.5428, + "step": 86020 + }, + { + "epoch": 4.272871759213271, + "grad_norm": 0.140625, + "learning_rate": 0.0004581742326413033, + "loss": 0.5207, + "step": 86030 + }, + { + "epoch": 4.2733684315088905, + "grad_norm": 0.10986328125, + "learning_rate": 0.00045813449885765377, + "loss": 0.5163, + "step": 86040 + }, + { + "epoch": 4.27386510380451, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004580947650740042, + "loss": 0.515, + "step": 86050 + }, + { + "epoch": 4.274361776100129, + "grad_norm": 0.1064453125, + "learning_rate": 0.00045805503129035466, + "loss": 0.549, + "step": 86060 + }, + { + "epoch": 4.274858448395748, + "grad_norm": 0.146484375, + "learning_rate": 0.00045801529750670513, + "loss": 0.535, + "step": 86070 + }, + { + "epoch": 4.275355120691368, + "grad_norm": 0.10595703125, + "learning_rate": 0.00045797556372305555, + "loss": 0.5278, + "step": 86080 + }, + { + "epoch": 4.2758517929869875, + "grad_norm": 0.12451171875, + "learning_rate": 0.000457935829939406, + "loss": 0.5455, + "step": 86090 + }, + { + "epoch": 4.276348465282607, + "grad_norm": 0.134765625, + "learning_rate": 0.0004578960961557565, + "loss": 0.5381, + "step": 86100 + }, + { + "epoch": 4.276845137578226, + "grad_norm": 0.11181640625, + "learning_rate": 0.00045785636237210685, + "loss": 0.526, + "step": 86110 + }, + { + "epoch": 4.277341809873845, + "grad_norm": 0.1416015625, + "learning_rate": 0.0004578166285884574, + "loss": 0.5111, + "step": 86120 + }, + { + "epoch": 4.277838482169464, + "grad_norm": 0.1064453125, + "learning_rate": 0.00045777689480480785, + "loss": 0.507, + "step": 86130 + }, + { + "epoch": 4.278335154465084, + "grad_norm": 0.1171875, + "learning_rate": 0.0004577371610211582, + "loss": 0.5182, + "step": 86140 + }, + { + "epoch": 4.278831826760703, + "grad_norm": 0.119140625, + "learning_rate": 0.0004576974272375087, + "loss": 0.5155, + "step": 86150 + }, + { + "epoch": 4.279328499056323, + "grad_norm": 0.12109375, + "learning_rate": 0.0004576576934538592, + "loss": 0.5027, + "step": 86160 + }, + { + "epoch": 4.279825171351942, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004576179596702097, + "loss": 0.5301, + "step": 86170 + }, + { + "epoch": 4.280321843647561, + "grad_norm": 0.1435546875, + "learning_rate": 0.00045757822588656004, + "loss": 0.5158, + "step": 86180 + }, + { + "epoch": 4.280818515943181, + "grad_norm": 0.1572265625, + "learning_rate": 0.0004575384921029105, + "loss": 0.5059, + "step": 86190 + }, + { + "epoch": 4.2813151882388, + "grad_norm": 0.1240234375, + "learning_rate": 0.000457498758319261, + "loss": 0.5142, + "step": 86200 + }, + { + "epoch": 4.281811860534419, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004574590245356114, + "loss": 0.5515, + "step": 86210 + }, + { + "epoch": 4.282308532830038, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004574192907519619, + "loss": 0.5063, + "step": 86220 + }, + { + "epoch": 4.282805205125658, + "grad_norm": 0.12109375, + "learning_rate": 0.00045737955696831234, + "loss": 0.4999, + "step": 86230 + }, + { + "epoch": 4.283301877421278, + "grad_norm": 0.12353515625, + "learning_rate": 0.00045733982318466276, + "loss": 0.5308, + "step": 86240 + }, + { + "epoch": 4.283798549716897, + "grad_norm": 0.10546875, + "learning_rate": 0.00045730008940101323, + "loss": 0.526, + "step": 86250 + }, + { + "epoch": 4.284295222012516, + "grad_norm": 0.12109375, + "learning_rate": 0.0004572603556173637, + "loss": 0.5213, + "step": 86260 + }, + { + "epoch": 4.284791894308135, + "grad_norm": 0.142578125, + "learning_rate": 0.0004572206218337141, + "loss": 0.539, + "step": 86270 + }, + { + "epoch": 4.285288566603755, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004571808880500646, + "loss": 0.5036, + "step": 86280 + }, + { + "epoch": 4.285785238899374, + "grad_norm": 0.1142578125, + "learning_rate": 0.00045714115426641506, + "loss": 0.5239, + "step": 86290 + }, + { + "epoch": 4.286281911194994, + "grad_norm": 0.10009765625, + "learning_rate": 0.00045710142048276543, + "loss": 0.5123, + "step": 86300 + }, + { + "epoch": 4.286778583490613, + "grad_norm": 0.11279296875, + "learning_rate": 0.00045706168669911595, + "loss": 0.5036, + "step": 86310 + }, + { + "epoch": 4.287275255786232, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004570219529154664, + "loss": 0.5138, + "step": 86320 + }, + { + "epoch": 4.287771928081852, + "grad_norm": 0.11328125, + "learning_rate": 0.0004569822191318169, + "loss": 0.4928, + "step": 86330 + }, + { + "epoch": 4.288268600377471, + "grad_norm": 0.10546875, + "learning_rate": 0.00045694248534816726, + "loss": 0.5393, + "step": 86340 + }, + { + "epoch": 4.28876527267309, + "grad_norm": 0.125, + "learning_rate": 0.0004569027515645178, + "loss": 0.5234, + "step": 86350 + }, + { + "epoch": 4.289261944968709, + "grad_norm": 0.115234375, + "learning_rate": 0.00045686301778086826, + "loss": 0.5304, + "step": 86360 + }, + { + "epoch": 4.289758617264329, + "grad_norm": 0.203125, + "learning_rate": 0.0004568232839972186, + "loss": 0.5225, + "step": 86370 + }, + { + "epoch": 4.290255289559949, + "grad_norm": 0.1083984375, + "learning_rate": 0.0004567835502135691, + "loss": 0.5119, + "step": 86380 + }, + { + "epoch": 4.290751961855568, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004567438164299196, + "loss": 0.5613, + "step": 86390 + }, + { + "epoch": 4.291248634151187, + "grad_norm": 0.138671875, + "learning_rate": 0.00045670408264627, + "loss": 0.538, + "step": 86400 + }, + { + "epoch": 4.291745306446806, + "grad_norm": 0.1259765625, + "learning_rate": 0.00045666434886262045, + "loss": 0.5313, + "step": 86410 + }, + { + "epoch": 4.2922419787424255, + "grad_norm": 0.11669921875, + "learning_rate": 0.0004566246150789709, + "loss": 0.5336, + "step": 86420 + }, + { + "epoch": 4.292738651038045, + "grad_norm": 0.1494140625, + "learning_rate": 0.00045658488129532134, + "loss": 0.4929, + "step": 86430 + }, + { + "epoch": 4.293235323333665, + "grad_norm": 0.134765625, + "learning_rate": 0.0004565451475116718, + "loss": 0.5121, + "step": 86440 + }, + { + "epoch": 4.293731995629284, + "grad_norm": 0.123046875, + "learning_rate": 0.0004565054137280223, + "loss": 0.54, + "step": 86450 + }, + { + "epoch": 4.294228667924903, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004564656799443727, + "loss": 0.5073, + "step": 86460 + }, + { + "epoch": 4.2947253402205225, + "grad_norm": 0.1982421875, + "learning_rate": 0.00045642594616072317, + "loss": 0.5126, + "step": 86470 + }, + { + "epoch": 4.295222012516142, + "grad_norm": 0.109375, + "learning_rate": 0.00045638621237707364, + "loss": 0.5324, + "step": 86480 + }, + { + "epoch": 4.295718684811761, + "grad_norm": 0.123046875, + "learning_rate": 0.0004563464785934241, + "loss": 0.52, + "step": 86490 + }, + { + "epoch": 4.29621535710738, + "grad_norm": 0.1171875, + "learning_rate": 0.00045630674480977453, + "loss": 0.5243, + "step": 86500 + }, + { + "epoch": 4.296712029403, + "grad_norm": 0.12353515625, + "learning_rate": 0.000456267011026125, + "loss": 0.513, + "step": 86510 + }, + { + "epoch": 4.29720870169862, + "grad_norm": 0.10986328125, + "learning_rate": 0.00045622727724247547, + "loss": 0.4888, + "step": 86520 + }, + { + "epoch": 4.297705373994239, + "grad_norm": 0.1611328125, + "learning_rate": 0.00045618754345882584, + "loss": 0.5306, + "step": 86530 + }, + { + "epoch": 4.298202046289858, + "grad_norm": 0.111328125, + "learning_rate": 0.00045614780967517636, + "loss": 0.5291, + "step": 86540 + }, + { + "epoch": 4.298698718585477, + "grad_norm": 0.1123046875, + "learning_rate": 0.00045610807589152683, + "loss": 0.5164, + "step": 86550 + }, + { + "epoch": 4.2991953908810965, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004560683421078772, + "loss": 0.5422, + "step": 86560 + }, + { + "epoch": 4.299692063176716, + "grad_norm": 0.1240234375, + "learning_rate": 0.00045602860832422767, + "loss": 0.5098, + "step": 86570 + }, + { + "epoch": 4.300188735472336, + "grad_norm": 0.166015625, + "learning_rate": 0.0004559888745405782, + "loss": 0.5123, + "step": 86580 + }, + { + "epoch": 4.300685407767955, + "grad_norm": 0.13671875, + "learning_rate": 0.00045594914075692855, + "loss": 0.4986, + "step": 86590 + }, + { + "epoch": 4.301182080063574, + "grad_norm": 0.1123046875, + "learning_rate": 0.000455909406973279, + "loss": 0.5292, + "step": 86600 + }, + { + "epoch": 4.3016787523591935, + "grad_norm": 0.10107421875, + "learning_rate": 0.0004558696731896295, + "loss": 0.5057, + "step": 86610 + }, + { + "epoch": 4.302175424654813, + "grad_norm": 0.1630859375, + "learning_rate": 0.00045582993940598, + "loss": 0.5015, + "step": 86620 + }, + { + "epoch": 4.302672096950432, + "grad_norm": 0.19921875, + "learning_rate": 0.0004557902056223304, + "loss": 0.5086, + "step": 86630 + }, + { + "epoch": 4.303168769246051, + "grad_norm": 0.11279296875, + "learning_rate": 0.00045575047183868086, + "loss": 0.5087, + "step": 86640 + }, + { + "epoch": 4.30366544154167, + "grad_norm": 0.11328125, + "learning_rate": 0.00045571073805503133, + "loss": 0.4946, + "step": 86650 + }, + { + "epoch": 4.3041621138372905, + "grad_norm": 0.1201171875, + "learning_rate": 0.00045567100427138175, + "loss": 0.5068, + "step": 86660 + }, + { + "epoch": 4.30465878613291, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004556312704877322, + "loss": 0.4953, + "step": 86670 + }, + { + "epoch": 4.305155458428529, + "grad_norm": 0.1240234375, + "learning_rate": 0.0004555915367040827, + "loss": 0.5288, + "step": 86680 + }, + { + "epoch": 4.305652130724148, + "grad_norm": 0.1396484375, + "learning_rate": 0.0004555518029204331, + "loss": 0.507, + "step": 86690 + }, + { + "epoch": 4.306148803019767, + "grad_norm": 0.115234375, + "learning_rate": 0.0004555120691367836, + "loss": 0.5239, + "step": 86700 + }, + { + "epoch": 4.306645475315387, + "grad_norm": 0.1279296875, + "learning_rate": 0.00045547233535313405, + "loss": 0.5222, + "step": 86710 + }, + { + "epoch": 4.307142147611006, + "grad_norm": 0.119140625, + "learning_rate": 0.00045543260156948447, + "loss": 0.5368, + "step": 86720 + }, + { + "epoch": 4.307638819906626, + "grad_norm": 0.10595703125, + "learning_rate": 0.00045539286778583494, + "loss": 0.5294, + "step": 86730 + }, + { + "epoch": 4.308135492202245, + "grad_norm": 0.115234375, + "learning_rate": 0.0004553531340021854, + "loss": 0.5032, + "step": 86740 + }, + { + "epoch": 4.3086321644978645, + "grad_norm": 0.140625, + "learning_rate": 0.00045531340021853577, + "loss": 0.5411, + "step": 86750 + }, + { + "epoch": 4.309128836793484, + "grad_norm": 0.275390625, + "learning_rate": 0.0004552736664348863, + "loss": 0.5447, + "step": 86760 + }, + { + "epoch": 4.309625509089103, + "grad_norm": 0.10595703125, + "learning_rate": 0.00045523393265123677, + "loss": 0.5329, + "step": 86770 + }, + { + "epoch": 4.310122181384722, + "grad_norm": 0.1513671875, + "learning_rate": 0.00045519419886758724, + "loss": 0.4954, + "step": 86780 + }, + { + "epoch": 4.310618853680341, + "grad_norm": 0.103515625, + "learning_rate": 0.0004551544650839376, + "loss": 0.504, + "step": 86790 + }, + { + "epoch": 4.311115525975961, + "grad_norm": 0.1015625, + "learning_rate": 0.0004551147313002881, + "loss": 0.5425, + "step": 86800 + }, + { + "epoch": 4.311612198271581, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004550749975166386, + "loss": 0.5113, + "step": 86810 + }, + { + "epoch": 4.3121088705672, + "grad_norm": 0.1064453125, + "learning_rate": 0.00045503526373298896, + "loss": 0.5203, + "step": 86820 + }, + { + "epoch": 4.312605542862819, + "grad_norm": 0.1162109375, + "learning_rate": 0.00045499552994933943, + "loss": 0.5085, + "step": 86830 + }, + { + "epoch": 4.313102215158438, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004549557961656899, + "loss": 0.5188, + "step": 86840 + }, + { + "epoch": 4.313598887454058, + "grad_norm": 0.1396484375, + "learning_rate": 0.0004549160623820403, + "loss": 0.5094, + "step": 86850 + }, + { + "epoch": 4.314095559749677, + "grad_norm": 0.10546875, + "learning_rate": 0.0004548763285983908, + "loss": 0.5057, + "step": 86860 + }, + { + "epoch": 4.314592232045296, + "grad_norm": 0.11279296875, + "learning_rate": 0.00045483659481474126, + "loss": 0.4869, + "step": 86870 + }, + { + "epoch": 4.315088904340916, + "grad_norm": 0.16015625, + "learning_rate": 0.0004547968610310917, + "loss": 0.5005, + "step": 86880 + }, + { + "epoch": 4.315585576636535, + "grad_norm": 0.1328125, + "learning_rate": 0.00045475712724744215, + "loss": 0.5301, + "step": 86890 + }, + { + "epoch": 4.316082248932155, + "grad_norm": 0.138671875, + "learning_rate": 0.0004547173934637926, + "loss": 0.53, + "step": 86900 + }, + { + "epoch": 4.316578921227774, + "grad_norm": 0.181640625, + "learning_rate": 0.00045467765968014304, + "loss": 0.5241, + "step": 86910 + }, + { + "epoch": 4.317075593523393, + "grad_norm": 0.11865234375, + "learning_rate": 0.0004546379258964935, + "loss": 0.5442, + "step": 86920 + }, + { + "epoch": 4.317572265819012, + "grad_norm": 0.1044921875, + "learning_rate": 0.000454598192112844, + "loss": 0.4976, + "step": 86930 + }, + { + "epoch": 4.3180689381146315, + "grad_norm": 0.1044921875, + "learning_rate": 0.00045455845832919446, + "loss": 0.5249, + "step": 86940 + }, + { + "epoch": 4.318565610410252, + "grad_norm": 0.1044921875, + "learning_rate": 0.0004545187245455449, + "loss": 0.5007, + "step": 86950 + }, + { + "epoch": 4.319062282705871, + "grad_norm": 0.1201171875, + "learning_rate": 0.00045447899076189534, + "loss": 0.528, + "step": 86960 + }, + { + "epoch": 4.31955895500149, + "grad_norm": 0.20703125, + "learning_rate": 0.0004544392569782458, + "loss": 0.5246, + "step": 86970 + }, + { + "epoch": 4.320055627297109, + "grad_norm": 0.10546875, + "learning_rate": 0.0004543995231945962, + "loss": 0.505, + "step": 86980 + }, + { + "epoch": 4.320552299592729, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004543597894109467, + "loss": 0.4948, + "step": 86990 + }, + { + "epoch": 4.321048971888348, + "grad_norm": 0.11328125, + "learning_rate": 0.0004543200556272972, + "loss": 0.5177, + "step": 87000 + }, + { + "epoch": 4.321545644183967, + "grad_norm": 0.1904296875, + "learning_rate": 0.00045428032184364754, + "loss": 0.5274, + "step": 87010 + }, + { + "epoch": 4.322042316479587, + "grad_norm": 0.1171875, + "learning_rate": 0.000454240588059998, + "loss": 0.5335, + "step": 87020 + }, + { + "epoch": 4.322538988775206, + "grad_norm": 0.1328125, + "learning_rate": 0.00045420085427634854, + "loss": 0.5448, + "step": 87030 + }, + { + "epoch": 4.323035661070826, + "grad_norm": 0.103515625, + "learning_rate": 0.0004541611204926989, + "loss": 0.5349, + "step": 87040 + }, + { + "epoch": 4.323532333366445, + "grad_norm": 0.11865234375, + "learning_rate": 0.00045412138670904937, + "loss": 0.4996, + "step": 87050 + }, + { + "epoch": 4.324029005662064, + "grad_norm": 0.1103515625, + "learning_rate": 0.00045408165292539984, + "loss": 0.516, + "step": 87060 + }, + { + "epoch": 4.324525677957683, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004540419191417503, + "loss": 0.5177, + "step": 87070 + }, + { + "epoch": 4.3250223502533025, + "grad_norm": 0.1083984375, + "learning_rate": 0.00045400218535810073, + "loss": 0.5055, + "step": 87080 + }, + { + "epoch": 4.325519022548923, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004539624515744512, + "loss": 0.5124, + "step": 87090 + }, + { + "epoch": 4.326015694844542, + "grad_norm": 0.10498046875, + "learning_rate": 0.00045392271779080167, + "loss": 0.5161, + "step": 87100 + }, + { + "epoch": 4.326512367140161, + "grad_norm": 0.0966796875, + "learning_rate": 0.0004538829840071521, + "loss": 0.5411, + "step": 87110 + }, + { + "epoch": 4.32700903943578, + "grad_norm": 0.12109375, + "learning_rate": 0.00045384325022350256, + "loss": 0.5097, + "step": 87120 + }, + { + "epoch": 4.3275057117313995, + "grad_norm": 0.1689453125, + "learning_rate": 0.00045380351643985303, + "loss": 0.5181, + "step": 87130 + }, + { + "epoch": 4.328002384027019, + "grad_norm": 0.1025390625, + "learning_rate": 0.00045376378265620345, + "loss": 0.5099, + "step": 87140 + }, + { + "epoch": 4.328499056322638, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004537240488725539, + "loss": 0.5018, + "step": 87150 + }, + { + "epoch": 4.328995728618258, + "grad_norm": 0.1171875, + "learning_rate": 0.0004536843150889044, + "loss": 0.5034, + "step": 87160 + }, + { + "epoch": 4.329492400913877, + "grad_norm": 0.10888671875, + "learning_rate": 0.00045364458130525475, + "loss": 0.5121, + "step": 87170 + }, + { + "epoch": 4.3299890732094966, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004536048475216053, + "loss": 0.5178, + "step": 87180 + }, + { + "epoch": 4.330485745505116, + "grad_norm": 0.11181640625, + "learning_rate": 0.00045356511373795575, + "loss": 0.5198, + "step": 87190 + }, + { + "epoch": 4.330982417800735, + "grad_norm": 0.0966796875, + "learning_rate": 0.0004535253799543061, + "loss": 0.5295, + "step": 87200 + }, + { + "epoch": 4.331479090096354, + "grad_norm": 0.140625, + "learning_rate": 0.0004534856461706566, + "loss": 0.5136, + "step": 87210 + }, + { + "epoch": 4.3319757623919735, + "grad_norm": 0.12255859375, + "learning_rate": 0.0004534459123870071, + "loss": 0.5361, + "step": 87220 + }, + { + "epoch": 4.332472434687594, + "grad_norm": 0.126953125, + "learning_rate": 0.0004534061786033576, + "loss": 0.5476, + "step": 87230 + }, + { + "epoch": 4.332969106983213, + "grad_norm": 0.1396484375, + "learning_rate": 0.00045336644481970795, + "loss": 0.511, + "step": 87240 + }, + { + "epoch": 4.333465779278832, + "grad_norm": 0.19140625, + "learning_rate": 0.0004533267110360584, + "loss": 0.5044, + "step": 87250 + }, + { + "epoch": 4.333962451574451, + "grad_norm": 0.11962890625, + "learning_rate": 0.00045328697725240894, + "loss": 0.5091, + "step": 87260 + }, + { + "epoch": 4.3344591238700705, + "grad_norm": 0.1865234375, + "learning_rate": 0.0004532472434687593, + "loss": 0.5155, + "step": 87270 + }, + { + "epoch": 4.33495579616569, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004532075096851098, + "loss": 0.523, + "step": 87280 + }, + { + "epoch": 4.335452468461309, + "grad_norm": 0.1201171875, + "learning_rate": 0.00045316777590146025, + "loss": 0.5077, + "step": 87290 + }, + { + "epoch": 4.335949140756928, + "grad_norm": 0.1005859375, + "learning_rate": 0.00045312804211781067, + "loss": 0.5133, + "step": 87300 + }, + { + "epoch": 4.336445813052548, + "grad_norm": 0.09228515625, + "learning_rate": 0.00045308830833416114, + "loss": 0.504, + "step": 87310 + }, + { + "epoch": 4.3369424853481675, + "grad_norm": 0.126953125, + "learning_rate": 0.0004530485745505116, + "loss": 0.5102, + "step": 87320 + }, + { + "epoch": 4.337439157643787, + "grad_norm": 0.11181640625, + "learning_rate": 0.000453008840766862, + "loss": 0.528, + "step": 87330 + }, + { + "epoch": 4.337935829939406, + "grad_norm": 0.169921875, + "learning_rate": 0.0004529691069832125, + "loss": 0.5072, + "step": 87340 + }, + { + "epoch": 4.338432502235025, + "grad_norm": 0.12158203125, + "learning_rate": 0.00045292937319956297, + "loss": 0.5125, + "step": 87350 + }, + { + "epoch": 4.338929174530644, + "grad_norm": 0.12890625, + "learning_rate": 0.00045288963941591333, + "loss": 0.517, + "step": 87360 + }, + { + "epoch": 4.339425846826264, + "grad_norm": 0.1083984375, + "learning_rate": 0.00045284990563226386, + "loss": 0.5224, + "step": 87370 + }, + { + "epoch": 4.339922519121884, + "grad_norm": 0.10546875, + "learning_rate": 0.00045281017184861433, + "loss": 0.5199, + "step": 87380 + }, + { + "epoch": 4.340419191417503, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004527704380649648, + "loss": 0.5326, + "step": 87390 + }, + { + "epoch": 4.340915863713122, + "grad_norm": 0.1220703125, + "learning_rate": 0.00045273070428131516, + "loss": 0.5324, + "step": 87400 + }, + { + "epoch": 4.341412536008741, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004526909704976657, + "loss": 0.5096, + "step": 87410 + }, + { + "epoch": 4.341909208304361, + "grad_norm": 0.103515625, + "learning_rate": 0.00045265123671401616, + "loss": 0.526, + "step": 87420 + }, + { + "epoch": 4.34240588059998, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004526115029303665, + "loss": 0.5329, + "step": 87430 + }, + { + "epoch": 4.342902552895599, + "grad_norm": 0.10546875, + "learning_rate": 0.000452571769146717, + "loss": 0.5089, + "step": 87440 + }, + { + "epoch": 4.343399225191219, + "grad_norm": 0.10546875, + "learning_rate": 0.0004525320353630675, + "loss": 0.4873, + "step": 87450 + }, + { + "epoch": 4.3438958974868385, + "grad_norm": 0.12109375, + "learning_rate": 0.0004524923015794179, + "loss": 0.5222, + "step": 87460 + }, + { + "epoch": 4.344392569782458, + "grad_norm": 0.1162109375, + "learning_rate": 0.00045245256779576835, + "loss": 0.5066, + "step": 87470 + }, + { + "epoch": 4.344889242078077, + "grad_norm": 0.166015625, + "learning_rate": 0.0004524128340121188, + "loss": 0.5389, + "step": 87480 + }, + { + "epoch": 4.345385914373696, + "grad_norm": 0.1162109375, + "learning_rate": 0.00045237310022846924, + "loss": 0.5425, + "step": 87490 + }, + { + "epoch": 4.345882586669315, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004523333664448197, + "loss": 0.5212, + "step": 87500 + }, + { + "epoch": 4.346379258964935, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004522936326611702, + "loss": 0.5493, + "step": 87510 + }, + { + "epoch": 4.346875931260554, + "grad_norm": 0.10595703125, + "learning_rate": 0.00045225389887752066, + "loss": 0.5508, + "step": 87520 + }, + { + "epoch": 4.347372603556174, + "grad_norm": 0.1396484375, + "learning_rate": 0.0004522141650938711, + "loss": 0.5234, + "step": 87530 + }, + { + "epoch": 4.347869275851793, + "grad_norm": 0.1484375, + "learning_rate": 0.00045217443131022154, + "loss": 0.5188, + "step": 87540 + }, + { + "epoch": 4.348365948147412, + "grad_norm": 0.1640625, + "learning_rate": 0.000452134697526572, + "loss": 0.5179, + "step": 87550 + }, + { + "epoch": 4.348862620443032, + "grad_norm": 0.142578125, + "learning_rate": 0.00045209496374292243, + "loss": 0.4952, + "step": 87560 + }, + { + "epoch": 4.349359292738651, + "grad_norm": 0.11865234375, + "learning_rate": 0.0004520552299592729, + "loss": 0.5231, + "step": 87570 + }, + { + "epoch": 4.34985596503427, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004520154961756234, + "loss": 0.5128, + "step": 87580 + }, + { + "epoch": 4.350352637329889, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004519757623919738, + "loss": 0.4745, + "step": 87590 + }, + { + "epoch": 4.350849309625509, + "grad_norm": 0.10205078125, + "learning_rate": 0.00045193602860832426, + "loss": 0.5124, + "step": 87600 + }, + { + "epoch": 4.351345981921129, + "grad_norm": 0.11279296875, + "learning_rate": 0.00045189629482467474, + "loss": 0.514, + "step": 87610 + }, + { + "epoch": 4.351842654216748, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004518565610410251, + "loss": 0.5281, + "step": 87620 + }, + { + "epoch": 4.352339326512367, + "grad_norm": 0.19140625, + "learning_rate": 0.00045181682725737557, + "loss": 0.5177, + "step": 87630 + }, + { + "epoch": 4.352835998807986, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004517770934737261, + "loss": 0.5217, + "step": 87640 + }, + { + "epoch": 4.3533326711036056, + "grad_norm": 0.11376953125, + "learning_rate": 0.00045173735969007646, + "loss": 0.536, + "step": 87650 + }, + { + "epoch": 4.353829343399225, + "grad_norm": 0.1328125, + "learning_rate": 0.00045169762590642693, + "loss": 0.5201, + "step": 87660 + }, + { + "epoch": 4.354326015694845, + "grad_norm": 0.158203125, + "learning_rate": 0.0004516578921227774, + "loss": 0.531, + "step": 87670 + }, + { + "epoch": 4.354822687990464, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004516181583391279, + "loss": 0.5379, + "step": 87680 + }, + { + "epoch": 4.355319360286083, + "grad_norm": 0.09228515625, + "learning_rate": 0.0004515784245554783, + "loss": 0.5195, + "step": 87690 + }, + { + "epoch": 4.355816032581703, + "grad_norm": 0.11572265625, + "learning_rate": 0.00045153869077182876, + "loss": 0.533, + "step": 87700 + }, + { + "epoch": 4.356312704877322, + "grad_norm": 0.1162109375, + "learning_rate": 0.00045149895698817923, + "loss": 0.5403, + "step": 87710 + }, + { + "epoch": 4.356809377172941, + "grad_norm": 0.1123046875, + "learning_rate": 0.00045145922320452965, + "loss": 0.5178, + "step": 87720 + }, + { + "epoch": 4.35730604946856, + "grad_norm": 0.107421875, + "learning_rate": 0.0004514194894208801, + "loss": 0.525, + "step": 87730 + }, + { + "epoch": 4.35780272176418, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004513797556372306, + "loss": 0.5164, + "step": 87740 + }, + { + "epoch": 4.3582993940598, + "grad_norm": 0.17578125, + "learning_rate": 0.000451340021853581, + "loss": 0.5465, + "step": 87750 + }, + { + "epoch": 4.358796066355419, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004513002880699315, + "loss": 0.5248, + "step": 87760 + }, + { + "epoch": 4.359292738651038, + "grad_norm": 0.1416015625, + "learning_rate": 0.00045126055428628195, + "loss": 0.5562, + "step": 87770 + }, + { + "epoch": 4.359789410946657, + "grad_norm": 0.130859375, + "learning_rate": 0.00045122082050263237, + "loss": 0.5066, + "step": 87780 + }, + { + "epoch": 4.3602860832422765, + "grad_norm": 0.0927734375, + "learning_rate": 0.00045118108671898284, + "loss": 0.4942, + "step": 87790 + }, + { + "epoch": 4.360782755537896, + "grad_norm": 0.1513671875, + "learning_rate": 0.0004511413529353333, + "loss": 0.5248, + "step": 87800 + }, + { + "epoch": 4.361279427833516, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004511016191516837, + "loss": 0.5215, + "step": 87810 + }, + { + "epoch": 4.361776100129135, + "grad_norm": 0.12451171875, + "learning_rate": 0.0004510618853680342, + "loss": 0.5075, + "step": 87820 + }, + { + "epoch": 4.362272772424754, + "grad_norm": 0.1064453125, + "learning_rate": 0.00045102215158438467, + "loss": 0.5233, + "step": 87830 + }, + { + "epoch": 4.3627694447203735, + "grad_norm": 0.11865234375, + "learning_rate": 0.00045098241780073514, + "loss": 0.5262, + "step": 87840 + }, + { + "epoch": 4.363266117015993, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004509426840170855, + "loss": 0.5257, + "step": 87850 + }, + { + "epoch": 4.363762789311612, + "grad_norm": 0.166015625, + "learning_rate": 0.00045090295023343603, + "loss": 0.5001, + "step": 87860 + }, + { + "epoch": 4.364259461607231, + "grad_norm": 0.134765625, + "learning_rate": 0.0004508632164497865, + "loss": 0.531, + "step": 87870 + }, + { + "epoch": 4.364756133902851, + "grad_norm": 0.1298828125, + "learning_rate": 0.00045082348266613687, + "loss": 0.5427, + "step": 87880 + }, + { + "epoch": 4.365252806198471, + "grad_norm": 0.1044921875, + "learning_rate": 0.00045078374888248734, + "loss": 0.5221, + "step": 87890 + }, + { + "epoch": 4.36574947849409, + "grad_norm": 0.13671875, + "learning_rate": 0.0004507440150988378, + "loss": 0.5203, + "step": 87900 + }, + { + "epoch": 4.366246150789709, + "grad_norm": 0.15234375, + "learning_rate": 0.0004507042813151882, + "loss": 0.5099, + "step": 87910 + }, + { + "epoch": 4.366742823085328, + "grad_norm": 0.1552734375, + "learning_rate": 0.0004506645475315387, + "loss": 0.5242, + "step": 87920 + }, + { + "epoch": 4.3672394953809475, + "grad_norm": 0.09765625, + "learning_rate": 0.00045062481374788917, + "loss": 0.5017, + "step": 87930 + }, + { + "epoch": 4.367736167676567, + "grad_norm": 0.11865234375, + "learning_rate": 0.0004505850799642396, + "loss": 0.4999, + "step": 87940 + }, + { + "epoch": 4.368232839972187, + "grad_norm": 0.1689453125, + "learning_rate": 0.00045054534618059006, + "loss": 0.5277, + "step": 87950 + }, + { + "epoch": 4.368729512267806, + "grad_norm": 0.1318359375, + "learning_rate": 0.00045050561239694053, + "loss": 0.5373, + "step": 87960 + }, + { + "epoch": 4.369226184563425, + "grad_norm": 0.1865234375, + "learning_rate": 0.000450465878613291, + "loss": 0.5431, + "step": 87970 + }, + { + "epoch": 4.3697228568590445, + "grad_norm": 0.125, + "learning_rate": 0.0004504261448296414, + "loss": 0.5276, + "step": 87980 + }, + { + "epoch": 4.370219529154664, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004503864110459919, + "loss": 0.5384, + "step": 87990 + }, + { + "epoch": 4.370716201450283, + "grad_norm": 0.126953125, + "learning_rate": 0.00045034667726234236, + "loss": 0.5363, + "step": 88000 + }, + { + "epoch": 4.371212873745902, + "grad_norm": 0.130859375, + "learning_rate": 0.0004503069434786928, + "loss": 0.5175, + "step": 88010 + }, + { + "epoch": 4.371709546041521, + "grad_norm": 0.10595703125, + "learning_rate": 0.00045026720969504325, + "loss": 0.5321, + "step": 88020 + }, + { + "epoch": 4.3722062183371415, + "grad_norm": 0.1533203125, + "learning_rate": 0.0004502274759113937, + "loss": 0.5007, + "step": 88030 + }, + { + "epoch": 4.372702890632761, + "grad_norm": 0.1328125, + "learning_rate": 0.0004501877421277441, + "loss": 0.5155, + "step": 88040 + }, + { + "epoch": 4.37319956292838, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004501480083440946, + "loss": 0.5093, + "step": 88050 + }, + { + "epoch": 4.373696235223999, + "grad_norm": 0.134765625, + "learning_rate": 0.0004501082745604451, + "loss": 0.5215, + "step": 88060 + }, + { + "epoch": 4.374192907519618, + "grad_norm": 0.09423828125, + "learning_rate": 0.00045006854077679544, + "loss": 0.4951, + "step": 88070 + }, + { + "epoch": 4.374689579815238, + "grad_norm": 0.10888671875, + "learning_rate": 0.0004500288069931459, + "loss": 0.5061, + "step": 88080 + }, + { + "epoch": 4.375186252110857, + "grad_norm": 0.130859375, + "learning_rate": 0.00044998907320949644, + "loss": 0.5246, + "step": 88090 + }, + { + "epoch": 4.375682924406477, + "grad_norm": 0.10205078125, + "learning_rate": 0.0004499493394258468, + "loss": 0.5417, + "step": 88100 + }, + { + "epoch": 4.376179596702096, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004499096056421973, + "loss": 0.552, + "step": 88110 + }, + { + "epoch": 4.3766762689977154, + "grad_norm": 0.11767578125, + "learning_rate": 0.00044986987185854774, + "loss": 0.5474, + "step": 88120 + }, + { + "epoch": 4.377172941293335, + "grad_norm": 0.1201171875, + "learning_rate": 0.00044983013807489827, + "loss": 0.5339, + "step": 88130 + }, + { + "epoch": 4.377669613588954, + "grad_norm": 0.1337890625, + "learning_rate": 0.00044979040429124863, + "loss": 0.5036, + "step": 88140 + }, + { + "epoch": 4.378166285884573, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004497506705075991, + "loss": 0.5204, + "step": 88150 + }, + { + "epoch": 4.378662958180192, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004497109367239496, + "loss": 0.5123, + "step": 88160 + }, + { + "epoch": 4.3791596304758125, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004496712029403, + "loss": 0.5498, + "step": 88170 + }, + { + "epoch": 4.379656302771432, + "grad_norm": 0.123046875, + "learning_rate": 0.00044963146915665046, + "loss": 0.5111, + "step": 88180 + }, + { + "epoch": 4.380152975067051, + "grad_norm": 0.142578125, + "learning_rate": 0.00044959173537300094, + "loss": 0.5254, + "step": 88190 + }, + { + "epoch": 4.38064964736267, + "grad_norm": 0.11328125, + "learning_rate": 0.00044955200158935135, + "loss": 0.5242, + "step": 88200 + }, + { + "epoch": 4.381146319658289, + "grad_norm": 0.17578125, + "learning_rate": 0.0004495122678057018, + "loss": 0.5093, + "step": 88210 + }, + { + "epoch": 4.381642991953909, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004494725340220523, + "loss": 0.5439, + "step": 88220 + }, + { + "epoch": 4.382139664249528, + "grad_norm": 0.1396484375, + "learning_rate": 0.00044943280023840266, + "loss": 0.5357, + "step": 88230 + }, + { + "epoch": 4.382636336545147, + "grad_norm": 0.091796875, + "learning_rate": 0.0004493930664547532, + "loss": 0.5, + "step": 88240 + }, + { + "epoch": 4.383133008840767, + "grad_norm": 0.1123046875, + "learning_rate": 0.00044935333267110366, + "loss": 0.5414, + "step": 88250 + }, + { + "epoch": 4.383629681136386, + "grad_norm": 0.2080078125, + "learning_rate": 0.0004493135988874541, + "loss": 0.5203, + "step": 88260 + }, + { + "epoch": 4.384126353432006, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004492738651038045, + "loss": 0.5286, + "step": 88270 + }, + { + "epoch": 4.384623025727625, + "grad_norm": 0.12451171875, + "learning_rate": 0.000449234131320155, + "loss": 0.5242, + "step": 88280 + }, + { + "epoch": 4.385119698023244, + "grad_norm": 0.1435546875, + "learning_rate": 0.0004491943975365055, + "loss": 0.5384, + "step": 88290 + }, + { + "epoch": 4.385616370318863, + "grad_norm": 0.123046875, + "learning_rate": 0.00044915466375285585, + "loss": 0.5252, + "step": 88300 + }, + { + "epoch": 4.3861130426144825, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004491149299692063, + "loss": 0.5113, + "step": 88310 + }, + { + "epoch": 4.386609714910103, + "grad_norm": 0.09619140625, + "learning_rate": 0.00044907519618555685, + "loss": 0.5088, + "step": 88320 + }, + { + "epoch": 4.387106387205722, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004490354624019072, + "loss": 0.5234, + "step": 88330 + }, + { + "epoch": 4.387603059501341, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004489957286182577, + "loss": 0.488, + "step": 88340 + }, + { + "epoch": 4.38809973179696, + "grad_norm": 0.1279296875, + "learning_rate": 0.00044895599483460815, + "loss": 0.5555, + "step": 88350 + }, + { + "epoch": 4.38859640409258, + "grad_norm": 0.10888671875, + "learning_rate": 0.00044891626105095857, + "loss": 0.5122, + "step": 88360 + }, + { + "epoch": 4.389093076388199, + "grad_norm": 0.1435546875, + "learning_rate": 0.00044887652726730904, + "loss": 0.5252, + "step": 88370 + }, + { + "epoch": 4.389589748683818, + "grad_norm": 0.111328125, + "learning_rate": 0.0004488367934836595, + "loss": 0.538, + "step": 88380 + }, + { + "epoch": 4.390086420979438, + "grad_norm": 0.123046875, + "learning_rate": 0.00044879705970000993, + "loss": 0.4978, + "step": 88390 + }, + { + "epoch": 4.390583093275057, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004487573259163604, + "loss": 0.5182, + "step": 88400 + }, + { + "epoch": 4.391079765570677, + "grad_norm": 0.12451171875, + "learning_rate": 0.00044871759213271087, + "loss": 0.5699, + "step": 88410 + }, + { + "epoch": 4.391576437866296, + "grad_norm": 0.1279296875, + "learning_rate": 0.00044867785834906134, + "loss": 0.5379, + "step": 88420 + }, + { + "epoch": 4.392073110161915, + "grad_norm": 0.11572265625, + "learning_rate": 0.00044863812456541176, + "loss": 0.5178, + "step": 88430 + }, + { + "epoch": 4.392569782457534, + "grad_norm": 0.140625, + "learning_rate": 0.00044859839078176223, + "loss": 0.5005, + "step": 88440 + }, + { + "epoch": 4.3930664547531535, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004485586569981127, + "loss": 0.516, + "step": 88450 + }, + { + "epoch": 4.393563127048774, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004485189232144631, + "loss": 0.538, + "step": 88460 + }, + { + "epoch": 4.394059799344393, + "grad_norm": 0.115234375, + "learning_rate": 0.0004484791894308136, + "loss": 0.5167, + "step": 88470 + }, + { + "epoch": 4.394556471640012, + "grad_norm": 0.11181640625, + "learning_rate": 0.00044843945564716406, + "loss": 0.5185, + "step": 88480 + }, + { + "epoch": 4.395053143935631, + "grad_norm": 0.09912109375, + "learning_rate": 0.0004483997218635144, + "loss": 0.508, + "step": 88490 + }, + { + "epoch": 4.3955498162312505, + "grad_norm": 0.10546875, + "learning_rate": 0.0004483599880798649, + "loss": 0.5249, + "step": 88500 + }, + { + "epoch": 4.39604648852687, + "grad_norm": 0.134765625, + "learning_rate": 0.0004483202542962154, + "loss": 0.4992, + "step": 88510 + }, + { + "epoch": 4.396543160822489, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004482805205125658, + "loss": 0.5404, + "step": 88520 + }, + { + "epoch": 4.397039833118109, + "grad_norm": 0.10546875, + "learning_rate": 0.00044824078672891626, + "loss": 0.5283, + "step": 88530 + }, + { + "epoch": 4.397536505413728, + "grad_norm": 0.1181640625, + "learning_rate": 0.00044820105294526673, + "loss": 0.4998, + "step": 88540 + }, + { + "epoch": 4.3980331777093475, + "grad_norm": 0.11669921875, + "learning_rate": 0.00044816131916161715, + "loss": 0.5101, + "step": 88550 + }, + { + "epoch": 4.398529850004967, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004481215853779676, + "loss": 0.5001, + "step": 88560 + }, + { + "epoch": 4.399026522300586, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004480818515943181, + "loss": 0.5308, + "step": 88570 + }, + { + "epoch": 4.399523194596205, + "grad_norm": 0.109375, + "learning_rate": 0.00044804211781066856, + "loss": 0.5087, + "step": 88580 + }, + { + "epoch": 4.4000198668918244, + "grad_norm": 0.0986328125, + "learning_rate": 0.000448002384027019, + "loss": 0.5077, + "step": 88590 + }, + { + "epoch": 4.400516539187445, + "grad_norm": 0.126953125, + "learning_rate": 0.00044796265024336945, + "loss": 0.5311, + "step": 88600 + }, + { + "epoch": 4.401013211483064, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004479229164597199, + "loss": 0.5168, + "step": 88610 + }, + { + "epoch": 4.401509883778683, + "grad_norm": 0.1162109375, + "learning_rate": 0.00044788318267607034, + "loss": 0.5317, + "step": 88620 + }, + { + "epoch": 4.402006556074302, + "grad_norm": 0.142578125, + "learning_rate": 0.0004478434488924208, + "loss": 0.5134, + "step": 88630 + }, + { + "epoch": 4.4025032283699215, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004478037151087713, + "loss": 0.5415, + "step": 88640 + }, + { + "epoch": 4.402999900665541, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004477639813251217, + "loss": 0.4927, + "step": 88650 + }, + { + "epoch": 4.40349657296116, + "grad_norm": 0.1474609375, + "learning_rate": 0.00044772424754147217, + "loss": 0.5059, + "step": 88660 + }, + { + "epoch": 4.40399324525678, + "grad_norm": 0.1669921875, + "learning_rate": 0.00044768451375782264, + "loss": 0.5184, + "step": 88670 + }, + { + "epoch": 4.404489917552399, + "grad_norm": 0.10498046875, + "learning_rate": 0.000447644779974173, + "loss": 0.5118, + "step": 88680 + }, + { + "epoch": 4.4049865898480185, + "grad_norm": 0.1630859375, + "learning_rate": 0.00044760504619052353, + "loss": 0.5293, + "step": 88690 + }, + { + "epoch": 4.405483262143638, + "grad_norm": 0.11083984375, + "learning_rate": 0.000447565312406874, + "loss": 0.5356, + "step": 88700 + }, + { + "epoch": 4.405979934439257, + "grad_norm": 0.1201171875, + "learning_rate": 0.00044752557862322447, + "loss": 0.5143, + "step": 88710 + }, + { + "epoch": 4.406476606734876, + "grad_norm": 0.1201171875, + "learning_rate": 0.00044748584483957483, + "loss": 0.5378, + "step": 88720 + }, + { + "epoch": 4.406973279030495, + "grad_norm": 0.138671875, + "learning_rate": 0.00044744611105592536, + "loss": 0.5285, + "step": 88730 + }, + { + "epoch": 4.407469951326115, + "grad_norm": 0.11572265625, + "learning_rate": 0.00044740637727227583, + "loss": 0.5536, + "step": 88740 + }, + { + "epoch": 4.407966623621735, + "grad_norm": 0.09912109375, + "learning_rate": 0.0004473666434886262, + "loss": 0.5143, + "step": 88750 + }, + { + "epoch": 4.408463295917354, + "grad_norm": 0.126953125, + "learning_rate": 0.00044732690970497666, + "loss": 0.4971, + "step": 88760 + }, + { + "epoch": 4.408959968212973, + "grad_norm": 0.10546875, + "learning_rate": 0.00044728717592132714, + "loss": 0.5375, + "step": 88770 + }, + { + "epoch": 4.409456640508592, + "grad_norm": 0.1181640625, + "learning_rate": 0.00044724744213767755, + "loss": 0.5175, + "step": 88780 + }, + { + "epoch": 4.409953312804212, + "grad_norm": 0.1337890625, + "learning_rate": 0.000447207708354028, + "loss": 0.5282, + "step": 88790 + }, + { + "epoch": 4.410449985099831, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004471679745703785, + "loss": 0.5248, + "step": 88800 + }, + { + "epoch": 4.41094665739545, + "grad_norm": 0.1044921875, + "learning_rate": 0.0004471282407867289, + "loss": 0.5178, + "step": 88810 + }, + { + "epoch": 4.41144332969107, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004470885070030794, + "loss": 0.4972, + "step": 88820 + }, + { + "epoch": 4.4119400019866895, + "grad_norm": 0.0966796875, + "learning_rate": 0.00044704877321942986, + "loss": 0.5102, + "step": 88830 + }, + { + "epoch": 4.412436674282309, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004470090394357803, + "loss": 0.5329, + "step": 88840 + }, + { + "epoch": 4.412933346577928, + "grad_norm": 0.11181640625, + "learning_rate": 0.00044696930565213074, + "loss": 0.5102, + "step": 88850 + }, + { + "epoch": 4.413430018873547, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004469295718684812, + "loss": 0.4967, + "step": 88860 + }, + { + "epoch": 4.413926691169166, + "grad_norm": 0.140625, + "learning_rate": 0.0004468898380848317, + "loss": 0.504, + "step": 88870 + }, + { + "epoch": 4.414423363464786, + "grad_norm": 0.11328125, + "learning_rate": 0.0004468501043011821, + "loss": 0.5204, + "step": 88880 + }, + { + "epoch": 4.414920035760405, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004468103705175326, + "loss": 0.4997, + "step": 88890 + }, + { + "epoch": 4.415416708056025, + "grad_norm": 0.11376953125, + "learning_rate": 0.00044677063673388305, + "loss": 0.5183, + "step": 88900 + }, + { + "epoch": 4.415913380351644, + "grad_norm": 0.130859375, + "learning_rate": 0.0004467309029502334, + "loss": 0.5336, + "step": 88910 + }, + { + "epoch": 4.416410052647263, + "grad_norm": 0.1640625, + "learning_rate": 0.00044669116916658394, + "loss": 0.4998, + "step": 88920 + }, + { + "epoch": 4.416906724942883, + "grad_norm": 0.1318359375, + "learning_rate": 0.0004466514353829344, + "loss": 0.4927, + "step": 88930 + }, + { + "epoch": 4.417403397238502, + "grad_norm": 0.1279296875, + "learning_rate": 0.00044661170159928477, + "loss": 0.5184, + "step": 88940 + }, + { + "epoch": 4.417900069534121, + "grad_norm": 0.10595703125, + "learning_rate": 0.00044657196781563524, + "loss": 0.5013, + "step": 88950 + }, + { + "epoch": 4.41839674182974, + "grad_norm": 0.099609375, + "learning_rate": 0.00044653223403198577, + "loss": 0.4943, + "step": 88960 + }, + { + "epoch": 4.41889341412536, + "grad_norm": 0.12060546875, + "learning_rate": 0.00044649250024833613, + "loss": 0.5071, + "step": 88970 + }, + { + "epoch": 4.41939008642098, + "grad_norm": 0.107421875, + "learning_rate": 0.0004464527664646866, + "loss": 0.5252, + "step": 88980 + }, + { + "epoch": 4.419886758716599, + "grad_norm": 0.11083984375, + "learning_rate": 0.00044641303268103707, + "loss": 0.5049, + "step": 88990 + }, + { + "epoch": 4.420383431012218, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004463732988973875, + "loss": 0.507, + "step": 89000 + }, + { + "epoch": 4.420880103307837, + "grad_norm": 0.10791015625, + "learning_rate": 0.00044633356511373796, + "loss": 0.4965, + "step": 89010 + }, + { + "epoch": 4.4213767756034565, + "grad_norm": 0.12158203125, + "learning_rate": 0.00044629383133008843, + "loss": 0.5358, + "step": 89020 + }, + { + "epoch": 4.421873447899076, + "grad_norm": 0.12109375, + "learning_rate": 0.0004462540975464389, + "loss": 0.5142, + "step": 89030 + }, + { + "epoch": 4.422370120194696, + "grad_norm": 0.111328125, + "learning_rate": 0.0004462143637627893, + "loss": 0.5253, + "step": 89040 + }, + { + "epoch": 4.422866792490315, + "grad_norm": 0.10302734375, + "learning_rate": 0.0004461746299791398, + "loss": 0.512, + "step": 89050 + }, + { + "epoch": 4.423363464785934, + "grad_norm": 0.107421875, + "learning_rate": 0.00044613489619549026, + "loss": 0.5219, + "step": 89060 + }, + { + "epoch": 4.423860137081554, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004460951624118407, + "loss": 0.5303, + "step": 89070 + }, + { + "epoch": 4.424356809377173, + "grad_norm": 0.11767578125, + "learning_rate": 0.00044605542862819115, + "loss": 0.5194, + "step": 89080 + }, + { + "epoch": 4.424853481672792, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004460156948445416, + "loss": 0.5159, + "step": 89090 + }, + { + "epoch": 4.425350153968411, + "grad_norm": 0.130859375, + "learning_rate": 0.000445975961060892, + "loss": 0.5096, + "step": 89100 + }, + { + "epoch": 4.425846826264031, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004459362272772425, + "loss": 0.5184, + "step": 89110 + }, + { + "epoch": 4.426343498559651, + "grad_norm": 0.1162109375, + "learning_rate": 0.000445896493493593, + "loss": 0.541, + "step": 89120 + }, + { + "epoch": 4.42684017085527, + "grad_norm": 0.12353515625, + "learning_rate": 0.00044585675970994335, + "loss": 0.5351, + "step": 89130 + }, + { + "epoch": 4.427336843150889, + "grad_norm": 0.11865234375, + "learning_rate": 0.0004458170259262938, + "loss": 0.4929, + "step": 89140 + }, + { + "epoch": 4.427833515446508, + "grad_norm": 0.1318359375, + "learning_rate": 0.00044577729214264434, + "loss": 0.4825, + "step": 89150 + }, + { + "epoch": 4.4283301877421275, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004457375583589948, + "loss": 0.5049, + "step": 89160 + }, + { + "epoch": 4.428826860037747, + "grad_norm": 0.107421875, + "learning_rate": 0.0004456978245753452, + "loss": 0.5024, + "step": 89170 + }, + { + "epoch": 4.429323532333367, + "grad_norm": 0.1220703125, + "learning_rate": 0.00044565809079169565, + "loss": 0.5139, + "step": 89180 + }, + { + "epoch": 4.429820204628986, + "grad_norm": 0.107421875, + "learning_rate": 0.0004456183570080462, + "loss": 0.516, + "step": 89190 + }, + { + "epoch": 4.430316876924605, + "grad_norm": 0.13671875, + "learning_rate": 0.00044557862322439654, + "loss": 0.5034, + "step": 89200 + }, + { + "epoch": 4.4308135492202245, + "grad_norm": 0.17578125, + "learning_rate": 0.000445538889440747, + "loss": 0.5133, + "step": 89210 + }, + { + "epoch": 4.431310221515844, + "grad_norm": 0.130859375, + "learning_rate": 0.0004454991556570975, + "loss": 0.4981, + "step": 89220 + }, + { + "epoch": 4.431806893811463, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004454594218734479, + "loss": 0.5328, + "step": 89230 + }, + { + "epoch": 4.432303566107082, + "grad_norm": 0.1416015625, + "learning_rate": 0.00044541968808979837, + "loss": 0.5556, + "step": 89240 + }, + { + "epoch": 4.432800238402702, + "grad_norm": 0.134765625, + "learning_rate": 0.00044537995430614884, + "loss": 0.5132, + "step": 89250 + }, + { + "epoch": 4.4332969106983215, + "grad_norm": 0.1416015625, + "learning_rate": 0.00044534022052249926, + "loss": 0.519, + "step": 89260 + }, + { + "epoch": 4.433793582993941, + "grad_norm": 0.11376953125, + "learning_rate": 0.00044530048673884973, + "loss": 0.5153, + "step": 89270 + }, + { + "epoch": 4.43429025528956, + "grad_norm": 0.1044921875, + "learning_rate": 0.0004452607529552002, + "loss": 0.5048, + "step": 89280 + }, + { + "epoch": 4.434786927585179, + "grad_norm": 0.171875, + "learning_rate": 0.0004452210191715506, + "loss": 0.5422, + "step": 89290 + }, + { + "epoch": 4.4352835998807985, + "grad_norm": 0.10302734375, + "learning_rate": 0.0004451812853879011, + "loss": 0.5017, + "step": 89300 + }, + { + "epoch": 4.435780272176418, + "grad_norm": 0.140625, + "learning_rate": 0.00044514155160425156, + "loss": 0.516, + "step": 89310 + }, + { + "epoch": 4.436276944472038, + "grad_norm": 0.1025390625, + "learning_rate": 0.00044510181782060203, + "loss": 0.5387, + "step": 89320 + }, + { + "epoch": 4.436773616767657, + "grad_norm": 0.10546875, + "learning_rate": 0.0004450620840369524, + "loss": 0.5209, + "step": 89330 + }, + { + "epoch": 4.437270289063276, + "grad_norm": 0.11669921875, + "learning_rate": 0.0004450223502533029, + "loss": 0.5306, + "step": 89340 + }, + { + "epoch": 4.4377669613588955, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004449826164696534, + "loss": 0.4843, + "step": 89350 + }, + { + "epoch": 4.438263633654515, + "grad_norm": 0.11669921875, + "learning_rate": 0.00044494288268600375, + "loss": 0.5431, + "step": 89360 + }, + { + "epoch": 4.438760305950134, + "grad_norm": 0.11474609375, + "learning_rate": 0.0004449031489023542, + "loss": 0.538, + "step": 89370 + }, + { + "epoch": 4.439256978245753, + "grad_norm": 0.12451171875, + "learning_rate": 0.00044486341511870475, + "loss": 0.5177, + "step": 89380 + }, + { + "epoch": 4.439753650541372, + "grad_norm": 0.10302734375, + "learning_rate": 0.0004448236813350551, + "loss": 0.5403, + "step": 89390 + }, + { + "epoch": 4.4402503228369925, + "grad_norm": 0.12451171875, + "learning_rate": 0.0004447839475514056, + "loss": 0.5181, + "step": 89400 + }, + { + "epoch": 4.440746995132612, + "grad_norm": 0.10888671875, + "learning_rate": 0.00044474421376775606, + "loss": 0.5358, + "step": 89410 + }, + { + "epoch": 4.441243667428231, + "grad_norm": 0.13671875, + "learning_rate": 0.0004447044799841065, + "loss": 0.5469, + "step": 89420 + }, + { + "epoch": 4.44174033972385, + "grad_norm": 0.1259765625, + "learning_rate": 0.00044466474620045694, + "loss": 0.4976, + "step": 89430 + }, + { + "epoch": 4.442237012019469, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004446250124168074, + "loss": 0.5204, + "step": 89440 + }, + { + "epoch": 4.442733684315089, + "grad_norm": 0.11279296875, + "learning_rate": 0.00044458527863315783, + "loss": 0.4834, + "step": 89450 + }, + { + "epoch": 4.443230356610708, + "grad_norm": 0.1171875, + "learning_rate": 0.0004445455448495083, + "loss": 0.5473, + "step": 89460 + }, + { + "epoch": 4.443727028906328, + "grad_norm": 0.1328125, + "learning_rate": 0.0004445058110658588, + "loss": 0.522, + "step": 89470 + }, + { + "epoch": 4.444223701201947, + "grad_norm": 0.10205078125, + "learning_rate": 0.00044446607728220925, + "loss": 0.517, + "step": 89480 + }, + { + "epoch": 4.444720373497566, + "grad_norm": 0.10693359375, + "learning_rate": 0.00044442634349855966, + "loss": 0.5171, + "step": 89490 + }, + { + "epoch": 4.445217045793186, + "grad_norm": 0.130859375, + "learning_rate": 0.00044438660971491014, + "loss": 0.5147, + "step": 89500 + }, + { + "epoch": 4.445713718088805, + "grad_norm": 0.12109375, + "learning_rate": 0.0004443468759312606, + "loss": 0.5131, + "step": 89510 + }, + { + "epoch": 4.446210390384424, + "grad_norm": 0.1259765625, + "learning_rate": 0.000444307142147611, + "loss": 0.5493, + "step": 89520 + }, + { + "epoch": 4.446707062680043, + "grad_norm": 0.134765625, + "learning_rate": 0.0004442674083639615, + "loss": 0.5042, + "step": 89530 + }, + { + "epoch": 4.4472037349756635, + "grad_norm": 0.11083984375, + "learning_rate": 0.00044422767458031197, + "loss": 0.5174, + "step": 89540 + }, + { + "epoch": 4.447700407271283, + "grad_norm": 0.11669921875, + "learning_rate": 0.00044418794079666233, + "loss": 0.5088, + "step": 89550 + }, + { + "epoch": 4.448197079566902, + "grad_norm": 0.10498046875, + "learning_rate": 0.00044414820701301286, + "loss": 0.5215, + "step": 89560 + }, + { + "epoch": 4.448693751862521, + "grad_norm": 0.1748046875, + "learning_rate": 0.0004441084732293633, + "loss": 0.5181, + "step": 89570 + }, + { + "epoch": 4.44919042415814, + "grad_norm": 0.10888671875, + "learning_rate": 0.0004440687394457137, + "loss": 0.5321, + "step": 89580 + }, + { + "epoch": 4.44968709645376, + "grad_norm": 0.1376953125, + "learning_rate": 0.00044402900566206416, + "loss": 0.5109, + "step": 89590 + }, + { + "epoch": 4.450183768749379, + "grad_norm": 0.1025390625, + "learning_rate": 0.00044398927187841463, + "loss": 0.5235, + "step": 89600 + }, + { + "epoch": 4.450680441044998, + "grad_norm": 0.10498046875, + "learning_rate": 0.00044394953809476516, + "loss": 0.5096, + "step": 89610 + }, + { + "epoch": 4.451177113340618, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004439098043111155, + "loss": 0.5184, + "step": 89620 + }, + { + "epoch": 4.451673785636237, + "grad_norm": 0.142578125, + "learning_rate": 0.000443870070527466, + "loss": 0.5522, + "step": 89630 + }, + { + "epoch": 4.452170457931857, + "grad_norm": 0.1201171875, + "learning_rate": 0.00044383033674381646, + "loss": 0.5298, + "step": 89640 + }, + { + "epoch": 4.452667130227476, + "grad_norm": 0.11669921875, + "learning_rate": 0.0004437906029601669, + "loss": 0.5082, + "step": 89650 + }, + { + "epoch": 4.453163802523095, + "grad_norm": 0.1103515625, + "learning_rate": 0.00044375086917651735, + "loss": 0.5256, + "step": 89660 + }, + { + "epoch": 4.453660474818714, + "grad_norm": 0.099609375, + "learning_rate": 0.0004437111353928678, + "loss": 0.4979, + "step": 89670 + }, + { + "epoch": 4.4541571471143335, + "grad_norm": 0.1142578125, + "learning_rate": 0.00044367140160921824, + "loss": 0.5271, + "step": 89680 + }, + { + "epoch": 4.454653819409954, + "grad_norm": 0.154296875, + "learning_rate": 0.0004436316678255687, + "loss": 0.5414, + "step": 89690 + }, + { + "epoch": 4.455150491705573, + "grad_norm": 0.134765625, + "learning_rate": 0.0004435919340419192, + "loss": 0.5237, + "step": 89700 + }, + { + "epoch": 4.455647164001192, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004435522002582696, + "loss": 0.4943, + "step": 89710 + }, + { + "epoch": 4.456143836296811, + "grad_norm": 0.11376953125, + "learning_rate": 0.00044351246647462007, + "loss": 0.5289, + "step": 89720 + }, + { + "epoch": 4.4566405085924305, + "grad_norm": 0.125, + "learning_rate": 0.00044347273269097054, + "loss": 0.5164, + "step": 89730 + }, + { + "epoch": 4.45713718088805, + "grad_norm": 0.09521484375, + "learning_rate": 0.0004434329989073209, + "loss": 0.4989, + "step": 89740 + }, + { + "epoch": 4.457633853183669, + "grad_norm": 0.1328125, + "learning_rate": 0.00044339326512367143, + "loss": 0.5271, + "step": 89750 + }, + { + "epoch": 4.458130525479289, + "grad_norm": 0.1533203125, + "learning_rate": 0.0004433535313400219, + "loss": 0.504, + "step": 89760 + }, + { + "epoch": 4.458627197774908, + "grad_norm": 0.12060546875, + "learning_rate": 0.0004433137975563724, + "loss": 0.5256, + "step": 89770 + }, + { + "epoch": 4.459123870070528, + "grad_norm": 0.142578125, + "learning_rate": 0.00044327406377272274, + "loss": 0.5199, + "step": 89780 + }, + { + "epoch": 4.459620542366147, + "grad_norm": 0.11767578125, + "learning_rate": 0.00044323432998907326, + "loss": 0.5165, + "step": 89790 + }, + { + "epoch": 4.460117214661766, + "grad_norm": 0.12451171875, + "learning_rate": 0.00044319459620542373, + "loss": 0.4881, + "step": 89800 + }, + { + "epoch": 4.460613886957385, + "grad_norm": 0.111328125, + "learning_rate": 0.0004431548624217741, + "loss": 0.5022, + "step": 89810 + }, + { + "epoch": 4.4611105592530045, + "grad_norm": 0.1103515625, + "learning_rate": 0.00044311512863812457, + "loss": 0.5424, + "step": 89820 + }, + { + "epoch": 4.461607231548625, + "grad_norm": 0.109375, + "learning_rate": 0.0004430753948544751, + "loss": 0.4923, + "step": 89830 + }, + { + "epoch": 4.462103903844244, + "grad_norm": 0.10791015625, + "learning_rate": 0.00044303566107082546, + "loss": 0.4948, + "step": 89840 + }, + { + "epoch": 4.462600576139863, + "grad_norm": 0.103515625, + "learning_rate": 0.00044299592728717593, + "loss": 0.5069, + "step": 89850 + }, + { + "epoch": 4.463097248435482, + "grad_norm": 0.1533203125, + "learning_rate": 0.0004429561935035264, + "loss": 0.5038, + "step": 89860 + }, + { + "epoch": 4.4635939207311015, + "grad_norm": 0.11328125, + "learning_rate": 0.0004429164597198768, + "loss": 0.5163, + "step": 89870 + }, + { + "epoch": 4.464090593026721, + "grad_norm": 0.15234375, + "learning_rate": 0.0004428767259362273, + "loss": 0.526, + "step": 89880 + }, + { + "epoch": 4.46458726532234, + "grad_norm": 0.11767578125, + "learning_rate": 0.00044283699215257776, + "loss": 0.5336, + "step": 89890 + }, + { + "epoch": 4.46508393761796, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004427972583689282, + "loss": 0.5185, + "step": 89900 + }, + { + "epoch": 4.465580609913579, + "grad_norm": 0.1474609375, + "learning_rate": 0.00044275752458527865, + "loss": 0.494, + "step": 89910 + }, + { + "epoch": 4.4660772822091985, + "grad_norm": 0.11669921875, + "learning_rate": 0.0004427177908016291, + "loss": 0.5058, + "step": 89920 + }, + { + "epoch": 4.466573954504818, + "grad_norm": 0.1171875, + "learning_rate": 0.0004426780570179796, + "loss": 0.5049, + "step": 89930 + }, + { + "epoch": 4.467070626800437, + "grad_norm": 0.1103515625, + "learning_rate": 0.00044263832323433, + "loss": 0.5247, + "step": 89940 + }, + { + "epoch": 4.467567299096056, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004425985894506805, + "loss": 0.5206, + "step": 89950 + }, + { + "epoch": 4.468063971391675, + "grad_norm": 0.1416015625, + "learning_rate": 0.00044255885566703095, + "loss": 0.5468, + "step": 89960 + }, + { + "epoch": 4.4685606436872956, + "grad_norm": 0.123046875, + "learning_rate": 0.0004425191218833813, + "loss": 0.5069, + "step": 89970 + }, + { + "epoch": 4.469057315982915, + "grad_norm": 0.10498046875, + "learning_rate": 0.00044247938809973184, + "loss": 0.5266, + "step": 89980 + }, + { + "epoch": 4.469553988278534, + "grad_norm": 0.10888671875, + "learning_rate": 0.0004424396543160823, + "loss": 0.4976, + "step": 89990 + }, + { + "epoch": 4.470050660574153, + "grad_norm": 0.119140625, + "learning_rate": 0.0004423999205324327, + "loss": 0.5109, + "step": 90000 + }, + { + "epoch": 4.4705473328697725, + "grad_norm": 0.1005859375, + "learning_rate": 0.00044236018674878314, + "loss": 0.5179, + "step": 90010 + }, + { + "epoch": 4.471044005165392, + "grad_norm": 0.10107421875, + "learning_rate": 0.00044232045296513367, + "loss": 0.5047, + "step": 90020 + }, + { + "epoch": 4.471540677461011, + "grad_norm": 0.1513671875, + "learning_rate": 0.00044228071918148403, + "loss": 0.528, + "step": 90030 + }, + { + "epoch": 4.472037349756631, + "grad_norm": 0.1474609375, + "learning_rate": 0.0004422409853978345, + "loss": 0.5362, + "step": 90040 + }, + { + "epoch": 4.47253402205225, + "grad_norm": 0.1015625, + "learning_rate": 0.000442201251614185, + "loss": 0.5376, + "step": 90050 + }, + { + "epoch": 4.4730306943478695, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004421615178305355, + "loss": 0.532, + "step": 90060 + }, + { + "epoch": 4.473527366643489, + "grad_norm": 0.09814453125, + "learning_rate": 0.00044212178404688586, + "loss": 0.5163, + "step": 90070 + }, + { + "epoch": 4.474024038939108, + "grad_norm": 0.130859375, + "learning_rate": 0.00044208205026323634, + "loss": 0.5136, + "step": 90080 + }, + { + "epoch": 4.474520711234727, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004420423164795868, + "loss": 0.5068, + "step": 90090 + }, + { + "epoch": 4.475017383530346, + "grad_norm": 0.1416015625, + "learning_rate": 0.0004420025826959372, + "loss": 0.5269, + "step": 90100 + }, + { + "epoch": 4.475514055825966, + "grad_norm": 0.10546875, + "learning_rate": 0.0004419628489122877, + "loss": 0.5006, + "step": 90110 + }, + { + "epoch": 4.476010728121586, + "grad_norm": 0.158203125, + "learning_rate": 0.00044192311512863817, + "loss": 0.4995, + "step": 90120 + }, + { + "epoch": 4.476507400417205, + "grad_norm": 0.134765625, + "learning_rate": 0.0004418833813449886, + "loss": 0.5093, + "step": 90130 + }, + { + "epoch": 4.477004072712824, + "grad_norm": 0.1015625, + "learning_rate": 0.00044184364756133906, + "loss": 0.5181, + "step": 90140 + }, + { + "epoch": 4.477500745008443, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004418039137776895, + "loss": 0.5198, + "step": 90150 + }, + { + "epoch": 4.477997417304063, + "grad_norm": 0.1220703125, + "learning_rate": 0.00044176417999403994, + "loss": 0.5444, + "step": 90160 + }, + { + "epoch": 4.478494089599682, + "grad_norm": 0.126953125, + "learning_rate": 0.0004417244462103904, + "loss": 0.5314, + "step": 90170 + }, + { + "epoch": 4.478990761895301, + "grad_norm": 0.142578125, + "learning_rate": 0.0004416847124267409, + "loss": 0.4931, + "step": 90180 + }, + { + "epoch": 4.479487434190921, + "grad_norm": 0.1181640625, + "learning_rate": 0.00044164497864309125, + "loss": 0.536, + "step": 90190 + }, + { + "epoch": 4.47998410648654, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004416052448594417, + "loss": 0.4823, + "step": 90200 + }, + { + "epoch": 4.48048077878216, + "grad_norm": 0.109375, + "learning_rate": 0.00044156551107579225, + "loss": 0.5047, + "step": 90210 + }, + { + "epoch": 4.480977451077779, + "grad_norm": 0.11474609375, + "learning_rate": 0.0004415257772921427, + "loss": 0.531, + "step": 90220 + }, + { + "epoch": 4.481474123373398, + "grad_norm": 0.11474609375, + "learning_rate": 0.0004414860435084931, + "loss": 0.5304, + "step": 90230 + }, + { + "epoch": 4.481970795669017, + "grad_norm": 0.12109375, + "learning_rate": 0.00044144630972484355, + "loss": 0.4967, + "step": 90240 + }, + { + "epoch": 4.482467467964637, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004414065759411941, + "loss": 0.531, + "step": 90250 + }, + { + "epoch": 4.482964140260257, + "grad_norm": 0.1396484375, + "learning_rate": 0.00044136684215754444, + "loss": 0.5198, + "step": 90260 + }, + { + "epoch": 4.483460812555876, + "grad_norm": 0.1455078125, + "learning_rate": 0.0004413271083738949, + "loss": 0.5248, + "step": 90270 + }, + { + "epoch": 4.483957484851495, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004412873745902454, + "loss": 0.5253, + "step": 90280 + }, + { + "epoch": 4.484454157147114, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004412476408065958, + "loss": 0.5224, + "step": 90290 + }, + { + "epoch": 4.484950829442734, + "grad_norm": 0.1123046875, + "learning_rate": 0.00044120790702294627, + "loss": 0.4993, + "step": 90300 + }, + { + "epoch": 4.485447501738353, + "grad_norm": 0.10791015625, + "learning_rate": 0.00044116817323929674, + "loss": 0.543, + "step": 90310 + }, + { + "epoch": 4.485944174033972, + "grad_norm": 0.1201171875, + "learning_rate": 0.00044112843945564716, + "loss": 0.5356, + "step": 90320 + }, + { + "epoch": 4.486440846329591, + "grad_norm": 0.1552734375, + "learning_rate": 0.00044108870567199763, + "loss": 0.5123, + "step": 90330 + }, + { + "epoch": 4.486937518625211, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004410489718883481, + "loss": 0.5289, + "step": 90340 + }, + { + "epoch": 4.487434190920831, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004410092381046985, + "loss": 0.5183, + "step": 90350 + }, + { + "epoch": 4.48793086321645, + "grad_norm": 0.12890625, + "learning_rate": 0.000440969504321049, + "loss": 0.5404, + "step": 90360 + }, + { + "epoch": 4.488427535512069, + "grad_norm": 0.125, + "learning_rate": 0.00044092977053739946, + "loss": 0.5353, + "step": 90370 + }, + { + "epoch": 4.488924207807688, + "grad_norm": 0.123046875, + "learning_rate": 0.00044089003675374993, + "loss": 0.5108, + "step": 90380 + }, + { + "epoch": 4.4894208801033075, + "grad_norm": 0.1064453125, + "learning_rate": 0.00044085030297010035, + "loss": 0.5424, + "step": 90390 + }, + { + "epoch": 4.489917552398927, + "grad_norm": 0.15625, + "learning_rate": 0.0004408105691864508, + "loss": 0.5061, + "step": 90400 + }, + { + "epoch": 4.490414224694547, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004407708354028013, + "loss": 0.5274, + "step": 90410 + }, + { + "epoch": 4.490910896990166, + "grad_norm": 0.1435546875, + "learning_rate": 0.00044073110161915166, + "loss": 0.5364, + "step": 90420 + }, + { + "epoch": 4.491407569285785, + "grad_norm": 0.09814453125, + "learning_rate": 0.0004406913678355022, + "loss": 0.5274, + "step": 90430 + }, + { + "epoch": 4.4919042415814046, + "grad_norm": 0.150390625, + "learning_rate": 0.00044065163405185265, + "loss": 0.5184, + "step": 90440 + }, + { + "epoch": 4.492400913877024, + "grad_norm": 0.107421875, + "learning_rate": 0.000440611900268203, + "loss": 0.5215, + "step": 90450 + }, + { + "epoch": 4.492897586172643, + "grad_norm": 0.1083984375, + "learning_rate": 0.0004405721664845535, + "loss": 0.528, + "step": 90460 + }, + { + "epoch": 4.493394258468262, + "grad_norm": 0.1044921875, + "learning_rate": 0.00044053243270090396, + "loss": 0.5049, + "step": 90470 + }, + { + "epoch": 4.493890930763882, + "grad_norm": 0.10107421875, + "learning_rate": 0.0004404926989172544, + "loss": 0.5208, + "step": 90480 + }, + { + "epoch": 4.494387603059502, + "grad_norm": 0.103515625, + "learning_rate": 0.00044045296513360485, + "loss": 0.5114, + "step": 90490 + }, + { + "epoch": 4.494884275355121, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004404132313499553, + "loss": 0.5216, + "step": 90500 + }, + { + "epoch": 4.49538094765074, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004403734975663058, + "loss": 0.5159, + "step": 90510 + }, + { + "epoch": 4.495877619946359, + "grad_norm": 0.12353515625, + "learning_rate": 0.0004403337637826562, + "loss": 0.5308, + "step": 90520 + }, + { + "epoch": 4.4963742922419785, + "grad_norm": 0.1318359375, + "learning_rate": 0.0004402940299990067, + "loss": 0.5129, + "step": 90530 + }, + { + "epoch": 4.496870964537598, + "grad_norm": 0.1171875, + "learning_rate": 0.00044025429621535715, + "loss": 0.5012, + "step": 90540 + }, + { + "epoch": 4.497367636833218, + "grad_norm": 0.16015625, + "learning_rate": 0.00044021456243170757, + "loss": 0.5065, + "step": 90550 + }, + { + "epoch": 4.497864309128837, + "grad_norm": 0.10498046875, + "learning_rate": 0.00044017482864805804, + "loss": 0.5324, + "step": 90560 + }, + { + "epoch": 4.498360981424456, + "grad_norm": 0.1240234375, + "learning_rate": 0.0004401350948644085, + "loss": 0.5212, + "step": 90570 + }, + { + "epoch": 4.4988576537200755, + "grad_norm": 0.11181640625, + "learning_rate": 0.00044009536108075893, + "loss": 0.4977, + "step": 90580 + }, + { + "epoch": 4.499354326015695, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004400556272971094, + "loss": 0.5069, + "step": 90590 + }, + { + "epoch": 4.499850998311314, + "grad_norm": 0.1181640625, + "learning_rate": 0.00044001589351345987, + "loss": 0.5481, + "step": 90600 + }, + { + "epoch": 4.500347670606933, + "grad_norm": 0.11376953125, + "learning_rate": 0.00043997615972981023, + "loss": 0.5016, + "step": 90610 + }, + { + "epoch": 4.500844342902553, + "grad_norm": 0.130859375, + "learning_rate": 0.00043993642594616076, + "loss": 0.5354, + "step": 90620 + }, + { + "epoch": 4.5013410151981725, + "grad_norm": 0.12109375, + "learning_rate": 0.00043989669216251123, + "loss": 0.5329, + "step": 90630 + }, + { + "epoch": 4.501837687493792, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004398569583788616, + "loss": 0.523, + "step": 90640 + }, + { + "epoch": 4.502334359789411, + "grad_norm": 0.130859375, + "learning_rate": 0.00043981722459521206, + "loss": 0.5129, + "step": 90650 + }, + { + "epoch": 4.50283103208503, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004397774908115626, + "loss": 0.4894, + "step": 90660 + }, + { + "epoch": 4.503327704380649, + "grad_norm": 0.1123046875, + "learning_rate": 0.00043973775702791306, + "loss": 0.5067, + "step": 90670 + }, + { + "epoch": 4.503824376676269, + "grad_norm": 0.10888671875, + "learning_rate": 0.0004396980232442634, + "loss": 0.5217, + "step": 90680 + }, + { + "epoch": 4.504321048971889, + "grad_norm": 0.119140625, + "learning_rate": 0.0004396582894606139, + "loss": 0.519, + "step": 90690 + }, + { + "epoch": 4.504817721267508, + "grad_norm": 0.1240234375, + "learning_rate": 0.00043961855567696437, + "loss": 0.4938, + "step": 90700 + }, + { + "epoch": 4.505314393563127, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004395788218933148, + "loss": 0.5179, + "step": 90710 + }, + { + "epoch": 4.5058110658587465, + "grad_norm": 0.1357421875, + "learning_rate": 0.00043953908810966526, + "loss": 0.4878, + "step": 90720 + }, + { + "epoch": 4.506307738154366, + "grad_norm": 0.14453125, + "learning_rate": 0.0004394993543260157, + "loss": 0.5139, + "step": 90730 + }, + { + "epoch": 4.506804410449985, + "grad_norm": 0.12158203125, + "learning_rate": 0.00043945962054236614, + "loss": 0.5121, + "step": 90740 + }, + { + "epoch": 4.507301082745604, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004394198867587166, + "loss": 0.4962, + "step": 90750 + }, + { + "epoch": 4.507797755041224, + "grad_norm": 0.12060546875, + "learning_rate": 0.0004393801529750671, + "loss": 0.4949, + "step": 90760 + }, + { + "epoch": 4.5082944273368435, + "grad_norm": 0.1484375, + "learning_rate": 0.0004393404191914175, + "loss": 0.5008, + "step": 90770 + }, + { + "epoch": 4.508791099632463, + "grad_norm": 0.1005859375, + "learning_rate": 0.000439300685407768, + "loss": 0.5548, + "step": 90780 + }, + { + "epoch": 4.509287771928082, + "grad_norm": 0.11376953125, + "learning_rate": 0.00043926095162411845, + "loss": 0.5179, + "step": 90790 + }, + { + "epoch": 4.509784444223701, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004392212178404689, + "loss": 0.518, + "step": 90800 + }, + { + "epoch": 4.51028111651932, + "grad_norm": 0.1103515625, + "learning_rate": 0.00043918148405681934, + "loss": 0.53, + "step": 90810 + }, + { + "epoch": 4.51077778881494, + "grad_norm": 0.12060546875, + "learning_rate": 0.0004391417502731698, + "loss": 0.4934, + "step": 90820 + }, + { + "epoch": 4.51127446111056, + "grad_norm": 0.1318359375, + "learning_rate": 0.0004391020164895203, + "loss": 0.5173, + "step": 90830 + }, + { + "epoch": 4.511771133406179, + "grad_norm": 0.1064453125, + "learning_rate": 0.00043906228270587064, + "loss": 0.5262, + "step": 90840 + }, + { + "epoch": 4.512267805701798, + "grad_norm": 0.09814453125, + "learning_rate": 0.00043902254892222117, + "loss": 0.4987, + "step": 90850 + }, + { + "epoch": 4.512764477997417, + "grad_norm": 0.1123046875, + "learning_rate": 0.00043898281513857164, + "loss": 0.5014, + "step": 90860 + }, + { + "epoch": 4.513261150293037, + "grad_norm": 0.115234375, + "learning_rate": 0.000438943081354922, + "loss": 0.5175, + "step": 90870 + }, + { + "epoch": 4.513757822588656, + "grad_norm": 0.146484375, + "learning_rate": 0.00043890334757127247, + "loss": 0.5199, + "step": 90880 + }, + { + "epoch": 4.514254494884275, + "grad_norm": 0.12060546875, + "learning_rate": 0.000438863613787623, + "loss": 0.5094, + "step": 90890 + }, + { + "epoch": 4.514751167179894, + "grad_norm": 0.12890625, + "learning_rate": 0.00043882388000397336, + "loss": 0.4907, + "step": 90900 + }, + { + "epoch": 4.515247839475514, + "grad_norm": 0.1064453125, + "learning_rate": 0.00043878414622032383, + "loss": 0.5088, + "step": 90910 + }, + { + "epoch": 4.515744511771134, + "grad_norm": 0.10888671875, + "learning_rate": 0.0004387444124366743, + "loss": 0.506, + "step": 90920 + }, + { + "epoch": 4.516241184066753, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004387046786530247, + "loss": 0.504, + "step": 90930 + }, + { + "epoch": 4.516737856362372, + "grad_norm": 0.1484375, + "learning_rate": 0.0004386649448693752, + "loss": 0.5338, + "step": 90940 + }, + { + "epoch": 4.517234528657991, + "grad_norm": 0.103515625, + "learning_rate": 0.00043862521108572566, + "loss": 0.5308, + "step": 90950 + }, + { + "epoch": 4.517731200953611, + "grad_norm": 0.125, + "learning_rate": 0.00043858547730207613, + "loss": 0.521, + "step": 90960 + }, + { + "epoch": 4.51822787324923, + "grad_norm": 0.16015625, + "learning_rate": 0.00043854574351842655, + "loss": 0.5086, + "step": 90970 + }, + { + "epoch": 4.518724545544849, + "grad_norm": 0.142578125, + "learning_rate": 0.000438506009734777, + "loss": 0.5313, + "step": 90980 + }, + { + "epoch": 4.519221217840469, + "grad_norm": 0.109375, + "learning_rate": 0.0004384662759511275, + "loss": 0.5356, + "step": 90990 + }, + { + "epoch": 4.519717890136088, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004384265421674779, + "loss": 0.5374, + "step": 91000 + }, + { + "epoch": 4.520214562431708, + "grad_norm": 0.103515625, + "learning_rate": 0.0004383868083838284, + "loss": 0.5043, + "step": 91010 + }, + { + "epoch": 4.520711234727327, + "grad_norm": 0.11962890625, + "learning_rate": 0.00043834707460017885, + "loss": 0.5101, + "step": 91020 + }, + { + "epoch": 4.521207907022946, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004383073408165292, + "loss": 0.5411, + "step": 91030 + }, + { + "epoch": 4.521704579318565, + "grad_norm": 0.1357421875, + "learning_rate": 0.00043826760703287974, + "loss": 0.5164, + "step": 91040 + }, + { + "epoch": 4.5222012516141845, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004382278732492302, + "loss": 0.5594, + "step": 91050 + }, + { + "epoch": 4.522697923909805, + "grad_norm": 0.109375, + "learning_rate": 0.0004381881394655806, + "loss": 0.4987, + "step": 91060 + }, + { + "epoch": 4.523194596205424, + "grad_norm": 0.134765625, + "learning_rate": 0.00043814840568193105, + "loss": 0.5175, + "step": 91070 + }, + { + "epoch": 4.523691268501043, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004381086718982816, + "loss": 0.5346, + "step": 91080 + }, + { + "epoch": 4.524187940796662, + "grad_norm": 0.1767578125, + "learning_rate": 0.00043806893811463194, + "loss": 0.5303, + "step": 91090 + }, + { + "epoch": 4.5246846130922815, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004380292043309824, + "loss": 0.5066, + "step": 91100 + }, + { + "epoch": 4.525181285387901, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004379894705473329, + "loss": 0.5155, + "step": 91110 + }, + { + "epoch": 4.52567795768352, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004379497367636834, + "loss": 0.5041, + "step": 91120 + }, + { + "epoch": 4.52617462997914, + "grad_norm": 0.10205078125, + "learning_rate": 0.00043791000298003377, + "loss": 0.4968, + "step": 91130 + }, + { + "epoch": 4.526671302274759, + "grad_norm": 0.11474609375, + "learning_rate": 0.00043787026919638424, + "loss": 0.5027, + "step": 91140 + }, + { + "epoch": 4.527167974570379, + "grad_norm": 0.1943359375, + "learning_rate": 0.0004378305354127347, + "loss": 0.5014, + "step": 91150 + }, + { + "epoch": 4.527664646865998, + "grad_norm": 0.1123046875, + "learning_rate": 0.00043779080162908513, + "loss": 0.5132, + "step": 91160 + }, + { + "epoch": 4.528161319161617, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004377510678454356, + "loss": 0.5287, + "step": 91170 + }, + { + "epoch": 4.528657991457236, + "grad_norm": 0.0986328125, + "learning_rate": 0.00043771133406178607, + "loss": 0.5218, + "step": 91180 + }, + { + "epoch": 4.5291546637528555, + "grad_norm": 0.1044921875, + "learning_rate": 0.0004376716002781365, + "loss": 0.532, + "step": 91190 + }, + { + "epoch": 4.529651336048476, + "grad_norm": 0.130859375, + "learning_rate": 0.00043763186649448696, + "loss": 0.5352, + "step": 91200 + }, + { + "epoch": 4.530148008344095, + "grad_norm": 0.228515625, + "learning_rate": 0.00043759213271083743, + "loss": 0.5376, + "step": 91210 + }, + { + "epoch": 4.530644680639714, + "grad_norm": 0.140625, + "learning_rate": 0.00043755239892718785, + "loss": 0.5245, + "step": 91220 + }, + { + "epoch": 4.531141352935333, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004375126651435383, + "loss": 0.535, + "step": 91230 + }, + { + "epoch": 4.5316380252309525, + "grad_norm": 0.12353515625, + "learning_rate": 0.0004374729313598888, + "loss": 0.5091, + "step": 91240 + }, + { + "epoch": 4.532134697526572, + "grad_norm": 0.1474609375, + "learning_rate": 0.00043743319757623926, + "loss": 0.5041, + "step": 91250 + }, + { + "epoch": 4.532631369822191, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004373934637925897, + "loss": 0.5173, + "step": 91260 + }, + { + "epoch": 4.533128042117811, + "grad_norm": 0.11376953125, + "learning_rate": 0.00043735373000894015, + "loss": 0.5157, + "step": 91270 + }, + { + "epoch": 4.53362471441343, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004373139962252906, + "loss": 0.5, + "step": 91280 + }, + { + "epoch": 4.5341213867090495, + "grad_norm": 0.109375, + "learning_rate": 0.000437274262441641, + "loss": 0.532, + "step": 91290 + }, + { + "epoch": 4.534618059004669, + "grad_norm": 0.10888671875, + "learning_rate": 0.00043723452865799146, + "loss": 0.5365, + "step": 91300 + }, + { + "epoch": 4.535114731300288, + "grad_norm": 0.10595703125, + "learning_rate": 0.000437194794874342, + "loss": 0.5145, + "step": 91310 + }, + { + "epoch": 4.535611403595907, + "grad_norm": 0.10693359375, + "learning_rate": 0.00043715506109069234, + "loss": 0.5214, + "step": 91320 + }, + { + "epoch": 4.536108075891526, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004371153273070428, + "loss": 0.512, + "step": 91330 + }, + { + "epoch": 4.5366047481871465, + "grad_norm": 0.12109375, + "learning_rate": 0.0004370755935233933, + "loss": 0.5114, + "step": 91340 + }, + { + "epoch": 4.537101420482766, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004370358597397437, + "loss": 0.5316, + "step": 91350 + }, + { + "epoch": 4.537598092778385, + "grad_norm": 0.150390625, + "learning_rate": 0.0004369961259560942, + "loss": 0.5055, + "step": 91360 + }, + { + "epoch": 4.538094765074004, + "grad_norm": 0.119140625, + "learning_rate": 0.00043695639217244465, + "loss": 0.5538, + "step": 91370 + }, + { + "epoch": 4.5385914373696234, + "grad_norm": 0.1396484375, + "learning_rate": 0.00043691665838879506, + "loss": 0.5058, + "step": 91380 + }, + { + "epoch": 4.539088109665243, + "grad_norm": 0.12353515625, + "learning_rate": 0.00043687692460514554, + "loss": 0.52, + "step": 91390 + }, + { + "epoch": 4.539584781960862, + "grad_norm": 0.107421875, + "learning_rate": 0.000436837190821496, + "loss": 0.5364, + "step": 91400 + }, + { + "epoch": 4.540081454256482, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004367974570378465, + "loss": 0.5169, + "step": 91410 + }, + { + "epoch": 4.540578126552101, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004367577232541969, + "loss": 0.5479, + "step": 91420 + }, + { + "epoch": 4.5410747988477205, + "grad_norm": 0.107421875, + "learning_rate": 0.00043671798947054737, + "loss": 0.5127, + "step": 91430 + }, + { + "epoch": 4.54157147114334, + "grad_norm": 0.1845703125, + "learning_rate": 0.00043667825568689784, + "loss": 0.5051, + "step": 91440 + }, + { + "epoch": 4.542068143438959, + "grad_norm": 0.10791015625, + "learning_rate": 0.00043663852190324826, + "loss": 0.5273, + "step": 91450 + }, + { + "epoch": 4.542564815734578, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004365987881195987, + "loss": 0.549, + "step": 91460 + }, + { + "epoch": 4.543061488030197, + "grad_norm": 0.12255859375, + "learning_rate": 0.0004365590543359492, + "loss": 0.5372, + "step": 91470 + }, + { + "epoch": 4.5435581603258175, + "grad_norm": 0.1552734375, + "learning_rate": 0.00043651932055229956, + "loss": 0.4916, + "step": 91480 + }, + { + "epoch": 4.544054832621437, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004364795867686501, + "loss": 0.5001, + "step": 91490 + }, + { + "epoch": 4.544551504917056, + "grad_norm": 0.1083984375, + "learning_rate": 0.00043643985298500056, + "loss": 0.5066, + "step": 91500 + }, + { + "epoch": 4.545048177212675, + "grad_norm": 0.1328125, + "learning_rate": 0.0004364001192013509, + "loss": 0.481, + "step": 91510 + }, + { + "epoch": 4.545544849508294, + "grad_norm": 0.1484375, + "learning_rate": 0.0004363603854177014, + "loss": 0.5047, + "step": 91520 + }, + { + "epoch": 4.546041521803914, + "grad_norm": 0.1611328125, + "learning_rate": 0.0004363206516340519, + "loss": 0.5525, + "step": 91530 + }, + { + "epoch": 4.546538194099533, + "grad_norm": 0.119140625, + "learning_rate": 0.0004362809178504023, + "loss": 0.52, + "step": 91540 + }, + { + "epoch": 4.547034866395153, + "grad_norm": 0.10400390625, + "learning_rate": 0.00043624118406675275, + "loss": 0.4791, + "step": 91550 + }, + { + "epoch": 4.547531538690772, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004362014502831032, + "loss": 0.5197, + "step": 91560 + }, + { + "epoch": 4.548028210986391, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004361617164994537, + "loss": 0.4996, + "step": 91570 + }, + { + "epoch": 4.548524883282011, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004361219827158041, + "loss": 0.5043, + "step": 91580 + }, + { + "epoch": 4.54902155557763, + "grad_norm": 0.1015625, + "learning_rate": 0.0004360822489321546, + "loss": 0.5015, + "step": 91590 + }, + { + "epoch": 4.549518227873249, + "grad_norm": 0.1171875, + "learning_rate": 0.00043604251514850505, + "loss": 0.5149, + "step": 91600 + }, + { + "epoch": 4.550014900168868, + "grad_norm": 0.1005859375, + "learning_rate": 0.00043600278136485547, + "loss": 0.5205, + "step": 91610 + }, + { + "epoch": 4.550511572464488, + "grad_norm": 0.1484375, + "learning_rate": 0.00043596304758120594, + "loss": 0.5549, + "step": 91620 + }, + { + "epoch": 4.551008244760107, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004359233137975564, + "loss": 0.4934, + "step": 91630 + }, + { + "epoch": 4.551504917055727, + "grad_norm": 0.1123046875, + "learning_rate": 0.00043588358001390683, + "loss": 0.5018, + "step": 91640 + }, + { + "epoch": 4.552001589351346, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004358438462302573, + "loss": 0.5088, + "step": 91650 + }, + { + "epoch": 4.552498261646965, + "grad_norm": 0.14453125, + "learning_rate": 0.0004358041124466078, + "loss": 0.5304, + "step": 91660 + }, + { + "epoch": 4.552994933942585, + "grad_norm": 0.1044921875, + "learning_rate": 0.00043576437866295814, + "loss": 0.5237, + "step": 91670 + }, + { + "epoch": 4.553491606238204, + "grad_norm": 0.15625, + "learning_rate": 0.00043572464487930866, + "loss": 0.4961, + "step": 91680 + }, + { + "epoch": 4.553988278533823, + "grad_norm": 0.140625, + "learning_rate": 0.00043568491109565913, + "loss": 0.507, + "step": 91690 + }, + { + "epoch": 4.554484950829442, + "grad_norm": 0.107421875, + "learning_rate": 0.0004356451773120096, + "loss": 0.493, + "step": 91700 + }, + { + "epoch": 4.554981623125062, + "grad_norm": 0.11279296875, + "learning_rate": 0.00043560544352835997, + "loss": 0.5198, + "step": 91710 + }, + { + "epoch": 4.555478295420682, + "grad_norm": 0.130859375, + "learning_rate": 0.0004355657097447105, + "loss": 0.4914, + "step": 91720 + }, + { + "epoch": 4.555974967716301, + "grad_norm": 0.12353515625, + "learning_rate": 0.00043552597596106097, + "loss": 0.5304, + "step": 91730 + }, + { + "epoch": 4.55647164001192, + "grad_norm": 0.103515625, + "learning_rate": 0.00043548624217741133, + "loss": 0.5141, + "step": 91740 + }, + { + "epoch": 4.556968312307539, + "grad_norm": 0.0986328125, + "learning_rate": 0.0004354465083937618, + "loss": 0.5095, + "step": 91750 + }, + { + "epoch": 4.5574649846031585, + "grad_norm": 0.09130859375, + "learning_rate": 0.0004354067746101123, + "loss": 0.5133, + "step": 91760 + }, + { + "epoch": 4.557961656898778, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004353670408264627, + "loss": 0.4957, + "step": 91770 + }, + { + "epoch": 4.558458329194398, + "grad_norm": 0.115234375, + "learning_rate": 0.00043532730704281316, + "loss": 0.5412, + "step": 91780 + }, + { + "epoch": 4.558955001490017, + "grad_norm": 0.0986328125, + "learning_rate": 0.00043528757325916363, + "loss": 0.5023, + "step": 91790 + }, + { + "epoch": 4.559451673785636, + "grad_norm": 0.11376953125, + "learning_rate": 0.00043524783947551405, + "loss": 0.5153, + "step": 91800 + }, + { + "epoch": 4.5599483460812555, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004352081056918645, + "loss": 0.5302, + "step": 91810 + }, + { + "epoch": 4.560445018376875, + "grad_norm": 0.1640625, + "learning_rate": 0.000435168371908215, + "loss": 0.5443, + "step": 91820 + }, + { + "epoch": 4.560941690672494, + "grad_norm": 0.1328125, + "learning_rate": 0.0004351286381245654, + "loss": 0.5454, + "step": 91830 + }, + { + "epoch": 4.561438362968113, + "grad_norm": 0.12109375, + "learning_rate": 0.0004350889043409159, + "loss": 0.5493, + "step": 91840 + }, + { + "epoch": 4.561935035263733, + "grad_norm": 0.1064453125, + "learning_rate": 0.00043504917055726635, + "loss": 0.5348, + "step": 91850 + }, + { + "epoch": 4.562431707559353, + "grad_norm": 0.10302734375, + "learning_rate": 0.0004350094367736168, + "loss": 0.5078, + "step": 91860 + }, + { + "epoch": 4.562928379854972, + "grad_norm": 0.1142578125, + "learning_rate": 0.00043496970298996724, + "loss": 0.5058, + "step": 91870 + }, + { + "epoch": 4.563425052150591, + "grad_norm": 0.13671875, + "learning_rate": 0.0004349299692063177, + "loss": 0.509, + "step": 91880 + }, + { + "epoch": 4.56392172444621, + "grad_norm": 0.12158203125, + "learning_rate": 0.0004348902354226682, + "loss": 0.5261, + "step": 91890 + }, + { + "epoch": 4.5644183967418295, + "grad_norm": 0.12353515625, + "learning_rate": 0.00043485050163901854, + "loss": 0.5089, + "step": 91900 + }, + { + "epoch": 4.564915069037449, + "grad_norm": 0.1279296875, + "learning_rate": 0.00043481076785536907, + "loss": 0.5048, + "step": 91910 + }, + { + "epoch": 4.565411741333069, + "grad_norm": 0.11181640625, + "learning_rate": 0.00043477103407171954, + "loss": 0.5006, + "step": 91920 + }, + { + "epoch": 4.565908413628688, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004347313002880699, + "loss": 0.4904, + "step": 91930 + }, + { + "epoch": 4.566405085924307, + "grad_norm": 0.103515625, + "learning_rate": 0.0004346915665044204, + "loss": 0.5116, + "step": 91940 + }, + { + "epoch": 4.5669017582199265, + "grad_norm": 0.12353515625, + "learning_rate": 0.0004346518327207709, + "loss": 0.5094, + "step": 91950 + }, + { + "epoch": 4.567398430515546, + "grad_norm": 0.1162109375, + "learning_rate": 0.00043461209893712126, + "loss": 0.4969, + "step": 91960 + }, + { + "epoch": 4.567895102811165, + "grad_norm": 0.10888671875, + "learning_rate": 0.00043457236515347174, + "loss": 0.512, + "step": 91970 + }, + { + "epoch": 4.568391775106784, + "grad_norm": 0.111328125, + "learning_rate": 0.0004345326313698222, + "loss": 0.5013, + "step": 91980 + }, + { + "epoch": 4.568888447402404, + "grad_norm": 0.10791015625, + "learning_rate": 0.0004344928975861726, + "loss": 0.504, + "step": 91990 + }, + { + "epoch": 4.5693851196980235, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004344531638025231, + "loss": 0.5093, + "step": 92000 + }, + { + "epoch": 4.569881791993643, + "grad_norm": 0.126953125, + "learning_rate": 0.00043441343001887357, + "loss": 0.5301, + "step": 92010 + }, + { + "epoch": 4.570378464289262, + "grad_norm": 0.126953125, + "learning_rate": 0.00043437369623522404, + "loss": 0.5294, + "step": 92020 + }, + { + "epoch": 4.570875136584881, + "grad_norm": 0.1123046875, + "learning_rate": 0.00043433396245157446, + "loss": 0.501, + "step": 92030 + }, + { + "epoch": 4.5713718088805, + "grad_norm": 0.1591796875, + "learning_rate": 0.0004342942286679249, + "loss": 0.5225, + "step": 92040 + }, + { + "epoch": 4.57186848117612, + "grad_norm": 0.1455078125, + "learning_rate": 0.0004342544948842754, + "loss": 0.5365, + "step": 92050 + }, + { + "epoch": 4.57236515347174, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004342147611006258, + "loss": 0.5126, + "step": 92060 + }, + { + "epoch": 4.572861825767359, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004341750273169763, + "loss": 0.5127, + "step": 92070 + }, + { + "epoch": 4.573358498062978, + "grad_norm": 0.15625, + "learning_rate": 0.00043413529353332676, + "loss": 0.5216, + "step": 92080 + }, + { + "epoch": 4.5738551703585975, + "grad_norm": 0.15625, + "learning_rate": 0.0004340955597496772, + "loss": 0.4977, + "step": 92090 + }, + { + "epoch": 4.574351842654217, + "grad_norm": 0.11572265625, + "learning_rate": 0.00043405582596602765, + "loss": 0.5137, + "step": 92100 + }, + { + "epoch": 4.574848514949836, + "grad_norm": 0.10546875, + "learning_rate": 0.0004340160921823781, + "loss": 0.5194, + "step": 92110 + }, + { + "epoch": 4.575345187245455, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004339763583987285, + "loss": 0.5342, + "step": 92120 + }, + { + "epoch": 4.575841859541075, + "grad_norm": 0.1357421875, + "learning_rate": 0.000433936624615079, + "loss": 0.5041, + "step": 92130 + }, + { + "epoch": 4.5763385318366945, + "grad_norm": 0.12158203125, + "learning_rate": 0.0004338968908314295, + "loss": 0.5216, + "step": 92140 + }, + { + "epoch": 4.576835204132314, + "grad_norm": 0.10595703125, + "learning_rate": 0.00043385715704777995, + "loss": 0.494, + "step": 92150 + }, + { + "epoch": 4.577331876427933, + "grad_norm": 0.1240234375, + "learning_rate": 0.0004338174232641303, + "loss": 0.5261, + "step": 92160 + }, + { + "epoch": 4.577828548723552, + "grad_norm": 0.150390625, + "learning_rate": 0.0004337776894804808, + "loss": 0.5084, + "step": 92170 + }, + { + "epoch": 4.578325221019171, + "grad_norm": 0.130859375, + "learning_rate": 0.0004337379556968313, + "loss": 0.5247, + "step": 92180 + }, + { + "epoch": 4.578821893314791, + "grad_norm": 0.10400390625, + "learning_rate": 0.00043369822191318167, + "loss": 0.5207, + "step": 92190 + }, + { + "epoch": 4.579318565610411, + "grad_norm": 0.1044921875, + "learning_rate": 0.00043365848812953214, + "loss": 0.5027, + "step": 92200 + }, + { + "epoch": 4.57981523790603, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004336187543458826, + "loss": 0.5522, + "step": 92210 + }, + { + "epoch": 4.580311910201649, + "grad_norm": 0.11474609375, + "learning_rate": 0.00043357902056223303, + "loss": 0.5035, + "step": 92220 + }, + { + "epoch": 4.580808582497268, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004335392867785835, + "loss": 0.5145, + "step": 92230 + }, + { + "epoch": 4.581305254792888, + "grad_norm": 0.1123046875, + "learning_rate": 0.000433499552994934, + "loss": 0.5112, + "step": 92240 + }, + { + "epoch": 4.581801927088507, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004334598192112844, + "loss": 0.5067, + "step": 92250 + }, + { + "epoch": 4.582298599384126, + "grad_norm": 0.1064453125, + "learning_rate": 0.00043342008542763486, + "loss": 0.5239, + "step": 92260 + }, + { + "epoch": 4.582795271679745, + "grad_norm": 0.125, + "learning_rate": 0.00043338035164398533, + "loss": 0.5027, + "step": 92270 + }, + { + "epoch": 4.583291943975365, + "grad_norm": 0.1513671875, + "learning_rate": 0.00043334061786033575, + "loss": 0.48, + "step": 92280 + }, + { + "epoch": 4.583788616270985, + "grad_norm": 0.115234375, + "learning_rate": 0.0004333008840766862, + "loss": 0.5186, + "step": 92290 + }, + { + "epoch": 4.584285288566604, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004332611502930367, + "loss": 0.5166, + "step": 92300 + }, + { + "epoch": 4.584781960862223, + "grad_norm": 0.109375, + "learning_rate": 0.00043322141650938717, + "loss": 0.5059, + "step": 92310 + }, + { + "epoch": 4.585278633157842, + "grad_norm": 0.1328125, + "learning_rate": 0.0004331816827257376, + "loss": 0.5462, + "step": 92320 + }, + { + "epoch": 4.585775305453462, + "grad_norm": 0.1572265625, + "learning_rate": 0.00043314194894208805, + "loss": 0.4904, + "step": 92330 + }, + { + "epoch": 4.586271977749081, + "grad_norm": 0.12890625, + "learning_rate": 0.0004331022151584385, + "loss": 0.5082, + "step": 92340 + }, + { + "epoch": 4.5867686500447, + "grad_norm": 0.1181640625, + "learning_rate": 0.0004330624813747889, + "loss": 0.5404, + "step": 92350 + }, + { + "epoch": 4.58726532234032, + "grad_norm": 0.1474609375, + "learning_rate": 0.0004330227475911394, + "loss": 0.5415, + "step": 92360 + }, + { + "epoch": 4.587761994635939, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004329830138074899, + "loss": 0.5209, + "step": 92370 + }, + { + "epoch": 4.588258666931559, + "grad_norm": 0.130859375, + "learning_rate": 0.00043294328002384025, + "loss": 0.5144, + "step": 92380 + }, + { + "epoch": 4.588755339227178, + "grad_norm": 0.10888671875, + "learning_rate": 0.0004329035462401907, + "loss": 0.4951, + "step": 92390 + }, + { + "epoch": 4.589252011522797, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004328638124565412, + "loss": 0.506, + "step": 92400 + }, + { + "epoch": 4.589748683818416, + "grad_norm": 0.11328125, + "learning_rate": 0.0004328240786728916, + "loss": 0.4986, + "step": 92410 + }, + { + "epoch": 4.5902453561140355, + "grad_norm": 0.1435546875, + "learning_rate": 0.0004327843448892421, + "loss": 0.5077, + "step": 92420 + }, + { + "epoch": 4.590742028409656, + "grad_norm": 0.15625, + "learning_rate": 0.00043274461110559255, + "loss": 0.5016, + "step": 92430 + }, + { + "epoch": 4.591238700705275, + "grad_norm": 0.10400390625, + "learning_rate": 0.00043270487732194297, + "loss": 0.5008, + "step": 92440 + }, + { + "epoch": 4.591735373000894, + "grad_norm": 0.10986328125, + "learning_rate": 0.00043266514353829344, + "loss": 0.5352, + "step": 92450 + }, + { + "epoch": 4.592232045296513, + "grad_norm": 0.1083984375, + "learning_rate": 0.0004326254097546439, + "loss": 0.5004, + "step": 92460 + }, + { + "epoch": 4.5927287175921325, + "grad_norm": 0.111328125, + "learning_rate": 0.0004325856759709944, + "loss": 0.5296, + "step": 92470 + }, + { + "epoch": 4.593225389887752, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004325459421873448, + "loss": 0.5047, + "step": 92480 + }, + { + "epoch": 4.593722062183371, + "grad_norm": 0.1220703125, + "learning_rate": 0.00043250620840369527, + "loss": 0.5274, + "step": 92490 + }, + { + "epoch": 4.594218734478991, + "grad_norm": 0.1259765625, + "learning_rate": 0.00043246647462004574, + "loss": 0.5117, + "step": 92500 + }, + { + "epoch": 4.59471540677461, + "grad_norm": 0.130859375, + "learning_rate": 0.00043242674083639616, + "loss": 0.5121, + "step": 92510 + }, + { + "epoch": 4.5952120790702295, + "grad_norm": 0.1162109375, + "learning_rate": 0.00043238700705274663, + "loss": 0.5202, + "step": 92520 + }, + { + "epoch": 4.595708751365849, + "grad_norm": 0.125, + "learning_rate": 0.0004323472732690971, + "loss": 0.5194, + "step": 92530 + }, + { + "epoch": 4.596205423661468, + "grad_norm": 0.1396484375, + "learning_rate": 0.00043230753948544746, + "loss": 0.5228, + "step": 92540 + }, + { + "epoch": 4.596702095957087, + "grad_norm": 0.09521484375, + "learning_rate": 0.000432267805701798, + "loss": 0.5171, + "step": 92550 + }, + { + "epoch": 4.5971987682527065, + "grad_norm": 0.12451171875, + "learning_rate": 0.00043222807191814846, + "loss": 0.4623, + "step": 92560 + }, + { + "epoch": 4.597695440548327, + "grad_norm": 0.1171875, + "learning_rate": 0.0004321883381344988, + "loss": 0.5129, + "step": 92570 + }, + { + "epoch": 4.598192112843946, + "grad_norm": 0.1572265625, + "learning_rate": 0.0004321486043508493, + "loss": 0.5342, + "step": 92580 + }, + { + "epoch": 4.598688785139565, + "grad_norm": 0.11669921875, + "learning_rate": 0.0004321088705671998, + "loss": 0.523, + "step": 92590 + }, + { + "epoch": 4.599185457435184, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004320691367835503, + "loss": 0.5319, + "step": 92600 + }, + { + "epoch": 4.5996821297308035, + "grad_norm": 0.130859375, + "learning_rate": 0.00043202940299990066, + "loss": 0.5162, + "step": 92610 + }, + { + "epoch": 4.600178802026423, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004319896692162511, + "loss": 0.486, + "step": 92620 + }, + { + "epoch": 4.600675474322042, + "grad_norm": 0.126953125, + "learning_rate": 0.00043194993543260165, + "loss": 0.5066, + "step": 92630 + }, + { + "epoch": 4.601172146617662, + "grad_norm": 0.123046875, + "learning_rate": 0.000431910201648952, + "loss": 0.5231, + "step": 92640 + }, + { + "epoch": 4.601668818913281, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004318704678653025, + "loss": 0.5092, + "step": 92650 + }, + { + "epoch": 4.6021654912089005, + "grad_norm": 0.115234375, + "learning_rate": 0.00043183073408165296, + "loss": 0.5104, + "step": 92660 + }, + { + "epoch": 4.60266216350452, + "grad_norm": 0.12109375, + "learning_rate": 0.0004317910002980034, + "loss": 0.4988, + "step": 92670 + }, + { + "epoch": 4.603158835800139, + "grad_norm": 0.11962890625, + "learning_rate": 0.00043175126651435385, + "loss": 0.5284, + "step": 92680 + }, + { + "epoch": 4.603655508095758, + "grad_norm": 0.13671875, + "learning_rate": 0.0004317115327307043, + "loss": 0.5235, + "step": 92690 + }, + { + "epoch": 4.604152180391377, + "grad_norm": 0.123046875, + "learning_rate": 0.00043167179894705474, + "loss": 0.5051, + "step": 92700 + }, + { + "epoch": 4.6046488526869975, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004316320651634052, + "loss": 0.4786, + "step": 92710 + }, + { + "epoch": 4.605145524982617, + "grad_norm": 0.1318359375, + "learning_rate": 0.0004315923313797557, + "loss": 0.4958, + "step": 92720 + }, + { + "epoch": 4.605642197278236, + "grad_norm": 0.11767578125, + "learning_rate": 0.00043155259759610604, + "loss": 0.5089, + "step": 92730 + }, + { + "epoch": 4.606138869573855, + "grad_norm": 0.15234375, + "learning_rate": 0.00043151286381245657, + "loss": 0.5285, + "step": 92740 + }, + { + "epoch": 4.606635541869474, + "grad_norm": 0.11572265625, + "learning_rate": 0.00043147313002880704, + "loss": 0.5246, + "step": 92750 + }, + { + "epoch": 4.607132214165094, + "grad_norm": 0.111328125, + "learning_rate": 0.0004314333962451575, + "loss": 0.5399, + "step": 92760 + }, + { + "epoch": 4.607628886460713, + "grad_norm": 0.12158203125, + "learning_rate": 0.00043139366246150787, + "loss": 0.5304, + "step": 92770 + }, + { + "epoch": 4.608125558756333, + "grad_norm": 0.142578125, + "learning_rate": 0.0004313539286778584, + "loss": 0.5089, + "step": 92780 + }, + { + "epoch": 4.608622231051952, + "grad_norm": 0.12451171875, + "learning_rate": 0.00043131419489420887, + "loss": 0.5307, + "step": 92790 + }, + { + "epoch": 4.6091189033475715, + "grad_norm": 0.125, + "learning_rate": 0.00043127446111055923, + "loss": 0.5138, + "step": 92800 + }, + { + "epoch": 4.609615575643191, + "grad_norm": 0.1044921875, + "learning_rate": 0.0004312347273269097, + "loss": 0.5108, + "step": 92810 + }, + { + "epoch": 4.61011224793881, + "grad_norm": 0.12060546875, + "learning_rate": 0.00043119499354326023, + "loss": 0.4865, + "step": 92820 + }, + { + "epoch": 4.610608920234429, + "grad_norm": 0.1171875, + "learning_rate": 0.0004311552597596106, + "loss": 0.5165, + "step": 92830 + }, + { + "epoch": 4.611105592530048, + "grad_norm": 0.1025390625, + "learning_rate": 0.00043111552597596106, + "loss": 0.491, + "step": 92840 + }, + { + "epoch": 4.6116022648256685, + "grad_norm": 0.11669921875, + "learning_rate": 0.00043107579219231153, + "loss": 0.5152, + "step": 92850 + }, + { + "epoch": 4.612098937121288, + "grad_norm": 0.12890625, + "learning_rate": 0.00043103605840866195, + "loss": 0.5158, + "step": 92860 + }, + { + "epoch": 4.612595609416907, + "grad_norm": 0.1416015625, + "learning_rate": 0.0004309963246250124, + "loss": 0.5234, + "step": 92870 + }, + { + "epoch": 4.613092281712526, + "grad_norm": 0.1787109375, + "learning_rate": 0.0004309565908413629, + "loss": 0.5254, + "step": 92880 + }, + { + "epoch": 4.613588954008145, + "grad_norm": 0.1162109375, + "learning_rate": 0.00043091685705771337, + "loss": 0.497, + "step": 92890 + }, + { + "epoch": 4.614085626303765, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004308771232740638, + "loss": 0.5365, + "step": 92900 + }, + { + "epoch": 4.614582298599384, + "grad_norm": 0.12109375, + "learning_rate": 0.00043083738949041425, + "loss": 0.5452, + "step": 92910 + }, + { + "epoch": 4.615078970895004, + "grad_norm": 0.142578125, + "learning_rate": 0.0004307976557067647, + "loss": 0.5492, + "step": 92920 + }, + { + "epoch": 4.615575643190623, + "grad_norm": 0.1103515625, + "learning_rate": 0.00043075792192311514, + "loss": 0.5096, + "step": 92930 + }, + { + "epoch": 4.616072315486242, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004307181881394656, + "loss": 0.5493, + "step": 92940 + }, + { + "epoch": 4.616568987781862, + "grad_norm": 0.1181640625, + "learning_rate": 0.0004306784543558161, + "loss": 0.5185, + "step": 92950 + }, + { + "epoch": 4.617065660077481, + "grad_norm": 0.11669921875, + "learning_rate": 0.0004306387205721665, + "loss": 0.5305, + "step": 92960 + }, + { + "epoch": 4.6175623323731, + "grad_norm": 0.1044921875, + "learning_rate": 0.000430598986788517, + "loss": 0.5127, + "step": 92970 + }, + { + "epoch": 4.618059004668719, + "grad_norm": 0.109375, + "learning_rate": 0.00043055925300486745, + "loss": 0.528, + "step": 92980 + }, + { + "epoch": 4.6185556769643386, + "grad_norm": 0.12890625, + "learning_rate": 0.0004305195192212178, + "loss": 0.5339, + "step": 92990 + }, + { + "epoch": 4.619052349259959, + "grad_norm": 0.109375, + "learning_rate": 0.0004304797854375683, + "loss": 0.5293, + "step": 93000 + }, + { + "epoch": 4.619549021555578, + "grad_norm": 0.111328125, + "learning_rate": 0.0004304400516539188, + "loss": 0.5201, + "step": 93010 + }, + { + "epoch": 4.620045693851197, + "grad_norm": 0.10888671875, + "learning_rate": 0.00043040031787026917, + "loss": 0.505, + "step": 93020 + }, + { + "epoch": 4.620542366146816, + "grad_norm": 0.1064453125, + "learning_rate": 0.00043036058408661964, + "loss": 0.5059, + "step": 93030 + }, + { + "epoch": 4.621039038442436, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004303208503029701, + "loss": 0.5123, + "step": 93040 + }, + { + "epoch": 4.621535710738055, + "grad_norm": 0.10986328125, + "learning_rate": 0.00043028111651932064, + "loss": 0.5062, + "step": 93050 + }, + { + "epoch": 4.622032383033674, + "grad_norm": 0.10595703125, + "learning_rate": 0.000430241382735671, + "loss": 0.5308, + "step": 93060 + }, + { + "epoch": 4.622529055329293, + "grad_norm": 0.1201171875, + "learning_rate": 0.00043020164895202147, + "loss": 0.517, + "step": 93070 + }, + { + "epoch": 4.623025727624913, + "grad_norm": 0.11474609375, + "learning_rate": 0.00043016191516837194, + "loss": 0.5132, + "step": 93080 + }, + { + "epoch": 4.623522399920533, + "grad_norm": 0.12060546875, + "learning_rate": 0.00043012218138472236, + "loss": 0.5055, + "step": 93090 + }, + { + "epoch": 4.624019072216152, + "grad_norm": 0.10888671875, + "learning_rate": 0.00043008244760107283, + "loss": 0.4954, + "step": 93100 + }, + { + "epoch": 4.624515744511771, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004300427138174233, + "loss": 0.4991, + "step": 93110 + }, + { + "epoch": 4.62501241680739, + "grad_norm": 0.11865234375, + "learning_rate": 0.0004300029800337737, + "loss": 0.5172, + "step": 93120 + }, + { + "epoch": 4.6255090891030095, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004299632462501242, + "loss": 0.5386, + "step": 93130 + }, + { + "epoch": 4.626005761398629, + "grad_norm": 0.1298828125, + "learning_rate": 0.00042992351246647466, + "loss": 0.5271, + "step": 93140 + }, + { + "epoch": 4.626502433694249, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004298837786828251, + "loss": 0.5245, + "step": 93150 + }, + { + "epoch": 4.626999105989868, + "grad_norm": 0.103515625, + "learning_rate": 0.00042984404489917555, + "loss": 0.541, + "step": 93160 + }, + { + "epoch": 4.627495778285487, + "grad_norm": 0.11767578125, + "learning_rate": 0.000429804311115526, + "loss": 0.4988, + "step": 93170 + }, + { + "epoch": 4.6279924505811065, + "grad_norm": 0.10546875, + "learning_rate": 0.0004297645773318764, + "loss": 0.5171, + "step": 93180 + }, + { + "epoch": 4.628489122876726, + "grad_norm": 0.109375, + "learning_rate": 0.0004297248435482269, + "loss": 0.5038, + "step": 93190 + }, + { + "epoch": 4.628985795172345, + "grad_norm": 0.12890625, + "learning_rate": 0.0004296851097645774, + "loss": 0.4955, + "step": 93200 + }, + { + "epoch": 4.629482467467964, + "grad_norm": 0.11669921875, + "learning_rate": 0.00042964537598092785, + "loss": 0.5067, + "step": 93210 + }, + { + "epoch": 4.629979139763584, + "grad_norm": 0.10546875, + "learning_rate": 0.0004296056421972782, + "loss": 0.5226, + "step": 93220 + }, + { + "epoch": 4.6304758120592036, + "grad_norm": 0.11279296875, + "learning_rate": 0.00042956590841362874, + "loss": 0.5224, + "step": 93230 + }, + { + "epoch": 4.630972484354823, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004295261746299792, + "loss": 0.5154, + "step": 93240 + }, + { + "epoch": 4.631469156650442, + "grad_norm": 0.1455078125, + "learning_rate": 0.0004294864408463296, + "loss": 0.5042, + "step": 93250 + }, + { + "epoch": 4.631965828946061, + "grad_norm": 0.109375, + "learning_rate": 0.00042944670706268005, + "loss": 0.5015, + "step": 93260 + }, + { + "epoch": 4.6324625012416805, + "grad_norm": 0.10205078125, + "learning_rate": 0.0004294069732790305, + "loss": 0.4962, + "step": 93270 + }, + { + "epoch": 4.6329591735373, + "grad_norm": 0.1220703125, + "learning_rate": 0.00042936723949538094, + "loss": 0.5325, + "step": 93280 + }, + { + "epoch": 4.63345584583292, + "grad_norm": 0.193359375, + "learning_rate": 0.0004293275057117314, + "loss": 0.5356, + "step": 93290 + }, + { + "epoch": 4.633952518128539, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004292877719280819, + "loss": 0.5387, + "step": 93300 + }, + { + "epoch": 4.634449190424158, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004292480381444323, + "loss": 0.5287, + "step": 93310 + }, + { + "epoch": 4.6349458627197775, + "grad_norm": 0.10888671875, + "learning_rate": 0.00042920830436078277, + "loss": 0.4945, + "step": 93320 + }, + { + "epoch": 4.635442535015397, + "grad_norm": 0.1064453125, + "learning_rate": 0.00042916857057713324, + "loss": 0.5056, + "step": 93330 + }, + { + "epoch": 4.635939207311016, + "grad_norm": 0.1376953125, + "learning_rate": 0.0004291288367934837, + "loss": 0.5392, + "step": 93340 + }, + { + "epoch": 4.636435879606635, + "grad_norm": 0.10888671875, + "learning_rate": 0.0004290891030098341, + "loss": 0.5309, + "step": 93350 + }, + { + "epoch": 4.636932551902255, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004290493692261846, + "loss": 0.5161, + "step": 93360 + }, + { + "epoch": 4.6374292241978745, + "grad_norm": 0.1220703125, + "learning_rate": 0.00042900963544253507, + "loss": 0.5358, + "step": 93370 + }, + { + "epoch": 4.637925896493494, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004289699016588855, + "loss": 0.5189, + "step": 93380 + }, + { + "epoch": 4.638422568789113, + "grad_norm": 0.09814453125, + "learning_rate": 0.00042893016787523596, + "loss": 0.5122, + "step": 93390 + }, + { + "epoch": 4.638919241084732, + "grad_norm": 0.10498046875, + "learning_rate": 0.00042889043409158643, + "loss": 0.4765, + "step": 93400 + }, + { + "epoch": 4.639415913380351, + "grad_norm": 0.1728515625, + "learning_rate": 0.0004288507003079368, + "loss": 0.4987, + "step": 93410 + }, + { + "epoch": 4.639912585675971, + "grad_norm": 0.099609375, + "learning_rate": 0.0004288109665242873, + "loss": 0.5058, + "step": 93420 + }, + { + "epoch": 4.640409257971591, + "grad_norm": 0.12451171875, + "learning_rate": 0.0004287712327406378, + "loss": 0.4966, + "step": 93430 + }, + { + "epoch": 4.64090593026721, + "grad_norm": 0.1015625, + "learning_rate": 0.00042873149895698815, + "loss": 0.5476, + "step": 93440 + }, + { + "epoch": 4.641402602562829, + "grad_norm": 0.1171875, + "learning_rate": 0.0004286917651733386, + "loss": 0.5086, + "step": 93450 + }, + { + "epoch": 4.641899274858448, + "grad_norm": 0.10595703125, + "learning_rate": 0.00042865203138968915, + "loss": 0.515, + "step": 93460 + }, + { + "epoch": 4.642395947154068, + "grad_norm": 0.12890625, + "learning_rate": 0.0004286122976060395, + "loss": 0.4866, + "step": 93470 + }, + { + "epoch": 4.642892619449687, + "grad_norm": 0.1044921875, + "learning_rate": 0.00042857256382239, + "loss": 0.4974, + "step": 93480 + }, + { + "epoch": 4.643389291745306, + "grad_norm": 0.1337890625, + "learning_rate": 0.00042853283003874045, + "loss": 0.5037, + "step": 93490 + }, + { + "epoch": 4.643885964040926, + "grad_norm": 0.10302734375, + "learning_rate": 0.000428493096255091, + "loss": 0.5221, + "step": 93500 + }, + { + "epoch": 4.6443826363365455, + "grad_norm": 0.1318359375, + "learning_rate": 0.00042845336247144134, + "loss": 0.4928, + "step": 93510 + }, + { + "epoch": 4.644879308632165, + "grad_norm": 0.11474609375, + "learning_rate": 0.0004284136286877918, + "loss": 0.5028, + "step": 93520 + }, + { + "epoch": 4.645375980927784, + "grad_norm": 0.12158203125, + "learning_rate": 0.0004283738949041423, + "loss": 0.5423, + "step": 93530 + }, + { + "epoch": 4.645872653223403, + "grad_norm": 0.1376953125, + "learning_rate": 0.0004283341611204927, + "loss": 0.4969, + "step": 93540 + }, + { + "epoch": 4.646369325519022, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004282944273368432, + "loss": 0.5313, + "step": 93550 + }, + { + "epoch": 4.646865997814642, + "grad_norm": 0.1064453125, + "learning_rate": 0.00042825469355319365, + "loss": 0.5243, + "step": 93560 + }, + { + "epoch": 4.647362670110262, + "grad_norm": 0.140625, + "learning_rate": 0.00042821495976954406, + "loss": 0.5266, + "step": 93570 + }, + { + "epoch": 4.647859342405881, + "grad_norm": 0.10595703125, + "learning_rate": 0.00042817522598589453, + "loss": 0.4925, + "step": 93580 + }, + { + "epoch": 4.6483560147015, + "grad_norm": 0.11865234375, + "learning_rate": 0.000428135492202245, + "loss": 0.5117, + "step": 93590 + }, + { + "epoch": 4.648852686997119, + "grad_norm": 0.1416015625, + "learning_rate": 0.00042809575841859537, + "loss": 0.502, + "step": 93600 + }, + { + "epoch": 4.649349359292739, + "grad_norm": 0.12353515625, + "learning_rate": 0.0004280560246349459, + "loss": 0.5174, + "step": 93610 + }, + { + "epoch": 4.649846031588358, + "grad_norm": 0.109375, + "learning_rate": 0.00042801629085129637, + "loss": 0.4945, + "step": 93620 + }, + { + "epoch": 4.650342703883977, + "grad_norm": 0.12890625, + "learning_rate": 0.00042797655706764673, + "loss": 0.5148, + "step": 93630 + }, + { + "epoch": 4.650839376179597, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004279368232839972, + "loss": 0.5034, + "step": 93640 + }, + { + "epoch": 4.651336048475216, + "grad_norm": 0.13671875, + "learning_rate": 0.0004278970895003477, + "loss": 0.5312, + "step": 93650 + }, + { + "epoch": 4.651832720770836, + "grad_norm": 0.10888671875, + "learning_rate": 0.0004278573557166982, + "loss": 0.4967, + "step": 93660 + }, + { + "epoch": 4.652329393066455, + "grad_norm": 0.1474609375, + "learning_rate": 0.00042781762193304856, + "loss": 0.5173, + "step": 93670 + }, + { + "epoch": 4.652826065362074, + "grad_norm": 0.1171875, + "learning_rate": 0.00042777788814939903, + "loss": 0.5002, + "step": 93680 + }, + { + "epoch": 4.653322737657693, + "grad_norm": 0.10302734375, + "learning_rate": 0.00042773815436574956, + "loss": 0.5336, + "step": 93690 + }, + { + "epoch": 4.653819409953313, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004276984205820999, + "loss": 0.5114, + "step": 93700 + }, + { + "epoch": 4.654316082248932, + "grad_norm": 0.1083984375, + "learning_rate": 0.0004276586867984504, + "loss": 0.4867, + "step": 93710 + }, + { + "epoch": 4.654812754544551, + "grad_norm": 0.12060546875, + "learning_rate": 0.00042761895301480086, + "loss": 0.4993, + "step": 93720 + }, + { + "epoch": 4.655309426840171, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004275792192311513, + "loss": 0.5041, + "step": 93730 + }, + { + "epoch": 4.65580609913579, + "grad_norm": 0.125, + "learning_rate": 0.00042753948544750175, + "loss": 0.5209, + "step": 93740 + }, + { + "epoch": 4.65630277143141, + "grad_norm": 0.2021484375, + "learning_rate": 0.0004274997516638522, + "loss": 0.5342, + "step": 93750 + }, + { + "epoch": 4.656799443727029, + "grad_norm": 0.1201171875, + "learning_rate": 0.00042746001788020264, + "loss": 0.5215, + "step": 93760 + }, + { + "epoch": 4.657296116022648, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004274202840965531, + "loss": 0.5045, + "step": 93770 + }, + { + "epoch": 4.657792788318267, + "grad_norm": 0.1083984375, + "learning_rate": 0.0004273805503129036, + "loss": 0.5057, + "step": 93780 + }, + { + "epoch": 4.6582894606138865, + "grad_norm": 0.12890625, + "learning_rate": 0.00042734081652925405, + "loss": 0.511, + "step": 93790 + }, + { + "epoch": 4.658786132909507, + "grad_norm": 0.0966796875, + "learning_rate": 0.00042730108274560447, + "loss": 0.5015, + "step": 93800 + }, + { + "epoch": 4.659282805205126, + "grad_norm": 0.13671875, + "learning_rate": 0.00042726134896195494, + "loss": 0.5054, + "step": 93810 + }, + { + "epoch": 4.659779477500745, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004272216151783054, + "loss": 0.5361, + "step": 93820 + }, + { + "epoch": 4.660276149796364, + "grad_norm": 0.1640625, + "learning_rate": 0.00042718188139465583, + "loss": 0.5188, + "step": 93830 + }, + { + "epoch": 4.6607728220919835, + "grad_norm": 0.1025390625, + "learning_rate": 0.0004271421476110063, + "loss": 0.4968, + "step": 93840 + }, + { + "epoch": 4.661269494387603, + "grad_norm": 0.12353515625, + "learning_rate": 0.00042710241382735677, + "loss": 0.4918, + "step": 93850 + }, + { + "epoch": 4.661766166683222, + "grad_norm": 0.13671875, + "learning_rate": 0.00042706268004370714, + "loss": 0.533, + "step": 93860 + }, + { + "epoch": 4.662262838978842, + "grad_norm": 0.220703125, + "learning_rate": 0.0004270229462600576, + "loss": 0.4973, + "step": 93870 + }, + { + "epoch": 4.662759511274461, + "grad_norm": 0.1181640625, + "learning_rate": 0.00042698321247640813, + "loss": 0.4874, + "step": 93880 + }, + { + "epoch": 4.6632561835700805, + "grad_norm": 0.119140625, + "learning_rate": 0.0004269434786927585, + "loss": 0.5093, + "step": 93890 + }, + { + "epoch": 4.6637528558657, + "grad_norm": 0.12255859375, + "learning_rate": 0.00042690374490910897, + "loss": 0.4838, + "step": 93900 + }, + { + "epoch": 4.664249528161319, + "grad_norm": 0.1279296875, + "learning_rate": 0.00042686401112545944, + "loss": 0.5036, + "step": 93910 + }, + { + "epoch": 4.664746200456938, + "grad_norm": 0.1298828125, + "learning_rate": 0.00042682427734180986, + "loss": 0.5148, + "step": 93920 + }, + { + "epoch": 4.6652428727525574, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004267845435581603, + "loss": 0.4976, + "step": 93930 + }, + { + "epoch": 4.665739545048178, + "grad_norm": 0.1083984375, + "learning_rate": 0.0004267448097745108, + "loss": 0.5301, + "step": 93940 + }, + { + "epoch": 4.666236217343797, + "grad_norm": 0.1220703125, + "learning_rate": 0.00042670507599086127, + "loss": 0.5311, + "step": 93950 + }, + { + "epoch": 4.666732889639416, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004266653422072117, + "loss": 0.5232, + "step": 93960 + }, + { + "epoch": 4.667229561935035, + "grad_norm": 0.12109375, + "learning_rate": 0.00042662560842356216, + "loss": 0.5074, + "step": 93970 + }, + { + "epoch": 4.6677262342306545, + "grad_norm": 0.1142578125, + "learning_rate": 0.00042658587463991263, + "loss": 0.5078, + "step": 93980 + }, + { + "epoch": 4.668222906526274, + "grad_norm": 0.103515625, + "learning_rate": 0.00042654614085626305, + "loss": 0.5122, + "step": 93990 + }, + { + "epoch": 4.668719578821893, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004265064070726135, + "loss": 0.5162, + "step": 94000 + }, + { + "epoch": 4.669216251117513, + "grad_norm": 0.1279296875, + "learning_rate": 0.000426466673288964, + "loss": 0.5243, + "step": 94010 + }, + { + "epoch": 4.669712923413132, + "grad_norm": 0.125, + "learning_rate": 0.0004264269395053144, + "loss": 0.5339, + "step": 94020 + }, + { + "epoch": 4.6702095957087515, + "grad_norm": 0.123046875, + "learning_rate": 0.0004263872057216649, + "loss": 0.5147, + "step": 94030 + }, + { + "epoch": 4.670706268004371, + "grad_norm": 0.146484375, + "learning_rate": 0.00042634747193801535, + "loss": 0.5363, + "step": 94040 + }, + { + "epoch": 4.67120294029999, + "grad_norm": 0.1181640625, + "learning_rate": 0.0004263077381543657, + "loss": 0.5158, + "step": 94050 + }, + { + "epoch": 4.671699612595609, + "grad_norm": 0.11767578125, + "learning_rate": 0.00042626800437071624, + "loss": 0.5312, + "step": 94060 + }, + { + "epoch": 4.672196284891228, + "grad_norm": 0.10009765625, + "learning_rate": 0.0004262282705870667, + "loss": 0.529, + "step": 94070 + }, + { + "epoch": 4.6726929571868485, + "grad_norm": 0.10546875, + "learning_rate": 0.00042618853680341707, + "loss": 0.5017, + "step": 94080 + }, + { + "epoch": 4.673189629482468, + "grad_norm": 0.12890625, + "learning_rate": 0.00042614880301976754, + "loss": 0.5209, + "step": 94090 + }, + { + "epoch": 4.673686301778087, + "grad_norm": 0.1982421875, + "learning_rate": 0.000426109069236118, + "loss": 0.5108, + "step": 94100 + }, + { + "epoch": 4.674182974073706, + "grad_norm": 0.1552734375, + "learning_rate": 0.00042606933545246854, + "loss": 0.5438, + "step": 94110 + }, + { + "epoch": 4.674679646369325, + "grad_norm": 0.126953125, + "learning_rate": 0.0004260296016688189, + "loss": 0.5113, + "step": 94120 + }, + { + "epoch": 4.675176318664945, + "grad_norm": 0.138671875, + "learning_rate": 0.0004259898678851694, + "loss": 0.4907, + "step": 94130 + }, + { + "epoch": 4.675672990960564, + "grad_norm": 0.11865234375, + "learning_rate": 0.00042595013410151985, + "loss": 0.528, + "step": 94140 + }, + { + "epoch": 4.676169663256184, + "grad_norm": 0.158203125, + "learning_rate": 0.00042591040031787026, + "loss": 0.53, + "step": 94150 + }, + { + "epoch": 4.676666335551803, + "grad_norm": 0.119140625, + "learning_rate": 0.00042587066653422073, + "loss": 0.5285, + "step": 94160 + }, + { + "epoch": 4.6771630078474224, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004258309327505712, + "loss": 0.5242, + "step": 94170 + }, + { + "epoch": 4.677659680143042, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004257911989669216, + "loss": 0.5066, + "step": 94180 + }, + { + "epoch": 4.678156352438661, + "grad_norm": 0.1806640625, + "learning_rate": 0.0004257514651832721, + "loss": 0.5107, + "step": 94190 + }, + { + "epoch": 4.67865302473428, + "grad_norm": 0.11669921875, + "learning_rate": 0.00042571173139962257, + "loss": 0.5478, + "step": 94200 + }, + { + "epoch": 4.679149697029899, + "grad_norm": 0.1630859375, + "learning_rate": 0.000425671997615973, + "loss": 0.5088, + "step": 94210 + }, + { + "epoch": 4.6796463693255195, + "grad_norm": 0.1298828125, + "learning_rate": 0.00042563226383232345, + "loss": 0.5017, + "step": 94220 + }, + { + "epoch": 4.680143041621139, + "grad_norm": 0.1181640625, + "learning_rate": 0.0004255925300486739, + "loss": 0.5478, + "step": 94230 + }, + { + "epoch": 4.680639713916758, + "grad_norm": 0.107421875, + "learning_rate": 0.0004255527962650244, + "loss": 0.5269, + "step": 94240 + }, + { + "epoch": 4.681136386212377, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004255130624813748, + "loss": 0.506, + "step": 94250 + }, + { + "epoch": 4.681633058507996, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004254733286977253, + "loss": 0.533, + "step": 94260 + }, + { + "epoch": 4.682129730803616, + "grad_norm": 0.1083984375, + "learning_rate": 0.00042543359491407576, + "loss": 0.5428, + "step": 94270 + }, + { + "epoch": 4.682626403099235, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004253938611304261, + "loss": 0.51, + "step": 94280 + }, + { + "epoch": 4.683123075394855, + "grad_norm": 0.142578125, + "learning_rate": 0.00042535412734677664, + "loss": 0.5113, + "step": 94290 + }, + { + "epoch": 4.683619747690474, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004253143935631271, + "loss": 0.5103, + "step": 94300 + }, + { + "epoch": 4.684116419986093, + "grad_norm": 0.11474609375, + "learning_rate": 0.0004252746597794775, + "loss": 0.4983, + "step": 94310 + }, + { + "epoch": 4.684613092281713, + "grad_norm": 0.1474609375, + "learning_rate": 0.00042523492599582795, + "loss": 0.506, + "step": 94320 + }, + { + "epoch": 4.685109764577332, + "grad_norm": 0.1845703125, + "learning_rate": 0.0004251951922121785, + "loss": 0.5123, + "step": 94330 + }, + { + "epoch": 4.685606436872951, + "grad_norm": 0.130859375, + "learning_rate": 0.00042515545842852884, + "loss": 0.5111, + "step": 94340 + }, + { + "epoch": 4.68610310916857, + "grad_norm": 0.1201171875, + "learning_rate": 0.0004251157246448793, + "loss": 0.5294, + "step": 94350 + }, + { + "epoch": 4.6865997814641895, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004250759908612298, + "loss": 0.5078, + "step": 94360 + }, + { + "epoch": 4.68709645375981, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004250362570775802, + "loss": 0.5218, + "step": 94370 + }, + { + "epoch": 4.687593126055429, + "grad_norm": 0.12060546875, + "learning_rate": 0.00042499652329393067, + "loss": 0.5118, + "step": 94380 + }, + { + "epoch": 4.688089798351048, + "grad_norm": 0.1240234375, + "learning_rate": 0.00042495678951028114, + "loss": 0.5267, + "step": 94390 + }, + { + "epoch": 4.688586470646667, + "grad_norm": 0.10009765625, + "learning_rate": 0.0004249170557266316, + "loss": 0.5034, + "step": 94400 + }, + { + "epoch": 4.689083142942287, + "grad_norm": 0.12109375, + "learning_rate": 0.00042487732194298203, + "loss": 0.502, + "step": 94410 + }, + { + "epoch": 4.689579815237906, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004248375881593325, + "loss": 0.514, + "step": 94420 + }, + { + "epoch": 4.690076487533525, + "grad_norm": 0.10595703125, + "learning_rate": 0.000424797854375683, + "loss": 0.5047, + "step": 94430 + }, + { + "epoch": 4.690573159829144, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004247581205920334, + "loss": 0.5045, + "step": 94440 + }, + { + "epoch": 4.691069832124764, + "grad_norm": 0.11669921875, + "learning_rate": 0.00042471838680838386, + "loss": 0.5306, + "step": 94450 + }, + { + "epoch": 4.691566504420384, + "grad_norm": 0.10302734375, + "learning_rate": 0.00042467865302473433, + "loss": 0.4941, + "step": 94460 + }, + { + "epoch": 4.692063176716003, + "grad_norm": 0.1171875, + "learning_rate": 0.0004246389192410847, + "loss": 0.5001, + "step": 94470 + }, + { + "epoch": 4.692559849011622, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004245991854574352, + "loss": 0.5268, + "step": 94480 + }, + { + "epoch": 4.693056521307241, + "grad_norm": 0.1435546875, + "learning_rate": 0.0004245594516737857, + "loss": 0.5235, + "step": 94490 + }, + { + "epoch": 4.6935531936028605, + "grad_norm": 0.10546875, + "learning_rate": 0.00042451971789013606, + "loss": 0.5035, + "step": 94500 + }, + { + "epoch": 4.69404986589848, + "grad_norm": 0.10205078125, + "learning_rate": 0.0004244799841064865, + "loss": 0.5141, + "step": 94510 + }, + { + "epoch": 4.6945465381941, + "grad_norm": 0.126953125, + "learning_rate": 0.00042444025032283705, + "loss": 0.5439, + "step": 94520 + }, + { + "epoch": 4.695043210489719, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004244005165391874, + "loss": 0.5259, + "step": 94530 + }, + { + "epoch": 4.695539882785338, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004243607827555379, + "loss": 0.5355, + "step": 94540 + }, + { + "epoch": 4.6960365550809575, + "grad_norm": 0.10205078125, + "learning_rate": 0.00042432104897188836, + "loss": 0.5214, + "step": 94550 + }, + { + "epoch": 4.696533227376577, + "grad_norm": 0.11328125, + "learning_rate": 0.0004242813151882389, + "loss": 0.5157, + "step": 94560 + }, + { + "epoch": 4.697029899672196, + "grad_norm": 0.10400390625, + "learning_rate": 0.00042424158140458925, + "loss": 0.5294, + "step": 94570 + }, + { + "epoch": 4.697526571967815, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004242018476209397, + "loss": 0.535, + "step": 94580 + }, + { + "epoch": 4.698023244263435, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004241621138372902, + "loss": 0.5182, + "step": 94590 + }, + { + "epoch": 4.6985199165590545, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004241223800536406, + "loss": 0.5396, + "step": 94600 + }, + { + "epoch": 4.699016588854674, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004240826462699911, + "loss": 0.4946, + "step": 94610 + }, + { + "epoch": 4.699513261150293, + "grad_norm": 0.10498046875, + "learning_rate": 0.00042404291248634155, + "loss": 0.5151, + "step": 94620 + }, + { + "epoch": 4.700009933445912, + "grad_norm": 0.13671875, + "learning_rate": 0.00042400317870269197, + "loss": 0.514, + "step": 94630 + }, + { + "epoch": 4.7005066057415315, + "grad_norm": 0.1162109375, + "learning_rate": 0.00042396344491904244, + "loss": 0.5305, + "step": 94640 + }, + { + "epoch": 4.701003278037151, + "grad_norm": 0.171875, + "learning_rate": 0.0004239237111353929, + "loss": 0.5106, + "step": 94650 + }, + { + "epoch": 4.701499950332771, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004238839773517433, + "loss": 0.5304, + "step": 94660 + }, + { + "epoch": 4.70199662262839, + "grad_norm": 0.12060546875, + "learning_rate": 0.0004238442435680938, + "loss": 0.5195, + "step": 94670 + }, + { + "epoch": 4.702493294924009, + "grad_norm": 0.185546875, + "learning_rate": 0.00042380450978444427, + "loss": 0.5279, + "step": 94680 + }, + { + "epoch": 4.7029899672196285, + "grad_norm": 0.12353515625, + "learning_rate": 0.00042376477600079474, + "loss": 0.5219, + "step": 94690 + }, + { + "epoch": 4.703486639515248, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004237250422171451, + "loss": 0.5455, + "step": 94700 + }, + { + "epoch": 4.703983311810867, + "grad_norm": 0.1142578125, + "learning_rate": 0.00042368530843349563, + "loss": 0.5194, + "step": 94710 + }, + { + "epoch": 4.704479984106486, + "grad_norm": 0.111328125, + "learning_rate": 0.0004236455746498461, + "loss": 0.5275, + "step": 94720 + }, + { + "epoch": 4.704976656402106, + "grad_norm": 0.1181640625, + "learning_rate": 0.00042360584086619646, + "loss": 0.5154, + "step": 94730 + }, + { + "epoch": 4.7054733286977255, + "grad_norm": 0.1123046875, + "learning_rate": 0.00042356610708254693, + "loss": 0.5294, + "step": 94740 + }, + { + "epoch": 4.705970000993345, + "grad_norm": 0.1259765625, + "learning_rate": 0.00042352637329889746, + "loss": 0.5077, + "step": 94750 + }, + { + "epoch": 4.706466673288964, + "grad_norm": 0.1728515625, + "learning_rate": 0.0004234866395152478, + "loss": 0.5258, + "step": 94760 + }, + { + "epoch": 4.706963345584583, + "grad_norm": 0.177734375, + "learning_rate": 0.0004234469057315983, + "loss": 0.5071, + "step": 94770 + }, + { + "epoch": 4.707460017880202, + "grad_norm": 0.119140625, + "learning_rate": 0.00042340717194794877, + "loss": 0.5164, + "step": 94780 + }, + { + "epoch": 4.707956690175822, + "grad_norm": 0.142578125, + "learning_rate": 0.0004233674381642992, + "loss": 0.4936, + "step": 94790 + }, + { + "epoch": 4.708453362471442, + "grad_norm": 0.1064453125, + "learning_rate": 0.00042332770438064965, + "loss": 0.5188, + "step": 94800 + }, + { + "epoch": 4.708950034767061, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004232879705970001, + "loss": 0.4913, + "step": 94810 + }, + { + "epoch": 4.70944670706268, + "grad_norm": 0.1298828125, + "learning_rate": 0.00042324823681335054, + "loss": 0.543, + "step": 94820 + }, + { + "epoch": 4.709943379358299, + "grad_norm": 0.10888671875, + "learning_rate": 0.000423208503029701, + "loss": 0.5066, + "step": 94830 + }, + { + "epoch": 4.710440051653919, + "grad_norm": 0.1083984375, + "learning_rate": 0.0004231687692460515, + "loss": 0.5207, + "step": 94840 + }, + { + "epoch": 4.710936723949538, + "grad_norm": 0.142578125, + "learning_rate": 0.00042312903546240196, + "loss": 0.5341, + "step": 94850 + }, + { + "epoch": 4.711433396245157, + "grad_norm": 0.1181640625, + "learning_rate": 0.0004230893016787524, + "loss": 0.5013, + "step": 94860 + }, + { + "epoch": 4.711930068540777, + "grad_norm": 0.158203125, + "learning_rate": 0.00042304956789510285, + "loss": 0.5352, + "step": 94870 + }, + { + "epoch": 4.7124267408363965, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004230098341114533, + "loss": 0.5075, + "step": 94880 + }, + { + "epoch": 4.712923413132016, + "grad_norm": 0.1279296875, + "learning_rate": 0.00042297010032780373, + "loss": 0.4926, + "step": 94890 + }, + { + "epoch": 4.713420085427635, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004229303665441542, + "loss": 0.5439, + "step": 94900 + }, + { + "epoch": 4.713916757723254, + "grad_norm": 0.1455078125, + "learning_rate": 0.0004228906327605047, + "loss": 0.5166, + "step": 94910 + }, + { + "epoch": 4.714413430018873, + "grad_norm": 0.173828125, + "learning_rate": 0.00042285089897685504, + "loss": 0.5406, + "step": 94920 + }, + { + "epoch": 4.714910102314493, + "grad_norm": 0.12158203125, + "learning_rate": 0.00042281116519320556, + "loss": 0.5206, + "step": 94930 + }, + { + "epoch": 4.715406774610113, + "grad_norm": 0.15234375, + "learning_rate": 0.00042277143140955604, + "loss": 0.4995, + "step": 94940 + }, + { + "epoch": 4.715903446905732, + "grad_norm": 0.1630859375, + "learning_rate": 0.0004227316976259064, + "loss": 0.4979, + "step": 94950 + }, + { + "epoch": 4.716400119201351, + "grad_norm": 0.13671875, + "learning_rate": 0.00042269196384225687, + "loss": 0.5015, + "step": 94960 + }, + { + "epoch": 4.71689679149697, + "grad_norm": 0.1376953125, + "learning_rate": 0.00042265223005860734, + "loss": 0.5038, + "step": 94970 + }, + { + "epoch": 4.71739346379259, + "grad_norm": 0.11181640625, + "learning_rate": 0.00042261249627495776, + "loss": 0.5304, + "step": 94980 + }, + { + "epoch": 4.717890136088209, + "grad_norm": 0.11376953125, + "learning_rate": 0.00042257276249130823, + "loss": 0.5447, + "step": 94990 + }, + { + "epoch": 4.718386808383828, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004225330287076587, + "loss": 0.4964, + "step": 95000 + }, + { + "epoch": 4.718883480679448, + "grad_norm": 0.1533203125, + "learning_rate": 0.0004224932949240092, + "loss": 0.5399, + "step": 95010 + }, + { + "epoch": 4.719380152975067, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004224535611403596, + "loss": 0.5241, + "step": 95020 + }, + { + "epoch": 4.719876825270687, + "grad_norm": 0.126953125, + "learning_rate": 0.00042241382735671006, + "loss": 0.5011, + "step": 95030 + }, + { + "epoch": 4.720373497566306, + "grad_norm": 0.10693359375, + "learning_rate": 0.00042237409357306053, + "loss": 0.5227, + "step": 95040 + }, + { + "epoch": 4.720870169861925, + "grad_norm": 0.12109375, + "learning_rate": 0.00042233435978941095, + "loss": 0.4831, + "step": 95050 + }, + { + "epoch": 4.721366842157544, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004222946260057614, + "loss": 0.5311, + "step": 95060 + }, + { + "epoch": 4.7218635144531635, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004222548922221119, + "loss": 0.5486, + "step": 95070 + }, + { + "epoch": 4.722360186748783, + "grad_norm": 0.10791015625, + "learning_rate": 0.0004222151584384623, + "loss": 0.4966, + "step": 95080 + }, + { + "epoch": 4.722856859044403, + "grad_norm": 0.1240234375, + "learning_rate": 0.0004221754246548128, + "loss": 0.5321, + "step": 95090 + }, + { + "epoch": 4.723353531340022, + "grad_norm": 0.10693359375, + "learning_rate": 0.00042213569087116325, + "loss": 0.5146, + "step": 95100 + }, + { + "epoch": 4.723850203635641, + "grad_norm": 0.12353515625, + "learning_rate": 0.0004220959570875136, + "loss": 0.5115, + "step": 95110 + }, + { + "epoch": 4.724346875931261, + "grad_norm": 0.1435546875, + "learning_rate": 0.00042205622330386414, + "loss": 0.5373, + "step": 95120 + }, + { + "epoch": 4.72484354822688, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004220164895202146, + "loss": 0.5361, + "step": 95130 + }, + { + "epoch": 4.725340220522499, + "grad_norm": 0.134765625, + "learning_rate": 0.0004219767557365651, + "loss": 0.5529, + "step": 95140 + }, + { + "epoch": 4.725836892818118, + "grad_norm": 0.11083984375, + "learning_rate": 0.00042193702195291545, + "loss": 0.5089, + "step": 95150 + }, + { + "epoch": 4.7263335651137375, + "grad_norm": 0.1240234375, + "learning_rate": 0.00042189728816926597, + "loss": 0.5577, + "step": 95160 + }, + { + "epoch": 4.726830237409358, + "grad_norm": 0.12451171875, + "learning_rate": 0.00042185755438561644, + "loss": 0.5262, + "step": 95170 + }, + { + "epoch": 4.727326909704977, + "grad_norm": 0.1640625, + "learning_rate": 0.0004218178206019668, + "loss": 0.5136, + "step": 95180 + }, + { + "epoch": 4.727823582000596, + "grad_norm": 0.18359375, + "learning_rate": 0.0004217780868183173, + "loss": 0.4981, + "step": 95190 + }, + { + "epoch": 4.728320254296215, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004217383530346678, + "loss": 0.5209, + "step": 95200 + }, + { + "epoch": 4.7288169265918345, + "grad_norm": 0.1328125, + "learning_rate": 0.00042169861925101817, + "loss": 0.5366, + "step": 95210 + }, + { + "epoch": 4.729313598887454, + "grad_norm": 0.1181640625, + "learning_rate": 0.00042165888546736864, + "loss": 0.4795, + "step": 95220 + }, + { + "epoch": 4.729810271183073, + "grad_norm": 0.11328125, + "learning_rate": 0.0004216191516837191, + "loss": 0.5345, + "step": 95230 + }, + { + "epoch": 4.730306943478693, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004215794179000695, + "loss": 0.5245, + "step": 95240 + }, + { + "epoch": 4.730803615774312, + "grad_norm": 0.1279296875, + "learning_rate": 0.00042153968411642, + "loss": 0.5295, + "step": 95250 + }, + { + "epoch": 4.7313002880699315, + "grad_norm": 0.12060546875, + "learning_rate": 0.00042149995033277047, + "loss": 0.5018, + "step": 95260 + }, + { + "epoch": 4.731796960365551, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004214602165491209, + "loss": 0.4959, + "step": 95270 + }, + { + "epoch": 4.73229363266117, + "grad_norm": 0.10107421875, + "learning_rate": 0.00042142048276547136, + "loss": 0.517, + "step": 95280 + }, + { + "epoch": 4.732790304956789, + "grad_norm": 0.1396484375, + "learning_rate": 0.00042138074898182183, + "loss": 0.5057, + "step": 95290 + }, + { + "epoch": 4.733286977252408, + "grad_norm": 0.1240234375, + "learning_rate": 0.0004213410151981723, + "loss": 0.5063, + "step": 95300 + }, + { + "epoch": 4.7337836495480285, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004213012814145227, + "loss": 0.5184, + "step": 95310 + }, + { + "epoch": 4.734280321843648, + "grad_norm": 0.10888671875, + "learning_rate": 0.0004212615476308732, + "loss": 0.5132, + "step": 95320 + }, + { + "epoch": 4.734776994139267, + "grad_norm": 0.11865234375, + "learning_rate": 0.00042122181384722366, + "loss": 0.5021, + "step": 95330 + }, + { + "epoch": 4.735273666434886, + "grad_norm": 0.15234375, + "learning_rate": 0.000421182080063574, + "loss": 0.4903, + "step": 95340 + }, + { + "epoch": 4.7357703387305055, + "grad_norm": 0.11962890625, + "learning_rate": 0.00042114234627992455, + "loss": 0.5328, + "step": 95350 + }, + { + "epoch": 4.736267011026125, + "grad_norm": 0.11376953125, + "learning_rate": 0.000421102612496275, + "loss": 0.5182, + "step": 95360 + }, + { + "epoch": 4.736763683321744, + "grad_norm": 0.11669921875, + "learning_rate": 0.0004210628787126254, + "loss": 0.5032, + "step": 95370 + }, + { + "epoch": 4.737260355617364, + "grad_norm": 0.11572265625, + "learning_rate": 0.00042102314492897585, + "loss": 0.5062, + "step": 95380 + }, + { + "epoch": 4.737757027912983, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004209834111453264, + "loss": 0.5156, + "step": 95390 + }, + { + "epoch": 4.7382537002086025, + "grad_norm": 0.1005859375, + "learning_rate": 0.00042094367736167674, + "loss": 0.5158, + "step": 95400 + }, + { + "epoch": 4.738750372504222, + "grad_norm": 0.125, + "learning_rate": 0.0004209039435780272, + "loss": 0.5502, + "step": 95410 + }, + { + "epoch": 4.739247044799841, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004208642097943777, + "loss": 0.5035, + "step": 95420 + }, + { + "epoch": 4.73974371709546, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004208244760107282, + "loss": 0.5091, + "step": 95430 + }, + { + "epoch": 4.740240389391079, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004207847422270786, + "loss": 0.5248, + "step": 95440 + }, + { + "epoch": 4.7407370616866995, + "grad_norm": 0.1162109375, + "learning_rate": 0.00042074500844342905, + "loss": 0.5515, + "step": 95450 + }, + { + "epoch": 4.741233733982319, + "grad_norm": 0.10009765625, + "learning_rate": 0.0004207052746597795, + "loss": 0.5064, + "step": 95460 + }, + { + "epoch": 4.741730406277938, + "grad_norm": 0.1103515625, + "learning_rate": 0.00042066554087612993, + "loss": 0.506, + "step": 95470 + }, + { + "epoch": 4.742227078573557, + "grad_norm": 0.11474609375, + "learning_rate": 0.0004206258070924804, + "loss": 0.4984, + "step": 95480 + }, + { + "epoch": 4.742723750869176, + "grad_norm": 0.138671875, + "learning_rate": 0.0004205860733088309, + "loss": 0.5167, + "step": 95490 + }, + { + "epoch": 4.743220423164796, + "grad_norm": 0.12060546875, + "learning_rate": 0.0004205463395251813, + "loss": 0.5086, + "step": 95500 + }, + { + "epoch": 4.743717095460415, + "grad_norm": 0.1279296875, + "learning_rate": 0.00042050660574153177, + "loss": 0.5237, + "step": 95510 + }, + { + "epoch": 4.744213767756035, + "grad_norm": 0.140625, + "learning_rate": 0.00042046687195788224, + "loss": 0.5113, + "step": 95520 + }, + { + "epoch": 4.744710440051654, + "grad_norm": 0.130859375, + "learning_rate": 0.0004204271381742326, + "loss": 0.5052, + "step": 95530 + }, + { + "epoch": 4.745207112347273, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004203874043905831, + "loss": 0.5447, + "step": 95540 + }, + { + "epoch": 4.745703784642893, + "grad_norm": 0.109375, + "learning_rate": 0.0004203476706069336, + "loss": 0.5118, + "step": 95550 + }, + { + "epoch": 4.746200456938512, + "grad_norm": 0.10400390625, + "learning_rate": 0.00042030793682328396, + "loss": 0.4928, + "step": 95560 + }, + { + "epoch": 4.746697129234131, + "grad_norm": 0.126953125, + "learning_rate": 0.00042026820303963443, + "loss": 0.5002, + "step": 95570 + }, + { + "epoch": 4.74719380152975, + "grad_norm": 0.166015625, + "learning_rate": 0.00042022846925598496, + "loss": 0.5171, + "step": 95580 + }, + { + "epoch": 4.7476904738253705, + "grad_norm": 0.177734375, + "learning_rate": 0.00042018873547233543, + "loss": 0.5324, + "step": 95590 + }, + { + "epoch": 4.74818714612099, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004201490016886858, + "loss": 0.5299, + "step": 95600 + }, + { + "epoch": 4.748683818416609, + "grad_norm": 0.11279296875, + "learning_rate": 0.00042010926790503626, + "loss": 0.5255, + "step": 95610 + }, + { + "epoch": 4.749180490712228, + "grad_norm": 0.109375, + "learning_rate": 0.0004200695341213868, + "loss": 0.5217, + "step": 95620 + }, + { + "epoch": 4.749677163007847, + "grad_norm": 0.12255859375, + "learning_rate": 0.00042002980033773715, + "loss": 0.4949, + "step": 95630 + }, + { + "epoch": 4.750173835303467, + "grad_norm": 0.123046875, + "learning_rate": 0.0004199900665540876, + "loss": 0.5195, + "step": 95640 + }, + { + "epoch": 4.750670507599086, + "grad_norm": 0.12451171875, + "learning_rate": 0.0004199503327704381, + "loss": 0.5092, + "step": 95650 + }, + { + "epoch": 4.751167179894706, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004199105989867885, + "loss": 0.5452, + "step": 95660 + }, + { + "epoch": 4.751663852190325, + "grad_norm": 0.10546875, + "learning_rate": 0.000419870865203139, + "loss": 0.5041, + "step": 95670 + }, + { + "epoch": 4.752160524485944, + "grad_norm": 0.12255859375, + "learning_rate": 0.00041983113141948945, + "loss": 0.5251, + "step": 95680 + }, + { + "epoch": 4.752657196781564, + "grad_norm": 0.1171875, + "learning_rate": 0.00041979139763583987, + "loss": 0.5251, + "step": 95690 + }, + { + "epoch": 4.753153869077183, + "grad_norm": 0.11083984375, + "learning_rate": 0.00041975166385219034, + "loss": 0.4777, + "step": 95700 + }, + { + "epoch": 4.753650541372802, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004197119300685408, + "loss": 0.5222, + "step": 95710 + }, + { + "epoch": 4.754147213668421, + "grad_norm": 0.107421875, + "learning_rate": 0.00041967219628489123, + "loss": 0.5147, + "step": 95720 + }, + { + "epoch": 4.754643885964041, + "grad_norm": 0.115234375, + "learning_rate": 0.0004196324625012417, + "loss": 0.5202, + "step": 95730 + }, + { + "epoch": 4.755140558259661, + "grad_norm": 0.11376953125, + "learning_rate": 0.00041959272871759217, + "loss": 0.4835, + "step": 95740 + }, + { + "epoch": 4.75563723055528, + "grad_norm": 0.11962890625, + "learning_rate": 0.00041955299493394264, + "loss": 0.4836, + "step": 95750 + }, + { + "epoch": 4.756133902850899, + "grad_norm": 0.14453125, + "learning_rate": 0.00041951326115029306, + "loss": 0.5481, + "step": 95760 + }, + { + "epoch": 4.756630575146518, + "grad_norm": 0.11376953125, + "learning_rate": 0.00041947352736664353, + "loss": 0.5153, + "step": 95770 + }, + { + "epoch": 4.7571272474421376, + "grad_norm": 0.1474609375, + "learning_rate": 0.000419433793582994, + "loss": 0.5352, + "step": 95780 + }, + { + "epoch": 4.757623919737757, + "grad_norm": 0.10888671875, + "learning_rate": 0.00041939405979934437, + "loss": 0.5043, + "step": 95790 + }, + { + "epoch": 4.758120592033376, + "grad_norm": 0.12109375, + "learning_rate": 0.00041935432601569484, + "loss": 0.5119, + "step": 95800 + }, + { + "epoch": 4.758617264328995, + "grad_norm": 0.12890625, + "learning_rate": 0.00041931459223204536, + "loss": 0.5162, + "step": 95810 + }, + { + "epoch": 4.759113936624615, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004192748584483957, + "loss": 0.5127, + "step": 95820 + }, + { + "epoch": 4.759610608920235, + "grad_norm": 0.12451171875, + "learning_rate": 0.0004192351246647462, + "loss": 0.5209, + "step": 95830 + }, + { + "epoch": 4.760107281215854, + "grad_norm": 0.1298828125, + "learning_rate": 0.00041919539088109667, + "loss": 0.5126, + "step": 95840 + }, + { + "epoch": 4.760603953511473, + "grad_norm": 0.111328125, + "learning_rate": 0.0004191556570974471, + "loss": 0.4896, + "step": 95850 + }, + { + "epoch": 4.761100625807092, + "grad_norm": 0.11083984375, + "learning_rate": 0.00041911592331379756, + "loss": 0.5022, + "step": 95860 + }, + { + "epoch": 4.7615972981027115, + "grad_norm": 0.10888671875, + "learning_rate": 0.00041907618953014803, + "loss": 0.5168, + "step": 95870 + }, + { + "epoch": 4.762093970398331, + "grad_norm": 0.1376953125, + "learning_rate": 0.0004190364557464985, + "loss": 0.5049, + "step": 95880 + }, + { + "epoch": 4.762590642693951, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004189967219628489, + "loss": 0.4866, + "step": 95890 + }, + { + "epoch": 4.76308731498957, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004189569881791994, + "loss": 0.5106, + "step": 95900 + }, + { + "epoch": 4.763583987285189, + "grad_norm": 0.1044921875, + "learning_rate": 0.00041891725439554986, + "loss": 0.4877, + "step": 95910 + }, + { + "epoch": 4.7640806595808085, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004188775206119003, + "loss": 0.513, + "step": 95920 + }, + { + "epoch": 4.764577331876428, + "grad_norm": 0.126953125, + "learning_rate": 0.00041883778682825075, + "loss": 0.4947, + "step": 95930 + }, + { + "epoch": 4.765074004172047, + "grad_norm": 0.1591796875, + "learning_rate": 0.0004187980530446012, + "loss": 0.5392, + "step": 95940 + }, + { + "epoch": 4.765570676467666, + "grad_norm": 0.154296875, + "learning_rate": 0.00041875831926095164, + "loss": 0.5238, + "step": 95950 + }, + { + "epoch": 4.766067348763286, + "grad_norm": 0.107421875, + "learning_rate": 0.0004187185854773021, + "loss": 0.5083, + "step": 95960 + }, + { + "epoch": 4.7665640210589055, + "grad_norm": 0.1552734375, + "learning_rate": 0.0004186788516936526, + "loss": 0.5545, + "step": 95970 + }, + { + "epoch": 4.767060693354525, + "grad_norm": 0.11962890625, + "learning_rate": 0.00041863911791000294, + "loss": 0.5336, + "step": 95980 + }, + { + "epoch": 4.767557365650144, + "grad_norm": 0.111328125, + "learning_rate": 0.00041859938412635347, + "loss": 0.5086, + "step": 95990 + }, + { + "epoch": 4.768054037945763, + "grad_norm": 0.11328125, + "learning_rate": 0.00041855965034270394, + "loss": 0.5403, + "step": 96000 + }, + { + "epoch": 4.768550710241382, + "grad_norm": 0.107421875, + "learning_rate": 0.0004185199165590543, + "loss": 0.5288, + "step": 96010 + }, + { + "epoch": 4.769047382537002, + "grad_norm": 0.103515625, + "learning_rate": 0.0004184801827754048, + "loss": 0.5142, + "step": 96020 + }, + { + "epoch": 4.769544054832622, + "grad_norm": 0.12255859375, + "learning_rate": 0.0004184404489917553, + "loss": 0.509, + "step": 96030 + }, + { + "epoch": 4.770040727128241, + "grad_norm": 0.1298828125, + "learning_rate": 0.00041840071520810577, + "loss": 0.5234, + "step": 96040 + }, + { + "epoch": 4.77053739942386, + "grad_norm": 0.11376953125, + "learning_rate": 0.00041836098142445613, + "loss": 0.522, + "step": 96050 + }, + { + "epoch": 4.7710340717194795, + "grad_norm": 0.126953125, + "learning_rate": 0.0004183212476408066, + "loss": 0.5065, + "step": 96060 + }, + { + "epoch": 4.771530744015099, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004182815138571571, + "loss": 0.519, + "step": 96070 + }, + { + "epoch": 4.772027416310718, + "grad_norm": 0.12890625, + "learning_rate": 0.0004182417800735075, + "loss": 0.5165, + "step": 96080 + }, + { + "epoch": 4.772524088606337, + "grad_norm": 0.11669921875, + "learning_rate": 0.00041820204628985797, + "loss": 0.5168, + "step": 96090 + }, + { + "epoch": 4.773020760901957, + "grad_norm": 0.09814453125, + "learning_rate": 0.00041816231250620844, + "loss": 0.4887, + "step": 96100 + }, + { + "epoch": 4.7735174331975765, + "grad_norm": 0.130859375, + "learning_rate": 0.00041812257872255885, + "loss": 0.5135, + "step": 96110 + }, + { + "epoch": 4.774014105493196, + "grad_norm": 0.1572265625, + "learning_rate": 0.0004180828449389093, + "loss": 0.4996, + "step": 96120 + }, + { + "epoch": 4.774510777788815, + "grad_norm": 0.10546875, + "learning_rate": 0.0004180431111552598, + "loss": 0.5189, + "step": 96130 + }, + { + "epoch": 4.775007450084434, + "grad_norm": 0.111328125, + "learning_rate": 0.0004180033773716102, + "loss": 0.5156, + "step": 96140 + }, + { + "epoch": 4.775504122380053, + "grad_norm": 0.12890625, + "learning_rate": 0.0004179636435879607, + "loss": 0.5192, + "step": 96150 + }, + { + "epoch": 4.776000794675673, + "grad_norm": 0.1552734375, + "learning_rate": 0.00041792390980431116, + "loss": 0.5164, + "step": 96160 + }, + { + "epoch": 4.776497466971293, + "grad_norm": 0.109375, + "learning_rate": 0.0004178841760206615, + "loss": 0.5318, + "step": 96170 + }, + { + "epoch": 4.776994139266912, + "grad_norm": 0.10888671875, + "learning_rate": 0.00041784444223701204, + "loss": 0.5487, + "step": 96180 + }, + { + "epoch": 4.777490811562531, + "grad_norm": 0.1396484375, + "learning_rate": 0.0004178047084533625, + "loss": 0.553, + "step": 96190 + }, + { + "epoch": 4.77798748385815, + "grad_norm": 0.1103515625, + "learning_rate": 0.000417764974669713, + "loss": 0.5055, + "step": 96200 + }, + { + "epoch": 4.77848415615377, + "grad_norm": 0.12255859375, + "learning_rate": 0.00041772524088606335, + "loss": 0.4895, + "step": 96210 + }, + { + "epoch": 4.778980828449389, + "grad_norm": 0.107421875, + "learning_rate": 0.0004176855071024139, + "loss": 0.5172, + "step": 96220 + }, + { + "epoch": 4.779477500745008, + "grad_norm": 0.11376953125, + "learning_rate": 0.00041764577331876435, + "loss": 0.4994, + "step": 96230 + }, + { + "epoch": 4.779974173040628, + "grad_norm": 0.126953125, + "learning_rate": 0.0004176060395351147, + "loss": 0.5072, + "step": 96240 + }, + { + "epoch": 4.780470845336247, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004175663057514652, + "loss": 0.5237, + "step": 96250 + }, + { + "epoch": 4.780967517631867, + "grad_norm": 0.1396484375, + "learning_rate": 0.0004175265719678157, + "loss": 0.5339, + "step": 96260 + }, + { + "epoch": 4.781464189927486, + "grad_norm": 0.1396484375, + "learning_rate": 0.00041748683818416607, + "loss": 0.5089, + "step": 96270 + }, + { + "epoch": 4.781960862223105, + "grad_norm": 0.16796875, + "learning_rate": 0.00041744710440051654, + "loss": 0.4891, + "step": 96280 + }, + { + "epoch": 4.782457534518724, + "grad_norm": 0.126953125, + "learning_rate": 0.000417407370616867, + "loss": 0.5031, + "step": 96290 + }, + { + "epoch": 4.782954206814344, + "grad_norm": 0.123046875, + "learning_rate": 0.00041736763683321743, + "loss": 0.5108, + "step": 96300 + }, + { + "epoch": 4.783450879109964, + "grad_norm": 0.1142578125, + "learning_rate": 0.0004173279030495679, + "loss": 0.5089, + "step": 96310 + }, + { + "epoch": 4.783947551405583, + "grad_norm": 0.1123046875, + "learning_rate": 0.00041728816926591837, + "loss": 0.5291, + "step": 96320 + }, + { + "epoch": 4.784444223701202, + "grad_norm": 0.1259765625, + "learning_rate": 0.00041724843548226884, + "loss": 0.51, + "step": 96330 + }, + { + "epoch": 4.784940895996821, + "grad_norm": 0.197265625, + "learning_rate": 0.00041720870169861926, + "loss": 0.4878, + "step": 96340 + }, + { + "epoch": 4.785437568292441, + "grad_norm": 0.12353515625, + "learning_rate": 0.00041716896791496973, + "loss": 0.5025, + "step": 96350 + }, + { + "epoch": 4.78593424058806, + "grad_norm": 0.138671875, + "learning_rate": 0.0004171292341313202, + "loss": 0.5162, + "step": 96360 + }, + { + "epoch": 4.786430912883679, + "grad_norm": 0.1416015625, + "learning_rate": 0.0004170895003476706, + "loss": 0.5003, + "step": 96370 + }, + { + "epoch": 4.786927585179299, + "grad_norm": 0.12060546875, + "learning_rate": 0.0004170497665640211, + "loss": 0.5095, + "step": 96380 + }, + { + "epoch": 4.787424257474918, + "grad_norm": 0.1328125, + "learning_rate": 0.00041701003278037156, + "loss": 0.4919, + "step": 96390 + }, + { + "epoch": 4.787920929770538, + "grad_norm": 0.1630859375, + "learning_rate": 0.0004169702989967219, + "loss": 0.5275, + "step": 96400 + }, + { + "epoch": 4.788417602066157, + "grad_norm": 0.1044921875, + "learning_rate": 0.00041693056521307245, + "loss": 0.5021, + "step": 96410 + }, + { + "epoch": 4.788914274361776, + "grad_norm": 0.125, + "learning_rate": 0.0004168908314294229, + "loss": 0.5032, + "step": 96420 + }, + { + "epoch": 4.789410946657395, + "grad_norm": 0.193359375, + "learning_rate": 0.0004168510976457733, + "loss": 0.519, + "step": 96430 + }, + { + "epoch": 4.7899076189530145, + "grad_norm": 0.10400390625, + "learning_rate": 0.00041681136386212376, + "loss": 0.5152, + "step": 96440 + }, + { + "epoch": 4.790404291248635, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004167716300784743, + "loss": 0.5233, + "step": 96450 + }, + { + "epoch": 4.790900963544254, + "grad_norm": 0.1484375, + "learning_rate": 0.00041673189629482465, + "loss": 0.4867, + "step": 96460 + }, + { + "epoch": 4.791397635839873, + "grad_norm": 0.10791015625, + "learning_rate": 0.0004166921625111751, + "loss": 0.5125, + "step": 96470 + }, + { + "epoch": 4.791894308135492, + "grad_norm": 0.1240234375, + "learning_rate": 0.0004166524287275256, + "loss": 0.5196, + "step": 96480 + }, + { + "epoch": 4.792390980431112, + "grad_norm": 0.1328125, + "learning_rate": 0.0004166126949438761, + "loss": 0.5539, + "step": 96490 + }, + { + "epoch": 4.792887652726731, + "grad_norm": 0.130859375, + "learning_rate": 0.0004165729611602265, + "loss": 0.5388, + "step": 96500 + }, + { + "epoch": 4.79338432502235, + "grad_norm": 0.11865234375, + "learning_rate": 0.00041653322737657695, + "loss": 0.4887, + "step": 96510 + }, + { + "epoch": 4.793880997317969, + "grad_norm": 0.1533203125, + "learning_rate": 0.0004164934935929274, + "loss": 0.495, + "step": 96520 + }, + { + "epoch": 4.7943776696135885, + "grad_norm": 0.13671875, + "learning_rate": 0.00041645375980927784, + "loss": 0.4969, + "step": 96530 + }, + { + "epoch": 4.794874341909209, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004164140260256283, + "loss": 0.4896, + "step": 96540 + }, + { + "epoch": 4.795371014204828, + "grad_norm": 0.11328125, + "learning_rate": 0.0004163742922419788, + "loss": 0.503, + "step": 96550 + }, + { + "epoch": 4.795867686500447, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004163345584583292, + "loss": 0.5325, + "step": 96560 + }, + { + "epoch": 4.796364358796066, + "grad_norm": 0.1171875, + "learning_rate": 0.00041629482467467967, + "loss": 0.5173, + "step": 96570 + }, + { + "epoch": 4.7968610310916855, + "grad_norm": 0.13671875, + "learning_rate": 0.00041625509089103014, + "loss": 0.5358, + "step": 96580 + }, + { + "epoch": 4.797357703387305, + "grad_norm": 0.1044921875, + "learning_rate": 0.00041621535710738056, + "loss": 0.535, + "step": 96590 + }, + { + "epoch": 4.797854375682924, + "grad_norm": 0.11865234375, + "learning_rate": 0.00041617562332373103, + "loss": 0.522, + "step": 96600 + }, + { + "epoch": 4.798351047978544, + "grad_norm": 0.171875, + "learning_rate": 0.0004161358895400815, + "loss": 0.5228, + "step": 96610 + }, + { + "epoch": 4.798847720274163, + "grad_norm": 0.11279296875, + "learning_rate": 0.00041609615575643186, + "loss": 0.507, + "step": 96620 + }, + { + "epoch": 4.7993443925697825, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004160564219727824, + "loss": 0.4998, + "step": 96630 + }, + { + "epoch": 4.799841064865402, + "grad_norm": 0.109375, + "learning_rate": 0.00041601668818913286, + "loss": 0.5215, + "step": 96640 + }, + { + "epoch": 4.800337737161021, + "grad_norm": 0.197265625, + "learning_rate": 0.00041597695440548333, + "loss": 0.5134, + "step": 96650 + }, + { + "epoch": 4.80083440945664, + "grad_norm": 0.126953125, + "learning_rate": 0.0004159372206218337, + "loss": 0.5027, + "step": 96660 + }, + { + "epoch": 4.801331081752259, + "grad_norm": 0.1376953125, + "learning_rate": 0.00041589748683818417, + "loss": 0.4889, + "step": 96670 + }, + { + "epoch": 4.8018277540478795, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004158577530545347, + "loss": 0.5308, + "step": 96680 + }, + { + "epoch": 4.802324426343499, + "grad_norm": 0.1201171875, + "learning_rate": 0.00041581801927088505, + "loss": 0.5025, + "step": 96690 + }, + { + "epoch": 4.802821098639118, + "grad_norm": 0.1181640625, + "learning_rate": 0.0004157782854872355, + "loss": 0.4896, + "step": 96700 + }, + { + "epoch": 4.803317770934737, + "grad_norm": 0.1064453125, + "learning_rate": 0.000415738551703586, + "loss": 0.4873, + "step": 96710 + }, + { + "epoch": 4.8038144432303564, + "grad_norm": 0.1474609375, + "learning_rate": 0.0004156988179199364, + "loss": 0.5143, + "step": 96720 + }, + { + "epoch": 4.804311115525976, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004156590841362869, + "loss": 0.5448, + "step": 96730 + }, + { + "epoch": 4.804807787821595, + "grad_norm": 0.11669921875, + "learning_rate": 0.00041561935035263736, + "loss": 0.4952, + "step": 96740 + }, + { + "epoch": 4.805304460117215, + "grad_norm": 0.1318359375, + "learning_rate": 0.0004155796165689878, + "loss": 0.5376, + "step": 96750 + }, + { + "epoch": 4.805801132412834, + "grad_norm": 0.1611328125, + "learning_rate": 0.00041553988278533825, + "loss": 0.5119, + "step": 96760 + }, + { + "epoch": 4.8062978047084535, + "grad_norm": 0.1728515625, + "learning_rate": 0.0004155001490016887, + "loss": 0.5141, + "step": 96770 + }, + { + "epoch": 4.806794477004073, + "grad_norm": 0.0986328125, + "learning_rate": 0.0004154604152180392, + "loss": 0.5169, + "step": 96780 + }, + { + "epoch": 4.807291149299692, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004154206814343896, + "loss": 0.5102, + "step": 96790 + }, + { + "epoch": 4.807787821595311, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004153809476507401, + "loss": 0.5279, + "step": 96800 + }, + { + "epoch": 4.80828449389093, + "grad_norm": 0.1240234375, + "learning_rate": 0.00041534121386709055, + "loss": 0.5039, + "step": 96810 + }, + { + "epoch": 4.8087811661865505, + "grad_norm": 0.10205078125, + "learning_rate": 0.00041530148008344096, + "loss": 0.497, + "step": 96820 + }, + { + "epoch": 4.80927783848217, + "grad_norm": 0.1201171875, + "learning_rate": 0.00041526174629979144, + "loss": 0.4898, + "step": 96830 + }, + { + "epoch": 4.809774510777789, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004152220125161419, + "loss": 0.5293, + "step": 96840 + }, + { + "epoch": 4.810271183073408, + "grad_norm": 0.1279296875, + "learning_rate": 0.00041518227873249227, + "loss": 0.5282, + "step": 96850 + }, + { + "epoch": 4.810767855369027, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004151425449488428, + "loss": 0.5407, + "step": 96860 + }, + { + "epoch": 4.811264527664647, + "grad_norm": 0.10400390625, + "learning_rate": 0.00041510281116519327, + "loss": 0.5026, + "step": 96870 + }, + { + "epoch": 4.811761199960266, + "grad_norm": 0.12890625, + "learning_rate": 0.00041506307738154363, + "loss": 0.4928, + "step": 96880 + }, + { + "epoch": 4.812257872255886, + "grad_norm": 0.0986328125, + "learning_rate": 0.0004150233435978941, + "loss": 0.4927, + "step": 96890 + }, + { + "epoch": 4.812754544551505, + "grad_norm": 0.11572265625, + "learning_rate": 0.00041498360981424463, + "loss": 0.5254, + "step": 96900 + }, + { + "epoch": 4.813251216847124, + "grad_norm": 0.1142578125, + "learning_rate": 0.000414943876030595, + "loss": 0.5341, + "step": 96910 + }, + { + "epoch": 4.813747889142744, + "grad_norm": 0.115234375, + "learning_rate": 0.00041490414224694546, + "loss": 0.4761, + "step": 96920 + }, + { + "epoch": 4.814244561438363, + "grad_norm": 0.0966796875, + "learning_rate": 0.00041486440846329593, + "loss": 0.4771, + "step": 96930 + }, + { + "epoch": 4.814741233733982, + "grad_norm": 0.123046875, + "learning_rate": 0.0004148246746796464, + "loss": 0.5111, + "step": 96940 + }, + { + "epoch": 4.815237906029601, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004147849408959968, + "loss": 0.5093, + "step": 96950 + }, + { + "epoch": 4.8157345783252214, + "grad_norm": 0.109375, + "learning_rate": 0.0004147452071123473, + "loss": 0.5041, + "step": 96960 + }, + { + "epoch": 4.816231250620841, + "grad_norm": 0.11328125, + "learning_rate": 0.00041470547332869776, + "loss": 0.4918, + "step": 96970 + }, + { + "epoch": 4.81672792291646, + "grad_norm": 0.125, + "learning_rate": 0.0004146657395450482, + "loss": 0.5107, + "step": 96980 + }, + { + "epoch": 4.817224595212079, + "grad_norm": 0.1298828125, + "learning_rate": 0.00041462600576139865, + "loss": 0.501, + "step": 96990 + }, + { + "epoch": 4.817721267507698, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004145862719777491, + "loss": 0.5194, + "step": 97000 + }, + { + "epoch": 4.818217939803318, + "grad_norm": 0.1123046875, + "learning_rate": 0.00041454653819409954, + "loss": 0.5162, + "step": 97010 + }, + { + "epoch": 4.818714612098937, + "grad_norm": 0.11474609375, + "learning_rate": 0.00041450680441045, + "loss": 0.5299, + "step": 97020 + }, + { + "epoch": 4.819211284394557, + "grad_norm": 0.119140625, + "learning_rate": 0.0004144670706268005, + "loss": 0.4721, + "step": 97030 + }, + { + "epoch": 4.819707956690176, + "grad_norm": 0.1494140625, + "learning_rate": 0.00041442733684315085, + "loss": 0.5487, + "step": 97040 + }, + { + "epoch": 4.820204628985795, + "grad_norm": 0.1630859375, + "learning_rate": 0.00041438760305950137, + "loss": 0.5348, + "step": 97050 + }, + { + "epoch": 4.820701301281415, + "grad_norm": 0.138671875, + "learning_rate": 0.00041434786927585184, + "loss": 0.5042, + "step": 97060 + }, + { + "epoch": 4.821197973577034, + "grad_norm": 0.10791015625, + "learning_rate": 0.0004143081354922022, + "loss": 0.4872, + "step": 97070 + }, + { + "epoch": 4.821694645872653, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004142684017085527, + "loss": 0.4811, + "step": 97080 + }, + { + "epoch": 4.822191318168272, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004142286679249032, + "loss": 0.4971, + "step": 97090 + }, + { + "epoch": 4.822687990463892, + "grad_norm": 0.11962890625, + "learning_rate": 0.0004141889341412537, + "loss": 0.5381, + "step": 97100 + }, + { + "epoch": 4.823184662759512, + "grad_norm": 0.1259765625, + "learning_rate": 0.00041414920035760404, + "loss": 0.5092, + "step": 97110 + }, + { + "epoch": 4.823681335055131, + "grad_norm": 0.1181640625, + "learning_rate": 0.0004141094665739545, + "loss": 0.5213, + "step": 97120 + }, + { + "epoch": 4.82417800735075, + "grad_norm": 0.1162109375, + "learning_rate": 0.00041406973279030503, + "loss": 0.5406, + "step": 97130 + }, + { + "epoch": 4.824674679646369, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004140299990066554, + "loss": 0.5084, + "step": 97140 + }, + { + "epoch": 4.8251713519419885, + "grad_norm": 0.1689453125, + "learning_rate": 0.00041399026522300587, + "loss": 0.5019, + "step": 97150 + }, + { + "epoch": 4.825668024237608, + "grad_norm": 0.12109375, + "learning_rate": 0.00041395053143935634, + "loss": 0.5396, + "step": 97160 + }, + { + "epoch": 4.826164696533227, + "grad_norm": 0.10546875, + "learning_rate": 0.00041391079765570676, + "loss": 0.5518, + "step": 97170 + }, + { + "epoch": 4.826661368828847, + "grad_norm": 0.10986328125, + "learning_rate": 0.00041387106387205723, + "loss": 0.4836, + "step": 97180 + }, + { + "epoch": 4.827158041124466, + "grad_norm": 0.142578125, + "learning_rate": 0.0004138313300884077, + "loss": 0.5422, + "step": 97190 + }, + { + "epoch": 4.827654713420086, + "grad_norm": 0.1591796875, + "learning_rate": 0.0004137915963047581, + "loss": 0.4921, + "step": 97200 + }, + { + "epoch": 4.828151385715705, + "grad_norm": 0.142578125, + "learning_rate": 0.0004137518625211086, + "loss": 0.5346, + "step": 97210 + }, + { + "epoch": 4.828648058011324, + "grad_norm": 0.1220703125, + "learning_rate": 0.00041371212873745906, + "loss": 0.4824, + "step": 97220 + }, + { + "epoch": 4.829144730306943, + "grad_norm": 0.130859375, + "learning_rate": 0.00041367239495380953, + "loss": 0.4936, + "step": 97230 + }, + { + "epoch": 4.8296414026025625, + "grad_norm": 0.119140625, + "learning_rate": 0.00041363266117015995, + "loss": 0.5038, + "step": 97240 + }, + { + "epoch": 4.830138074898182, + "grad_norm": 0.1435546875, + "learning_rate": 0.0004135929273865104, + "loss": 0.5332, + "step": 97250 + }, + { + "epoch": 4.830634747193802, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004135531936028609, + "loss": 0.5166, + "step": 97260 + }, + { + "epoch": 4.831131419489421, + "grad_norm": 0.1298828125, + "learning_rate": 0.00041351345981921125, + "loss": 0.5166, + "step": 97270 + }, + { + "epoch": 4.83162809178504, + "grad_norm": 0.126953125, + "learning_rate": 0.0004134737260355618, + "loss": 0.495, + "step": 97280 + }, + { + "epoch": 4.8321247640806595, + "grad_norm": 0.1181640625, + "learning_rate": 0.00041343399225191225, + "loss": 0.5042, + "step": 97290 + }, + { + "epoch": 4.832621436376279, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004133942584682626, + "loss": 0.5037, + "step": 97300 + }, + { + "epoch": 4.833118108671898, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004133545246846131, + "loss": 0.5008, + "step": 97310 + }, + { + "epoch": 4.833614780967517, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004133147909009636, + "loss": 0.5325, + "step": 97320 + }, + { + "epoch": 4.834111453263137, + "grad_norm": 0.138671875, + "learning_rate": 0.000413275057117314, + "loss": 0.4981, + "step": 97330 + }, + { + "epoch": 4.8346081255587565, + "grad_norm": 0.119140625, + "learning_rate": 0.00041323532333366445, + "loss": 0.5227, + "step": 97340 + }, + { + "epoch": 4.835104797854376, + "grad_norm": 0.15234375, + "learning_rate": 0.0004131955895500149, + "loss": 0.5002, + "step": 97350 + }, + { + "epoch": 4.835601470149995, + "grad_norm": 0.107421875, + "learning_rate": 0.00041315585576636533, + "loss": 0.5431, + "step": 97360 + }, + { + "epoch": 4.836098142445614, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004131161219827158, + "loss": 0.5112, + "step": 97370 + }, + { + "epoch": 4.836594814741233, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004130763881990663, + "loss": 0.5155, + "step": 97380 + }, + { + "epoch": 4.837091487036853, + "grad_norm": 0.109375, + "learning_rate": 0.00041303665441541675, + "loss": 0.5001, + "step": 97390 + }, + { + "epoch": 4.837588159332473, + "grad_norm": 0.10693359375, + "learning_rate": 0.00041299692063176717, + "loss": 0.499, + "step": 97400 + }, + { + "epoch": 4.838084831628092, + "grad_norm": 0.134765625, + "learning_rate": 0.00041295718684811764, + "loss": 0.5312, + "step": 97410 + }, + { + "epoch": 4.838581503923711, + "grad_norm": 0.16015625, + "learning_rate": 0.0004129174530644681, + "loss": 0.5056, + "step": 97420 + }, + { + "epoch": 4.8390781762193305, + "grad_norm": 0.1494140625, + "learning_rate": 0.0004128777192808185, + "loss": 0.4915, + "step": 97430 + }, + { + "epoch": 4.83957484851495, + "grad_norm": 0.1328125, + "learning_rate": 0.000412837985497169, + "loss": 0.504, + "step": 97440 + }, + { + "epoch": 4.840071520810569, + "grad_norm": 0.11474609375, + "learning_rate": 0.00041279825171351947, + "loss": 0.5052, + "step": 97450 + }, + { + "epoch": 4.840568193106188, + "grad_norm": 0.09912109375, + "learning_rate": 0.0004127585179298699, + "loss": 0.5274, + "step": 97460 + }, + { + "epoch": 4.841064865401808, + "grad_norm": 0.11669921875, + "learning_rate": 0.00041271878414622036, + "loss": 0.4997, + "step": 97470 + }, + { + "epoch": 4.8415615376974275, + "grad_norm": 0.11376953125, + "learning_rate": 0.00041267905036257083, + "loss": 0.4722, + "step": 97480 + }, + { + "epoch": 4.842058209993047, + "grad_norm": 0.138671875, + "learning_rate": 0.0004126393165789212, + "loss": 0.5154, + "step": 97490 + }, + { + "epoch": 4.842554882288666, + "grad_norm": 0.12109375, + "learning_rate": 0.00041259958279527166, + "loss": 0.4914, + "step": 97500 + }, + { + "epoch": 4.843051554584285, + "grad_norm": 0.12158203125, + "learning_rate": 0.0004125598490116222, + "loss": 0.5148, + "step": 97510 + }, + { + "epoch": 4.843548226879904, + "grad_norm": 0.1708984375, + "learning_rate": 0.00041252011522797266, + "loss": 0.5141, + "step": 97520 + }, + { + "epoch": 4.844044899175524, + "grad_norm": 0.130859375, + "learning_rate": 0.000412480381444323, + "loss": 0.5172, + "step": 97530 + }, + { + "epoch": 4.844541571471144, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004124406476606735, + "loss": 0.5127, + "step": 97540 + }, + { + "epoch": 4.845038243766763, + "grad_norm": 0.11865234375, + "learning_rate": 0.000412400913877024, + "loss": 0.4933, + "step": 97550 + }, + { + "epoch": 4.845534916062382, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004123611800933744, + "loss": 0.503, + "step": 97560 + }, + { + "epoch": 4.846031588358001, + "grad_norm": 0.10400390625, + "learning_rate": 0.00041232144630972485, + "loss": 0.5205, + "step": 97570 + }, + { + "epoch": 4.846528260653621, + "grad_norm": 0.1376953125, + "learning_rate": 0.0004122817125260753, + "loss": 0.5166, + "step": 97580 + }, + { + "epoch": 4.84702493294924, + "grad_norm": 0.10888671875, + "learning_rate": 0.00041224197874242574, + "loss": 0.5364, + "step": 97590 + }, + { + "epoch": 4.847521605244859, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004122022449587762, + "loss": 0.5433, + "step": 97600 + }, + { + "epoch": 4.848018277540479, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004121625111751267, + "loss": 0.5206, + "step": 97610 + }, + { + "epoch": 4.848514949836098, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004121227773914771, + "loss": 0.5149, + "step": 97620 + }, + { + "epoch": 4.849011622131718, + "grad_norm": 0.11083984375, + "learning_rate": 0.00041208304360782757, + "loss": 0.5024, + "step": 97630 + }, + { + "epoch": 4.849508294427337, + "grad_norm": 0.119140625, + "learning_rate": 0.00041204330982417804, + "loss": 0.539, + "step": 97640 + }, + { + "epoch": 4.850004966722956, + "grad_norm": 0.11669921875, + "learning_rate": 0.00041200357604052846, + "loss": 0.5152, + "step": 97650 + }, + { + "epoch": 4.850501639018575, + "grad_norm": 0.17578125, + "learning_rate": 0.00041196384225687893, + "loss": 0.5044, + "step": 97660 + }, + { + "epoch": 4.850998311314195, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004119241084732294, + "loss": 0.4817, + "step": 97670 + }, + { + "epoch": 4.851494983609815, + "grad_norm": 0.1240234375, + "learning_rate": 0.0004118843746895799, + "loss": 0.5401, + "step": 97680 + }, + { + "epoch": 4.851991655905434, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004118446409059303, + "loss": 0.5415, + "step": 97690 + }, + { + "epoch": 4.852488328201053, + "grad_norm": 0.12158203125, + "learning_rate": 0.00041180490712228076, + "loss": 0.5361, + "step": 97700 + }, + { + "epoch": 4.852985000496672, + "grad_norm": 0.119140625, + "learning_rate": 0.00041176517333863123, + "loss": 0.5263, + "step": 97710 + }, + { + "epoch": 4.853481672792292, + "grad_norm": 0.126953125, + "learning_rate": 0.0004117254395549816, + "loss": 0.5104, + "step": 97720 + }, + { + "epoch": 4.853978345087911, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004116857057713321, + "loss": 0.5278, + "step": 97730 + }, + { + "epoch": 4.85447501738353, + "grad_norm": 0.11669921875, + "learning_rate": 0.0004116459719876826, + "loss": 0.5389, + "step": 97740 + }, + { + "epoch": 4.85497168967915, + "grad_norm": 0.12060546875, + "learning_rate": 0.00041160623820403296, + "loss": 0.5303, + "step": 97750 + }, + { + "epoch": 4.855468361974769, + "grad_norm": 0.11962890625, + "learning_rate": 0.00041156650442038343, + "loss": 0.5347, + "step": 97760 + }, + { + "epoch": 4.855965034270389, + "grad_norm": 0.1474609375, + "learning_rate": 0.0004115267706367339, + "loss": 0.5075, + "step": 97770 + }, + { + "epoch": 4.856461706566008, + "grad_norm": 0.10205078125, + "learning_rate": 0.0004114870368530843, + "loss": 0.5044, + "step": 97780 + }, + { + "epoch": 4.856958378861627, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004114473030694348, + "loss": 0.5152, + "step": 97790 + }, + { + "epoch": 4.857455051157246, + "grad_norm": 0.1435546875, + "learning_rate": 0.00041140756928578526, + "loss": 0.4994, + "step": 97800 + }, + { + "epoch": 4.8579517234528655, + "grad_norm": 0.1513671875, + "learning_rate": 0.0004113678355021357, + "loss": 0.5332, + "step": 97810 + }, + { + "epoch": 4.858448395748486, + "grad_norm": 0.1123046875, + "learning_rate": 0.00041132810171848615, + "loss": 0.5236, + "step": 97820 + }, + { + "epoch": 4.858945068044105, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004112883679348366, + "loss": 0.5387, + "step": 97830 + }, + { + "epoch": 4.859441740339724, + "grad_norm": 0.1318359375, + "learning_rate": 0.0004112486341511871, + "loss": 0.4905, + "step": 97840 + }, + { + "epoch": 4.859938412635343, + "grad_norm": 0.126953125, + "learning_rate": 0.0004112089003675375, + "loss": 0.4935, + "step": 97850 + }, + { + "epoch": 4.8604350849309625, + "grad_norm": 0.1611328125, + "learning_rate": 0.000411169166583888, + "loss": 0.5209, + "step": 97860 + }, + { + "epoch": 4.860931757226582, + "grad_norm": 0.1357421875, + "learning_rate": 0.00041112943280023845, + "loss": 0.5148, + "step": 97870 + }, + { + "epoch": 4.861428429522201, + "grad_norm": 0.1298828125, + "learning_rate": 0.00041108969901658887, + "loss": 0.4997, + "step": 97880 + }, + { + "epoch": 4.86192510181782, + "grad_norm": 0.1337890625, + "learning_rate": 0.00041104996523293934, + "loss": 0.5335, + "step": 97890 + }, + { + "epoch": 4.8624217741134395, + "grad_norm": 0.119140625, + "learning_rate": 0.0004110102314492898, + "loss": 0.5021, + "step": 97900 + }, + { + "epoch": 4.86291844640906, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004109704976656402, + "loss": 0.521, + "step": 97910 + }, + { + "epoch": 4.863415118704679, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004109307638819907, + "loss": 0.522, + "step": 97920 + }, + { + "epoch": 4.863911791000298, + "grad_norm": 0.10986328125, + "learning_rate": 0.00041089103009834117, + "loss": 0.5152, + "step": 97930 + }, + { + "epoch": 4.864408463295917, + "grad_norm": 0.150390625, + "learning_rate": 0.00041085129631469153, + "loss": 0.4814, + "step": 97940 + }, + { + "epoch": 4.8649051355915365, + "grad_norm": 0.11767578125, + "learning_rate": 0.000410811562531042, + "loss": 0.5029, + "step": 97950 + }, + { + "epoch": 4.865401807887156, + "grad_norm": 0.103515625, + "learning_rate": 0.00041077182874739253, + "loss": 0.4984, + "step": 97960 + }, + { + "epoch": 4.865898480182775, + "grad_norm": 0.11279296875, + "learning_rate": 0.000410732094963743, + "loss": 0.5218, + "step": 97970 + }, + { + "epoch": 4.866395152478395, + "grad_norm": 0.1240234375, + "learning_rate": 0.00041069236118009337, + "loss": 0.5101, + "step": 97980 + }, + { + "epoch": 4.866891824774014, + "grad_norm": 0.1396484375, + "learning_rate": 0.00041065262739644384, + "loss": 0.5146, + "step": 97990 + }, + { + "epoch": 4.8673884970696335, + "grad_norm": 0.1220703125, + "learning_rate": 0.00041061289361279436, + "loss": 0.5067, + "step": 98000 + }, + { + "epoch": 4.867885169365253, + "grad_norm": 0.1337890625, + "learning_rate": 0.0004105731598291447, + "loss": 0.4766, + "step": 98010 + }, + { + "epoch": 4.868381841660872, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004105334260454952, + "loss": 0.4903, + "step": 98020 + }, + { + "epoch": 4.868878513956491, + "grad_norm": 0.12109375, + "learning_rate": 0.00041049369226184567, + "loss": 0.5307, + "step": 98030 + }, + { + "epoch": 4.86937518625211, + "grad_norm": 0.140625, + "learning_rate": 0.0004104539584781961, + "loss": 0.5246, + "step": 98040 + }, + { + "epoch": 4.8698718585477305, + "grad_norm": 0.119140625, + "learning_rate": 0.00041041422469454656, + "loss": 0.5032, + "step": 98050 + }, + { + "epoch": 4.87036853084335, + "grad_norm": 0.1103515625, + "learning_rate": 0.00041037449091089703, + "loss": 0.5249, + "step": 98060 + }, + { + "epoch": 4.870865203138969, + "grad_norm": 0.11328125, + "learning_rate": 0.00041033475712724744, + "loss": 0.5255, + "step": 98070 + }, + { + "epoch": 4.871361875434588, + "grad_norm": 0.1083984375, + "learning_rate": 0.0004102950233435979, + "loss": 0.5082, + "step": 98080 + }, + { + "epoch": 4.871858547730207, + "grad_norm": 0.2138671875, + "learning_rate": 0.0004102552895599484, + "loss": 0.5224, + "step": 98090 + }, + { + "epoch": 4.872355220025827, + "grad_norm": 0.1728515625, + "learning_rate": 0.00041021555577629875, + "loss": 0.5227, + "step": 98100 + }, + { + "epoch": 4.872851892321446, + "grad_norm": 0.12451171875, + "learning_rate": 0.0004101758219926493, + "loss": 0.5188, + "step": 98110 + }, + { + "epoch": 4.873348564617066, + "grad_norm": 0.126953125, + "learning_rate": 0.00041013608820899975, + "loss": 0.4935, + "step": 98120 + }, + { + "epoch": 4.873845236912685, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004100963544253502, + "loss": 0.5272, + "step": 98130 + }, + { + "epoch": 4.8743419092083045, + "grad_norm": 0.1171875, + "learning_rate": 0.0004100566206417006, + "loss": 0.5161, + "step": 98140 + }, + { + "epoch": 4.874838581503924, + "grad_norm": 0.154296875, + "learning_rate": 0.0004100168868580511, + "loss": 0.4965, + "step": 98150 + }, + { + "epoch": 4.875335253799543, + "grad_norm": 0.12158203125, + "learning_rate": 0.0004099771530744016, + "loss": 0.4862, + "step": 98160 + }, + { + "epoch": 4.875831926095162, + "grad_norm": 0.11376953125, + "learning_rate": 0.00040993741929075194, + "loss": 0.5286, + "step": 98170 + }, + { + "epoch": 4.876328598390781, + "grad_norm": 0.10986328125, + "learning_rate": 0.0004098976855071024, + "loss": 0.5334, + "step": 98180 + }, + { + "epoch": 4.8768252706864015, + "grad_norm": 0.107421875, + "learning_rate": 0.00040985795172345294, + "loss": 0.5077, + "step": 98190 + }, + { + "epoch": 4.877321942982021, + "grad_norm": 0.126953125, + "learning_rate": 0.0004098182179398033, + "loss": 0.5194, + "step": 98200 + }, + { + "epoch": 4.87781861527764, + "grad_norm": 0.12255859375, + "learning_rate": 0.00040977848415615377, + "loss": 0.5228, + "step": 98210 + }, + { + "epoch": 4.878315287573259, + "grad_norm": 0.12890625, + "learning_rate": 0.00040973875037250424, + "loss": 0.5258, + "step": 98220 + }, + { + "epoch": 4.878811959868878, + "grad_norm": 0.11279296875, + "learning_rate": 0.00040969901658885466, + "loss": 0.5157, + "step": 98230 + }, + { + "epoch": 4.879308632164498, + "grad_norm": 0.130859375, + "learning_rate": 0.00040965928280520513, + "loss": 0.5134, + "step": 98240 + }, + { + "epoch": 4.879805304460117, + "grad_norm": 0.1650390625, + "learning_rate": 0.0004096195490215556, + "loss": 0.5122, + "step": 98250 + }, + { + "epoch": 4.880301976755737, + "grad_norm": 0.12353515625, + "learning_rate": 0.000409579815237906, + "loss": 0.5279, + "step": 98260 + }, + { + "epoch": 4.880798649051356, + "grad_norm": 0.142578125, + "learning_rate": 0.0004095400814542565, + "loss": 0.5175, + "step": 98270 + }, + { + "epoch": 4.881295321346975, + "grad_norm": 0.12353515625, + "learning_rate": 0.00040950034767060696, + "loss": 0.5035, + "step": 98280 + }, + { + "epoch": 4.881791993642595, + "grad_norm": 0.11376953125, + "learning_rate": 0.00040946061388695744, + "loss": 0.4978, + "step": 98290 + }, + { + "epoch": 4.882288665938214, + "grad_norm": 0.1083984375, + "learning_rate": 0.00040942088010330785, + "loss": 0.4894, + "step": 98300 + }, + { + "epoch": 4.882785338233833, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004093811463196583, + "loss": 0.5125, + "step": 98310 + }, + { + "epoch": 4.883282010529452, + "grad_norm": 0.12890625, + "learning_rate": 0.0004093414125360088, + "loss": 0.5366, + "step": 98320 + }, + { + "epoch": 4.883778682825072, + "grad_norm": 0.15625, + "learning_rate": 0.0004093016787523592, + "loss": 0.509, + "step": 98330 + }, + { + "epoch": 4.884275355120692, + "grad_norm": 0.130859375, + "learning_rate": 0.0004092619449687097, + "loss": 0.5094, + "step": 98340 + }, + { + "epoch": 4.884772027416311, + "grad_norm": 0.1181640625, + "learning_rate": 0.00040922221118506015, + "loss": 0.4979, + "step": 98350 + }, + { + "epoch": 4.88526869971193, + "grad_norm": 0.1083984375, + "learning_rate": 0.0004091824774014105, + "loss": 0.4915, + "step": 98360 + }, + { + "epoch": 4.885765372007549, + "grad_norm": 0.10693359375, + "learning_rate": 0.000409142743617761, + "loss": 0.5128, + "step": 98370 + }, + { + "epoch": 4.886262044303169, + "grad_norm": 0.150390625, + "learning_rate": 0.0004091030098341115, + "loss": 0.5414, + "step": 98380 + }, + { + "epoch": 4.886758716598788, + "grad_norm": 0.119140625, + "learning_rate": 0.0004090632760504619, + "loss": 0.4982, + "step": 98390 + }, + { + "epoch": 4.887255388894408, + "grad_norm": 0.11865234375, + "learning_rate": 0.00040902354226681235, + "loss": 0.4913, + "step": 98400 + }, + { + "epoch": 4.887752061190027, + "grad_norm": 0.1630859375, + "learning_rate": 0.0004089838084831628, + "loss": 0.5041, + "step": 98410 + }, + { + "epoch": 4.888248733485646, + "grad_norm": 0.140625, + "learning_rate": 0.00040894407469951335, + "loss": 0.5105, + "step": 98420 + }, + { + "epoch": 4.888745405781266, + "grad_norm": 0.119140625, + "learning_rate": 0.0004089043409158637, + "loss": 0.5313, + "step": 98430 + }, + { + "epoch": 4.889242078076885, + "grad_norm": 0.1455078125, + "learning_rate": 0.0004088646071322142, + "loss": 0.5135, + "step": 98440 + }, + { + "epoch": 4.889738750372504, + "grad_norm": 0.12255859375, + "learning_rate": 0.00040882487334856465, + "loss": 0.5178, + "step": 98450 + }, + { + "epoch": 4.890235422668123, + "grad_norm": 0.11083984375, + "learning_rate": 0.00040878513956491507, + "loss": 0.5356, + "step": 98460 + }, + { + "epoch": 4.890732094963743, + "grad_norm": 0.10498046875, + "learning_rate": 0.00040874540578126554, + "loss": 0.5044, + "step": 98470 + }, + { + "epoch": 4.891228767259363, + "grad_norm": 0.1875, + "learning_rate": 0.000408705671997616, + "loss": 0.495, + "step": 98480 + }, + { + "epoch": 4.891725439554982, + "grad_norm": 0.1259765625, + "learning_rate": 0.00040866593821396643, + "loss": 0.5322, + "step": 98490 + }, + { + "epoch": 4.892222111850601, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004086262044303169, + "loss": 0.5052, + "step": 98500 + }, + { + "epoch": 4.89271878414622, + "grad_norm": 0.1328125, + "learning_rate": 0.00040858647064666737, + "loss": 0.4894, + "step": 98510 + }, + { + "epoch": 4.8932154564418395, + "grad_norm": 0.16015625, + "learning_rate": 0.0004085467368630178, + "loss": 0.5148, + "step": 98520 + }, + { + "epoch": 4.893712128737459, + "grad_norm": 0.1357421875, + "learning_rate": 0.00040850700307936826, + "loss": 0.5268, + "step": 98530 + }, + { + "epoch": 4.894208801033079, + "grad_norm": 0.158203125, + "learning_rate": 0.00040846726929571873, + "loss": 0.4975, + "step": 98540 + }, + { + "epoch": 4.894705473328698, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004084275355120691, + "loss": 0.4836, + "step": 98550 + }, + { + "epoch": 4.895202145624317, + "grad_norm": 0.125, + "learning_rate": 0.0004083878017284196, + "loss": 0.5274, + "step": 98560 + }, + { + "epoch": 4.8956988179199366, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004083480679447701, + "loss": 0.52, + "step": 98570 + }, + { + "epoch": 4.896195490215556, + "grad_norm": 0.12255859375, + "learning_rate": 0.00040830833416112056, + "loss": 0.5171, + "step": 98580 + }, + { + "epoch": 4.896692162511175, + "grad_norm": 0.11865234375, + "learning_rate": 0.0004082686003774709, + "loss": 0.5075, + "step": 98590 + }, + { + "epoch": 4.897188834806794, + "grad_norm": 0.123046875, + "learning_rate": 0.00040822886659382145, + "loss": 0.5407, + "step": 98600 + }, + { + "epoch": 4.8976855071024135, + "grad_norm": 0.103515625, + "learning_rate": 0.0004081891328101719, + "loss": 0.5008, + "step": 98610 + }, + { + "epoch": 4.898182179398033, + "grad_norm": 0.1484375, + "learning_rate": 0.0004081493990265223, + "loss": 0.5087, + "step": 98620 + }, + { + "epoch": 4.898678851693653, + "grad_norm": 0.11669921875, + "learning_rate": 0.00040810966524287276, + "loss": 0.52, + "step": 98630 + }, + { + "epoch": 4.899175523989272, + "grad_norm": 0.10986328125, + "learning_rate": 0.00040806993145922323, + "loss": 0.5035, + "step": 98640 + }, + { + "epoch": 4.899672196284891, + "grad_norm": 0.15625, + "learning_rate": 0.00040803019767557365, + "loss": 0.5082, + "step": 98650 + }, + { + "epoch": 4.9001688685805105, + "grad_norm": 0.11474609375, + "learning_rate": 0.0004079904638919241, + "loss": 0.4911, + "step": 98660 + }, + { + "epoch": 4.90066554087613, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004079507301082746, + "loss": 0.5165, + "step": 98670 + }, + { + "epoch": 4.901162213171749, + "grad_norm": 0.10693359375, + "learning_rate": 0.000407910996324625, + "loss": 0.5235, + "step": 98680 + }, + { + "epoch": 4.901658885467368, + "grad_norm": 0.24609375, + "learning_rate": 0.0004078712625409755, + "loss": 0.5162, + "step": 98690 + }, + { + "epoch": 4.902155557762988, + "grad_norm": 0.1611328125, + "learning_rate": 0.00040783152875732595, + "loss": 0.5158, + "step": 98700 + }, + { + "epoch": 4.9026522300586075, + "grad_norm": 0.1162109375, + "learning_rate": 0.00040779179497367636, + "loss": 0.5105, + "step": 98710 + }, + { + "epoch": 4.903148902354227, + "grad_norm": 0.1171875, + "learning_rate": 0.00040775206119002684, + "loss": 0.4942, + "step": 98720 + }, + { + "epoch": 4.903645574649846, + "grad_norm": 0.12060546875, + "learning_rate": 0.0004077123274063773, + "loss": 0.5166, + "step": 98730 + }, + { + "epoch": 4.904142246945465, + "grad_norm": 0.1318359375, + "learning_rate": 0.0004076725936227278, + "loss": 0.5246, + "step": 98740 + }, + { + "epoch": 4.904638919241084, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004076328598390782, + "loss": 0.5428, + "step": 98750 + }, + { + "epoch": 4.905135591536704, + "grad_norm": 0.10693359375, + "learning_rate": 0.00040759312605542867, + "loss": 0.5142, + "step": 98760 + }, + { + "epoch": 4.905632263832324, + "grad_norm": 0.1328125, + "learning_rate": 0.00040755339227177914, + "loss": 0.5362, + "step": 98770 + }, + { + "epoch": 4.906128936127943, + "grad_norm": 0.1083984375, + "learning_rate": 0.0004075136584881295, + "loss": 0.5124, + "step": 98780 + }, + { + "epoch": 4.906625608423562, + "grad_norm": 0.10791015625, + "learning_rate": 0.00040747392470448003, + "loss": 0.5158, + "step": 98790 + }, + { + "epoch": 4.907122280719181, + "grad_norm": 0.162109375, + "learning_rate": 0.0004074341909208305, + "loss": 0.4925, + "step": 98800 + }, + { + "epoch": 4.907618953014801, + "grad_norm": 0.1318359375, + "learning_rate": 0.00040739445713718086, + "loss": 0.5157, + "step": 98810 + }, + { + "epoch": 4.90811562531042, + "grad_norm": 0.10498046875, + "learning_rate": 0.00040735472335353133, + "loss": 0.4888, + "step": 98820 + }, + { + "epoch": 4.908612297606039, + "grad_norm": 0.1865234375, + "learning_rate": 0.00040731498956988186, + "loss": 0.5231, + "step": 98830 + }, + { + "epoch": 4.909108969901659, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004072752557862322, + "loss": 0.503, + "step": 98840 + }, + { + "epoch": 4.9096056421972785, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004072355220025827, + "loss": 0.4851, + "step": 98850 + }, + { + "epoch": 4.910102314492898, + "grad_norm": 0.1318359375, + "learning_rate": 0.00040719578821893316, + "loss": 0.5166, + "step": 98860 + }, + { + "epoch": 4.910598986788517, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004071560544352837, + "loss": 0.51, + "step": 98870 + }, + { + "epoch": 4.911095659084136, + "grad_norm": 0.1318359375, + "learning_rate": 0.00040711632065163405, + "loss": 0.5411, + "step": 98880 + }, + { + "epoch": 4.911592331379755, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004070765868679845, + "loss": 0.4917, + "step": 98890 + }, + { + "epoch": 4.912089003675375, + "grad_norm": 0.107421875, + "learning_rate": 0.000407036853084335, + "loss": 0.5179, + "step": 98900 + }, + { + "epoch": 4.912585675970995, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004069971193006854, + "loss": 0.5128, + "step": 98910 + }, + { + "epoch": 4.913082348266614, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004069573855170359, + "loss": 0.508, + "step": 98920 + }, + { + "epoch": 4.913579020562233, + "grad_norm": 0.1298828125, + "learning_rate": 0.00040691765173338636, + "loss": 0.5038, + "step": 98930 + }, + { + "epoch": 4.914075692857852, + "grad_norm": 0.146484375, + "learning_rate": 0.00040687791794973677, + "loss": 0.5337, + "step": 98940 + }, + { + "epoch": 4.914572365153472, + "grad_norm": 0.10107421875, + "learning_rate": 0.00040683818416608724, + "loss": 0.4889, + "step": 98950 + }, + { + "epoch": 4.915069037449091, + "grad_norm": 0.189453125, + "learning_rate": 0.0004067984503824377, + "loss": 0.5286, + "step": 98960 + }, + { + "epoch": 4.91556570974471, + "grad_norm": 0.11669921875, + "learning_rate": 0.0004067587165987881, + "loss": 0.5046, + "step": 98970 + }, + { + "epoch": 4.91606238204033, + "grad_norm": 0.134765625, + "learning_rate": 0.0004067189828151386, + "loss": 0.4965, + "step": 98980 + }, + { + "epoch": 4.916559054335949, + "grad_norm": 0.107421875, + "learning_rate": 0.0004066792490314891, + "loss": 0.5457, + "step": 98990 + }, + { + "epoch": 4.917055726631569, + "grad_norm": 0.11083984375, + "learning_rate": 0.00040663951524783944, + "loss": 0.5051, + "step": 99000 + }, + { + "epoch": 4.917552398927188, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004065997814641899, + "loss": 0.4822, + "step": 99010 + }, + { + "epoch": 4.918049071222807, + "grad_norm": 0.115234375, + "learning_rate": 0.00040656004768054043, + "loss": 0.5285, + "step": 99020 + }, + { + "epoch": 4.918545743518426, + "grad_norm": 0.11669921875, + "learning_rate": 0.0004065203138968909, + "loss": 0.4931, + "step": 99030 + }, + { + "epoch": 4.9190424158140456, + "grad_norm": 0.154296875, + "learning_rate": 0.00040648058011324127, + "loss": 0.5481, + "step": 99040 + }, + { + "epoch": 4.919539088109666, + "grad_norm": 0.11279296875, + "learning_rate": 0.00040644084632959174, + "loss": 0.5057, + "step": 99050 + }, + { + "epoch": 4.920035760405285, + "grad_norm": 0.1279296875, + "learning_rate": 0.00040640111254594227, + "loss": 0.4832, + "step": 99060 + }, + { + "epoch": 4.920532432700904, + "grad_norm": 0.11962890625, + "learning_rate": 0.00040636137876229263, + "loss": 0.5402, + "step": 99070 + }, + { + "epoch": 4.921029104996523, + "grad_norm": 0.10400390625, + "learning_rate": 0.0004063216449786431, + "loss": 0.5001, + "step": 99080 + }, + { + "epoch": 4.921525777292143, + "grad_norm": 0.1728515625, + "learning_rate": 0.00040628191119499357, + "loss": 0.5159, + "step": 99090 + }, + { + "epoch": 4.922022449587762, + "grad_norm": 0.12060546875, + "learning_rate": 0.000406242177411344, + "loss": 0.5343, + "step": 99100 + }, + { + "epoch": 4.922519121883381, + "grad_norm": 0.11962890625, + "learning_rate": 0.00040620244362769446, + "loss": 0.5164, + "step": 99110 + }, + { + "epoch": 4.923015794179001, + "grad_norm": 0.10205078125, + "learning_rate": 0.00040616270984404493, + "loss": 0.498, + "step": 99120 + }, + { + "epoch": 4.92351246647462, + "grad_norm": 0.11962890625, + "learning_rate": 0.00040612297606039535, + "loss": 0.5175, + "step": 99130 + }, + { + "epoch": 4.92400913877024, + "grad_norm": 0.12890625, + "learning_rate": 0.0004060832422767458, + "loss": 0.5302, + "step": 99140 + }, + { + "epoch": 4.924505811065859, + "grad_norm": 0.1357421875, + "learning_rate": 0.0004060435084930963, + "loss": 0.5007, + "step": 99150 + }, + { + "epoch": 4.925002483361478, + "grad_norm": 0.10498046875, + "learning_rate": 0.0004060037747094467, + "loss": 0.5229, + "step": 99160 + }, + { + "epoch": 4.925499155657097, + "grad_norm": 0.1220703125, + "learning_rate": 0.0004059640409257972, + "loss": 0.519, + "step": 99170 + }, + { + "epoch": 4.9259958279527165, + "grad_norm": 0.11181640625, + "learning_rate": 0.00040592430714214765, + "loss": 0.5088, + "step": 99180 + }, + { + "epoch": 4.926492500248337, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004058845733584981, + "loss": 0.5215, + "step": 99190 + }, + { + "epoch": 4.926989172543956, + "grad_norm": 0.1064453125, + "learning_rate": 0.0004058448395748485, + "loss": 0.5158, + "step": 99200 + }, + { + "epoch": 4.927485844839575, + "grad_norm": 0.12060546875, + "learning_rate": 0.000405805105791199, + "loss": 0.5514, + "step": 99210 + }, + { + "epoch": 4.927982517135194, + "grad_norm": 0.12060546875, + "learning_rate": 0.0004057653720075495, + "loss": 0.5251, + "step": 99220 + }, + { + "epoch": 4.9284791894308135, + "grad_norm": 0.1015625, + "learning_rate": 0.00040572563822389985, + "loss": 0.4987, + "step": 99230 + }, + { + "epoch": 4.928975861726433, + "grad_norm": 0.15234375, + "learning_rate": 0.0004056859044402503, + "loss": 0.495, + "step": 99240 + }, + { + "epoch": 4.929472534022052, + "grad_norm": 0.1318359375, + "learning_rate": 0.00040564617065660084, + "loss": 0.5459, + "step": 99250 + }, + { + "epoch": 4.929969206317671, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004056064368729512, + "loss": 0.5065, + "step": 99260 + }, + { + "epoch": 4.930465878613291, + "grad_norm": 0.125, + "learning_rate": 0.0004055667030893017, + "loss": 0.5229, + "step": 99270 + }, + { + "epoch": 4.930962550908911, + "grad_norm": 0.1259765625, + "learning_rate": 0.00040552696930565215, + "loss": 0.4875, + "step": 99280 + }, + { + "epoch": 4.93145922320453, + "grad_norm": 0.11572265625, + "learning_rate": 0.00040548723552200256, + "loss": 0.5139, + "step": 99290 + }, + { + "epoch": 4.931955895500149, + "grad_norm": 0.10986328125, + "learning_rate": 0.00040544750173835304, + "loss": 0.528, + "step": 99300 + }, + { + "epoch": 4.932452567795768, + "grad_norm": 0.10693359375, + "learning_rate": 0.0004054077679547035, + "loss": 0.5157, + "step": 99310 + }, + { + "epoch": 4.9329492400913875, + "grad_norm": 0.12890625, + "learning_rate": 0.000405368034171054, + "loss": 0.5011, + "step": 99320 + }, + { + "epoch": 4.933445912387007, + "grad_norm": 0.12890625, + "learning_rate": 0.0004053283003874044, + "loss": 0.4929, + "step": 99330 + }, + { + "epoch": 4.933942584682626, + "grad_norm": 0.12451171875, + "learning_rate": 0.00040528856660375487, + "loss": 0.4835, + "step": 99340 + }, + { + "epoch": 4.934439256978246, + "grad_norm": 0.119140625, + "learning_rate": 0.00040524883282010534, + "loss": 0.5361, + "step": 99350 + }, + { + "epoch": 4.934935929273865, + "grad_norm": 0.193359375, + "learning_rate": 0.00040520909903645576, + "loss": 0.5291, + "step": 99360 + }, + { + "epoch": 4.9354326015694845, + "grad_norm": 0.11669921875, + "learning_rate": 0.00040516936525280623, + "loss": 0.5118, + "step": 99370 + }, + { + "epoch": 4.935929273865104, + "grad_norm": 0.1796875, + "learning_rate": 0.0004051296314691567, + "loss": 0.4971, + "step": 99380 + }, + { + "epoch": 4.936425946160723, + "grad_norm": 0.12353515625, + "learning_rate": 0.0004050898976855071, + "loss": 0.4903, + "step": 99390 + }, + { + "epoch": 4.936922618456342, + "grad_norm": 0.1669921875, + "learning_rate": 0.0004050501639018576, + "loss": 0.5122, + "step": 99400 + }, + { + "epoch": 4.937419290751961, + "grad_norm": 0.11083984375, + "learning_rate": 0.00040501043011820806, + "loss": 0.4923, + "step": 99410 + }, + { + "epoch": 4.9379159630475815, + "grad_norm": 0.1318359375, + "learning_rate": 0.0004049706963345584, + "loss": 0.5291, + "step": 99420 + }, + { + "epoch": 4.938412635343201, + "grad_norm": 0.12060546875, + "learning_rate": 0.00040493096255090895, + "loss": 0.5176, + "step": 99430 + }, + { + "epoch": 4.93890930763882, + "grad_norm": 0.12158203125, + "learning_rate": 0.0004048912287672594, + "loss": 0.5011, + "step": 99440 + }, + { + "epoch": 4.939405979934439, + "grad_norm": 0.1435546875, + "learning_rate": 0.0004048514949836098, + "loss": 0.501, + "step": 99450 + }, + { + "epoch": 4.939902652230058, + "grad_norm": 0.119140625, + "learning_rate": 0.00040481176119996025, + "loss": 0.5367, + "step": 99460 + }, + { + "epoch": 4.940399324525678, + "grad_norm": 0.1259765625, + "learning_rate": 0.0004047720274163107, + "loss": 0.5023, + "step": 99470 + }, + { + "epoch": 4.940895996821297, + "grad_norm": 0.140625, + "learning_rate": 0.00040473229363266125, + "loss": 0.5122, + "step": 99480 + }, + { + "epoch": 4.941392669116917, + "grad_norm": 0.10595703125, + "learning_rate": 0.0004046925598490116, + "loss": 0.5038, + "step": 99490 + }, + { + "epoch": 4.941889341412536, + "grad_norm": 0.12060546875, + "learning_rate": 0.0004046528260653621, + "loss": 0.5243, + "step": 99500 + }, + { + "epoch": 4.9423860137081554, + "grad_norm": 0.123046875, + "learning_rate": 0.00040461309228171256, + "loss": 0.5047, + "step": 99510 + }, + { + "epoch": 4.942882686003775, + "grad_norm": 0.12158203125, + "learning_rate": 0.00040457335849806297, + "loss": 0.5128, + "step": 99520 + }, + { + "epoch": 4.943379358299394, + "grad_norm": 0.10791015625, + "learning_rate": 0.00040453362471441344, + "loss": 0.5099, + "step": 99530 + }, + { + "epoch": 4.943876030595013, + "grad_norm": 0.12109375, + "learning_rate": 0.0004044938909307639, + "loss": 0.5317, + "step": 99540 + }, + { + "epoch": 4.944372702890632, + "grad_norm": 0.1298828125, + "learning_rate": 0.00040445415714711433, + "loss": 0.5239, + "step": 99550 + }, + { + "epoch": 4.9448693751862525, + "grad_norm": 0.10791015625, + "learning_rate": 0.0004044144233634648, + "loss": 0.5218, + "step": 99560 + }, + { + "epoch": 4.945366047481872, + "grad_norm": 0.12158203125, + "learning_rate": 0.0004043746895798153, + "loss": 0.5068, + "step": 99570 + }, + { + "epoch": 4.945862719777491, + "grad_norm": 0.1611328125, + "learning_rate": 0.0004043349557961657, + "loss": 0.5081, + "step": 99580 + }, + { + "epoch": 4.94635939207311, + "grad_norm": 0.138671875, + "learning_rate": 0.00040429522201251616, + "loss": 0.4895, + "step": 99590 + }, + { + "epoch": 4.946856064368729, + "grad_norm": 0.1357421875, + "learning_rate": 0.00040425548822886663, + "loss": 0.5348, + "step": 99600 + }, + { + "epoch": 4.947352736664349, + "grad_norm": 0.1220703125, + "learning_rate": 0.000404215754445217, + "loss": 0.5263, + "step": 99610 + }, + { + "epoch": 4.947849408959968, + "grad_norm": 0.13671875, + "learning_rate": 0.0004041760206615675, + "loss": 0.5113, + "step": 99620 + }, + { + "epoch": 4.948346081255588, + "grad_norm": 0.1083984375, + "learning_rate": 0.000404136286877918, + "loss": 0.4971, + "step": 99630 + }, + { + "epoch": 4.948842753551207, + "grad_norm": 0.11376953125, + "learning_rate": 0.00040409655309426847, + "loss": 0.5496, + "step": 99640 + }, + { + "epoch": 4.949339425846826, + "grad_norm": 0.115234375, + "learning_rate": 0.00040405681931061883, + "loss": 0.4924, + "step": 99650 + }, + { + "epoch": 4.949836098142446, + "grad_norm": 0.10400390625, + "learning_rate": 0.00040401708552696935, + "loss": 0.4995, + "step": 99660 + }, + { + "epoch": 4.950332770438065, + "grad_norm": 0.1044921875, + "learning_rate": 0.0004039773517433198, + "loss": 0.5248, + "step": 99670 + }, + { + "epoch": 4.950829442733684, + "grad_norm": 0.107421875, + "learning_rate": 0.0004039376179596702, + "loss": 0.5132, + "step": 99680 + }, + { + "epoch": 4.951326115029303, + "grad_norm": 0.11376953125, + "learning_rate": 0.00040389788417602066, + "loss": 0.5077, + "step": 99690 + }, + { + "epoch": 4.951822787324923, + "grad_norm": 0.1162109375, + "learning_rate": 0.0004038581503923712, + "loss": 0.4941, + "step": 99700 + }, + { + "epoch": 4.952319459620543, + "grad_norm": 0.146484375, + "learning_rate": 0.00040381841660872155, + "loss": 0.5123, + "step": 99710 + }, + { + "epoch": 4.952816131916162, + "grad_norm": 0.12109375, + "learning_rate": 0.000403778682825072, + "loss": 0.4875, + "step": 99720 + }, + { + "epoch": 4.953312804211781, + "grad_norm": 0.16015625, + "learning_rate": 0.0004037389490414225, + "loss": 0.5222, + "step": 99730 + }, + { + "epoch": 4.9538094765074, + "grad_norm": 0.1298828125, + "learning_rate": 0.0004036992152577729, + "loss": 0.487, + "step": 99740 + }, + { + "epoch": 4.95430614880302, + "grad_norm": 0.1044921875, + "learning_rate": 0.0004036594814741234, + "loss": 0.512, + "step": 99750 + }, + { + "epoch": 4.954802821098639, + "grad_norm": 0.1201171875, + "learning_rate": 0.00040361974769047385, + "loss": 0.5148, + "step": 99760 + }, + { + "epoch": 4.955299493394259, + "grad_norm": 0.10791015625, + "learning_rate": 0.0004035800139068243, + "loss": 0.4891, + "step": 99770 + }, + { + "epoch": 4.955796165689878, + "grad_norm": 0.109375, + "learning_rate": 0.00040354028012317474, + "loss": 0.5063, + "step": 99780 + }, + { + "epoch": 4.956292837985497, + "grad_norm": 0.1455078125, + "learning_rate": 0.0004035005463395252, + "loss": 0.5276, + "step": 99790 + }, + { + "epoch": 4.956789510281117, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004034608125558757, + "loss": 0.5103, + "step": 99800 + }, + { + "epoch": 4.957286182576736, + "grad_norm": 0.11181640625, + "learning_rate": 0.0004034210787722261, + "loss": 0.5315, + "step": 99810 + }, + { + "epoch": 4.957782854872355, + "grad_norm": 0.111328125, + "learning_rate": 0.00040338134498857657, + "loss": 0.4878, + "step": 99820 + }, + { + "epoch": 4.958279527167974, + "grad_norm": 0.12109375, + "learning_rate": 0.00040334161120492704, + "loss": 0.4885, + "step": 99830 + }, + { + "epoch": 4.958776199463594, + "grad_norm": 0.1240234375, + "learning_rate": 0.0004033018774212774, + "loss": 0.5389, + "step": 99840 + }, + { + "epoch": 4.959272871759214, + "grad_norm": 0.11767578125, + "learning_rate": 0.00040326214363762793, + "loss": 0.5157, + "step": 99850 + }, + { + "epoch": 4.959769544054833, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004032224098539784, + "loss": 0.4998, + "step": 99860 + }, + { + "epoch": 4.960266216350452, + "grad_norm": 0.1142578125, + "learning_rate": 0.00040318267607032877, + "loss": 0.4896, + "step": 99870 + }, + { + "epoch": 4.960762888646071, + "grad_norm": 0.09619140625, + "learning_rate": 0.00040314294228667924, + "loss": 0.5059, + "step": 99880 + }, + { + "epoch": 4.9612595609416905, + "grad_norm": 0.1328125, + "learning_rate": 0.00040310320850302976, + "loss": 0.5358, + "step": 99890 + }, + { + "epoch": 4.96175623323731, + "grad_norm": 0.130859375, + "learning_rate": 0.0004030634747193801, + "loss": 0.5355, + "step": 99900 + }, + { + "epoch": 4.96225290553293, + "grad_norm": 0.12890625, + "learning_rate": 0.0004030237409357306, + "loss": 0.5232, + "step": 99910 + }, + { + "epoch": 4.962749577828549, + "grad_norm": 0.130859375, + "learning_rate": 0.00040298400715208107, + "loss": 0.5263, + "step": 99920 + }, + { + "epoch": 4.963246250124168, + "grad_norm": 0.12158203125, + "learning_rate": 0.0004029442733684316, + "loss": 0.5206, + "step": 99930 + }, + { + "epoch": 4.9637429224197875, + "grad_norm": 0.13671875, + "learning_rate": 0.00040290453958478196, + "loss": 0.546, + "step": 99940 + }, + { + "epoch": 4.964239594715407, + "grad_norm": 0.1015625, + "learning_rate": 0.00040286480580113243, + "loss": 0.5026, + "step": 99950 + }, + { + "epoch": 4.964736267011026, + "grad_norm": 0.1416015625, + "learning_rate": 0.0004028250720174829, + "loss": 0.5343, + "step": 99960 + }, + { + "epoch": 4.965232939306645, + "grad_norm": 0.1611328125, + "learning_rate": 0.0004027853382338333, + "loss": 0.498, + "step": 99970 + }, + { + "epoch": 4.9657296116022644, + "grad_norm": 0.111328125, + "learning_rate": 0.0004027456044501838, + "loss": 0.498, + "step": 99980 + }, + { + "epoch": 4.966226283897885, + "grad_norm": 0.1064453125, + "learning_rate": 0.00040270587066653426, + "loss": 0.5245, + "step": 99990 + }, + { + "epoch": 4.966722956193504, + "grad_norm": 0.1279296875, + "learning_rate": 0.0004026661368828847, + "loss": 0.5091, + "step": 100000 + }, + { + "epoch": 4.967219628489123, + "grad_norm": 0.1103515625, + "learning_rate": 0.00040262640309923515, + "loss": 0.4981, + "step": 100010 + }, + { + "epoch": 4.967716300784742, + "grad_norm": 0.13671875, + "learning_rate": 0.0004025866693155856, + "loss": 0.5341, + "step": 100020 + }, + { + "epoch": 4.9682129730803615, + "grad_norm": 0.11767578125, + "learning_rate": 0.00040254693553193604, + "loss": 0.5405, + "step": 100030 + }, + { + "epoch": 4.968709645375981, + "grad_norm": 0.140625, + "learning_rate": 0.0004025072017482865, + "loss": 0.4873, + "step": 100040 + }, + { + "epoch": 4.9692063176716, + "grad_norm": 0.140625, + "learning_rate": 0.000402467467964637, + "loss": 0.5229, + "step": 100050 + }, + { + "epoch": 4.969702989967219, + "grad_norm": 0.1474609375, + "learning_rate": 0.00040242773418098745, + "loss": 0.5282, + "step": 100060 + }, + { + "epoch": 4.970199662262839, + "grad_norm": 0.1123046875, + "learning_rate": 0.0004023880003973378, + "loss": 0.5119, + "step": 100070 + }, + { + "epoch": 4.9706963345584585, + "grad_norm": 0.1142578125, + "learning_rate": 0.00040234826661368834, + "loss": 0.5049, + "step": 100080 + }, + { + "epoch": 4.971193006854078, + "grad_norm": 0.1533203125, + "learning_rate": 0.0004023085328300388, + "loss": 0.5449, + "step": 100090 + }, + { + "epoch": 4.971689679149697, + "grad_norm": 0.1533203125, + "learning_rate": 0.00040226879904638917, + "loss": 0.5154, + "step": 100100 + }, + { + "epoch": 4.972186351445316, + "grad_norm": 0.10693359375, + "learning_rate": 0.00040222906526273964, + "loss": 0.5116, + "step": 100110 + }, + { + "epoch": 4.972683023740935, + "grad_norm": 0.1328125, + "learning_rate": 0.00040218933147909017, + "loss": 0.5156, + "step": 100120 + }, + { + "epoch": 4.973179696036555, + "grad_norm": 0.1474609375, + "learning_rate": 0.00040214959769544053, + "loss": 0.517, + "step": 100130 + }, + { + "epoch": 4.973676368332175, + "grad_norm": 0.1455078125, + "learning_rate": 0.000402109863911791, + "loss": 0.4893, + "step": 100140 + }, + { + "epoch": 4.974173040627794, + "grad_norm": 0.1103515625, + "learning_rate": 0.0004020701301281415, + "loss": 0.5071, + "step": 100150 + }, + { + "epoch": 4.974669712923413, + "grad_norm": 0.119140625, + "learning_rate": 0.0004020303963444919, + "loss": 0.5004, + "step": 100160 + }, + { + "epoch": 4.975166385219032, + "grad_norm": 0.1298828125, + "learning_rate": 0.00040199066256084236, + "loss": 0.538, + "step": 100170 + }, + { + "epoch": 4.975663057514652, + "grad_norm": 0.19140625, + "learning_rate": 0.00040195092877719284, + "loss": 0.523, + "step": 100180 + }, + { + "epoch": 4.976159729810271, + "grad_norm": 0.10546875, + "learning_rate": 0.00040191119499354325, + "loss": 0.5077, + "step": 100190 + }, + { + "epoch": 4.97665640210589, + "grad_norm": 0.11572265625, + "learning_rate": 0.0004018714612098937, + "loss": 0.5341, + "step": 100200 + }, + { + "epoch": 4.97715307440151, + "grad_norm": 0.1474609375, + "learning_rate": 0.0004018317274262442, + "loss": 0.5223, + "step": 100210 + }, + { + "epoch": 4.9776497466971295, + "grad_norm": 0.111328125, + "learning_rate": 0.00040179199364259467, + "loss": 0.4992, + "step": 100220 + }, + { + "epoch": 4.978146418992749, + "grad_norm": 0.1572265625, + "learning_rate": 0.0004017522598589451, + "loss": 0.5209, + "step": 100230 + }, + { + "epoch": 4.978643091288368, + "grad_norm": 0.1220703125, + "learning_rate": 0.00040171252607529555, + "loss": 0.5051, + "step": 100240 + }, + { + "epoch": 4.979139763583987, + "grad_norm": 0.12255859375, + "learning_rate": 0.000401672792291646, + "loss": 0.5268, + "step": 100250 + }, + { + "epoch": 4.979636435879606, + "grad_norm": 0.11669921875, + "learning_rate": 0.00040163305850799644, + "loss": 0.5149, + "step": 100260 + }, + { + "epoch": 4.980133108175226, + "grad_norm": 0.11376953125, + "learning_rate": 0.0004015933247243469, + "loss": 0.5173, + "step": 100270 + }, + { + "epoch": 4.980629780470846, + "grad_norm": 0.1669921875, + "learning_rate": 0.0004015535909406974, + "loss": 0.4917, + "step": 100280 + }, + { + "epoch": 4.981126452766465, + "grad_norm": 0.17578125, + "learning_rate": 0.00040151385715704775, + "loss": 0.5374, + "step": 100290 + }, + { + "epoch": 4.981623125062084, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004014741233733983, + "loss": 0.5289, + "step": 100300 + }, + { + "epoch": 4.982119797357703, + "grad_norm": 0.130859375, + "learning_rate": 0.00040143438958974875, + "loss": 0.5264, + "step": 100310 + }, + { + "epoch": 4.982616469653323, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004013946558060991, + "loss": 0.5166, + "step": 100320 + }, + { + "epoch": 4.983113141948942, + "grad_norm": 0.1708984375, + "learning_rate": 0.0004013549220224496, + "loss": 0.5035, + "step": 100330 + }, + { + "epoch": 4.983609814244561, + "grad_norm": 0.111328125, + "learning_rate": 0.00040131518823880005, + "loss": 0.5512, + "step": 100340 + }, + { + "epoch": 4.984106486540181, + "grad_norm": 0.130859375, + "learning_rate": 0.00040127545445515047, + "loss": 0.5229, + "step": 100350 + }, + { + "epoch": 4.9846031588358, + "grad_norm": 0.123046875, + "learning_rate": 0.00040123572067150094, + "loss": 0.5159, + "step": 100360 + }, + { + "epoch": 4.98509983113142, + "grad_norm": 0.12158203125, + "learning_rate": 0.0004011959868878514, + "loss": 0.5254, + "step": 100370 + }, + { + "epoch": 4.985596503427039, + "grad_norm": 0.115234375, + "learning_rate": 0.0004011562531042019, + "loss": 0.5119, + "step": 100380 + }, + { + "epoch": 4.986093175722658, + "grad_norm": 0.12890625, + "learning_rate": 0.0004011165193205523, + "loss": 0.5259, + "step": 100390 + }, + { + "epoch": 4.986589848018277, + "grad_norm": 0.14453125, + "learning_rate": 0.00040107678553690277, + "loss": 0.5211, + "step": 100400 + }, + { + "epoch": 4.9870865203138965, + "grad_norm": 0.12353515625, + "learning_rate": 0.00040103705175325324, + "loss": 0.5141, + "step": 100410 + }, + { + "epoch": 4.987583192609517, + "grad_norm": 0.12255859375, + "learning_rate": 0.00040099731796960366, + "loss": 0.5036, + "step": 100420 + }, + { + "epoch": 4.988079864905136, + "grad_norm": 0.146484375, + "learning_rate": 0.00040095758418595413, + "loss": 0.5237, + "step": 100430 + }, + { + "epoch": 4.988576537200755, + "grad_norm": 0.138671875, + "learning_rate": 0.0004009178504023046, + "loss": 0.4908, + "step": 100440 + }, + { + "epoch": 4.989073209496374, + "grad_norm": 0.11474609375, + "learning_rate": 0.000400878116618655, + "loss": 0.5049, + "step": 100450 + }, + { + "epoch": 4.989569881791994, + "grad_norm": 0.11279296875, + "learning_rate": 0.0004008383828350055, + "loss": 0.5167, + "step": 100460 + }, + { + "epoch": 4.990066554087613, + "grad_norm": 0.1220703125, + "learning_rate": 0.00040079864905135596, + "loss": 0.4888, + "step": 100470 + }, + { + "epoch": 4.990563226383232, + "grad_norm": 0.1416015625, + "learning_rate": 0.0004007589152677063, + "loss": 0.5351, + "step": 100480 + }, + { + "epoch": 4.991059898678852, + "grad_norm": 0.10400390625, + "learning_rate": 0.00040071918148405685, + "loss": 0.5331, + "step": 100490 + }, + { + "epoch": 4.991556570974471, + "grad_norm": 0.111328125, + "learning_rate": 0.0004006794477004073, + "loss": 0.5051, + "step": 100500 + }, + { + "epoch": 4.992053243270091, + "grad_norm": 0.12158203125, + "learning_rate": 0.0004006397139167578, + "loss": 0.5323, + "step": 100510 + }, + { + "epoch": 4.99254991556571, + "grad_norm": 0.1142578125, + "learning_rate": 0.00040059998013310816, + "loss": 0.5355, + "step": 100520 + }, + { + "epoch": 4.993046587861329, + "grad_norm": 0.1318359375, + "learning_rate": 0.0004005602463494587, + "loss": 0.5263, + "step": 100530 + }, + { + "epoch": 4.993543260156948, + "grad_norm": 0.12451171875, + "learning_rate": 0.00040052051256580915, + "loss": 0.5215, + "step": 100540 + }, + { + "epoch": 4.9940399324525675, + "grad_norm": 0.11767578125, + "learning_rate": 0.0004004807787821595, + "loss": 0.5193, + "step": 100550 + }, + { + "epoch": 4.994536604748188, + "grad_norm": 0.1533203125, + "learning_rate": 0.00040044104499851, + "loss": 0.4774, + "step": 100560 + }, + { + "epoch": 4.995033277043807, + "grad_norm": 0.1318359375, + "learning_rate": 0.00040040131121486046, + "loss": 0.5122, + "step": 100570 + }, + { + "epoch": 4.995529949339426, + "grad_norm": 0.11083984375, + "learning_rate": 0.0004003615774312109, + "loss": 0.5145, + "step": 100580 + }, + { + "epoch": 4.996026621635045, + "grad_norm": 0.1484375, + "learning_rate": 0.00040032184364756135, + "loss": 0.5119, + "step": 100590 + }, + { + "epoch": 4.9965232939306645, + "grad_norm": 0.109375, + "learning_rate": 0.0004002821098639118, + "loss": 0.5173, + "step": 100600 + }, + { + "epoch": 4.997019966226284, + "grad_norm": 0.11376953125, + "learning_rate": 0.00040024237608026224, + "loss": 0.5329, + "step": 100610 + }, + { + "epoch": 4.997516638521903, + "grad_norm": 0.13671875, + "learning_rate": 0.0004002026422966127, + "loss": 0.5277, + "step": 100620 + }, + { + "epoch": 4.998013310817523, + "grad_norm": 0.1806640625, + "learning_rate": 0.0004001629085129632, + "loss": 0.5278, + "step": 100630 + }, + { + "epoch": 4.998509983113142, + "grad_norm": 0.138671875, + "learning_rate": 0.0004001231747293136, + "loss": 0.514, + "step": 100640 + }, + { + "epoch": 4.9990066554087615, + "grad_norm": 0.11767578125, + "learning_rate": 0.00040008344094566407, + "loss": 0.51, + "step": 100650 + }, + { + "epoch": 4.999503327704381, + "grad_norm": 0.1162109375, + "learning_rate": 0.00040004370716201454, + "loss": 0.5158, + "step": 100660 + }, + { + "epoch": 5.0, + "grad_norm": 0.107421875, + "learning_rate": 0.000400003973378365, + "loss": 0.5036, + "step": 100670 + }, + { + "epoch": 5.000496672295619, + "grad_norm": 0.12890625, + "learning_rate": 0.00039996423959471543, + "loss": 0.5218, + "step": 100680 + }, + { + "epoch": 5.0009933445912385, + "grad_norm": 0.1103515625, + "learning_rate": 0.0003999245058110659, + "loss": 0.5116, + "step": 100690 + }, + { + "epoch": 5.001490016886858, + "grad_norm": 0.1201171875, + "learning_rate": 0.0003998847720274163, + "loss": 0.4877, + "step": 100700 + }, + { + "epoch": 5.001986689182478, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003998450382437668, + "loss": 0.5158, + "step": 100710 + }, + { + "epoch": 5.002483361478097, + "grad_norm": 0.15234375, + "learning_rate": 0.00039980530446011726, + "loss": 0.5278, + "step": 100720 + }, + { + "epoch": 5.002980033773716, + "grad_norm": 0.11083984375, + "learning_rate": 0.0003997655706764677, + "loss": 0.5015, + "step": 100730 + }, + { + "epoch": 5.0034767060693355, + "grad_norm": 0.11669921875, + "learning_rate": 0.00039972583689281815, + "loss": 0.4608, + "step": 100740 + }, + { + "epoch": 5.003973378364955, + "grad_norm": 0.1201171875, + "learning_rate": 0.00039968610310916856, + "loss": 0.5155, + "step": 100750 + }, + { + "epoch": 5.004470050660574, + "grad_norm": 0.13671875, + "learning_rate": 0.00039964636932551904, + "loss": 0.5058, + "step": 100760 + }, + { + "epoch": 5.004966722956193, + "grad_norm": 0.109375, + "learning_rate": 0.0003996066355418695, + "loss": 0.4725, + "step": 100770 + }, + { + "epoch": 5.005463395251813, + "grad_norm": 0.123046875, + "learning_rate": 0.0003995669017582199, + "loss": 0.4929, + "step": 100780 + }, + { + "epoch": 5.0059600675474325, + "grad_norm": 0.1240234375, + "learning_rate": 0.0003995271679745704, + "loss": 0.5012, + "step": 100790 + }, + { + "epoch": 5.006456739843052, + "grad_norm": 0.107421875, + "learning_rate": 0.00039948743419092087, + "loss": 0.4765, + "step": 100800 + }, + { + "epoch": 5.006953412138671, + "grad_norm": 0.162109375, + "learning_rate": 0.0003994477004072713, + "loss": 0.4979, + "step": 100810 + }, + { + "epoch": 5.00745008443429, + "grad_norm": 0.12060546875, + "learning_rate": 0.00039940796662362176, + "loss": 0.5149, + "step": 100820 + }, + { + "epoch": 5.007946756729909, + "grad_norm": 0.12109375, + "learning_rate": 0.0003993682328399722, + "loss": 0.4895, + "step": 100830 + }, + { + "epoch": 5.008443429025529, + "grad_norm": 0.11669921875, + "learning_rate": 0.00039932849905632264, + "loss": 0.5264, + "step": 100840 + }, + { + "epoch": 5.008940101321148, + "grad_norm": 0.11181640625, + "learning_rate": 0.0003992887652726731, + "loss": 0.4954, + "step": 100850 + }, + { + "epoch": 5.009436773616768, + "grad_norm": 0.1689453125, + "learning_rate": 0.00039924903148902353, + "loss": 0.5156, + "step": 100860 + }, + { + "epoch": 5.009933445912387, + "grad_norm": 0.10546875, + "learning_rate": 0.000399209297705374, + "loss": 0.4886, + "step": 100870 + }, + { + "epoch": 5.010430118208006, + "grad_norm": 0.13671875, + "learning_rate": 0.0003991695639217245, + "loss": 0.4991, + "step": 100880 + }, + { + "epoch": 5.010926790503626, + "grad_norm": 0.130859375, + "learning_rate": 0.0003991298301380749, + "loss": 0.4754, + "step": 100890 + }, + { + "epoch": 5.011423462799245, + "grad_norm": 0.1162109375, + "learning_rate": 0.00039909009635442536, + "loss": 0.5036, + "step": 100900 + }, + { + "epoch": 5.011920135094864, + "grad_norm": 0.1123046875, + "learning_rate": 0.00039905036257077583, + "loss": 0.5072, + "step": 100910 + }, + { + "epoch": 5.012416807390483, + "grad_norm": 0.146484375, + "learning_rate": 0.0003990106287871263, + "loss": 0.5208, + "step": 100920 + }, + { + "epoch": 5.0129134796861035, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003989708950034767, + "loss": 0.5063, + "step": 100930 + }, + { + "epoch": 5.013410151981723, + "grad_norm": 0.12060546875, + "learning_rate": 0.00039893116121982714, + "loss": 0.5126, + "step": 100940 + }, + { + "epoch": 5.013906824277342, + "grad_norm": 0.1162109375, + "learning_rate": 0.00039889142743617767, + "loss": 0.501, + "step": 100950 + }, + { + "epoch": 5.014403496572961, + "grad_norm": 0.11328125, + "learning_rate": 0.0003988516936525281, + "loss": 0.5127, + "step": 100960 + }, + { + "epoch": 5.01490016886858, + "grad_norm": 0.1240234375, + "learning_rate": 0.0003988119598688785, + "loss": 0.4887, + "step": 100970 + }, + { + "epoch": 5.0153968411642, + "grad_norm": 0.1201171875, + "learning_rate": 0.00039877222608522897, + "loss": 0.5193, + "step": 100980 + }, + { + "epoch": 5.015893513459819, + "grad_norm": 0.11572265625, + "learning_rate": 0.00039873249230157944, + "loss": 0.5054, + "step": 100990 + }, + { + "epoch": 5.016390185755439, + "grad_norm": 0.1552734375, + "learning_rate": 0.0003986927585179299, + "loss": 0.4664, + "step": 101000 + }, + { + "epoch": 5.016886858051058, + "grad_norm": 0.1767578125, + "learning_rate": 0.00039865302473428033, + "loss": 0.4959, + "step": 101010 + }, + { + "epoch": 5.017383530346677, + "grad_norm": 0.115234375, + "learning_rate": 0.0003986132909506308, + "loss": 0.5126, + "step": 101020 + }, + { + "epoch": 5.017880202642297, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003985735571669813, + "loss": 0.4981, + "step": 101030 + }, + { + "epoch": 5.018376874937916, + "grad_norm": 0.10693359375, + "learning_rate": 0.0003985338233833317, + "loss": 0.5043, + "step": 101040 + }, + { + "epoch": 5.018873547233535, + "grad_norm": 0.123046875, + "learning_rate": 0.0003984940895996821, + "loss": 0.5113, + "step": 101050 + }, + { + "epoch": 5.019370219529154, + "grad_norm": 0.134765625, + "learning_rate": 0.00039845435581603263, + "loss": 0.5359, + "step": 101060 + }, + { + "epoch": 5.019866891824774, + "grad_norm": 0.123046875, + "learning_rate": 0.00039841462203238305, + "loss": 0.4985, + "step": 101070 + }, + { + "epoch": 5.020363564120394, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003983748882487335, + "loss": 0.4846, + "step": 101080 + }, + { + "epoch": 5.020860236416013, + "grad_norm": 0.14453125, + "learning_rate": 0.00039833515446508394, + "loss": 0.5227, + "step": 101090 + }, + { + "epoch": 5.021356908711632, + "grad_norm": 0.11328125, + "learning_rate": 0.0003982954206814344, + "loss": 0.5161, + "step": 101100 + }, + { + "epoch": 5.021853581007251, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003982556868977849, + "loss": 0.4944, + "step": 101110 + }, + { + "epoch": 5.0223502533028705, + "grad_norm": 0.13671875, + "learning_rate": 0.0003982159531141353, + "loss": 0.5032, + "step": 101120 + }, + { + "epoch": 5.02284692559849, + "grad_norm": 0.1435546875, + "learning_rate": 0.00039817621933048577, + "loss": 0.5054, + "step": 101130 + }, + { + "epoch": 5.02334359789411, + "grad_norm": 0.134765625, + "learning_rate": 0.00039813648554683624, + "loss": 0.5094, + "step": 101140 + }, + { + "epoch": 5.023840270189729, + "grad_norm": 0.12353515625, + "learning_rate": 0.00039809675176318666, + "loss": 0.4775, + "step": 101150 + }, + { + "epoch": 5.024336942485348, + "grad_norm": 0.1552734375, + "learning_rate": 0.00039805701797953713, + "loss": 0.4934, + "step": 101160 + }, + { + "epoch": 5.024833614780968, + "grad_norm": 0.107421875, + "learning_rate": 0.00039801728419588755, + "loss": 0.5035, + "step": 101170 + }, + { + "epoch": 5.025330287076587, + "grad_norm": 0.1220703125, + "learning_rate": 0.000397977550412238, + "loss": 0.5133, + "step": 101180 + }, + { + "epoch": 5.025826959372206, + "grad_norm": 0.1142578125, + "learning_rate": 0.0003979378166285885, + "loss": 0.5045, + "step": 101190 + }, + { + "epoch": 5.026323631667825, + "grad_norm": 0.154296875, + "learning_rate": 0.0003978980828449389, + "loss": 0.495, + "step": 101200 + }, + { + "epoch": 5.0268203039634445, + "grad_norm": 0.125, + "learning_rate": 0.0003978583490612894, + "loss": 0.4998, + "step": 101210 + }, + { + "epoch": 5.027316976259065, + "grad_norm": 0.10986328125, + "learning_rate": 0.00039781861527763985, + "loss": 0.478, + "step": 101220 + }, + { + "epoch": 5.027813648554684, + "grad_norm": 0.11181640625, + "learning_rate": 0.00039777888149399027, + "loss": 0.4937, + "step": 101230 + }, + { + "epoch": 5.028310320850303, + "grad_norm": 0.11962890625, + "learning_rate": 0.00039773914771034074, + "loss": 0.4847, + "step": 101240 + }, + { + "epoch": 5.028806993145922, + "grad_norm": 0.1162109375, + "learning_rate": 0.0003976994139266912, + "loss": 0.5081, + "step": 101250 + }, + { + "epoch": 5.0293036654415415, + "grad_norm": 0.126953125, + "learning_rate": 0.00039765968014304163, + "loss": 0.5119, + "step": 101260 + }, + { + "epoch": 5.029800337737161, + "grad_norm": 0.10498046875, + "learning_rate": 0.0003976199463593921, + "loss": 0.5127, + "step": 101270 + }, + { + "epoch": 5.03029701003278, + "grad_norm": 0.1171875, + "learning_rate": 0.0003975802125757425, + "loss": 0.4919, + "step": 101280 + }, + { + "epoch": 5.0307936823284, + "grad_norm": 0.1123046875, + "learning_rate": 0.000397540478792093, + "loss": 0.5063, + "step": 101290 + }, + { + "epoch": 5.031290354624019, + "grad_norm": 0.1748046875, + "learning_rate": 0.00039750074500844346, + "loss": 0.4855, + "step": 101300 + }, + { + "epoch": 5.0317870269196385, + "grad_norm": 0.1201171875, + "learning_rate": 0.0003974610112247939, + "loss": 0.4819, + "step": 101310 + }, + { + "epoch": 5.032283699215258, + "grad_norm": 0.1298828125, + "learning_rate": 0.00039742127744114435, + "loss": 0.4999, + "step": 101320 + }, + { + "epoch": 5.032780371510877, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003973815436574948, + "loss": 0.4966, + "step": 101330 + }, + { + "epoch": 5.033277043806496, + "grad_norm": 0.1259765625, + "learning_rate": 0.00039734180987384524, + "loss": 0.529, + "step": 101340 + }, + { + "epoch": 5.033773716102115, + "grad_norm": 0.11328125, + "learning_rate": 0.0003973020760901957, + "loss": 0.4916, + "step": 101350 + }, + { + "epoch": 5.0342703883977356, + "grad_norm": 0.111328125, + "learning_rate": 0.0003972623423065462, + "loss": 0.4746, + "step": 101360 + }, + { + "epoch": 5.034767060693355, + "grad_norm": 0.1494140625, + "learning_rate": 0.00039722260852289665, + "loss": 0.5153, + "step": 101370 + }, + { + "epoch": 5.035263732988974, + "grad_norm": 0.11328125, + "learning_rate": 0.00039718287473924707, + "loss": 0.4996, + "step": 101380 + }, + { + "epoch": 5.035760405284593, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003971431409555975, + "loss": 0.5083, + "step": 101390 + }, + { + "epoch": 5.0362570775802125, + "grad_norm": 0.138671875, + "learning_rate": 0.000397103407171948, + "loss": 0.4963, + "step": 101400 + }, + { + "epoch": 5.036753749875832, + "grad_norm": 0.1240234375, + "learning_rate": 0.0003970636733882984, + "loss": 0.5013, + "step": 101410 + }, + { + "epoch": 5.037250422171451, + "grad_norm": 0.11328125, + "learning_rate": 0.00039702393960464884, + "loss": 0.4922, + "step": 101420 + }, + { + "epoch": 5.037747094467071, + "grad_norm": 0.099609375, + "learning_rate": 0.0003969842058209993, + "loss": 0.4955, + "step": 101430 + }, + { + "epoch": 5.03824376676269, + "grad_norm": 0.115234375, + "learning_rate": 0.0003969444720373498, + "loss": 0.4976, + "step": 101440 + }, + { + "epoch": 5.0387404390583095, + "grad_norm": 0.1435546875, + "learning_rate": 0.00039690473825370026, + "loss": 0.5043, + "step": 101450 + }, + { + "epoch": 5.039237111353929, + "grad_norm": 0.11279296875, + "learning_rate": 0.0003968650044700507, + "loss": 0.4751, + "step": 101460 + }, + { + "epoch": 5.039733783649548, + "grad_norm": 0.115234375, + "learning_rate": 0.0003968252706864011, + "loss": 0.4963, + "step": 101470 + }, + { + "epoch": 5.040230455945167, + "grad_norm": 0.1435546875, + "learning_rate": 0.0003967855369027516, + "loss": 0.5233, + "step": 101480 + }, + { + "epoch": 5.040727128240786, + "grad_norm": 0.140625, + "learning_rate": 0.00039674580311910203, + "loss": 0.5207, + "step": 101490 + }, + { + "epoch": 5.0412238005364065, + "grad_norm": 0.1083984375, + "learning_rate": 0.00039670606933545245, + "loss": 0.4913, + "step": 101500 + }, + { + "epoch": 5.041720472832026, + "grad_norm": 0.107421875, + "learning_rate": 0.0003966663355518029, + "loss": 0.5108, + "step": 101510 + }, + { + "epoch": 5.042217145127645, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003966266017681534, + "loss": 0.5051, + "step": 101520 + }, + { + "epoch": 5.042713817423264, + "grad_norm": 0.1298828125, + "learning_rate": 0.00039658686798450387, + "loss": 0.5184, + "step": 101530 + }, + { + "epoch": 5.043210489718883, + "grad_norm": 0.10986328125, + "learning_rate": 0.0003965471342008543, + "loss": 0.4665, + "step": 101540 + }, + { + "epoch": 5.043707162014503, + "grad_norm": 0.1435546875, + "learning_rate": 0.00039650740041720475, + "loss": 0.522, + "step": 101550 + }, + { + "epoch": 5.044203834310122, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003964676666335552, + "loss": 0.4723, + "step": 101560 + }, + { + "epoch": 5.044700506605741, + "grad_norm": 0.1328125, + "learning_rate": 0.00039642793284990564, + "loss": 0.4845, + "step": 101570 + }, + { + "epoch": 5.045197178901361, + "grad_norm": 0.158203125, + "learning_rate": 0.00039638819906625606, + "loss": 0.4704, + "step": 101580 + }, + { + "epoch": 5.04569385119698, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003963484652826066, + "loss": 0.5048, + "step": 101590 + }, + { + "epoch": 5.0461905234926, + "grad_norm": 0.10986328125, + "learning_rate": 0.000396308731498957, + "loss": 0.5225, + "step": 101600 + }, + { + "epoch": 5.046687195788219, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003962689977153075, + "loss": 0.5385, + "step": 101610 + }, + { + "epoch": 5.047183868083838, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003962292639316579, + "loss": 0.4829, + "step": 101620 + }, + { + "epoch": 5.047680540379457, + "grad_norm": 0.111328125, + "learning_rate": 0.00039618953014800836, + "loss": 0.4772, + "step": 101630 + }, + { + "epoch": 5.048177212675077, + "grad_norm": 0.1494140625, + "learning_rate": 0.00039614979636435883, + "loss": 0.5052, + "step": 101640 + }, + { + "epoch": 5.048673884970697, + "grad_norm": 0.1083984375, + "learning_rate": 0.00039611006258070925, + "loss": 0.5336, + "step": 101650 + }, + { + "epoch": 5.049170557266316, + "grad_norm": 0.1240234375, + "learning_rate": 0.0003960703287970597, + "loss": 0.5237, + "step": 101660 + }, + { + "epoch": 5.049667229561935, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003960305950134102, + "loss": 0.5515, + "step": 101670 + }, + { + "epoch": 5.050163901857554, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003959908612297606, + "loss": 0.5257, + "step": 101680 + }, + { + "epoch": 5.050660574153174, + "grad_norm": 0.154296875, + "learning_rate": 0.0003959511274461111, + "loss": 0.5001, + "step": 101690 + }, + { + "epoch": 5.051157246448793, + "grad_norm": 0.1318359375, + "learning_rate": 0.00039591139366246155, + "loss": 0.5143, + "step": 101700 + }, + { + "epoch": 5.051653918744412, + "grad_norm": 0.11279296875, + "learning_rate": 0.00039587165987881197, + "loss": 0.5013, + "step": 101710 + }, + { + "epoch": 5.052150591040032, + "grad_norm": 0.11767578125, + "learning_rate": 0.00039583192609516244, + "loss": 0.5004, + "step": 101720 + }, + { + "epoch": 5.052647263335651, + "grad_norm": 0.11669921875, + "learning_rate": 0.00039579219231151286, + "loss": 0.515, + "step": 101730 + }, + { + "epoch": 5.053143935631271, + "grad_norm": 0.11376953125, + "learning_rate": 0.00039575245852786333, + "loss": 0.4932, + "step": 101740 + }, + { + "epoch": 5.05364060792689, + "grad_norm": 0.10400390625, + "learning_rate": 0.0003957127247442138, + "loss": 0.5008, + "step": 101750 + }, + { + "epoch": 5.054137280222509, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003956729909605642, + "loss": 0.5068, + "step": 101760 + }, + { + "epoch": 5.054633952518128, + "grad_norm": 0.11083984375, + "learning_rate": 0.0003956332571769147, + "loss": 0.4884, + "step": 101770 + }, + { + "epoch": 5.0551306248137475, + "grad_norm": 0.154296875, + "learning_rate": 0.00039559352339326516, + "loss": 0.5051, + "step": 101780 + }, + { + "epoch": 5.055627297109368, + "grad_norm": 0.115234375, + "learning_rate": 0.0003955537896096156, + "loss": 0.4967, + "step": 101790 + }, + { + "epoch": 5.056123969404987, + "grad_norm": 0.126953125, + "learning_rate": 0.00039551405582596605, + "loss": 0.5092, + "step": 101800 + }, + { + "epoch": 5.056620641700606, + "grad_norm": 0.1318359375, + "learning_rate": 0.00039547432204231647, + "loss": 0.52, + "step": 101810 + }, + { + "epoch": 5.057117313996225, + "grad_norm": 0.125, + "learning_rate": 0.000395434588258667, + "loss": 0.5318, + "step": 101820 + }, + { + "epoch": 5.0576139862918446, + "grad_norm": 0.1689453125, + "learning_rate": 0.0003953948544750174, + "loss": 0.4582, + "step": 101830 + }, + { + "epoch": 5.058110658587464, + "grad_norm": 0.130859375, + "learning_rate": 0.00039535512069136783, + "loss": 0.4796, + "step": 101840 + }, + { + "epoch": 5.058607330883083, + "grad_norm": 0.12109375, + "learning_rate": 0.0003953153869077183, + "loss": 0.52, + "step": 101850 + }, + { + "epoch": 5.059104003178703, + "grad_norm": 0.1904296875, + "learning_rate": 0.00039527565312406877, + "loss": 0.4915, + "step": 101860 + }, + { + "epoch": 5.059600675474322, + "grad_norm": 0.21875, + "learning_rate": 0.0003952359193404192, + "loss": 0.4914, + "step": 101870 + }, + { + "epoch": 5.060097347769942, + "grad_norm": 0.1279296875, + "learning_rate": 0.00039519618555676966, + "loss": 0.5179, + "step": 101880 + }, + { + "epoch": 5.060594020065561, + "grad_norm": 0.11279296875, + "learning_rate": 0.00039515645177312013, + "loss": 0.5114, + "step": 101890 + }, + { + "epoch": 5.06109069236118, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003951167179894706, + "loss": 0.5279, + "step": 101900 + }, + { + "epoch": 5.061587364656799, + "grad_norm": 0.1279296875, + "learning_rate": 0.000395076984205821, + "loss": 0.4745, + "step": 101910 + }, + { + "epoch": 5.0620840369524185, + "grad_norm": 0.1435546875, + "learning_rate": 0.00039503725042217144, + "loss": 0.4826, + "step": 101920 + }, + { + "epoch": 5.062580709248038, + "grad_norm": 0.1103515625, + "learning_rate": 0.00039499751663852196, + "loss": 0.5154, + "step": 101930 + }, + { + "epoch": 5.063077381543658, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003949577828548724, + "loss": 0.4944, + "step": 101940 + }, + { + "epoch": 5.063574053839277, + "grad_norm": 0.13671875, + "learning_rate": 0.0003949180490712228, + "loss": 0.5743, + "step": 101950 + }, + { + "epoch": 5.064070726134896, + "grad_norm": 0.11474609375, + "learning_rate": 0.00039487831528757327, + "loss": 0.5165, + "step": 101960 + }, + { + "epoch": 5.0645673984305155, + "grad_norm": 0.125, + "learning_rate": 0.00039483858150392374, + "loss": 0.4993, + "step": 101970 + }, + { + "epoch": 5.065064070726135, + "grad_norm": 0.11181640625, + "learning_rate": 0.0003947988477202742, + "loss": 0.4998, + "step": 101980 + }, + { + "epoch": 5.065560743021754, + "grad_norm": 0.1494140625, + "learning_rate": 0.0003947591139366246, + "loss": 0.478, + "step": 101990 + }, + { + "epoch": 5.066057415317373, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003947193801529751, + "loss": 0.5334, + "step": 102000 + }, + { + "epoch": 5.066554087612993, + "grad_norm": 0.11572265625, + "learning_rate": 0.00039467964636932557, + "loss": 0.5197, + "step": 102010 + }, + { + "epoch": 5.0670507599086125, + "grad_norm": 0.12158203125, + "learning_rate": 0.000394639912585676, + "loss": 0.5053, + "step": 102020 + }, + { + "epoch": 5.067547432204232, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003946001788020264, + "loss": 0.4691, + "step": 102030 + }, + { + "epoch": 5.068044104499851, + "grad_norm": 0.1103515625, + "learning_rate": 0.0003945604450183769, + "loss": 0.4815, + "step": 102040 + }, + { + "epoch": 5.06854077679547, + "grad_norm": 0.107421875, + "learning_rate": 0.00039452071123472735, + "loss": 0.4819, + "step": 102050 + }, + { + "epoch": 5.069037449091089, + "grad_norm": 0.11181640625, + "learning_rate": 0.0003944809774510778, + "loss": 0.4896, + "step": 102060 + }, + { + "epoch": 5.069534121386709, + "grad_norm": 0.12353515625, + "learning_rate": 0.00039444124366742824, + "loss": 0.4975, + "step": 102070 + }, + { + "epoch": 5.070030793682329, + "grad_norm": 0.1083984375, + "learning_rate": 0.0003944015098837787, + "loss": 0.487, + "step": 102080 + }, + { + "epoch": 5.070527465977948, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003943617761001292, + "loss": 0.5121, + "step": 102090 + }, + { + "epoch": 5.071024138273567, + "grad_norm": 0.123046875, + "learning_rate": 0.0003943220423164796, + "loss": 0.5056, + "step": 102100 + }, + { + "epoch": 5.0715208105691865, + "grad_norm": 0.150390625, + "learning_rate": 0.00039428230853283, + "loss": 0.5018, + "step": 102110 + }, + { + "epoch": 5.072017482864806, + "grad_norm": 0.130859375, + "learning_rate": 0.00039424257474918054, + "loss": 0.5073, + "step": 102120 + }, + { + "epoch": 5.072514155160425, + "grad_norm": 0.12158203125, + "learning_rate": 0.00039420284096553095, + "loss": 0.5065, + "step": 102130 + }, + { + "epoch": 5.073010827456044, + "grad_norm": 0.1142578125, + "learning_rate": 0.0003941631071818814, + "loss": 0.5004, + "step": 102140 + }, + { + "epoch": 5.073507499751664, + "grad_norm": 0.1201171875, + "learning_rate": 0.00039412337339823184, + "loss": 0.5063, + "step": 102150 + }, + { + "epoch": 5.0740041720472835, + "grad_norm": 0.11181640625, + "learning_rate": 0.0003940836396145823, + "loss": 0.4852, + "step": 102160 + }, + { + "epoch": 5.074500844342903, + "grad_norm": 0.14453125, + "learning_rate": 0.0003940439058309328, + "loss": 0.4731, + "step": 102170 + }, + { + "epoch": 5.074997516638522, + "grad_norm": 0.11181640625, + "learning_rate": 0.0003940041720472832, + "loss": 0.5157, + "step": 102180 + }, + { + "epoch": 5.075494188934141, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003939644382636337, + "loss": 0.5061, + "step": 102190 + }, + { + "epoch": 5.07599086122976, + "grad_norm": 0.1025390625, + "learning_rate": 0.00039392470447998415, + "loss": 0.503, + "step": 102200 + }, + { + "epoch": 5.07648753352538, + "grad_norm": 0.11865234375, + "learning_rate": 0.00039388497069633456, + "loss": 0.4641, + "step": 102210 + }, + { + "epoch": 5.076984205821, + "grad_norm": 0.1220703125, + "learning_rate": 0.00039384523691268503, + "loss": 0.4968, + "step": 102220 + }, + { + "epoch": 5.077480878116619, + "grad_norm": 0.146484375, + "learning_rate": 0.0003938055031290355, + "loss": 0.4932, + "step": 102230 + }, + { + "epoch": 5.077977550412238, + "grad_norm": 0.142578125, + "learning_rate": 0.0003937657693453859, + "loss": 0.5269, + "step": 102240 + }, + { + "epoch": 5.078474222707857, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003937260355617364, + "loss": 0.4944, + "step": 102250 + }, + { + "epoch": 5.078970895003477, + "grad_norm": 0.1787109375, + "learning_rate": 0.0003936863017780868, + "loss": 0.5135, + "step": 102260 + }, + { + "epoch": 5.079467567299096, + "grad_norm": 0.109375, + "learning_rate": 0.0003936465679944373, + "loss": 0.5092, + "step": 102270 + }, + { + "epoch": 5.079964239594715, + "grad_norm": 0.1123046875, + "learning_rate": 0.00039360683421078775, + "loss": 0.4748, + "step": 102280 + }, + { + "epoch": 5.080460911890334, + "grad_norm": 0.10888671875, + "learning_rate": 0.00039356710042713817, + "loss": 0.4965, + "step": 102290 + }, + { + "epoch": 5.0809575841859544, + "grad_norm": 0.115234375, + "learning_rate": 0.00039352736664348864, + "loss": 0.523, + "step": 102300 + }, + { + "epoch": 5.081454256481574, + "grad_norm": 0.10498046875, + "learning_rate": 0.0003934876328598391, + "loss": 0.5026, + "step": 102310 + }, + { + "epoch": 5.081950928777193, + "grad_norm": 0.1728515625, + "learning_rate": 0.00039344789907618953, + "loss": 0.5191, + "step": 102320 + }, + { + "epoch": 5.082447601072812, + "grad_norm": 0.1201171875, + "learning_rate": 0.00039340816529254, + "loss": 0.5199, + "step": 102330 + }, + { + "epoch": 5.082944273368431, + "grad_norm": 0.125, + "learning_rate": 0.0003933684315088904, + "loss": 0.5079, + "step": 102340 + }, + { + "epoch": 5.083440945664051, + "grad_norm": 0.130859375, + "learning_rate": 0.00039332869772524095, + "loss": 0.5021, + "step": 102350 + }, + { + "epoch": 5.08393761795967, + "grad_norm": 0.11376953125, + "learning_rate": 0.00039328896394159136, + "loss": 0.5217, + "step": 102360 + }, + { + "epoch": 5.08443429025529, + "grad_norm": 0.11083984375, + "learning_rate": 0.0003932492301579418, + "loss": 0.4928, + "step": 102370 + }, + { + "epoch": 5.084930962550909, + "grad_norm": 0.1123046875, + "learning_rate": 0.00039320949637429225, + "loss": 0.5058, + "step": 102380 + }, + { + "epoch": 5.085427634846528, + "grad_norm": 0.10791015625, + "learning_rate": 0.0003931697625906427, + "loss": 0.4907, + "step": 102390 + }, + { + "epoch": 5.085924307142148, + "grad_norm": 0.10107421875, + "learning_rate": 0.00039313002880699314, + "loss": 0.5038, + "step": 102400 + }, + { + "epoch": 5.086420979437767, + "grad_norm": 0.1240234375, + "learning_rate": 0.0003930902950233436, + "loss": 0.4868, + "step": 102410 + }, + { + "epoch": 5.086917651733386, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003930505612396941, + "loss": 0.4926, + "step": 102420 + }, + { + "epoch": 5.087414324029005, + "grad_norm": 0.162109375, + "learning_rate": 0.00039301082745604455, + "loss": 0.5046, + "step": 102430 + }, + { + "epoch": 5.087910996324625, + "grad_norm": 0.109375, + "learning_rate": 0.00039297109367239497, + "loss": 0.5194, + "step": 102440 + }, + { + "epoch": 5.088407668620245, + "grad_norm": 0.10595703125, + "learning_rate": 0.0003929313598887454, + "loss": 0.5021, + "step": 102450 + }, + { + "epoch": 5.088904340915864, + "grad_norm": 0.111328125, + "learning_rate": 0.0003928916261050959, + "loss": 0.5071, + "step": 102460 + }, + { + "epoch": 5.089401013211483, + "grad_norm": 0.1123046875, + "learning_rate": 0.00039285189232144633, + "loss": 0.5201, + "step": 102470 + }, + { + "epoch": 5.089897685507102, + "grad_norm": 0.119140625, + "learning_rate": 0.00039281215853779675, + "loss": 0.4912, + "step": 102480 + }, + { + "epoch": 5.0903943578027215, + "grad_norm": 0.171875, + "learning_rate": 0.0003927724247541472, + "loss": 0.5096, + "step": 102490 + }, + { + "epoch": 5.090891030098341, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003927326909704977, + "loss": 0.5001, + "step": 102500 + }, + { + "epoch": 5.091387702393961, + "grad_norm": 0.11767578125, + "learning_rate": 0.00039269295718684816, + "loss": 0.4984, + "step": 102510 + }, + { + "epoch": 5.09188437468958, + "grad_norm": 0.109375, + "learning_rate": 0.0003926532234031986, + "loss": 0.5196, + "step": 102520 + }, + { + "epoch": 5.092381046985199, + "grad_norm": 0.11181640625, + "learning_rate": 0.00039261348961954905, + "loss": 0.4902, + "step": 102530 + }, + { + "epoch": 5.092877719280819, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003925737558358995, + "loss": 0.4998, + "step": 102540 + }, + { + "epoch": 5.093374391576438, + "grad_norm": 0.10302734375, + "learning_rate": 0.00039253402205224994, + "loss": 0.4996, + "step": 102550 + }, + { + "epoch": 5.093871063872057, + "grad_norm": 0.11083984375, + "learning_rate": 0.00039249428826860036, + "loss": 0.4727, + "step": 102560 + }, + { + "epoch": 5.094367736167676, + "grad_norm": 0.111328125, + "learning_rate": 0.0003924545544849508, + "loss": 0.4791, + "step": 102570 + }, + { + "epoch": 5.094864408463296, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003924148207013013, + "loss": 0.5087, + "step": 102580 + }, + { + "epoch": 5.095361080758916, + "grad_norm": 0.1279296875, + "learning_rate": 0.00039237508691765177, + "loss": 0.4893, + "step": 102590 + }, + { + "epoch": 5.095857753054535, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003923353531340022, + "loss": 0.4865, + "step": 102600 + }, + { + "epoch": 5.096354425350154, + "grad_norm": 0.10888671875, + "learning_rate": 0.00039229561935035266, + "loss": 0.5286, + "step": 102610 + }, + { + "epoch": 5.096851097645773, + "grad_norm": 0.130859375, + "learning_rate": 0.00039225588556670313, + "loss": 0.491, + "step": 102620 + }, + { + "epoch": 5.0973477699413925, + "grad_norm": 0.1328125, + "learning_rate": 0.00039221615178305355, + "loss": 0.4991, + "step": 102630 + }, + { + "epoch": 5.097844442237012, + "grad_norm": 0.1494140625, + "learning_rate": 0.000392176417999404, + "loss": 0.4782, + "step": 102640 + }, + { + "epoch": 5.098341114532631, + "grad_norm": 0.11865234375, + "learning_rate": 0.0003921366842157545, + "loss": 0.4842, + "step": 102650 + }, + { + "epoch": 5.098837786828251, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003920969504321049, + "loss": 0.5013, + "step": 102660 + }, + { + "epoch": 5.09933445912387, + "grad_norm": 0.119140625, + "learning_rate": 0.0003920572166484554, + "loss": 0.5183, + "step": 102670 + }, + { + "epoch": 5.0998311314194895, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003920174828648058, + "loss": 0.5004, + "step": 102680 + }, + { + "epoch": 5.100327803715109, + "grad_norm": 0.1630859375, + "learning_rate": 0.00039197774908115627, + "loss": 0.5079, + "step": 102690 + }, + { + "epoch": 5.100824476010728, + "grad_norm": 0.126953125, + "learning_rate": 0.00039193801529750674, + "loss": 0.5287, + "step": 102700 + }, + { + "epoch": 5.101321148306347, + "grad_norm": 0.171875, + "learning_rate": 0.00039189828151385715, + "loss": 0.4842, + "step": 102710 + }, + { + "epoch": 5.101817820601966, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003918585477302076, + "loss": 0.496, + "step": 102720 + }, + { + "epoch": 5.1023144928975865, + "grad_norm": 0.10693359375, + "learning_rate": 0.0003918188139465581, + "loss": 0.4851, + "step": 102730 + }, + { + "epoch": 5.102811165193206, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003917790801629085, + "loss": 0.5089, + "step": 102740 + }, + { + "epoch": 5.103307837488825, + "grad_norm": 0.11962890625, + "learning_rate": 0.000391739346379259, + "loss": 0.5283, + "step": 102750 + }, + { + "epoch": 5.103804509784444, + "grad_norm": 0.166015625, + "learning_rate": 0.00039169961259560946, + "loss": 0.5042, + "step": 102760 + }, + { + "epoch": 5.1043011820800634, + "grad_norm": 0.1103515625, + "learning_rate": 0.0003916598788119599, + "loss": 0.5159, + "step": 102770 + }, + { + "epoch": 5.104797854375683, + "grad_norm": 0.10595703125, + "learning_rate": 0.00039162014502831035, + "loss": 0.4816, + "step": 102780 + }, + { + "epoch": 5.105294526671302, + "grad_norm": 0.162109375, + "learning_rate": 0.00039158041124466076, + "loss": 0.4739, + "step": 102790 + }, + { + "epoch": 5.105791198966922, + "grad_norm": 0.146484375, + "learning_rate": 0.0003915406774610113, + "loss": 0.4951, + "step": 102800 + }, + { + "epoch": 5.106287871262541, + "grad_norm": 0.1083984375, + "learning_rate": 0.0003915009436773617, + "loss": 0.5007, + "step": 102810 + }, + { + "epoch": 5.1067845435581605, + "grad_norm": 0.10986328125, + "learning_rate": 0.0003914612098937121, + "loss": 0.531, + "step": 102820 + }, + { + "epoch": 5.10728121585378, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003914214761100626, + "loss": 0.4928, + "step": 102830 + }, + { + "epoch": 5.107777888149399, + "grad_norm": 0.1259765625, + "learning_rate": 0.00039138174232641307, + "loss": 0.5146, + "step": 102840 + }, + { + "epoch": 5.108274560445018, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003913420085427635, + "loss": 0.5104, + "step": 102850 + }, + { + "epoch": 5.108771232740637, + "grad_norm": 0.12109375, + "learning_rate": 0.00039130227475911395, + "loss": 0.5028, + "step": 102860 + }, + { + "epoch": 5.1092679050362575, + "grad_norm": 0.11083984375, + "learning_rate": 0.00039126254097546437, + "loss": 0.5051, + "step": 102870 + }, + { + "epoch": 5.109764577331877, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003912228071918149, + "loss": 0.4709, + "step": 102880 + }, + { + "epoch": 5.110261249627496, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003911830734081653, + "loss": 0.5024, + "step": 102890 + }, + { + "epoch": 5.110757921923115, + "grad_norm": 0.1484375, + "learning_rate": 0.00039114333962451573, + "loss": 0.5313, + "step": 102900 + }, + { + "epoch": 5.111254594218734, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003911036058408662, + "loss": 0.4929, + "step": 102910 + }, + { + "epoch": 5.111751266514354, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003910638720572167, + "loss": 0.515, + "step": 102920 + }, + { + "epoch": 5.112247938809973, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003910241382735671, + "loss": 0.5006, + "step": 102930 + }, + { + "epoch": 5.112744611105592, + "grad_norm": 0.126953125, + "learning_rate": 0.00039098440448991756, + "loss": 0.5125, + "step": 102940 + }, + { + "epoch": 5.113241283401212, + "grad_norm": 0.16015625, + "learning_rate": 0.00039094467070626803, + "loss": 0.5217, + "step": 102950 + }, + { + "epoch": 5.113737955696831, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003909049369226185, + "loss": 0.4785, + "step": 102960 + }, + { + "epoch": 5.114234627992451, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003908652031389689, + "loss": 0.5008, + "step": 102970 + }, + { + "epoch": 5.11473130028807, + "grad_norm": 0.1083984375, + "learning_rate": 0.00039082546935531934, + "loss": 0.4956, + "step": 102980 + }, + { + "epoch": 5.115227972583689, + "grad_norm": 0.12255859375, + "learning_rate": 0.00039078573557166987, + "loss": 0.508, + "step": 102990 + }, + { + "epoch": 5.115724644879308, + "grad_norm": 0.103515625, + "learning_rate": 0.0003907460017880203, + "loss": 0.4925, + "step": 103000 + }, + { + "epoch": 5.116221317174928, + "grad_norm": 0.1279296875, + "learning_rate": 0.00039070626800437075, + "loss": 0.4951, + "step": 103010 + }, + { + "epoch": 5.116717989470548, + "grad_norm": 0.12890625, + "learning_rate": 0.00039066653422072117, + "loss": 0.4956, + "step": 103020 + }, + { + "epoch": 5.117214661766167, + "grad_norm": 0.1298828125, + "learning_rate": 0.00039062680043707164, + "loss": 0.478, + "step": 103030 + }, + { + "epoch": 5.117711334061786, + "grad_norm": 0.11328125, + "learning_rate": 0.0003905870666534221, + "loss": 0.5068, + "step": 103040 + }, + { + "epoch": 5.118208006357405, + "grad_norm": 0.130859375, + "learning_rate": 0.00039054733286977253, + "loss": 0.4952, + "step": 103050 + }, + { + "epoch": 5.118704678653025, + "grad_norm": 0.11669921875, + "learning_rate": 0.000390507599086123, + "loss": 0.5261, + "step": 103060 + }, + { + "epoch": 5.119201350948644, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003904678653024735, + "loss": 0.5127, + "step": 103070 + }, + { + "epoch": 5.119698023244263, + "grad_norm": 0.12890625, + "learning_rate": 0.0003904281315188239, + "loss": 0.5266, + "step": 103080 + }, + { + "epoch": 5.120194695539883, + "grad_norm": 0.11376953125, + "learning_rate": 0.00039038839773517436, + "loss": 0.5181, + "step": 103090 + }, + { + "epoch": 5.120691367835502, + "grad_norm": 0.11669921875, + "learning_rate": 0.00039034866395152483, + "loss": 0.4815, + "step": 103100 + }, + { + "epoch": 5.121188040131122, + "grad_norm": 0.154296875, + "learning_rate": 0.00039030893016787525, + "loss": 0.5016, + "step": 103110 + }, + { + "epoch": 5.121684712426741, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003902691963842257, + "loss": 0.5309, + "step": 103120 + }, + { + "epoch": 5.12218138472236, + "grad_norm": 0.1162109375, + "learning_rate": 0.00039022946260057614, + "loss": 0.4882, + "step": 103130 + }, + { + "epoch": 5.122678057017979, + "grad_norm": 0.138671875, + "learning_rate": 0.0003901897288169266, + "loss": 0.4995, + "step": 103140 + }, + { + "epoch": 5.1231747293135985, + "grad_norm": 0.1171875, + "learning_rate": 0.0003901499950332771, + "loss": 0.486, + "step": 103150 + }, + { + "epoch": 5.123671401609219, + "grad_norm": 0.150390625, + "learning_rate": 0.0003901102612496275, + "loss": 0.5112, + "step": 103160 + }, + { + "epoch": 5.124168073904838, + "grad_norm": 0.12255859375, + "learning_rate": 0.00039007052746597797, + "loss": 0.5025, + "step": 103170 + }, + { + "epoch": 5.124664746200457, + "grad_norm": 0.126953125, + "learning_rate": 0.00039003079368232844, + "loss": 0.4685, + "step": 103180 + }, + { + "epoch": 5.125161418496076, + "grad_norm": 0.1318359375, + "learning_rate": 0.00038999105989867886, + "loss": 0.5198, + "step": 103190 + }, + { + "epoch": 5.1256580907916955, + "grad_norm": 0.11767578125, + "learning_rate": 0.00038995132611502933, + "loss": 0.5119, + "step": 103200 + }, + { + "epoch": 5.126154763087315, + "grad_norm": 0.119140625, + "learning_rate": 0.00038991159233137975, + "loss": 0.5381, + "step": 103210 + }, + { + "epoch": 5.126651435382934, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003898718585477302, + "loss": 0.5304, + "step": 103220 + }, + { + "epoch": 5.127148107678554, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003898321247640807, + "loss": 0.4936, + "step": 103230 + }, + { + "epoch": 5.127644779974173, + "grad_norm": 0.15625, + "learning_rate": 0.0003897923909804311, + "loss": 0.4661, + "step": 103240 + }, + { + "epoch": 5.128141452269793, + "grad_norm": 0.111328125, + "learning_rate": 0.0003897526571967816, + "loss": 0.5276, + "step": 103250 + }, + { + "epoch": 5.128638124565412, + "grad_norm": 0.111328125, + "learning_rate": 0.00038971292341313205, + "loss": 0.4815, + "step": 103260 + }, + { + "epoch": 5.129134796861031, + "grad_norm": 0.12109375, + "learning_rate": 0.00038967318962948247, + "loss": 0.5073, + "step": 103270 + }, + { + "epoch": 5.12963146915665, + "grad_norm": 0.134765625, + "learning_rate": 0.00038963345584583294, + "loss": 0.4962, + "step": 103280 + }, + { + "epoch": 5.1301281414522695, + "grad_norm": 0.12890625, + "learning_rate": 0.0003895937220621834, + "loss": 0.5054, + "step": 103290 + }, + { + "epoch": 5.13062481374789, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003895539882785338, + "loss": 0.4623, + "step": 103300 + }, + { + "epoch": 5.131121486043509, + "grad_norm": 0.11279296875, + "learning_rate": 0.0003895142544948843, + "loss": 0.5243, + "step": 103310 + }, + { + "epoch": 5.131618158339128, + "grad_norm": 0.12890625, + "learning_rate": 0.0003894745207112347, + "loss": 0.5063, + "step": 103320 + }, + { + "epoch": 5.132114830634747, + "grad_norm": 0.1591796875, + "learning_rate": 0.00038943478692758524, + "loss": 0.512, + "step": 103330 + }, + { + "epoch": 5.1326115029303665, + "grad_norm": 0.11865234375, + "learning_rate": 0.00038939505314393566, + "loss": 0.4908, + "step": 103340 + }, + { + "epoch": 5.133108175225986, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003893553193602861, + "loss": 0.4813, + "step": 103350 + }, + { + "epoch": 5.133604847521605, + "grad_norm": 0.11474609375, + "learning_rate": 0.00038931558557663655, + "loss": 0.5134, + "step": 103360 + }, + { + "epoch": 5.134101519817224, + "grad_norm": 0.1376953125, + "learning_rate": 0.000389275851792987, + "loss": 0.4953, + "step": 103370 + }, + { + "epoch": 5.134598192112844, + "grad_norm": 0.111328125, + "learning_rate": 0.00038923611800933743, + "loss": 0.479, + "step": 103380 + }, + { + "epoch": 5.1350948644084635, + "grad_norm": 0.185546875, + "learning_rate": 0.0003891963842256879, + "loss": 0.4816, + "step": 103390 + }, + { + "epoch": 5.135591536704083, + "grad_norm": 0.1201171875, + "learning_rate": 0.0003891566504420384, + "loss": 0.4737, + "step": 103400 + }, + { + "epoch": 5.136088208999702, + "grad_norm": 0.126953125, + "learning_rate": 0.00038911691665838885, + "loss": 0.4755, + "step": 103410 + }, + { + "epoch": 5.136584881295321, + "grad_norm": 0.1396484375, + "learning_rate": 0.00038907718287473927, + "loss": 0.5256, + "step": 103420 + }, + { + "epoch": 5.13708155359094, + "grad_norm": 0.1455078125, + "learning_rate": 0.0003890374490910897, + "loss": 0.5065, + "step": 103430 + }, + { + "epoch": 5.13757822588656, + "grad_norm": 0.12890625, + "learning_rate": 0.00038899771530744015, + "loss": 0.5308, + "step": 103440 + }, + { + "epoch": 5.13807489818218, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003889579815237906, + "loss": 0.5085, + "step": 103450 + }, + { + "epoch": 5.138571570477799, + "grad_norm": 0.10693359375, + "learning_rate": 0.0003889182477401411, + "loss": 0.4964, + "step": 103460 + }, + { + "epoch": 5.139068242773418, + "grad_norm": 0.1083984375, + "learning_rate": 0.0003888785139564915, + "loss": 0.4791, + "step": 103470 + }, + { + "epoch": 5.1395649150690375, + "grad_norm": 0.12109375, + "learning_rate": 0.000388838780172842, + "loss": 0.5215, + "step": 103480 + }, + { + "epoch": 5.140061587364657, + "grad_norm": 0.125, + "learning_rate": 0.00038879904638919246, + "loss": 0.4902, + "step": 103490 + }, + { + "epoch": 5.140558259660276, + "grad_norm": 0.10986328125, + "learning_rate": 0.0003887593126055429, + "loss": 0.5147, + "step": 103500 + }, + { + "epoch": 5.141054931955895, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003887195788218933, + "loss": 0.468, + "step": 103510 + }, + { + "epoch": 5.141551604251515, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003886798450382438, + "loss": 0.4909, + "step": 103520 + }, + { + "epoch": 5.1420482765471345, + "grad_norm": 0.11083984375, + "learning_rate": 0.00038864011125459423, + "loss": 0.5078, + "step": 103530 + }, + { + "epoch": 5.142544948842754, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003886003774709447, + "loss": 0.511, + "step": 103540 + }, + { + "epoch": 5.143041621138373, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003885606436872951, + "loss": 0.5178, + "step": 103550 + }, + { + "epoch": 5.143538293433992, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003885209099036456, + "loss": 0.4828, + "step": 103560 + }, + { + "epoch": 5.144034965729611, + "grad_norm": 0.1318359375, + "learning_rate": 0.00038848117611999607, + "loss": 0.4752, + "step": 103570 + }, + { + "epoch": 5.144531638025231, + "grad_norm": 0.125, + "learning_rate": 0.0003884414423363465, + "loss": 0.4961, + "step": 103580 + }, + { + "epoch": 5.145028310320851, + "grad_norm": 0.1123046875, + "learning_rate": 0.00038840170855269695, + "loss": 0.5233, + "step": 103590 + }, + { + "epoch": 5.14552498261647, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003883619747690474, + "loss": 0.5459, + "step": 103600 + }, + { + "epoch": 5.146021654912089, + "grad_norm": 0.115234375, + "learning_rate": 0.00038832224098539784, + "loss": 0.5349, + "step": 103610 + }, + { + "epoch": 5.146518327207708, + "grad_norm": 0.115234375, + "learning_rate": 0.0003882825072017483, + "loss": 0.5141, + "step": 103620 + }, + { + "epoch": 5.147014999503328, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003882427734180988, + "loss": 0.5031, + "step": 103630 + }, + { + "epoch": 5.147511671798947, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003882030396344492, + "loss": 0.5, + "step": 103640 + }, + { + "epoch": 5.148008344094566, + "grad_norm": 0.134765625, + "learning_rate": 0.0003881633058507997, + "loss": 0.4862, + "step": 103650 + }, + { + "epoch": 5.148505016390185, + "grad_norm": 0.11865234375, + "learning_rate": 0.0003881235720671501, + "loss": 0.5254, + "step": 103660 + }, + { + "epoch": 5.149001688685805, + "grad_norm": 0.134765625, + "learning_rate": 0.00038808383828350056, + "loss": 0.5075, + "step": 103670 + }, + { + "epoch": 5.149498360981425, + "grad_norm": 0.1796875, + "learning_rate": 0.00038804410449985103, + "loss": 0.5022, + "step": 103680 + }, + { + "epoch": 5.149995033277044, + "grad_norm": 0.1806640625, + "learning_rate": 0.00038800437071620145, + "loss": 0.5217, + "step": 103690 + }, + { + "epoch": 5.150491705572663, + "grad_norm": 0.1748046875, + "learning_rate": 0.0003879646369325519, + "loss": 0.4857, + "step": 103700 + }, + { + "epoch": 5.150988377868282, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003879249031489024, + "loss": 0.5378, + "step": 103710 + }, + { + "epoch": 5.151485050163902, + "grad_norm": 0.1787109375, + "learning_rate": 0.0003878851693652528, + "loss": 0.4913, + "step": 103720 + }, + { + "epoch": 5.151981722459521, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003878454355816033, + "loss": 0.4918, + "step": 103730 + }, + { + "epoch": 5.152478394755141, + "grad_norm": 0.1044921875, + "learning_rate": 0.0003878057017979537, + "loss": 0.5136, + "step": 103740 + }, + { + "epoch": 5.15297506705076, + "grad_norm": 0.13671875, + "learning_rate": 0.00038776596801430417, + "loss": 0.4936, + "step": 103750 + }, + { + "epoch": 5.153471739346379, + "grad_norm": 0.1220703125, + "learning_rate": 0.00038772623423065464, + "loss": 0.4903, + "step": 103760 + }, + { + "epoch": 5.153968411641999, + "grad_norm": 0.11767578125, + "learning_rate": 0.00038768650044700506, + "loss": 0.5015, + "step": 103770 + }, + { + "epoch": 5.154465083937618, + "grad_norm": 0.126953125, + "learning_rate": 0.00038764676666335553, + "loss": 0.486, + "step": 103780 + }, + { + "epoch": 5.154961756233237, + "grad_norm": 0.1171875, + "learning_rate": 0.000387607032879706, + "loss": 0.4983, + "step": 103790 + }, + { + "epoch": 5.155458428528856, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003875672990960564, + "loss": 0.5041, + "step": 103800 + }, + { + "epoch": 5.155955100824476, + "grad_norm": 0.11328125, + "learning_rate": 0.0003875275653124069, + "loss": 0.5199, + "step": 103810 + }, + { + "epoch": 5.156451773120096, + "grad_norm": 0.1201171875, + "learning_rate": 0.00038748783152875736, + "loss": 0.4944, + "step": 103820 + }, + { + "epoch": 5.156948445415715, + "grad_norm": 0.10107421875, + "learning_rate": 0.0003874480977451078, + "loss": 0.5237, + "step": 103830 + }, + { + "epoch": 5.157445117711334, + "grad_norm": 0.1455078125, + "learning_rate": 0.00038740836396145825, + "loss": 0.4921, + "step": 103840 + }, + { + "epoch": 5.157941790006953, + "grad_norm": 0.150390625, + "learning_rate": 0.00038736863017780867, + "loss": 0.5085, + "step": 103850 + }, + { + "epoch": 5.1584384623025725, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003873288963941592, + "loss": 0.5085, + "step": 103860 + }, + { + "epoch": 5.158935134598192, + "grad_norm": 0.10498046875, + "learning_rate": 0.0003872891626105096, + "loss": 0.4656, + "step": 103870 + }, + { + "epoch": 5.159431806893812, + "grad_norm": 0.10205078125, + "learning_rate": 0.00038724942882686, + "loss": 0.4783, + "step": 103880 + }, + { + "epoch": 5.159928479189431, + "grad_norm": 0.11181640625, + "learning_rate": 0.0003872096950432105, + "loss": 0.5244, + "step": 103890 + }, + { + "epoch": 5.16042515148505, + "grad_norm": 0.11962890625, + "learning_rate": 0.00038716996125956097, + "loss": 0.4822, + "step": 103900 + }, + { + "epoch": 5.1609218237806695, + "grad_norm": 0.1201171875, + "learning_rate": 0.00038713022747591144, + "loss": 0.4976, + "step": 103910 + }, + { + "epoch": 5.161418496076289, + "grad_norm": 0.11572265625, + "learning_rate": 0.00038709049369226186, + "loss": 0.52, + "step": 103920 + }, + { + "epoch": 5.161915168371908, + "grad_norm": 0.15625, + "learning_rate": 0.00038705075990861233, + "loss": 0.4899, + "step": 103930 + }, + { + "epoch": 5.162411840667527, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003870110261249628, + "loss": 0.509, + "step": 103940 + }, + { + "epoch": 5.162908512963147, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003869712923413132, + "loss": 0.5289, + "step": 103950 + }, + { + "epoch": 5.163405185258767, + "grad_norm": 0.1796875, + "learning_rate": 0.00038693155855766363, + "loss": 0.516, + "step": 103960 + }, + { + "epoch": 5.163901857554386, + "grad_norm": 0.1171875, + "learning_rate": 0.0003868918247740141, + "loss": 0.5035, + "step": 103970 + }, + { + "epoch": 5.164398529850005, + "grad_norm": 0.12890625, + "learning_rate": 0.0003868520909903646, + "loss": 0.4768, + "step": 103980 + }, + { + "epoch": 5.164895202145624, + "grad_norm": 0.115234375, + "learning_rate": 0.00038681235720671505, + "loss": 0.5205, + "step": 103990 + }, + { + "epoch": 5.1653918744412435, + "grad_norm": 0.11962890625, + "learning_rate": 0.00038677262342306547, + "loss": 0.4778, + "step": 104000 + }, + { + "epoch": 5.165888546736863, + "grad_norm": 0.138671875, + "learning_rate": 0.00038673288963941594, + "loss": 0.507, + "step": 104010 + }, + { + "epoch": 5.166385219032483, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003866931558557664, + "loss": 0.4865, + "step": 104020 + }, + { + "epoch": 5.166881891328102, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003866534220721168, + "loss": 0.513, + "step": 104030 + }, + { + "epoch": 5.167378563623721, + "grad_norm": 0.1044921875, + "learning_rate": 0.00038661368828846724, + "loss": 0.498, + "step": 104040 + }, + { + "epoch": 5.1678752359193405, + "grad_norm": 0.1044921875, + "learning_rate": 0.00038657395450481777, + "loss": 0.4811, + "step": 104050 + }, + { + "epoch": 5.16837190821496, + "grad_norm": 0.1435546875, + "learning_rate": 0.0003865342207211682, + "loss": 0.516, + "step": 104060 + }, + { + "epoch": 5.168868580510579, + "grad_norm": 0.12353515625, + "learning_rate": 0.00038649448693751866, + "loss": 0.4896, + "step": 104070 + }, + { + "epoch": 5.169365252806198, + "grad_norm": 0.162109375, + "learning_rate": 0.0003864547531538691, + "loss": 0.4965, + "step": 104080 + }, + { + "epoch": 5.169861925101817, + "grad_norm": 0.1328125, + "learning_rate": 0.00038641501937021955, + "loss": 0.5355, + "step": 104090 + }, + { + "epoch": 5.1703585973974375, + "grad_norm": 0.138671875, + "learning_rate": 0.00038637528558657, + "loss": 0.4911, + "step": 104100 + }, + { + "epoch": 5.170855269693057, + "grad_norm": 0.10693359375, + "learning_rate": 0.00038633555180292043, + "loss": 0.4893, + "step": 104110 + }, + { + "epoch": 5.171351941988676, + "grad_norm": 0.177734375, + "learning_rate": 0.0003862958180192709, + "loss": 0.5215, + "step": 104120 + }, + { + "epoch": 5.171848614284295, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003862560842356214, + "loss": 0.4862, + "step": 104130 + }, + { + "epoch": 5.172345286579914, + "grad_norm": 0.13671875, + "learning_rate": 0.0003862163504519718, + "loss": 0.5223, + "step": 104140 + }, + { + "epoch": 5.172841958875534, + "grad_norm": 0.1240234375, + "learning_rate": 0.00038617661666832227, + "loss": 0.5009, + "step": 104150 + }, + { + "epoch": 5.173338631171153, + "grad_norm": 0.1318359375, + "learning_rate": 0.00038613688288467274, + "loss": 0.5038, + "step": 104160 + }, + { + "epoch": 5.173835303466773, + "grad_norm": 0.11083984375, + "learning_rate": 0.00038609714910102315, + "loss": 0.503, + "step": 104170 + }, + { + "epoch": 5.174331975762392, + "grad_norm": 0.1123046875, + "learning_rate": 0.0003860574153173736, + "loss": 0.5017, + "step": 104180 + }, + { + "epoch": 5.1748286480580115, + "grad_norm": 0.11669921875, + "learning_rate": 0.00038601768153372404, + "loss": 0.5268, + "step": 104190 + }, + { + "epoch": 5.175325320353631, + "grad_norm": 0.171875, + "learning_rate": 0.0003859779477500745, + "loss": 0.5443, + "step": 104200 + }, + { + "epoch": 5.17582199264925, + "grad_norm": 0.126953125, + "learning_rate": 0.000385938213966425, + "loss": 0.4868, + "step": 104210 + }, + { + "epoch": 5.176318664944869, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003858984801827754, + "loss": 0.5093, + "step": 104220 + }, + { + "epoch": 5.176815337240488, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003858587463991259, + "loss": 0.4801, + "step": 104230 + }, + { + "epoch": 5.1773120095361085, + "grad_norm": 0.138671875, + "learning_rate": 0.00038581901261547635, + "loss": 0.5064, + "step": 104240 + }, + { + "epoch": 5.177808681831728, + "grad_norm": 0.1044921875, + "learning_rate": 0.00038577927883182676, + "loss": 0.5123, + "step": 104250 + }, + { + "epoch": 5.178305354127347, + "grad_norm": 0.11767578125, + "learning_rate": 0.00038573954504817723, + "loss": 0.4905, + "step": 104260 + }, + { + "epoch": 5.178802026422966, + "grad_norm": 0.11328125, + "learning_rate": 0.00038569981126452765, + "loss": 0.4858, + "step": 104270 + }, + { + "epoch": 5.179298698718585, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003856600774808782, + "loss": 0.4955, + "step": 104280 + }, + { + "epoch": 5.179795371014205, + "grad_norm": 0.1123046875, + "learning_rate": 0.0003856203436972286, + "loss": 0.4959, + "step": 104290 + }, + { + "epoch": 5.180292043309824, + "grad_norm": 0.126953125, + "learning_rate": 0.000385580609913579, + "loss": 0.4797, + "step": 104300 + }, + { + "epoch": 5.180788715605443, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003855408761299295, + "loss": 0.5123, + "step": 104310 + }, + { + "epoch": 5.181285387901063, + "grad_norm": 0.11962890625, + "learning_rate": 0.00038550114234627995, + "loss": 0.5286, + "step": 104320 + }, + { + "epoch": 5.181782060196682, + "grad_norm": 0.1337890625, + "learning_rate": 0.00038546140856263037, + "loss": 0.5173, + "step": 104330 + }, + { + "epoch": 5.182278732492302, + "grad_norm": 0.12109375, + "learning_rate": 0.00038542167477898084, + "loss": 0.5002, + "step": 104340 + }, + { + "epoch": 5.182775404787921, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003853819409953313, + "loss": 0.5172, + "step": 104350 + }, + { + "epoch": 5.18327207708354, + "grad_norm": 0.11328125, + "learning_rate": 0.0003853422072116818, + "loss": 0.5017, + "step": 104360 + }, + { + "epoch": 5.183768749379159, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003853024734280322, + "loss": 0.5134, + "step": 104370 + }, + { + "epoch": 5.1842654216747786, + "grad_norm": 0.1484375, + "learning_rate": 0.0003852627396443826, + "loss": 0.5253, + "step": 104380 + }, + { + "epoch": 5.184762093970399, + "grad_norm": 0.12353515625, + "learning_rate": 0.00038522300586073314, + "loss": 0.4886, + "step": 104390 + }, + { + "epoch": 5.185258766266018, + "grad_norm": 0.130859375, + "learning_rate": 0.00038518327207708356, + "loss": 0.549, + "step": 104400 + }, + { + "epoch": 5.185755438561637, + "grad_norm": 0.212890625, + "learning_rate": 0.000385143538293434, + "loss": 0.5098, + "step": 104410 + }, + { + "epoch": 5.186252110857256, + "grad_norm": 0.130859375, + "learning_rate": 0.00038510380450978445, + "loss": 0.5156, + "step": 104420 + }, + { + "epoch": 5.186748783152876, + "grad_norm": 0.12109375, + "learning_rate": 0.0003850640707261349, + "loss": 0.5012, + "step": 104430 + }, + { + "epoch": 5.187245455448495, + "grad_norm": 0.115234375, + "learning_rate": 0.0003850243369424854, + "loss": 0.4997, + "step": 104440 + }, + { + "epoch": 5.187742127744114, + "grad_norm": 0.15234375, + "learning_rate": 0.0003849846031588358, + "loss": 0.5118, + "step": 104450 + }, + { + "epoch": 5.188238800039734, + "grad_norm": 0.11328125, + "learning_rate": 0.0003849448693751863, + "loss": 0.4779, + "step": 104460 + }, + { + "epoch": 5.188735472335353, + "grad_norm": 0.1279296875, + "learning_rate": 0.00038490513559153675, + "loss": 0.5122, + "step": 104470 + }, + { + "epoch": 5.189232144630973, + "grad_norm": 0.11865234375, + "learning_rate": 0.00038486540180788717, + "loss": 0.5162, + "step": 104480 + }, + { + "epoch": 5.189728816926592, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003848256680242376, + "loss": 0.5082, + "step": 104490 + }, + { + "epoch": 5.190225489222211, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003847859342405881, + "loss": 0.5018, + "step": 104500 + }, + { + "epoch": 5.19072216151783, + "grad_norm": 0.11767578125, + "learning_rate": 0.00038474620045693853, + "loss": 0.5197, + "step": 104510 + }, + { + "epoch": 5.1912188338134495, + "grad_norm": 0.11669921875, + "learning_rate": 0.000384706466673289, + "loss": 0.4853, + "step": 104520 + }, + { + "epoch": 5.19171550610907, + "grad_norm": 0.1171875, + "learning_rate": 0.0003846667328896394, + "loss": 0.5236, + "step": 104530 + }, + { + "epoch": 5.192212178404689, + "grad_norm": 0.126953125, + "learning_rate": 0.0003846269991059899, + "loss": 0.5041, + "step": 104540 + }, + { + "epoch": 5.192708850700308, + "grad_norm": 0.1103515625, + "learning_rate": 0.00038458726532234036, + "loss": 0.5011, + "step": 104550 + }, + { + "epoch": 5.193205522995927, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003845475315386908, + "loss": 0.4773, + "step": 104560 + }, + { + "epoch": 5.1937021952915465, + "grad_norm": 0.1328125, + "learning_rate": 0.0003845077977550412, + "loss": 0.5185, + "step": 104570 + }, + { + "epoch": 5.194198867587166, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003844680639713917, + "loss": 0.5093, + "step": 104580 + }, + { + "epoch": 5.194695539882785, + "grad_norm": 0.1298828125, + "learning_rate": 0.00038442833018774214, + "loss": 0.4653, + "step": 104590 + }, + { + "epoch": 5.195192212178405, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003843885964040926, + "loss": 0.5008, + "step": 104600 + }, + { + "epoch": 5.195688884474024, + "grad_norm": 0.1318359375, + "learning_rate": 0.000384348862620443, + "loss": 0.4891, + "step": 104610 + }, + { + "epoch": 5.1961855567696436, + "grad_norm": 0.14453125, + "learning_rate": 0.0003843091288367935, + "loss": 0.5258, + "step": 104620 + }, + { + "epoch": 5.196682229065263, + "grad_norm": 0.1240234375, + "learning_rate": 0.00038426939505314397, + "loss": 0.5075, + "step": 104630 + }, + { + "epoch": 5.197178901360882, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003842296612694944, + "loss": 0.5251, + "step": 104640 + }, + { + "epoch": 5.197675573656501, + "grad_norm": 0.10107421875, + "learning_rate": 0.00038418992748584486, + "loss": 0.5075, + "step": 104650 + }, + { + "epoch": 5.1981722459521205, + "grad_norm": 0.169921875, + "learning_rate": 0.00038415019370219533, + "loss": 0.4976, + "step": 104660 + }, + { + "epoch": 5.198668918247741, + "grad_norm": 0.1689453125, + "learning_rate": 0.00038411045991854575, + "loss": 0.4958, + "step": 104670 + }, + { + "epoch": 5.19916559054336, + "grad_norm": 0.1494140625, + "learning_rate": 0.0003840707261348962, + "loss": 0.5061, + "step": 104680 + }, + { + "epoch": 5.199662262838979, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003840309923512467, + "loss": 0.5057, + "step": 104690 + }, + { + "epoch": 5.200158935134598, + "grad_norm": 0.11328125, + "learning_rate": 0.0003839912585675971, + "loss": 0.5427, + "step": 104700 + }, + { + "epoch": 5.2006556074302175, + "grad_norm": 0.10986328125, + "learning_rate": 0.0003839515247839476, + "loss": 0.4838, + "step": 104710 + }, + { + "epoch": 5.201152279725837, + "grad_norm": 0.130859375, + "learning_rate": 0.000383911791000298, + "loss": 0.5102, + "step": 104720 + }, + { + "epoch": 5.201648952021456, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003838720572166485, + "loss": 0.4762, + "step": 104730 + }, + { + "epoch": 5.202145624317075, + "grad_norm": 0.12890625, + "learning_rate": 0.00038383232343299894, + "loss": 0.5001, + "step": 104740 + }, + { + "epoch": 5.202642296612695, + "grad_norm": 0.13671875, + "learning_rate": 0.00038379258964934935, + "loss": 0.5035, + "step": 104750 + }, + { + "epoch": 5.2031389689083145, + "grad_norm": 0.1533203125, + "learning_rate": 0.0003837528558656998, + "loss": 0.5256, + "step": 104760 + }, + { + "epoch": 5.203635641203934, + "grad_norm": 0.1162109375, + "learning_rate": 0.0003837131220820503, + "loss": 0.4938, + "step": 104770 + }, + { + "epoch": 5.204132313499553, + "grad_norm": 0.134765625, + "learning_rate": 0.0003836733882984007, + "loss": 0.5047, + "step": 104780 + }, + { + "epoch": 5.204628985795172, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003836336545147512, + "loss": 0.4969, + "step": 104790 + }, + { + "epoch": 5.205125658090791, + "grad_norm": 0.119140625, + "learning_rate": 0.00038359392073110166, + "loss": 0.5321, + "step": 104800 + }, + { + "epoch": 5.205622330386411, + "grad_norm": 0.1533203125, + "learning_rate": 0.00038355418694745213, + "loss": 0.49, + "step": 104810 + }, + { + "epoch": 5.206119002682031, + "grad_norm": 0.1220703125, + "learning_rate": 0.00038351445316380255, + "loss": 0.4994, + "step": 104820 + }, + { + "epoch": 5.20661567497765, + "grad_norm": 0.14453125, + "learning_rate": 0.00038347471938015296, + "loss": 0.494, + "step": 104830 + }, + { + "epoch": 5.207112347273269, + "grad_norm": 0.1123046875, + "learning_rate": 0.00038343498559650343, + "loss": 0.5085, + "step": 104840 + }, + { + "epoch": 5.207609019568888, + "grad_norm": 0.10693359375, + "learning_rate": 0.0003833952518128539, + "loss": 0.4951, + "step": 104850 + }, + { + "epoch": 5.208105691864508, + "grad_norm": 0.10693359375, + "learning_rate": 0.0003833555180292043, + "loss": 0.4921, + "step": 104860 + }, + { + "epoch": 5.208602364160127, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003833157842455548, + "loss": 0.4962, + "step": 104870 + }, + { + "epoch": 5.209099036455746, + "grad_norm": 0.1279296875, + "learning_rate": 0.00038327605046190526, + "loss": 0.4833, + "step": 104880 + }, + { + "epoch": 5.209595708751366, + "grad_norm": 0.11376953125, + "learning_rate": 0.00038323631667825574, + "loss": 0.5146, + "step": 104890 + }, + { + "epoch": 5.2100923810469855, + "grad_norm": 0.1337890625, + "learning_rate": 0.00038319658289460615, + "loss": 0.5261, + "step": 104900 + }, + { + "epoch": 5.210589053342605, + "grad_norm": 0.11376953125, + "learning_rate": 0.00038315684911095657, + "loss": 0.5051, + "step": 104910 + }, + { + "epoch": 5.211085725638224, + "grad_norm": 0.103515625, + "learning_rate": 0.0003831171153273071, + "loss": 0.5243, + "step": 104920 + }, + { + "epoch": 5.211582397933843, + "grad_norm": 0.111328125, + "learning_rate": 0.0003830773815436575, + "loss": 0.5141, + "step": 104930 + }, + { + "epoch": 5.212079070229462, + "grad_norm": 0.1142578125, + "learning_rate": 0.00038303764776000793, + "loss": 0.5209, + "step": 104940 + }, + { + "epoch": 5.212575742525082, + "grad_norm": 0.1455078125, + "learning_rate": 0.0003829979139763584, + "loss": 0.5151, + "step": 104950 + }, + { + "epoch": 5.213072414820702, + "grad_norm": 0.1435546875, + "learning_rate": 0.0003829581801927089, + "loss": 0.4639, + "step": 104960 + }, + { + "epoch": 5.213569087116321, + "grad_norm": 0.1083984375, + "learning_rate": 0.00038291844640905934, + "loss": 0.5022, + "step": 104970 + }, + { + "epoch": 5.21406575941194, + "grad_norm": 0.1162109375, + "learning_rate": 0.00038287871262540976, + "loss": 0.5297, + "step": 104980 + }, + { + "epoch": 5.214562431707559, + "grad_norm": 0.1123046875, + "learning_rate": 0.00038283897884176023, + "loss": 0.5268, + "step": 104990 + }, + { + "epoch": 5.215059104003179, + "grad_norm": 0.10791015625, + "learning_rate": 0.0003827992450581107, + "loss": 0.4914, + "step": 105000 + }, + { + "epoch": 5.215555776298798, + "grad_norm": 0.10986328125, + "learning_rate": 0.0003827595112744611, + "loss": 0.4876, + "step": 105010 + }, + { + "epoch": 5.216052448594417, + "grad_norm": 0.11279296875, + "learning_rate": 0.00038271977749081154, + "loss": 0.4791, + "step": 105020 + }, + { + "epoch": 5.216549120890036, + "grad_norm": 0.1181640625, + "learning_rate": 0.00038268004370716206, + "loss": 0.5153, + "step": 105030 + }, + { + "epoch": 5.217045793185656, + "grad_norm": 0.1103515625, + "learning_rate": 0.0003826403099235125, + "loss": 0.4919, + "step": 105040 + }, + { + "epoch": 5.217542465481276, + "grad_norm": 0.1201171875, + "learning_rate": 0.00038260057613986295, + "loss": 0.529, + "step": 105050 + }, + { + "epoch": 5.218039137776895, + "grad_norm": 0.1181640625, + "learning_rate": 0.00038256084235621337, + "loss": 0.5042, + "step": 105060 + }, + { + "epoch": 5.218535810072514, + "grad_norm": 0.1357421875, + "learning_rate": 0.00038252110857256384, + "loss": 0.4991, + "step": 105070 + }, + { + "epoch": 5.219032482368133, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003824813747889143, + "loss": 0.5019, + "step": 105080 + }, + { + "epoch": 5.219529154663753, + "grad_norm": 0.1337890625, + "learning_rate": 0.00038244164100526473, + "loss": 0.4988, + "step": 105090 + }, + { + "epoch": 5.220025826959372, + "grad_norm": 0.11181640625, + "learning_rate": 0.0003824019072216152, + "loss": 0.5088, + "step": 105100 + }, + { + "epoch": 5.220522499254992, + "grad_norm": 0.1240234375, + "learning_rate": 0.00038236217343796567, + "loss": 0.5021, + "step": 105110 + }, + { + "epoch": 5.221019171550611, + "grad_norm": 0.140625, + "learning_rate": 0.0003823224396543161, + "loss": 0.4859, + "step": 105120 + }, + { + "epoch": 5.22151584384623, + "grad_norm": 0.1533203125, + "learning_rate": 0.00038228270587066656, + "loss": 0.5081, + "step": 105130 + }, + { + "epoch": 5.22201251614185, + "grad_norm": 0.1513671875, + "learning_rate": 0.000382242972087017, + "loss": 0.5057, + "step": 105140 + }, + { + "epoch": 5.222509188437469, + "grad_norm": 0.1083984375, + "learning_rate": 0.00038220323830336745, + "loss": 0.5026, + "step": 105150 + }, + { + "epoch": 5.223005860733088, + "grad_norm": 0.125, + "learning_rate": 0.0003821635045197179, + "loss": 0.5161, + "step": 105160 + }, + { + "epoch": 5.223502533028707, + "grad_norm": 0.1181640625, + "learning_rate": 0.00038212377073606834, + "loss": 0.5062, + "step": 105170 + }, + { + "epoch": 5.223999205324327, + "grad_norm": 0.12109375, + "learning_rate": 0.0003820840369524188, + "loss": 0.5017, + "step": 105180 + }, + { + "epoch": 5.224495877619947, + "grad_norm": 0.134765625, + "learning_rate": 0.0003820443031687693, + "loss": 0.5259, + "step": 105190 + }, + { + "epoch": 5.224992549915566, + "grad_norm": 0.12890625, + "learning_rate": 0.0003820045693851197, + "loss": 0.5123, + "step": 105200 + }, + { + "epoch": 5.225489222211185, + "grad_norm": 0.130859375, + "learning_rate": 0.00038196483560147017, + "loss": 0.533, + "step": 105210 + }, + { + "epoch": 5.225985894506804, + "grad_norm": 0.1357421875, + "learning_rate": 0.00038192510181782064, + "loss": 0.4918, + "step": 105220 + }, + { + "epoch": 5.2264825668024235, + "grad_norm": 0.11181640625, + "learning_rate": 0.00038188536803417106, + "loss": 0.4995, + "step": 105230 + }, + { + "epoch": 5.226979239098043, + "grad_norm": 0.115234375, + "learning_rate": 0.00038184563425052153, + "loss": 0.5495, + "step": 105240 + }, + { + "epoch": 5.227475911393663, + "grad_norm": 0.12060546875, + "learning_rate": 0.00038180590046687195, + "loss": 0.4786, + "step": 105250 + }, + { + "epoch": 5.227972583689282, + "grad_norm": 0.11669921875, + "learning_rate": 0.00038176616668322247, + "loss": 0.494, + "step": 105260 + }, + { + "epoch": 5.228469255984901, + "grad_norm": 0.119140625, + "learning_rate": 0.0003817264328995729, + "loss": 0.4965, + "step": 105270 + }, + { + "epoch": 5.2289659282805205, + "grad_norm": 0.12890625, + "learning_rate": 0.0003816866991159233, + "loss": 0.4844, + "step": 105280 + }, + { + "epoch": 5.22946260057614, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003816469653322738, + "loss": 0.4911, + "step": 105290 + }, + { + "epoch": 5.229959272871759, + "grad_norm": 0.1259765625, + "learning_rate": 0.00038160723154862425, + "loss": 0.5403, + "step": 105300 + }, + { + "epoch": 5.230455945167378, + "grad_norm": 0.12255859375, + "learning_rate": 0.00038156749776497467, + "loss": 0.503, + "step": 105310 + }, + { + "epoch": 5.230952617462998, + "grad_norm": 0.134765625, + "learning_rate": 0.00038152776398132514, + "loss": 0.5058, + "step": 105320 + }, + { + "epoch": 5.231449289758618, + "grad_norm": 0.1474609375, + "learning_rate": 0.0003814880301976756, + "loss": 0.5087, + "step": 105330 + }, + { + "epoch": 5.231945962054237, + "grad_norm": 0.1533203125, + "learning_rate": 0.0003814482964140261, + "loss": 0.5184, + "step": 105340 + }, + { + "epoch": 5.232442634349856, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003814085626303765, + "loss": 0.4996, + "step": 105350 + }, + { + "epoch": 5.232939306645475, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003813688288467269, + "loss": 0.4916, + "step": 105360 + }, + { + "epoch": 5.2334359789410945, + "grad_norm": 0.1669921875, + "learning_rate": 0.00038132909506307744, + "loss": 0.5033, + "step": 105370 + }, + { + "epoch": 5.233932651236714, + "grad_norm": 0.109375, + "learning_rate": 0.00038128936127942786, + "loss": 0.5365, + "step": 105380 + }, + { + "epoch": 5.234429323532334, + "grad_norm": 0.11865234375, + "learning_rate": 0.0003812496274957783, + "loss": 0.4962, + "step": 105390 + }, + { + "epoch": 5.234925995827953, + "grad_norm": 0.1083984375, + "learning_rate": 0.00038120989371212875, + "loss": 0.4889, + "step": 105400 + }, + { + "epoch": 5.235422668123572, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003811701599284792, + "loss": 0.5129, + "step": 105410 + }, + { + "epoch": 5.2359193404191915, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003811304261448297, + "loss": 0.4706, + "step": 105420 + }, + { + "epoch": 5.236416012714811, + "grad_norm": 0.125, + "learning_rate": 0.0003810906923611801, + "loss": 0.4972, + "step": 105430 + }, + { + "epoch": 5.23691268501043, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003810509585775305, + "loss": 0.5081, + "step": 105440 + }, + { + "epoch": 5.237409357306049, + "grad_norm": 0.1474609375, + "learning_rate": 0.00038101122479388105, + "loss": 0.4888, + "step": 105450 + }, + { + "epoch": 5.237906029601668, + "grad_norm": 0.134765625, + "learning_rate": 0.00038097149101023147, + "loss": 0.4939, + "step": 105460 + }, + { + "epoch": 5.2384027018972885, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003809317572265819, + "loss": 0.5051, + "step": 105470 + }, + { + "epoch": 5.238899374192908, + "grad_norm": 0.1171875, + "learning_rate": 0.00038089202344293235, + "loss": 0.5286, + "step": 105480 + }, + { + "epoch": 5.239396046488527, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003808522896592828, + "loss": 0.5226, + "step": 105490 + }, + { + "epoch": 5.239892718784146, + "grad_norm": 0.154296875, + "learning_rate": 0.0003808125558756333, + "loss": 0.5016, + "step": 105500 + }, + { + "epoch": 5.240389391079765, + "grad_norm": 0.11328125, + "learning_rate": 0.0003807728220919837, + "loss": 0.4833, + "step": 105510 + }, + { + "epoch": 5.240886063375385, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003807330883083342, + "loss": 0.5191, + "step": 105520 + }, + { + "epoch": 5.241382735671004, + "grad_norm": 0.1064453125, + "learning_rate": 0.00038069335452468466, + "loss": 0.4965, + "step": 105530 + }, + { + "epoch": 5.241879407966624, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003806536207410351, + "loss": 0.4785, + "step": 105540 + }, + { + "epoch": 5.242376080262243, + "grad_norm": 0.12890625, + "learning_rate": 0.00038061388695738554, + "loss": 0.5122, + "step": 105550 + }, + { + "epoch": 5.2428727525578624, + "grad_norm": 0.1103515625, + "learning_rate": 0.000380574153173736, + "loss": 0.4763, + "step": 105560 + }, + { + "epoch": 5.243369424853482, + "grad_norm": 0.11083984375, + "learning_rate": 0.00038053441939008643, + "loss": 0.5306, + "step": 105570 + }, + { + "epoch": 5.243866097149101, + "grad_norm": 0.1142578125, + "learning_rate": 0.0003804946856064369, + "loss": 0.5305, + "step": 105580 + }, + { + "epoch": 5.24436276944472, + "grad_norm": 0.1455078125, + "learning_rate": 0.0003804549518227873, + "loss": 0.4957, + "step": 105590 + }, + { + "epoch": 5.244859441740339, + "grad_norm": 0.12109375, + "learning_rate": 0.0003804152180391378, + "loss": 0.5046, + "step": 105600 + }, + { + "epoch": 5.2453561140359595, + "grad_norm": 0.11279296875, + "learning_rate": 0.00038037548425548826, + "loss": 0.4918, + "step": 105610 + }, + { + "epoch": 5.245852786331579, + "grad_norm": 0.125, + "learning_rate": 0.0003803357504718387, + "loss": 0.486, + "step": 105620 + }, + { + "epoch": 5.246349458627198, + "grad_norm": 0.1259765625, + "learning_rate": 0.00038029601668818915, + "loss": 0.4918, + "step": 105630 + }, + { + "epoch": 5.246846130922817, + "grad_norm": 0.109375, + "learning_rate": 0.0003802562829045396, + "loss": 0.4641, + "step": 105640 + }, + { + "epoch": 5.247342803218436, + "grad_norm": 0.130859375, + "learning_rate": 0.00038021654912089004, + "loss": 0.5065, + "step": 105650 + }, + { + "epoch": 5.247839475514056, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003801768153372405, + "loss": 0.5032, + "step": 105660 + }, + { + "epoch": 5.248336147809675, + "grad_norm": 0.10595703125, + "learning_rate": 0.00038013708155359093, + "loss": 0.4881, + "step": 105670 + }, + { + "epoch": 5.248832820105295, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003800973477699414, + "loss": 0.4983, + "step": 105680 + }, + { + "epoch": 5.249329492400914, + "grad_norm": 0.1630859375, + "learning_rate": 0.00038005761398629187, + "loss": 0.4834, + "step": 105690 + }, + { + "epoch": 5.249826164696533, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003800178802026423, + "loss": 0.4972, + "step": 105700 + }, + { + "epoch": 5.250322836992153, + "grad_norm": 0.119140625, + "learning_rate": 0.00037997814641899276, + "loss": 0.5257, + "step": 105710 + }, + { + "epoch": 5.250819509287772, + "grad_norm": 0.11865234375, + "learning_rate": 0.00037993841263534323, + "loss": 0.5216, + "step": 105720 + }, + { + "epoch": 5.251316181583391, + "grad_norm": 0.11669921875, + "learning_rate": 0.00037989867885169365, + "loss": 0.4775, + "step": 105730 + }, + { + "epoch": 5.25181285387901, + "grad_norm": 0.1484375, + "learning_rate": 0.0003798589450680441, + "loss": 0.4706, + "step": 105740 + }, + { + "epoch": 5.2523095261746295, + "grad_norm": 0.142578125, + "learning_rate": 0.0003798192112843946, + "loss": 0.5176, + "step": 105750 + }, + { + "epoch": 5.25280619847025, + "grad_norm": 0.1318359375, + "learning_rate": 0.000379779477500745, + "loss": 0.5154, + "step": 105760 + }, + { + "epoch": 5.253302870765869, + "grad_norm": 0.1650390625, + "learning_rate": 0.0003797397437170955, + "loss": 0.4837, + "step": 105770 + }, + { + "epoch": 5.253799543061488, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003797000099334459, + "loss": 0.5053, + "step": 105780 + }, + { + "epoch": 5.254296215357107, + "grad_norm": 0.2041015625, + "learning_rate": 0.0003796602761497964, + "loss": 0.5251, + "step": 105790 + }, + { + "epoch": 5.254792887652727, + "grad_norm": 0.12255859375, + "learning_rate": 0.00037962054236614684, + "loss": 0.4851, + "step": 105800 + }, + { + "epoch": 5.255289559948346, + "grad_norm": 0.1513671875, + "learning_rate": 0.00037958080858249726, + "loss": 0.502, + "step": 105810 + }, + { + "epoch": 5.255786232243965, + "grad_norm": 0.11328125, + "learning_rate": 0.00037954107479884773, + "loss": 0.5086, + "step": 105820 + }, + { + "epoch": 5.256282904539585, + "grad_norm": 0.13671875, + "learning_rate": 0.0003795013410151982, + "loss": 0.5406, + "step": 105830 + }, + { + "epoch": 5.256779576835204, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003794616072315486, + "loss": 0.5044, + "step": 105840 + }, + { + "epoch": 5.257276249130824, + "grad_norm": 0.123046875, + "learning_rate": 0.0003794218734478991, + "loss": 0.4961, + "step": 105850 + }, + { + "epoch": 5.257772921426443, + "grad_norm": 0.12158203125, + "learning_rate": 0.00037938213966424956, + "loss": 0.5446, + "step": 105860 + }, + { + "epoch": 5.258269593722062, + "grad_norm": 0.119140625, + "learning_rate": 0.00037934240588060003, + "loss": 0.4895, + "step": 105870 + }, + { + "epoch": 5.258766266017681, + "grad_norm": 0.1826171875, + "learning_rate": 0.00037930267209695045, + "loss": 0.4977, + "step": 105880 + }, + { + "epoch": 5.2592629383133005, + "grad_norm": 0.1396484375, + "learning_rate": 0.00037926293831330087, + "loss": 0.5135, + "step": 105890 + }, + { + "epoch": 5.259759610608921, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003792232045296514, + "loss": 0.4763, + "step": 105900 + }, + { + "epoch": 5.26025628290454, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003791834707460018, + "loss": 0.4835, + "step": 105910 + }, + { + "epoch": 5.260752955200159, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003791437369623522, + "loss": 0.4958, + "step": 105920 + }, + { + "epoch": 5.261249627495778, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003791040031787027, + "loss": 0.4818, + "step": 105930 + }, + { + "epoch": 5.2617462997913975, + "grad_norm": 0.11767578125, + "learning_rate": 0.00037906426939505317, + "loss": 0.5103, + "step": 105940 + }, + { + "epoch": 5.262242972087017, + "grad_norm": 0.11474609375, + "learning_rate": 0.00037902453561140364, + "loss": 0.4874, + "step": 105950 + }, + { + "epoch": 5.262739644382636, + "grad_norm": 0.1572265625, + "learning_rate": 0.00037898480182775406, + "loss": 0.5046, + "step": 105960 + }, + { + "epoch": 5.263236316678256, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003789450680441045, + "loss": 0.4964, + "step": 105970 + }, + { + "epoch": 5.263732988973875, + "grad_norm": 0.11328125, + "learning_rate": 0.000378905334260455, + "loss": 0.4827, + "step": 105980 + }, + { + "epoch": 5.2642296612694945, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003788656004768054, + "loss": 0.5055, + "step": 105990 + }, + { + "epoch": 5.264726333565114, + "grad_norm": 0.12109375, + "learning_rate": 0.0003788258666931559, + "loss": 0.5091, + "step": 106000 + }, + { + "epoch": 5.265223005860733, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003787861329095063, + "loss": 0.5217, + "step": 106010 + }, + { + "epoch": 5.265719678156352, + "grad_norm": 0.1552734375, + "learning_rate": 0.0003787463991258568, + "loss": 0.4808, + "step": 106020 + }, + { + "epoch": 5.2662163504519715, + "grad_norm": 0.11767578125, + "learning_rate": 0.00037870666534220725, + "loss": 0.5099, + "step": 106030 + }, + { + "epoch": 5.266713022747592, + "grad_norm": 0.1376953125, + "learning_rate": 0.00037866693155855767, + "loss": 0.5062, + "step": 106040 + }, + { + "epoch": 5.267209695043211, + "grad_norm": 0.1083984375, + "learning_rate": 0.00037862719777490814, + "loss": 0.5205, + "step": 106050 + }, + { + "epoch": 5.26770636733883, + "grad_norm": 0.142578125, + "learning_rate": 0.0003785874639912586, + "loss": 0.5015, + "step": 106060 + }, + { + "epoch": 5.268203039634449, + "grad_norm": 0.123046875, + "learning_rate": 0.000378547730207609, + "loss": 0.5136, + "step": 106070 + }, + { + "epoch": 5.2686997119300685, + "grad_norm": 0.11279296875, + "learning_rate": 0.0003785079964239595, + "loss": 0.4776, + "step": 106080 + }, + { + "epoch": 5.269196384225688, + "grad_norm": 0.1123046875, + "learning_rate": 0.00037846826264030997, + "loss": 0.5012, + "step": 106090 + }, + { + "epoch": 5.269693056521307, + "grad_norm": 0.107421875, + "learning_rate": 0.0003784285288566604, + "loss": 0.5194, + "step": 106100 + }, + { + "epoch": 5.270189728816927, + "grad_norm": 0.1240234375, + "learning_rate": 0.00037838879507301086, + "loss": 0.515, + "step": 106110 + }, + { + "epoch": 5.270686401112546, + "grad_norm": 0.1650390625, + "learning_rate": 0.0003783490612893613, + "loss": 0.4908, + "step": 106120 + }, + { + "epoch": 5.2711830734081655, + "grad_norm": 0.1298828125, + "learning_rate": 0.00037830932750571174, + "loss": 0.4953, + "step": 106130 + }, + { + "epoch": 5.271679745703785, + "grad_norm": 0.1162109375, + "learning_rate": 0.0003782695937220622, + "loss": 0.493, + "step": 106140 + }, + { + "epoch": 5.272176417999404, + "grad_norm": 0.11767578125, + "learning_rate": 0.00037822985993841263, + "loss": 0.4988, + "step": 106150 + }, + { + "epoch": 5.272673090295023, + "grad_norm": 0.125, + "learning_rate": 0.0003781901261547631, + "loss": 0.4874, + "step": 106160 + }, + { + "epoch": 5.273169762590642, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003781503923711136, + "loss": 0.539, + "step": 106170 + }, + { + "epoch": 5.273666434886262, + "grad_norm": 0.140625, + "learning_rate": 0.000378110658587464, + "loss": 0.512, + "step": 106180 + }, + { + "epoch": 5.274163107181882, + "grad_norm": 0.130859375, + "learning_rate": 0.00037807092480381446, + "loss": 0.4819, + "step": 106190 + }, + { + "epoch": 5.274659779477501, + "grad_norm": 0.10546875, + "learning_rate": 0.00037803119102016494, + "loss": 0.5073, + "step": 106200 + }, + { + "epoch": 5.27515645177312, + "grad_norm": 0.177734375, + "learning_rate": 0.00037799145723651535, + "loss": 0.4994, + "step": 106210 + }, + { + "epoch": 5.275653124068739, + "grad_norm": 0.1611328125, + "learning_rate": 0.0003779517234528658, + "loss": 0.5123, + "step": 106220 + }, + { + "epoch": 5.276149796364359, + "grad_norm": 0.138671875, + "learning_rate": 0.00037791198966921624, + "loss": 0.4797, + "step": 106230 + }, + { + "epoch": 5.276646468659978, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003778722558855667, + "loss": 0.5056, + "step": 106240 + }, + { + "epoch": 5.277143140955597, + "grad_norm": 0.11083984375, + "learning_rate": 0.0003778325221019172, + "loss": 0.5202, + "step": 106250 + }, + { + "epoch": 5.277639813251217, + "grad_norm": 0.11328125, + "learning_rate": 0.0003777927883182676, + "loss": 0.5255, + "step": 106260 + }, + { + "epoch": 5.2781364855468365, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003777530545346181, + "loss": 0.4985, + "step": 106270 + }, + { + "epoch": 5.278633157842456, + "grad_norm": 0.1142578125, + "learning_rate": 0.00037771332075096854, + "loss": 0.5092, + "step": 106280 + }, + { + "epoch": 5.279129830138075, + "grad_norm": 0.1416015625, + "learning_rate": 0.00037767358696731896, + "loss": 0.5219, + "step": 106290 + }, + { + "epoch": 5.279626502433694, + "grad_norm": 0.1787109375, + "learning_rate": 0.00037763385318366943, + "loss": 0.5119, + "step": 106300 + }, + { + "epoch": 5.280123174729313, + "grad_norm": 0.130859375, + "learning_rate": 0.00037759411940001985, + "loss": 0.4844, + "step": 106310 + }, + { + "epoch": 5.280619847024933, + "grad_norm": 0.142578125, + "learning_rate": 0.0003775543856163704, + "loss": 0.493, + "step": 106320 + }, + { + "epoch": 5.281116519320553, + "grad_norm": 0.11865234375, + "learning_rate": 0.0003775146518327208, + "loss": 0.5006, + "step": 106330 + }, + { + "epoch": 5.281613191616172, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003774749180490712, + "loss": 0.494, + "step": 106340 + }, + { + "epoch": 5.282109863911791, + "grad_norm": 0.125, + "learning_rate": 0.0003774351842654217, + "loss": 0.4872, + "step": 106350 + }, + { + "epoch": 5.28260653620741, + "grad_norm": 0.1083984375, + "learning_rate": 0.00037739545048177215, + "loss": 0.4894, + "step": 106360 + }, + { + "epoch": 5.28310320850303, + "grad_norm": 0.1240234375, + "learning_rate": 0.0003773557166981226, + "loss": 0.5106, + "step": 106370 + }, + { + "epoch": 5.283599880798649, + "grad_norm": 0.123046875, + "learning_rate": 0.00037731598291447304, + "loss": 0.4896, + "step": 106380 + }, + { + "epoch": 5.284096553094268, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003772762491308235, + "loss": 0.513, + "step": 106390 + }, + { + "epoch": 5.284593225389887, + "grad_norm": 0.130859375, + "learning_rate": 0.000377236515347174, + "loss": 0.5174, + "step": 106400 + }, + { + "epoch": 5.285089897685507, + "grad_norm": 0.125, + "learning_rate": 0.0003771967815635244, + "loss": 0.5198, + "step": 106410 + }, + { + "epoch": 5.285586569981127, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003771570477798748, + "loss": 0.513, + "step": 106420 + }, + { + "epoch": 5.286083242276746, + "grad_norm": 0.1103515625, + "learning_rate": 0.00037711731399622534, + "loss": 0.4695, + "step": 106430 + }, + { + "epoch": 5.286579914572365, + "grad_norm": 0.126953125, + "learning_rate": 0.00037707758021257576, + "loss": 0.5157, + "step": 106440 + }, + { + "epoch": 5.287076586867984, + "grad_norm": 0.11279296875, + "learning_rate": 0.00037703784642892623, + "loss": 0.4926, + "step": 106450 + }, + { + "epoch": 5.2875732591636035, + "grad_norm": 0.12060546875, + "learning_rate": 0.00037699811264527665, + "loss": 0.5179, + "step": 106460 + }, + { + "epoch": 5.288069931459223, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003769583788616271, + "loss": 0.5059, + "step": 106470 + }, + { + "epoch": 5.288566603754843, + "grad_norm": 0.1103515625, + "learning_rate": 0.0003769186450779776, + "loss": 0.5065, + "step": 106480 + }, + { + "epoch": 5.289063276050462, + "grad_norm": 0.1142578125, + "learning_rate": 0.000376878911294328, + "loss": 0.4938, + "step": 106490 + }, + { + "epoch": 5.289559948346081, + "grad_norm": 0.189453125, + "learning_rate": 0.0003768391775106785, + "loss": 0.5068, + "step": 106500 + }, + { + "epoch": 5.290056620641701, + "grad_norm": 0.12353515625, + "learning_rate": 0.00037679944372702895, + "loss": 0.5231, + "step": 106510 + }, + { + "epoch": 5.29055329293732, + "grad_norm": 0.11328125, + "learning_rate": 0.00037675970994337937, + "loss": 0.5003, + "step": 106520 + }, + { + "epoch": 5.291049965232939, + "grad_norm": 0.130859375, + "learning_rate": 0.00037671997615972984, + "loss": 0.4978, + "step": 106530 + }, + { + "epoch": 5.291546637528558, + "grad_norm": 0.1611328125, + "learning_rate": 0.00037668024237608026, + "loss": 0.4801, + "step": 106540 + }, + { + "epoch": 5.292043309824178, + "grad_norm": 0.1484375, + "learning_rate": 0.00037664050859243073, + "loss": 0.5235, + "step": 106550 + }, + { + "epoch": 5.292539982119798, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003766007748087812, + "loss": 0.4832, + "step": 106560 + }, + { + "epoch": 5.293036654415417, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003765610410251316, + "loss": 0.5165, + "step": 106570 + }, + { + "epoch": 5.293533326711036, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003765213072414821, + "loss": 0.5267, + "step": 106580 + }, + { + "epoch": 5.294029999006655, + "grad_norm": 0.126953125, + "learning_rate": 0.00037648157345783256, + "loss": 0.5177, + "step": 106590 + }, + { + "epoch": 5.2945266713022745, + "grad_norm": 0.1201171875, + "learning_rate": 0.000376441839674183, + "loss": 0.5257, + "step": 106600 + }, + { + "epoch": 5.295023343597894, + "grad_norm": 0.146484375, + "learning_rate": 0.00037640210589053345, + "loss": 0.498, + "step": 106610 + }, + { + "epoch": 5.295520015893514, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003763623721068839, + "loss": 0.5108, + "step": 106620 + }, + { + "epoch": 5.296016688189133, + "grad_norm": 0.11572265625, + "learning_rate": 0.00037632263832323434, + "loss": 0.4712, + "step": 106630 + }, + { + "epoch": 5.296513360484752, + "grad_norm": 0.1328125, + "learning_rate": 0.0003762829045395848, + "loss": 0.4676, + "step": 106640 + }, + { + "epoch": 5.2970100327803715, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003762431707559352, + "loss": 0.4873, + "step": 106650 + }, + { + "epoch": 5.297506705075991, + "grad_norm": 0.125, + "learning_rate": 0.0003762034369722857, + "loss": 0.5055, + "step": 106660 + }, + { + "epoch": 5.29800337737161, + "grad_norm": 0.134765625, + "learning_rate": 0.00037616370318863617, + "loss": 0.507, + "step": 106670 + }, + { + "epoch": 5.298500049667229, + "grad_norm": 0.140625, + "learning_rate": 0.0003761239694049866, + "loss": 0.5004, + "step": 106680 + }, + { + "epoch": 5.298996721962849, + "grad_norm": 0.19921875, + "learning_rate": 0.00037608423562133706, + "loss": 0.4958, + "step": 106690 + }, + { + "epoch": 5.2994933942584685, + "grad_norm": 0.125, + "learning_rate": 0.00037604450183768753, + "loss": 0.5015, + "step": 106700 + }, + { + "epoch": 5.299990066554088, + "grad_norm": 0.1435546875, + "learning_rate": 0.00037600476805403795, + "loss": 0.5173, + "step": 106710 + }, + { + "epoch": 5.300486738849707, + "grad_norm": 0.1328125, + "learning_rate": 0.0003759650342703884, + "loss": 0.5207, + "step": 106720 + }, + { + "epoch": 5.300983411145326, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003759253004867389, + "loss": 0.506, + "step": 106730 + }, + { + "epoch": 5.3014800834409455, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003758855667030893, + "loss": 0.4992, + "step": 106740 + }, + { + "epoch": 5.301976755736565, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003758458329194398, + "loss": 0.4862, + "step": 106750 + }, + { + "epoch": 5.302473428032185, + "grad_norm": 0.1455078125, + "learning_rate": 0.0003758060991357902, + "loss": 0.5146, + "step": 106760 + }, + { + "epoch": 5.302970100327804, + "grad_norm": 0.181640625, + "learning_rate": 0.0003757663653521407, + "loss": 0.5075, + "step": 106770 + }, + { + "epoch": 5.303466772623423, + "grad_norm": 0.111328125, + "learning_rate": 0.00037572663156849114, + "loss": 0.5059, + "step": 106780 + }, + { + "epoch": 5.3039634449190425, + "grad_norm": 0.1220703125, + "learning_rate": 0.00037568689778484155, + "loss": 0.5584, + "step": 106790 + }, + { + "epoch": 5.304460117214662, + "grad_norm": 0.11962890625, + "learning_rate": 0.000375647164001192, + "loss": 0.527, + "step": 106800 + }, + { + "epoch": 5.304956789510281, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003756074302175425, + "loss": 0.5224, + "step": 106810 + }, + { + "epoch": 5.3054534618059, + "grad_norm": 0.1494140625, + "learning_rate": 0.00037556769643389297, + "loss": 0.4929, + "step": 106820 + }, + { + "epoch": 5.30595013410152, + "grad_norm": 0.109375, + "learning_rate": 0.0003755279626502434, + "loss": 0.5004, + "step": 106830 + }, + { + "epoch": 5.3064468063971395, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003754882288665938, + "loss": 0.5135, + "step": 106840 + }, + { + "epoch": 5.306943478692759, + "grad_norm": 0.1181640625, + "learning_rate": 0.00037544849508294433, + "loss": 0.523, + "step": 106850 + }, + { + "epoch": 5.307440150988378, + "grad_norm": 0.13671875, + "learning_rate": 0.00037540876129929474, + "loss": 0.5097, + "step": 106860 + }, + { + "epoch": 5.307936823283997, + "grad_norm": 0.1728515625, + "learning_rate": 0.00037536902751564516, + "loss": 0.4743, + "step": 106870 + }, + { + "epoch": 5.308433495579616, + "grad_norm": 0.1533203125, + "learning_rate": 0.00037532929373199563, + "loss": 0.4911, + "step": 106880 + }, + { + "epoch": 5.308930167875236, + "grad_norm": 0.11865234375, + "learning_rate": 0.0003752895599483461, + "loss": 0.4959, + "step": 106890 + }, + { + "epoch": 5.309426840170855, + "grad_norm": 0.10693359375, + "learning_rate": 0.0003752498261646966, + "loss": 0.4944, + "step": 106900 + }, + { + "epoch": 5.309923512466475, + "grad_norm": 0.119140625, + "learning_rate": 0.000375210092381047, + "loss": 0.4845, + "step": 106910 + }, + { + "epoch": 5.310420184762094, + "grad_norm": 0.1416015625, + "learning_rate": 0.00037517035859739746, + "loss": 0.5044, + "step": 106920 + }, + { + "epoch": 5.310916857057713, + "grad_norm": 0.11962890625, + "learning_rate": 0.00037513062481374794, + "loss": 0.5066, + "step": 106930 + }, + { + "epoch": 5.311413529353333, + "grad_norm": 0.11962890625, + "learning_rate": 0.00037509089103009835, + "loss": 0.5175, + "step": 106940 + }, + { + "epoch": 5.311910201648952, + "grad_norm": 0.185546875, + "learning_rate": 0.00037505115724644877, + "loss": 0.4814, + "step": 106950 + }, + { + "epoch": 5.312406873944571, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003750114234627993, + "loss": 0.5106, + "step": 106960 + }, + { + "epoch": 5.31290354624019, + "grad_norm": 0.1513671875, + "learning_rate": 0.0003749716896791497, + "loss": 0.4795, + "step": 106970 + }, + { + "epoch": 5.3134002185358105, + "grad_norm": 0.1328125, + "learning_rate": 0.0003749319558955002, + "loss": 0.5057, + "step": 106980 + }, + { + "epoch": 5.31389689083143, + "grad_norm": 0.126953125, + "learning_rate": 0.0003748922221118506, + "loss": 0.4873, + "step": 106990 + }, + { + "epoch": 5.314393563127049, + "grad_norm": 0.1123046875, + "learning_rate": 0.00037485248832820107, + "loss": 0.5039, + "step": 107000 + }, + { + "epoch": 5.314890235422668, + "grad_norm": 0.119140625, + "learning_rate": 0.00037481275454455154, + "loss": 0.5208, + "step": 107010 + }, + { + "epoch": 5.315386907718287, + "grad_norm": 0.130859375, + "learning_rate": 0.00037477302076090196, + "loss": 0.4899, + "step": 107020 + }, + { + "epoch": 5.315883580013907, + "grad_norm": 0.11279296875, + "learning_rate": 0.00037473328697725243, + "loss": 0.5226, + "step": 107030 + }, + { + "epoch": 5.316380252309526, + "grad_norm": 0.11279296875, + "learning_rate": 0.0003746935531936029, + "loss": 0.5062, + "step": 107040 + }, + { + "epoch": 5.316876924605146, + "grad_norm": 0.11083984375, + "learning_rate": 0.0003746538194099533, + "loss": 0.5017, + "step": 107050 + }, + { + "epoch": 5.317373596900765, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003746140856263038, + "loss": 0.5258, + "step": 107060 + }, + { + "epoch": 5.317870269196384, + "grad_norm": 0.125, + "learning_rate": 0.00037457435184265426, + "loss": 0.4831, + "step": 107070 + }, + { + "epoch": 5.318366941492004, + "grad_norm": 0.10400390625, + "learning_rate": 0.0003745346180590047, + "loss": 0.4974, + "step": 107080 + }, + { + "epoch": 5.318863613787623, + "grad_norm": 0.1611328125, + "learning_rate": 0.00037449488427535515, + "loss": 0.5158, + "step": 107090 + }, + { + "epoch": 5.319360286083242, + "grad_norm": 0.123046875, + "learning_rate": 0.00037445515049170557, + "loss": 0.5136, + "step": 107100 + }, + { + "epoch": 5.319856958378861, + "grad_norm": 0.1513671875, + "learning_rate": 0.00037441541670805604, + "loss": 0.4901, + "step": 107110 + }, + { + "epoch": 5.3203536306744805, + "grad_norm": 0.11328125, + "learning_rate": 0.0003743756829244065, + "loss": 0.4909, + "step": 107120 + }, + { + "epoch": 5.320850302970101, + "grad_norm": 0.123046875, + "learning_rate": 0.00037433594914075693, + "loss": 0.5183, + "step": 107130 + }, + { + "epoch": 5.32134697526572, + "grad_norm": 0.10595703125, + "learning_rate": 0.0003742962153571074, + "loss": 0.4713, + "step": 107140 + }, + { + "epoch": 5.321843647561339, + "grad_norm": 0.1396484375, + "learning_rate": 0.00037425648157345787, + "loss": 0.5129, + "step": 107150 + }, + { + "epoch": 5.322340319856958, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003742167477898083, + "loss": 0.4957, + "step": 107160 + }, + { + "epoch": 5.3228369921525776, + "grad_norm": 0.138671875, + "learning_rate": 0.00037417701400615876, + "loss": 0.4968, + "step": 107170 + }, + { + "epoch": 5.323333664448197, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003741372802225092, + "loss": 0.4901, + "step": 107180 + }, + { + "epoch": 5.323830336743816, + "grad_norm": 0.1484375, + "learning_rate": 0.00037409754643885965, + "loss": 0.5394, + "step": 107190 + }, + { + "epoch": 5.324327009039436, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003740578126552101, + "loss": 0.5215, + "step": 107200 + }, + { + "epoch": 5.324823681335055, + "grad_norm": 0.1259765625, + "learning_rate": 0.00037401807887156054, + "loss": 0.4856, + "step": 107210 + }, + { + "epoch": 5.325320353630675, + "grad_norm": 0.1357421875, + "learning_rate": 0.000373978345087911, + "loss": 0.5135, + "step": 107220 + }, + { + "epoch": 5.325817025926294, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003739386113042615, + "loss": 0.5232, + "step": 107230 + }, + { + "epoch": 5.326313698221913, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003738988775206119, + "loss": 0.5302, + "step": 107240 + }, + { + "epoch": 5.326810370517532, + "grad_norm": 0.1416015625, + "learning_rate": 0.00037385914373696237, + "loss": 0.4882, + "step": 107250 + }, + { + "epoch": 5.3273070428131515, + "grad_norm": 0.16015625, + "learning_rate": 0.00037381940995331284, + "loss": 0.5092, + "step": 107260 + }, + { + "epoch": 5.327803715108772, + "grad_norm": 0.1162109375, + "learning_rate": 0.0003737796761696633, + "loss": 0.533, + "step": 107270 + }, + { + "epoch": 5.328300387404391, + "grad_norm": 0.1357421875, + "learning_rate": 0.00037373994238601373, + "loss": 0.5166, + "step": 107280 + }, + { + "epoch": 5.32879705970001, + "grad_norm": 0.11669921875, + "learning_rate": 0.00037370020860236415, + "loss": 0.5068, + "step": 107290 + }, + { + "epoch": 5.329293731995629, + "grad_norm": 0.12109375, + "learning_rate": 0.00037366047481871467, + "loss": 0.4994, + "step": 107300 + }, + { + "epoch": 5.3297904042912485, + "grad_norm": 0.15625, + "learning_rate": 0.0003736207410350651, + "loss": 0.5246, + "step": 107310 + }, + { + "epoch": 5.330287076586868, + "grad_norm": 0.111328125, + "learning_rate": 0.0003735810072514155, + "loss": 0.4979, + "step": 107320 + }, + { + "epoch": 5.330783748882487, + "grad_norm": 0.12451171875, + "learning_rate": 0.000373541273467766, + "loss": 0.4869, + "step": 107330 + }, + { + "epoch": 5.331280421178107, + "grad_norm": 0.11962890625, + "learning_rate": 0.00037350153968411645, + "loss": 0.4768, + "step": 107340 + }, + { + "epoch": 5.331777093473726, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003734618059004669, + "loss": 0.478, + "step": 107350 + }, + { + "epoch": 5.3322737657693455, + "grad_norm": 0.12109375, + "learning_rate": 0.00037342207211681734, + "loss": 0.5075, + "step": 107360 + }, + { + "epoch": 5.332770438064965, + "grad_norm": 0.103515625, + "learning_rate": 0.00037338233833316775, + "loss": 0.4907, + "step": 107370 + }, + { + "epoch": 5.333267110360584, + "grad_norm": 0.1552734375, + "learning_rate": 0.0003733426045495183, + "loss": 0.5123, + "step": 107380 + }, + { + "epoch": 5.333763782656203, + "grad_norm": 0.126953125, + "learning_rate": 0.0003733028707658687, + "loss": 0.4853, + "step": 107390 + }, + { + "epoch": 5.334260454951822, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003732631369822191, + "loss": 0.5294, + "step": 107400 + }, + { + "epoch": 5.3347571272474426, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003732234031985696, + "loss": 0.5243, + "step": 107410 + }, + { + "epoch": 5.335253799543062, + "grad_norm": 0.125, + "learning_rate": 0.00037318366941492006, + "loss": 0.4995, + "step": 107420 + }, + { + "epoch": 5.335750471838681, + "grad_norm": 0.1240234375, + "learning_rate": 0.00037314393563127053, + "loss": 0.4997, + "step": 107430 + }, + { + "epoch": 5.3362471441343, + "grad_norm": 0.15234375, + "learning_rate": 0.00037310420184762094, + "loss": 0.4856, + "step": 107440 + }, + { + "epoch": 5.3367438164299195, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003730644680639714, + "loss": 0.4911, + "step": 107450 + }, + { + "epoch": 5.337240488725539, + "grad_norm": 0.171875, + "learning_rate": 0.0003730247342803219, + "loss": 0.5015, + "step": 107460 + }, + { + "epoch": 5.337737161021158, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003729850004966723, + "loss": 0.4819, + "step": 107470 + }, + { + "epoch": 5.338233833316778, + "grad_norm": 0.162109375, + "learning_rate": 0.0003729452667130227, + "loss": 0.4923, + "step": 107480 + }, + { + "epoch": 5.338730505612397, + "grad_norm": 0.1142578125, + "learning_rate": 0.00037290553292937325, + "loss": 0.4997, + "step": 107490 + }, + { + "epoch": 5.3392271779080165, + "grad_norm": 0.11376953125, + "learning_rate": 0.00037286579914572366, + "loss": 0.502, + "step": 107500 + }, + { + "epoch": 5.339723850203636, + "grad_norm": 0.12255859375, + "learning_rate": 0.00037282606536207414, + "loss": 0.5228, + "step": 107510 + }, + { + "epoch": 5.340220522499255, + "grad_norm": 0.138671875, + "learning_rate": 0.00037278633157842455, + "loss": 0.4805, + "step": 107520 + }, + { + "epoch": 5.340717194794874, + "grad_norm": 0.1328125, + "learning_rate": 0.000372746597794775, + "loss": 0.5407, + "step": 107530 + }, + { + "epoch": 5.341213867090493, + "grad_norm": 0.1591796875, + "learning_rate": 0.0003727068640111255, + "loss": 0.523, + "step": 107540 + }, + { + "epoch": 5.3417105393861135, + "grad_norm": 0.1513671875, + "learning_rate": 0.0003726671302274759, + "loss": 0.4976, + "step": 107550 + }, + { + "epoch": 5.342207211681733, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003726273964438264, + "loss": 0.4908, + "step": 107560 + }, + { + "epoch": 5.342703883977352, + "grad_norm": 0.1494140625, + "learning_rate": 0.00037258766266017686, + "loss": 0.5216, + "step": 107570 + }, + { + "epoch": 5.343200556272971, + "grad_norm": 0.1337890625, + "learning_rate": 0.00037254792887652727, + "loss": 0.4958, + "step": 107580 + }, + { + "epoch": 5.34369722856859, + "grad_norm": 0.1337890625, + "learning_rate": 0.00037250819509287774, + "loss": 0.4956, + "step": 107590 + }, + { + "epoch": 5.34419390086421, + "grad_norm": 0.13671875, + "learning_rate": 0.0003724684613092282, + "loss": 0.4909, + "step": 107600 + }, + { + "epoch": 5.344690573159829, + "grad_norm": 0.1328125, + "learning_rate": 0.00037242872752557863, + "loss": 0.4814, + "step": 107610 + }, + { + "epoch": 5.345187245455448, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003723889937419291, + "loss": 0.4906, + "step": 107620 + }, + { + "epoch": 5.345683917751068, + "grad_norm": 0.12890625, + "learning_rate": 0.0003723492599582795, + "loss": 0.4998, + "step": 107630 + }, + { + "epoch": 5.346180590046687, + "grad_norm": 0.1162109375, + "learning_rate": 0.00037230952617463, + "loss": 0.4901, + "step": 107640 + }, + { + "epoch": 5.346677262342307, + "grad_norm": 0.1162109375, + "learning_rate": 0.00037226979239098046, + "loss": 0.4998, + "step": 107650 + }, + { + "epoch": 5.347173934637926, + "grad_norm": 0.142578125, + "learning_rate": 0.0003722300586073309, + "loss": 0.4889, + "step": 107660 + }, + { + "epoch": 5.347670606933545, + "grad_norm": 0.1494140625, + "learning_rate": 0.00037219032482368135, + "loss": 0.5287, + "step": 107670 + }, + { + "epoch": 5.348167279229164, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003721505910400318, + "loss": 0.5008, + "step": 107680 + }, + { + "epoch": 5.348663951524784, + "grad_norm": 0.1220703125, + "learning_rate": 0.00037211085725638224, + "loss": 0.4908, + "step": 107690 + }, + { + "epoch": 5.349160623820404, + "grad_norm": 0.1201171875, + "learning_rate": 0.0003720711234727327, + "loss": 0.4943, + "step": 107700 + }, + { + "epoch": 5.349657296116023, + "grad_norm": 0.1435546875, + "learning_rate": 0.00037203138968908313, + "loss": 0.5044, + "step": 107710 + }, + { + "epoch": 5.350153968411642, + "grad_norm": 0.12890625, + "learning_rate": 0.00037199165590543365, + "loss": 0.4952, + "step": 107720 + }, + { + "epoch": 5.350650640707261, + "grad_norm": 0.12109375, + "learning_rate": 0.00037195192212178407, + "loss": 0.5112, + "step": 107730 + }, + { + "epoch": 5.351147313002881, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003719121883381345, + "loss": 0.4978, + "step": 107740 + }, + { + "epoch": 5.3516439852985, + "grad_norm": 0.11083984375, + "learning_rate": 0.00037187245455448496, + "loss": 0.4872, + "step": 107750 + }, + { + "epoch": 5.352140657594119, + "grad_norm": 0.11474609375, + "learning_rate": 0.00037183272077083543, + "loss": 0.5244, + "step": 107760 + }, + { + "epoch": 5.352637329889738, + "grad_norm": 0.126953125, + "learning_rate": 0.00037179298698718585, + "loss": 0.5032, + "step": 107770 + }, + { + "epoch": 5.353134002185358, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003717532532035363, + "loss": 0.5246, + "step": 107780 + }, + { + "epoch": 5.353630674480978, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003717135194198868, + "loss": 0.4951, + "step": 107790 + }, + { + "epoch": 5.354127346776597, + "grad_norm": 0.130859375, + "learning_rate": 0.00037167378563623726, + "loss": 0.5311, + "step": 107800 + }, + { + "epoch": 5.354624019072216, + "grad_norm": 0.130859375, + "learning_rate": 0.0003716340518525877, + "loss": 0.5226, + "step": 107810 + }, + { + "epoch": 5.355120691367835, + "grad_norm": 0.11279296875, + "learning_rate": 0.0003715943180689381, + "loss": 0.5087, + "step": 107820 + }, + { + "epoch": 5.3556173636634545, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003715545842852886, + "loss": 0.4915, + "step": 107830 + }, + { + "epoch": 5.356114035959074, + "grad_norm": 0.10595703125, + "learning_rate": 0.00037151485050163904, + "loss": 0.4851, + "step": 107840 + }, + { + "epoch": 5.356610708254694, + "grad_norm": 0.10986328125, + "learning_rate": 0.00037147511671798946, + "loss": 0.4998, + "step": 107850 + }, + { + "epoch": 5.357107380550313, + "grad_norm": 0.134765625, + "learning_rate": 0.00037143538293433993, + "loss": 0.5159, + "step": 107860 + }, + { + "epoch": 5.357604052845932, + "grad_norm": 0.10302734375, + "learning_rate": 0.0003713956491506904, + "loss": 0.5063, + "step": 107870 + }, + { + "epoch": 5.358100725141552, + "grad_norm": 0.12353515625, + "learning_rate": 0.00037135591536704087, + "loss": 0.505, + "step": 107880 + }, + { + "epoch": 5.358597397437171, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003713161815833913, + "loss": 0.5133, + "step": 107890 + }, + { + "epoch": 5.35909406973279, + "grad_norm": 0.12109375, + "learning_rate": 0.00037127644779974176, + "loss": 0.489, + "step": 107900 + }, + { + "epoch": 5.359590742028409, + "grad_norm": 0.1162109375, + "learning_rate": 0.00037123671401609223, + "loss": 0.5074, + "step": 107910 + }, + { + "epoch": 5.360087414324029, + "grad_norm": 0.1455078125, + "learning_rate": 0.00037119698023244265, + "loss": 0.4719, + "step": 107920 + }, + { + "epoch": 5.360584086619649, + "grad_norm": 0.1220703125, + "learning_rate": 0.00037115724644879307, + "loss": 0.4836, + "step": 107930 + }, + { + "epoch": 5.361080758915268, + "grad_norm": 0.1474609375, + "learning_rate": 0.00037111751266514354, + "loss": 0.5127, + "step": 107940 + }, + { + "epoch": 5.361577431210887, + "grad_norm": 0.12890625, + "learning_rate": 0.000371077778881494, + "loss": 0.5171, + "step": 107950 + }, + { + "epoch": 5.362074103506506, + "grad_norm": 0.134765625, + "learning_rate": 0.0003710380450978445, + "loss": 0.4905, + "step": 107960 + }, + { + "epoch": 5.3625707758021255, + "grad_norm": 0.1455078125, + "learning_rate": 0.0003709983113141949, + "loss": 0.4623, + "step": 107970 + }, + { + "epoch": 5.363067448097745, + "grad_norm": 0.11474609375, + "learning_rate": 0.00037095857753054537, + "loss": 0.5042, + "step": 107980 + }, + { + "epoch": 5.363564120393365, + "grad_norm": 0.138671875, + "learning_rate": 0.00037091884374689584, + "loss": 0.523, + "step": 107990 + }, + { + "epoch": 5.364060792688984, + "grad_norm": 0.12060546875, + "learning_rate": 0.00037087910996324626, + "loss": 0.4975, + "step": 108000 + }, + { + "epoch": 5.364557464984603, + "grad_norm": 0.1201171875, + "learning_rate": 0.0003708393761795967, + "loss": 0.472, + "step": 108010 + }, + { + "epoch": 5.3650541372802225, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003707996423959472, + "loss": 0.5176, + "step": 108020 + }, + { + "epoch": 5.365550809575842, + "grad_norm": 0.1650390625, + "learning_rate": 0.0003707599086122976, + "loss": 0.505, + "step": 108030 + }, + { + "epoch": 5.366047481871461, + "grad_norm": 0.11328125, + "learning_rate": 0.0003707201748286481, + "loss": 0.522, + "step": 108040 + }, + { + "epoch": 5.36654415416708, + "grad_norm": 0.1494140625, + "learning_rate": 0.0003706804410449985, + "loss": 0.503, + "step": 108050 + }, + { + "epoch": 5.3670408264627, + "grad_norm": 0.16796875, + "learning_rate": 0.000370640707261349, + "loss": 0.5157, + "step": 108060 + }, + { + "epoch": 5.3675374987583195, + "grad_norm": 0.1162109375, + "learning_rate": 0.00037060097347769945, + "loss": 0.497, + "step": 108070 + }, + { + "epoch": 5.368034171053939, + "grad_norm": 0.1298828125, + "learning_rate": 0.00037056123969404986, + "loss": 0.4749, + "step": 108080 + }, + { + "epoch": 5.368530843349558, + "grad_norm": 0.1298828125, + "learning_rate": 0.00037052150591040034, + "loss": 0.4949, + "step": 108090 + }, + { + "epoch": 5.369027515645177, + "grad_norm": 0.12109375, + "learning_rate": 0.0003704817721267508, + "loss": 0.4938, + "step": 108100 + }, + { + "epoch": 5.3695241879407964, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003704420383431012, + "loss": 0.48, + "step": 108110 + }, + { + "epoch": 5.370020860236416, + "grad_norm": 0.126953125, + "learning_rate": 0.0003704023045594517, + "loss": 0.536, + "step": 108120 + }, + { + "epoch": 5.370517532532036, + "grad_norm": 0.1025390625, + "learning_rate": 0.00037036257077580217, + "loss": 0.4826, + "step": 108130 + }, + { + "epoch": 5.371014204827655, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003703228369921526, + "loss": 0.4985, + "step": 108140 + }, + { + "epoch": 5.371510877123274, + "grad_norm": 0.1298828125, + "learning_rate": 0.00037028310320850306, + "loss": 0.5027, + "step": 108150 + }, + { + "epoch": 5.3720075494188935, + "grad_norm": 0.15234375, + "learning_rate": 0.00037024336942485347, + "loss": 0.4979, + "step": 108160 + }, + { + "epoch": 5.372504221714513, + "grad_norm": 0.1904296875, + "learning_rate": 0.000370203635641204, + "loss": 0.4905, + "step": 108170 + }, + { + "epoch": 5.373000894010132, + "grad_norm": 0.138671875, + "learning_rate": 0.0003701639018575544, + "loss": 0.5008, + "step": 108180 + }, + { + "epoch": 5.373497566305751, + "grad_norm": 0.1083984375, + "learning_rate": 0.00037012416807390483, + "loss": 0.4952, + "step": 108190 + }, + { + "epoch": 5.373994238601371, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003700844342902553, + "loss": 0.5018, + "step": 108200 + }, + { + "epoch": 5.3744909108969905, + "grad_norm": 0.115234375, + "learning_rate": 0.0003700447005066058, + "loss": 0.486, + "step": 108210 + }, + { + "epoch": 5.37498758319261, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003700049667229562, + "loss": 0.5243, + "step": 108220 + }, + { + "epoch": 5.375484255488229, + "grad_norm": 0.12451171875, + "learning_rate": 0.00036996523293930666, + "loss": 0.4858, + "step": 108230 + }, + { + "epoch": 5.375980927783848, + "grad_norm": 0.1171875, + "learning_rate": 0.0003699254991556571, + "loss": 0.4943, + "step": 108240 + }, + { + "epoch": 5.376477600079467, + "grad_norm": 0.14453125, + "learning_rate": 0.0003698857653720076, + "loss": 0.5185, + "step": 108250 + }, + { + "epoch": 5.376974272375087, + "grad_norm": 0.11572265625, + "learning_rate": 0.000369846031588358, + "loss": 0.4881, + "step": 108260 + }, + { + "epoch": 5.377470944670706, + "grad_norm": 0.1240234375, + "learning_rate": 0.00036980629780470844, + "loss": 0.501, + "step": 108270 + }, + { + "epoch": 5.377967616966326, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003697665640210589, + "loss": 0.5114, + "step": 108280 + }, + { + "epoch": 5.378464289261945, + "grad_norm": 0.1328125, + "learning_rate": 0.0003697268302374094, + "loss": 0.5176, + "step": 108290 + }, + { + "epoch": 5.378960961557564, + "grad_norm": 0.1201171875, + "learning_rate": 0.0003696870964537598, + "loss": 0.516, + "step": 108300 + }, + { + "epoch": 5.379457633853184, + "grad_norm": 0.1455078125, + "learning_rate": 0.00036964736267011027, + "loss": 0.5143, + "step": 108310 + }, + { + "epoch": 5.379954306148803, + "grad_norm": 0.138671875, + "learning_rate": 0.00036960762888646074, + "loss": 0.5209, + "step": 108320 + }, + { + "epoch": 5.380450978444422, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003695678951028112, + "loss": 0.5073, + "step": 108330 + }, + { + "epoch": 5.380947650740041, + "grad_norm": 0.11376953125, + "learning_rate": 0.00036952816131916163, + "loss": 0.4927, + "step": 108340 + }, + { + "epoch": 5.3814443230356614, + "grad_norm": 0.1357421875, + "learning_rate": 0.00036948842753551205, + "loss": 0.4988, + "step": 108350 + }, + { + "epoch": 5.381940995331281, + "grad_norm": 0.11865234375, + "learning_rate": 0.0003694486937518626, + "loss": 0.5168, + "step": 108360 + }, + { + "epoch": 5.3824376676269, + "grad_norm": 0.12109375, + "learning_rate": 0.000369408959968213, + "loss": 0.4888, + "step": 108370 + }, + { + "epoch": 5.382934339922519, + "grad_norm": 0.1806640625, + "learning_rate": 0.0003693692261845634, + "loss": 0.5031, + "step": 108380 + }, + { + "epoch": 5.383431012218138, + "grad_norm": 0.10546875, + "learning_rate": 0.0003693294924009139, + "loss": 0.4912, + "step": 108390 + }, + { + "epoch": 5.383927684513758, + "grad_norm": 0.11376953125, + "learning_rate": 0.00036928975861726435, + "loss": 0.5249, + "step": 108400 + }, + { + "epoch": 5.384424356809377, + "grad_norm": 0.126953125, + "learning_rate": 0.0003692500248336148, + "loss": 0.4943, + "step": 108410 + }, + { + "epoch": 5.384921029104997, + "grad_norm": 0.162109375, + "learning_rate": 0.00036921029104996524, + "loss": 0.4961, + "step": 108420 + }, + { + "epoch": 5.385417701400616, + "grad_norm": 0.1552734375, + "learning_rate": 0.0003691705572663157, + "loss": 0.521, + "step": 108430 + }, + { + "epoch": 5.385914373696235, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003691308234826662, + "loss": 0.5057, + "step": 108440 + }, + { + "epoch": 5.386411045991855, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003690910896990166, + "loss": 0.4832, + "step": 108450 + }, + { + "epoch": 5.386907718287474, + "grad_norm": 0.140625, + "learning_rate": 0.000369051355915367, + "loss": 0.4958, + "step": 108460 + }, + { + "epoch": 5.387404390583093, + "grad_norm": 0.130859375, + "learning_rate": 0.00036901162213171754, + "loss": 0.5162, + "step": 108470 + }, + { + "epoch": 5.387901062878712, + "grad_norm": 0.138671875, + "learning_rate": 0.00036897188834806796, + "loss": 0.4704, + "step": 108480 + }, + { + "epoch": 5.3883977351743315, + "grad_norm": 0.1416015625, + "learning_rate": 0.00036893215456441843, + "loss": 0.4988, + "step": 108490 + }, + { + "epoch": 5.388894407469952, + "grad_norm": 0.15625, + "learning_rate": 0.00036889242078076885, + "loss": 0.4893, + "step": 108500 + }, + { + "epoch": 5.389391079765571, + "grad_norm": 0.1328125, + "learning_rate": 0.0003688526869971193, + "loss": 0.509, + "step": 108510 + }, + { + "epoch": 5.38988775206119, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003688129532134698, + "loss": 0.5246, + "step": 108520 + }, + { + "epoch": 5.390384424356809, + "grad_norm": 0.125, + "learning_rate": 0.0003687732194298202, + "loss": 0.5198, + "step": 108530 + }, + { + "epoch": 5.3908810966524285, + "grad_norm": 0.1123046875, + "learning_rate": 0.0003687334856461707, + "loss": 0.4906, + "step": 108540 + }, + { + "epoch": 5.391377768948048, + "grad_norm": 0.10791015625, + "learning_rate": 0.00036869375186252115, + "loss": 0.494, + "step": 108550 + }, + { + "epoch": 5.391874441243667, + "grad_norm": 0.16015625, + "learning_rate": 0.00036865401807887157, + "loss": 0.5274, + "step": 108560 + }, + { + "epoch": 5.392371113539287, + "grad_norm": 0.115234375, + "learning_rate": 0.00036861428429522204, + "loss": 0.5298, + "step": 108570 + }, + { + "epoch": 5.392867785834906, + "grad_norm": 0.11328125, + "learning_rate": 0.00036857455051157246, + "loss": 0.4937, + "step": 108580 + }, + { + "epoch": 5.393364458130526, + "grad_norm": 0.1318359375, + "learning_rate": 0.00036853481672792293, + "loss": 0.5091, + "step": 108590 + }, + { + "epoch": 5.393861130426145, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003684950829442734, + "loss": 0.5224, + "step": 108600 + }, + { + "epoch": 5.394357802721764, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003684553491606238, + "loss": 0.5062, + "step": 108610 + }, + { + "epoch": 5.394854475017383, + "grad_norm": 0.138671875, + "learning_rate": 0.0003684156153769743, + "loss": 0.4836, + "step": 108620 + }, + { + "epoch": 5.3953511473130025, + "grad_norm": 0.12109375, + "learning_rate": 0.00036837588159332476, + "loss": 0.5242, + "step": 108630 + }, + { + "epoch": 5.395847819608623, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003683361478096752, + "loss": 0.5123, + "step": 108640 + }, + { + "epoch": 5.396344491904242, + "grad_norm": 0.1279296875, + "learning_rate": 0.00036829641402602565, + "loss": 0.4747, + "step": 108650 + }, + { + "epoch": 5.396841164199861, + "grad_norm": 0.13671875, + "learning_rate": 0.0003682566802423761, + "loss": 0.4758, + "step": 108660 + }, + { + "epoch": 5.39733783649548, + "grad_norm": 0.16796875, + "learning_rate": 0.00036821694645872654, + "loss": 0.5006, + "step": 108670 + }, + { + "epoch": 5.3978345087910995, + "grad_norm": 0.1298828125, + "learning_rate": 0.000368177212675077, + "loss": 0.4897, + "step": 108680 + }, + { + "epoch": 5.398331181086719, + "grad_norm": 0.119140625, + "learning_rate": 0.0003681374788914274, + "loss": 0.5123, + "step": 108690 + }, + { + "epoch": 5.398827853382338, + "grad_norm": 0.1240234375, + "learning_rate": 0.00036809774510777795, + "loss": 0.4788, + "step": 108700 + }, + { + "epoch": 5.399324525677958, + "grad_norm": 0.1337890625, + "learning_rate": 0.00036805801132412837, + "loss": 0.4871, + "step": 108710 + }, + { + "epoch": 5.399821197973577, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003680182775404788, + "loss": 0.4988, + "step": 108720 + }, + { + "epoch": 5.4003178702691965, + "grad_norm": 0.1123046875, + "learning_rate": 0.00036797854375682926, + "loss": 0.49, + "step": 108730 + }, + { + "epoch": 5.400814542564816, + "grad_norm": 0.130859375, + "learning_rate": 0.00036793880997317973, + "loss": 0.5178, + "step": 108740 + }, + { + "epoch": 5.401311214860435, + "grad_norm": 0.14453125, + "learning_rate": 0.00036789907618953014, + "loss": 0.5088, + "step": 108750 + }, + { + "epoch": 5.401807887156054, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003678593424058806, + "loss": 0.502, + "step": 108760 + }, + { + "epoch": 5.402304559451673, + "grad_norm": 0.1513671875, + "learning_rate": 0.00036781960862223103, + "loss": 0.4923, + "step": 108770 + }, + { + "epoch": 5.4028012317472935, + "grad_norm": 0.1328125, + "learning_rate": 0.00036777987483858156, + "loss": 0.4984, + "step": 108780 + }, + { + "epoch": 5.403297904042913, + "grad_norm": 0.11572265625, + "learning_rate": 0.000367740141054932, + "loss": 0.5066, + "step": 108790 + }, + { + "epoch": 5.403794576338532, + "grad_norm": 0.1513671875, + "learning_rate": 0.0003677004072712824, + "loss": 0.5195, + "step": 108800 + }, + { + "epoch": 5.404291248634151, + "grad_norm": 0.1376953125, + "learning_rate": 0.00036766067348763286, + "loss": 0.5054, + "step": 108810 + }, + { + "epoch": 5.4047879209297705, + "grad_norm": 0.1171875, + "learning_rate": 0.00036762093970398334, + "loss": 0.5097, + "step": 108820 + }, + { + "epoch": 5.40528459322539, + "grad_norm": 0.1142578125, + "learning_rate": 0.00036758120592033375, + "loss": 0.504, + "step": 108830 + }, + { + "epoch": 5.405781265521009, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003675414721366842, + "loss": 0.5155, + "step": 108840 + }, + { + "epoch": 5.406277937816629, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003675017383530347, + "loss": 0.4882, + "step": 108850 + }, + { + "epoch": 5.406774610112248, + "grad_norm": 0.150390625, + "learning_rate": 0.00036746200456938517, + "loss": 0.5012, + "step": 108860 + }, + { + "epoch": 5.4072712824078675, + "grad_norm": 0.10498046875, + "learning_rate": 0.0003674222707857356, + "loss": 0.5125, + "step": 108870 + }, + { + "epoch": 5.407767954703487, + "grad_norm": 0.12109375, + "learning_rate": 0.000367382537002086, + "loss": 0.5049, + "step": 108880 + }, + { + "epoch": 5.408264626999106, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003673428032184365, + "loss": 0.5241, + "step": 108890 + }, + { + "epoch": 5.408761299294725, + "grad_norm": 0.115234375, + "learning_rate": 0.00036730306943478694, + "loss": 0.5068, + "step": 108900 + }, + { + "epoch": 5.409257971590344, + "grad_norm": 0.126953125, + "learning_rate": 0.0003672633356511374, + "loss": 0.496, + "step": 108910 + }, + { + "epoch": 5.4097546438859645, + "grad_norm": 0.1572265625, + "learning_rate": 0.00036722360186748783, + "loss": 0.5095, + "step": 108920 + }, + { + "epoch": 5.410251316181584, + "grad_norm": 0.169921875, + "learning_rate": 0.0003671838680838383, + "loss": 0.503, + "step": 108930 + }, + { + "epoch": 5.410747988477203, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003671441343001888, + "loss": 0.4936, + "step": 108940 + }, + { + "epoch": 5.411244660772822, + "grad_norm": 0.166015625, + "learning_rate": 0.0003671044005165392, + "loss": 0.4863, + "step": 108950 + }, + { + "epoch": 5.411741333068441, + "grad_norm": 0.1455078125, + "learning_rate": 0.00036706466673288966, + "loss": 0.5501, + "step": 108960 + }, + { + "epoch": 5.412238005364061, + "grad_norm": 0.111328125, + "learning_rate": 0.00036702493294924013, + "loss": 0.4985, + "step": 108970 + }, + { + "epoch": 5.41273467765968, + "grad_norm": 0.1201171875, + "learning_rate": 0.00036698519916559055, + "loss": 0.5307, + "step": 108980 + }, + { + "epoch": 5.413231349955299, + "grad_norm": 0.138671875, + "learning_rate": 0.000366945465381941, + "loss": 0.5281, + "step": 108990 + }, + { + "epoch": 5.413728022250919, + "grad_norm": 0.134765625, + "learning_rate": 0.0003669057315982915, + "loss": 0.5336, + "step": 109000 + }, + { + "epoch": 5.414224694546538, + "grad_norm": 0.150390625, + "learning_rate": 0.0003668659978146419, + "loss": 0.4793, + "step": 109010 + }, + { + "epoch": 5.414721366842158, + "grad_norm": 0.1650390625, + "learning_rate": 0.0003668262640309924, + "loss": 0.5175, + "step": 109020 + }, + { + "epoch": 5.415218039137777, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003667865302473428, + "loss": 0.4748, + "step": 109030 + }, + { + "epoch": 5.415714711433396, + "grad_norm": 0.12890625, + "learning_rate": 0.00036674679646369327, + "loss": 0.5304, + "step": 109040 + }, + { + "epoch": 5.416211383729015, + "grad_norm": 0.1484375, + "learning_rate": 0.00036670706268004374, + "loss": 0.5149, + "step": 109050 + }, + { + "epoch": 5.416708056024635, + "grad_norm": 0.125, + "learning_rate": 0.00036666732889639416, + "loss": 0.4983, + "step": 109060 + }, + { + "epoch": 5.417204728320255, + "grad_norm": 0.1259765625, + "learning_rate": 0.00036662759511274463, + "loss": 0.5059, + "step": 109070 + }, + { + "epoch": 5.417701400615874, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003665878613290951, + "loss": 0.4924, + "step": 109080 + }, + { + "epoch": 5.418198072911493, + "grad_norm": 0.11279296875, + "learning_rate": 0.0003665481275454455, + "loss": 0.4965, + "step": 109090 + }, + { + "epoch": 5.418694745207112, + "grad_norm": 0.1201171875, + "learning_rate": 0.000366508393761796, + "loss": 0.4815, + "step": 109100 + }, + { + "epoch": 5.419191417502732, + "grad_norm": 0.1240234375, + "learning_rate": 0.0003664686599781464, + "loss": 0.5138, + "step": 109110 + }, + { + "epoch": 5.419688089798351, + "grad_norm": 0.1328125, + "learning_rate": 0.0003664289261944969, + "loss": 0.4985, + "step": 109120 + }, + { + "epoch": 5.42018476209397, + "grad_norm": 0.12890625, + "learning_rate": 0.00036638919241084735, + "loss": 0.5111, + "step": 109130 + }, + { + "epoch": 5.42068143438959, + "grad_norm": 0.11328125, + "learning_rate": 0.00036634945862719777, + "loss": 0.5004, + "step": 109140 + }, + { + "epoch": 5.421178106685209, + "grad_norm": 0.12109375, + "learning_rate": 0.00036630972484354824, + "loss": 0.4852, + "step": 109150 + }, + { + "epoch": 5.421674778980829, + "grad_norm": 0.1328125, + "learning_rate": 0.0003662699910598987, + "loss": 0.5202, + "step": 109160 + }, + { + "epoch": 5.422171451276448, + "grad_norm": 0.1162109375, + "learning_rate": 0.00036623025727624913, + "loss": 0.4982, + "step": 109170 + }, + { + "epoch": 5.422668123572067, + "grad_norm": 0.1240234375, + "learning_rate": 0.0003661905234925996, + "loss": 0.4969, + "step": 109180 + }, + { + "epoch": 5.423164795867686, + "grad_norm": 0.11865234375, + "learning_rate": 0.00036615078970895007, + "loss": 0.5026, + "step": 109190 + }, + { + "epoch": 5.4236614681633055, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003661110559253005, + "loss": 0.5074, + "step": 109200 + }, + { + "epoch": 5.424158140458925, + "grad_norm": 0.1259765625, + "learning_rate": 0.00036607132214165096, + "loss": 0.4977, + "step": 109210 + }, + { + "epoch": 5.424654812754545, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003660315883580014, + "loss": 0.5052, + "step": 109220 + }, + { + "epoch": 5.425151485050164, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003659918545743519, + "loss": 0.5185, + "step": 109230 + }, + { + "epoch": 5.425648157345783, + "grad_norm": 0.1171875, + "learning_rate": 0.0003659521207907023, + "loss": 0.5034, + "step": 109240 + }, + { + "epoch": 5.4261448296414025, + "grad_norm": 0.14453125, + "learning_rate": 0.00036591238700705274, + "loss": 0.5235, + "step": 109250 + }, + { + "epoch": 5.426641501937022, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003658726532234032, + "loss": 0.5166, + "step": 109260 + }, + { + "epoch": 5.427138174232641, + "grad_norm": 0.138671875, + "learning_rate": 0.0003658329194397537, + "loss": 0.4626, + "step": 109270 + }, + { + "epoch": 5.42763484652826, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003657931856561041, + "loss": 0.5009, + "step": 109280 + }, + { + "epoch": 5.42813151882388, + "grad_norm": 0.12890625, + "learning_rate": 0.00036575345187245457, + "loss": 0.4857, + "step": 109290 + }, + { + "epoch": 5.4286281911195, + "grad_norm": 0.11328125, + "learning_rate": 0.00036571371808880504, + "loss": 0.5296, + "step": 109300 + }, + { + "epoch": 5.429124863415119, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003656739843051555, + "loss": 0.5115, + "step": 109310 + }, + { + "epoch": 5.429621535710738, + "grad_norm": 0.126953125, + "learning_rate": 0.00036563425052150593, + "loss": 0.506, + "step": 109320 + }, + { + "epoch": 5.430118208006357, + "grad_norm": 0.11962890625, + "learning_rate": 0.00036559451673785634, + "loss": 0.4808, + "step": 109330 + }, + { + "epoch": 5.4306148803019765, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003655547829542068, + "loss": 0.513, + "step": 109340 + }, + { + "epoch": 5.431111552597596, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003655150491705573, + "loss": 0.4674, + "step": 109350 + }, + { + "epoch": 5.431608224893216, + "grad_norm": 0.12890625, + "learning_rate": 0.00036547531538690776, + "loss": 0.5004, + "step": 109360 + }, + { + "epoch": 5.432104897188835, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003654355816032582, + "loss": 0.5174, + "step": 109370 + }, + { + "epoch": 5.432601569484454, + "grad_norm": 0.11376953125, + "learning_rate": 0.00036539584781960865, + "loss": 0.5371, + "step": 109380 + }, + { + "epoch": 5.4330982417800735, + "grad_norm": 0.12890625, + "learning_rate": 0.0003653561140359591, + "loss": 0.5203, + "step": 109390 + }, + { + "epoch": 5.433594914075693, + "grad_norm": 0.1142578125, + "learning_rate": 0.00036531638025230954, + "loss": 0.4986, + "step": 109400 + }, + { + "epoch": 5.434091586371312, + "grad_norm": 0.11767578125, + "learning_rate": 0.00036527664646865995, + "loss": 0.4866, + "step": 109410 + }, + { + "epoch": 5.434588258666931, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003652369126850105, + "loss": 0.5068, + "step": 109420 + }, + { + "epoch": 5.435084930962551, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003651971789013609, + "loss": 0.5019, + "step": 109430 + }, + { + "epoch": 5.4355816032581705, + "grad_norm": 0.134765625, + "learning_rate": 0.00036515744511771137, + "loss": 0.4791, + "step": 109440 + }, + { + "epoch": 5.43607827555379, + "grad_norm": 0.1435546875, + "learning_rate": 0.0003651177113340618, + "loss": 0.5114, + "step": 109450 + }, + { + "epoch": 5.436574947849409, + "grad_norm": 0.115234375, + "learning_rate": 0.00036507797755041226, + "loss": 0.491, + "step": 109460 + }, + { + "epoch": 5.437071620145028, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003650382437667627, + "loss": 0.504, + "step": 109470 + }, + { + "epoch": 5.437568292440647, + "grad_norm": 0.125, + "learning_rate": 0.00036499850998311314, + "loss": 0.5213, + "step": 109480 + }, + { + "epoch": 5.438064964736267, + "grad_norm": 0.12109375, + "learning_rate": 0.0003649587761994636, + "loss": 0.5141, + "step": 109490 + }, + { + "epoch": 5.438561637031887, + "grad_norm": 0.109375, + "learning_rate": 0.0003649190424158141, + "loss": 0.5127, + "step": 109500 + }, + { + "epoch": 5.439058309327506, + "grad_norm": 0.1796875, + "learning_rate": 0.0003648793086321645, + "loss": 0.49, + "step": 109510 + }, + { + "epoch": 5.439554981623125, + "grad_norm": 0.130859375, + "learning_rate": 0.000364839574848515, + "loss": 0.5063, + "step": 109520 + }, + { + "epoch": 5.4400516539187445, + "grad_norm": 0.1142578125, + "learning_rate": 0.00036479984106486545, + "loss": 0.4929, + "step": 109530 + }, + { + "epoch": 5.440548326214364, + "grad_norm": 0.130859375, + "learning_rate": 0.00036476010728121586, + "loss": 0.5066, + "step": 109540 + }, + { + "epoch": 5.441044998509983, + "grad_norm": 0.1259765625, + "learning_rate": 0.00036472037349756633, + "loss": 0.5232, + "step": 109550 + }, + { + "epoch": 5.441541670805602, + "grad_norm": 0.15234375, + "learning_rate": 0.00036468063971391675, + "loss": 0.5274, + "step": 109560 + }, + { + "epoch": 5.442038343101222, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003646409059302672, + "loss": 0.491, + "step": 109570 + }, + { + "epoch": 5.4425350153968415, + "grad_norm": 0.1611328125, + "learning_rate": 0.0003646011721466177, + "loss": 0.4898, + "step": 109580 + }, + { + "epoch": 5.443031687692461, + "grad_norm": 0.1201171875, + "learning_rate": 0.0003645614383629681, + "loss": 0.5091, + "step": 109590 + }, + { + "epoch": 5.44352835998808, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003645217045793186, + "loss": 0.5018, + "step": 109600 + }, + { + "epoch": 5.444025032283699, + "grad_norm": 0.1591796875, + "learning_rate": 0.00036448197079566905, + "loss": 0.5004, + "step": 109610 + }, + { + "epoch": 5.444521704579318, + "grad_norm": 0.14453125, + "learning_rate": 0.00036444223701201947, + "loss": 0.4917, + "step": 109620 + }, + { + "epoch": 5.445018376874938, + "grad_norm": 0.1279296875, + "learning_rate": 0.00036440250322836994, + "loss": 0.5041, + "step": 109630 + }, + { + "epoch": 5.445515049170558, + "grad_norm": 0.12255859375, + "learning_rate": 0.00036436276944472036, + "loss": 0.5227, + "step": 109640 + }, + { + "epoch": 5.446011721466177, + "grad_norm": 0.115234375, + "learning_rate": 0.00036432303566107083, + "loss": 0.509, + "step": 109650 + }, + { + "epoch": 5.446508393761796, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003642833018774213, + "loss": 0.4936, + "step": 109660 + }, + { + "epoch": 5.447005066057415, + "grad_norm": 0.1162109375, + "learning_rate": 0.0003642435680937717, + "loss": 0.4939, + "step": 109670 + }, + { + "epoch": 5.447501738353035, + "grad_norm": 0.12890625, + "learning_rate": 0.0003642038343101222, + "loss": 0.4946, + "step": 109680 + }, + { + "epoch": 5.447998410648654, + "grad_norm": 0.1162109375, + "learning_rate": 0.00036416410052647266, + "loss": 0.5041, + "step": 109690 + }, + { + "epoch": 5.448495082944273, + "grad_norm": 0.119140625, + "learning_rate": 0.0003641243667428231, + "loss": 0.4819, + "step": 109700 + }, + { + "epoch": 5.448991755239892, + "grad_norm": 0.14453125, + "learning_rate": 0.00036408463295917355, + "loss": 0.5064, + "step": 109710 + }, + { + "epoch": 5.449488427535512, + "grad_norm": 0.1630859375, + "learning_rate": 0.000364044899175524, + "loss": 0.5081, + "step": 109720 + }, + { + "epoch": 5.449985099831132, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003640051653918745, + "loss": 0.503, + "step": 109730 + }, + { + "epoch": 5.450481772126751, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003639654316082249, + "loss": 0.52, + "step": 109740 + }, + { + "epoch": 5.45097844442237, + "grad_norm": 0.11669921875, + "learning_rate": 0.00036392569782457533, + "loss": 0.4944, + "step": 109750 + }, + { + "epoch": 5.451475116717989, + "grad_norm": 0.16796875, + "learning_rate": 0.00036388596404092585, + "loss": 0.5124, + "step": 109760 + }, + { + "epoch": 5.451971789013609, + "grad_norm": 0.119140625, + "learning_rate": 0.00036384623025727627, + "loss": 0.5257, + "step": 109770 + }, + { + "epoch": 5.452468461309228, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003638064964736267, + "loss": 0.5101, + "step": 109780 + }, + { + "epoch": 5.452965133604848, + "grad_norm": 0.126953125, + "learning_rate": 0.00036376676268997716, + "loss": 0.4994, + "step": 109790 + }, + { + "epoch": 5.453461805900467, + "grad_norm": 0.1181640625, + "learning_rate": 0.00036372702890632763, + "loss": 0.5259, + "step": 109800 + }, + { + "epoch": 5.453958478196086, + "grad_norm": 0.111328125, + "learning_rate": 0.0003636872951226781, + "loss": 0.4796, + "step": 109810 + }, + { + "epoch": 5.454455150491706, + "grad_norm": 0.11279296875, + "learning_rate": 0.0003636475613390285, + "loss": 0.477, + "step": 109820 + }, + { + "epoch": 5.454951822787325, + "grad_norm": 0.12158203125, + "learning_rate": 0.000363607827555379, + "loss": 0.493, + "step": 109830 + }, + { + "epoch": 5.455448495082944, + "grad_norm": 0.123046875, + "learning_rate": 0.00036356809377172946, + "loss": 0.516, + "step": 109840 + }, + { + "epoch": 5.455945167378563, + "grad_norm": 0.150390625, + "learning_rate": 0.0003635283599880799, + "loss": 0.4964, + "step": 109850 + }, + { + "epoch": 5.4564418396741825, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003634886262044303, + "loss": 0.5178, + "step": 109860 + }, + { + "epoch": 5.456938511969803, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003634488924207808, + "loss": 0.5005, + "step": 109870 + }, + { + "epoch": 5.457435184265422, + "grad_norm": 0.126953125, + "learning_rate": 0.00036340915863713124, + "loss": 0.4756, + "step": 109880 + }, + { + "epoch": 5.457931856561041, + "grad_norm": 0.1611328125, + "learning_rate": 0.0003633694248534817, + "loss": 0.5116, + "step": 109890 + }, + { + "epoch": 5.45842852885666, + "grad_norm": 0.12353515625, + "learning_rate": 0.00036332969106983213, + "loss": 0.511, + "step": 109900 + }, + { + "epoch": 5.4589252011522795, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003632899572861826, + "loss": 0.4732, + "step": 109910 + }, + { + "epoch": 5.459421873447899, + "grad_norm": 0.115234375, + "learning_rate": 0.00036325022350253307, + "loss": 0.487, + "step": 109920 + }, + { + "epoch": 5.459918545743518, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003632104897188835, + "loss": 0.5186, + "step": 109930 + }, + { + "epoch": 5.460415218039138, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003631707559352339, + "loss": 0.5075, + "step": 109940 + }, + { + "epoch": 5.460911890334757, + "grad_norm": 0.138671875, + "learning_rate": 0.00036313102215158443, + "loss": 0.5035, + "step": 109950 + }, + { + "epoch": 5.4614085626303766, + "grad_norm": 0.1025390625, + "learning_rate": 0.00036309128836793485, + "loss": 0.4867, + "step": 109960 + }, + { + "epoch": 5.461905234925996, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003630515545842853, + "loss": 0.4873, + "step": 109970 + }, + { + "epoch": 5.462401907221615, + "grad_norm": 0.2021484375, + "learning_rate": 0.00036301182080063574, + "loss": 0.5273, + "step": 109980 + }, + { + "epoch": 5.462898579517234, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003629720870169862, + "loss": 0.4841, + "step": 109990 + }, + { + "epoch": 5.4633952518128535, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003629323532333367, + "loss": 0.4807, + "step": 110000 + }, + { + "epoch": 5.463891924108474, + "grad_norm": 0.12109375, + "learning_rate": 0.0003628926194496871, + "loss": 0.5149, + "step": 110010 + }, + { + "epoch": 5.464388596404093, + "grad_norm": 0.1435546875, + "learning_rate": 0.00036285288566603757, + "loss": 0.5286, + "step": 110020 + }, + { + "epoch": 5.464885268699712, + "grad_norm": 0.1279296875, + "learning_rate": 0.00036281315188238804, + "loss": 0.5148, + "step": 110030 + }, + { + "epoch": 5.465381940995331, + "grad_norm": 0.119140625, + "learning_rate": 0.00036277341809873846, + "loss": 0.5212, + "step": 110040 + }, + { + "epoch": 5.4658786132909505, + "grad_norm": 0.1875, + "learning_rate": 0.0003627336843150889, + "loss": 0.5017, + "step": 110050 + }, + { + "epoch": 5.46637528558657, + "grad_norm": 0.1904296875, + "learning_rate": 0.0003626939505314394, + "loss": 0.5164, + "step": 110060 + }, + { + "epoch": 5.466871957882189, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003626542167477898, + "loss": 0.5149, + "step": 110070 + }, + { + "epoch": 5.467368630177809, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003626144829641403, + "loss": 0.4969, + "step": 110080 + }, + { + "epoch": 5.467865302473428, + "grad_norm": 0.10400390625, + "learning_rate": 0.0003625747491804907, + "loss": 0.5312, + "step": 110090 + }, + { + "epoch": 5.4683619747690475, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003625350153968412, + "loss": 0.4916, + "step": 110100 + }, + { + "epoch": 5.468858647064667, + "grad_norm": 0.11865234375, + "learning_rate": 0.00036249528161319165, + "loss": 0.4932, + "step": 110110 + }, + { + "epoch": 5.469355319360286, + "grad_norm": 0.12060546875, + "learning_rate": 0.00036245554782954206, + "loss": 0.5117, + "step": 110120 + }, + { + "epoch": 5.469851991655905, + "grad_norm": 0.1123046875, + "learning_rate": 0.00036241581404589254, + "loss": 0.5093, + "step": 110130 + }, + { + "epoch": 5.470348663951524, + "grad_norm": 0.11767578125, + "learning_rate": 0.000362376080262243, + "loss": 0.5008, + "step": 110140 + }, + { + "epoch": 5.4708453362471445, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003623363464785934, + "loss": 0.4835, + "step": 110150 + }, + { + "epoch": 5.471342008542764, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003622966126949439, + "loss": 0.5138, + "step": 110160 + }, + { + "epoch": 5.471838680838383, + "grad_norm": 0.1142578125, + "learning_rate": 0.00036225687891129437, + "loss": 0.4963, + "step": 110170 + }, + { + "epoch": 5.472335353134002, + "grad_norm": 0.140625, + "learning_rate": 0.00036221714512764484, + "loss": 0.5067, + "step": 110180 + }, + { + "epoch": 5.472832025429621, + "grad_norm": 0.1689453125, + "learning_rate": 0.00036217741134399525, + "loss": 0.4944, + "step": 110190 + }, + { + "epoch": 5.473328697725241, + "grad_norm": 0.1630859375, + "learning_rate": 0.00036213767756034567, + "loss": 0.5065, + "step": 110200 + }, + { + "epoch": 5.47382537002086, + "grad_norm": 0.1337890625, + "learning_rate": 0.00036209794377669614, + "loss": 0.5064, + "step": 110210 + }, + { + "epoch": 5.47432204231648, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003620582099930466, + "loss": 0.4773, + "step": 110220 + }, + { + "epoch": 5.474818714612099, + "grad_norm": 0.11962890625, + "learning_rate": 0.00036201847620939703, + "loss": 0.4551, + "step": 110230 + }, + { + "epoch": 5.4753153869077185, + "grad_norm": 0.115234375, + "learning_rate": 0.0003619787424257475, + "loss": 0.4933, + "step": 110240 + }, + { + "epoch": 5.475812059203338, + "grad_norm": 0.134765625, + "learning_rate": 0.000361939008642098, + "loss": 0.4873, + "step": 110250 + }, + { + "epoch": 5.476308731498957, + "grad_norm": 0.12109375, + "learning_rate": 0.00036189927485844845, + "loss": 0.4914, + "step": 110260 + }, + { + "epoch": 5.476805403794576, + "grad_norm": 0.130859375, + "learning_rate": 0.00036185954107479886, + "loss": 0.518, + "step": 110270 + }, + { + "epoch": 5.477302076090195, + "grad_norm": 0.1455078125, + "learning_rate": 0.0003618198072911493, + "loss": 0.4907, + "step": 110280 + }, + { + "epoch": 5.4777987483858155, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003617800735074998, + "loss": 0.4896, + "step": 110290 + }, + { + "epoch": 5.478295420681435, + "grad_norm": 0.138671875, + "learning_rate": 0.0003617403397238502, + "loss": 0.4898, + "step": 110300 + }, + { + "epoch": 5.478792092977054, + "grad_norm": 0.1298828125, + "learning_rate": 0.00036170060594020064, + "loss": 0.5006, + "step": 110310 + }, + { + "epoch": 5.479288765272673, + "grad_norm": 0.119140625, + "learning_rate": 0.0003616608721565511, + "loss": 0.5047, + "step": 110320 + }, + { + "epoch": 5.479785437568292, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003616211383729016, + "loss": 0.5147, + "step": 110330 + }, + { + "epoch": 5.480282109863912, + "grad_norm": 0.1435546875, + "learning_rate": 0.00036158140458925205, + "loss": 0.5337, + "step": 110340 + }, + { + "epoch": 5.480778782159531, + "grad_norm": 0.11572265625, + "learning_rate": 0.00036154167080560247, + "loss": 0.5247, + "step": 110350 + }, + { + "epoch": 5.48127545445515, + "grad_norm": 0.11767578125, + "learning_rate": 0.00036150193702195294, + "loss": 0.5077, + "step": 110360 + }, + { + "epoch": 5.48177212675077, + "grad_norm": 0.119140625, + "learning_rate": 0.0003614622032383034, + "loss": 0.4701, + "step": 110370 + }, + { + "epoch": 5.482268799046389, + "grad_norm": 0.1201171875, + "learning_rate": 0.00036142246945465383, + "loss": 0.5172, + "step": 110380 + }, + { + "epoch": 5.482765471342009, + "grad_norm": 0.111328125, + "learning_rate": 0.00036138273567100425, + "loss": 0.5148, + "step": 110390 + }, + { + "epoch": 5.483262143637628, + "grad_norm": 0.123046875, + "learning_rate": 0.0003613430018873548, + "loss": 0.5129, + "step": 110400 + }, + { + "epoch": 5.483758815933247, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003613032681037052, + "loss": 0.4989, + "step": 110410 + }, + { + "epoch": 5.484255488228866, + "grad_norm": 0.16015625, + "learning_rate": 0.00036126353432005566, + "loss": 0.4895, + "step": 110420 + }, + { + "epoch": 5.4847521605244856, + "grad_norm": 0.1171875, + "learning_rate": 0.0003612238005364061, + "loss": 0.4899, + "step": 110430 + }, + { + "epoch": 5.485248832820106, + "grad_norm": 0.1484375, + "learning_rate": 0.00036118406675275655, + "loss": 0.525, + "step": 110440 + }, + { + "epoch": 5.485745505115725, + "grad_norm": 0.1142578125, + "learning_rate": 0.000361144332969107, + "loss": 0.4797, + "step": 110450 + }, + { + "epoch": 5.486242177411344, + "grad_norm": 0.154296875, + "learning_rate": 0.00036110459918545744, + "loss": 0.526, + "step": 110460 + }, + { + "epoch": 5.486738849706963, + "grad_norm": 0.1298828125, + "learning_rate": 0.00036106486540180786, + "loss": 0.5061, + "step": 110470 + }, + { + "epoch": 5.487235522002583, + "grad_norm": 0.1572265625, + "learning_rate": 0.0003610251316181584, + "loss": 0.4925, + "step": 110480 + }, + { + "epoch": 5.487732194298202, + "grad_norm": 0.12109375, + "learning_rate": 0.0003609853978345088, + "loss": 0.4995, + "step": 110490 + }, + { + "epoch": 5.488228866593821, + "grad_norm": 0.115234375, + "learning_rate": 0.00036094566405085927, + "loss": 0.495, + "step": 110500 + }, + { + "epoch": 5.488725538889441, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003609059302672097, + "loss": 0.4999, + "step": 110510 + }, + { + "epoch": 5.48922221118506, + "grad_norm": 0.1083984375, + "learning_rate": 0.00036086619648356016, + "loss": 0.4985, + "step": 110520 + }, + { + "epoch": 5.48971888348068, + "grad_norm": 0.13671875, + "learning_rate": 0.00036082646269991063, + "loss": 0.4989, + "step": 110530 + }, + { + "epoch": 5.490215555776299, + "grad_norm": 0.1572265625, + "learning_rate": 0.00036078672891626105, + "loss": 0.5001, + "step": 110540 + }, + { + "epoch": 5.490712228071918, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003607469951326115, + "loss": 0.4665, + "step": 110550 + }, + { + "epoch": 5.491208900367537, + "grad_norm": 0.345703125, + "learning_rate": 0.000360707261348962, + "loss": 0.5204, + "step": 110560 + }, + { + "epoch": 5.4917055726631565, + "grad_norm": 0.18359375, + "learning_rate": 0.0003606675275653124, + "loss": 0.4917, + "step": 110570 + }, + { + "epoch": 5.492202244958776, + "grad_norm": 0.12890625, + "learning_rate": 0.0003606277937816629, + "loss": 0.4965, + "step": 110580 + }, + { + "epoch": 5.492698917254396, + "grad_norm": 0.115234375, + "learning_rate": 0.00036058805999801335, + "loss": 0.5131, + "step": 110590 + }, + { + "epoch": 5.493195589550015, + "grad_norm": 0.1103515625, + "learning_rate": 0.00036054832621436377, + "loss": 0.4856, + "step": 110600 + }, + { + "epoch": 5.493692261845634, + "grad_norm": 0.1611328125, + "learning_rate": 0.00036050859243071424, + "loss": 0.4854, + "step": 110610 + }, + { + "epoch": 5.4941889341412535, + "grad_norm": 0.12109375, + "learning_rate": 0.00036046885864706466, + "loss": 0.4969, + "step": 110620 + }, + { + "epoch": 5.494685606436873, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003604291248634152, + "loss": 0.5093, + "step": 110630 + }, + { + "epoch": 5.495182278732492, + "grad_norm": 0.1474609375, + "learning_rate": 0.0003603893910797656, + "loss": 0.5054, + "step": 110640 + }, + { + "epoch": 5.495678951028111, + "grad_norm": 0.11669921875, + "learning_rate": 0.000360349657296116, + "loss": 0.4877, + "step": 110650 + }, + { + "epoch": 5.496175623323731, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003603099235124665, + "loss": 0.5036, + "step": 110660 + }, + { + "epoch": 5.496672295619351, + "grad_norm": 0.1435546875, + "learning_rate": 0.00036027018972881696, + "loss": 0.5093, + "step": 110670 + }, + { + "epoch": 5.49716896791497, + "grad_norm": 0.1435546875, + "learning_rate": 0.0003602304559451674, + "loss": 0.4902, + "step": 110680 + }, + { + "epoch": 5.497665640210589, + "grad_norm": 0.130859375, + "learning_rate": 0.00036019072216151785, + "loss": 0.4923, + "step": 110690 + }, + { + "epoch": 5.498162312506208, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003601509883778683, + "loss": 0.5205, + "step": 110700 + }, + { + "epoch": 5.4986589848018275, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003601112545942188, + "loss": 0.5088, + "step": 110710 + }, + { + "epoch": 5.499155657097447, + "grad_norm": 0.125, + "learning_rate": 0.0003600715208105692, + "loss": 0.5202, + "step": 110720 + }, + { + "epoch": 5.499652329393067, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003600317870269196, + "loss": 0.5072, + "step": 110730 + }, + { + "epoch": 5.500149001688686, + "grad_norm": 0.1171875, + "learning_rate": 0.0003599920532432701, + "loss": 0.5187, + "step": 110740 + }, + { + "epoch": 5.500645673984305, + "grad_norm": 0.12060546875, + "learning_rate": 0.00035995231945962057, + "loss": 0.4875, + "step": 110750 + }, + { + "epoch": 5.5011423462799245, + "grad_norm": 0.12109375, + "learning_rate": 0.000359912585675971, + "loss": 0.4937, + "step": 110760 + }, + { + "epoch": 5.501639018575544, + "grad_norm": 0.1259765625, + "learning_rate": 0.00035987285189232146, + "loss": 0.5154, + "step": 110770 + }, + { + "epoch": 5.502135690871163, + "grad_norm": 0.126953125, + "learning_rate": 0.0003598331181086719, + "loss": 0.521, + "step": 110780 + }, + { + "epoch": 5.502632363166782, + "grad_norm": 0.1513671875, + "learning_rate": 0.0003597933843250224, + "loss": 0.5052, + "step": 110790 + }, + { + "epoch": 5.503129035462402, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003597536505413728, + "loss": 0.4976, + "step": 110800 + }, + { + "epoch": 5.5036257077580215, + "grad_norm": 0.11181640625, + "learning_rate": 0.00035971391675772323, + "loss": 0.4796, + "step": 110810 + }, + { + "epoch": 5.504122380053641, + "grad_norm": 0.126953125, + "learning_rate": 0.00035967418297407376, + "loss": 0.491, + "step": 110820 + }, + { + "epoch": 5.50461905234926, + "grad_norm": 0.12890625, + "learning_rate": 0.0003596344491904242, + "loss": 0.5089, + "step": 110830 + }, + { + "epoch": 5.505115724644879, + "grad_norm": 0.115234375, + "learning_rate": 0.0003595947154067746, + "loss": 0.5173, + "step": 110840 + }, + { + "epoch": 5.505612396940498, + "grad_norm": 0.1064453125, + "learning_rate": 0.00035955498162312506, + "loss": 0.4792, + "step": 110850 + }, + { + "epoch": 5.506109069236118, + "grad_norm": 0.14453125, + "learning_rate": 0.00035951524783947553, + "loss": 0.5183, + "step": 110860 + }, + { + "epoch": 5.506605741531738, + "grad_norm": 0.17578125, + "learning_rate": 0.000359475514055826, + "loss": 0.4963, + "step": 110870 + }, + { + "epoch": 5.507102413827357, + "grad_norm": 0.1650390625, + "learning_rate": 0.0003594357802721764, + "loss": 0.5182, + "step": 110880 + }, + { + "epoch": 5.507599086122976, + "grad_norm": 0.1611328125, + "learning_rate": 0.0003593960464885269, + "loss": 0.5177, + "step": 110890 + }, + { + "epoch": 5.5080957584185954, + "grad_norm": 0.115234375, + "learning_rate": 0.00035935631270487737, + "loss": 0.5087, + "step": 110900 + }, + { + "epoch": 5.508592430714215, + "grad_norm": 0.14453125, + "learning_rate": 0.0003593165789212278, + "loss": 0.5081, + "step": 110910 + }, + { + "epoch": 5.509089103009834, + "grad_norm": 0.125, + "learning_rate": 0.0003592768451375782, + "loss": 0.5092, + "step": 110920 + }, + { + "epoch": 5.509585775305453, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003592371113539287, + "loss": 0.501, + "step": 110930 + }, + { + "epoch": 5.510082447601073, + "grad_norm": 0.1318359375, + "learning_rate": 0.00035919737757027914, + "loss": 0.5157, + "step": 110940 + }, + { + "epoch": 5.5105791198966925, + "grad_norm": 0.109375, + "learning_rate": 0.0003591576437866296, + "loss": 0.4858, + "step": 110950 + }, + { + "epoch": 5.511075792192312, + "grad_norm": 0.1494140625, + "learning_rate": 0.00035911791000298003, + "loss": 0.5092, + "step": 110960 + }, + { + "epoch": 5.511572464487931, + "grad_norm": 0.15234375, + "learning_rate": 0.0003590781762193305, + "loss": 0.5241, + "step": 110970 + }, + { + "epoch": 5.51206913678355, + "grad_norm": 0.169921875, + "learning_rate": 0.000359038442435681, + "loss": 0.4872, + "step": 110980 + }, + { + "epoch": 5.512565809079169, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003589987086520314, + "loss": 0.5347, + "step": 110990 + }, + { + "epoch": 5.513062481374789, + "grad_norm": 0.154296875, + "learning_rate": 0.00035895897486838186, + "loss": 0.535, + "step": 111000 + }, + { + "epoch": 5.513559153670409, + "grad_norm": 0.11083984375, + "learning_rate": 0.00035891924108473233, + "loss": 0.4963, + "step": 111010 + }, + { + "epoch": 5.514055825966028, + "grad_norm": 0.134765625, + "learning_rate": 0.00035887950730108275, + "loss": 0.4886, + "step": 111020 + }, + { + "epoch": 5.514552498261647, + "grad_norm": 0.140625, + "learning_rate": 0.0003588397735174332, + "loss": 0.5294, + "step": 111030 + }, + { + "epoch": 5.515049170557266, + "grad_norm": 0.1630859375, + "learning_rate": 0.00035880003973378364, + "loss": 0.5258, + "step": 111040 + }, + { + "epoch": 5.515545842852886, + "grad_norm": 0.125, + "learning_rate": 0.0003587603059501341, + "loss": 0.513, + "step": 111050 + }, + { + "epoch": 5.516042515148505, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003587205721664846, + "loss": 0.4839, + "step": 111060 + }, + { + "epoch": 5.516539187444124, + "grad_norm": 0.1328125, + "learning_rate": 0.000358680838382835, + "loss": 0.49, + "step": 111070 + }, + { + "epoch": 5.517035859739744, + "grad_norm": 0.1220703125, + "learning_rate": 0.00035864110459918547, + "loss": 0.545, + "step": 111080 + }, + { + "epoch": 5.517532532035363, + "grad_norm": 0.173828125, + "learning_rate": 0.00035860137081553594, + "loss": 0.478, + "step": 111090 + }, + { + "epoch": 5.518029204330983, + "grad_norm": 0.1201171875, + "learning_rate": 0.00035856163703188636, + "loss": 0.4944, + "step": 111100 + }, + { + "epoch": 5.518525876626602, + "grad_norm": 0.11328125, + "learning_rate": 0.00035852190324823683, + "loss": 0.5055, + "step": 111110 + }, + { + "epoch": 5.519022548922221, + "grad_norm": 0.1787109375, + "learning_rate": 0.0003584821694645873, + "loss": 0.4963, + "step": 111120 + }, + { + "epoch": 5.51951922121784, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003584424356809377, + "loss": 0.522, + "step": 111130 + }, + { + "epoch": 5.52001589351346, + "grad_norm": 0.140625, + "learning_rate": 0.0003584027018972882, + "loss": 0.5008, + "step": 111140 + }, + { + "epoch": 5.520512565809079, + "grad_norm": 0.1630859375, + "learning_rate": 0.0003583629681136386, + "loss": 0.5025, + "step": 111150 + }, + { + "epoch": 5.521009238104699, + "grad_norm": 0.11962890625, + "learning_rate": 0.00035832323432998913, + "loss": 0.5103, + "step": 111160 + }, + { + "epoch": 5.521505910400318, + "grad_norm": 0.1279296875, + "learning_rate": 0.00035828350054633955, + "loss": 0.5237, + "step": 111170 + }, + { + "epoch": 5.522002582695937, + "grad_norm": 0.1396484375, + "learning_rate": 0.00035824376676268997, + "loss": 0.5219, + "step": 111180 + }, + { + "epoch": 5.522499254991557, + "grad_norm": 0.1767578125, + "learning_rate": 0.00035820403297904044, + "loss": 0.5106, + "step": 111190 + }, + { + "epoch": 5.522995927287176, + "grad_norm": 0.1884765625, + "learning_rate": 0.0003581642991953909, + "loss": 0.5001, + "step": 111200 + }, + { + "epoch": 5.523492599582795, + "grad_norm": 0.11328125, + "learning_rate": 0.00035812456541174133, + "loss": 0.5162, + "step": 111210 + }, + { + "epoch": 5.523989271878414, + "grad_norm": 0.1455078125, + "learning_rate": 0.0003580848316280918, + "loss": 0.5089, + "step": 111220 + }, + { + "epoch": 5.5244859441740335, + "grad_norm": 0.115234375, + "learning_rate": 0.00035804509784444227, + "loss": 0.5231, + "step": 111230 + }, + { + "epoch": 5.524982616469654, + "grad_norm": 0.12109375, + "learning_rate": 0.00035800536406079274, + "loss": 0.4949, + "step": 111240 + }, + { + "epoch": 5.525479288765273, + "grad_norm": 0.1435546875, + "learning_rate": 0.00035796563027714316, + "loss": 0.5107, + "step": 111250 + }, + { + "epoch": 5.525975961060892, + "grad_norm": 0.1103515625, + "learning_rate": 0.0003579258964934936, + "loss": 0.5279, + "step": 111260 + }, + { + "epoch": 5.526472633356511, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003578861627098441, + "loss": 0.5147, + "step": 111270 + }, + { + "epoch": 5.5269693056521305, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003578464289261945, + "loss": 0.4759, + "step": 111280 + }, + { + "epoch": 5.52746597794775, + "grad_norm": 0.1279296875, + "learning_rate": 0.00035780669514254494, + "loss": 0.4928, + "step": 111290 + }, + { + "epoch": 5.527962650243369, + "grad_norm": 0.138671875, + "learning_rate": 0.0003577669613588954, + "loss": 0.4903, + "step": 111300 + }, + { + "epoch": 5.528459322538989, + "grad_norm": 0.1455078125, + "learning_rate": 0.0003577272275752459, + "loss": 0.5192, + "step": 111310 + }, + { + "epoch": 5.528955994834608, + "grad_norm": 0.140625, + "learning_rate": 0.00035768749379159635, + "loss": 0.5061, + "step": 111320 + }, + { + "epoch": 5.5294526671302275, + "grad_norm": 0.12353515625, + "learning_rate": 0.00035764776000794677, + "loss": 0.4941, + "step": 111330 + }, + { + "epoch": 5.529949339425847, + "grad_norm": 0.1484375, + "learning_rate": 0.0003576080262242972, + "loss": 0.5264, + "step": 111340 + }, + { + "epoch": 5.530446011721466, + "grad_norm": 0.12890625, + "learning_rate": 0.0003575682924406477, + "loss": 0.4951, + "step": 111350 + }, + { + "epoch": 5.530942684017085, + "grad_norm": 0.138671875, + "learning_rate": 0.0003575285586569981, + "loss": 0.5178, + "step": 111360 + }, + { + "epoch": 5.5314393563127044, + "grad_norm": 0.1201171875, + "learning_rate": 0.00035748882487334854, + "loss": 0.5282, + "step": 111370 + }, + { + "epoch": 5.531936028608325, + "grad_norm": 0.10888671875, + "learning_rate": 0.000357449091089699, + "loss": 0.5222, + "step": 111380 + }, + { + "epoch": 5.532432700903944, + "grad_norm": 0.1171875, + "learning_rate": 0.0003574093573060495, + "loss": 0.5135, + "step": 111390 + }, + { + "epoch": 5.532929373199563, + "grad_norm": 0.12255859375, + "learning_rate": 0.00035736962352239996, + "loss": 0.4777, + "step": 111400 + }, + { + "epoch": 5.533426045495182, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003573298897387504, + "loss": 0.4899, + "step": 111410 + }, + { + "epoch": 5.5339227177908015, + "grad_norm": 0.134765625, + "learning_rate": 0.00035729015595510085, + "loss": 0.524, + "step": 111420 + }, + { + "epoch": 5.534419390086421, + "grad_norm": 0.1103515625, + "learning_rate": 0.0003572504221714513, + "loss": 0.4869, + "step": 111430 + }, + { + "epoch": 5.53491606238204, + "grad_norm": 0.12451171875, + "learning_rate": 0.00035721068838780173, + "loss": 0.4676, + "step": 111440 + }, + { + "epoch": 5.53541273467766, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003571709546041522, + "loss": 0.4894, + "step": 111450 + }, + { + "epoch": 5.535909406973279, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003571312208205027, + "loss": 0.4867, + "step": 111460 + }, + { + "epoch": 5.5364060792688985, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003570914870368531, + "loss": 0.5043, + "step": 111470 + }, + { + "epoch": 5.536902751564518, + "grad_norm": 0.11669921875, + "learning_rate": 0.00035705175325320357, + "loss": 0.5002, + "step": 111480 + }, + { + "epoch": 5.537399423860137, + "grad_norm": 0.11279296875, + "learning_rate": 0.000357012019469554, + "loss": 0.5109, + "step": 111490 + }, + { + "epoch": 5.537896096155756, + "grad_norm": 0.1279296875, + "learning_rate": 0.00035697228568590445, + "loss": 0.4679, + "step": 111500 + }, + { + "epoch": 5.538392768451375, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003569325519022549, + "loss": 0.4981, + "step": 111510 + }, + { + "epoch": 5.5388894407469955, + "grad_norm": 0.11083984375, + "learning_rate": 0.00035689281811860534, + "loss": 0.4789, + "step": 111520 + }, + { + "epoch": 5.539386113042615, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003568530843349558, + "loss": 0.508, + "step": 111530 + }, + { + "epoch": 5.539882785338234, + "grad_norm": 0.12109375, + "learning_rate": 0.0003568133505513063, + "loss": 0.5006, + "step": 111540 + }, + { + "epoch": 5.540379457633853, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003567736167676567, + "loss": 0.4994, + "step": 111550 + }, + { + "epoch": 5.540876129929472, + "grad_norm": 0.125, + "learning_rate": 0.0003567338829840072, + "loss": 0.5043, + "step": 111560 + }, + { + "epoch": 5.541372802225092, + "grad_norm": 0.142578125, + "learning_rate": 0.00035669414920035765, + "loss": 0.5191, + "step": 111570 + }, + { + "epoch": 5.541869474520711, + "grad_norm": 0.1357421875, + "learning_rate": 0.00035665441541670806, + "loss": 0.5158, + "step": 111580 + }, + { + "epoch": 5.542366146816331, + "grad_norm": 0.1298828125, + "learning_rate": 0.00035661468163305853, + "loss": 0.5008, + "step": 111590 + }, + { + "epoch": 5.54286281911195, + "grad_norm": 0.1318359375, + "learning_rate": 0.00035657494784940895, + "loss": 0.4865, + "step": 111600 + }, + { + "epoch": 5.5433594914075695, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003565352140657594, + "loss": 0.4886, + "step": 111610 + }, + { + "epoch": 5.543856163703189, + "grad_norm": 0.1083984375, + "learning_rate": 0.0003564954802821099, + "loss": 0.512, + "step": 111620 + }, + { + "epoch": 5.544352835998808, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003564557464984603, + "loss": 0.4671, + "step": 111630 + }, + { + "epoch": 5.544849508294427, + "grad_norm": 0.15625, + "learning_rate": 0.0003564160127148108, + "loss": 0.527, + "step": 111640 + }, + { + "epoch": 5.545346180590046, + "grad_norm": 0.12353515625, + "learning_rate": 0.00035637627893116125, + "loss": 0.506, + "step": 111650 + }, + { + "epoch": 5.5458428528856665, + "grad_norm": 0.1240234375, + "learning_rate": 0.00035633654514751167, + "loss": 0.5005, + "step": 111660 + }, + { + "epoch": 5.546339525181286, + "grad_norm": 0.13671875, + "learning_rate": 0.00035629681136386214, + "loss": 0.5076, + "step": 111670 + }, + { + "epoch": 5.546836197476905, + "grad_norm": 0.1171875, + "learning_rate": 0.00035625707758021256, + "loss": 0.496, + "step": 111680 + }, + { + "epoch": 5.547332869772524, + "grad_norm": 0.134765625, + "learning_rate": 0.0003562173437965631, + "loss": 0.4994, + "step": 111690 + }, + { + "epoch": 5.547829542068143, + "grad_norm": 0.140625, + "learning_rate": 0.0003561776100129135, + "loss": 0.4964, + "step": 111700 + }, + { + "epoch": 5.548326214363763, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003561378762292639, + "loss": 0.5088, + "step": 111710 + }, + { + "epoch": 5.548822886659382, + "grad_norm": 0.134765625, + "learning_rate": 0.0003560981424456144, + "loss": 0.4856, + "step": 111720 + }, + { + "epoch": 5.549319558955002, + "grad_norm": 0.111328125, + "learning_rate": 0.00035605840866196486, + "loss": 0.5071, + "step": 111730 + }, + { + "epoch": 5.549816231250621, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003560186748783153, + "loss": 0.5245, + "step": 111740 + }, + { + "epoch": 5.55031290354624, + "grad_norm": 0.11865234375, + "learning_rate": 0.00035597894109466575, + "loss": 0.4938, + "step": 111750 + }, + { + "epoch": 5.55080957584186, + "grad_norm": 0.1201171875, + "learning_rate": 0.0003559392073110162, + "loss": 0.4827, + "step": 111760 + }, + { + "epoch": 5.551306248137479, + "grad_norm": 0.181640625, + "learning_rate": 0.0003558994735273667, + "loss": 0.5306, + "step": 111770 + }, + { + "epoch": 5.551802920433098, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003558597397437171, + "loss": 0.5068, + "step": 111780 + }, + { + "epoch": 5.552299592728717, + "grad_norm": 0.162109375, + "learning_rate": 0.00035582000596006753, + "loss": 0.5106, + "step": 111790 + }, + { + "epoch": 5.552796265024337, + "grad_norm": 0.189453125, + "learning_rate": 0.00035578027217641805, + "loss": 0.5352, + "step": 111800 + }, + { + "epoch": 5.553292937319957, + "grad_norm": 0.11572265625, + "learning_rate": 0.00035574053839276847, + "loss": 0.4932, + "step": 111810 + }, + { + "epoch": 5.553789609615576, + "grad_norm": 0.1123046875, + "learning_rate": 0.0003557008046091189, + "loss": 0.5136, + "step": 111820 + }, + { + "epoch": 5.554286281911195, + "grad_norm": 0.126953125, + "learning_rate": 0.00035566107082546936, + "loss": 0.5193, + "step": 111830 + }, + { + "epoch": 5.554782954206814, + "grad_norm": 0.1171875, + "learning_rate": 0.00035562133704181983, + "loss": 0.4814, + "step": 111840 + }, + { + "epoch": 5.555279626502434, + "grad_norm": 0.126953125, + "learning_rate": 0.0003555816032581703, + "loss": 0.5252, + "step": 111850 + }, + { + "epoch": 5.555776298798053, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003555418694745207, + "loss": 0.5123, + "step": 111860 + }, + { + "epoch": 5.556272971093672, + "grad_norm": 0.123046875, + "learning_rate": 0.0003555021356908712, + "loss": 0.4962, + "step": 111870 + }, + { + "epoch": 5.556769643389291, + "grad_norm": 0.1572265625, + "learning_rate": 0.00035546240190722166, + "loss": 0.4998, + "step": 111880 + }, + { + "epoch": 5.557266315684911, + "grad_norm": 0.169921875, + "learning_rate": 0.0003554226681235721, + "loss": 0.5142, + "step": 111890 + }, + { + "epoch": 5.557762987980531, + "grad_norm": 0.1181640625, + "learning_rate": 0.00035538293433992255, + "loss": 0.4966, + "step": 111900 + }, + { + "epoch": 5.55825966027615, + "grad_norm": 0.10693359375, + "learning_rate": 0.00035534320055627297, + "loss": 0.5049, + "step": 111910 + }, + { + "epoch": 5.558756332571769, + "grad_norm": 0.126953125, + "learning_rate": 0.00035530346677262344, + "loss": 0.5032, + "step": 111920 + }, + { + "epoch": 5.559253004867388, + "grad_norm": 0.123046875, + "learning_rate": 0.0003552637329889739, + "loss": 0.5364, + "step": 111930 + }, + { + "epoch": 5.5597496771630075, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003552239992053243, + "loss": 0.5056, + "step": 111940 + }, + { + "epoch": 5.560246349458627, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003551842654216748, + "loss": 0.4908, + "step": 111950 + }, + { + "epoch": 5.560743021754247, + "grad_norm": 0.1298828125, + "learning_rate": 0.00035514453163802527, + "loss": 0.4613, + "step": 111960 + }, + { + "epoch": 5.561239694049866, + "grad_norm": 0.115234375, + "learning_rate": 0.0003551047978543757, + "loss": 0.5262, + "step": 111970 + }, + { + "epoch": 5.561736366345485, + "grad_norm": 0.1328125, + "learning_rate": 0.00035506506407072616, + "loss": 0.4986, + "step": 111980 + }, + { + "epoch": 5.5622330386411045, + "grad_norm": 0.1220703125, + "learning_rate": 0.00035502533028707663, + "loss": 0.5007, + "step": 111990 + }, + { + "epoch": 5.562729710936724, + "grad_norm": 0.11865234375, + "learning_rate": 0.00035498559650342705, + "loss": 0.5097, + "step": 112000 + }, + { + "epoch": 5.563226383232343, + "grad_norm": 0.10693359375, + "learning_rate": 0.0003549458627197775, + "loss": 0.5015, + "step": 112010 + }, + { + "epoch": 5.563723055527962, + "grad_norm": 0.1533203125, + "learning_rate": 0.00035490612893612794, + "loss": 0.5014, + "step": 112020 + }, + { + "epoch": 5.564219727823582, + "grad_norm": 0.15234375, + "learning_rate": 0.0003548663951524784, + "loss": 0.4997, + "step": 112030 + }, + { + "epoch": 5.5647164001192015, + "grad_norm": 0.138671875, + "learning_rate": 0.0003548266613688289, + "loss": 0.5313, + "step": 112040 + }, + { + "epoch": 5.565213072414821, + "grad_norm": 0.130859375, + "learning_rate": 0.0003547869275851793, + "loss": 0.5256, + "step": 112050 + }, + { + "epoch": 5.56570974471044, + "grad_norm": 0.12060546875, + "learning_rate": 0.00035474719380152977, + "loss": 0.4737, + "step": 112060 + }, + { + "epoch": 5.566206417006059, + "grad_norm": 0.12158203125, + "learning_rate": 0.00035470746001788024, + "loss": 0.4921, + "step": 112070 + }, + { + "epoch": 5.5667030893016785, + "grad_norm": 0.10791015625, + "learning_rate": 0.00035466772623423065, + "loss": 0.5041, + "step": 112080 + }, + { + "epoch": 5.567199761597298, + "grad_norm": 0.1650390625, + "learning_rate": 0.0003546279924505811, + "loss": 0.5132, + "step": 112090 + }, + { + "epoch": 5.567696433892918, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003545882586669316, + "loss": 0.511, + "step": 112100 + }, + { + "epoch": 5.568193106188537, + "grad_norm": 0.126953125, + "learning_rate": 0.000354548524883282, + "loss": 0.5041, + "step": 112110 + }, + { + "epoch": 5.568689778484156, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003545087910996325, + "loss": 0.4775, + "step": 112120 + }, + { + "epoch": 5.5691864507797755, + "grad_norm": 0.126953125, + "learning_rate": 0.0003544690573159829, + "loss": 0.5107, + "step": 112130 + }, + { + "epoch": 5.569683123075395, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003544293235323334, + "loss": 0.4981, + "step": 112140 + }, + { + "epoch": 5.570179795371014, + "grad_norm": 0.12060546875, + "learning_rate": 0.00035438958974868385, + "loss": 0.5178, + "step": 112150 + }, + { + "epoch": 5.570676467666633, + "grad_norm": 0.142578125, + "learning_rate": 0.00035434985596503426, + "loss": 0.515, + "step": 112160 + }, + { + "epoch": 5.571173139962253, + "grad_norm": 0.146484375, + "learning_rate": 0.00035431012218138473, + "loss": 0.5216, + "step": 112170 + }, + { + "epoch": 5.5716698122578725, + "grad_norm": 0.10791015625, + "learning_rate": 0.0003542703883977352, + "loss": 0.4892, + "step": 112180 + }, + { + "epoch": 5.572166484553492, + "grad_norm": 0.1201171875, + "learning_rate": 0.0003542306546140856, + "loss": 0.4874, + "step": 112190 + }, + { + "epoch": 5.572663156849111, + "grad_norm": 0.12109375, + "learning_rate": 0.0003541909208304361, + "loss": 0.4912, + "step": 112200 + }, + { + "epoch": 5.57315982914473, + "grad_norm": 0.11328125, + "learning_rate": 0.0003541511870467865, + "loss": 0.5085, + "step": 112210 + }, + { + "epoch": 5.573656501440349, + "grad_norm": 0.17578125, + "learning_rate": 0.00035411145326313704, + "loss": 0.5043, + "step": 112220 + }, + { + "epoch": 5.574153173735969, + "grad_norm": 0.1748046875, + "learning_rate": 0.00035407171947948745, + "loss": 0.5144, + "step": 112230 + }, + { + "epoch": 5.574649846031589, + "grad_norm": 0.1103515625, + "learning_rate": 0.00035403198569583787, + "loss": 0.5169, + "step": 112240 + }, + { + "epoch": 5.575146518327208, + "grad_norm": 0.12060546875, + "learning_rate": 0.00035399225191218834, + "loss": 0.4937, + "step": 112250 + }, + { + "epoch": 5.575643190622827, + "grad_norm": 0.15234375, + "learning_rate": 0.0003539525181285388, + "loss": 0.5045, + "step": 112260 + }, + { + "epoch": 5.576139862918446, + "grad_norm": 0.126953125, + "learning_rate": 0.0003539127843448893, + "loss": 0.5102, + "step": 112270 + }, + { + "epoch": 5.576636535214066, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003538730505612397, + "loss": 0.5018, + "step": 112280 + }, + { + "epoch": 5.577133207509685, + "grad_norm": 0.166015625, + "learning_rate": 0.0003538333167775902, + "loss": 0.4976, + "step": 112290 + }, + { + "epoch": 5.577629879805304, + "grad_norm": 0.123046875, + "learning_rate": 0.00035379358299394065, + "loss": 0.5036, + "step": 112300 + }, + { + "epoch": 5.578126552100924, + "grad_norm": 0.1435546875, + "learning_rate": 0.00035375384921029106, + "loss": 0.4883, + "step": 112310 + }, + { + "epoch": 5.5786232243965435, + "grad_norm": 0.1142578125, + "learning_rate": 0.0003537141154266415, + "loss": 0.4795, + "step": 112320 + }, + { + "epoch": 5.579119896692163, + "grad_norm": 0.1259765625, + "learning_rate": 0.000353674381642992, + "loss": 0.5187, + "step": 112330 + }, + { + "epoch": 5.579616568987782, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003536346478593424, + "loss": 0.4851, + "step": 112340 + }, + { + "epoch": 5.580113241283401, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003535949140756929, + "loss": 0.5324, + "step": 112350 + }, + { + "epoch": 5.58060991357902, + "grad_norm": 0.1484375, + "learning_rate": 0.0003535551802920433, + "loss": 0.5057, + "step": 112360 + }, + { + "epoch": 5.58110658587464, + "grad_norm": 0.1572265625, + "learning_rate": 0.0003535154465083938, + "loss": 0.5042, + "step": 112370 + }, + { + "epoch": 5.58160325817026, + "grad_norm": 0.12109375, + "learning_rate": 0.00035347571272474425, + "loss": 0.4992, + "step": 112380 + }, + { + "epoch": 5.582099930465879, + "grad_norm": 0.1533203125, + "learning_rate": 0.00035343597894109467, + "loss": 0.4987, + "step": 112390 + }, + { + "epoch": 5.582596602761498, + "grad_norm": 0.1142578125, + "learning_rate": 0.00035339624515744514, + "loss": 0.4869, + "step": 112400 + }, + { + "epoch": 5.583093275057117, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003533565113737956, + "loss": 0.5111, + "step": 112410 + }, + { + "epoch": 5.583589947352737, + "grad_norm": 0.11572265625, + "learning_rate": 0.00035331677759014603, + "loss": 0.4857, + "step": 112420 + }, + { + "epoch": 5.584086619648356, + "grad_norm": 0.1171875, + "learning_rate": 0.0003532770438064965, + "loss": 0.5344, + "step": 112430 + }, + { + "epoch": 5.584583291943975, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003532373100228469, + "loss": 0.5189, + "step": 112440 + }, + { + "epoch": 5.585079964239595, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003531975762391974, + "loss": 0.4859, + "step": 112450 + }, + { + "epoch": 5.585576636535214, + "grad_norm": 0.140625, + "learning_rate": 0.00035315784245554786, + "loss": 0.511, + "step": 112460 + }, + { + "epoch": 5.586073308830834, + "grad_norm": 0.126953125, + "learning_rate": 0.0003531181086718983, + "loss": 0.4974, + "step": 112470 + }, + { + "epoch": 5.586569981126453, + "grad_norm": 0.14453125, + "learning_rate": 0.00035307837488824875, + "loss": 0.4927, + "step": 112480 + }, + { + "epoch": 5.587066653422072, + "grad_norm": 0.154296875, + "learning_rate": 0.0003530386411045992, + "loss": 0.4816, + "step": 112490 + }, + { + "epoch": 5.587563325717691, + "grad_norm": 0.12255859375, + "learning_rate": 0.00035299890732094964, + "loss": 0.466, + "step": 112500 + }, + { + "epoch": 5.5880599980133105, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003529591735373001, + "loss": 0.5036, + "step": 112510 + }, + { + "epoch": 5.588556670308931, + "grad_norm": 0.1162109375, + "learning_rate": 0.0003529194397536506, + "loss": 0.5224, + "step": 112520 + }, + { + "epoch": 5.58905334260455, + "grad_norm": 0.197265625, + "learning_rate": 0.000352879705970001, + "loss": 0.4723, + "step": 112530 + }, + { + "epoch": 5.589550014900169, + "grad_norm": 0.10205078125, + "learning_rate": 0.00035283997218635147, + "loss": 0.5103, + "step": 112540 + }, + { + "epoch": 5.590046687195788, + "grad_norm": 0.11181640625, + "learning_rate": 0.0003528002384027019, + "loss": 0.502, + "step": 112550 + }, + { + "epoch": 5.590543359491408, + "grad_norm": 0.1455078125, + "learning_rate": 0.00035276050461905236, + "loss": 0.5116, + "step": 112560 + }, + { + "epoch": 5.591040031787027, + "grad_norm": 0.11962890625, + "learning_rate": 0.00035272077083540283, + "loss": 0.5403, + "step": 112570 + }, + { + "epoch": 5.591536704082646, + "grad_norm": 0.1416015625, + "learning_rate": 0.00035268103705175325, + "loss": 0.5146, + "step": 112580 + }, + { + "epoch": 5.592033376378265, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003526413032681037, + "loss": 0.5076, + "step": 112590 + }, + { + "epoch": 5.5925300486738845, + "grad_norm": 0.123046875, + "learning_rate": 0.0003526015694844542, + "loss": 0.5097, + "step": 112600 + }, + { + "epoch": 5.593026720969505, + "grad_norm": 0.123046875, + "learning_rate": 0.0003525618357008046, + "loss": 0.4999, + "step": 112610 + }, + { + "epoch": 5.593523393265124, + "grad_norm": 0.134765625, + "learning_rate": 0.0003525221019171551, + "loss": 0.4954, + "step": 112620 + }, + { + "epoch": 5.594020065560743, + "grad_norm": 0.1513671875, + "learning_rate": 0.00035248236813350555, + "loss": 0.5037, + "step": 112630 + }, + { + "epoch": 5.594516737856362, + "grad_norm": 0.11669921875, + "learning_rate": 0.00035244263434985597, + "loss": 0.5443, + "step": 112640 + }, + { + "epoch": 5.5950134101519815, + "grad_norm": 0.1455078125, + "learning_rate": 0.00035240290056620644, + "loss": 0.4872, + "step": 112650 + }, + { + "epoch": 5.595510082447601, + "grad_norm": 0.134765625, + "learning_rate": 0.00035236316678255686, + "loss": 0.5063, + "step": 112660 + }, + { + "epoch": 5.59600675474322, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003523234329989074, + "loss": 0.518, + "step": 112670 + }, + { + "epoch": 5.59650342703884, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003522836992152578, + "loss": 0.5125, + "step": 112680 + }, + { + "epoch": 5.597000099334459, + "grad_norm": 0.158203125, + "learning_rate": 0.0003522439654316082, + "loss": 0.5128, + "step": 112690 + }, + { + "epoch": 5.5974967716300785, + "grad_norm": 0.10986328125, + "learning_rate": 0.0003522042316479587, + "loss": 0.5089, + "step": 112700 + }, + { + "epoch": 5.597993443925698, + "grad_norm": 0.126953125, + "learning_rate": 0.00035216449786430916, + "loss": 0.4967, + "step": 112710 + }, + { + "epoch": 5.598490116221317, + "grad_norm": 0.1240234375, + "learning_rate": 0.00035212476408065963, + "loss": 0.5239, + "step": 112720 + }, + { + "epoch": 5.598986788516936, + "grad_norm": 0.109375, + "learning_rate": 0.00035208503029701005, + "loss": 0.512, + "step": 112730 + }, + { + "epoch": 5.599483460812555, + "grad_norm": 0.1337890625, + "learning_rate": 0.00035204529651336046, + "loss": 0.4772, + "step": 112740 + }, + { + "epoch": 5.5999801331081756, + "grad_norm": 0.1357421875, + "learning_rate": 0.000352005562729711, + "loss": 0.5299, + "step": 112750 + }, + { + "epoch": 5.600476805403795, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003519658289460614, + "loss": 0.4781, + "step": 112760 + }, + { + "epoch": 5.600973477699414, + "grad_norm": 0.11865234375, + "learning_rate": 0.0003519260951624118, + "loss": 0.5015, + "step": 112770 + }, + { + "epoch": 5.601470149995033, + "grad_norm": 0.16015625, + "learning_rate": 0.0003518863613787623, + "loss": 0.4884, + "step": 112780 + }, + { + "epoch": 5.6019668222906525, + "grad_norm": 0.123046875, + "learning_rate": 0.00035184662759511277, + "loss": 0.4633, + "step": 112790 + }, + { + "epoch": 5.602463494586272, + "grad_norm": 0.140625, + "learning_rate": 0.00035180689381146324, + "loss": 0.4953, + "step": 112800 + }, + { + "epoch": 5.602960166881891, + "grad_norm": 0.1337890625, + "learning_rate": 0.00035176716002781365, + "loss": 0.5085, + "step": 112810 + }, + { + "epoch": 5.603456839177511, + "grad_norm": 0.2236328125, + "learning_rate": 0.0003517274262441641, + "loss": 0.4908, + "step": 112820 + }, + { + "epoch": 5.60395351147313, + "grad_norm": 0.130859375, + "learning_rate": 0.0003516876924605146, + "loss": 0.4784, + "step": 112830 + }, + { + "epoch": 5.6044501837687495, + "grad_norm": 0.12353515625, + "learning_rate": 0.000351647958676865, + "loss": 0.5038, + "step": 112840 + }, + { + "epoch": 5.604946856064369, + "grad_norm": 0.1318359375, + "learning_rate": 0.00035160822489321543, + "loss": 0.5014, + "step": 112850 + }, + { + "epoch": 5.605443528359988, + "grad_norm": 0.10986328125, + "learning_rate": 0.00035156849110956596, + "loss": 0.5004, + "step": 112860 + }, + { + "epoch": 5.605940200655607, + "grad_norm": 0.119140625, + "learning_rate": 0.0003515287573259164, + "loss": 0.4771, + "step": 112870 + }, + { + "epoch": 5.606436872951226, + "grad_norm": 0.1220703125, + "learning_rate": 0.00035148902354226685, + "loss": 0.5097, + "step": 112880 + }, + { + "epoch": 5.6069335452468465, + "grad_norm": 0.162109375, + "learning_rate": 0.00035144928975861726, + "loss": 0.4958, + "step": 112890 + }, + { + "epoch": 5.607430217542466, + "grad_norm": 0.119140625, + "learning_rate": 0.00035140955597496773, + "loss": 0.4939, + "step": 112900 + }, + { + "epoch": 5.607926889838085, + "grad_norm": 0.12890625, + "learning_rate": 0.0003513698221913182, + "loss": 0.5129, + "step": 112910 + }, + { + "epoch": 5.608423562133704, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003513300884076686, + "loss": 0.4886, + "step": 112920 + }, + { + "epoch": 5.608920234429323, + "grad_norm": 0.10546875, + "learning_rate": 0.0003512903546240191, + "loss": 0.5186, + "step": 112930 + }, + { + "epoch": 5.609416906724943, + "grad_norm": 0.11669921875, + "learning_rate": 0.00035125062084036957, + "loss": 0.4782, + "step": 112940 + }, + { + "epoch": 5.609913579020562, + "grad_norm": 0.1328125, + "learning_rate": 0.00035121088705672, + "loss": 0.518, + "step": 112950 + }, + { + "epoch": 5.610410251316182, + "grad_norm": 0.1337890625, + "learning_rate": 0.00035117115327307045, + "loss": 0.505, + "step": 112960 + }, + { + "epoch": 5.610906923611801, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003511314194894209, + "loss": 0.51, + "step": 112970 + }, + { + "epoch": 5.61140359590742, + "grad_norm": 0.12451171875, + "learning_rate": 0.00035109168570577134, + "loss": 0.5075, + "step": 112980 + }, + { + "epoch": 5.61190026820304, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003510519519221218, + "loss": 0.5147, + "step": 112990 + }, + { + "epoch": 5.612396940498659, + "grad_norm": 0.134765625, + "learning_rate": 0.00035101221813847223, + "loss": 0.4915, + "step": 113000 + }, + { + "epoch": 5.612893612794278, + "grad_norm": 0.123046875, + "learning_rate": 0.0003509724843548227, + "loss": 0.5033, + "step": 113010 + }, + { + "epoch": 5.613390285089897, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003509327505711732, + "loss": 0.5055, + "step": 113020 + }, + { + "epoch": 5.6138869573855175, + "grad_norm": 0.10986328125, + "learning_rate": 0.0003508930167875236, + "loss": 0.496, + "step": 113030 + }, + { + "epoch": 5.614383629681137, + "grad_norm": 0.15625, + "learning_rate": 0.00035085328300387406, + "loss": 0.5035, + "step": 113040 + }, + { + "epoch": 5.614880301976756, + "grad_norm": 0.11962890625, + "learning_rate": 0.00035081354922022453, + "loss": 0.4637, + "step": 113050 + }, + { + "epoch": 5.615376974272375, + "grad_norm": 0.10986328125, + "learning_rate": 0.00035077381543657495, + "loss": 0.5175, + "step": 113060 + }, + { + "epoch": 5.615873646567994, + "grad_norm": 0.11328125, + "learning_rate": 0.0003507340816529254, + "loss": 0.5448, + "step": 113070 + }, + { + "epoch": 5.616370318863614, + "grad_norm": 0.16796875, + "learning_rate": 0.00035069434786927584, + "loss": 0.5084, + "step": 113080 + }, + { + "epoch": 5.616866991159233, + "grad_norm": 0.1103515625, + "learning_rate": 0.0003506546140856263, + "loss": 0.4975, + "step": 113090 + }, + { + "epoch": 5.617363663454853, + "grad_norm": 0.1533203125, + "learning_rate": 0.0003506148803019768, + "loss": 0.5024, + "step": 113100 + }, + { + "epoch": 5.617860335750472, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003505751465183272, + "loss": 0.5177, + "step": 113110 + }, + { + "epoch": 5.618357008046091, + "grad_norm": 0.11767578125, + "learning_rate": 0.00035053541273467767, + "loss": 0.4979, + "step": 113120 + }, + { + "epoch": 5.618853680341711, + "grad_norm": 0.1259765625, + "learning_rate": 0.00035049567895102814, + "loss": 0.5113, + "step": 113130 + }, + { + "epoch": 5.61935035263733, + "grad_norm": 0.12255859375, + "learning_rate": 0.00035045594516737856, + "loss": 0.5245, + "step": 113140 + }, + { + "epoch": 5.619847024932949, + "grad_norm": 0.14453125, + "learning_rate": 0.00035041621138372903, + "loss": 0.535, + "step": 113150 + }, + { + "epoch": 5.620343697228568, + "grad_norm": 0.1064453125, + "learning_rate": 0.0003503764776000795, + "loss": 0.5191, + "step": 113160 + }, + { + "epoch": 5.620840369524188, + "grad_norm": 0.1162109375, + "learning_rate": 0.00035033674381642997, + "loss": 0.501, + "step": 113170 + }, + { + "epoch": 5.621337041819808, + "grad_norm": 0.1171875, + "learning_rate": 0.0003502970100327804, + "loss": 0.509, + "step": 113180 + }, + { + "epoch": 5.621833714115427, + "grad_norm": 0.142578125, + "learning_rate": 0.0003502572762491308, + "loss": 0.507, + "step": 113190 + }, + { + "epoch": 5.622330386411046, + "grad_norm": 0.1279296875, + "learning_rate": 0.00035021754246548133, + "loss": 0.5035, + "step": 113200 + }, + { + "epoch": 5.622827058706665, + "grad_norm": 0.123046875, + "learning_rate": 0.00035017780868183175, + "loss": 0.5092, + "step": 113210 + }, + { + "epoch": 5.6233237310022846, + "grad_norm": 0.11572265625, + "learning_rate": 0.00035013807489818217, + "loss": 0.4871, + "step": 113220 + }, + { + "epoch": 5.623820403297904, + "grad_norm": 0.1259765625, + "learning_rate": 0.00035009834111453264, + "loss": 0.4793, + "step": 113230 + }, + { + "epoch": 5.624317075593523, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003500586073308831, + "loss": 0.4974, + "step": 113240 + }, + { + "epoch": 5.624813747889143, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003500188735472336, + "loss": 0.5088, + "step": 113250 + }, + { + "epoch": 5.625310420184762, + "grad_norm": 0.150390625, + "learning_rate": 0.000349979139763584, + "loss": 0.4957, + "step": 113260 + }, + { + "epoch": 5.625807092480382, + "grad_norm": 0.12109375, + "learning_rate": 0.00034993940597993447, + "loss": 0.4833, + "step": 113270 + }, + { + "epoch": 5.626303764776001, + "grad_norm": 0.1689453125, + "learning_rate": 0.00034989967219628494, + "loss": 0.4983, + "step": 113280 + }, + { + "epoch": 5.62680043707162, + "grad_norm": 0.1171875, + "learning_rate": 0.00034985993841263536, + "loss": 0.4798, + "step": 113290 + }, + { + "epoch": 5.627297109367239, + "grad_norm": 0.1640625, + "learning_rate": 0.0003498202046289858, + "loss": 0.5027, + "step": 113300 + }, + { + "epoch": 5.6277937816628585, + "grad_norm": 0.1279296875, + "learning_rate": 0.00034978047084533625, + "loss": 0.4887, + "step": 113310 + }, + { + "epoch": 5.628290453958478, + "grad_norm": 0.12109375, + "learning_rate": 0.0003497407370616867, + "loss": 0.4926, + "step": 113320 + }, + { + "epoch": 5.628787126254098, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003497010032780372, + "loss": 0.5032, + "step": 113330 + }, + { + "epoch": 5.629283798549717, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003496612694943876, + "loss": 0.4918, + "step": 113340 + }, + { + "epoch": 5.629780470845336, + "grad_norm": 0.103515625, + "learning_rate": 0.0003496215357107381, + "loss": 0.5331, + "step": 113350 + }, + { + "epoch": 5.6302771431409555, + "grad_norm": 0.11083984375, + "learning_rate": 0.00034958180192708855, + "loss": 0.5183, + "step": 113360 + }, + { + "epoch": 5.630773815436575, + "grad_norm": 0.1318359375, + "learning_rate": 0.00034954206814343897, + "loss": 0.5034, + "step": 113370 + }, + { + "epoch": 5.631270487732194, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003495023343597894, + "loss": 0.503, + "step": 113380 + }, + { + "epoch": 5.631767160027813, + "grad_norm": 0.1533203125, + "learning_rate": 0.0003494626005761399, + "loss": 0.5195, + "step": 113390 + }, + { + "epoch": 5.632263832323433, + "grad_norm": 0.1240234375, + "learning_rate": 0.0003494228667924903, + "loss": 0.4737, + "step": 113400 + }, + { + "epoch": 5.6327605046190525, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003493831330088408, + "loss": 0.5063, + "step": 113410 + }, + { + "epoch": 5.633257176914672, + "grad_norm": 0.1875, + "learning_rate": 0.0003493433992251912, + "loss": 0.5096, + "step": 113420 + }, + { + "epoch": 5.633753849210291, + "grad_norm": 0.134765625, + "learning_rate": 0.0003493036654415417, + "loss": 0.4873, + "step": 113430 + }, + { + "epoch": 5.63425052150591, + "grad_norm": 0.150390625, + "learning_rate": 0.00034926393165789216, + "loss": 0.4895, + "step": 113440 + }, + { + "epoch": 5.634747193801529, + "grad_norm": 0.123046875, + "learning_rate": 0.0003492241978742426, + "loss": 0.5058, + "step": 113450 + }, + { + "epoch": 5.635243866097149, + "grad_norm": 0.11181640625, + "learning_rate": 0.00034918446409059305, + "loss": 0.4999, + "step": 113460 + }, + { + "epoch": 5.635740538392769, + "grad_norm": 0.1708984375, + "learning_rate": 0.0003491447303069435, + "loss": 0.5198, + "step": 113470 + }, + { + "epoch": 5.636237210688388, + "grad_norm": 0.1162109375, + "learning_rate": 0.00034910499652329393, + "loss": 0.5367, + "step": 113480 + }, + { + "epoch": 5.636733882984007, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003490652627396444, + "loss": 0.5206, + "step": 113490 + }, + { + "epoch": 5.6372305552796265, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003490255289559949, + "loss": 0.4997, + "step": 113500 + }, + { + "epoch": 5.637727227575246, + "grad_norm": 0.1201171875, + "learning_rate": 0.0003489857951723453, + "loss": 0.4914, + "step": 113510 + }, + { + "epoch": 5.638223899870865, + "grad_norm": 0.130859375, + "learning_rate": 0.00034894606138869577, + "loss": 0.5156, + "step": 113520 + }, + { + "epoch": 5.638720572166484, + "grad_norm": 0.130859375, + "learning_rate": 0.0003489063276050462, + "loss": 0.5263, + "step": 113530 + }, + { + "epoch": 5.639217244462104, + "grad_norm": 0.12890625, + "learning_rate": 0.0003488665938213967, + "loss": 0.5106, + "step": 113540 + }, + { + "epoch": 5.6397139167577235, + "grad_norm": 0.154296875, + "learning_rate": 0.0003488268600377471, + "loss": 0.4905, + "step": 113550 + }, + { + "epoch": 5.640210589053343, + "grad_norm": 0.12451171875, + "learning_rate": 0.00034878712625409754, + "loss": 0.4771, + "step": 113560 + }, + { + "epoch": 5.640707261348962, + "grad_norm": 0.1376953125, + "learning_rate": 0.000348747392470448, + "loss": 0.4879, + "step": 113570 + }, + { + "epoch": 5.641203933644581, + "grad_norm": 0.126953125, + "learning_rate": 0.0003487076586867985, + "loss": 0.5118, + "step": 113580 + }, + { + "epoch": 5.6417006059402, + "grad_norm": 0.134765625, + "learning_rate": 0.0003486679249031489, + "loss": 0.5268, + "step": 113590 + }, + { + "epoch": 5.64219727823582, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003486281911194994, + "loss": 0.5118, + "step": 113600 + }, + { + "epoch": 5.64269395053144, + "grad_norm": 0.1435546875, + "learning_rate": 0.0003485884573358498, + "loss": 0.501, + "step": 113610 + }, + { + "epoch": 5.643190622827059, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003485487235522003, + "loss": 0.528, + "step": 113620 + }, + { + "epoch": 5.643687295122678, + "grad_norm": 0.13671875, + "learning_rate": 0.00034850898976855073, + "loss": 0.5168, + "step": 113630 + }, + { + "epoch": 5.644183967418297, + "grad_norm": 0.12255859375, + "learning_rate": 0.00034846925598490115, + "loss": 0.5091, + "step": 113640 + }, + { + "epoch": 5.644680639713917, + "grad_norm": 0.150390625, + "learning_rate": 0.0003484295222012516, + "loss": 0.5126, + "step": 113650 + }, + { + "epoch": 5.645177312009536, + "grad_norm": 0.10986328125, + "learning_rate": 0.0003483897884176021, + "loss": 0.5085, + "step": 113660 + }, + { + "epoch": 5.645673984305155, + "grad_norm": 0.15625, + "learning_rate": 0.0003483500546339525, + "loss": 0.5356, + "step": 113670 + }, + { + "epoch": 5.646170656600775, + "grad_norm": 0.125, + "learning_rate": 0.000348310320850303, + "loss": 0.5088, + "step": 113680 + }, + { + "epoch": 5.6466673288963944, + "grad_norm": 0.134765625, + "learning_rate": 0.00034827058706665345, + "loss": 0.5075, + "step": 113690 + }, + { + "epoch": 5.647164001192014, + "grad_norm": 0.1083984375, + "learning_rate": 0.0003482308532830039, + "loss": 0.5144, + "step": 113700 + }, + { + "epoch": 5.647660673487633, + "grad_norm": 0.12255859375, + "learning_rate": 0.00034819111949935434, + "loss": 0.4816, + "step": 113710 + }, + { + "epoch": 5.648157345783252, + "grad_norm": 0.11376953125, + "learning_rate": 0.00034815138571570476, + "loss": 0.523, + "step": 113720 + }, + { + "epoch": 5.648654018078871, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003481116519320553, + "loss": 0.5075, + "step": 113730 + }, + { + "epoch": 5.649150690374491, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003480719181484057, + "loss": 0.5195, + "step": 113740 + }, + { + "epoch": 5.649647362670111, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003480321843647561, + "loss": 0.5362, + "step": 113750 + }, + { + "epoch": 5.65014403496573, + "grad_norm": 0.1572265625, + "learning_rate": 0.0003479924505811066, + "loss": 0.515, + "step": 113760 + }, + { + "epoch": 5.650640707261349, + "grad_norm": 0.1142578125, + "learning_rate": 0.00034795271679745706, + "loss": 0.491, + "step": 113770 + }, + { + "epoch": 5.651137379556968, + "grad_norm": 0.1240234375, + "learning_rate": 0.00034791298301380753, + "loss": 0.4865, + "step": 113780 + }, + { + "epoch": 5.651634051852588, + "grad_norm": 0.11962890625, + "learning_rate": 0.00034787324923015795, + "loss": 0.5131, + "step": 113790 + }, + { + "epoch": 5.652130724148207, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003478335154465084, + "loss": 0.5108, + "step": 113800 + }, + { + "epoch": 5.652627396443826, + "grad_norm": 0.1240234375, + "learning_rate": 0.0003477937816628589, + "loss": 0.5055, + "step": 113810 + }, + { + "epoch": 5.653124068739446, + "grad_norm": 0.1142578125, + "learning_rate": 0.0003477540478792093, + "loss": 0.4903, + "step": 113820 + }, + { + "epoch": 5.653620741035065, + "grad_norm": 0.11279296875, + "learning_rate": 0.0003477143140955597, + "loss": 0.5069, + "step": 113830 + }, + { + "epoch": 5.654117413330685, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003476745803119102, + "loss": 0.4916, + "step": 113840 + }, + { + "epoch": 5.654614085626304, + "grad_norm": 0.138671875, + "learning_rate": 0.00034763484652826067, + "loss": 0.5005, + "step": 113850 + }, + { + "epoch": 5.655110757921923, + "grad_norm": 0.13671875, + "learning_rate": 0.00034759511274461114, + "loss": 0.4995, + "step": 113860 + }, + { + "epoch": 5.655607430217542, + "grad_norm": 0.1513671875, + "learning_rate": 0.00034755537896096156, + "loss": 0.5523, + "step": 113870 + }, + { + "epoch": 5.6561041025131615, + "grad_norm": 0.12890625, + "learning_rate": 0.00034751564517731203, + "loss": 0.522, + "step": 113880 + }, + { + "epoch": 5.656600774808782, + "grad_norm": 0.126953125, + "learning_rate": 0.0003474759113936625, + "loss": 0.4748, + "step": 113890 + }, + { + "epoch": 5.657097447104401, + "grad_norm": 0.12109375, + "learning_rate": 0.0003474361776100129, + "loss": 0.5299, + "step": 113900 + }, + { + "epoch": 5.65759411940002, + "grad_norm": 0.11767578125, + "learning_rate": 0.00034739644382636334, + "loss": 0.5318, + "step": 113910 + }, + { + "epoch": 5.658090791695639, + "grad_norm": 0.130859375, + "learning_rate": 0.00034735671004271386, + "loss": 0.524, + "step": 113920 + }, + { + "epoch": 5.658587463991259, + "grad_norm": 0.15625, + "learning_rate": 0.0003473169762590643, + "loss": 0.494, + "step": 113930 + }, + { + "epoch": 5.659084136286878, + "grad_norm": 0.1640625, + "learning_rate": 0.00034727724247541475, + "loss": 0.4914, + "step": 113940 + }, + { + "epoch": 5.659580808582497, + "grad_norm": 0.115234375, + "learning_rate": 0.00034723750869176517, + "loss": 0.4848, + "step": 113950 + }, + { + "epoch": 5.660077480878116, + "grad_norm": 0.123046875, + "learning_rate": 0.00034719777490811564, + "loss": 0.4869, + "step": 113960 + }, + { + "epoch": 5.660574153173736, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003471580411244661, + "loss": 0.4768, + "step": 113970 + }, + { + "epoch": 5.661070825469356, + "grad_norm": 0.203125, + "learning_rate": 0.0003471183073408165, + "loss": 0.491, + "step": 113980 + }, + { + "epoch": 5.661567497764975, + "grad_norm": 0.1162109375, + "learning_rate": 0.000347078573557167, + "loss": 0.4945, + "step": 113990 + }, + { + "epoch": 5.662064170060594, + "grad_norm": 0.11279296875, + "learning_rate": 0.00034703883977351747, + "loss": 0.5128, + "step": 114000 + }, + { + "epoch": 5.662560842356213, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003469991059898679, + "loss": 0.4963, + "step": 114010 + }, + { + "epoch": 5.6630575146518325, + "grad_norm": 0.12890625, + "learning_rate": 0.00034695937220621836, + "loss": 0.5241, + "step": 114020 + }, + { + "epoch": 5.663554186947452, + "grad_norm": 0.115234375, + "learning_rate": 0.00034691963842256883, + "loss": 0.5257, + "step": 114030 + }, + { + "epoch": 5.664050859243071, + "grad_norm": 0.125, + "learning_rate": 0.00034687990463891925, + "loss": 0.5243, + "step": 114040 + }, + { + "epoch": 5.664547531538691, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003468401708552697, + "loss": 0.4992, + "step": 114050 + }, + { + "epoch": 5.66504420383431, + "grad_norm": 0.13671875, + "learning_rate": 0.00034680043707162013, + "loss": 0.5164, + "step": 114060 + }, + { + "epoch": 5.6655408761299295, + "grad_norm": 0.15625, + "learning_rate": 0.00034676070328797066, + "loss": 0.5001, + "step": 114070 + }, + { + "epoch": 5.666037548425549, + "grad_norm": 0.1435546875, + "learning_rate": 0.0003467209695043211, + "loss": 0.5042, + "step": 114080 + }, + { + "epoch": 5.666534220721168, + "grad_norm": 0.1142578125, + "learning_rate": 0.0003466812357206715, + "loss": 0.5113, + "step": 114090 + }, + { + "epoch": 5.667030893016787, + "grad_norm": 0.12109375, + "learning_rate": 0.00034664150193702197, + "loss": 0.5155, + "step": 114100 + }, + { + "epoch": 5.667527565312406, + "grad_norm": 0.119140625, + "learning_rate": 0.00034660176815337244, + "loss": 0.5023, + "step": 114110 + }, + { + "epoch": 5.6680242376080265, + "grad_norm": 0.142578125, + "learning_rate": 0.00034656203436972285, + "loss": 0.4907, + "step": 114120 + }, + { + "epoch": 5.668520909903646, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003465223005860733, + "loss": 0.481, + "step": 114130 + }, + { + "epoch": 5.669017582199265, + "grad_norm": 0.126953125, + "learning_rate": 0.00034648256680242374, + "loss": 0.5019, + "step": 114140 + }, + { + "epoch": 5.669514254494884, + "grad_norm": 0.115234375, + "learning_rate": 0.00034644283301877427, + "loss": 0.4908, + "step": 114150 + }, + { + "epoch": 5.6700109267905034, + "grad_norm": 0.138671875, + "learning_rate": 0.0003464030992351247, + "loss": 0.5105, + "step": 114160 + }, + { + "epoch": 5.670507599086123, + "grad_norm": 0.171875, + "learning_rate": 0.0003463633654514751, + "loss": 0.5104, + "step": 114170 + }, + { + "epoch": 5.671004271381742, + "grad_norm": 0.134765625, + "learning_rate": 0.0003463236316678256, + "loss": 0.5042, + "step": 114180 + }, + { + "epoch": 5.671500943677362, + "grad_norm": 0.166015625, + "learning_rate": 0.00034628389788417605, + "loss": 0.5159, + "step": 114190 + }, + { + "epoch": 5.671997615972981, + "grad_norm": 0.1357421875, + "learning_rate": 0.00034624416410052646, + "loss": 0.5134, + "step": 114200 + }, + { + "epoch": 5.6724942882686005, + "grad_norm": 0.1240234375, + "learning_rate": 0.00034620443031687693, + "loss": 0.5087, + "step": 114210 + }, + { + "epoch": 5.67299096056422, + "grad_norm": 0.12890625, + "learning_rate": 0.0003461646965332274, + "loss": 0.5168, + "step": 114220 + }, + { + "epoch": 5.673487632859839, + "grad_norm": 0.12890625, + "learning_rate": 0.0003461249627495779, + "loss": 0.5074, + "step": 114230 + }, + { + "epoch": 5.673984305155458, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003460852289659283, + "loss": 0.5065, + "step": 114240 + }, + { + "epoch": 5.674480977451077, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003460454951822787, + "loss": 0.5326, + "step": 114250 + }, + { + "epoch": 5.6749776497466975, + "grad_norm": 0.126953125, + "learning_rate": 0.00034600576139862924, + "loss": 0.4766, + "step": 114260 + }, + { + "epoch": 5.675474322042317, + "grad_norm": 0.1298828125, + "learning_rate": 0.00034596602761497965, + "loss": 0.5074, + "step": 114270 + }, + { + "epoch": 5.675970994337936, + "grad_norm": 0.126953125, + "learning_rate": 0.00034592629383133007, + "loss": 0.497, + "step": 114280 + }, + { + "epoch": 5.676467666633555, + "grad_norm": 0.1123046875, + "learning_rate": 0.00034588656004768054, + "loss": 0.5005, + "step": 114290 + }, + { + "epoch": 5.676964338929174, + "grad_norm": 0.138671875, + "learning_rate": 0.000345846826264031, + "loss": 0.4988, + "step": 114300 + }, + { + "epoch": 5.677461011224794, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003458070924803815, + "loss": 0.5226, + "step": 114310 + }, + { + "epoch": 5.677957683520413, + "grad_norm": 0.11865234375, + "learning_rate": 0.0003457673586967319, + "loss": 0.5349, + "step": 114320 + }, + { + "epoch": 5.678454355816033, + "grad_norm": 0.140625, + "learning_rate": 0.0003457276249130824, + "loss": 0.4942, + "step": 114330 + }, + { + "epoch": 5.678951028111652, + "grad_norm": 0.142578125, + "learning_rate": 0.00034568789112943284, + "loss": 0.4801, + "step": 114340 + }, + { + "epoch": 5.679447700407271, + "grad_norm": 0.130859375, + "learning_rate": 0.00034564815734578326, + "loss": 0.5183, + "step": 114350 + }, + { + "epoch": 5.679944372702891, + "grad_norm": 0.1708984375, + "learning_rate": 0.0003456084235621337, + "loss": 0.4792, + "step": 114360 + }, + { + "epoch": 5.68044104499851, + "grad_norm": 0.11328125, + "learning_rate": 0.0003455686897784842, + "loss": 0.5151, + "step": 114370 + }, + { + "epoch": 5.680937717294129, + "grad_norm": 0.1806640625, + "learning_rate": 0.0003455289559948346, + "loss": 0.5321, + "step": 114380 + }, + { + "epoch": 5.681434389589748, + "grad_norm": 0.18359375, + "learning_rate": 0.0003454892222111851, + "loss": 0.4892, + "step": 114390 + }, + { + "epoch": 5.6819310618853685, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003454494884275355, + "loss": 0.4954, + "step": 114400 + }, + { + "epoch": 5.682427734180988, + "grad_norm": 0.1396484375, + "learning_rate": 0.000345409754643886, + "loss": 0.4942, + "step": 114410 + }, + { + "epoch": 5.682924406476607, + "grad_norm": 0.14453125, + "learning_rate": 0.00034537002086023645, + "loss": 0.5226, + "step": 114420 + }, + { + "epoch": 5.683421078772226, + "grad_norm": 0.123046875, + "learning_rate": 0.00034533028707658687, + "loss": 0.5203, + "step": 114430 + }, + { + "epoch": 5.683917751067845, + "grad_norm": 0.1376953125, + "learning_rate": 0.00034529055329293734, + "loss": 0.5137, + "step": 114440 + }, + { + "epoch": 5.684414423363465, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003452508195092878, + "loss": 0.5044, + "step": 114450 + }, + { + "epoch": 5.684911095659084, + "grad_norm": 0.1259765625, + "learning_rate": 0.00034521108572563823, + "loss": 0.5375, + "step": 114460 + }, + { + "epoch": 5.685407767954704, + "grad_norm": 0.1455078125, + "learning_rate": 0.0003451713519419887, + "loss": 0.4996, + "step": 114470 + }, + { + "epoch": 5.685904440250323, + "grad_norm": 0.11865234375, + "learning_rate": 0.0003451316181583391, + "loss": 0.4906, + "step": 114480 + }, + { + "epoch": 5.686401112545942, + "grad_norm": 0.12890625, + "learning_rate": 0.0003450918843746896, + "loss": 0.5102, + "step": 114490 + }, + { + "epoch": 5.686897784841562, + "grad_norm": 0.1201171875, + "learning_rate": 0.00034505215059104006, + "loss": 0.5323, + "step": 114500 + }, + { + "epoch": 5.687394457137181, + "grad_norm": 0.1669921875, + "learning_rate": 0.0003450124168073905, + "loss": 0.5173, + "step": 114510 + }, + { + "epoch": 5.6878911294328, + "grad_norm": 0.138671875, + "learning_rate": 0.00034497268302374095, + "loss": 0.5176, + "step": 114520 + }, + { + "epoch": 5.688387801728419, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003449329492400914, + "loss": 0.5059, + "step": 114530 + }, + { + "epoch": 5.688884474024039, + "grad_norm": 0.1689453125, + "learning_rate": 0.00034489321545644184, + "loss": 0.5014, + "step": 114540 + }, + { + "epoch": 5.689381146319659, + "grad_norm": 0.13671875, + "learning_rate": 0.0003448534816727923, + "loss": 0.51, + "step": 114550 + }, + { + "epoch": 5.689877818615278, + "grad_norm": 0.1142578125, + "learning_rate": 0.0003448137478891428, + "loss": 0.5091, + "step": 114560 + }, + { + "epoch": 5.690374490910897, + "grad_norm": 0.12890625, + "learning_rate": 0.0003447740141054932, + "loss": 0.5274, + "step": 114570 + }, + { + "epoch": 5.690871163206516, + "grad_norm": 0.134765625, + "learning_rate": 0.00034473428032184367, + "loss": 0.5088, + "step": 114580 + }, + { + "epoch": 5.6913678355021355, + "grad_norm": 0.134765625, + "learning_rate": 0.0003446945465381941, + "loss": 0.4935, + "step": 114590 + }, + { + "epoch": 5.691864507797755, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003446548127545446, + "loss": 0.527, + "step": 114600 + }, + { + "epoch": 5.692361180093375, + "grad_norm": 0.12451171875, + "learning_rate": 0.00034461507897089503, + "loss": 0.5384, + "step": 114610 + }, + { + "epoch": 5.692857852388994, + "grad_norm": 0.1279296875, + "learning_rate": 0.00034457534518724545, + "loss": 0.5439, + "step": 114620 + }, + { + "epoch": 5.693354524684613, + "grad_norm": 0.140625, + "learning_rate": 0.0003445356114035959, + "loss": 0.5353, + "step": 114630 + }, + { + "epoch": 5.693851196980233, + "grad_norm": 0.14453125, + "learning_rate": 0.0003444958776199464, + "loss": 0.4961, + "step": 114640 + }, + { + "epoch": 5.694347869275852, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003444561438362968, + "loss": 0.5086, + "step": 114650 + }, + { + "epoch": 5.694844541571471, + "grad_norm": 0.1875, + "learning_rate": 0.0003444164100526473, + "loss": 0.5043, + "step": 114660 + }, + { + "epoch": 5.69534121386709, + "grad_norm": 0.10693359375, + "learning_rate": 0.00034437667626899775, + "loss": 0.4958, + "step": 114670 + }, + { + "epoch": 5.6958378861627095, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003443369424853482, + "loss": 0.4841, + "step": 114680 + }, + { + "epoch": 5.696334558458329, + "grad_norm": 0.12353515625, + "learning_rate": 0.00034429720870169864, + "loss": 0.4919, + "step": 114690 + }, + { + "epoch": 5.696831230753949, + "grad_norm": 0.1298828125, + "learning_rate": 0.00034425747491804905, + "loss": 0.5071, + "step": 114700 + }, + { + "epoch": 5.697327903049568, + "grad_norm": 0.1142578125, + "learning_rate": 0.0003442177411343995, + "loss": 0.4982, + "step": 114710 + }, + { + "epoch": 5.697824575345187, + "grad_norm": 0.1259765625, + "learning_rate": 0.00034417800735075, + "loss": 0.533, + "step": 114720 + }, + { + "epoch": 5.6983212476408065, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003441382735671004, + "loss": 0.4986, + "step": 114730 + }, + { + "epoch": 5.698817919936426, + "grad_norm": 0.12890625, + "learning_rate": 0.0003440985397834509, + "loss": 0.5243, + "step": 114740 + }, + { + "epoch": 5.699314592232045, + "grad_norm": 0.1279296875, + "learning_rate": 0.00034405880599980136, + "loss": 0.503, + "step": 114750 + }, + { + "epoch": 5.699811264527664, + "grad_norm": 0.1357421875, + "learning_rate": 0.00034401907221615183, + "loss": 0.494, + "step": 114760 + }, + { + "epoch": 5.700307936823284, + "grad_norm": 0.12890625, + "learning_rate": 0.00034397933843250225, + "loss": 0.5262, + "step": 114770 + }, + { + "epoch": 5.7008046091189035, + "grad_norm": 0.1328125, + "learning_rate": 0.00034393960464885266, + "loss": 0.5391, + "step": 114780 + }, + { + "epoch": 5.701301281414523, + "grad_norm": 0.1328125, + "learning_rate": 0.0003438998708652032, + "loss": 0.5111, + "step": 114790 + }, + { + "epoch": 5.701797953710142, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003438601370815536, + "loss": 0.4906, + "step": 114800 + }, + { + "epoch": 5.702294626005761, + "grad_norm": 0.12109375, + "learning_rate": 0.0003438204032979041, + "loss": 0.4968, + "step": 114810 + }, + { + "epoch": 5.70279129830138, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003437806695142545, + "loss": 0.4965, + "step": 114820 + }, + { + "epoch": 5.703287970597, + "grad_norm": 0.1328125, + "learning_rate": 0.00034374093573060497, + "loss": 0.5153, + "step": 114830 + }, + { + "epoch": 5.70378464289262, + "grad_norm": 0.12060546875, + "learning_rate": 0.00034370120194695544, + "loss": 0.5069, + "step": 114840 + }, + { + "epoch": 5.704281315188239, + "grad_norm": 0.1259765625, + "learning_rate": 0.00034366146816330585, + "loss": 0.4992, + "step": 114850 + }, + { + "epoch": 5.704777987483858, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003436217343796563, + "loss": 0.4968, + "step": 114860 + }, + { + "epoch": 5.7052746597794775, + "grad_norm": 0.1328125, + "learning_rate": 0.0003435820005960068, + "loss": 0.482, + "step": 114870 + }, + { + "epoch": 5.705771332075097, + "grad_norm": 0.115234375, + "learning_rate": 0.0003435422668123572, + "loss": 0.4977, + "step": 114880 + }, + { + "epoch": 5.706268004370716, + "grad_norm": 0.1328125, + "learning_rate": 0.0003435025330287077, + "loss": 0.5002, + "step": 114890 + }, + { + "epoch": 5.706764676666335, + "grad_norm": 0.1572265625, + "learning_rate": 0.00034346279924505816, + "loss": 0.5028, + "step": 114900 + }, + { + "epoch": 5.707261348961955, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003434230654614086, + "loss": 0.522, + "step": 114910 + }, + { + "epoch": 5.7077580212575745, + "grad_norm": 0.1728515625, + "learning_rate": 0.00034338333167775904, + "loss": 0.4802, + "step": 114920 + }, + { + "epoch": 5.708254693553194, + "grad_norm": 0.158203125, + "learning_rate": 0.00034334359789410946, + "loss": 0.4978, + "step": 114930 + }, + { + "epoch": 5.708751365848813, + "grad_norm": 0.1416015625, + "learning_rate": 0.00034330386411045993, + "loss": 0.5344, + "step": 114940 + }, + { + "epoch": 5.709248038144432, + "grad_norm": 0.103515625, + "learning_rate": 0.0003432641303268104, + "loss": 0.5075, + "step": 114950 + }, + { + "epoch": 5.709744710440051, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003432243965431608, + "loss": 0.4632, + "step": 114960 + }, + { + "epoch": 5.710241382735671, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003431846627595113, + "loss": 0.495, + "step": 114970 + }, + { + "epoch": 5.710738055031291, + "grad_norm": 0.140625, + "learning_rate": 0.00034314492897586176, + "loss": 0.5062, + "step": 114980 + }, + { + "epoch": 5.71123472732691, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003431051951922122, + "loss": 0.5016, + "step": 114990 + }, + { + "epoch": 5.711731399622529, + "grad_norm": 0.11376953125, + "learning_rate": 0.00034306546140856265, + "loss": 0.4985, + "step": 115000 + }, + { + "epoch": 5.712228071918148, + "grad_norm": 0.126953125, + "learning_rate": 0.00034302572762491307, + "loss": 0.5135, + "step": 115010 + }, + { + "epoch": 5.712724744213768, + "grad_norm": 0.12890625, + "learning_rate": 0.00034298599384126354, + "loss": 0.5065, + "step": 115020 + }, + { + "epoch": 5.713221416509387, + "grad_norm": 0.1611328125, + "learning_rate": 0.000342946260057614, + "loss": 0.4908, + "step": 115030 + }, + { + "epoch": 5.713718088805006, + "grad_norm": 0.11962890625, + "learning_rate": 0.00034290652627396443, + "loss": 0.5072, + "step": 115040 + }, + { + "epoch": 5.714214761100626, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003428667924903149, + "loss": 0.4969, + "step": 115050 + }, + { + "epoch": 5.714711433396245, + "grad_norm": 0.1318359375, + "learning_rate": 0.00034282705870666537, + "loss": 0.4945, + "step": 115060 + }, + { + "epoch": 5.715208105691865, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003427873249230158, + "loss": 0.502, + "step": 115070 + }, + { + "epoch": 5.715704777987484, + "grad_norm": 0.1337890625, + "learning_rate": 0.00034274759113936626, + "loss": 0.5, + "step": 115080 + }, + { + "epoch": 5.716201450283103, + "grad_norm": 0.142578125, + "learning_rate": 0.00034270785735571673, + "loss": 0.49, + "step": 115090 + }, + { + "epoch": 5.716698122578722, + "grad_norm": 0.11279296875, + "learning_rate": 0.00034266812357206715, + "loss": 0.4713, + "step": 115100 + }, + { + "epoch": 5.717194794874342, + "grad_norm": 0.1494140625, + "learning_rate": 0.0003426283897884176, + "loss": 0.4878, + "step": 115110 + }, + { + "epoch": 5.717691467169962, + "grad_norm": 0.12353515625, + "learning_rate": 0.00034258865600476804, + "loss": 0.4958, + "step": 115120 + }, + { + "epoch": 5.718188139465581, + "grad_norm": 0.11328125, + "learning_rate": 0.00034254892222111856, + "loss": 0.4822, + "step": 115130 + }, + { + "epoch": 5.7186848117612, + "grad_norm": 0.1552734375, + "learning_rate": 0.000342509188437469, + "loss": 0.5026, + "step": 115140 + }, + { + "epoch": 5.719181484056819, + "grad_norm": 0.125, + "learning_rate": 0.0003424694546538194, + "loss": 0.5005, + "step": 115150 + }, + { + "epoch": 5.719678156352439, + "grad_norm": 0.166015625, + "learning_rate": 0.00034242972087016987, + "loss": 0.5175, + "step": 115160 + }, + { + "epoch": 5.720174828648058, + "grad_norm": 0.1484375, + "learning_rate": 0.00034238998708652034, + "loss": 0.4952, + "step": 115170 + }, + { + "epoch": 5.720671500943677, + "grad_norm": 0.1357421875, + "learning_rate": 0.00034235025330287076, + "loss": 0.4982, + "step": 115180 + }, + { + "epoch": 5.721168173239297, + "grad_norm": 0.10888671875, + "learning_rate": 0.00034231051951922123, + "loss": 0.4914, + "step": 115190 + }, + { + "epoch": 5.721664845534916, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003422707857355717, + "loss": 0.5042, + "step": 115200 + }, + { + "epoch": 5.722161517830536, + "grad_norm": 0.1171875, + "learning_rate": 0.00034223105195192217, + "loss": 0.503, + "step": 115210 + }, + { + "epoch": 5.722658190126155, + "grad_norm": 0.1455078125, + "learning_rate": 0.0003421913181682726, + "loss": 0.4848, + "step": 115220 + }, + { + "epoch": 5.723154862421774, + "grad_norm": 0.16796875, + "learning_rate": 0.000342151584384623, + "loss": 0.4676, + "step": 115230 + }, + { + "epoch": 5.723651534717393, + "grad_norm": 0.11083984375, + "learning_rate": 0.00034211185060097353, + "loss": 0.4628, + "step": 115240 + }, + { + "epoch": 5.7241482070130125, + "grad_norm": 0.115234375, + "learning_rate": 0.00034207211681732395, + "loss": 0.5114, + "step": 115250 + }, + { + "epoch": 5.724644879308633, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003420323830336744, + "loss": 0.5178, + "step": 115260 + }, + { + "epoch": 5.725141551604252, + "grad_norm": 0.138671875, + "learning_rate": 0.00034199264925002484, + "loss": 0.49, + "step": 115270 + }, + { + "epoch": 5.725638223899871, + "grad_norm": 0.11328125, + "learning_rate": 0.0003419529154663753, + "loss": 0.4979, + "step": 115280 + }, + { + "epoch": 5.72613489619549, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003419131816827258, + "loss": 0.52, + "step": 115290 + }, + { + "epoch": 5.7266315684911095, + "grad_norm": 0.142578125, + "learning_rate": 0.0003418734478990762, + "loss": 0.4943, + "step": 115300 + }, + { + "epoch": 5.727128240786729, + "grad_norm": 0.134765625, + "learning_rate": 0.0003418337141154266, + "loss": 0.5051, + "step": 115310 + }, + { + "epoch": 5.727624913082348, + "grad_norm": 0.12890625, + "learning_rate": 0.00034179398033177714, + "loss": 0.4983, + "step": 115320 + }, + { + "epoch": 5.728121585377967, + "grad_norm": 0.123046875, + "learning_rate": 0.00034175424654812756, + "loss": 0.5153, + "step": 115330 + }, + { + "epoch": 5.728618257673587, + "grad_norm": 0.119140625, + "learning_rate": 0.00034171451276447803, + "loss": 0.4977, + "step": 115340 + }, + { + "epoch": 5.729114929969207, + "grad_norm": 0.1298828125, + "learning_rate": 0.00034167477898082845, + "loss": 0.5151, + "step": 115350 + }, + { + "epoch": 5.729611602264826, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003416350451971789, + "loss": 0.4898, + "step": 115360 + }, + { + "epoch": 5.730108274560445, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003415953114135294, + "loss": 0.4805, + "step": 115370 + }, + { + "epoch": 5.730604946856064, + "grad_norm": 0.1171875, + "learning_rate": 0.0003415555776298798, + "loss": 0.4821, + "step": 115380 + }, + { + "epoch": 5.7311016191516835, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003415158438462303, + "loss": 0.5175, + "step": 115390 + }, + { + "epoch": 5.731598291447303, + "grad_norm": 0.12060546875, + "learning_rate": 0.00034147611006258075, + "loss": 0.5031, + "step": 115400 + }, + { + "epoch": 5.732094963742922, + "grad_norm": 0.119140625, + "learning_rate": 0.00034143637627893117, + "loss": 0.5136, + "step": 115410 + }, + { + "epoch": 5.732591636038542, + "grad_norm": 0.12255859375, + "learning_rate": 0.00034139664249528164, + "loss": 0.5181, + "step": 115420 + }, + { + "epoch": 5.733088308334161, + "grad_norm": 0.1162109375, + "learning_rate": 0.0003413569087116321, + "loss": 0.4761, + "step": 115430 + }, + { + "epoch": 5.7335849806297805, + "grad_norm": 0.1611328125, + "learning_rate": 0.0003413171749279825, + "loss": 0.513, + "step": 115440 + }, + { + "epoch": 5.7340816529254, + "grad_norm": 0.12890625, + "learning_rate": 0.000341277441144333, + "loss": 0.4977, + "step": 115450 + }, + { + "epoch": 5.734578325221019, + "grad_norm": 0.1328125, + "learning_rate": 0.0003412377073606834, + "loss": 0.4923, + "step": 115460 + }, + { + "epoch": 5.735074997516638, + "grad_norm": 0.1328125, + "learning_rate": 0.0003411979735770339, + "loss": 0.4929, + "step": 115470 + }, + { + "epoch": 5.735571669812257, + "grad_norm": 0.11669921875, + "learning_rate": 0.00034115823979338436, + "loss": 0.5056, + "step": 115480 + }, + { + "epoch": 5.7360683421078775, + "grad_norm": 0.1171875, + "learning_rate": 0.0003411185060097348, + "loss": 0.487, + "step": 115490 + }, + { + "epoch": 5.736565014403497, + "grad_norm": 0.1630859375, + "learning_rate": 0.00034107877222608524, + "loss": 0.483, + "step": 115500 + }, + { + "epoch": 5.737061686699116, + "grad_norm": 0.12109375, + "learning_rate": 0.0003410390384424357, + "loss": 0.5022, + "step": 115510 + }, + { + "epoch": 5.737558358994735, + "grad_norm": 0.1240234375, + "learning_rate": 0.00034099930465878613, + "loss": 0.4968, + "step": 115520 + }, + { + "epoch": 5.738055031290354, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003409595708751366, + "loss": 0.5115, + "step": 115530 + }, + { + "epoch": 5.738551703585974, + "grad_norm": 0.12255859375, + "learning_rate": 0.000340919837091487, + "loss": 0.4868, + "step": 115540 + }, + { + "epoch": 5.739048375881593, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003408801033078375, + "loss": 0.5299, + "step": 115550 + }, + { + "epoch": 5.739545048177213, + "grad_norm": 0.1318359375, + "learning_rate": 0.00034084036952418796, + "loss": 0.4985, + "step": 115560 + }, + { + "epoch": 5.740041720472832, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003408006357405384, + "loss": 0.4886, + "step": 115570 + }, + { + "epoch": 5.7405383927684515, + "grad_norm": 0.1171875, + "learning_rate": 0.00034076090195688885, + "loss": 0.4821, + "step": 115580 + }, + { + "epoch": 5.741035065064071, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003407211681732393, + "loss": 0.5072, + "step": 115590 + }, + { + "epoch": 5.74153173735969, + "grad_norm": 0.1279296875, + "learning_rate": 0.00034068143438958974, + "loss": 0.5099, + "step": 115600 + }, + { + "epoch": 5.742028409655309, + "grad_norm": 0.140625, + "learning_rate": 0.0003406417006059402, + "loss": 0.5313, + "step": 115610 + }, + { + "epoch": 5.742525081950928, + "grad_norm": 0.12890625, + "learning_rate": 0.0003406019668222907, + "loss": 0.5168, + "step": 115620 + }, + { + "epoch": 5.7430217542465485, + "grad_norm": 0.1494140625, + "learning_rate": 0.00034056223303864116, + "loss": 0.4783, + "step": 115630 + }, + { + "epoch": 5.743518426542168, + "grad_norm": 0.138671875, + "learning_rate": 0.00034052249925499157, + "loss": 0.5161, + "step": 115640 + }, + { + "epoch": 5.744015098837787, + "grad_norm": 0.1298828125, + "learning_rate": 0.000340482765471342, + "loss": 0.4956, + "step": 115650 + }, + { + "epoch": 5.744511771133406, + "grad_norm": 0.134765625, + "learning_rate": 0.0003404430316876925, + "loss": 0.4915, + "step": 115660 + }, + { + "epoch": 5.745008443429025, + "grad_norm": 0.115234375, + "learning_rate": 0.00034040329790404293, + "loss": 0.5149, + "step": 115670 + }, + { + "epoch": 5.745505115724645, + "grad_norm": 0.12353515625, + "learning_rate": 0.00034036356412039335, + "loss": 0.4939, + "step": 115680 + }, + { + "epoch": 5.746001788020264, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003403238303367438, + "loss": 0.4763, + "step": 115690 + }, + { + "epoch": 5.746498460315884, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003402840965530943, + "loss": 0.5225, + "step": 115700 + }, + { + "epoch": 5.746995132611503, + "grad_norm": 0.1787109375, + "learning_rate": 0.00034024436276944476, + "loss": 0.5139, + "step": 115710 + }, + { + "epoch": 5.747491804907122, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003402046289857952, + "loss": 0.5062, + "step": 115720 + }, + { + "epoch": 5.747988477202742, + "grad_norm": 0.1533203125, + "learning_rate": 0.00034016489520214565, + "loss": 0.5167, + "step": 115730 + }, + { + "epoch": 5.748485149498361, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003401251614184961, + "loss": 0.5147, + "step": 115740 + }, + { + "epoch": 5.74898182179398, + "grad_norm": 0.11767578125, + "learning_rate": 0.00034008542763484654, + "loss": 0.5476, + "step": 115750 + }, + { + "epoch": 5.749478494089599, + "grad_norm": 0.1279296875, + "learning_rate": 0.00034004569385119696, + "loss": 0.5189, + "step": 115760 + }, + { + "epoch": 5.749975166385219, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003400059600675475, + "loss": 0.4933, + "step": 115770 + }, + { + "epoch": 5.750471838680839, + "grad_norm": 0.119140625, + "learning_rate": 0.0003399662262838979, + "loss": 0.5053, + "step": 115780 + }, + { + "epoch": 5.750968510976458, + "grad_norm": 0.1318359375, + "learning_rate": 0.00033992649250024837, + "loss": 0.5216, + "step": 115790 + }, + { + "epoch": 5.751465183272077, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003398867587165988, + "loss": 0.4888, + "step": 115800 + }, + { + "epoch": 5.751961855567696, + "grad_norm": 0.1318359375, + "learning_rate": 0.00033984702493294926, + "loss": 0.5226, + "step": 115810 + }, + { + "epoch": 5.752458527863316, + "grad_norm": 0.1240234375, + "learning_rate": 0.00033980729114929973, + "loss": 0.5049, + "step": 115820 + }, + { + "epoch": 5.752955200158935, + "grad_norm": 0.1396484375, + "learning_rate": 0.00033976755736565015, + "loss": 0.5168, + "step": 115830 + }, + { + "epoch": 5.753451872454555, + "grad_norm": 0.134765625, + "learning_rate": 0.00033972782358200057, + "loss": 0.4807, + "step": 115840 + }, + { + "epoch": 5.753948544750174, + "grad_norm": 0.1162109375, + "learning_rate": 0.0003396880897983511, + "loss": 0.492, + "step": 115850 + }, + { + "epoch": 5.754445217045793, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003396483560147015, + "loss": 0.4788, + "step": 115860 + }, + { + "epoch": 5.754941889341413, + "grad_norm": 0.12451171875, + "learning_rate": 0.000339608622231052, + "loss": 0.507, + "step": 115870 + }, + { + "epoch": 5.755438561637032, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003395688884474024, + "loss": 0.5107, + "step": 115880 + }, + { + "epoch": 5.755935233932651, + "grad_norm": 0.142578125, + "learning_rate": 0.00033952915466375287, + "loss": 0.4984, + "step": 115890 + }, + { + "epoch": 5.75643190622827, + "grad_norm": 0.12109375, + "learning_rate": 0.00033948942088010334, + "loss": 0.5008, + "step": 115900 + }, + { + "epoch": 5.75692857852389, + "grad_norm": 0.126953125, + "learning_rate": 0.00033944968709645376, + "loss": 0.508, + "step": 115910 + }, + { + "epoch": 5.75742525081951, + "grad_norm": 0.181640625, + "learning_rate": 0.00033940995331280423, + "loss": 0.5232, + "step": 115920 + }, + { + "epoch": 5.757921923115129, + "grad_norm": 0.126953125, + "learning_rate": 0.0003393702195291547, + "loss": 0.4951, + "step": 115930 + }, + { + "epoch": 5.758418595410748, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003393304857455051, + "loss": 0.5019, + "step": 115940 + }, + { + "epoch": 5.758915267706367, + "grad_norm": 0.115234375, + "learning_rate": 0.0003392907519618556, + "loss": 0.5148, + "step": 115950 + }, + { + "epoch": 5.7594119400019865, + "grad_norm": 0.134765625, + "learning_rate": 0.00033925101817820606, + "loss": 0.5035, + "step": 115960 + }, + { + "epoch": 5.759908612297606, + "grad_norm": 0.126953125, + "learning_rate": 0.0003392112843945565, + "loss": 0.4882, + "step": 115970 + }, + { + "epoch": 5.760405284593226, + "grad_norm": 0.150390625, + "learning_rate": 0.00033917155061090695, + "loss": 0.4819, + "step": 115980 + }, + { + "epoch": 5.760901956888845, + "grad_norm": 0.126953125, + "learning_rate": 0.00033913181682725737, + "loss": 0.5107, + "step": 115990 + }, + { + "epoch": 5.761398629184464, + "grad_norm": 0.1259765625, + "learning_rate": 0.00033909208304360784, + "loss": 0.4972, + "step": 116000 + }, + { + "epoch": 5.7618953014800836, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003390523492599583, + "loss": 0.5162, + "step": 116010 + }, + { + "epoch": 5.762391973775703, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003390126154763087, + "loss": 0.5025, + "step": 116020 + }, + { + "epoch": 5.762888646071322, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003389728816926592, + "loss": 0.5068, + "step": 116030 + }, + { + "epoch": 5.763385318366941, + "grad_norm": 0.11572265625, + "learning_rate": 0.00033893314790900967, + "loss": 0.4853, + "step": 116040 + }, + { + "epoch": 5.7638819906625605, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003388934141253601, + "loss": 0.5271, + "step": 116050 + }, + { + "epoch": 5.764378662958181, + "grad_norm": 0.11376953125, + "learning_rate": 0.00033885368034171056, + "loss": 0.4749, + "step": 116060 + }, + { + "epoch": 5.7648753352538, + "grad_norm": 0.10888671875, + "learning_rate": 0.00033881394655806103, + "loss": 0.486, + "step": 116070 + }, + { + "epoch": 5.765372007549419, + "grad_norm": 0.1455078125, + "learning_rate": 0.0003387742127744115, + "loss": 0.4745, + "step": 116080 + }, + { + "epoch": 5.765868679845038, + "grad_norm": 0.11328125, + "learning_rate": 0.0003387344789907619, + "loss": 0.4979, + "step": 116090 + }, + { + "epoch": 5.7663653521406575, + "grad_norm": 0.109375, + "learning_rate": 0.00033869474520711233, + "loss": 0.4611, + "step": 116100 + }, + { + "epoch": 5.766862024436277, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003386550114234628, + "loss": 0.5248, + "step": 116110 + }, + { + "epoch": 5.767358696731896, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003386152776398133, + "loss": 0.5053, + "step": 116120 + }, + { + "epoch": 5.767855369027515, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003385755438561637, + "loss": 0.5074, + "step": 116130 + }, + { + "epoch": 5.768352041323135, + "grad_norm": 0.11376953125, + "learning_rate": 0.00033853581007251416, + "loss": 0.5085, + "step": 116140 + }, + { + "epoch": 5.7688487136187545, + "grad_norm": 0.1328125, + "learning_rate": 0.00033849607628886464, + "loss": 0.5461, + "step": 116150 + }, + { + "epoch": 5.769345385914374, + "grad_norm": 0.126953125, + "learning_rate": 0.0003384563425052151, + "loss": 0.5136, + "step": 116160 + }, + { + "epoch": 5.769842058209993, + "grad_norm": 0.13671875, + "learning_rate": 0.0003384166087215655, + "loss": 0.5117, + "step": 116170 + }, + { + "epoch": 5.770338730505612, + "grad_norm": 0.11962890625, + "learning_rate": 0.00033837687493791594, + "loss": 0.5246, + "step": 116180 + }, + { + "epoch": 5.770835402801231, + "grad_norm": 0.11474609375, + "learning_rate": 0.00033833714115426647, + "loss": 0.4963, + "step": 116190 + }, + { + "epoch": 5.771332075096851, + "grad_norm": 0.1171875, + "learning_rate": 0.0003382974073706169, + "loss": 0.5181, + "step": 116200 + }, + { + "epoch": 5.771828747392471, + "grad_norm": 0.10888671875, + "learning_rate": 0.0003382576735869673, + "loss": 0.4944, + "step": 116210 + }, + { + "epoch": 5.77232541968809, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003382179398033178, + "loss": 0.525, + "step": 116220 + }, + { + "epoch": 5.772822091983709, + "grad_norm": 0.1171875, + "learning_rate": 0.00033817820601966824, + "loss": 0.4985, + "step": 116230 + }, + { + "epoch": 5.773318764279328, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003381384722360187, + "loss": 0.4936, + "step": 116240 + }, + { + "epoch": 5.773815436574948, + "grad_norm": 0.154296875, + "learning_rate": 0.00033809873845236913, + "loss": 0.498, + "step": 116250 + }, + { + "epoch": 5.774312108870567, + "grad_norm": 0.14453125, + "learning_rate": 0.0003380590046687196, + "loss": 0.5084, + "step": 116260 + }, + { + "epoch": 5.774808781166186, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003380192708850701, + "loss": 0.4746, + "step": 116270 + }, + { + "epoch": 5.775305453461806, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003379795371014205, + "loss": 0.5184, + "step": 116280 + }, + { + "epoch": 5.7758021257574255, + "grad_norm": 0.146484375, + "learning_rate": 0.0003379398033177709, + "loss": 0.5068, + "step": 116290 + }, + { + "epoch": 5.776298798053045, + "grad_norm": 0.1142578125, + "learning_rate": 0.00033790006953412144, + "loss": 0.5128, + "step": 116300 + }, + { + "epoch": 5.776795470348664, + "grad_norm": 0.1494140625, + "learning_rate": 0.00033786033575047185, + "loss": 0.4971, + "step": 116310 + }, + { + "epoch": 5.777292142644283, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003378206019668223, + "loss": 0.5134, + "step": 116320 + }, + { + "epoch": 5.777788814939902, + "grad_norm": 0.12353515625, + "learning_rate": 0.00033778086818317274, + "loss": 0.4887, + "step": 116330 + }, + { + "epoch": 5.778285487235522, + "grad_norm": 0.12890625, + "learning_rate": 0.0003377411343995232, + "loss": 0.4808, + "step": 116340 + }, + { + "epoch": 5.778782159531142, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003377014006158737, + "loss": 0.5144, + "step": 116350 + }, + { + "epoch": 5.779278831826761, + "grad_norm": 0.1201171875, + "learning_rate": 0.0003376616668322241, + "loss": 0.4807, + "step": 116360 + }, + { + "epoch": 5.77977550412238, + "grad_norm": 0.142578125, + "learning_rate": 0.00033762193304857457, + "loss": 0.4983, + "step": 116370 + }, + { + "epoch": 5.780272176417999, + "grad_norm": 0.130859375, + "learning_rate": 0.00033758219926492504, + "loss": 0.4903, + "step": 116380 + }, + { + "epoch": 5.780768848713619, + "grad_norm": 0.1435546875, + "learning_rate": 0.00033754246548127546, + "loss": 0.5133, + "step": 116390 + }, + { + "epoch": 5.781265521009238, + "grad_norm": 0.119140625, + "learning_rate": 0.00033750273169762593, + "loss": 0.5031, + "step": 116400 + }, + { + "epoch": 5.781762193304857, + "grad_norm": 0.1328125, + "learning_rate": 0.00033746299791397635, + "loss": 0.4948, + "step": 116410 + }, + { + "epoch": 5.782258865600477, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003374232641303268, + "loss": 0.4974, + "step": 116420 + }, + { + "epoch": 5.782755537896096, + "grad_norm": 0.130859375, + "learning_rate": 0.0003373835303466773, + "loss": 0.5041, + "step": 116430 + }, + { + "epoch": 5.783252210191716, + "grad_norm": 0.1494140625, + "learning_rate": 0.0003373437965630277, + "loss": 0.5212, + "step": 116440 + }, + { + "epoch": 5.783748882487335, + "grad_norm": 0.125, + "learning_rate": 0.0003373040627793782, + "loss": 0.4982, + "step": 116450 + }, + { + "epoch": 5.784245554782954, + "grad_norm": 0.166015625, + "learning_rate": 0.00033726432899572865, + "loss": 0.5074, + "step": 116460 + }, + { + "epoch": 5.784742227078573, + "grad_norm": 0.140625, + "learning_rate": 0.00033722459521207907, + "loss": 0.5313, + "step": 116470 + }, + { + "epoch": 5.785238899374193, + "grad_norm": 0.123046875, + "learning_rate": 0.00033718486142842954, + "loss": 0.521, + "step": 116480 + }, + { + "epoch": 5.785735571669813, + "grad_norm": 0.1640625, + "learning_rate": 0.00033714512764478, + "loss": 0.5085, + "step": 116490 + }, + { + "epoch": 5.786232243965432, + "grad_norm": 0.1328125, + "learning_rate": 0.00033710539386113043, + "loss": 0.5141, + "step": 116500 + }, + { + "epoch": 5.786728916261051, + "grad_norm": 0.12890625, + "learning_rate": 0.0003370656600774809, + "loss": 0.5, + "step": 116510 + }, + { + "epoch": 5.78722558855667, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003370259262938313, + "loss": 0.5004, + "step": 116520 + }, + { + "epoch": 5.78772226085229, + "grad_norm": 0.146484375, + "learning_rate": 0.00033698619251018184, + "loss": 0.4756, + "step": 116530 + }, + { + "epoch": 5.788218933147909, + "grad_norm": 0.1259765625, + "learning_rate": 0.00033694645872653226, + "loss": 0.4956, + "step": 116540 + }, + { + "epoch": 5.788715605443528, + "grad_norm": 0.1484375, + "learning_rate": 0.0003369067249428827, + "loss": 0.4909, + "step": 116550 + }, + { + "epoch": 5.789212277739148, + "grad_norm": 0.115234375, + "learning_rate": 0.00033686699115923315, + "loss": 0.5151, + "step": 116560 + }, + { + "epoch": 5.789708950034767, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003368272573755836, + "loss": 0.5047, + "step": 116570 + }, + { + "epoch": 5.790205622330387, + "grad_norm": 0.1376953125, + "learning_rate": 0.00033678752359193404, + "loss": 0.5064, + "step": 116580 + }, + { + "epoch": 5.790702294626006, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003367477898082845, + "loss": 0.5127, + "step": 116590 + }, + { + "epoch": 5.791198966921625, + "grad_norm": 0.146484375, + "learning_rate": 0.000336708056024635, + "loss": 0.5037, + "step": 116600 + }, + { + "epoch": 5.791695639217244, + "grad_norm": 0.1591796875, + "learning_rate": 0.00033666832224098545, + "loss": 0.5015, + "step": 116610 + }, + { + "epoch": 5.7921923115128635, + "grad_norm": 0.126953125, + "learning_rate": 0.00033662858845733587, + "loss": 0.524, + "step": 116620 + }, + { + "epoch": 5.792688983808484, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003365888546736863, + "loss": 0.4881, + "step": 116630 + }, + { + "epoch": 5.793185656104103, + "grad_norm": 0.125, + "learning_rate": 0.0003365491208900368, + "loss": 0.5178, + "step": 116640 + }, + { + "epoch": 5.793682328399722, + "grad_norm": 0.1171875, + "learning_rate": 0.00033650938710638723, + "loss": 0.4994, + "step": 116650 + }, + { + "epoch": 5.794179000695341, + "grad_norm": 0.1162109375, + "learning_rate": 0.00033646965332273765, + "loss": 0.4946, + "step": 116660 + }, + { + "epoch": 5.7946756729909605, + "grad_norm": 0.134765625, + "learning_rate": 0.0003364299195390881, + "loss": 0.4832, + "step": 116670 + }, + { + "epoch": 5.79517234528658, + "grad_norm": 0.12109375, + "learning_rate": 0.0003363901857554386, + "loss": 0.5086, + "step": 116680 + }, + { + "epoch": 5.795669017582199, + "grad_norm": 0.12890625, + "learning_rate": 0.00033635045197178906, + "loss": 0.5078, + "step": 116690 + }, + { + "epoch": 5.796165689877819, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003363107181881395, + "loss": 0.5327, + "step": 116700 + }, + { + "epoch": 5.796662362173438, + "grad_norm": 0.1142578125, + "learning_rate": 0.0003362709844044899, + "loss": 0.4991, + "step": 116710 + }, + { + "epoch": 5.797159034469058, + "grad_norm": 0.1845703125, + "learning_rate": 0.0003362312506208404, + "loss": 0.4685, + "step": 116720 + }, + { + "epoch": 5.797655706764677, + "grad_norm": 0.12109375, + "learning_rate": 0.00033619151683719084, + "loss": 0.5092, + "step": 116730 + }, + { + "epoch": 5.798152379060296, + "grad_norm": 0.1279296875, + "learning_rate": 0.00033615178305354125, + "loss": 0.4894, + "step": 116740 + }, + { + "epoch": 5.798649051355915, + "grad_norm": 0.107421875, + "learning_rate": 0.0003361120492698917, + "loss": 0.4927, + "step": 116750 + }, + { + "epoch": 5.7991457236515345, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003360723154862422, + "loss": 0.5244, + "step": 116760 + }, + { + "epoch": 5.799642395947154, + "grad_norm": 0.11572265625, + "learning_rate": 0.00033603258170259267, + "loss": 0.5282, + "step": 116770 + }, + { + "epoch": 5.800139068242773, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003359928479189431, + "loss": 0.5163, + "step": 116780 + }, + { + "epoch": 5.800635740538393, + "grad_norm": 0.138671875, + "learning_rate": 0.00033595311413529356, + "loss": 0.5125, + "step": 116790 + }, + { + "epoch": 5.801132412834012, + "grad_norm": 0.1435546875, + "learning_rate": 0.00033591338035164403, + "loss": 0.5156, + "step": 116800 + }, + { + "epoch": 5.8016290851296315, + "grad_norm": 0.17578125, + "learning_rate": 0.00033587364656799444, + "loss": 0.4645, + "step": 116810 + }, + { + "epoch": 5.802125757425251, + "grad_norm": 0.1357421875, + "learning_rate": 0.00033583391278434486, + "loss": 0.4997, + "step": 116820 + }, + { + "epoch": 5.80262242972087, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003357941790006954, + "loss": 0.5129, + "step": 116830 + }, + { + "epoch": 5.803119102016489, + "grad_norm": 0.11328125, + "learning_rate": 0.0003357544452170458, + "loss": 0.4992, + "step": 116840 + }, + { + "epoch": 5.803615774312108, + "grad_norm": 0.150390625, + "learning_rate": 0.0003357147114333963, + "loss": 0.5252, + "step": 116850 + }, + { + "epoch": 5.8041124466077285, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003356749776497467, + "loss": 0.5282, + "step": 116860 + }, + { + "epoch": 5.804609118903348, + "grad_norm": 0.12109375, + "learning_rate": 0.00033563524386609716, + "loss": 0.4747, + "step": 116870 + }, + { + "epoch": 5.805105791198967, + "grad_norm": 0.130859375, + "learning_rate": 0.00033559551008244764, + "loss": 0.4811, + "step": 116880 + }, + { + "epoch": 5.805602463494586, + "grad_norm": 0.1513671875, + "learning_rate": 0.00033555577629879805, + "loss": 0.4758, + "step": 116890 + }, + { + "epoch": 5.806099135790205, + "grad_norm": 0.142578125, + "learning_rate": 0.0003355160425151485, + "loss": 0.5076, + "step": 116900 + }, + { + "epoch": 5.806595808085825, + "grad_norm": 0.1279296875, + "learning_rate": 0.000335476308731499, + "loss": 0.4745, + "step": 116910 + }, + { + "epoch": 5.807092480381444, + "grad_norm": 0.1162109375, + "learning_rate": 0.0003354365749478494, + "loss": 0.5009, + "step": 116920 + }, + { + "epoch": 5.807589152677064, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003353968411641999, + "loss": 0.5389, + "step": 116930 + }, + { + "epoch": 5.808085824972683, + "grad_norm": 0.12353515625, + "learning_rate": 0.00033535710738055036, + "loss": 0.5034, + "step": 116940 + }, + { + "epoch": 5.8085824972683024, + "grad_norm": 0.1337890625, + "learning_rate": 0.00033531737359690077, + "loss": 0.5461, + "step": 116950 + }, + { + "epoch": 5.809079169563922, + "grad_norm": 0.125, + "learning_rate": 0.00033527763981325124, + "loss": 0.4941, + "step": 116960 + }, + { + "epoch": 5.809575841859541, + "grad_norm": 0.1083984375, + "learning_rate": 0.00033523790602960166, + "loss": 0.5037, + "step": 116970 + }, + { + "epoch": 5.81007251415516, + "grad_norm": 0.1142578125, + "learning_rate": 0.00033519817224595213, + "loss": 0.5012, + "step": 116980 + }, + { + "epoch": 5.810569186450779, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003351584384623026, + "loss": 0.4791, + "step": 116990 + }, + { + "epoch": 5.8110658587463995, + "grad_norm": 0.1259765625, + "learning_rate": 0.000335118704678653, + "loss": 0.49, + "step": 117000 + }, + { + "epoch": 5.811562531042019, + "grad_norm": 0.1435546875, + "learning_rate": 0.0003350789708950035, + "loss": 0.4665, + "step": 117010 + }, + { + "epoch": 5.812059203337638, + "grad_norm": 0.1201171875, + "learning_rate": 0.00033503923711135396, + "loss": 0.5215, + "step": 117020 + }, + { + "epoch": 5.812555875633257, + "grad_norm": 0.1533203125, + "learning_rate": 0.0003349995033277044, + "loss": 0.5096, + "step": 117030 + }, + { + "epoch": 5.813052547928876, + "grad_norm": 0.126953125, + "learning_rate": 0.00033495976954405485, + "loss": 0.4757, + "step": 117040 + }, + { + "epoch": 5.813549220224496, + "grad_norm": 0.1103515625, + "learning_rate": 0.00033492003576040527, + "loss": 0.4872, + "step": 117050 + }, + { + "epoch": 5.814045892520115, + "grad_norm": 0.1171875, + "learning_rate": 0.0003348803019767558, + "loss": 0.4954, + "step": 117060 + }, + { + "epoch": 5.814542564815735, + "grad_norm": 0.1455078125, + "learning_rate": 0.0003348405681931062, + "loss": 0.5031, + "step": 117070 + }, + { + "epoch": 5.815039237111354, + "grad_norm": 0.12158203125, + "learning_rate": 0.00033480083440945663, + "loss": 0.5036, + "step": 117080 + }, + { + "epoch": 5.815535909406973, + "grad_norm": 0.1201171875, + "learning_rate": 0.0003347611006258071, + "loss": 0.4804, + "step": 117090 + }, + { + "epoch": 5.816032581702593, + "grad_norm": 0.1103515625, + "learning_rate": 0.00033472136684215757, + "loss": 0.5143, + "step": 117100 + }, + { + "epoch": 5.816529253998212, + "grad_norm": 0.1357421875, + "learning_rate": 0.000334681633058508, + "loss": 0.5067, + "step": 117110 + }, + { + "epoch": 5.817025926293831, + "grad_norm": 0.119140625, + "learning_rate": 0.00033464189927485846, + "loss": 0.5047, + "step": 117120 + }, + { + "epoch": 5.81752259858945, + "grad_norm": 0.1416015625, + "learning_rate": 0.00033460216549120893, + "loss": 0.5313, + "step": 117130 + }, + { + "epoch": 5.81801927088507, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003345624317075594, + "loss": 0.5599, + "step": 117140 + }, + { + "epoch": 5.81851594318069, + "grad_norm": 0.150390625, + "learning_rate": 0.0003345226979239098, + "loss": 0.4944, + "step": 117150 + }, + { + "epoch": 5.819012615476309, + "grad_norm": 0.119140625, + "learning_rate": 0.00033448296414026024, + "loss": 0.5168, + "step": 117160 + }, + { + "epoch": 5.819509287771928, + "grad_norm": 0.1162109375, + "learning_rate": 0.00033444323035661076, + "loss": 0.4926, + "step": 117170 + }, + { + "epoch": 5.820005960067547, + "grad_norm": 0.111328125, + "learning_rate": 0.0003344034965729612, + "loss": 0.4978, + "step": 117180 + }, + { + "epoch": 5.820502632363167, + "grad_norm": 0.1767578125, + "learning_rate": 0.0003343637627893116, + "loss": 0.5229, + "step": 117190 + }, + { + "epoch": 5.820999304658786, + "grad_norm": 0.203125, + "learning_rate": 0.00033432402900566207, + "loss": 0.5083, + "step": 117200 + }, + { + "epoch": 5.821495976954406, + "grad_norm": 0.126953125, + "learning_rate": 0.00033428429522201254, + "loss": 0.5103, + "step": 117210 + }, + { + "epoch": 5.821992649250025, + "grad_norm": 0.119140625, + "learning_rate": 0.000334244561438363, + "loss": 0.508, + "step": 117220 + }, + { + "epoch": 5.822489321545644, + "grad_norm": 0.111328125, + "learning_rate": 0.00033420482765471343, + "loss": 0.4983, + "step": 117230 + }, + { + "epoch": 5.822985993841264, + "grad_norm": 0.115234375, + "learning_rate": 0.00033416509387106385, + "loss": 0.5179, + "step": 117240 + }, + { + "epoch": 5.823482666136883, + "grad_norm": 0.13671875, + "learning_rate": 0.00033412536008741437, + "loss": 0.5201, + "step": 117250 + }, + { + "epoch": 5.823979338432502, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003340856263037648, + "loss": 0.5031, + "step": 117260 + }, + { + "epoch": 5.824476010728121, + "grad_norm": 0.140625, + "learning_rate": 0.0003340458925201152, + "loss": 0.5299, + "step": 117270 + }, + { + "epoch": 5.824972683023741, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003340061587364657, + "loss": 0.5158, + "step": 117280 + }, + { + "epoch": 5.825469355319361, + "grad_norm": 0.109375, + "learning_rate": 0.00033396642495281615, + "loss": 0.4898, + "step": 117290 + }, + { + "epoch": 5.82596602761498, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003339266911691666, + "loss": 0.5246, + "step": 117300 + }, + { + "epoch": 5.826462699910599, + "grad_norm": 0.1083984375, + "learning_rate": 0.00033388695738551704, + "loss": 0.5086, + "step": 117310 + }, + { + "epoch": 5.826959372206218, + "grad_norm": 0.130859375, + "learning_rate": 0.0003338472236018675, + "loss": 0.4911, + "step": 117320 + }, + { + "epoch": 5.8274560445018375, + "grad_norm": 0.1572265625, + "learning_rate": 0.000333807489818218, + "loss": 0.5211, + "step": 117330 + }, + { + "epoch": 5.827952716797457, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003337677560345684, + "loss": 0.514, + "step": 117340 + }, + { + "epoch": 5.828449389093077, + "grad_norm": 0.1298828125, + "learning_rate": 0.00033372802225091887, + "loss": 0.49, + "step": 117350 + }, + { + "epoch": 5.828946061388696, + "grad_norm": 0.1162109375, + "learning_rate": 0.00033368828846726934, + "loss": 0.511, + "step": 117360 + }, + { + "epoch": 5.829442733684315, + "grad_norm": 0.111328125, + "learning_rate": 0.00033364855468361976, + "loss": 0.5215, + "step": 117370 + }, + { + "epoch": 5.8299394059799345, + "grad_norm": 0.146484375, + "learning_rate": 0.00033360882089997023, + "loss": 0.4963, + "step": 117380 + }, + { + "epoch": 5.830436078275554, + "grad_norm": 0.1298828125, + "learning_rate": 0.00033356908711632064, + "loss": 0.5116, + "step": 117390 + }, + { + "epoch": 5.830932750571173, + "grad_norm": 0.1435546875, + "learning_rate": 0.0003335293533326711, + "loss": 0.4927, + "step": 117400 + }, + { + "epoch": 5.831429422866792, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003334896195490216, + "loss": 0.4983, + "step": 117410 + }, + { + "epoch": 5.831926095162412, + "grad_norm": 0.1259765625, + "learning_rate": 0.000333449885765372, + "loss": 0.4852, + "step": 117420 + }, + { + "epoch": 5.832422767458032, + "grad_norm": 0.1572265625, + "learning_rate": 0.0003334101519817225, + "loss": 0.5369, + "step": 117430 + }, + { + "epoch": 5.832919439753651, + "grad_norm": 0.1767578125, + "learning_rate": 0.00033337041819807295, + "loss": 0.5075, + "step": 117440 + }, + { + "epoch": 5.83341611204927, + "grad_norm": 0.12890625, + "learning_rate": 0.00033333068441442336, + "loss": 0.5141, + "step": 117450 + }, + { + "epoch": 5.833912784344889, + "grad_norm": 0.140625, + "learning_rate": 0.00033329095063077384, + "loss": 0.5138, + "step": 117460 + }, + { + "epoch": 5.8344094566405085, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003332512168471243, + "loss": 0.5337, + "step": 117470 + }, + { + "epoch": 5.834906128936128, + "grad_norm": 0.111328125, + "learning_rate": 0.0003332114830634747, + "loss": 0.5158, + "step": 117480 + }, + { + "epoch": 5.835402801231747, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003331717492798252, + "loss": 0.497, + "step": 117490 + }, + { + "epoch": 5.835899473527366, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003331320154961756, + "loss": 0.4804, + "step": 117500 + }, + { + "epoch": 5.836396145822986, + "grad_norm": 0.140625, + "learning_rate": 0.0003330922817125261, + "loss": 0.5316, + "step": 117510 + }, + { + "epoch": 5.8368928181186055, + "grad_norm": 0.1357421875, + "learning_rate": 0.00033305254792887656, + "loss": 0.5328, + "step": 117520 + }, + { + "epoch": 5.837389490414225, + "grad_norm": 0.12353515625, + "learning_rate": 0.00033301281414522697, + "loss": 0.4812, + "step": 117530 + }, + { + "epoch": 5.837886162709844, + "grad_norm": 0.126953125, + "learning_rate": 0.00033297308036157744, + "loss": 0.4885, + "step": 117540 + }, + { + "epoch": 5.838382835005463, + "grad_norm": 0.142578125, + "learning_rate": 0.0003329333465779279, + "loss": 0.4769, + "step": 117550 + }, + { + "epoch": 5.838879507301082, + "grad_norm": 0.1328125, + "learning_rate": 0.00033289361279427833, + "loss": 0.4961, + "step": 117560 + }, + { + "epoch": 5.839376179596702, + "grad_norm": 0.12890625, + "learning_rate": 0.0003328538790106288, + "loss": 0.4942, + "step": 117570 + }, + { + "epoch": 5.839872851892322, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003328141452269792, + "loss": 0.5328, + "step": 117580 + }, + { + "epoch": 5.840369524187941, + "grad_norm": 0.11279296875, + "learning_rate": 0.00033277441144332975, + "loss": 0.4926, + "step": 117590 + }, + { + "epoch": 5.84086619648356, + "grad_norm": 0.12255859375, + "learning_rate": 0.00033273467765968016, + "loss": 0.5053, + "step": 117600 + }, + { + "epoch": 5.841362868779179, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003326949438760306, + "loss": 0.498, + "step": 117610 + }, + { + "epoch": 5.841859541074799, + "grad_norm": 0.11474609375, + "learning_rate": 0.00033265521009238105, + "loss": 0.4987, + "step": 117620 + }, + { + "epoch": 5.842356213370418, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003326154763087315, + "loss": 0.4759, + "step": 117630 + }, + { + "epoch": 5.842852885666037, + "grad_norm": 0.1181640625, + "learning_rate": 0.00033257574252508194, + "loss": 0.5073, + "step": 117640 + }, + { + "epoch": 5.843349557961657, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003325360087414324, + "loss": 0.4777, + "step": 117650 + }, + { + "epoch": 5.8438462302572765, + "grad_norm": 0.126953125, + "learning_rate": 0.0003324962749577829, + "loss": 0.5161, + "step": 117660 + }, + { + "epoch": 5.844342902552896, + "grad_norm": 0.1240234375, + "learning_rate": 0.00033245654117413335, + "loss": 0.4854, + "step": 117670 + }, + { + "epoch": 5.844839574848515, + "grad_norm": 0.11572265625, + "learning_rate": 0.00033241680739048377, + "loss": 0.517, + "step": 117680 + }, + { + "epoch": 5.845336247144134, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003323770736068342, + "loss": 0.5202, + "step": 117690 + }, + { + "epoch": 5.845832919439753, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003323373398231847, + "loss": 0.4899, + "step": 117700 + }, + { + "epoch": 5.846329591735373, + "grad_norm": 0.123046875, + "learning_rate": 0.00033229760603953513, + "loss": 0.4762, + "step": 117710 + }, + { + "epoch": 5.846826264030993, + "grad_norm": 0.203125, + "learning_rate": 0.00033225787225588555, + "loss": 0.4953, + "step": 117720 + }, + { + "epoch": 5.847322936326612, + "grad_norm": 0.1328125, + "learning_rate": 0.000332218138472236, + "loss": 0.4721, + "step": 117730 + }, + { + "epoch": 5.847819608622231, + "grad_norm": 0.1435546875, + "learning_rate": 0.0003321784046885865, + "loss": 0.503, + "step": 117740 + }, + { + "epoch": 5.84831628091785, + "grad_norm": 0.126953125, + "learning_rate": 0.00033213867090493696, + "loss": 0.4832, + "step": 117750 + }, + { + "epoch": 5.84881295321347, + "grad_norm": 0.11865234375, + "learning_rate": 0.0003320989371212874, + "loss": 0.4994, + "step": 117760 + }, + { + "epoch": 5.849309625509089, + "grad_norm": 0.119140625, + "learning_rate": 0.00033205920333763785, + "loss": 0.4854, + "step": 117770 + }, + { + "epoch": 5.849806297804708, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003320194695539883, + "loss": 0.4945, + "step": 117780 + }, + { + "epoch": 5.850302970100328, + "grad_norm": 0.11962890625, + "learning_rate": 0.00033197973577033874, + "loss": 0.49, + "step": 117790 + }, + { + "epoch": 5.850799642395947, + "grad_norm": 0.140625, + "learning_rate": 0.0003319400019866892, + "loss": 0.4972, + "step": 117800 + }, + { + "epoch": 5.851296314691567, + "grad_norm": 0.1376953125, + "learning_rate": 0.00033190026820303963, + "loss": 0.4971, + "step": 117810 + }, + { + "epoch": 5.851792986987186, + "grad_norm": 0.126953125, + "learning_rate": 0.0003318605344193901, + "loss": 0.5217, + "step": 117820 + }, + { + "epoch": 5.852289659282805, + "grad_norm": 0.11962890625, + "learning_rate": 0.00033182080063574057, + "loss": 0.5168, + "step": 117830 + }, + { + "epoch": 5.852786331578424, + "grad_norm": 0.1533203125, + "learning_rate": 0.000331781066852091, + "loss": 0.4887, + "step": 117840 + }, + { + "epoch": 5.8532830038740435, + "grad_norm": 0.14453125, + "learning_rate": 0.00033174133306844146, + "loss": 0.4975, + "step": 117850 + }, + { + "epoch": 5.853779676169664, + "grad_norm": 0.12890625, + "learning_rate": 0.00033170159928479193, + "loss": 0.5065, + "step": 117860 + }, + { + "epoch": 5.854276348465283, + "grad_norm": 0.10888671875, + "learning_rate": 0.00033166186550114235, + "loss": 0.4773, + "step": 117870 + }, + { + "epoch": 5.854773020760902, + "grad_norm": 0.201171875, + "learning_rate": 0.0003316221317174928, + "loss": 0.507, + "step": 117880 + }, + { + "epoch": 5.855269693056521, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003315823979338433, + "loss": 0.4938, + "step": 117890 + }, + { + "epoch": 5.855766365352141, + "grad_norm": 0.1708984375, + "learning_rate": 0.0003315426641501937, + "loss": 0.5063, + "step": 117900 + }, + { + "epoch": 5.85626303764776, + "grad_norm": 0.1103515625, + "learning_rate": 0.0003315029303665442, + "loss": 0.5022, + "step": 117910 + }, + { + "epoch": 5.856759709943379, + "grad_norm": 0.140625, + "learning_rate": 0.0003314631965828946, + "loss": 0.5145, + "step": 117920 + }, + { + "epoch": 5.857256382238999, + "grad_norm": 0.1572265625, + "learning_rate": 0.00033142346279924507, + "loss": 0.5042, + "step": 117930 + }, + { + "epoch": 5.857753054534618, + "grad_norm": 0.1357421875, + "learning_rate": 0.00033138372901559554, + "loss": 0.4761, + "step": 117940 + }, + { + "epoch": 5.858249726830238, + "grad_norm": 0.1240234375, + "learning_rate": 0.00033134399523194596, + "loss": 0.4655, + "step": 117950 + }, + { + "epoch": 5.858746399125857, + "grad_norm": 0.1591796875, + "learning_rate": 0.00033130426144829643, + "loss": 0.5248, + "step": 117960 + }, + { + "epoch": 5.859243071421476, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003312645276646469, + "loss": 0.5076, + "step": 117970 + }, + { + "epoch": 5.859739743717095, + "grad_norm": 0.1572265625, + "learning_rate": 0.0003312247938809973, + "loss": 0.5143, + "step": 117980 + }, + { + "epoch": 5.8602364160127145, + "grad_norm": 0.13671875, + "learning_rate": 0.0003311850600973478, + "loss": 0.5171, + "step": 117990 + }, + { + "epoch": 5.860733088308335, + "grad_norm": 0.138671875, + "learning_rate": 0.00033114532631369826, + "loss": 0.4982, + "step": 118000 + }, + { + "epoch": 5.861229760603954, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003311055925300487, + "loss": 0.4797, + "step": 118010 + }, + { + "epoch": 5.861726432899573, + "grad_norm": 0.12890625, + "learning_rate": 0.00033106585874639915, + "loss": 0.4773, + "step": 118020 + }, + { + "epoch": 5.862223105195192, + "grad_norm": 0.11279296875, + "learning_rate": 0.00033102612496274956, + "loss": 0.5194, + "step": 118030 + }, + { + "epoch": 5.8627197774908115, + "grad_norm": 0.1162109375, + "learning_rate": 0.0003309863911791001, + "loss": 0.5051, + "step": 118040 + }, + { + "epoch": 5.863216449786431, + "grad_norm": 0.14453125, + "learning_rate": 0.0003309466573954505, + "loss": 0.4984, + "step": 118050 + }, + { + "epoch": 5.86371312208205, + "grad_norm": 0.134765625, + "learning_rate": 0.0003309069236118009, + "loss": 0.4948, + "step": 118060 + }, + { + "epoch": 5.86420979437767, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003308671898281514, + "loss": 0.486, + "step": 118070 + }, + { + "epoch": 5.864706466673289, + "grad_norm": 0.10986328125, + "learning_rate": 0.00033082745604450187, + "loss": 0.5001, + "step": 118080 + }, + { + "epoch": 5.8652031389689085, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003307877222608523, + "loss": 0.4748, + "step": 118090 + }, + { + "epoch": 5.865699811264528, + "grad_norm": 0.126953125, + "learning_rate": 0.00033074798847720276, + "loss": 0.5161, + "step": 118100 + }, + { + "epoch": 5.866196483560147, + "grad_norm": 0.11865234375, + "learning_rate": 0.0003307082546935532, + "loss": 0.521, + "step": 118110 + }, + { + "epoch": 5.866693155855766, + "grad_norm": 0.140625, + "learning_rate": 0.0003306685209099037, + "loss": 0.5077, + "step": 118120 + }, + { + "epoch": 5.8671898281513855, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003306287871262541, + "loss": 0.5289, + "step": 118130 + }, + { + "epoch": 5.867686500447005, + "grad_norm": 0.11376953125, + "learning_rate": 0.00033058905334260453, + "loss": 0.5263, + "step": 118140 + }, + { + "epoch": 5.868183172742625, + "grad_norm": 0.1591796875, + "learning_rate": 0.000330549319558955, + "loss": 0.5321, + "step": 118150 + }, + { + "epoch": 5.868679845038244, + "grad_norm": 0.171875, + "learning_rate": 0.0003305095857753055, + "loss": 0.501, + "step": 118160 + }, + { + "epoch": 5.869176517333863, + "grad_norm": 0.1279296875, + "learning_rate": 0.00033046985199165595, + "loss": 0.5024, + "step": 118170 + }, + { + "epoch": 5.8696731896294825, + "grad_norm": 0.134765625, + "learning_rate": 0.00033043011820800636, + "loss": 0.5344, + "step": 118180 + }, + { + "epoch": 5.870169861925102, + "grad_norm": 0.11376953125, + "learning_rate": 0.00033039038442435684, + "loss": 0.5226, + "step": 118190 + }, + { + "epoch": 5.870666534220721, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003303506506407073, + "loss": 0.5285, + "step": 118200 + }, + { + "epoch": 5.87116320651634, + "grad_norm": 0.1474609375, + "learning_rate": 0.0003303109168570577, + "loss": 0.4912, + "step": 118210 + }, + { + "epoch": 5.871659878811959, + "grad_norm": 0.130859375, + "learning_rate": 0.00033027118307340814, + "loss": 0.4776, + "step": 118220 + }, + { + "epoch": 5.8721565511075795, + "grad_norm": 0.11279296875, + "learning_rate": 0.00033023144928975867, + "loss": 0.4904, + "step": 118230 + }, + { + "epoch": 5.872653223403199, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003301917155061091, + "loss": 0.5071, + "step": 118240 + }, + { + "epoch": 5.873149895698818, + "grad_norm": 0.12353515625, + "learning_rate": 0.00033015198172245956, + "loss": 0.4906, + "step": 118250 + }, + { + "epoch": 5.873646567994437, + "grad_norm": 0.119140625, + "learning_rate": 0.00033011224793880997, + "loss": 0.495, + "step": 118260 + }, + { + "epoch": 5.874143240290056, + "grad_norm": 0.134765625, + "learning_rate": 0.00033007251415516044, + "loss": 0.4985, + "step": 118270 + }, + { + "epoch": 5.874639912585676, + "grad_norm": 0.1640625, + "learning_rate": 0.0003300327803715109, + "loss": 0.4986, + "step": 118280 + }, + { + "epoch": 5.875136584881295, + "grad_norm": 0.1533203125, + "learning_rate": 0.00032999304658786133, + "loss": 0.4812, + "step": 118290 + }, + { + "epoch": 5.875633257176915, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003299533128042118, + "loss": 0.4896, + "step": 118300 + }, + { + "epoch": 5.876129929472534, + "grad_norm": 0.1728515625, + "learning_rate": 0.0003299135790205623, + "loss": 0.4845, + "step": 118310 + }, + { + "epoch": 5.876626601768153, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003298738452369127, + "loss": 0.5161, + "step": 118320 + }, + { + "epoch": 5.877123274063773, + "grad_norm": 0.1328125, + "learning_rate": 0.00032983411145326316, + "loss": 0.5102, + "step": 118330 + }, + { + "epoch": 5.877619946359392, + "grad_norm": 0.1279296875, + "learning_rate": 0.00032979437766961363, + "loss": 0.4987, + "step": 118340 + }, + { + "epoch": 5.878116618655011, + "grad_norm": 0.1259765625, + "learning_rate": 0.00032975464388596405, + "loss": 0.5095, + "step": 118350 + }, + { + "epoch": 5.87861329095063, + "grad_norm": 0.11865234375, + "learning_rate": 0.0003297149101023145, + "loss": 0.5482, + "step": 118360 + }, + { + "epoch": 5.8791099632462505, + "grad_norm": 0.1357421875, + "learning_rate": 0.00032967517631866494, + "loss": 0.5039, + "step": 118370 + }, + { + "epoch": 5.87960663554187, + "grad_norm": 0.138671875, + "learning_rate": 0.0003296354425350154, + "loss": 0.5345, + "step": 118380 + }, + { + "epoch": 5.880103307837489, + "grad_norm": 0.130859375, + "learning_rate": 0.0003295957087513659, + "loss": 0.4986, + "step": 118390 + }, + { + "epoch": 5.880599980133108, + "grad_norm": 0.162109375, + "learning_rate": 0.0003295559749677163, + "loss": 0.5201, + "step": 118400 + }, + { + "epoch": 5.881096652428727, + "grad_norm": 0.134765625, + "learning_rate": 0.00032951624118406677, + "loss": 0.4885, + "step": 118410 + }, + { + "epoch": 5.881593324724347, + "grad_norm": 0.1435546875, + "learning_rate": 0.00032947650740041724, + "loss": 0.505, + "step": 118420 + }, + { + "epoch": 5.882089997019966, + "grad_norm": 0.12890625, + "learning_rate": 0.00032943677361676766, + "loss": 0.4985, + "step": 118430 + }, + { + "epoch": 5.882586669315586, + "grad_norm": 0.1982421875, + "learning_rate": 0.00032939703983311813, + "loss": 0.5121, + "step": 118440 + }, + { + "epoch": 5.883083341611205, + "grad_norm": 0.1318359375, + "learning_rate": 0.00032935730604946855, + "loss": 0.5012, + "step": 118450 + }, + { + "epoch": 5.883580013906824, + "grad_norm": 0.1416015625, + "learning_rate": 0.000329317572265819, + "loss": 0.519, + "step": 118460 + }, + { + "epoch": 5.884076686202444, + "grad_norm": 0.123046875, + "learning_rate": 0.0003292778384821695, + "loss": 0.5172, + "step": 118470 + }, + { + "epoch": 5.884573358498063, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003292381046985199, + "loss": 0.4944, + "step": 118480 + }, + { + "epoch": 5.885070030793682, + "grad_norm": 0.171875, + "learning_rate": 0.0003291983709148704, + "loss": 0.5043, + "step": 118490 + }, + { + "epoch": 5.885566703089301, + "grad_norm": 0.126953125, + "learning_rate": 0.00032915863713122085, + "loss": 0.4903, + "step": 118500 + }, + { + "epoch": 5.886063375384921, + "grad_norm": 0.1318359375, + "learning_rate": 0.00032911890334757127, + "loss": 0.5015, + "step": 118510 + }, + { + "epoch": 5.886560047680541, + "grad_norm": 0.119140625, + "learning_rate": 0.00032907916956392174, + "loss": 0.4816, + "step": 118520 + }, + { + "epoch": 5.88705671997616, + "grad_norm": 0.11865234375, + "learning_rate": 0.0003290394357802722, + "loss": 0.5231, + "step": 118530 + }, + { + "epoch": 5.887553392271779, + "grad_norm": 0.1279296875, + "learning_rate": 0.00032899970199662263, + "loss": 0.4856, + "step": 118540 + }, + { + "epoch": 5.888050064567398, + "grad_norm": 0.1435546875, + "learning_rate": 0.0003289599682129731, + "loss": 0.5284, + "step": 118550 + }, + { + "epoch": 5.8885467368630175, + "grad_norm": 0.119140625, + "learning_rate": 0.0003289202344293235, + "loss": 0.4831, + "step": 118560 + }, + { + "epoch": 5.889043409158637, + "grad_norm": 0.12060546875, + "learning_rate": 0.00032888050064567404, + "loss": 0.5038, + "step": 118570 + }, + { + "epoch": 5.889540081454257, + "grad_norm": 0.12890625, + "learning_rate": 0.00032884076686202446, + "loss": 0.5066, + "step": 118580 + }, + { + "epoch": 5.890036753749876, + "grad_norm": 0.134765625, + "learning_rate": 0.0003288010330783749, + "loss": 0.5145, + "step": 118590 + }, + { + "epoch": 5.890533426045495, + "grad_norm": 0.1484375, + "learning_rate": 0.00032876129929472535, + "loss": 0.5495, + "step": 118600 + }, + { + "epoch": 5.891030098341115, + "grad_norm": 0.1171875, + "learning_rate": 0.0003287215655110758, + "loss": 0.4683, + "step": 118610 + }, + { + "epoch": 5.891526770636734, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003286818317274263, + "loss": 0.4913, + "step": 118620 + }, + { + "epoch": 5.892023442932353, + "grad_norm": 0.13671875, + "learning_rate": 0.0003286420979437767, + "loss": 0.5002, + "step": 118630 + }, + { + "epoch": 5.892520115227972, + "grad_norm": 0.13671875, + "learning_rate": 0.0003286023641601272, + "loss": 0.4984, + "step": 118640 + }, + { + "epoch": 5.893016787523592, + "grad_norm": 0.1298828125, + "learning_rate": 0.00032856263037647765, + "loss": 0.5122, + "step": 118650 + }, + { + "epoch": 5.893513459819212, + "grad_norm": 0.126953125, + "learning_rate": 0.00032852289659282807, + "loss": 0.532, + "step": 118660 + }, + { + "epoch": 5.894010132114831, + "grad_norm": 0.1171875, + "learning_rate": 0.0003284831628091785, + "loss": 0.5041, + "step": 118670 + }, + { + "epoch": 5.89450680441045, + "grad_norm": 0.140625, + "learning_rate": 0.00032844342902552896, + "loss": 0.5026, + "step": 118680 + }, + { + "epoch": 5.895003476706069, + "grad_norm": 0.125, + "learning_rate": 0.00032840369524187943, + "loss": 0.4592, + "step": 118690 + }, + { + "epoch": 5.8955001490016885, + "grad_norm": 0.1328125, + "learning_rate": 0.0003283639614582299, + "loss": 0.5028, + "step": 118700 + }, + { + "epoch": 5.895996821297308, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003283242276745803, + "loss": 0.4951, + "step": 118710 + }, + { + "epoch": 5.896493493592928, + "grad_norm": 0.126953125, + "learning_rate": 0.0003282844938909308, + "loss": 0.5101, + "step": 118720 + }, + { + "epoch": 5.896990165888547, + "grad_norm": 0.115234375, + "learning_rate": 0.00032824476010728126, + "loss": 0.4962, + "step": 118730 + }, + { + "epoch": 5.897486838184166, + "grad_norm": 0.123046875, + "learning_rate": 0.0003282050263236317, + "loss": 0.4997, + "step": 118740 + }, + { + "epoch": 5.8979835104797855, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003281652925399821, + "loss": 0.5008, + "step": 118750 + }, + { + "epoch": 5.898480182775405, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003281255587563326, + "loss": 0.5007, + "step": 118760 + }, + { + "epoch": 5.898976855071024, + "grad_norm": 0.123046875, + "learning_rate": 0.00032808582497268304, + "loss": 0.4933, + "step": 118770 + }, + { + "epoch": 5.899473527366643, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003280460911890335, + "loss": 0.508, + "step": 118780 + }, + { + "epoch": 5.899970199662263, + "grad_norm": 0.13671875, + "learning_rate": 0.0003280063574053839, + "loss": 0.4906, + "step": 118790 + }, + { + "epoch": 5.9004668719578826, + "grad_norm": 0.1494140625, + "learning_rate": 0.0003279666236217344, + "loss": 0.5132, + "step": 118800 + }, + { + "epoch": 5.900963544253502, + "grad_norm": 0.1357421875, + "learning_rate": 0.00032792688983808487, + "loss": 0.5058, + "step": 118810 + }, + { + "epoch": 5.901460216549121, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003278871560544353, + "loss": 0.5065, + "step": 118820 + }, + { + "epoch": 5.90195688884474, + "grad_norm": 0.1259765625, + "learning_rate": 0.00032784742227078576, + "loss": 0.5009, + "step": 118830 + }, + { + "epoch": 5.9024535611403595, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003278076884871362, + "loss": 0.5332, + "step": 118840 + }, + { + "epoch": 5.902950233435979, + "grad_norm": 0.1298828125, + "learning_rate": 0.00032776795470348664, + "loss": 0.493, + "step": 118850 + }, + { + "epoch": 5.903446905731598, + "grad_norm": 0.1513671875, + "learning_rate": 0.0003277282209198371, + "loss": 0.4811, + "step": 118860 + }, + { + "epoch": 5.903943578027217, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003276884871361876, + "loss": 0.4869, + "step": 118870 + }, + { + "epoch": 5.904440250322837, + "grad_norm": 0.1279296875, + "learning_rate": 0.000327648753352538, + "loss": 0.5196, + "step": 118880 + }, + { + "epoch": 5.9049369226184565, + "grad_norm": 0.126953125, + "learning_rate": 0.0003276090195688885, + "loss": 0.4773, + "step": 118890 + }, + { + "epoch": 5.905433594914076, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003275692857852389, + "loss": 0.5117, + "step": 118900 + }, + { + "epoch": 5.905930267209695, + "grad_norm": 0.13671875, + "learning_rate": 0.00032752955200158936, + "loss": 0.5024, + "step": 118910 + }, + { + "epoch": 5.906426939505314, + "grad_norm": 0.1474609375, + "learning_rate": 0.00032748981821793983, + "loss": 0.4918, + "step": 118920 + }, + { + "epoch": 5.906923611800933, + "grad_norm": 0.11865234375, + "learning_rate": 0.00032745008443429025, + "loss": 0.503, + "step": 118930 + }, + { + "epoch": 5.907420284096553, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003274103506506407, + "loss": 0.5204, + "step": 118940 + }, + { + "epoch": 5.907916956392173, + "grad_norm": 0.12890625, + "learning_rate": 0.0003273706168669912, + "loss": 0.4752, + "step": 118950 + }, + { + "epoch": 5.908413628687792, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003273308830833416, + "loss": 0.5238, + "step": 118960 + }, + { + "epoch": 5.908910300983411, + "grad_norm": 0.126953125, + "learning_rate": 0.0003272911492996921, + "loss": 0.4999, + "step": 118970 + }, + { + "epoch": 5.90940697327903, + "grad_norm": 0.109375, + "learning_rate": 0.0003272514155160425, + "loss": 0.5208, + "step": 118980 + }, + { + "epoch": 5.90990364557465, + "grad_norm": 0.1240234375, + "learning_rate": 0.00032721168173239297, + "loss": 0.5137, + "step": 118990 + }, + { + "epoch": 5.910400317870269, + "grad_norm": 0.12158203125, + "learning_rate": 0.00032717194794874344, + "loss": 0.4515, + "step": 119000 + }, + { + "epoch": 5.910896990165888, + "grad_norm": 0.146484375, + "learning_rate": 0.00032713221416509386, + "loss": 0.5002, + "step": 119010 + }, + { + "epoch": 5.911393662461508, + "grad_norm": 0.185546875, + "learning_rate": 0.00032709248038144433, + "loss": 0.5258, + "step": 119020 + }, + { + "epoch": 5.911890334757127, + "grad_norm": 0.1943359375, + "learning_rate": 0.0003270527465977948, + "loss": 0.4869, + "step": 119030 + }, + { + "epoch": 5.912387007052747, + "grad_norm": 0.248046875, + "learning_rate": 0.0003270130128141452, + "loss": 0.513, + "step": 119040 + }, + { + "epoch": 5.912883679348366, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003269732790304957, + "loss": 0.4948, + "step": 119050 + }, + { + "epoch": 5.913380351643985, + "grad_norm": 0.1259765625, + "learning_rate": 0.00032693354524684616, + "loss": 0.5159, + "step": 119060 + }, + { + "epoch": 5.913877023939604, + "grad_norm": 0.123046875, + "learning_rate": 0.00032689381146319663, + "loss": 0.4906, + "step": 119070 + }, + { + "epoch": 5.914373696235224, + "grad_norm": 0.146484375, + "learning_rate": 0.00032685407767954705, + "loss": 0.4982, + "step": 119080 + }, + { + "epoch": 5.914870368530844, + "grad_norm": 0.1572265625, + "learning_rate": 0.00032681434389589747, + "loss": 0.5223, + "step": 119090 + }, + { + "epoch": 5.915367040826463, + "grad_norm": 0.12890625, + "learning_rate": 0.000326774610112248, + "loss": 0.5294, + "step": 119100 + }, + { + "epoch": 5.915863713122082, + "grad_norm": 0.126953125, + "learning_rate": 0.0003267348763285984, + "loss": 0.5004, + "step": 119110 + }, + { + "epoch": 5.916360385417701, + "grad_norm": 0.12158203125, + "learning_rate": 0.00032669514254494883, + "loss": 0.5063, + "step": 119120 + }, + { + "epoch": 5.916857057713321, + "grad_norm": 0.1474609375, + "learning_rate": 0.0003266554087612993, + "loss": 0.5225, + "step": 119130 + }, + { + "epoch": 5.91735373000894, + "grad_norm": 0.12158203125, + "learning_rate": 0.00032661567497764977, + "loss": 0.5236, + "step": 119140 + }, + { + "epoch": 5.917850402304559, + "grad_norm": 0.1259765625, + "learning_rate": 0.00032657594119400024, + "loss": 0.5233, + "step": 119150 + }, + { + "epoch": 5.918347074600179, + "grad_norm": 0.1279296875, + "learning_rate": 0.00032653620741035066, + "loss": 0.5082, + "step": 119160 + }, + { + "epoch": 5.918843746895798, + "grad_norm": 0.1201171875, + "learning_rate": 0.00032649647362670113, + "loss": 0.5145, + "step": 119170 + }, + { + "epoch": 5.919340419191418, + "grad_norm": 0.142578125, + "learning_rate": 0.0003264567398430516, + "loss": 0.4983, + "step": 119180 + }, + { + "epoch": 5.919837091487037, + "grad_norm": 0.1337890625, + "learning_rate": 0.000326417006059402, + "loss": 0.5243, + "step": 119190 + }, + { + "epoch": 5.920333763782656, + "grad_norm": 0.1298828125, + "learning_rate": 0.00032637727227575244, + "loss": 0.5181, + "step": 119200 + }, + { + "epoch": 5.920830436078275, + "grad_norm": 0.1474609375, + "learning_rate": 0.0003263375384921029, + "loss": 0.5084, + "step": 119210 + }, + { + "epoch": 5.9213271083738945, + "grad_norm": 0.16015625, + "learning_rate": 0.0003262978047084534, + "loss": 0.4905, + "step": 119220 + }, + { + "epoch": 5.921823780669515, + "grad_norm": 0.1240234375, + "learning_rate": 0.00032625807092480385, + "loss": 0.5001, + "step": 119230 + }, + { + "epoch": 5.922320452965134, + "grad_norm": 0.1259765625, + "learning_rate": 0.00032621833714115427, + "loss": 0.5056, + "step": 119240 + }, + { + "epoch": 5.922817125260753, + "grad_norm": 0.1376953125, + "learning_rate": 0.00032617860335750474, + "loss": 0.4853, + "step": 119250 + }, + { + "epoch": 5.923313797556372, + "grad_norm": 0.19140625, + "learning_rate": 0.0003261388695738552, + "loss": 0.4933, + "step": 119260 + }, + { + "epoch": 5.9238104698519916, + "grad_norm": 0.130859375, + "learning_rate": 0.00032609913579020563, + "loss": 0.5344, + "step": 119270 + }, + { + "epoch": 5.924307142147611, + "grad_norm": 0.1181640625, + "learning_rate": 0.00032605940200655604, + "loss": 0.5063, + "step": 119280 + }, + { + "epoch": 5.92480381444323, + "grad_norm": 0.13671875, + "learning_rate": 0.00032601966822290657, + "loss": 0.5223, + "step": 119290 + }, + { + "epoch": 5.92530048673885, + "grad_norm": 0.140625, + "learning_rate": 0.000325979934439257, + "loss": 0.5342, + "step": 119300 + }, + { + "epoch": 5.925797159034469, + "grad_norm": 0.1396484375, + "learning_rate": 0.00032594020065560746, + "loss": 0.5046, + "step": 119310 + }, + { + "epoch": 5.926293831330089, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003259004668719579, + "loss": 0.5225, + "step": 119320 + }, + { + "epoch": 5.926790503625708, + "grad_norm": 0.126953125, + "learning_rate": 0.00032586073308830835, + "loss": 0.4972, + "step": 119330 + }, + { + "epoch": 5.927287175921327, + "grad_norm": 0.1171875, + "learning_rate": 0.0003258209993046588, + "loss": 0.5219, + "step": 119340 + }, + { + "epoch": 5.927783848216946, + "grad_norm": 0.11474609375, + "learning_rate": 0.00032578126552100924, + "loss": 0.4868, + "step": 119350 + }, + { + "epoch": 5.9282805205125655, + "grad_norm": 0.1328125, + "learning_rate": 0.0003257415317373597, + "loss": 0.5057, + "step": 119360 + }, + { + "epoch": 5.928777192808186, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003257017979537102, + "loss": 0.4783, + "step": 119370 + }, + { + "epoch": 5.929273865103805, + "grad_norm": 0.1630859375, + "learning_rate": 0.0003256620641700606, + "loss": 0.5347, + "step": 119380 + }, + { + "epoch": 5.929770537399424, + "grad_norm": 0.11865234375, + "learning_rate": 0.00032562233038641107, + "loss": 0.4995, + "step": 119390 + }, + { + "epoch": 5.930267209695043, + "grad_norm": 0.1494140625, + "learning_rate": 0.00032558259660276154, + "loss": 0.5117, + "step": 119400 + }, + { + "epoch": 5.9307638819906625, + "grad_norm": 0.11328125, + "learning_rate": 0.00032554286281911196, + "loss": 0.4947, + "step": 119410 + }, + { + "epoch": 5.931260554286282, + "grad_norm": 0.1240234375, + "learning_rate": 0.0003255031290354624, + "loss": 0.5476, + "step": 119420 + }, + { + "epoch": 5.931757226581901, + "grad_norm": 0.1357421875, + "learning_rate": 0.00032546339525181284, + "loss": 0.5358, + "step": 119430 + }, + { + "epoch": 5.932253898877521, + "grad_norm": 0.12890625, + "learning_rate": 0.00032542366146816337, + "loss": 0.4916, + "step": 119440 + }, + { + "epoch": 5.93275057117314, + "grad_norm": 0.11181640625, + "learning_rate": 0.0003253839276845138, + "loss": 0.5134, + "step": 119450 + }, + { + "epoch": 5.9332472434687595, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003253441939008642, + "loss": 0.5055, + "step": 119460 + }, + { + "epoch": 5.933743915764379, + "grad_norm": 0.1171875, + "learning_rate": 0.0003253044601172147, + "loss": 0.5068, + "step": 119470 + }, + { + "epoch": 5.934240588059998, + "grad_norm": 0.12060546875, + "learning_rate": 0.00032526472633356515, + "loss": 0.5089, + "step": 119480 + }, + { + "epoch": 5.934737260355617, + "grad_norm": 0.12890625, + "learning_rate": 0.00032522499254991556, + "loss": 0.539, + "step": 119490 + }, + { + "epoch": 5.935233932651236, + "grad_norm": 0.1357421875, + "learning_rate": 0.00032518525876626604, + "loss": 0.5004, + "step": 119500 + }, + { + "epoch": 5.935730604946857, + "grad_norm": 0.142578125, + "learning_rate": 0.00032514552498261645, + "loss": 0.5103, + "step": 119510 + }, + { + "epoch": 5.936227277242476, + "grad_norm": 0.138671875, + "learning_rate": 0.000325105791198967, + "loss": 0.4969, + "step": 119520 + }, + { + "epoch": 5.936723949538095, + "grad_norm": 0.150390625, + "learning_rate": 0.0003250660574153174, + "loss": 0.5127, + "step": 119530 + }, + { + "epoch": 5.937220621833714, + "grad_norm": 0.1533203125, + "learning_rate": 0.0003250263236316678, + "loss": 0.5122, + "step": 119540 + }, + { + "epoch": 5.9377172941293335, + "grad_norm": 0.130859375, + "learning_rate": 0.0003249865898480183, + "loss": 0.5198, + "step": 119550 + }, + { + "epoch": 5.938213966424953, + "grad_norm": 0.12890625, + "learning_rate": 0.00032494685606436875, + "loss": 0.5059, + "step": 119560 + }, + { + "epoch": 5.938710638720572, + "grad_norm": 0.12353515625, + "learning_rate": 0.00032490712228071917, + "loss": 0.5045, + "step": 119570 + }, + { + "epoch": 5.939207311016191, + "grad_norm": 0.146484375, + "learning_rate": 0.00032486738849706964, + "loss": 0.498, + "step": 119580 + }, + { + "epoch": 5.93970398331181, + "grad_norm": 0.126953125, + "learning_rate": 0.0003248276547134201, + "loss": 0.49, + "step": 119590 + }, + { + "epoch": 5.9402006556074305, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003247879209297706, + "loss": 0.5009, + "step": 119600 + }, + { + "epoch": 5.94069732790305, + "grad_norm": 0.12255859375, + "learning_rate": 0.000324748187146121, + "loss": 0.5049, + "step": 119610 + }, + { + "epoch": 5.941194000198669, + "grad_norm": 0.142578125, + "learning_rate": 0.0003247084533624714, + "loss": 0.4934, + "step": 119620 + }, + { + "epoch": 5.941690672494288, + "grad_norm": 0.130859375, + "learning_rate": 0.00032466871957882195, + "loss": 0.5154, + "step": 119630 + }, + { + "epoch": 5.942187344789907, + "grad_norm": 0.1328125, + "learning_rate": 0.00032462898579517236, + "loss": 0.5056, + "step": 119640 + }, + { + "epoch": 5.942684017085527, + "grad_norm": 0.11865234375, + "learning_rate": 0.0003245892520115228, + "loss": 0.4994, + "step": 119650 + }, + { + "epoch": 5.943180689381146, + "grad_norm": 0.12890625, + "learning_rate": 0.00032454951822787325, + "loss": 0.5202, + "step": 119660 + }, + { + "epoch": 5.943677361676766, + "grad_norm": 0.115234375, + "learning_rate": 0.0003245097844442237, + "loss": 0.4709, + "step": 119670 + }, + { + "epoch": 5.944174033972385, + "grad_norm": 0.134765625, + "learning_rate": 0.0003244700506605742, + "loss": 0.4979, + "step": 119680 + }, + { + "epoch": 5.944670706268004, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003244303168769246, + "loss": 0.4862, + "step": 119690 + }, + { + "epoch": 5.945167378563624, + "grad_norm": 0.125, + "learning_rate": 0.0003243905830932751, + "loss": 0.4885, + "step": 119700 + }, + { + "epoch": 5.945664050859243, + "grad_norm": 0.140625, + "learning_rate": 0.00032435084930962555, + "loss": 0.5369, + "step": 119710 + }, + { + "epoch": 5.946160723154862, + "grad_norm": 0.12353515625, + "learning_rate": 0.00032431111552597597, + "loss": 0.5043, + "step": 119720 + }, + { + "epoch": 5.946657395450481, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003242713817423264, + "loss": 0.5042, + "step": 119730 + }, + { + "epoch": 5.9471540677461014, + "grad_norm": 0.12109375, + "learning_rate": 0.0003242316479586769, + "loss": 0.4988, + "step": 119740 + }, + { + "epoch": 5.947650740041721, + "grad_norm": 0.150390625, + "learning_rate": 0.00032419191417502733, + "loss": 0.4771, + "step": 119750 + }, + { + "epoch": 5.94814741233734, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003241521803913778, + "loss": 0.4992, + "step": 119760 + }, + { + "epoch": 5.948644084632959, + "grad_norm": 0.1484375, + "learning_rate": 0.0003241124466077282, + "loss": 0.4915, + "step": 119770 + }, + { + "epoch": 5.949140756928578, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003240727128240787, + "loss": 0.4938, + "step": 119780 + }, + { + "epoch": 5.949637429224198, + "grad_norm": 0.1318359375, + "learning_rate": 0.00032403297904042916, + "loss": 0.5259, + "step": 119790 + }, + { + "epoch": 5.950134101519817, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003239932452567796, + "loss": 0.5263, + "step": 119800 + }, + { + "epoch": 5.950630773815437, + "grad_norm": 0.1201171875, + "learning_rate": 0.00032395351147313, + "loss": 0.5156, + "step": 119810 + }, + { + "epoch": 5.951127446111056, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003239137776894805, + "loss": 0.4934, + "step": 119820 + }, + { + "epoch": 5.951624118406675, + "grad_norm": 0.12158203125, + "learning_rate": 0.00032387404390583094, + "loss": 0.4919, + "step": 119830 + }, + { + "epoch": 5.952120790702295, + "grad_norm": 0.154296875, + "learning_rate": 0.0003238343101221814, + "loss": 0.482, + "step": 119840 + }, + { + "epoch": 5.952617462997914, + "grad_norm": 0.130859375, + "learning_rate": 0.00032379457633853183, + "loss": 0.4674, + "step": 119850 + }, + { + "epoch": 5.953114135293533, + "grad_norm": 0.126953125, + "learning_rate": 0.0003237548425548823, + "loss": 0.5062, + "step": 119860 + }, + { + "epoch": 5.953610807589152, + "grad_norm": 0.1337890625, + "learning_rate": 0.00032371510877123277, + "loss": 0.492, + "step": 119870 + }, + { + "epoch": 5.954107479884772, + "grad_norm": 0.126953125, + "learning_rate": 0.0003236753749875832, + "loss": 0.5158, + "step": 119880 + }, + { + "epoch": 5.954604152180392, + "grad_norm": 0.11767578125, + "learning_rate": 0.00032363564120393366, + "loss": 0.51, + "step": 119890 + }, + { + "epoch": 5.955100824476011, + "grad_norm": 0.125, + "learning_rate": 0.00032359590742028413, + "loss": 0.5204, + "step": 119900 + }, + { + "epoch": 5.95559749677163, + "grad_norm": 0.1513671875, + "learning_rate": 0.00032355617363663455, + "loss": 0.4954, + "step": 119910 + }, + { + "epoch": 5.956094169067249, + "grad_norm": 0.119140625, + "learning_rate": 0.000323516439852985, + "loss": 0.4786, + "step": 119920 + }, + { + "epoch": 5.9565908413628685, + "grad_norm": 0.1572265625, + "learning_rate": 0.0003234767060693355, + "loss": 0.4845, + "step": 119930 + }, + { + "epoch": 5.957087513658488, + "grad_norm": 0.10888671875, + "learning_rate": 0.0003234369722856859, + "loss": 0.5167, + "step": 119940 + }, + { + "epoch": 5.957584185954108, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003233972385020364, + "loss": 0.5018, + "step": 119950 + }, + { + "epoch": 5.958080858249727, + "grad_norm": 0.19921875, + "learning_rate": 0.0003233575047183868, + "loss": 0.4911, + "step": 119960 + }, + { + "epoch": 5.958577530545346, + "grad_norm": 0.11181640625, + "learning_rate": 0.0003233177709347373, + "loss": 0.5116, + "step": 119970 + }, + { + "epoch": 5.959074202840966, + "grad_norm": 0.1357421875, + "learning_rate": 0.00032327803715108774, + "loss": 0.4855, + "step": 119980 + }, + { + "epoch": 5.959570875136585, + "grad_norm": 0.1201171875, + "learning_rate": 0.00032323830336743816, + "loss": 0.5309, + "step": 119990 + }, + { + "epoch": 5.960067547432204, + "grad_norm": 0.1162109375, + "learning_rate": 0.0003231985695837886, + "loss": 0.4953, + "step": 120000 + }, + { + "epoch": 5.960564219727823, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003231588358001391, + "loss": 0.5211, + "step": 120010 + }, + { + "epoch": 5.961060892023443, + "grad_norm": 0.15234375, + "learning_rate": 0.0003231191020164895, + "loss": 0.5121, + "step": 120020 + }, + { + "epoch": 5.961557564319063, + "grad_norm": 0.1435546875, + "learning_rate": 0.00032307936823284, + "loss": 0.4955, + "step": 120030 + }, + { + "epoch": 5.962054236614682, + "grad_norm": 0.162109375, + "learning_rate": 0.00032303963444919046, + "loss": 0.5018, + "step": 120040 + }, + { + "epoch": 5.962550908910301, + "grad_norm": 0.166015625, + "learning_rate": 0.00032299990066554093, + "loss": 0.5069, + "step": 120050 + }, + { + "epoch": 5.96304758120592, + "grad_norm": 0.12158203125, + "learning_rate": 0.00032296016688189135, + "loss": 0.4962, + "step": 120060 + }, + { + "epoch": 5.9635442535015395, + "grad_norm": 0.1357421875, + "learning_rate": 0.00032292043309824176, + "loss": 0.4756, + "step": 120070 + }, + { + "epoch": 5.964040925797159, + "grad_norm": 0.1357421875, + "learning_rate": 0.00032288069931459224, + "loss": 0.4822, + "step": 120080 + }, + { + "epoch": 5.964537598092779, + "grad_norm": 0.13671875, + "learning_rate": 0.0003228409655309427, + "loss": 0.5071, + "step": 120090 + }, + { + "epoch": 5.965034270388398, + "grad_norm": 0.2109375, + "learning_rate": 0.0003228012317472931, + "loss": 0.4711, + "step": 120100 + }, + { + "epoch": 5.965530942684017, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003227614979636436, + "loss": 0.4887, + "step": 120110 + }, + { + "epoch": 5.9660276149796365, + "grad_norm": 0.1337890625, + "learning_rate": 0.00032272176417999407, + "loss": 0.5109, + "step": 120120 + }, + { + "epoch": 5.966524287275256, + "grad_norm": 0.1298828125, + "learning_rate": 0.00032268203039634454, + "loss": 0.5055, + "step": 120130 + }, + { + "epoch": 5.967020959570875, + "grad_norm": 0.1376953125, + "learning_rate": 0.00032264229661269495, + "loss": 0.5203, + "step": 120140 + }, + { + "epoch": 5.967517631866494, + "grad_norm": 0.11865234375, + "learning_rate": 0.00032260256282904537, + "loss": 0.5225, + "step": 120150 + }, + { + "epoch": 5.968014304162114, + "grad_norm": 0.158203125, + "learning_rate": 0.0003225628290453959, + "loss": 0.5164, + "step": 120160 + }, + { + "epoch": 5.9685109764577335, + "grad_norm": 0.134765625, + "learning_rate": 0.0003225230952617463, + "loss": 0.4974, + "step": 120170 + }, + { + "epoch": 5.969007648753353, + "grad_norm": 0.1552734375, + "learning_rate": 0.00032248336147809673, + "loss": 0.5121, + "step": 120180 + }, + { + "epoch": 5.969504321048972, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003224436276944472, + "loss": 0.5109, + "step": 120190 + }, + { + "epoch": 5.970000993344591, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003224038939107977, + "loss": 0.5188, + "step": 120200 + }, + { + "epoch": 5.9704976656402104, + "grad_norm": 0.15234375, + "learning_rate": 0.00032236416012714815, + "loss": 0.4893, + "step": 120210 + }, + { + "epoch": 5.97099433793583, + "grad_norm": 0.15234375, + "learning_rate": 0.00032232442634349856, + "loss": 0.5092, + "step": 120220 + }, + { + "epoch": 5.971491010231449, + "grad_norm": 0.1708984375, + "learning_rate": 0.00032228469255984903, + "loss": 0.496, + "step": 120230 + }, + { + "epoch": 5.971987682527069, + "grad_norm": 0.119140625, + "learning_rate": 0.0003222449587761995, + "loss": 0.5161, + "step": 120240 + }, + { + "epoch": 5.972484354822688, + "grad_norm": 0.14453125, + "learning_rate": 0.0003222052249925499, + "loss": 0.531, + "step": 120250 + }, + { + "epoch": 5.9729810271183075, + "grad_norm": 0.1943359375, + "learning_rate": 0.0003221654912089004, + "loss": 0.4973, + "step": 120260 + }, + { + "epoch": 5.973477699413927, + "grad_norm": 0.11572265625, + "learning_rate": 0.00032212575742525087, + "loss": 0.5332, + "step": 120270 + }, + { + "epoch": 5.973974371709546, + "grad_norm": 0.123046875, + "learning_rate": 0.0003220860236416013, + "loss": 0.5034, + "step": 120280 + }, + { + "epoch": 5.974471044005165, + "grad_norm": 0.11962890625, + "learning_rate": 0.00032204628985795175, + "loss": 0.5177, + "step": 120290 + }, + { + "epoch": 5.974967716300784, + "grad_norm": 0.1259765625, + "learning_rate": 0.00032200655607430217, + "loss": 0.4775, + "step": 120300 + }, + { + "epoch": 5.975464388596404, + "grad_norm": 0.1337890625, + "learning_rate": 0.00032196682229065264, + "loss": 0.4971, + "step": 120310 + }, + { + "epoch": 5.975961060892024, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003219270885070031, + "loss": 0.5205, + "step": 120320 + }, + { + "epoch": 5.976457733187643, + "grad_norm": 0.1337890625, + "learning_rate": 0.00032188735472335353, + "loss": 0.507, + "step": 120330 + }, + { + "epoch": 5.976954405483262, + "grad_norm": 0.1337890625, + "learning_rate": 0.000321847620939704, + "loss": 0.5493, + "step": 120340 + }, + { + "epoch": 5.977451077778881, + "grad_norm": 0.12109375, + "learning_rate": 0.0003218078871560545, + "loss": 0.4735, + "step": 120350 + }, + { + "epoch": 5.977947750074501, + "grad_norm": 0.13671875, + "learning_rate": 0.0003217681533724049, + "loss": 0.5099, + "step": 120360 + }, + { + "epoch": 5.97844442237012, + "grad_norm": 0.1474609375, + "learning_rate": 0.00032172841958875536, + "loss": 0.486, + "step": 120370 + }, + { + "epoch": 5.978941094665739, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003216886858051058, + "loss": 0.5223, + "step": 120380 + }, + { + "epoch": 5.979437766961359, + "grad_norm": 0.1396484375, + "learning_rate": 0.00032164895202145625, + "loss": 0.489, + "step": 120390 + }, + { + "epoch": 5.979934439256978, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003216092182378067, + "loss": 0.4881, + "step": 120400 + }, + { + "epoch": 5.980431111552598, + "grad_norm": 0.1337890625, + "learning_rate": 0.00032156948445415714, + "loss": 0.4873, + "step": 120410 + }, + { + "epoch": 5.980927783848217, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003215297506705076, + "loss": 0.4935, + "step": 120420 + }, + { + "epoch": 5.981424456143836, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003214900168868581, + "loss": 0.5151, + "step": 120430 + }, + { + "epoch": 5.981921128439455, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003214502831032085, + "loss": 0.4887, + "step": 120440 + }, + { + "epoch": 5.982417800735075, + "grad_norm": 0.1328125, + "learning_rate": 0.00032141054931955897, + "loss": 0.4957, + "step": 120450 + }, + { + "epoch": 5.982914473030695, + "grad_norm": 0.1240234375, + "learning_rate": 0.00032137081553590944, + "loss": 0.5141, + "step": 120460 + }, + { + "epoch": 5.983411145326314, + "grad_norm": 0.1279296875, + "learning_rate": 0.00032133108175225986, + "loss": 0.4862, + "step": 120470 + }, + { + "epoch": 5.983907817621933, + "grad_norm": 0.1357421875, + "learning_rate": 0.00032129134796861033, + "loss": 0.5018, + "step": 120480 + }, + { + "epoch": 5.984404489917552, + "grad_norm": 0.1552734375, + "learning_rate": 0.00032125161418496075, + "loss": 0.5064, + "step": 120490 + }, + { + "epoch": 5.984901162213172, + "grad_norm": 0.126953125, + "learning_rate": 0.0003212118804013113, + "loss": 0.4906, + "step": 120500 + }, + { + "epoch": 5.985397834508791, + "grad_norm": 0.123046875, + "learning_rate": 0.0003211721466176617, + "loss": 0.4987, + "step": 120510 + }, + { + "epoch": 5.98589450680441, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003211324128340121, + "loss": 0.4978, + "step": 120520 + }, + { + "epoch": 5.98639117910003, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003210926790503626, + "loss": 0.4796, + "step": 120530 + }, + { + "epoch": 5.986887851395649, + "grad_norm": 0.12451171875, + "learning_rate": 0.00032105294526671305, + "loss": 0.5211, + "step": 120540 + }, + { + "epoch": 5.987384523691269, + "grad_norm": 0.115234375, + "learning_rate": 0.00032101321148306347, + "loss": 0.5118, + "step": 120550 + }, + { + "epoch": 5.987881195986888, + "grad_norm": 0.1376953125, + "learning_rate": 0.00032097347769941394, + "loss": 0.5214, + "step": 120560 + }, + { + "epoch": 5.988377868282507, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003209337439157644, + "loss": 0.5044, + "step": 120570 + }, + { + "epoch": 5.988874540578126, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003208940101321149, + "loss": 0.4716, + "step": 120580 + }, + { + "epoch": 5.9893712128737455, + "grad_norm": 0.123046875, + "learning_rate": 0.0003208542763484653, + "loss": 0.5025, + "step": 120590 + }, + { + "epoch": 5.989867885169366, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003208145425648157, + "loss": 0.4836, + "step": 120600 + }, + { + "epoch": 5.990364557464985, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003207748087811662, + "loss": 0.5115, + "step": 120610 + }, + { + "epoch": 5.990861229760604, + "grad_norm": 0.11962890625, + "learning_rate": 0.00032073507499751666, + "loss": 0.4903, + "step": 120620 + }, + { + "epoch": 5.991357902056223, + "grad_norm": 0.13671875, + "learning_rate": 0.0003206953412138671, + "loss": 0.4831, + "step": 120630 + }, + { + "epoch": 5.9918545743518425, + "grad_norm": 0.1396484375, + "learning_rate": 0.00032065560743021755, + "loss": 0.4961, + "step": 120640 + }, + { + "epoch": 5.992351246647462, + "grad_norm": 0.1181640625, + "learning_rate": 0.000320615873646568, + "loss": 0.4973, + "step": 120650 + }, + { + "epoch": 5.992847918943081, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003205761398629185, + "loss": 0.4923, + "step": 120660 + }, + { + "epoch": 5.993344591238701, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003205364060792689, + "loss": 0.5084, + "step": 120670 + }, + { + "epoch": 5.99384126353432, + "grad_norm": 0.11474609375, + "learning_rate": 0.0003204966722956193, + "loss": 0.5122, + "step": 120680 + }, + { + "epoch": 5.99433793582994, + "grad_norm": 0.12255859375, + "learning_rate": 0.00032045693851196985, + "loss": 0.4817, + "step": 120690 + }, + { + "epoch": 5.994834608125559, + "grad_norm": 0.1279296875, + "learning_rate": 0.00032041720472832027, + "loss": 0.5426, + "step": 120700 + }, + { + "epoch": 5.995331280421178, + "grad_norm": 0.1435546875, + "learning_rate": 0.00032037747094467074, + "loss": 0.514, + "step": 120710 + }, + { + "epoch": 5.995827952716797, + "grad_norm": 0.1806640625, + "learning_rate": 0.00032033773716102116, + "loss": 0.4534, + "step": 120720 + }, + { + "epoch": 5.9963246250124165, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003202980033773716, + "loss": 0.4894, + "step": 120730 + }, + { + "epoch": 5.996821297308037, + "grad_norm": 0.1474609375, + "learning_rate": 0.0003202582695937221, + "loss": 0.495, + "step": 120740 + }, + { + "epoch": 5.997317969603656, + "grad_norm": 0.126953125, + "learning_rate": 0.0003202185358100725, + "loss": 0.481, + "step": 120750 + }, + { + "epoch": 5.997814641899275, + "grad_norm": 0.119140625, + "learning_rate": 0.000320178802026423, + "loss": 0.4966, + "step": 120760 + }, + { + "epoch": 5.998311314194894, + "grad_norm": 0.12109375, + "learning_rate": 0.00032013906824277346, + "loss": 0.5028, + "step": 120770 + }, + { + "epoch": 5.9988079864905135, + "grad_norm": 0.125, + "learning_rate": 0.0003200993344591239, + "loss": 0.4975, + "step": 120780 + }, + { + "epoch": 5.999304658786133, + "grad_norm": 0.11669921875, + "learning_rate": 0.00032005960067547435, + "loss": 0.4972, + "step": 120790 + }, + { + "epoch": 5.999801331081752, + "grad_norm": 0.12890625, + "learning_rate": 0.0003200198668918248, + "loss": 0.5059, + "step": 120800 + }, + { + "epoch": 6.000298003377371, + "grad_norm": 0.140625, + "learning_rate": 0.00031998013310817523, + "loss": 0.4947, + "step": 120810 + }, + { + "epoch": 6.000794675672991, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003199403993245257, + "loss": 0.4986, + "step": 120820 + }, + { + "epoch": 6.0012913479686105, + "grad_norm": 0.1474609375, + "learning_rate": 0.0003199006655408761, + "loss": 0.4952, + "step": 120830 + }, + { + "epoch": 6.00178802026423, + "grad_norm": 0.111328125, + "learning_rate": 0.0003198609317572266, + "loss": 0.4735, + "step": 120840 + }, + { + "epoch": 6.002284692559849, + "grad_norm": 0.140625, + "learning_rate": 0.00031982119797357707, + "loss": 0.5047, + "step": 120850 + }, + { + "epoch": 6.002781364855468, + "grad_norm": 0.2119140625, + "learning_rate": 0.0003197814641899275, + "loss": 0.5082, + "step": 120860 + }, + { + "epoch": 6.003278037151087, + "grad_norm": 0.11865234375, + "learning_rate": 0.00031974173040627795, + "loss": 0.5107, + "step": 120870 + }, + { + "epoch": 6.003774709446707, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003197019966226284, + "loss": 0.488, + "step": 120880 + }, + { + "epoch": 6.004271381742327, + "grad_norm": 0.12890625, + "learning_rate": 0.00031966226283897884, + "loss": 0.4976, + "step": 120890 + }, + { + "epoch": 6.004768054037946, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003196225290553293, + "loss": 0.4974, + "step": 120900 + }, + { + "epoch": 6.005264726333565, + "grad_norm": 0.134765625, + "learning_rate": 0.00031958279527167973, + "loss": 0.4583, + "step": 120910 + }, + { + "epoch": 6.0057613986291845, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003195430614880302, + "loss": 0.4876, + "step": 120920 + }, + { + "epoch": 6.006258070924804, + "grad_norm": 0.13671875, + "learning_rate": 0.0003195033277043807, + "loss": 0.4885, + "step": 120930 + }, + { + "epoch": 6.006754743220423, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003194635939207311, + "loss": 0.5095, + "step": 120940 + }, + { + "epoch": 6.007251415516042, + "grad_norm": 0.1611328125, + "learning_rate": 0.00031942386013708156, + "loss": 0.4792, + "step": 120950 + }, + { + "epoch": 6.007748087811662, + "grad_norm": 0.19921875, + "learning_rate": 0.00031938412635343203, + "loss": 0.4897, + "step": 120960 + }, + { + "epoch": 6.0082447601072815, + "grad_norm": 0.12890625, + "learning_rate": 0.00031934439256978245, + "loss": 0.4957, + "step": 120970 + }, + { + "epoch": 6.008741432402901, + "grad_norm": 0.146484375, + "learning_rate": 0.0003193046587861329, + "loss": 0.4716, + "step": 120980 + }, + { + "epoch": 6.00923810469852, + "grad_norm": 0.12109375, + "learning_rate": 0.0003192649250024834, + "loss": 0.4969, + "step": 120990 + }, + { + "epoch": 6.009734776994139, + "grad_norm": 0.115234375, + "learning_rate": 0.0003192251912188338, + "loss": 0.4757, + "step": 121000 + }, + { + "epoch": 6.010231449289758, + "grad_norm": 0.12890625, + "learning_rate": 0.0003191854574351843, + "loss": 0.4886, + "step": 121010 + }, + { + "epoch": 6.010728121585378, + "grad_norm": 0.125, + "learning_rate": 0.0003191457236515347, + "loss": 0.4558, + "step": 121020 + }, + { + "epoch": 6.011224793880998, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003191059898678852, + "loss": 0.476, + "step": 121030 + }, + { + "epoch": 6.011721466176617, + "grad_norm": 0.162109375, + "learning_rate": 0.00031906625608423564, + "loss": 0.5165, + "step": 121040 + }, + { + "epoch": 6.012218138472236, + "grad_norm": 0.1328125, + "learning_rate": 0.00031902652230058606, + "loss": 0.493, + "step": 121050 + }, + { + "epoch": 6.012714810767855, + "grad_norm": 0.123046875, + "learning_rate": 0.00031898678851693653, + "loss": 0.4721, + "step": 121060 + }, + { + "epoch": 6.013211483063475, + "grad_norm": 0.1279296875, + "learning_rate": 0.000318947054733287, + "loss": 0.5085, + "step": 121070 + }, + { + "epoch": 6.013708155359094, + "grad_norm": 0.115234375, + "learning_rate": 0.0003189073209496374, + "loss": 0.4958, + "step": 121080 + }, + { + "epoch": 6.014204827654713, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003188675871659879, + "loss": 0.4813, + "step": 121090 + }, + { + "epoch": 6.014701499950333, + "grad_norm": 0.11962890625, + "learning_rate": 0.00031882785338233836, + "loss": 0.4742, + "step": 121100 + }, + { + "epoch": 6.015198172245952, + "grad_norm": 0.130859375, + "learning_rate": 0.00031878811959868883, + "loss": 0.484, + "step": 121110 + }, + { + "epoch": 6.015694844541572, + "grad_norm": 0.12255859375, + "learning_rate": 0.00031874838581503925, + "loss": 0.4848, + "step": 121120 + }, + { + "epoch": 6.016191516837191, + "grad_norm": 0.1513671875, + "learning_rate": 0.00031870865203138967, + "loss": 0.4943, + "step": 121130 + }, + { + "epoch": 6.01668818913281, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003186689182477402, + "loss": 0.4791, + "step": 121140 + }, + { + "epoch": 6.017184861428429, + "grad_norm": 0.126953125, + "learning_rate": 0.0003186291844640906, + "loss": 0.4831, + "step": 121150 + }, + { + "epoch": 6.017681533724049, + "grad_norm": 0.1494140625, + "learning_rate": 0.0003185894506804411, + "loss": 0.49, + "step": 121160 + }, + { + "epoch": 6.018178206019668, + "grad_norm": 0.1572265625, + "learning_rate": 0.0003185497168967915, + "loss": 0.5079, + "step": 121170 + }, + { + "epoch": 6.018674878315288, + "grad_norm": 0.12255859375, + "learning_rate": 0.00031850998311314197, + "loss": 0.4653, + "step": 121180 + }, + { + "epoch": 6.019171550610907, + "grad_norm": 0.12890625, + "learning_rate": 0.00031847024932949244, + "loss": 0.479, + "step": 121190 + }, + { + "epoch": 6.019668222906526, + "grad_norm": 0.130859375, + "learning_rate": 0.00031843051554584286, + "loss": 0.5137, + "step": 121200 + }, + { + "epoch": 6.020164895202146, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003183907817621933, + "loss": 0.4862, + "step": 121210 + }, + { + "epoch": 6.020661567497765, + "grad_norm": 0.142578125, + "learning_rate": 0.0003183510479785438, + "loss": 0.5049, + "step": 121220 + }, + { + "epoch": 6.021158239793384, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003183113141948942, + "loss": 0.4808, + "step": 121230 + }, + { + "epoch": 6.021654912089003, + "grad_norm": 0.126953125, + "learning_rate": 0.0003182715804112447, + "loss": 0.5062, + "step": 121240 + }, + { + "epoch": 6.022151584384623, + "grad_norm": 0.1533203125, + "learning_rate": 0.0003182318466275951, + "loss": 0.4616, + "step": 121250 + }, + { + "epoch": 6.022648256680243, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003181921128439456, + "loss": 0.4979, + "step": 121260 + }, + { + "epoch": 6.023144928975862, + "grad_norm": 0.1337890625, + "learning_rate": 0.00031815237906029605, + "loss": 0.479, + "step": 121270 + }, + { + "epoch": 6.023641601271481, + "grad_norm": 0.1337890625, + "learning_rate": 0.00031811264527664647, + "loss": 0.5044, + "step": 121280 + }, + { + "epoch": 6.0241382735671, + "grad_norm": 0.125, + "learning_rate": 0.00031807291149299694, + "loss": 0.4953, + "step": 121290 + }, + { + "epoch": 6.0246349458627195, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003180331777093474, + "loss": 0.477, + "step": 121300 + }, + { + "epoch": 6.025131618158339, + "grad_norm": 0.1455078125, + "learning_rate": 0.0003179934439256978, + "loss": 0.4868, + "step": 121310 + }, + { + "epoch": 6.025628290453959, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003179537101420483, + "loss": 0.4859, + "step": 121320 + }, + { + "epoch": 6.026124962749578, + "grad_norm": 0.12890625, + "learning_rate": 0.00031791397635839877, + "loss": 0.4923, + "step": 121330 + }, + { + "epoch": 6.026621635045197, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003178742425747492, + "loss": 0.5309, + "step": 121340 + }, + { + "epoch": 6.0271183073408165, + "grad_norm": 0.1357421875, + "learning_rate": 0.00031783450879109966, + "loss": 0.4882, + "step": 121350 + }, + { + "epoch": 6.027614979636436, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003177947750074501, + "loss": 0.5061, + "step": 121360 + }, + { + "epoch": 6.028111651932055, + "grad_norm": 0.142578125, + "learning_rate": 0.00031775504122380055, + "loss": 0.4982, + "step": 121370 + }, + { + "epoch": 6.028608324227674, + "grad_norm": 0.115234375, + "learning_rate": 0.000317715307440151, + "loss": 0.4912, + "step": 121380 + }, + { + "epoch": 6.029104996523294, + "grad_norm": 0.1357421875, + "learning_rate": 0.00031767557365650143, + "loss": 0.4962, + "step": 121390 + }, + { + "epoch": 6.029601668818914, + "grad_norm": 0.1748046875, + "learning_rate": 0.0003176358398728519, + "loss": 0.5142, + "step": 121400 + }, + { + "epoch": 6.030098341114533, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003175961060892024, + "loss": 0.5126, + "step": 121410 + }, + { + "epoch": 6.030595013410152, + "grad_norm": 0.1572265625, + "learning_rate": 0.0003175563723055528, + "loss": 0.4588, + "step": 121420 + }, + { + "epoch": 6.031091685705771, + "grad_norm": 0.166015625, + "learning_rate": 0.00031751663852190327, + "loss": 0.4973, + "step": 121430 + }, + { + "epoch": 6.0315883580013905, + "grad_norm": 0.12158203125, + "learning_rate": 0.00031747690473825374, + "loss": 0.504, + "step": 121440 + }, + { + "epoch": 6.03208503029701, + "grad_norm": 0.11669921875, + "learning_rate": 0.00031743717095460415, + "loss": 0.5137, + "step": 121450 + }, + { + "epoch": 6.03258170259263, + "grad_norm": 0.1142578125, + "learning_rate": 0.0003173974371709546, + "loss": 0.4801, + "step": 121460 + }, + { + "epoch": 6.033078374888249, + "grad_norm": 0.1259765625, + "learning_rate": 0.00031735770338730504, + "loss": 0.4453, + "step": 121470 + }, + { + "epoch": 6.033575047183868, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003173179696036555, + "loss": 0.5028, + "step": 121480 + }, + { + "epoch": 6.0340717194794875, + "grad_norm": 0.12890625, + "learning_rate": 0.000317278235820006, + "loss": 0.5093, + "step": 121490 + }, + { + "epoch": 6.034568391775107, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003172385020363564, + "loss": 0.4924, + "step": 121500 + }, + { + "epoch": 6.035065064070726, + "grad_norm": 0.140625, + "learning_rate": 0.0003171987682527069, + "loss": 0.5078, + "step": 121510 + }, + { + "epoch": 6.035561736366345, + "grad_norm": 0.1240234375, + "learning_rate": 0.00031715903446905735, + "loss": 0.4967, + "step": 121520 + }, + { + "epoch": 6.036058408661964, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003171193006854078, + "loss": 0.4631, + "step": 121530 + }, + { + "epoch": 6.0365550809575845, + "grad_norm": 0.1083984375, + "learning_rate": 0.00031707956690175823, + "loss": 0.5028, + "step": 121540 + }, + { + "epoch": 6.037051753253204, + "grad_norm": 0.12060546875, + "learning_rate": 0.00031703983311810865, + "loss": 0.4688, + "step": 121550 + }, + { + "epoch": 6.037548425548823, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003170000993344592, + "loss": 0.4739, + "step": 121560 + }, + { + "epoch": 6.038045097844442, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003169603655508096, + "loss": 0.5197, + "step": 121570 + }, + { + "epoch": 6.038541770140061, + "grad_norm": 0.17578125, + "learning_rate": 0.00031692063176716, + "loss": 0.5127, + "step": 121580 + }, + { + "epoch": 6.039038442435681, + "grad_norm": 0.138671875, + "learning_rate": 0.0003168808979835105, + "loss": 0.4882, + "step": 121590 + }, + { + "epoch": 6.0395351147313, + "grad_norm": 0.1298828125, + "learning_rate": 0.00031684116419986095, + "loss": 0.4512, + "step": 121600 + }, + { + "epoch": 6.04003178702692, + "grad_norm": 0.138671875, + "learning_rate": 0.0003168014304162114, + "loss": 0.508, + "step": 121610 + }, + { + "epoch": 6.040528459322539, + "grad_norm": 0.123046875, + "learning_rate": 0.00031676169663256184, + "loss": 0.4672, + "step": 121620 + }, + { + "epoch": 6.0410251316181585, + "grad_norm": 0.154296875, + "learning_rate": 0.0003167219628489123, + "loss": 0.5086, + "step": 121630 + }, + { + "epoch": 6.041521803913778, + "grad_norm": 0.1240234375, + "learning_rate": 0.0003166822290652628, + "loss": 0.4855, + "step": 121640 + }, + { + "epoch": 6.042018476209397, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003166424952816132, + "loss": 0.4929, + "step": 121650 + }, + { + "epoch": 6.042515148505016, + "grad_norm": 0.16015625, + "learning_rate": 0.0003166027614979636, + "loss": 0.5249, + "step": 121660 + }, + { + "epoch": 6.043011820800635, + "grad_norm": 0.142578125, + "learning_rate": 0.00031656302771431415, + "loss": 0.5053, + "step": 121670 + }, + { + "epoch": 6.0435084930962555, + "grad_norm": 0.1337890625, + "learning_rate": 0.00031652329393066456, + "loss": 0.4981, + "step": 121680 + }, + { + "epoch": 6.044005165391875, + "grad_norm": 0.1474609375, + "learning_rate": 0.00031648356014701503, + "loss": 0.5165, + "step": 121690 + }, + { + "epoch": 6.044501837687494, + "grad_norm": 0.150390625, + "learning_rate": 0.00031644382636336545, + "loss": 0.4961, + "step": 121700 + }, + { + "epoch": 6.044998509983113, + "grad_norm": 0.10888671875, + "learning_rate": 0.0003164040925797159, + "loss": 0.4909, + "step": 121710 + }, + { + "epoch": 6.045495182278732, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003163643587960664, + "loss": 0.4829, + "step": 121720 + }, + { + "epoch": 6.045991854574352, + "grad_norm": 0.158203125, + "learning_rate": 0.0003163246250124168, + "loss": 0.4891, + "step": 121730 + }, + { + "epoch": 6.046488526869971, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003162848912287673, + "loss": 0.4948, + "step": 121740 + }, + { + "epoch": 6.046985199165591, + "grad_norm": 0.138671875, + "learning_rate": 0.00031624515744511775, + "loss": 0.5221, + "step": 121750 + }, + { + "epoch": 6.04748187146121, + "grad_norm": 0.1474609375, + "learning_rate": 0.00031620542366146817, + "loss": 0.4994, + "step": 121760 + }, + { + "epoch": 6.047978543756829, + "grad_norm": 0.130859375, + "learning_rate": 0.00031616568987781864, + "loss": 0.5257, + "step": 121770 + }, + { + "epoch": 6.048475216052449, + "grad_norm": 0.11865234375, + "learning_rate": 0.00031612595609416906, + "loss": 0.4944, + "step": 121780 + }, + { + "epoch": 6.048971888348068, + "grad_norm": 0.1279296875, + "learning_rate": 0.00031608622231051953, + "loss": 0.5183, + "step": 121790 + }, + { + "epoch": 6.049468560643687, + "grad_norm": 0.11376953125, + "learning_rate": 0.00031604648852687, + "loss": 0.4723, + "step": 121800 + }, + { + "epoch": 6.049965232939306, + "grad_norm": 0.15625, + "learning_rate": 0.0003160067547432204, + "loss": 0.4915, + "step": 121810 + }, + { + "epoch": 6.0504619052349256, + "grad_norm": 0.130859375, + "learning_rate": 0.0003159670209595709, + "loss": 0.4774, + "step": 121820 + }, + { + "epoch": 6.050958577530546, + "grad_norm": 0.1357421875, + "learning_rate": 0.00031592728717592136, + "loss": 0.5147, + "step": 121830 + }, + { + "epoch": 6.051455249826165, + "grad_norm": 0.13671875, + "learning_rate": 0.0003158875533922718, + "loss": 0.4558, + "step": 121840 + }, + { + "epoch": 6.051951922121784, + "grad_norm": 0.1259765625, + "learning_rate": 0.00031584781960862225, + "loss": 0.456, + "step": 121850 + }, + { + "epoch": 6.052448594417403, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003158080858249727, + "loss": 0.4759, + "step": 121860 + }, + { + "epoch": 6.052945266713023, + "grad_norm": 0.146484375, + "learning_rate": 0.00031576835204132314, + "loss": 0.4621, + "step": 121870 + }, + { + "epoch": 6.053441939008642, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003157286182576736, + "loss": 0.5008, + "step": 121880 + }, + { + "epoch": 6.053938611304261, + "grad_norm": 0.146484375, + "learning_rate": 0.000315688884474024, + "loss": 0.5374, + "step": 121890 + }, + { + "epoch": 6.054435283599881, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003156491506903745, + "loss": 0.4992, + "step": 121900 + }, + { + "epoch": 6.0549319558955, + "grad_norm": 0.12890625, + "learning_rate": 0.00031560941690672497, + "loss": 0.5023, + "step": 121910 + }, + { + "epoch": 6.05542862819112, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003155696831230754, + "loss": 0.5027, + "step": 121920 + }, + { + "epoch": 6.055925300486739, + "grad_norm": 0.1328125, + "learning_rate": 0.00031552994933942586, + "loss": 0.4602, + "step": 121930 + }, + { + "epoch": 6.056421972782358, + "grad_norm": 0.126953125, + "learning_rate": 0.00031549021555577633, + "loss": 0.4808, + "step": 121940 + }, + { + "epoch": 6.056918645077977, + "grad_norm": 0.12109375, + "learning_rate": 0.00031545048177212675, + "loss": 0.4805, + "step": 121950 + }, + { + "epoch": 6.0574153173735965, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003154107479884772, + "loss": 0.5113, + "step": 121960 + }, + { + "epoch": 6.057911989669217, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003153710142048277, + "loss": 0.4748, + "step": 121970 + }, + { + "epoch": 6.058408661964836, + "grad_norm": 0.12158203125, + "learning_rate": 0.00031533128042117816, + "loss": 0.4888, + "step": 121980 + }, + { + "epoch": 6.058905334260455, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003152915466375286, + "loss": 0.4952, + "step": 121990 + }, + { + "epoch": 6.059402006556074, + "grad_norm": 0.1875, + "learning_rate": 0.000315251812853879, + "loss": 0.4945, + "step": 122000 + }, + { + "epoch": 6.0598986788516935, + "grad_norm": 0.1220703125, + "learning_rate": 0.00031521207907022947, + "loss": 0.4795, + "step": 122010 + }, + { + "epoch": 6.060395351147313, + "grad_norm": 0.142578125, + "learning_rate": 0.00031517234528657994, + "loss": 0.4664, + "step": 122020 + }, + { + "epoch": 6.060892023442932, + "grad_norm": 0.12158203125, + "learning_rate": 0.00031513261150293035, + "loss": 0.4937, + "step": 122030 + }, + { + "epoch": 6.061388695738552, + "grad_norm": 0.1630859375, + "learning_rate": 0.0003150928777192808, + "loss": 0.5135, + "step": 122040 + }, + { + "epoch": 6.061885368034171, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003150531439356313, + "loss": 0.4935, + "step": 122050 + }, + { + "epoch": 6.0623820403297906, + "grad_norm": 0.1630859375, + "learning_rate": 0.00031501341015198177, + "loss": 0.4765, + "step": 122060 + }, + { + "epoch": 6.06287871262541, + "grad_norm": 0.1123046875, + "learning_rate": 0.0003149736763683322, + "loss": 0.4895, + "step": 122070 + }, + { + "epoch": 6.063375384921029, + "grad_norm": 0.1455078125, + "learning_rate": 0.0003149339425846826, + "loss": 0.4835, + "step": 122080 + }, + { + "epoch": 6.063872057216648, + "grad_norm": 0.15625, + "learning_rate": 0.00031489420880103313, + "loss": 0.4845, + "step": 122090 + }, + { + "epoch": 6.0643687295122675, + "grad_norm": 0.11474609375, + "learning_rate": 0.00031485447501738355, + "loss": 0.4877, + "step": 122100 + }, + { + "epoch": 6.064865401807888, + "grad_norm": 0.1220703125, + "learning_rate": 0.00031481474123373396, + "loss": 0.5008, + "step": 122110 + }, + { + "epoch": 6.065362074103507, + "grad_norm": 0.1298828125, + "learning_rate": 0.00031477500745008443, + "loss": 0.5, + "step": 122120 + }, + { + "epoch": 6.065858746399126, + "grad_norm": 0.134765625, + "learning_rate": 0.0003147352736664349, + "loss": 0.49, + "step": 122130 + }, + { + "epoch": 6.066355418694745, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003146955398827854, + "loss": 0.4981, + "step": 122140 + }, + { + "epoch": 6.0668520909903645, + "grad_norm": 0.12890625, + "learning_rate": 0.0003146558060991358, + "loss": 0.4632, + "step": 122150 + }, + { + "epoch": 6.067348763285984, + "grad_norm": 0.11474609375, + "learning_rate": 0.00031461607231548627, + "loss": 0.4868, + "step": 122160 + }, + { + "epoch": 6.067845435581603, + "grad_norm": 0.126953125, + "learning_rate": 0.00031457633853183674, + "loss": 0.4793, + "step": 122170 + }, + { + "epoch": 6.068342107877223, + "grad_norm": 0.1767578125, + "learning_rate": 0.00031453660474818715, + "loss": 0.5122, + "step": 122180 + }, + { + "epoch": 6.068838780172842, + "grad_norm": 0.1328125, + "learning_rate": 0.00031449687096453757, + "loss": 0.4774, + "step": 122190 + }, + { + "epoch": 6.0693354524684615, + "grad_norm": 0.134765625, + "learning_rate": 0.0003144571371808881, + "loss": 0.4966, + "step": 122200 + }, + { + "epoch": 6.069832124764081, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003144174033972385, + "loss": 0.4972, + "step": 122210 + }, + { + "epoch": 6.0703287970597, + "grad_norm": 0.16015625, + "learning_rate": 0.000314377669613589, + "loss": 0.5048, + "step": 122220 + }, + { + "epoch": 6.070825469355319, + "grad_norm": 0.123046875, + "learning_rate": 0.0003143379358299394, + "loss": 0.4965, + "step": 122230 + }, + { + "epoch": 6.071322141650938, + "grad_norm": 0.1240234375, + "learning_rate": 0.0003142982020462899, + "loss": 0.489, + "step": 122240 + }, + { + "epoch": 6.071818813946558, + "grad_norm": 0.1416015625, + "learning_rate": 0.00031425846826264035, + "loss": 0.4829, + "step": 122250 + }, + { + "epoch": 6.072315486242178, + "grad_norm": 0.11767578125, + "learning_rate": 0.00031421873447899076, + "loss": 0.4837, + "step": 122260 + }, + { + "epoch": 6.072812158537797, + "grad_norm": 0.162109375, + "learning_rate": 0.00031417900069534123, + "loss": 0.4843, + "step": 122270 + }, + { + "epoch": 6.073308830833416, + "grad_norm": 0.138671875, + "learning_rate": 0.0003141392669116917, + "loss": 0.4852, + "step": 122280 + }, + { + "epoch": 6.073805503129035, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003140995331280421, + "loss": 0.502, + "step": 122290 + }, + { + "epoch": 6.074302175424655, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003140597993443926, + "loss": 0.4878, + "step": 122300 + }, + { + "epoch": 6.074798847720274, + "grad_norm": 0.138671875, + "learning_rate": 0.000314020065560743, + "loss": 0.4589, + "step": 122310 + }, + { + "epoch": 6.075295520015893, + "grad_norm": 0.1552734375, + "learning_rate": 0.0003139803317770935, + "loss": 0.4846, + "step": 122320 + }, + { + "epoch": 6.075792192311513, + "grad_norm": 0.142578125, + "learning_rate": 0.00031394059799344395, + "loss": 0.4871, + "step": 122330 + }, + { + "epoch": 6.0762888646071325, + "grad_norm": 0.1279296875, + "learning_rate": 0.00031390086420979437, + "loss": 0.4678, + "step": 122340 + }, + { + "epoch": 6.076785536902752, + "grad_norm": 0.134765625, + "learning_rate": 0.00031386113042614484, + "loss": 0.4907, + "step": 122350 + }, + { + "epoch": 6.077282209198371, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003138213966424953, + "loss": 0.4916, + "step": 122360 + }, + { + "epoch": 6.07777888149399, + "grad_norm": 0.14453125, + "learning_rate": 0.00031378166285884573, + "loss": 0.482, + "step": 122370 + }, + { + "epoch": 6.078275553789609, + "grad_norm": 0.134765625, + "learning_rate": 0.0003137419290751962, + "loss": 0.518, + "step": 122380 + }, + { + "epoch": 6.078772226085229, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003137021952915467, + "loss": 0.5041, + "step": 122390 + }, + { + "epoch": 6.079268898380849, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003136624615078971, + "loss": 0.4746, + "step": 122400 + }, + { + "epoch": 6.079765570676468, + "grad_norm": 0.13671875, + "learning_rate": 0.00031362272772424756, + "loss": 0.5145, + "step": 122410 + }, + { + "epoch": 6.080262242972087, + "grad_norm": 0.11767578125, + "learning_rate": 0.000313582993940598, + "loss": 0.4585, + "step": 122420 + }, + { + "epoch": 6.080758915267706, + "grad_norm": 0.125, + "learning_rate": 0.0003135432601569485, + "loss": 0.4651, + "step": 122430 + }, + { + "epoch": 6.081255587563326, + "grad_norm": 0.130859375, + "learning_rate": 0.0003135035263732989, + "loss": 0.4972, + "step": 122440 + }, + { + "epoch": 6.081752259858945, + "grad_norm": 0.12890625, + "learning_rate": 0.00031346379258964934, + "loss": 0.4813, + "step": 122450 + }, + { + "epoch": 6.082248932154564, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003134240588059998, + "loss": 0.5177, + "step": 122460 + }, + { + "epoch": 6.082745604450184, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003133843250223503, + "loss": 0.4936, + "step": 122470 + }, + { + "epoch": 6.083242276745803, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003133445912387007, + "loss": 0.4782, + "step": 122480 + }, + { + "epoch": 6.083738949041423, + "grad_norm": 0.13671875, + "learning_rate": 0.00031330485745505117, + "loss": 0.4816, + "step": 122490 + }, + { + "epoch": 6.084235621337042, + "grad_norm": 0.1298828125, + "learning_rate": 0.00031326512367140164, + "loss": 0.4851, + "step": 122500 + }, + { + "epoch": 6.084732293632661, + "grad_norm": 0.12890625, + "learning_rate": 0.0003132253898877521, + "loss": 0.5047, + "step": 122510 + }, + { + "epoch": 6.08522896592828, + "grad_norm": 0.1748046875, + "learning_rate": 0.00031318565610410253, + "loss": 0.4938, + "step": 122520 + }, + { + "epoch": 6.0857256382239, + "grad_norm": 0.1279296875, + "learning_rate": 0.00031314592232045295, + "loss": 0.4875, + "step": 122530 + }, + { + "epoch": 6.086222310519519, + "grad_norm": 0.1201171875, + "learning_rate": 0.00031310618853680347, + "loss": 0.5155, + "step": 122540 + }, + { + "epoch": 6.086718982815139, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003130664547531539, + "loss": 0.4958, + "step": 122550 + }, + { + "epoch": 6.087215655110758, + "grad_norm": 0.12890625, + "learning_rate": 0.0003130267209695043, + "loss": 0.4881, + "step": 122560 + }, + { + "epoch": 6.087712327406377, + "grad_norm": 0.162109375, + "learning_rate": 0.0003129869871858548, + "loss": 0.4943, + "step": 122570 + }, + { + "epoch": 6.088208999701997, + "grad_norm": 0.1328125, + "learning_rate": 0.00031294725340220525, + "loss": 0.5165, + "step": 122580 + }, + { + "epoch": 6.088705671997616, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003129075196185557, + "loss": 0.5117, + "step": 122590 + }, + { + "epoch": 6.089202344293235, + "grad_norm": 0.1298828125, + "learning_rate": 0.00031286778583490614, + "loss": 0.5006, + "step": 122600 + }, + { + "epoch": 6.089699016588854, + "grad_norm": 0.1328125, + "learning_rate": 0.00031282805205125656, + "loss": 0.4859, + "step": 122610 + }, + { + "epoch": 6.090195688884474, + "grad_norm": 0.1767578125, + "learning_rate": 0.0003127883182676071, + "loss": 0.522, + "step": 122620 + }, + { + "epoch": 6.090692361180094, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003127485844839575, + "loss": 0.5042, + "step": 122630 + }, + { + "epoch": 6.091189033475713, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003127088507003079, + "loss": 0.4801, + "step": 122640 + }, + { + "epoch": 6.091685705771332, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003126691169166584, + "loss": 0.4733, + "step": 122650 + }, + { + "epoch": 6.092182378066951, + "grad_norm": 0.16015625, + "learning_rate": 0.00031262938313300886, + "loss": 0.497, + "step": 122660 + }, + { + "epoch": 6.0926790503625705, + "grad_norm": 0.1416015625, + "learning_rate": 0.00031258964934935933, + "loss": 0.4845, + "step": 122670 + }, + { + "epoch": 6.09317572265819, + "grad_norm": 0.1259765625, + "learning_rate": 0.00031254991556570975, + "loss": 0.4707, + "step": 122680 + }, + { + "epoch": 6.09367239495381, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003125101817820602, + "loss": 0.4533, + "step": 122690 + }, + { + "epoch": 6.094169067249429, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003124704479984107, + "loss": 0.5105, + "step": 122700 + }, + { + "epoch": 6.094665739545048, + "grad_norm": 0.1025390625, + "learning_rate": 0.0003124307142147611, + "loss": 0.4662, + "step": 122710 + }, + { + "epoch": 6.0951624118406675, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003123909804311115, + "loss": 0.474, + "step": 122720 + }, + { + "epoch": 6.095659084136287, + "grad_norm": 0.15234375, + "learning_rate": 0.00031235124664746205, + "loss": 0.4727, + "step": 122730 + }, + { + "epoch": 6.096155756431906, + "grad_norm": 0.1435546875, + "learning_rate": 0.00031231151286381247, + "loss": 0.4855, + "step": 122740 + }, + { + "epoch": 6.096652428727525, + "grad_norm": 0.125, + "learning_rate": 0.00031227177908016294, + "loss": 0.4932, + "step": 122750 + }, + { + "epoch": 6.097149101023145, + "grad_norm": 0.125, + "learning_rate": 0.00031223204529651335, + "loss": 0.4787, + "step": 122760 + }, + { + "epoch": 6.097645773318765, + "grad_norm": 0.1328125, + "learning_rate": 0.0003121923115128638, + "loss": 0.4811, + "step": 122770 + }, + { + "epoch": 6.098142445614384, + "grad_norm": 0.119140625, + "learning_rate": 0.0003121525777292143, + "loss": 0.5072, + "step": 122780 + }, + { + "epoch": 6.098639117910003, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003121128439455647, + "loss": 0.4753, + "step": 122790 + }, + { + "epoch": 6.099135790205622, + "grad_norm": 0.1865234375, + "learning_rate": 0.0003120731101619152, + "loss": 0.4942, + "step": 122800 + }, + { + "epoch": 6.0996324625012415, + "grad_norm": 0.134765625, + "learning_rate": 0.00031203337637826566, + "loss": 0.5047, + "step": 122810 + }, + { + "epoch": 6.100129134796861, + "grad_norm": 0.12890625, + "learning_rate": 0.0003119936425946161, + "loss": 0.4998, + "step": 122820 + }, + { + "epoch": 6.100625807092481, + "grad_norm": 0.12353515625, + "learning_rate": 0.00031195390881096655, + "loss": 0.5084, + "step": 122830 + }, + { + "epoch": 6.1011224793881, + "grad_norm": 0.12890625, + "learning_rate": 0.000311914175027317, + "loss": 0.4893, + "step": 122840 + }, + { + "epoch": 6.101619151683719, + "grad_norm": 0.12353515625, + "learning_rate": 0.00031187444124366743, + "loss": 0.4915, + "step": 122850 + }, + { + "epoch": 6.1021158239793385, + "grad_norm": 0.11767578125, + "learning_rate": 0.0003118347074600179, + "loss": 0.4839, + "step": 122860 + }, + { + "epoch": 6.102612496274958, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003117949736763683, + "loss": 0.4837, + "step": 122870 + }, + { + "epoch": 6.103109168570577, + "grad_norm": 0.130859375, + "learning_rate": 0.0003117552398927188, + "loss": 0.5083, + "step": 122880 + }, + { + "epoch": 6.103605840866196, + "grad_norm": 0.1259765625, + "learning_rate": 0.00031171550610906927, + "loss": 0.477, + "step": 122890 + }, + { + "epoch": 6.104102513161815, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003116757723254197, + "loss": 0.4669, + "step": 122900 + }, + { + "epoch": 6.1045991854574355, + "grad_norm": 0.158203125, + "learning_rate": 0.00031163603854177015, + "loss": 0.5053, + "step": 122910 + }, + { + "epoch": 6.105095857753055, + "grad_norm": 0.12890625, + "learning_rate": 0.0003115963047581206, + "loss": 0.4852, + "step": 122920 + }, + { + "epoch": 6.105592530048674, + "grad_norm": 0.2294921875, + "learning_rate": 0.00031155657097447104, + "loss": 0.4933, + "step": 122930 + }, + { + "epoch": 6.106089202344293, + "grad_norm": 0.15625, + "learning_rate": 0.0003115168371908215, + "loss": 0.4907, + "step": 122940 + }, + { + "epoch": 6.106585874639912, + "grad_norm": 0.1396484375, + "learning_rate": 0.00031147710340717193, + "loss": 0.473, + "step": 122950 + }, + { + "epoch": 6.107082546935532, + "grad_norm": 0.1259765625, + "learning_rate": 0.00031143736962352246, + "loss": 0.4875, + "step": 122960 + }, + { + "epoch": 6.107579219231151, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003113976358398729, + "loss": 0.4991, + "step": 122970 + }, + { + "epoch": 6.108075891526771, + "grad_norm": 0.1533203125, + "learning_rate": 0.0003113579020562233, + "loss": 0.5012, + "step": 122980 + }, + { + "epoch": 6.10857256382239, + "grad_norm": 0.12158203125, + "learning_rate": 0.00031131816827257376, + "loss": 0.4928, + "step": 122990 + }, + { + "epoch": 6.1090692361180094, + "grad_norm": 0.203125, + "learning_rate": 0.00031127843448892423, + "loss": 0.5107, + "step": 123000 + }, + { + "epoch": 6.109565908413629, + "grad_norm": 0.126953125, + "learning_rate": 0.00031123870070527465, + "loss": 0.5096, + "step": 123010 + }, + { + "epoch": 6.110062580709248, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003111989669216251, + "loss": 0.5067, + "step": 123020 + }, + { + "epoch": 6.110559253004867, + "grad_norm": 0.11328125, + "learning_rate": 0.0003111592331379756, + "loss": 0.4906, + "step": 123030 + }, + { + "epoch": 6.111055925300486, + "grad_norm": 0.1796875, + "learning_rate": 0.00031111949935432606, + "loss": 0.5247, + "step": 123040 + }, + { + "epoch": 6.1115525975961065, + "grad_norm": 0.11376953125, + "learning_rate": 0.0003110797655706765, + "loss": 0.4742, + "step": 123050 + }, + { + "epoch": 6.112049269891726, + "grad_norm": 0.12890625, + "learning_rate": 0.0003110400317870269, + "loss": 0.5073, + "step": 123060 + }, + { + "epoch": 6.112545942187345, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003110002980033774, + "loss": 0.5044, + "step": 123070 + }, + { + "epoch": 6.113042614482964, + "grad_norm": 0.1396484375, + "learning_rate": 0.00031096056421972784, + "loss": 0.4577, + "step": 123080 + }, + { + "epoch": 6.113539286778583, + "grad_norm": 0.1357421875, + "learning_rate": 0.00031092083043607826, + "loss": 0.5072, + "step": 123090 + }, + { + "epoch": 6.114035959074203, + "grad_norm": 0.2177734375, + "learning_rate": 0.00031088109665242873, + "loss": 0.4861, + "step": 123100 + }, + { + "epoch": 6.114532631369822, + "grad_norm": 0.203125, + "learning_rate": 0.0003108413628687792, + "loss": 0.4976, + "step": 123110 + }, + { + "epoch": 6.115029303665442, + "grad_norm": 0.1123046875, + "learning_rate": 0.00031080162908512967, + "loss": 0.5007, + "step": 123120 + }, + { + "epoch": 6.115525975961061, + "grad_norm": 0.134765625, + "learning_rate": 0.0003107618953014801, + "loss": 0.508, + "step": 123130 + }, + { + "epoch": 6.11602264825668, + "grad_norm": 0.1318359375, + "learning_rate": 0.00031072216151783056, + "loss": 0.4942, + "step": 123140 + }, + { + "epoch": 6.1165193205523, + "grad_norm": 0.1376953125, + "learning_rate": 0.00031068242773418103, + "loss": 0.4963, + "step": 123150 + }, + { + "epoch": 6.117015992847919, + "grad_norm": 0.1552734375, + "learning_rate": 0.00031064269395053145, + "loss": 0.5308, + "step": 123160 + }, + { + "epoch": 6.117512665143538, + "grad_norm": 0.138671875, + "learning_rate": 0.00031060296016688187, + "loss": 0.513, + "step": 123170 + }, + { + "epoch": 6.118009337439157, + "grad_norm": 0.1767578125, + "learning_rate": 0.00031056322638323234, + "loss": 0.5011, + "step": 123180 + }, + { + "epoch": 6.118506009734777, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003105234925995828, + "loss": 0.4966, + "step": 123190 + }, + { + "epoch": 6.119002682030397, + "grad_norm": 0.1328125, + "learning_rate": 0.0003104837588159333, + "loss": 0.5091, + "step": 123200 + }, + { + "epoch": 6.119499354326016, + "grad_norm": 0.138671875, + "learning_rate": 0.0003104440250322837, + "loss": 0.5286, + "step": 123210 + }, + { + "epoch": 6.119996026621635, + "grad_norm": 0.154296875, + "learning_rate": 0.00031040429124863417, + "loss": 0.5122, + "step": 123220 + }, + { + "epoch": 6.120492698917254, + "grad_norm": 0.125, + "learning_rate": 0.00031036455746498464, + "loss": 0.4795, + "step": 123230 + }, + { + "epoch": 6.120989371212874, + "grad_norm": 0.1455078125, + "learning_rate": 0.00031032482368133506, + "loss": 0.4663, + "step": 123240 + }, + { + "epoch": 6.121486043508493, + "grad_norm": 0.130859375, + "learning_rate": 0.00031028508989768553, + "loss": 0.5169, + "step": 123250 + }, + { + "epoch": 6.121982715804112, + "grad_norm": 0.130859375, + "learning_rate": 0.000310245356114036, + "loss": 0.5197, + "step": 123260 + }, + { + "epoch": 6.122479388099732, + "grad_norm": 0.115234375, + "learning_rate": 0.0003102056223303864, + "loss": 0.4805, + "step": 123270 + }, + { + "epoch": 6.122976060395351, + "grad_norm": 0.119140625, + "learning_rate": 0.0003101658885467369, + "loss": 0.478, + "step": 123280 + }, + { + "epoch": 6.123472732690971, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003101261547630873, + "loss": 0.507, + "step": 123290 + }, + { + "epoch": 6.12396940498659, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003100864209794378, + "loss": 0.4978, + "step": 123300 + }, + { + "epoch": 6.124466077282209, + "grad_norm": 0.13671875, + "learning_rate": 0.00031004668719578825, + "loss": 0.4718, + "step": 123310 + }, + { + "epoch": 6.124962749577828, + "grad_norm": 0.13671875, + "learning_rate": 0.00031000695341213867, + "loss": 0.4664, + "step": 123320 + }, + { + "epoch": 6.1254594218734475, + "grad_norm": 0.1279296875, + "learning_rate": 0.00030996721962848914, + "loss": 0.4814, + "step": 123330 + }, + { + "epoch": 6.125956094169068, + "grad_norm": 0.12890625, + "learning_rate": 0.0003099274858448396, + "loss": 0.5186, + "step": 123340 + }, + { + "epoch": 6.126452766464687, + "grad_norm": 0.1650390625, + "learning_rate": 0.00030988775206119, + "loss": 0.4886, + "step": 123350 + }, + { + "epoch": 6.126949438760306, + "grad_norm": 0.11328125, + "learning_rate": 0.0003098480182775405, + "loss": 0.514, + "step": 123360 + }, + { + "epoch": 6.127446111055925, + "grad_norm": 0.1484375, + "learning_rate": 0.00030980828449389097, + "loss": 0.4787, + "step": 123370 + }, + { + "epoch": 6.1279427833515445, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003097685507102414, + "loss": 0.4772, + "step": 123380 + }, + { + "epoch": 6.128439455647164, + "grad_norm": 0.1298828125, + "learning_rate": 0.00030972881692659186, + "loss": 0.4977, + "step": 123390 + }, + { + "epoch": 6.128936127942783, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003096890831429423, + "loss": 0.4828, + "step": 123400 + }, + { + "epoch": 6.129432800238403, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003096493493592928, + "loss": 0.5129, + "step": 123410 + }, + { + "epoch": 6.129929472534022, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003096096155756432, + "loss": 0.4896, + "step": 123420 + }, + { + "epoch": 6.1304261448296415, + "grad_norm": 0.12890625, + "learning_rate": 0.00030956988179199363, + "loss": 0.5021, + "step": 123430 + }, + { + "epoch": 6.130922817125261, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003095301480083441, + "loss": 0.5188, + "step": 123440 + }, + { + "epoch": 6.13141948942088, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003094904142246946, + "loss": 0.4963, + "step": 123450 + }, + { + "epoch": 6.131916161716499, + "grad_norm": 0.1279296875, + "learning_rate": 0.000309450680441045, + "loss": 0.4983, + "step": 123460 + }, + { + "epoch": 6.1324128340121185, + "grad_norm": 0.12060546875, + "learning_rate": 0.00030941094665739547, + "loss": 0.4857, + "step": 123470 + }, + { + "epoch": 6.132909506307739, + "grad_norm": 0.138671875, + "learning_rate": 0.0003093712128737459, + "loss": 0.4902, + "step": 123480 + }, + { + "epoch": 6.133406178603358, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003093314790900964, + "loss": 0.4763, + "step": 123490 + }, + { + "epoch": 6.133902850898977, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003092917453064468, + "loss": 0.4937, + "step": 123500 + }, + { + "epoch": 6.134399523194596, + "grad_norm": 0.142578125, + "learning_rate": 0.00030925201152279724, + "loss": 0.4926, + "step": 123510 + }, + { + "epoch": 6.1348961954902155, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003092122777391477, + "loss": 0.5156, + "step": 123520 + }, + { + "epoch": 6.135392867785835, + "grad_norm": 0.115234375, + "learning_rate": 0.0003091725439554982, + "loss": 0.4528, + "step": 123530 + }, + { + "epoch": 6.135889540081454, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003091328101718486, + "loss": 0.4661, + "step": 123540 + }, + { + "epoch": 6.136386212377074, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003090930763881991, + "loss": 0.4997, + "step": 123550 + }, + { + "epoch": 6.136882884672693, + "grad_norm": 0.125, + "learning_rate": 0.00030905334260454954, + "loss": 0.5024, + "step": 123560 + }, + { + "epoch": 6.1373795569683125, + "grad_norm": 0.1474609375, + "learning_rate": 0.0003090136088209, + "loss": 0.5092, + "step": 123570 + }, + { + "epoch": 6.137876229263932, + "grad_norm": 0.1259765625, + "learning_rate": 0.00030897387503725043, + "loss": 0.462, + "step": 123580 + }, + { + "epoch": 6.138372901559551, + "grad_norm": 0.11962890625, + "learning_rate": 0.00030893414125360085, + "loss": 0.5407, + "step": 123590 + }, + { + "epoch": 6.13886957385517, + "grad_norm": 0.15234375, + "learning_rate": 0.0003088944074699514, + "loss": 0.4835, + "step": 123600 + }, + { + "epoch": 6.139366246150789, + "grad_norm": 0.16015625, + "learning_rate": 0.0003088546736863018, + "loss": 0.5255, + "step": 123610 + }, + { + "epoch": 6.139862918446409, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003088149399026522, + "loss": 0.4959, + "step": 123620 + }, + { + "epoch": 6.140359590742029, + "grad_norm": 0.126953125, + "learning_rate": 0.0003087752061190027, + "loss": 0.4974, + "step": 123630 + }, + { + "epoch": 6.140856263037648, + "grad_norm": 0.158203125, + "learning_rate": 0.00030873547233535315, + "loss": 0.4779, + "step": 123640 + }, + { + "epoch": 6.141352935333267, + "grad_norm": 0.1591796875, + "learning_rate": 0.0003086957385517036, + "loss": 0.51, + "step": 123650 + }, + { + "epoch": 6.141849607628886, + "grad_norm": 0.150390625, + "learning_rate": 0.00030865600476805404, + "loss": 0.4807, + "step": 123660 + }, + { + "epoch": 6.142346279924506, + "grad_norm": 0.130859375, + "learning_rate": 0.0003086162709844045, + "loss": 0.482, + "step": 123670 + }, + { + "epoch": 6.142842952220125, + "grad_norm": 0.1318359375, + "learning_rate": 0.000308576537200755, + "loss": 0.4833, + "step": 123680 + }, + { + "epoch": 6.143339624515744, + "grad_norm": 0.12060546875, + "learning_rate": 0.0003085368034171054, + "loss": 0.4663, + "step": 123690 + }, + { + "epoch": 6.143836296811364, + "grad_norm": 0.14453125, + "learning_rate": 0.00030849706963345587, + "loss": 0.5121, + "step": 123700 + }, + { + "epoch": 6.1443329691069835, + "grad_norm": 0.15234375, + "learning_rate": 0.0003084573358498063, + "loss": 0.5233, + "step": 123710 + }, + { + "epoch": 6.144829641402603, + "grad_norm": 0.1298828125, + "learning_rate": 0.00030841760206615676, + "loss": 0.4788, + "step": 123720 + }, + { + "epoch": 6.145326313698222, + "grad_norm": 0.1220703125, + "learning_rate": 0.00030837786828250723, + "loss": 0.5001, + "step": 123730 + }, + { + "epoch": 6.145822985993841, + "grad_norm": 0.1259765625, + "learning_rate": 0.00030833813449885765, + "loss": 0.4717, + "step": 123740 + }, + { + "epoch": 6.14631965828946, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003082984007152081, + "loss": 0.4938, + "step": 123750 + }, + { + "epoch": 6.14681633058508, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003082586669315586, + "loss": 0.4876, + "step": 123760 + }, + { + "epoch": 6.1473130028807, + "grad_norm": 0.1318359375, + "learning_rate": 0.000308218933147909, + "loss": 0.5093, + "step": 123770 + }, + { + "epoch": 6.147809675176319, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003081791993642595, + "loss": 0.4692, + "step": 123780 + }, + { + "epoch": 6.148306347471938, + "grad_norm": 0.1474609375, + "learning_rate": 0.00030813946558060995, + "loss": 0.5277, + "step": 123790 + }, + { + "epoch": 6.148803019767557, + "grad_norm": 0.126953125, + "learning_rate": 0.00030809973179696037, + "loss": 0.4778, + "step": 123800 + }, + { + "epoch": 6.149299692063177, + "grad_norm": 0.1611328125, + "learning_rate": 0.00030805999801331084, + "loss": 0.4829, + "step": 123810 + }, + { + "epoch": 6.149796364358796, + "grad_norm": 0.126953125, + "learning_rate": 0.00030802026422966126, + "loss": 0.5135, + "step": 123820 + }, + { + "epoch": 6.150293036654415, + "grad_norm": 0.12353515625, + "learning_rate": 0.00030798053044601173, + "loss": 0.4854, + "step": 123830 + }, + { + "epoch": 6.150789708950035, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003079407966623622, + "loss": 0.4766, + "step": 123840 + }, + { + "epoch": 6.151286381245654, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003079010628787126, + "loss": 0.4788, + "step": 123850 + }, + { + "epoch": 6.151783053541274, + "grad_norm": 0.12890625, + "learning_rate": 0.0003078613290950631, + "loss": 0.4611, + "step": 123860 + }, + { + "epoch": 6.152279725836893, + "grad_norm": 0.140625, + "learning_rate": 0.00030782159531141356, + "loss": 0.4839, + "step": 123870 + }, + { + "epoch": 6.152776398132512, + "grad_norm": 0.126953125, + "learning_rate": 0.000307781861527764, + "loss": 0.4894, + "step": 123880 + }, + { + "epoch": 6.153273070428131, + "grad_norm": 0.1279296875, + "learning_rate": 0.00030774212774411445, + "loss": 0.5046, + "step": 123890 + }, + { + "epoch": 6.1537697427237505, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003077023939604649, + "loss": 0.4849, + "step": 123900 + }, + { + "epoch": 6.15426641501937, + "grad_norm": 0.1708984375, + "learning_rate": 0.00030766266017681534, + "loss": 0.4949, + "step": 123910 + }, + { + "epoch": 6.15476308731499, + "grad_norm": 0.126953125, + "learning_rate": 0.0003076229263931658, + "loss": 0.4976, + "step": 123920 + }, + { + "epoch": 6.155259759610609, + "grad_norm": 0.162109375, + "learning_rate": 0.0003075831926095162, + "loss": 0.4847, + "step": 123930 + }, + { + "epoch": 6.155756431906228, + "grad_norm": 0.1162109375, + "learning_rate": 0.00030754345882586675, + "loss": 0.479, + "step": 123940 + }, + { + "epoch": 6.156253104201848, + "grad_norm": 0.12890625, + "learning_rate": 0.00030750372504221717, + "loss": 0.5073, + "step": 123950 + }, + { + "epoch": 6.156749776497467, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003074639912585676, + "loss": 0.5041, + "step": 123960 + }, + { + "epoch": 6.157246448793086, + "grad_norm": 0.150390625, + "learning_rate": 0.00030742425747491806, + "loss": 0.4983, + "step": 123970 + }, + { + "epoch": 6.157743121088705, + "grad_norm": 0.1435546875, + "learning_rate": 0.00030738452369126853, + "loss": 0.4878, + "step": 123980 + }, + { + "epoch": 6.158239793384325, + "grad_norm": 0.130859375, + "learning_rate": 0.00030734478990761895, + "loss": 0.4951, + "step": 123990 + }, + { + "epoch": 6.158736465679945, + "grad_norm": 0.1435546875, + "learning_rate": 0.0003073050561239694, + "loss": 0.515, + "step": 124000 + }, + { + "epoch": 6.159233137975564, + "grad_norm": 0.1513671875, + "learning_rate": 0.00030726532234031983, + "loss": 0.4662, + "step": 124010 + }, + { + "epoch": 6.159729810271183, + "grad_norm": 0.13671875, + "learning_rate": 0.00030722558855667036, + "loss": 0.4846, + "step": 124020 + }, + { + "epoch": 6.160226482566802, + "grad_norm": 0.2216796875, + "learning_rate": 0.0003071858547730208, + "loss": 0.4826, + "step": 124030 + }, + { + "epoch": 6.1607231548624215, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003071461209893712, + "loss": 0.5076, + "step": 124040 + }, + { + "epoch": 6.161219827158041, + "grad_norm": 0.1328125, + "learning_rate": 0.00030710638720572167, + "loss": 0.4848, + "step": 124050 + }, + { + "epoch": 6.161716499453661, + "grad_norm": 0.16796875, + "learning_rate": 0.00030706665342207214, + "loss": 0.4963, + "step": 124060 + }, + { + "epoch": 6.16221317174928, + "grad_norm": 0.134765625, + "learning_rate": 0.0003070269196384226, + "loss": 0.4908, + "step": 124070 + }, + { + "epoch": 6.162709844044899, + "grad_norm": 0.142578125, + "learning_rate": 0.000306987185854773, + "loss": 0.5055, + "step": 124080 + }, + { + "epoch": 6.1632065163405185, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003069474520711235, + "loss": 0.5082, + "step": 124090 + }, + { + "epoch": 6.163703188636138, + "grad_norm": 0.1513671875, + "learning_rate": 0.00030690771828747397, + "loss": 0.5017, + "step": 124100 + }, + { + "epoch": 6.164199860931757, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003068679845038244, + "loss": 0.5244, + "step": 124110 + }, + { + "epoch": 6.164696533227376, + "grad_norm": 0.123046875, + "learning_rate": 0.0003068282507201748, + "loss": 0.4966, + "step": 124120 + }, + { + "epoch": 6.165193205522996, + "grad_norm": 0.11767578125, + "learning_rate": 0.00030678851693652533, + "loss": 0.4702, + "step": 124130 + }, + { + "epoch": 6.1656898778186155, + "grad_norm": 0.1826171875, + "learning_rate": 0.00030674878315287575, + "loss": 0.4807, + "step": 124140 + }, + { + "epoch": 6.166186550114235, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003067090493692262, + "loss": 0.4972, + "step": 124150 + }, + { + "epoch": 6.166683222409854, + "grad_norm": 0.1298828125, + "learning_rate": 0.00030666931558557663, + "loss": 0.4697, + "step": 124160 + }, + { + "epoch": 6.167179894705473, + "grad_norm": 0.1259765625, + "learning_rate": 0.0003066295818019271, + "loss": 0.4771, + "step": 124170 + }, + { + "epoch": 6.1676765670010925, + "grad_norm": 0.1552734375, + "learning_rate": 0.0003065898480182776, + "loss": 0.4711, + "step": 124180 + }, + { + "epoch": 6.168173239296712, + "grad_norm": 0.126953125, + "learning_rate": 0.000306550114234628, + "loss": 0.4897, + "step": 124190 + }, + { + "epoch": 6.168669911592332, + "grad_norm": 0.1376953125, + "learning_rate": 0.00030651038045097846, + "loss": 0.4892, + "step": 124200 + }, + { + "epoch": 6.169166583887951, + "grad_norm": 0.126953125, + "learning_rate": 0.00030647064666732894, + "loss": 0.5014, + "step": 124210 + }, + { + "epoch": 6.16966325618357, + "grad_norm": 0.150390625, + "learning_rate": 0.00030643091288367935, + "loss": 0.5003, + "step": 124220 + }, + { + "epoch": 6.1701599284791895, + "grad_norm": 0.2001953125, + "learning_rate": 0.0003063911791000298, + "loss": 0.4752, + "step": 124230 + }, + { + "epoch": 6.170656600774809, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003063514453163803, + "loss": 0.5083, + "step": 124240 + }, + { + "epoch": 6.171153273070428, + "grad_norm": 0.1240234375, + "learning_rate": 0.0003063117115327307, + "loss": 0.487, + "step": 124250 + }, + { + "epoch": 6.171649945366047, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003062719777490812, + "loss": 0.4781, + "step": 124260 + }, + { + "epoch": 6.172146617661667, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003062322439654316, + "loss": 0.5144, + "step": 124270 + }, + { + "epoch": 6.1726432899572865, + "grad_norm": 0.146484375, + "learning_rate": 0.0003061925101817821, + "loss": 0.4547, + "step": 124280 + }, + { + "epoch": 6.173139962252906, + "grad_norm": 0.1259765625, + "learning_rate": 0.00030615277639813254, + "loss": 0.5286, + "step": 124290 + }, + { + "epoch": 6.173636634548525, + "grad_norm": 0.14453125, + "learning_rate": 0.00030611304261448296, + "loss": 0.4818, + "step": 124300 + }, + { + "epoch": 6.174133306844144, + "grad_norm": 0.1474609375, + "learning_rate": 0.00030607330883083343, + "loss": 0.4961, + "step": 124310 + }, + { + "epoch": 6.174629979139763, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003060335750471839, + "loss": 0.4753, + "step": 124320 + }, + { + "epoch": 6.175126651435383, + "grad_norm": 0.130859375, + "learning_rate": 0.0003059938412635343, + "loss": 0.4897, + "step": 124330 + }, + { + "epoch": 6.175623323731002, + "grad_norm": 0.1318359375, + "learning_rate": 0.0003059541074798848, + "loss": 0.4628, + "step": 124340 + }, + { + "epoch": 6.176119996026622, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003059143736962352, + "loss": 0.5122, + "step": 124350 + }, + { + "epoch": 6.176616668322241, + "grad_norm": 0.11962890625, + "learning_rate": 0.0003058746399125857, + "loss": 0.4718, + "step": 124360 + }, + { + "epoch": 6.17711334061786, + "grad_norm": 0.130859375, + "learning_rate": 0.00030583490612893615, + "loss": 0.495, + "step": 124370 + }, + { + "epoch": 6.17761001291348, + "grad_norm": 0.11572265625, + "learning_rate": 0.00030579517234528657, + "loss": 0.4725, + "step": 124380 + }, + { + "epoch": 6.178106685209099, + "grad_norm": 0.16015625, + "learning_rate": 0.00030575543856163704, + "loss": 0.4962, + "step": 124390 + }, + { + "epoch": 6.178603357504718, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003057157047779875, + "loss": 0.4841, + "step": 124400 + }, + { + "epoch": 6.179100029800337, + "grad_norm": 0.11962890625, + "learning_rate": 0.00030567597099433793, + "loss": 0.4938, + "step": 124410 + }, + { + "epoch": 6.1795967020959575, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003056362372106884, + "loss": 0.4639, + "step": 124420 + }, + { + "epoch": 6.180093374391577, + "grad_norm": 0.1875, + "learning_rate": 0.00030559650342703887, + "loss": 0.4921, + "step": 124430 + }, + { + "epoch": 6.180590046687196, + "grad_norm": 0.126953125, + "learning_rate": 0.0003055567696433893, + "loss": 0.4631, + "step": 124440 + }, + { + "epoch": 6.181086718982815, + "grad_norm": 0.12353515625, + "learning_rate": 0.00030551703585973976, + "loss": 0.4965, + "step": 124450 + }, + { + "epoch": 6.181583391278434, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003054773020760902, + "loss": 0.4815, + "step": 124460 + }, + { + "epoch": 6.182080063574054, + "grad_norm": 0.125, + "learning_rate": 0.0003054375682924407, + "loss": 0.4847, + "step": 124470 + }, + { + "epoch": 6.182576735869673, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003053978345087911, + "loss": 0.4787, + "step": 124480 + }, + { + "epoch": 6.183073408165293, + "grad_norm": 0.126953125, + "learning_rate": 0.00030535810072514154, + "loss": 0.4774, + "step": 124490 + }, + { + "epoch": 6.183570080460912, + "grad_norm": 0.138671875, + "learning_rate": 0.000305318366941492, + "loss": 0.4949, + "step": 124500 + }, + { + "epoch": 6.184066752756531, + "grad_norm": 0.1328125, + "learning_rate": 0.0003052786331578425, + "loss": 0.4976, + "step": 124510 + }, + { + "epoch": 6.184563425052151, + "grad_norm": 0.158203125, + "learning_rate": 0.00030523889937419295, + "loss": 0.4635, + "step": 124520 + }, + { + "epoch": 6.18506009734777, + "grad_norm": 0.12158203125, + "learning_rate": 0.00030519916559054337, + "loss": 0.4866, + "step": 124530 + }, + { + "epoch": 6.185556769643389, + "grad_norm": 0.140625, + "learning_rate": 0.00030515943180689384, + "loss": 0.4859, + "step": 124540 + }, + { + "epoch": 6.186053441939008, + "grad_norm": 0.1533203125, + "learning_rate": 0.0003051196980232443, + "loss": 0.4943, + "step": 124550 + }, + { + "epoch": 6.186550114234628, + "grad_norm": 0.1552734375, + "learning_rate": 0.00030507996423959473, + "loss": 0.4914, + "step": 124560 + }, + { + "epoch": 6.187046786530248, + "grad_norm": 0.109375, + "learning_rate": 0.00030504023045594515, + "loss": 0.5256, + "step": 124570 + }, + { + "epoch": 6.187543458825867, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003050004966722956, + "loss": 0.5115, + "step": 124580 + }, + { + "epoch": 6.188040131121486, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003049607628886461, + "loss": 0.5121, + "step": 124590 + }, + { + "epoch": 6.188536803417105, + "grad_norm": 0.1416015625, + "learning_rate": 0.00030492102910499656, + "loss": 0.4957, + "step": 124600 + }, + { + "epoch": 6.1890334757127246, + "grad_norm": 0.1416015625, + "learning_rate": 0.000304881295321347, + "loss": 0.5048, + "step": 124610 + }, + { + "epoch": 6.189530148008344, + "grad_norm": 0.1572265625, + "learning_rate": 0.00030484156153769745, + "loss": 0.4986, + "step": 124620 + }, + { + "epoch": 6.190026820303963, + "grad_norm": 0.140625, + "learning_rate": 0.0003048018277540479, + "loss": 0.5007, + "step": 124630 + }, + { + "epoch": 6.190523492599583, + "grad_norm": 0.12060546875, + "learning_rate": 0.00030476209397039834, + "loss": 0.4803, + "step": 124640 + }, + { + "epoch": 6.191020164895202, + "grad_norm": 0.1328125, + "learning_rate": 0.00030472236018674875, + "loss": 0.5185, + "step": 124650 + }, + { + "epoch": 6.191516837190822, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003046826264030993, + "loss": 0.4786, + "step": 124660 + }, + { + "epoch": 6.192013509486441, + "grad_norm": 0.150390625, + "learning_rate": 0.0003046428926194497, + "loss": 0.513, + "step": 124670 + }, + { + "epoch": 6.19251018178206, + "grad_norm": 0.1552734375, + "learning_rate": 0.00030460315883580017, + "loss": 0.4815, + "step": 124680 + }, + { + "epoch": 6.193006854077679, + "grad_norm": 0.2041015625, + "learning_rate": 0.0003045634250521506, + "loss": 0.5076, + "step": 124690 + }, + { + "epoch": 6.1935035263732985, + "grad_norm": 0.12451171875, + "learning_rate": 0.00030452369126850106, + "loss": 0.4995, + "step": 124700 + }, + { + "epoch": 6.194000198668919, + "grad_norm": 0.1533203125, + "learning_rate": 0.00030448395748485153, + "loss": 0.4794, + "step": 124710 + }, + { + "epoch": 6.194496870964538, + "grad_norm": 0.126953125, + "learning_rate": 0.00030444422370120195, + "loss": 0.5052, + "step": 124720 + }, + { + "epoch": 6.194993543260157, + "grad_norm": 0.150390625, + "learning_rate": 0.0003044044899175524, + "loss": 0.4816, + "step": 124730 + }, + { + "epoch": 6.195490215555776, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003043647561339029, + "loss": 0.4861, + "step": 124740 + }, + { + "epoch": 6.1959868878513955, + "grad_norm": 0.123046875, + "learning_rate": 0.0003043250223502533, + "loss": 0.4902, + "step": 124750 + }, + { + "epoch": 6.196483560147015, + "grad_norm": 0.11572265625, + "learning_rate": 0.0003042852885666038, + "loss": 0.4454, + "step": 124760 + }, + { + "epoch": 6.196980232442634, + "grad_norm": 0.126953125, + "learning_rate": 0.00030424555478295425, + "loss": 0.5016, + "step": 124770 + }, + { + "epoch": 6.197476904738254, + "grad_norm": 0.12451171875, + "learning_rate": 0.00030420582099930467, + "loss": 0.5039, + "step": 124780 + }, + { + "epoch": 6.197973577033873, + "grad_norm": 0.1552734375, + "learning_rate": 0.00030416608721565514, + "loss": 0.5065, + "step": 124790 + }, + { + "epoch": 6.1984702493294925, + "grad_norm": 0.11865234375, + "learning_rate": 0.00030412635343200555, + "loss": 0.4974, + "step": 124800 + }, + { + "epoch": 6.198966921625112, + "grad_norm": 0.1572265625, + "learning_rate": 0.000304086619648356, + "loss": 0.4879, + "step": 124810 + }, + { + "epoch": 6.199463593920731, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003040468858647065, + "loss": 0.5344, + "step": 124820 + }, + { + "epoch": 6.19996026621635, + "grad_norm": 0.1328125, + "learning_rate": 0.0003040071520810569, + "loss": 0.4836, + "step": 124830 + }, + { + "epoch": 6.200456938511969, + "grad_norm": 0.125, + "learning_rate": 0.0003039674182974074, + "loss": 0.4812, + "step": 124840 + }, + { + "epoch": 6.2009536108075896, + "grad_norm": 0.15234375, + "learning_rate": 0.00030392768451375786, + "loss": 0.4752, + "step": 124850 + }, + { + "epoch": 6.201450283103209, + "grad_norm": 0.126953125, + "learning_rate": 0.0003038879507301083, + "loss": 0.4966, + "step": 124860 + }, + { + "epoch": 6.201946955398828, + "grad_norm": 0.1376953125, + "learning_rate": 0.00030384821694645874, + "loss": 0.5055, + "step": 124870 + }, + { + "epoch": 6.202443627694447, + "grad_norm": 0.11767578125, + "learning_rate": 0.00030380848316280916, + "loss": 0.491, + "step": 124880 + }, + { + "epoch": 6.2029402999900665, + "grad_norm": 0.12109375, + "learning_rate": 0.00030376874937915963, + "loss": 0.4983, + "step": 124890 + }, + { + "epoch": 6.203436972285686, + "grad_norm": 0.1484375, + "learning_rate": 0.0003037290155955101, + "loss": 0.4944, + "step": 124900 + }, + { + "epoch": 6.203933644581305, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003036892818118605, + "loss": 0.4925, + "step": 124910 + }, + { + "epoch": 6.204430316876925, + "grad_norm": 0.150390625, + "learning_rate": 0.000303649548028211, + "loss": 0.49, + "step": 124920 + }, + { + "epoch": 6.204926989172544, + "grad_norm": 0.1484375, + "learning_rate": 0.00030360981424456146, + "loss": 0.4816, + "step": 124930 + }, + { + "epoch": 6.2054236614681635, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003035700804609119, + "loss": 0.4919, + "step": 124940 + }, + { + "epoch": 6.205920333763783, + "grad_norm": 0.125, + "learning_rate": 0.00030353034667726235, + "loss": 0.4759, + "step": 124950 + }, + { + "epoch": 6.206417006059402, + "grad_norm": 0.2236328125, + "learning_rate": 0.0003034906128936128, + "loss": 0.5108, + "step": 124960 + }, + { + "epoch": 6.206913678355021, + "grad_norm": 0.1416015625, + "learning_rate": 0.0003034508791099633, + "loss": 0.4897, + "step": 124970 + }, + { + "epoch": 6.20741035065064, + "grad_norm": 0.1611328125, + "learning_rate": 0.0003034111453263137, + "loss": 0.4982, + "step": 124980 + }, + { + "epoch": 6.2079070229462605, + "grad_norm": 0.1552734375, + "learning_rate": 0.00030337141154266413, + "loss": 0.5029, + "step": 124990 + }, + { + "epoch": 6.20840369524188, + "grad_norm": 0.11767578125, + "learning_rate": 0.00030333167775901466, + "loss": 0.4894, + "step": 125000 + }, + { + "epoch": 6.208900367537499, + "grad_norm": 0.181640625, + "learning_rate": 0.00030329194397536507, + "loss": 0.4799, + "step": 125010 + }, + { + "epoch": 6.209397039833118, + "grad_norm": 0.142578125, + "learning_rate": 0.0003032522101917155, + "loss": 0.5078, + "step": 125020 + }, + { + "epoch": 6.209893712128737, + "grad_norm": 0.1181640625, + "learning_rate": 0.00030321247640806596, + "loss": 0.5166, + "step": 125030 + }, + { + "epoch": 6.210390384424357, + "grad_norm": 0.1376953125, + "learning_rate": 0.00030317274262441643, + "loss": 0.4705, + "step": 125040 + }, + { + "epoch": 6.210887056719976, + "grad_norm": 0.125, + "learning_rate": 0.0003031330088407669, + "loss": 0.4962, + "step": 125050 + }, + { + "epoch": 6.211383729015595, + "grad_norm": 0.1494140625, + "learning_rate": 0.0003030932750571173, + "loss": 0.4978, + "step": 125060 + }, + { + "epoch": 6.211880401311215, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003030535412734678, + "loss": 0.4915, + "step": 125070 + }, + { + "epoch": 6.212377073606834, + "grad_norm": 0.12353515625, + "learning_rate": 0.00030301380748981826, + "loss": 0.463, + "step": 125080 + }, + { + "epoch": 6.212873745902454, + "grad_norm": 0.138671875, + "learning_rate": 0.0003029740737061687, + "loss": 0.4876, + "step": 125090 + }, + { + "epoch": 6.213370418198073, + "grad_norm": 0.150390625, + "learning_rate": 0.0003029343399225191, + "loss": 0.5183, + "step": 125100 + }, + { + "epoch": 6.213867090493692, + "grad_norm": 0.1513671875, + "learning_rate": 0.0003028946061388696, + "loss": 0.5035, + "step": 125110 + }, + { + "epoch": 6.214363762789311, + "grad_norm": 0.11865234375, + "learning_rate": 0.00030285487235522004, + "loss": 0.5088, + "step": 125120 + }, + { + "epoch": 6.214860435084931, + "grad_norm": 0.1328125, + "learning_rate": 0.0003028151385715705, + "loss": 0.4658, + "step": 125130 + }, + { + "epoch": 6.215357107380551, + "grad_norm": 0.1357421875, + "learning_rate": 0.00030277540478792093, + "loss": 0.4993, + "step": 125140 + }, + { + "epoch": 6.21585377967617, + "grad_norm": 0.13671875, + "learning_rate": 0.0003027356710042714, + "loss": 0.5285, + "step": 125150 + }, + { + "epoch": 6.216350451971789, + "grad_norm": 0.1669921875, + "learning_rate": 0.00030269593722062187, + "loss": 0.4791, + "step": 125160 + }, + { + "epoch": 6.216847124267408, + "grad_norm": 0.1669921875, + "learning_rate": 0.0003026562034369723, + "loss": 0.5056, + "step": 125170 + }, + { + "epoch": 6.217343796563028, + "grad_norm": 0.119140625, + "learning_rate": 0.0003026164696533227, + "loss": 0.5113, + "step": 125180 + }, + { + "epoch": 6.217840468858647, + "grad_norm": 0.1376953125, + "learning_rate": 0.00030257673586967323, + "loss": 0.4772, + "step": 125190 + }, + { + "epoch": 6.218337141154266, + "grad_norm": 0.1552734375, + "learning_rate": 0.00030253700208602365, + "loss": 0.4725, + "step": 125200 + }, + { + "epoch": 6.218833813449886, + "grad_norm": 0.123046875, + "learning_rate": 0.0003024972683023741, + "loss": 0.4514, + "step": 125210 + }, + { + "epoch": 6.219330485745505, + "grad_norm": 0.1611328125, + "learning_rate": 0.00030245753451872454, + "loss": 0.4987, + "step": 125220 + }, + { + "epoch": 6.219827158041125, + "grad_norm": 0.12158203125, + "learning_rate": 0.000302417800735075, + "loss": 0.5278, + "step": 125230 + }, + { + "epoch": 6.220323830336744, + "grad_norm": 0.142578125, + "learning_rate": 0.0003023780669514255, + "loss": 0.4924, + "step": 125240 + }, + { + "epoch": 6.220820502632363, + "grad_norm": 0.14453125, + "learning_rate": 0.0003023383331677759, + "loss": 0.4994, + "step": 125250 + }, + { + "epoch": 6.221317174927982, + "grad_norm": 0.134765625, + "learning_rate": 0.00030229859938412637, + "loss": 0.5085, + "step": 125260 + }, + { + "epoch": 6.2218138472236015, + "grad_norm": 0.1376953125, + "learning_rate": 0.00030225886560047684, + "loss": 0.5046, + "step": 125270 + }, + { + "epoch": 6.222310519519221, + "grad_norm": 0.1337890625, + "learning_rate": 0.00030221913181682726, + "loss": 0.5063, + "step": 125280 + }, + { + "epoch": 6.222807191814841, + "grad_norm": 0.1552734375, + "learning_rate": 0.00030217939803317773, + "loss": 0.5053, + "step": 125290 + }, + { + "epoch": 6.22330386411046, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003021396642495282, + "loss": 0.4956, + "step": 125300 + }, + { + "epoch": 6.223800536406079, + "grad_norm": 0.1865234375, + "learning_rate": 0.0003020999304658786, + "loss": 0.523, + "step": 125310 + }, + { + "epoch": 6.224297208701699, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003020601966822291, + "loss": 0.4867, + "step": 125320 + }, + { + "epoch": 6.224793880997318, + "grad_norm": 0.12158203125, + "learning_rate": 0.0003020204628985795, + "loss": 0.4856, + "step": 125330 + }, + { + "epoch": 6.225290553292937, + "grad_norm": 0.1259765625, + "learning_rate": 0.00030198072911493003, + "loss": 0.4829, + "step": 125340 + }, + { + "epoch": 6.225787225588556, + "grad_norm": 0.1298828125, + "learning_rate": 0.00030194099533128045, + "loss": 0.5264, + "step": 125350 + }, + { + "epoch": 6.226283897884176, + "grad_norm": 0.1279296875, + "learning_rate": 0.00030190126154763087, + "loss": 0.4902, + "step": 125360 + }, + { + "epoch": 6.226780570179796, + "grad_norm": 0.154296875, + "learning_rate": 0.00030186152776398134, + "loss": 0.5243, + "step": 125370 + }, + { + "epoch": 6.227277242475415, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003018217939803318, + "loss": 0.4956, + "step": 125380 + }, + { + "epoch": 6.227773914771034, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003017820601966822, + "loss": 0.5035, + "step": 125390 + }, + { + "epoch": 6.228270587066653, + "grad_norm": 0.1220703125, + "learning_rate": 0.0003017423264130327, + "loss": 0.5026, + "step": 125400 + }, + { + "epoch": 6.2287672593622725, + "grad_norm": 0.140625, + "learning_rate": 0.0003017025926293831, + "loss": 0.4802, + "step": 125410 + }, + { + "epoch": 6.229263931657892, + "grad_norm": 0.12890625, + "learning_rate": 0.00030166285884573364, + "loss": 0.5156, + "step": 125420 + }, + { + "epoch": 6.229760603953512, + "grad_norm": 0.125, + "learning_rate": 0.00030162312506208406, + "loss": 0.4948, + "step": 125430 + }, + { + "epoch": 6.230257276249131, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003015833912784345, + "loss": 0.4874, + "step": 125440 + }, + { + "epoch": 6.23075394854475, + "grad_norm": 0.1279296875, + "learning_rate": 0.00030154365749478494, + "loss": 0.4725, + "step": 125450 + }, + { + "epoch": 6.2312506208403695, + "grad_norm": 0.140625, + "learning_rate": 0.0003015039237111354, + "loss": 0.4897, + "step": 125460 + }, + { + "epoch": 6.231747293135989, + "grad_norm": 0.12890625, + "learning_rate": 0.00030146418992748583, + "loss": 0.5026, + "step": 125470 + }, + { + "epoch": 6.232243965431608, + "grad_norm": 0.11669921875, + "learning_rate": 0.0003014244561438363, + "loss": 0.4734, + "step": 125480 + }, + { + "epoch": 6.232740637727227, + "grad_norm": 0.142578125, + "learning_rate": 0.0003013847223601868, + "loss": 0.4932, + "step": 125490 + }, + { + "epoch": 6.233237310022847, + "grad_norm": 0.14453125, + "learning_rate": 0.00030134498857653725, + "loss": 0.4852, + "step": 125500 + }, + { + "epoch": 6.2337339823184665, + "grad_norm": 0.2236328125, + "learning_rate": 0.00030130525479288766, + "loss": 0.4757, + "step": 125510 + }, + { + "epoch": 6.234230654614086, + "grad_norm": 0.1396484375, + "learning_rate": 0.0003012655210092381, + "loss": 0.4989, + "step": 125520 + }, + { + "epoch": 6.234727326909705, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003012257872255886, + "loss": 0.4904, + "step": 125530 + }, + { + "epoch": 6.235223999205324, + "grad_norm": 0.1962890625, + "learning_rate": 0.000301186053441939, + "loss": 0.5209, + "step": 125540 + }, + { + "epoch": 6.2357206715009434, + "grad_norm": 0.1259765625, + "learning_rate": 0.00030114631965828944, + "loss": 0.4679, + "step": 125550 + }, + { + "epoch": 6.236217343796563, + "grad_norm": 0.1376953125, + "learning_rate": 0.0003011065858746399, + "loss": 0.5565, + "step": 125560 + }, + { + "epoch": 6.236714016092183, + "grad_norm": 0.12255859375, + "learning_rate": 0.0003010668520909904, + "loss": 0.4529, + "step": 125570 + }, + { + "epoch": 6.237210688387802, + "grad_norm": 0.13671875, + "learning_rate": 0.00030102711830734086, + "loss": 0.4735, + "step": 125580 + }, + { + "epoch": 6.237707360683421, + "grad_norm": 0.1328125, + "learning_rate": 0.00030098738452369127, + "loss": 0.4933, + "step": 125590 + }, + { + "epoch": 6.2382040329790405, + "grad_norm": 0.1494140625, + "learning_rate": 0.00030094765074004174, + "loss": 0.4917, + "step": 125600 + }, + { + "epoch": 6.23870070527466, + "grad_norm": 0.12451171875, + "learning_rate": 0.0003009079169563922, + "loss": 0.4819, + "step": 125610 + }, + { + "epoch": 6.239197377570279, + "grad_norm": 0.150390625, + "learning_rate": 0.00030086818317274263, + "loss": 0.4896, + "step": 125620 + }, + { + "epoch": 6.239694049865898, + "grad_norm": 0.177734375, + "learning_rate": 0.00030082844938909305, + "loss": 0.5149, + "step": 125630 + }, + { + "epoch": 6.240190722161518, + "grad_norm": 0.1298828125, + "learning_rate": 0.0003007887156054436, + "loss": 0.5049, + "step": 125640 + }, + { + "epoch": 6.2406873944571375, + "grad_norm": 0.12109375, + "learning_rate": 0.000300748981821794, + "loss": 0.4964, + "step": 125650 + }, + { + "epoch": 6.241184066752757, + "grad_norm": 0.12158203125, + "learning_rate": 0.00030070924803814446, + "loss": 0.5195, + "step": 125660 + }, + { + "epoch": 6.241680739048376, + "grad_norm": 0.1357421875, + "learning_rate": 0.0003006695142544949, + "loss": 0.4992, + "step": 125670 + }, + { + "epoch": 6.242177411343995, + "grad_norm": 0.12451171875, + "learning_rate": 0.00030062978047084535, + "loss": 0.4925, + "step": 125680 + }, + { + "epoch": 6.242674083639614, + "grad_norm": 0.1337890625, + "learning_rate": 0.0003005900466871958, + "loss": 0.4941, + "step": 125690 + }, + { + "epoch": 6.243170755935234, + "grad_norm": 0.150390625, + "learning_rate": 0.00030055031290354624, + "loss": 0.5038, + "step": 125700 + }, + { + "epoch": 6.243667428230853, + "grad_norm": 0.1494140625, + "learning_rate": 0.00030051057911989666, + "loss": 0.4964, + "step": 125710 + }, + { + "epoch": 6.244164100526473, + "grad_norm": 0.126953125, + "learning_rate": 0.0003004708453362472, + "loss": 0.4944, + "step": 125720 + }, + { + "epoch": 6.244660772822092, + "grad_norm": 0.1708984375, + "learning_rate": 0.0003004311115525976, + "loss": 0.455, + "step": 125730 + }, + { + "epoch": 6.245157445117711, + "grad_norm": 0.1376953125, + "learning_rate": 0.00030039137776894807, + "loss": 0.502, + "step": 125740 + }, + { + "epoch": 6.245654117413331, + "grad_norm": 0.1328125, + "learning_rate": 0.0003003516439852985, + "loss": 0.4861, + "step": 125750 + }, + { + "epoch": 6.24615078970895, + "grad_norm": 0.1298828125, + "learning_rate": 0.00030031191020164896, + "loss": 0.5074, + "step": 125760 + }, + { + "epoch": 6.246647462004569, + "grad_norm": 0.13671875, + "learning_rate": 0.00030027217641799943, + "loss": 0.5044, + "step": 125770 + }, + { + "epoch": 6.247144134300188, + "grad_norm": 0.1201171875, + "learning_rate": 0.00030023244263434985, + "loss": 0.504, + "step": 125780 + }, + { + "epoch": 6.2476408065958084, + "grad_norm": 0.173828125, + "learning_rate": 0.0003001927088507003, + "loss": 0.5104, + "step": 125790 + }, + { + "epoch": 6.248137478891428, + "grad_norm": 0.150390625, + "learning_rate": 0.0003001529750670508, + "loss": 0.4601, + "step": 125800 + }, + { + "epoch": 6.248634151187047, + "grad_norm": 0.125, + "learning_rate": 0.0003001132412834012, + "loss": 0.5146, + "step": 125810 + }, + { + "epoch": 6.249130823482666, + "grad_norm": 0.1279296875, + "learning_rate": 0.0003000735074997517, + "loss": 0.4963, + "step": 125820 + }, + { + "epoch": 6.249627495778285, + "grad_norm": 0.142578125, + "learning_rate": 0.00030003377371610215, + "loss": 0.4802, + "step": 125830 + }, + { + "epoch": 6.250124168073905, + "grad_norm": 0.1279296875, + "learning_rate": 0.00029999403993245257, + "loss": 0.489, + "step": 125840 + }, + { + "epoch": 6.250620840369524, + "grad_norm": 0.12451171875, + "learning_rate": 0.00029995430614880304, + "loss": 0.4896, + "step": 125850 + }, + { + "epoch": 6.251117512665144, + "grad_norm": 0.1630859375, + "learning_rate": 0.00029991457236515346, + "loss": 0.4813, + "step": 125860 + }, + { + "epoch": 6.251614184960763, + "grad_norm": 0.138671875, + "learning_rate": 0.000299874838581504, + "loss": 0.5038, + "step": 125870 + }, + { + "epoch": 6.252110857256382, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002998351047978544, + "loss": 0.5025, + "step": 125880 + }, + { + "epoch": 6.252607529552002, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002997953710142048, + "loss": 0.4924, + "step": 125890 + }, + { + "epoch": 6.253104201847621, + "grad_norm": 0.12890625, + "learning_rate": 0.0002997556372305553, + "loss": 0.5108, + "step": 125900 + }, + { + "epoch": 6.25360087414324, + "grad_norm": 0.1689453125, + "learning_rate": 0.00029971590344690576, + "loss": 0.5069, + "step": 125910 + }, + { + "epoch": 6.254097546438859, + "grad_norm": 0.193359375, + "learning_rate": 0.0002996761696632562, + "loss": 0.4908, + "step": 125920 + }, + { + "epoch": 6.254594218734479, + "grad_norm": 0.1279296875, + "learning_rate": 0.00029963643587960665, + "loss": 0.5153, + "step": 125930 + }, + { + "epoch": 6.255090891030099, + "grad_norm": 0.12890625, + "learning_rate": 0.0002995967020959571, + "loss": 0.4955, + "step": 125940 + }, + { + "epoch": 6.255587563325718, + "grad_norm": 0.154296875, + "learning_rate": 0.0002995569683123076, + "loss": 0.5118, + "step": 125950 + }, + { + "epoch": 6.256084235621337, + "grad_norm": 0.1298828125, + "learning_rate": 0.000299517234528658, + "loss": 0.5032, + "step": 125960 + }, + { + "epoch": 6.256580907916956, + "grad_norm": 0.154296875, + "learning_rate": 0.0002994775007450084, + "loss": 0.5198, + "step": 125970 + }, + { + "epoch": 6.2570775802125755, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002994377669613589, + "loss": 0.4657, + "step": 125980 + }, + { + "epoch": 6.257574252508195, + "grad_norm": 0.15234375, + "learning_rate": 0.00029939803317770937, + "loss": 0.5155, + "step": 125990 + }, + { + "epoch": 6.258070924803814, + "grad_norm": 0.123046875, + "learning_rate": 0.0002993582993940598, + "loss": 0.4896, + "step": 126000 + }, + { + "epoch": 6.258567597099434, + "grad_norm": 0.12890625, + "learning_rate": 0.00029931856561041026, + "loss": 0.4847, + "step": 126010 + }, + { + "epoch": 6.259064269395053, + "grad_norm": 0.1259765625, + "learning_rate": 0.00029927883182676073, + "loss": 0.5099, + "step": 126020 + }, + { + "epoch": 6.259560941690673, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002992390980431112, + "loss": 0.503, + "step": 126030 + }, + { + "epoch": 6.260057613986292, + "grad_norm": 0.13671875, + "learning_rate": 0.0002991993642594616, + "loss": 0.5044, + "step": 126040 + }, + { + "epoch": 6.260554286281911, + "grad_norm": 0.1376953125, + "learning_rate": 0.00029915963047581203, + "loss": 0.5047, + "step": 126050 + }, + { + "epoch": 6.26105095857753, + "grad_norm": 0.142578125, + "learning_rate": 0.00029911989669216256, + "loss": 0.4773, + "step": 126060 + }, + { + "epoch": 6.2615476308731495, + "grad_norm": 0.1279296875, + "learning_rate": 0.000299080162908513, + "loss": 0.4588, + "step": 126070 + }, + { + "epoch": 6.26204430316877, + "grad_norm": 0.126953125, + "learning_rate": 0.0002990404291248634, + "loss": 0.4935, + "step": 126080 + }, + { + "epoch": 6.262540975464389, + "grad_norm": 0.140625, + "learning_rate": 0.00029900069534121386, + "loss": 0.5197, + "step": 126090 + }, + { + "epoch": 6.263037647760008, + "grad_norm": 0.12353515625, + "learning_rate": 0.00029896096155756434, + "loss": 0.4806, + "step": 126100 + }, + { + "epoch": 6.263534320055627, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002989212277739148, + "loss": 0.4985, + "step": 126110 + }, + { + "epoch": 6.2640309923512465, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002988814939902652, + "loss": 0.4776, + "step": 126120 + }, + { + "epoch": 6.264527664646866, + "grad_norm": 0.171875, + "learning_rate": 0.0002988417602066157, + "loss": 0.4892, + "step": 126130 + }, + { + "epoch": 6.265024336942485, + "grad_norm": 0.134765625, + "learning_rate": 0.00029880202642296617, + "loss": 0.5194, + "step": 126140 + }, + { + "epoch": 6.265521009238105, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002987622926393166, + "loss": 0.5437, + "step": 126150 + }, + { + "epoch": 6.266017681533724, + "grad_norm": 0.1181640625, + "learning_rate": 0.00029872255885566706, + "loss": 0.48, + "step": 126160 + }, + { + "epoch": 6.2665143538293435, + "grad_norm": 0.1318359375, + "learning_rate": 0.00029868282507201753, + "loss": 0.4789, + "step": 126170 + }, + { + "epoch": 6.267011026124963, + "grad_norm": 0.1728515625, + "learning_rate": 0.00029864309128836794, + "loss": 0.4868, + "step": 126180 + }, + { + "epoch": 6.267507698420582, + "grad_norm": 0.134765625, + "learning_rate": 0.0002986033575047184, + "loss": 0.5062, + "step": 126190 + }, + { + "epoch": 6.268004370716201, + "grad_norm": 0.126953125, + "learning_rate": 0.00029856362372106883, + "loss": 0.4907, + "step": 126200 + }, + { + "epoch": 6.26850104301182, + "grad_norm": 0.1328125, + "learning_rate": 0.0002985238899374193, + "loss": 0.5097, + "step": 126210 + }, + { + "epoch": 6.2689977153074405, + "grad_norm": 0.12890625, + "learning_rate": 0.0002984841561537698, + "loss": 0.4833, + "step": 126220 + }, + { + "epoch": 6.26949438760306, + "grad_norm": 0.20703125, + "learning_rate": 0.0002984444223701202, + "loss": 0.4974, + "step": 126230 + }, + { + "epoch": 6.269991059898679, + "grad_norm": 0.1357421875, + "learning_rate": 0.00029840468858647066, + "loss": 0.5146, + "step": 126240 + }, + { + "epoch": 6.270487732194298, + "grad_norm": 0.1376953125, + "learning_rate": 0.00029836495480282114, + "loss": 0.4795, + "step": 126250 + }, + { + "epoch": 6.2709844044899175, + "grad_norm": 0.1357421875, + "learning_rate": 0.00029832522101917155, + "loss": 0.4976, + "step": 126260 + }, + { + "epoch": 6.271481076785537, + "grad_norm": 0.134765625, + "learning_rate": 0.000298285487235522, + "loss": 0.5039, + "step": 126270 + }, + { + "epoch": 6.271977749081156, + "grad_norm": 0.12353515625, + "learning_rate": 0.00029824575345187244, + "loss": 0.488, + "step": 126280 + }, + { + "epoch": 6.272474421376776, + "grad_norm": 0.11376953125, + "learning_rate": 0.0002982060196682229, + "loss": 0.4881, + "step": 126290 + }, + { + "epoch": 6.272971093672395, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002981662858845734, + "loss": 0.52, + "step": 126300 + }, + { + "epoch": 6.2734677659680145, + "grad_norm": 0.19140625, + "learning_rate": 0.0002981265521009238, + "loss": 0.5048, + "step": 126310 + }, + { + "epoch": 6.273964438263634, + "grad_norm": 0.138671875, + "learning_rate": 0.00029808681831727427, + "loss": 0.4607, + "step": 126320 + }, + { + "epoch": 6.274461110559253, + "grad_norm": 0.1298828125, + "learning_rate": 0.00029804708453362474, + "loss": 0.5297, + "step": 126330 + }, + { + "epoch": 6.274957782854872, + "grad_norm": 0.158203125, + "learning_rate": 0.00029800735074997516, + "loss": 0.5118, + "step": 126340 + }, + { + "epoch": 6.275454455150491, + "grad_norm": 0.158203125, + "learning_rate": 0.00029796761696632563, + "loss": 0.4898, + "step": 126350 + }, + { + "epoch": 6.2759511274461115, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002979278831826761, + "loss": 0.4915, + "step": 126360 + }, + { + "epoch": 6.276447799741731, + "grad_norm": 0.12353515625, + "learning_rate": 0.0002978881493990265, + "loss": 0.4895, + "step": 126370 + }, + { + "epoch": 6.27694447203735, + "grad_norm": 0.162109375, + "learning_rate": 0.000297848415615377, + "loss": 0.5334, + "step": 126380 + }, + { + "epoch": 6.277441144332969, + "grad_norm": 0.130859375, + "learning_rate": 0.0002978086818317274, + "loss": 0.4788, + "step": 126390 + }, + { + "epoch": 6.277937816628588, + "grad_norm": 0.1201171875, + "learning_rate": 0.00029776894804807793, + "loss": 0.4646, + "step": 126400 + }, + { + "epoch": 6.278434488924208, + "grad_norm": 0.1484375, + "learning_rate": 0.00029772921426442835, + "loss": 0.5157, + "step": 126410 + }, + { + "epoch": 6.278931161219827, + "grad_norm": 0.12109375, + "learning_rate": 0.00029768948048077877, + "loss": 0.4983, + "step": 126420 + }, + { + "epoch": 6.279427833515447, + "grad_norm": 0.1318359375, + "learning_rate": 0.00029764974669712924, + "loss": 0.4527, + "step": 126430 + }, + { + "epoch": 6.279924505811066, + "grad_norm": 0.177734375, + "learning_rate": 0.0002976100129134797, + "loss": 0.5077, + "step": 126440 + }, + { + "epoch": 6.280421178106685, + "grad_norm": 0.146484375, + "learning_rate": 0.00029757027912983013, + "loss": 0.5107, + "step": 126450 + }, + { + "epoch": 6.280917850402305, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002975305453461806, + "loss": 0.4965, + "step": 126460 + }, + { + "epoch": 6.281414522697924, + "grad_norm": 0.12255859375, + "learning_rate": 0.00029749081156253107, + "loss": 0.4856, + "step": 126470 + }, + { + "epoch": 6.281911194993543, + "grad_norm": 0.171875, + "learning_rate": 0.00029745107777888154, + "loss": 0.5022, + "step": 126480 + }, + { + "epoch": 6.282407867289162, + "grad_norm": 0.1865234375, + "learning_rate": 0.00029741134399523196, + "loss": 0.4864, + "step": 126490 + }, + { + "epoch": 6.282904539584782, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002973716102115824, + "loss": 0.5043, + "step": 126500 + }, + { + "epoch": 6.283401211880402, + "grad_norm": 0.126953125, + "learning_rate": 0.0002973318764279329, + "loss": 0.4887, + "step": 126510 + }, + { + "epoch": 6.283897884176021, + "grad_norm": 0.142578125, + "learning_rate": 0.0002972921426442833, + "loss": 0.4909, + "step": 126520 + }, + { + "epoch": 6.28439455647164, + "grad_norm": 0.125, + "learning_rate": 0.00029725240886063374, + "loss": 0.4986, + "step": 126530 + }, + { + "epoch": 6.284891228767259, + "grad_norm": 0.1240234375, + "learning_rate": 0.0002972126750769842, + "loss": 0.4875, + "step": 126540 + }, + { + "epoch": 6.285387901062879, + "grad_norm": 0.12890625, + "learning_rate": 0.0002971729412933347, + "loss": 0.4904, + "step": 126550 + }, + { + "epoch": 6.285884573358498, + "grad_norm": 0.1318359375, + "learning_rate": 0.00029713320750968515, + "loss": 0.5244, + "step": 126560 + }, + { + "epoch": 6.286381245654117, + "grad_norm": 0.1201171875, + "learning_rate": 0.00029709347372603557, + "loss": 0.4744, + "step": 126570 + }, + { + "epoch": 6.286877917949737, + "grad_norm": 0.12890625, + "learning_rate": 0.000297053739942386, + "loss": 0.4744, + "step": 126580 + }, + { + "epoch": 6.287374590245356, + "grad_norm": 0.1923828125, + "learning_rate": 0.0002970140061587365, + "loss": 0.4994, + "step": 126590 + }, + { + "epoch": 6.287871262540976, + "grad_norm": 0.12451171875, + "learning_rate": 0.00029697427237508693, + "loss": 0.4812, + "step": 126600 + }, + { + "epoch": 6.288367934836595, + "grad_norm": 0.125, + "learning_rate": 0.0002969345385914374, + "loss": 0.488, + "step": 126610 + }, + { + "epoch": 6.288864607132214, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002968948048077878, + "loss": 0.5159, + "step": 126620 + }, + { + "epoch": 6.289361279427833, + "grad_norm": 0.12158203125, + "learning_rate": 0.0002968550710241383, + "loss": 0.4736, + "step": 126630 + }, + { + "epoch": 6.2898579517234525, + "grad_norm": 0.1669921875, + "learning_rate": 0.00029681533724048876, + "loss": 0.4954, + "step": 126640 + }, + { + "epoch": 6.290354624019072, + "grad_norm": 0.1640625, + "learning_rate": 0.0002967756034568392, + "loss": 0.5119, + "step": 126650 + }, + { + "epoch": 6.290851296314692, + "grad_norm": 0.154296875, + "learning_rate": 0.00029673586967318965, + "loss": 0.5072, + "step": 126660 + }, + { + "epoch": 6.291347968610311, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002966961358895401, + "loss": 0.4805, + "step": 126670 + }, + { + "epoch": 6.29184464090593, + "grad_norm": 0.12451171875, + "learning_rate": 0.00029665640210589054, + "loss": 0.4895, + "step": 126680 + }, + { + "epoch": 6.2923413132015495, + "grad_norm": 0.146484375, + "learning_rate": 0.000296616668322241, + "loss": 0.5036, + "step": 126690 + }, + { + "epoch": 6.292837985497169, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002965769345385915, + "loss": 0.4635, + "step": 126700 + }, + { + "epoch": 6.293334657792788, + "grad_norm": 0.138671875, + "learning_rate": 0.0002965372007549419, + "loss": 0.5062, + "step": 126710 + }, + { + "epoch": 6.293831330088407, + "grad_norm": 0.11767578125, + "learning_rate": 0.00029649746697129237, + "loss": 0.4842, + "step": 126720 + }, + { + "epoch": 6.294328002384027, + "grad_norm": 0.14453125, + "learning_rate": 0.0002964577331876428, + "loss": 0.5248, + "step": 126730 + }, + { + "epoch": 6.294824674679647, + "grad_norm": 0.12890625, + "learning_rate": 0.00029641799940399326, + "loss": 0.4795, + "step": 126740 + }, + { + "epoch": 6.295321346975266, + "grad_norm": 0.1201171875, + "learning_rate": 0.00029637826562034373, + "loss": 0.5147, + "step": 126750 + }, + { + "epoch": 6.295818019270885, + "grad_norm": 0.1298828125, + "learning_rate": 0.00029633853183669414, + "loss": 0.4838, + "step": 126760 + }, + { + "epoch": 6.296314691566504, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002962987980530446, + "loss": 0.5255, + "step": 126770 + }, + { + "epoch": 6.2968113638621235, + "grad_norm": 0.169921875, + "learning_rate": 0.0002962590642693951, + "loss": 0.5057, + "step": 126780 + }, + { + "epoch": 6.297308036157743, + "grad_norm": 0.14453125, + "learning_rate": 0.0002962193304857455, + "loss": 0.496, + "step": 126790 + }, + { + "epoch": 6.297804708453363, + "grad_norm": 0.130859375, + "learning_rate": 0.000296179596702096, + "loss": 0.4741, + "step": 126800 + }, + { + "epoch": 6.298301380748982, + "grad_norm": 0.15625, + "learning_rate": 0.00029613986291844645, + "loss": 0.4893, + "step": 126810 + }, + { + "epoch": 6.298798053044601, + "grad_norm": 0.1259765625, + "learning_rate": 0.00029610012913479686, + "loss": 0.483, + "step": 126820 + }, + { + "epoch": 6.2992947253402205, + "grad_norm": 0.130859375, + "learning_rate": 0.00029606039535114734, + "loss": 0.5172, + "step": 126830 + }, + { + "epoch": 6.29979139763584, + "grad_norm": 0.126953125, + "learning_rate": 0.00029602066156749775, + "loss": 0.5366, + "step": 126840 + }, + { + "epoch": 6.300288069931459, + "grad_norm": 0.1240234375, + "learning_rate": 0.0002959809277838482, + "loss": 0.5062, + "step": 126850 + }, + { + "epoch": 6.300784742227078, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002959411940001987, + "loss": 0.4965, + "step": 126860 + }, + { + "epoch": 6.301281414522698, + "grad_norm": 0.1220703125, + "learning_rate": 0.0002959014602165491, + "loss": 0.4638, + "step": 126870 + }, + { + "epoch": 6.3017780868183175, + "grad_norm": 0.1201171875, + "learning_rate": 0.0002958617264328996, + "loss": 0.4834, + "step": 126880 + }, + { + "epoch": 6.302274759113937, + "grad_norm": 0.126953125, + "learning_rate": 0.00029582199264925006, + "loss": 0.4711, + "step": 126890 + }, + { + "epoch": 6.302771431409556, + "grad_norm": 0.1484375, + "learning_rate": 0.00029578225886560047, + "loss": 0.5093, + "step": 126900 + }, + { + "epoch": 6.303268103705175, + "grad_norm": 0.11767578125, + "learning_rate": 0.00029574252508195094, + "loss": 0.496, + "step": 126910 + }, + { + "epoch": 6.303764776000794, + "grad_norm": 0.1796875, + "learning_rate": 0.00029570279129830136, + "loss": 0.5124, + "step": 126920 + }, + { + "epoch": 6.304261448296414, + "grad_norm": 0.1328125, + "learning_rate": 0.0002956630575146519, + "loss": 0.4961, + "step": 126930 + }, + { + "epoch": 6.304758120592034, + "grad_norm": 0.1142578125, + "learning_rate": 0.0002956233237310023, + "loss": 0.4907, + "step": 126940 + }, + { + "epoch": 6.305254792887653, + "grad_norm": 0.125, + "learning_rate": 0.0002955835899473527, + "loss": 0.4881, + "step": 126950 + }, + { + "epoch": 6.305751465183272, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002955438561637032, + "loss": 0.4682, + "step": 126960 + }, + { + "epoch": 6.3062481374788915, + "grad_norm": 0.140625, + "learning_rate": 0.00029550412238005366, + "loss": 0.5269, + "step": 126970 + }, + { + "epoch": 6.306744809774511, + "grad_norm": 0.13671875, + "learning_rate": 0.0002954643885964041, + "loss": 0.513, + "step": 126980 + }, + { + "epoch": 6.30724148207013, + "grad_norm": 0.12890625, + "learning_rate": 0.00029542465481275455, + "loss": 0.4934, + "step": 126990 + }, + { + "epoch": 6.307738154365749, + "grad_norm": 0.11767578125, + "learning_rate": 0.000295384921029105, + "loss": 0.4866, + "step": 127000 + }, + { + "epoch": 6.308234826661369, + "grad_norm": 0.142578125, + "learning_rate": 0.0002953451872454555, + "loss": 0.4747, + "step": 127010 + }, + { + "epoch": 6.3087314989569885, + "grad_norm": 0.12158203125, + "learning_rate": 0.0002953054534618059, + "loss": 0.5089, + "step": 127020 + }, + { + "epoch": 6.309228171252608, + "grad_norm": 0.12255859375, + "learning_rate": 0.00029526571967815633, + "loss": 0.5182, + "step": 127030 + }, + { + "epoch": 6.309724843548227, + "grad_norm": 0.1220703125, + "learning_rate": 0.00029522598589450685, + "loss": 0.4992, + "step": 127040 + }, + { + "epoch": 6.310221515843846, + "grad_norm": 0.134765625, + "learning_rate": 0.00029518625211085727, + "loss": 0.4945, + "step": 127050 + }, + { + "epoch": 6.310718188139465, + "grad_norm": 0.138671875, + "learning_rate": 0.00029514651832720774, + "loss": 0.4966, + "step": 127060 + }, + { + "epoch": 6.311214860435085, + "grad_norm": 0.1328125, + "learning_rate": 0.00029510678454355816, + "loss": 0.5039, + "step": 127070 + }, + { + "epoch": 6.311711532730705, + "grad_norm": 0.1796875, + "learning_rate": 0.00029506705075990863, + "loss": 0.501, + "step": 127080 + }, + { + "epoch": 6.312208205026324, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002950273169762591, + "loss": 0.5039, + "step": 127090 + }, + { + "epoch": 6.312704877321943, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002949875831926095, + "loss": 0.4992, + "step": 127100 + }, + { + "epoch": 6.313201549617562, + "grad_norm": 0.154296875, + "learning_rate": 0.00029494784940895994, + "loss": 0.5168, + "step": 127110 + }, + { + "epoch": 6.313698221913182, + "grad_norm": 0.14453125, + "learning_rate": 0.00029490811562531046, + "loss": 0.5004, + "step": 127120 + }, + { + "epoch": 6.314194894208801, + "grad_norm": 0.12890625, + "learning_rate": 0.0002948683818416609, + "loss": 0.4948, + "step": 127130 + }, + { + "epoch": 6.31469156650442, + "grad_norm": 0.146484375, + "learning_rate": 0.00029482864805801135, + "loss": 0.4946, + "step": 127140 + }, + { + "epoch": 6.315188238800039, + "grad_norm": 0.1318359375, + "learning_rate": 0.00029478891427436177, + "loss": 0.5133, + "step": 127150 + }, + { + "epoch": 6.315684911095659, + "grad_norm": 0.12451171875, + "learning_rate": 0.00029474918049071224, + "loss": 0.4776, + "step": 127160 + }, + { + "epoch": 6.316181583391279, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002947094467070627, + "loss": 0.5288, + "step": 127170 + }, + { + "epoch": 6.316678255686898, + "grad_norm": 0.15625, + "learning_rate": 0.00029466971292341313, + "loss": 0.4827, + "step": 127180 + }, + { + "epoch": 6.317174927982517, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002946299791397636, + "loss": 0.504, + "step": 127190 + }, + { + "epoch": 6.317671600278136, + "grad_norm": 0.1279296875, + "learning_rate": 0.00029459024535611407, + "loss": 0.5046, + "step": 127200 + }, + { + "epoch": 6.318168272573756, + "grad_norm": 0.15234375, + "learning_rate": 0.0002945505115724645, + "loss": 0.4915, + "step": 127210 + }, + { + "epoch": 6.318664944869375, + "grad_norm": 0.126953125, + "learning_rate": 0.00029451077778881496, + "loss": 0.5133, + "step": 127220 + }, + { + "epoch": 6.319161617164995, + "grad_norm": 0.173828125, + "learning_rate": 0.00029447104400516543, + "loss": 0.5189, + "step": 127230 + }, + { + "epoch": 6.319658289460614, + "grad_norm": 0.1396484375, + "learning_rate": 0.00029443131022151585, + "loss": 0.5, + "step": 127240 + }, + { + "epoch": 6.320154961756233, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002943915764378663, + "loss": 0.4652, + "step": 127250 + }, + { + "epoch": 6.320651634051853, + "grad_norm": 0.142578125, + "learning_rate": 0.00029435184265421674, + "loss": 0.4974, + "step": 127260 + }, + { + "epoch": 6.321148306347472, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002943121088705672, + "loss": 0.4879, + "step": 127270 + }, + { + "epoch": 6.321644978643091, + "grad_norm": 0.142578125, + "learning_rate": 0.0002942723750869177, + "loss": 0.5183, + "step": 127280 + }, + { + "epoch": 6.32214165093871, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002942326413032681, + "loss": 0.4927, + "step": 127290 + }, + { + "epoch": 6.32263832323433, + "grad_norm": 0.126953125, + "learning_rate": 0.00029419290751961857, + "loss": 0.5134, + "step": 127300 + }, + { + "epoch": 6.32313499552995, + "grad_norm": 0.1474609375, + "learning_rate": 0.00029415317373596904, + "loss": 0.4883, + "step": 127310 + }, + { + "epoch": 6.323631667825569, + "grad_norm": 0.130859375, + "learning_rate": 0.00029411343995231946, + "loss": 0.4833, + "step": 127320 + }, + { + "epoch": 6.324128340121188, + "grad_norm": 0.14453125, + "learning_rate": 0.00029407370616866993, + "loss": 0.504, + "step": 127330 + }, + { + "epoch": 6.324625012416807, + "grad_norm": 0.1240234375, + "learning_rate": 0.0002940339723850204, + "loss": 0.5042, + "step": 127340 + }, + { + "epoch": 6.3251216847124265, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002939942386013708, + "loss": 0.5016, + "step": 127350 + }, + { + "epoch": 6.325618357008046, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002939545048177213, + "loss": 0.4883, + "step": 127360 + }, + { + "epoch": 6.326115029303665, + "grad_norm": 0.146484375, + "learning_rate": 0.0002939147710340717, + "loss": 0.481, + "step": 127370 + }, + { + "epoch": 6.326611701599285, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002938750372504222, + "loss": 0.5074, + "step": 127380 + }, + { + "epoch": 6.327108373894904, + "grad_norm": 0.1162109375, + "learning_rate": 0.00029383530346677265, + "loss": 0.5119, + "step": 127390 + }, + { + "epoch": 6.3276050461905236, + "grad_norm": 0.138671875, + "learning_rate": 0.00029379556968312306, + "loss": 0.4492, + "step": 127400 + }, + { + "epoch": 6.328101718486143, + "grad_norm": 0.1767578125, + "learning_rate": 0.00029375583589947354, + "loss": 0.4809, + "step": 127410 + }, + { + "epoch": 6.328598390781762, + "grad_norm": 0.140625, + "learning_rate": 0.000293716102115824, + "loss": 0.4887, + "step": 127420 + }, + { + "epoch": 6.329095063077381, + "grad_norm": 0.146484375, + "learning_rate": 0.0002936763683321745, + "loss": 0.5163, + "step": 127430 + }, + { + "epoch": 6.3295917353730005, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002936366345485249, + "loss": 0.5008, + "step": 127440 + }, + { + "epoch": 6.330088407668621, + "grad_norm": 0.12890625, + "learning_rate": 0.0002935969007648753, + "loss": 0.5162, + "step": 127450 + }, + { + "epoch": 6.33058507996424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00029355716698122584, + "loss": 0.4824, + "step": 127460 + }, + { + "epoch": 6.331081752259859, + "grad_norm": 0.1298828125, + "learning_rate": 0.00029351743319757626, + "loss": 0.4889, + "step": 127470 + }, + { + "epoch": 6.331578424555478, + "grad_norm": 0.150390625, + "learning_rate": 0.00029347769941392667, + "loss": 0.4926, + "step": 127480 + }, + { + "epoch": 6.3320750968510975, + "grad_norm": 0.150390625, + "learning_rate": 0.00029343796563027714, + "loss": 0.4883, + "step": 127490 + }, + { + "epoch": 6.332571769146717, + "grad_norm": 0.1328125, + "learning_rate": 0.0002933982318466276, + "loss": 0.4851, + "step": 127500 + }, + { + "epoch": 6.333068441442336, + "grad_norm": 0.22265625, + "learning_rate": 0.0002933584980629781, + "loss": 0.5089, + "step": 127510 + }, + { + "epoch": 6.333565113737956, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002933187642793285, + "loss": 0.4786, + "step": 127520 + }, + { + "epoch": 6.334061786033575, + "grad_norm": 0.1591796875, + "learning_rate": 0.000293279030495679, + "loss": 0.5079, + "step": 127530 + }, + { + "epoch": 6.3345584583291945, + "grad_norm": 0.12255859375, + "learning_rate": 0.00029323929671202945, + "loss": 0.4996, + "step": 127540 + }, + { + "epoch": 6.335055130624814, + "grad_norm": 0.1298828125, + "learning_rate": 0.00029319956292837986, + "loss": 0.4921, + "step": 127550 + }, + { + "epoch": 6.335551802920433, + "grad_norm": 0.134765625, + "learning_rate": 0.0002931598291447303, + "loss": 0.5025, + "step": 127560 + }, + { + "epoch": 6.336048475216052, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002931200953610808, + "loss": 0.4897, + "step": 127570 + }, + { + "epoch": 6.336545147511671, + "grad_norm": 0.134765625, + "learning_rate": 0.0002930803615774312, + "loss": 0.481, + "step": 127580 + }, + { + "epoch": 6.3370418198072915, + "grad_norm": 0.140625, + "learning_rate": 0.0002930406277937817, + "loss": 0.5139, + "step": 127590 + }, + { + "epoch": 6.337538492102911, + "grad_norm": 0.125, + "learning_rate": 0.0002930008940101321, + "loss": 0.4962, + "step": 127600 + }, + { + "epoch": 6.33803516439853, + "grad_norm": 0.14453125, + "learning_rate": 0.0002929611602264826, + "loss": 0.5171, + "step": 127610 + }, + { + "epoch": 6.338531836694149, + "grad_norm": 0.154296875, + "learning_rate": 0.00029292142644283305, + "loss": 0.4813, + "step": 127620 + }, + { + "epoch": 6.339028508989768, + "grad_norm": 0.1318359375, + "learning_rate": 0.00029288169265918347, + "loss": 0.4879, + "step": 127630 + }, + { + "epoch": 6.339525181285388, + "grad_norm": 0.126953125, + "learning_rate": 0.00029284195887553394, + "loss": 0.4723, + "step": 127640 + }, + { + "epoch": 6.340021853581007, + "grad_norm": 0.146484375, + "learning_rate": 0.0002928022250918844, + "loss": 0.4918, + "step": 127650 + }, + { + "epoch": 6.340518525876627, + "grad_norm": 0.1328125, + "learning_rate": 0.00029276249130823483, + "loss": 0.4824, + "step": 127660 + }, + { + "epoch": 6.341015198172246, + "grad_norm": 0.1240234375, + "learning_rate": 0.0002927227575245853, + "loss": 0.4864, + "step": 127670 + }, + { + "epoch": 6.3415118704678655, + "grad_norm": 0.140625, + "learning_rate": 0.0002926830237409357, + "loss": 0.5111, + "step": 127680 + }, + { + "epoch": 6.342008542763485, + "grad_norm": 0.15234375, + "learning_rate": 0.0002926432899572862, + "loss": 0.4874, + "step": 127690 + }, + { + "epoch": 6.342505215059104, + "grad_norm": 0.130859375, + "learning_rate": 0.00029260355617363666, + "loss": 0.4807, + "step": 127700 + }, + { + "epoch": 6.343001887354723, + "grad_norm": 0.140625, + "learning_rate": 0.0002925638223899871, + "loss": 0.504, + "step": 127710 + }, + { + "epoch": 6.343498559650342, + "grad_norm": 0.1474609375, + "learning_rate": 0.00029252408860633755, + "loss": 0.511, + "step": 127720 + }, + { + "epoch": 6.3439952319459625, + "grad_norm": 0.12158203125, + "learning_rate": 0.000292484354822688, + "loss": 0.4594, + "step": 127730 + }, + { + "epoch": 6.344491904241582, + "grad_norm": 0.1240234375, + "learning_rate": 0.00029244462103903844, + "loss": 0.501, + "step": 127740 + }, + { + "epoch": 6.344988576537201, + "grad_norm": 0.20703125, + "learning_rate": 0.0002924048872553889, + "loss": 0.504, + "step": 127750 + }, + { + "epoch": 6.34548524883282, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002923651534717394, + "loss": 0.4805, + "step": 127760 + }, + { + "epoch": 6.345981921128439, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002923254196880898, + "loss": 0.5137, + "step": 127770 + }, + { + "epoch": 6.346478593424059, + "grad_norm": 0.1220703125, + "learning_rate": 0.00029228568590444027, + "loss": 0.4669, + "step": 127780 + }, + { + "epoch": 6.346975265719678, + "grad_norm": 0.162109375, + "learning_rate": 0.0002922459521207907, + "loss": 0.4747, + "step": 127790 + }, + { + "epoch": 6.347471938015298, + "grad_norm": 0.1708984375, + "learning_rate": 0.00029220621833714116, + "loss": 0.5053, + "step": 127800 + }, + { + "epoch": 6.347968610310917, + "grad_norm": 0.16015625, + "learning_rate": 0.00029216648455349163, + "loss": 0.4982, + "step": 127810 + }, + { + "epoch": 6.348465282606536, + "grad_norm": 0.13671875, + "learning_rate": 0.00029212675076984205, + "loss": 0.505, + "step": 127820 + }, + { + "epoch": 6.348961954902156, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002920870169861925, + "loss": 0.5074, + "step": 127830 + }, + { + "epoch": 6.349458627197775, + "grad_norm": 0.115234375, + "learning_rate": 0.000292047283202543, + "loss": 0.4657, + "step": 127840 + }, + { + "epoch": 6.349955299493394, + "grad_norm": 0.15234375, + "learning_rate": 0.0002920075494188934, + "loss": 0.5303, + "step": 127850 + }, + { + "epoch": 6.350451971789013, + "grad_norm": 0.126953125, + "learning_rate": 0.0002919678156352439, + "loss": 0.4858, + "step": 127860 + }, + { + "epoch": 6.3509486440846326, + "grad_norm": 0.1171875, + "learning_rate": 0.00029192808185159435, + "loss": 0.4904, + "step": 127870 + }, + { + "epoch": 6.351445316380253, + "grad_norm": 0.1201171875, + "learning_rate": 0.0002918883480679448, + "loss": 0.4802, + "step": 127880 + }, + { + "epoch": 6.351941988675872, + "grad_norm": 0.1259765625, + "learning_rate": 0.00029184861428429524, + "loss": 0.4675, + "step": 127890 + }, + { + "epoch": 6.352438660971491, + "grad_norm": 0.1513671875, + "learning_rate": 0.00029180888050064566, + "loss": 0.509, + "step": 127900 + }, + { + "epoch": 6.35293533326711, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002917691467169962, + "loss": 0.5031, + "step": 127910 + }, + { + "epoch": 6.35343200556273, + "grad_norm": 0.134765625, + "learning_rate": 0.0002917294129333466, + "loss": 0.4812, + "step": 127920 + }, + { + "epoch": 6.353928677858349, + "grad_norm": 0.123046875, + "learning_rate": 0.000291689679149697, + "loss": 0.4554, + "step": 127930 + }, + { + "epoch": 6.354425350153968, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002916499453660475, + "loss": 0.5126, + "step": 127940 + }, + { + "epoch": 6.354922022449588, + "grad_norm": 0.1220703125, + "learning_rate": 0.00029161021158239796, + "loss": 0.4959, + "step": 127950 + }, + { + "epoch": 6.355418694745207, + "grad_norm": 0.185546875, + "learning_rate": 0.00029157047779874843, + "loss": 0.5187, + "step": 127960 + }, + { + "epoch": 6.355915367040827, + "grad_norm": 0.1279296875, + "learning_rate": 0.00029153074401509885, + "loss": 0.505, + "step": 127970 + }, + { + "epoch": 6.356412039336446, + "grad_norm": 0.126953125, + "learning_rate": 0.00029149101023144926, + "loss": 0.4917, + "step": 127980 + }, + { + "epoch": 6.356908711632065, + "grad_norm": 0.134765625, + "learning_rate": 0.0002914512764477998, + "loss": 0.467, + "step": 127990 + }, + { + "epoch": 6.357405383927684, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002914115426641502, + "loss": 0.4777, + "step": 128000 + }, + { + "epoch": 6.3579020562233035, + "grad_norm": 0.12158203125, + "learning_rate": 0.0002913718088805006, + "loss": 0.4678, + "step": 128010 + }, + { + "epoch": 6.358398728518924, + "grad_norm": 0.1484375, + "learning_rate": 0.0002913320750968511, + "loss": 0.522, + "step": 128020 + }, + { + "epoch": 6.358895400814543, + "grad_norm": 0.126953125, + "learning_rate": 0.00029129234131320157, + "loss": 0.4893, + "step": 128030 + }, + { + "epoch": 6.359392073110162, + "grad_norm": 0.12353515625, + "learning_rate": 0.00029125260752955204, + "loss": 0.5169, + "step": 128040 + }, + { + "epoch": 6.359888745405781, + "grad_norm": 0.1630859375, + "learning_rate": 0.00029121287374590246, + "loss": 0.4913, + "step": 128050 + }, + { + "epoch": 6.3603854177014005, + "grad_norm": 0.123046875, + "learning_rate": 0.00029117313996225293, + "loss": 0.4992, + "step": 128060 + }, + { + "epoch": 6.36088208999702, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002911334061786034, + "loss": 0.5027, + "step": 128070 + }, + { + "epoch": 6.361378762292639, + "grad_norm": 0.12158203125, + "learning_rate": 0.0002910936723949538, + "loss": 0.4832, + "step": 128080 + }, + { + "epoch": 6.361875434588258, + "grad_norm": 0.1669921875, + "learning_rate": 0.00029105393861130423, + "loss": 0.4816, + "step": 128090 + }, + { + "epoch": 6.362372106883878, + "grad_norm": 0.15234375, + "learning_rate": 0.00029101420482765476, + "loss": 0.4841, + "step": 128100 + }, + { + "epoch": 6.362868779179498, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002909744710440052, + "loss": 0.4955, + "step": 128110 + }, + { + "epoch": 6.363365451475117, + "grad_norm": 0.12890625, + "learning_rate": 0.00029093473726035565, + "loss": 0.5074, + "step": 128120 + }, + { + "epoch": 6.363862123770736, + "grad_norm": 0.12353515625, + "learning_rate": 0.00029089500347670606, + "loss": 0.4901, + "step": 128130 + }, + { + "epoch": 6.364358796066355, + "grad_norm": 0.146484375, + "learning_rate": 0.00029085526969305654, + "loss": 0.509, + "step": 128140 + }, + { + "epoch": 6.3648554683619745, + "grad_norm": 0.1279296875, + "learning_rate": 0.000290815535909407, + "loss": 0.4937, + "step": 128150 + }, + { + "epoch": 6.365352140657594, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002907758021257574, + "loss": 0.5029, + "step": 128160 + }, + { + "epoch": 6.365848812953214, + "grad_norm": 0.12353515625, + "learning_rate": 0.0002907360683421079, + "loss": 0.4996, + "step": 128170 + }, + { + "epoch": 6.366345485248833, + "grad_norm": 0.1318359375, + "learning_rate": 0.00029069633455845837, + "loss": 0.4869, + "step": 128180 + }, + { + "epoch": 6.366842157544452, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002906566007748088, + "loss": 0.5017, + "step": 128190 + }, + { + "epoch": 6.3673388298400715, + "grad_norm": 0.11572265625, + "learning_rate": 0.00029061686699115926, + "loss": 0.4763, + "step": 128200 + }, + { + "epoch": 6.367835502135691, + "grad_norm": 0.1201171875, + "learning_rate": 0.0002905771332075097, + "loss": 0.5134, + "step": 128210 + }, + { + "epoch": 6.36833217443131, + "grad_norm": 0.1904296875, + "learning_rate": 0.00029053739942386014, + "loss": 0.4816, + "step": 128220 + }, + { + "epoch": 6.368828846726929, + "grad_norm": 0.140625, + "learning_rate": 0.0002904976656402106, + "loss": 0.4838, + "step": 128230 + }, + { + "epoch": 6.369325519022549, + "grad_norm": 0.125, + "learning_rate": 0.00029045793185656103, + "loss": 0.4561, + "step": 128240 + }, + { + "epoch": 6.3698221913181685, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002904181980729115, + "loss": 0.5005, + "step": 128250 + }, + { + "epoch": 6.370318863613788, + "grad_norm": 0.12255859375, + "learning_rate": 0.000290378464289262, + "loss": 0.5194, + "step": 128260 + }, + { + "epoch": 6.370815535909407, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002903387305056124, + "loss": 0.4942, + "step": 128270 + }, + { + "epoch": 6.371312208205026, + "grad_norm": 0.130859375, + "learning_rate": 0.00029029899672196286, + "loss": 0.4992, + "step": 128280 + }, + { + "epoch": 6.371808880500645, + "grad_norm": 0.140625, + "learning_rate": 0.00029025926293831333, + "loss": 0.5089, + "step": 128290 + }, + { + "epoch": 6.372305552796265, + "grad_norm": 0.146484375, + "learning_rate": 0.00029021952915466375, + "loss": 0.511, + "step": 128300 + }, + { + "epoch": 6.372802225091885, + "grad_norm": 0.158203125, + "learning_rate": 0.0002901797953710142, + "loss": 0.5151, + "step": 128310 + }, + { + "epoch": 6.373298897387504, + "grad_norm": 0.205078125, + "learning_rate": 0.00029014006158736464, + "loss": 0.495, + "step": 128320 + }, + { + "epoch": 6.373795569683123, + "grad_norm": 0.1435546875, + "learning_rate": 0.00029010032780371517, + "loss": 0.4658, + "step": 128330 + }, + { + "epoch": 6.3742922419787424, + "grad_norm": 0.140625, + "learning_rate": 0.0002900605940200656, + "loss": 0.4733, + "step": 128340 + }, + { + "epoch": 6.374788914274362, + "grad_norm": 0.1201171875, + "learning_rate": 0.000290020860236416, + "loss": 0.5039, + "step": 128350 + }, + { + "epoch": 6.375285586569981, + "grad_norm": 0.1357421875, + "learning_rate": 0.00028998112645276647, + "loss": 0.4856, + "step": 128360 + }, + { + "epoch": 6.3757822588656, + "grad_norm": 0.134765625, + "learning_rate": 0.00028994139266911694, + "loss": 0.4915, + "step": 128370 + }, + { + "epoch": 6.37627893116122, + "grad_norm": 0.12060546875, + "learning_rate": 0.00028990165888546736, + "loss": 0.5009, + "step": 128380 + }, + { + "epoch": 6.3767756034568395, + "grad_norm": 0.1455078125, + "learning_rate": 0.00028986192510181783, + "loss": 0.5039, + "step": 128390 + }, + { + "epoch": 6.377272275752459, + "grad_norm": 0.17578125, + "learning_rate": 0.0002898221913181683, + "loss": 0.5007, + "step": 128400 + }, + { + "epoch": 6.377768948048078, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002897824575345188, + "loss": 0.4989, + "step": 128410 + }, + { + "epoch": 6.378265620343697, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002897427237508692, + "loss": 0.4918, + "step": 128420 + }, + { + "epoch": 6.378762292639316, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002897029899672196, + "loss": 0.5297, + "step": 128430 + }, + { + "epoch": 6.379258964934936, + "grad_norm": 0.150390625, + "learning_rate": 0.00028966325618357013, + "loss": 0.5047, + "step": 128440 + }, + { + "epoch": 6.379755637230556, + "grad_norm": 0.1181640625, + "learning_rate": 0.00028962352239992055, + "loss": 0.4742, + "step": 128450 + }, + { + "epoch": 6.380252309526175, + "grad_norm": 0.130859375, + "learning_rate": 0.00028958378861627097, + "loss": 0.486, + "step": 128460 + }, + { + "epoch": 6.380748981821794, + "grad_norm": 0.13671875, + "learning_rate": 0.00028954405483262144, + "loss": 0.5336, + "step": 128470 + }, + { + "epoch": 6.381245654117413, + "grad_norm": 0.134765625, + "learning_rate": 0.0002895043210489719, + "loss": 0.5085, + "step": 128480 + }, + { + "epoch": 6.381742326413033, + "grad_norm": 0.12353515625, + "learning_rate": 0.0002894645872653224, + "loss": 0.4967, + "step": 128490 + }, + { + "epoch": 6.382238998708652, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002894248534816728, + "loss": 0.4979, + "step": 128500 + }, + { + "epoch": 6.382735671004271, + "grad_norm": 0.126953125, + "learning_rate": 0.00028938511969802327, + "loss": 0.4953, + "step": 128510 + }, + { + "epoch": 6.383232343299891, + "grad_norm": 0.1591796875, + "learning_rate": 0.00028934538591437374, + "loss": 0.4948, + "step": 128520 + }, + { + "epoch": 6.38372901559551, + "grad_norm": 0.126953125, + "learning_rate": 0.00028930565213072416, + "loss": 0.517, + "step": 128530 + }, + { + "epoch": 6.38422568789113, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002892659183470746, + "loss": 0.506, + "step": 128540 + }, + { + "epoch": 6.384722360186749, + "grad_norm": 0.1376953125, + "learning_rate": 0.00028922618456342505, + "loss": 0.4918, + "step": 128550 + }, + { + "epoch": 6.385219032482368, + "grad_norm": 0.13671875, + "learning_rate": 0.0002891864507797755, + "loss": 0.5063, + "step": 128560 + }, + { + "epoch": 6.385715704777987, + "grad_norm": 0.1318359375, + "learning_rate": 0.000289146716996126, + "loss": 0.5084, + "step": 128570 + }, + { + "epoch": 6.386212377073607, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002891069832124764, + "loss": 0.4974, + "step": 128580 + }, + { + "epoch": 6.386709049369226, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002890672494288269, + "loss": 0.4743, + "step": 128590 + }, + { + "epoch": 6.387205721664846, + "grad_norm": 0.123046875, + "learning_rate": 0.00028902751564517735, + "loss": 0.4642, + "step": 128600 + }, + { + "epoch": 6.387702393960465, + "grad_norm": 0.12890625, + "learning_rate": 0.00028898778186152777, + "loss": 0.4721, + "step": 128610 + }, + { + "epoch": 6.388199066256084, + "grad_norm": 0.134765625, + "learning_rate": 0.0002889480480778782, + "loss": 0.4628, + "step": 128620 + }, + { + "epoch": 6.388695738551704, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002889083142942287, + "loss": 0.4595, + "step": 128630 + }, + { + "epoch": 6.389192410847323, + "grad_norm": 0.1435546875, + "learning_rate": 0.00028886858051057913, + "loss": 0.5141, + "step": 128640 + }, + { + "epoch": 6.389689083142942, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002888288467269296, + "loss": 0.5169, + "step": 128650 + }, + { + "epoch": 6.390185755438561, + "grad_norm": 0.1494140625, + "learning_rate": 0.00028878911294328, + "loss": 0.489, + "step": 128660 + }, + { + "epoch": 6.390682427734181, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002887493791596305, + "loss": 0.4895, + "step": 128670 + }, + { + "epoch": 6.391179100029801, + "grad_norm": 0.15625, + "learning_rate": 0.00028870964537598096, + "loss": 0.4744, + "step": 128680 + }, + { + "epoch": 6.39167577232542, + "grad_norm": 0.11474609375, + "learning_rate": 0.0002886699115923314, + "loss": 0.464, + "step": 128690 + }, + { + "epoch": 6.392172444621039, + "grad_norm": 0.1572265625, + "learning_rate": 0.00028863017780868185, + "loss": 0.496, + "step": 128700 + }, + { + "epoch": 6.392669116916658, + "grad_norm": 0.130859375, + "learning_rate": 0.0002885904440250323, + "loss": 0.4764, + "step": 128710 + }, + { + "epoch": 6.3931657892122775, + "grad_norm": 0.1552734375, + "learning_rate": 0.00028855071024138274, + "loss": 0.4833, + "step": 128720 + }, + { + "epoch": 6.393662461507897, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002885109764577332, + "loss": 0.4572, + "step": 128730 + }, + { + "epoch": 6.394159133803516, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002884712426740837, + "loss": 0.4848, + "step": 128740 + }, + { + "epoch": 6.394655806099136, + "grad_norm": 0.1240234375, + "learning_rate": 0.0002884315088904341, + "loss": 0.4934, + "step": 128750 + }, + { + "epoch": 6.395152478394755, + "grad_norm": 0.1591796875, + "learning_rate": 0.00028839177510678457, + "loss": 0.5331, + "step": 128760 + }, + { + "epoch": 6.3956491506903745, + "grad_norm": 0.12158203125, + "learning_rate": 0.000288352041323135, + "loss": 0.4802, + "step": 128770 + }, + { + "epoch": 6.396145822985994, + "grad_norm": 0.1416015625, + "learning_rate": 0.00028831230753948546, + "loss": 0.5062, + "step": 128780 + }, + { + "epoch": 6.396642495281613, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002882725737558359, + "loss": 0.5086, + "step": 128790 + }, + { + "epoch": 6.397139167577232, + "grad_norm": 0.1318359375, + "learning_rate": 0.00028823283997218634, + "loss": 0.5039, + "step": 128800 + }, + { + "epoch": 6.3976358398728514, + "grad_norm": 0.126953125, + "learning_rate": 0.0002881931061885368, + "loss": 0.5243, + "step": 128810 + }, + { + "epoch": 6.398132512168472, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002881533724048873, + "loss": 0.4958, + "step": 128820 + }, + { + "epoch": 6.398629184464091, + "grad_norm": 0.125, + "learning_rate": 0.0002881136386212377, + "loss": 0.4818, + "step": 128830 + }, + { + "epoch": 6.39912585675971, + "grad_norm": 0.1328125, + "learning_rate": 0.0002880739048375882, + "loss": 0.4959, + "step": 128840 + }, + { + "epoch": 6.399622529055329, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002880341710539386, + "loss": 0.5146, + "step": 128850 + }, + { + "epoch": 6.4001192013509485, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002879944372702891, + "loss": 0.5259, + "step": 128860 + }, + { + "epoch": 6.400615873646568, + "grad_norm": 0.12890625, + "learning_rate": 0.00028795470348663953, + "loss": 0.4883, + "step": 128870 + }, + { + "epoch": 6.401112545942187, + "grad_norm": 0.12109375, + "learning_rate": 0.00028791496970298995, + "loss": 0.4779, + "step": 128880 + }, + { + "epoch": 6.401609218237807, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002878752359193404, + "loss": 0.5104, + "step": 128890 + }, + { + "epoch": 6.402105890533426, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002878355021356909, + "loss": 0.5174, + "step": 128900 + }, + { + "epoch": 6.4026025628290455, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002877957683520413, + "loss": 0.4908, + "step": 128910 + }, + { + "epoch": 6.403099235124665, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002877560345683918, + "loss": 0.4861, + "step": 128920 + }, + { + "epoch": 6.403595907420284, + "grad_norm": 0.1552734375, + "learning_rate": 0.00028771630078474225, + "loss": 0.4865, + "step": 128930 + }, + { + "epoch": 6.404092579715903, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002876765670010927, + "loss": 0.5096, + "step": 128940 + }, + { + "epoch": 6.404589252011522, + "grad_norm": 0.130859375, + "learning_rate": 0.00028763683321744314, + "loss": 0.5094, + "step": 128950 + }, + { + "epoch": 6.4050859243071425, + "grad_norm": 0.1513671875, + "learning_rate": 0.00028759709943379356, + "loss": 0.4718, + "step": 128960 + }, + { + "epoch": 6.405582596602762, + "grad_norm": 0.13671875, + "learning_rate": 0.0002875573656501441, + "loss": 0.512, + "step": 128970 + }, + { + "epoch": 6.406079268898381, + "grad_norm": 0.166015625, + "learning_rate": 0.0002875176318664945, + "loss": 0.497, + "step": 128980 + }, + { + "epoch": 6.406575941194, + "grad_norm": 0.12060546875, + "learning_rate": 0.0002874778980828449, + "loss": 0.4937, + "step": 128990 + }, + { + "epoch": 6.407072613489619, + "grad_norm": 0.12890625, + "learning_rate": 0.0002874381642991954, + "loss": 0.5073, + "step": 129000 + }, + { + "epoch": 6.407569285785239, + "grad_norm": 0.142578125, + "learning_rate": 0.00028739843051554586, + "loss": 0.5271, + "step": 129010 + }, + { + "epoch": 6.408065958080858, + "grad_norm": 0.1357421875, + "learning_rate": 0.00028735869673189633, + "loss": 0.4926, + "step": 129020 + }, + { + "epoch": 6.408562630376478, + "grad_norm": 0.123046875, + "learning_rate": 0.00028731896294824675, + "loss": 0.4877, + "step": 129030 + }, + { + "epoch": 6.409059302672097, + "grad_norm": 0.16015625, + "learning_rate": 0.0002872792291645972, + "loss": 0.4803, + "step": 129040 + }, + { + "epoch": 6.4095559749677165, + "grad_norm": 0.119140625, + "learning_rate": 0.0002872394953809477, + "loss": 0.476, + "step": 129050 + }, + { + "epoch": 6.410052647263336, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002871997615972981, + "loss": 0.5281, + "step": 129060 + }, + { + "epoch": 6.410549319558955, + "grad_norm": 0.1494140625, + "learning_rate": 0.00028716002781364853, + "loss": 0.4813, + "step": 129070 + }, + { + "epoch": 6.411045991854574, + "grad_norm": 0.146484375, + "learning_rate": 0.000287120294029999, + "loss": 0.4843, + "step": 129080 + }, + { + "epoch": 6.411542664150193, + "grad_norm": 0.1416015625, + "learning_rate": 0.00028708056024634947, + "loss": 0.4956, + "step": 129090 + }, + { + "epoch": 6.4120393364458135, + "grad_norm": 0.1279296875, + "learning_rate": 0.00028704082646269994, + "loss": 0.4909, + "step": 129100 + }, + { + "epoch": 6.412536008741433, + "grad_norm": 0.12255859375, + "learning_rate": 0.00028700109267905036, + "loss": 0.4595, + "step": 129110 + }, + { + "epoch": 6.413032681037052, + "grad_norm": 0.119140625, + "learning_rate": 0.00028696135889540083, + "loss": 0.4983, + "step": 129120 + }, + { + "epoch": 6.413529353332671, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002869216251117513, + "loss": 0.476, + "step": 129130 + }, + { + "epoch": 6.41402602562829, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002868818913281017, + "loss": 0.4897, + "step": 129140 + }, + { + "epoch": 6.41452269792391, + "grad_norm": 0.138671875, + "learning_rate": 0.0002868421575444522, + "loss": 0.4898, + "step": 129150 + }, + { + "epoch": 6.415019370219529, + "grad_norm": 0.130859375, + "learning_rate": 0.00028680242376080266, + "loss": 0.4681, + "step": 129160 + }, + { + "epoch": 6.415516042515149, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002867626899771531, + "loss": 0.4966, + "step": 129170 + }, + { + "epoch": 6.416012714810768, + "grad_norm": 0.1435546875, + "learning_rate": 0.00028672295619350355, + "loss": 0.5034, + "step": 129180 + }, + { + "epoch": 6.416509387106387, + "grad_norm": 0.11865234375, + "learning_rate": 0.00028668322240985397, + "loss": 0.4797, + "step": 129190 + }, + { + "epoch": 6.417006059402007, + "grad_norm": 0.15234375, + "learning_rate": 0.00028664348862620444, + "loss": 0.5159, + "step": 129200 + }, + { + "epoch": 6.417502731697626, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002866037548425549, + "loss": 0.4948, + "step": 129210 + }, + { + "epoch": 6.417999403993245, + "grad_norm": 0.15625, + "learning_rate": 0.00028656402105890533, + "loss": 0.513, + "step": 129220 + }, + { + "epoch": 6.418496076288864, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002865242872752558, + "loss": 0.4909, + "step": 129230 + }, + { + "epoch": 6.4189927485844835, + "grad_norm": 0.14453125, + "learning_rate": 0.00028648455349160627, + "loss": 0.4891, + "step": 129240 + }, + { + "epoch": 6.419489420880104, + "grad_norm": 0.14453125, + "learning_rate": 0.0002864448197079567, + "loss": 0.4745, + "step": 129250 + }, + { + "epoch": 6.419986093175723, + "grad_norm": 0.16015625, + "learning_rate": 0.00028640508592430716, + "loss": 0.4804, + "step": 129260 + }, + { + "epoch": 6.420482765471342, + "grad_norm": 0.130859375, + "learning_rate": 0.00028636535214065763, + "loss": 0.4977, + "step": 129270 + }, + { + "epoch": 6.420979437766961, + "grad_norm": 0.134765625, + "learning_rate": 0.00028632561835700805, + "loss": 0.5057, + "step": 129280 + }, + { + "epoch": 6.421476110062581, + "grad_norm": 0.13671875, + "learning_rate": 0.0002862858845733585, + "loss": 0.5113, + "step": 129290 + }, + { + "epoch": 6.4219727823582, + "grad_norm": 0.12158203125, + "learning_rate": 0.00028624615078970894, + "loss": 0.4901, + "step": 129300 + }, + { + "epoch": 6.422469454653819, + "grad_norm": 0.146484375, + "learning_rate": 0.00028620641700605946, + "loss": 0.5311, + "step": 129310 + }, + { + "epoch": 6.422966126949439, + "grad_norm": 0.15625, + "learning_rate": 0.0002861666832224099, + "loss": 0.4552, + "step": 129320 + }, + { + "epoch": 6.423462799245058, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002861269494387603, + "loss": 0.4717, + "step": 129330 + }, + { + "epoch": 6.423959471540678, + "grad_norm": 0.123046875, + "learning_rate": 0.00028608721565511077, + "loss": 0.4843, + "step": 129340 + }, + { + "epoch": 6.424456143836297, + "grad_norm": 0.1484375, + "learning_rate": 0.00028604748187146124, + "loss": 0.5129, + "step": 129350 + }, + { + "epoch": 6.424952816131916, + "grad_norm": 0.126953125, + "learning_rate": 0.00028600774808781166, + "loss": 0.4949, + "step": 129360 + }, + { + "epoch": 6.425449488427535, + "grad_norm": 0.142578125, + "learning_rate": 0.0002859680143041621, + "loss": 0.4898, + "step": 129370 + }, + { + "epoch": 6.4259461607231545, + "grad_norm": 0.1357421875, + "learning_rate": 0.00028592828052051254, + "loss": 0.4962, + "step": 129380 + }, + { + "epoch": 6.426442833018775, + "grad_norm": 0.1279296875, + "learning_rate": 0.00028588854673686307, + "loss": 0.4867, + "step": 129390 + }, + { + "epoch": 6.426939505314394, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002858488129532135, + "loss": 0.5046, + "step": 129400 + }, + { + "epoch": 6.427436177610013, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002858090791695639, + "loss": 0.4614, + "step": 129410 + }, + { + "epoch": 6.427932849905632, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002857693453859144, + "loss": 0.4749, + "step": 129420 + }, + { + "epoch": 6.4284295222012515, + "grad_norm": 0.12890625, + "learning_rate": 0.00028572961160226485, + "loss": 0.5002, + "step": 129430 + }, + { + "epoch": 6.428926194496871, + "grad_norm": 0.14453125, + "learning_rate": 0.00028568987781861526, + "loss": 0.4991, + "step": 129440 + }, + { + "epoch": 6.42942286679249, + "grad_norm": 0.1298828125, + "learning_rate": 0.00028565014403496574, + "loss": 0.4621, + "step": 129450 + }, + { + "epoch": 6.429919539088109, + "grad_norm": 0.130859375, + "learning_rate": 0.0002856104102513162, + "loss": 0.4931, + "step": 129460 + }, + { + "epoch": 6.430416211383729, + "grad_norm": 0.14453125, + "learning_rate": 0.0002855706764676667, + "loss": 0.4974, + "step": 129470 + }, + { + "epoch": 6.4309128836793485, + "grad_norm": 0.1201171875, + "learning_rate": 0.0002855309426840171, + "loss": 0.4677, + "step": 129480 + }, + { + "epoch": 6.431409555974968, + "grad_norm": 0.142578125, + "learning_rate": 0.0002854912089003675, + "loss": 0.4817, + "step": 129490 + }, + { + "epoch": 6.431906228270587, + "grad_norm": 0.1435546875, + "learning_rate": 0.00028545147511671804, + "loss": 0.4852, + "step": 129500 + }, + { + "epoch": 6.432402900566206, + "grad_norm": 0.15234375, + "learning_rate": 0.00028541174133306845, + "loss": 0.4982, + "step": 129510 + }, + { + "epoch": 6.4328995728618255, + "grad_norm": 0.14453125, + "learning_rate": 0.00028537200754941887, + "loss": 0.5092, + "step": 129520 + }, + { + "epoch": 6.433396245157445, + "grad_norm": 0.150390625, + "learning_rate": 0.00028533227376576934, + "loss": 0.476, + "step": 129530 + }, + { + "epoch": 6.433892917453065, + "grad_norm": 0.12890625, + "learning_rate": 0.0002852925399821198, + "loss": 0.4969, + "step": 129540 + }, + { + "epoch": 6.434389589748684, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002852528061984703, + "loss": 0.4964, + "step": 129550 + }, + { + "epoch": 6.434886262044303, + "grad_norm": 0.150390625, + "learning_rate": 0.0002852130724148207, + "loss": 0.484, + "step": 129560 + }, + { + "epoch": 6.4353829343399225, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002851733386311712, + "loss": 0.4956, + "step": 129570 + }, + { + "epoch": 6.435879606635542, + "grad_norm": 0.1796875, + "learning_rate": 0.00028513360484752165, + "loss": 0.5129, + "step": 129580 + }, + { + "epoch": 6.436376278931161, + "grad_norm": 0.1484375, + "learning_rate": 0.00028509387106387206, + "loss": 0.5242, + "step": 129590 + }, + { + "epoch": 6.43687295122678, + "grad_norm": 0.126953125, + "learning_rate": 0.00028505413728022253, + "loss": 0.4968, + "step": 129600 + }, + { + "epoch": 6.4373696235224, + "grad_norm": 0.1279296875, + "learning_rate": 0.000285014403496573, + "loss": 0.4754, + "step": 129610 + }, + { + "epoch": 6.4378662958180195, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002849746697129234, + "loss": 0.5006, + "step": 129620 + }, + { + "epoch": 6.438362968113639, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002849349359292739, + "loss": 0.4809, + "step": 129630 + }, + { + "epoch": 6.438859640409258, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002848952021456243, + "loss": 0.5065, + "step": 129640 + }, + { + "epoch": 6.439356312704877, + "grad_norm": 0.13671875, + "learning_rate": 0.0002848554683619748, + "loss": 0.5027, + "step": 129650 + }, + { + "epoch": 6.439852985000496, + "grad_norm": 0.1728515625, + "learning_rate": 0.00028481573457832525, + "loss": 0.4714, + "step": 129660 + }, + { + "epoch": 6.440349657296116, + "grad_norm": 0.134765625, + "learning_rate": 0.00028477600079467567, + "loss": 0.5001, + "step": 129670 + }, + { + "epoch": 6.440846329591736, + "grad_norm": 0.1435546875, + "learning_rate": 0.00028473626701102614, + "loss": 0.5064, + "step": 129680 + }, + { + "epoch": 6.441343001887355, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002846965332273766, + "loss": 0.4961, + "step": 129690 + }, + { + "epoch": 6.441839674182974, + "grad_norm": 0.1279296875, + "learning_rate": 0.00028465679944372703, + "loss": 0.5148, + "step": 129700 + }, + { + "epoch": 6.442336346478593, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002846170656600775, + "loss": 0.5455, + "step": 129710 + }, + { + "epoch": 6.442833018774213, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002845773318764279, + "loss": 0.4687, + "step": 129720 + }, + { + "epoch": 6.443329691069832, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002845375980927784, + "loss": 0.4709, + "step": 129730 + }, + { + "epoch": 6.443826363365451, + "grad_norm": 0.12451171875, + "learning_rate": 0.00028449786430912886, + "loss": 0.4912, + "step": 129740 + }, + { + "epoch": 6.444323035661071, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002844581305254793, + "loss": 0.5387, + "step": 129750 + }, + { + "epoch": 6.4448197079566905, + "grad_norm": 0.123046875, + "learning_rate": 0.00028441839674182975, + "loss": 0.4964, + "step": 129760 + }, + { + "epoch": 6.44531638025231, + "grad_norm": 0.15234375, + "learning_rate": 0.0002843786629581802, + "loss": 0.4962, + "step": 129770 + }, + { + "epoch": 6.445813052547929, + "grad_norm": 0.123046875, + "learning_rate": 0.00028433892917453064, + "loss": 0.504, + "step": 129780 + }, + { + "epoch": 6.446309724843548, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002842991953908811, + "loss": 0.5037, + "step": 129790 + }, + { + "epoch": 6.446806397139167, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002842594616072316, + "loss": 0.4845, + "step": 129800 + }, + { + "epoch": 6.447303069434787, + "grad_norm": 0.1376953125, + "learning_rate": 0.000284219727823582, + "loss": 0.5116, + "step": 129810 + }, + { + "epoch": 6.447799741730407, + "grad_norm": 0.1171875, + "learning_rate": 0.00028417999403993247, + "loss": 0.4593, + "step": 129820 + }, + { + "epoch": 6.448296414026026, + "grad_norm": 0.13671875, + "learning_rate": 0.0002841402602562829, + "loss": 0.4668, + "step": 129830 + }, + { + "epoch": 6.448793086321645, + "grad_norm": 0.12353515625, + "learning_rate": 0.0002841005264726334, + "loss": 0.4476, + "step": 129840 + }, + { + "epoch": 6.449289758617264, + "grad_norm": 0.146484375, + "learning_rate": 0.00028406079268898383, + "loss": 0.481, + "step": 129850 + }, + { + "epoch": 6.449786430912884, + "grad_norm": 0.138671875, + "learning_rate": 0.00028402105890533425, + "loss": 0.5138, + "step": 129860 + }, + { + "epoch": 6.450283103208503, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002839813251216847, + "loss": 0.4492, + "step": 129870 + }, + { + "epoch": 6.450779775504122, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002839415913380352, + "loss": 0.4905, + "step": 129880 + }, + { + "epoch": 6.451276447799742, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002839018575543856, + "loss": 0.5089, + "step": 129890 + }, + { + "epoch": 6.451773120095361, + "grad_norm": 0.140625, + "learning_rate": 0.0002838621237707361, + "loss": 0.4971, + "step": 129900 + }, + { + "epoch": 6.452269792390981, + "grad_norm": 0.12255859375, + "learning_rate": 0.00028382238998708655, + "loss": 0.5208, + "step": 129910 + }, + { + "epoch": 6.4527664646866, + "grad_norm": 0.171875, + "learning_rate": 0.000283782656203437, + "loss": 0.4988, + "step": 129920 + }, + { + "epoch": 6.453263136982219, + "grad_norm": 0.134765625, + "learning_rate": 0.00028374292241978744, + "loss": 0.4563, + "step": 129930 + }, + { + "epoch": 6.453759809277838, + "grad_norm": 0.140625, + "learning_rate": 0.00028370318863613786, + "loss": 0.5077, + "step": 129940 + }, + { + "epoch": 6.4542564815734575, + "grad_norm": 0.15625, + "learning_rate": 0.0002836634548524883, + "loss": 0.4954, + "step": 129950 + }, + { + "epoch": 6.454753153869077, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002836237210688388, + "loss": 0.4795, + "step": 129960 + }, + { + "epoch": 6.455249826164697, + "grad_norm": 0.134765625, + "learning_rate": 0.00028358398728518927, + "loss": 0.4732, + "step": 129970 + }, + { + "epoch": 6.455746498460316, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002835442535015397, + "loss": 0.4989, + "step": 129980 + }, + { + "epoch": 6.456243170755935, + "grad_norm": 0.1337890625, + "learning_rate": 0.00028350451971789016, + "loss": 0.4869, + "step": 129990 + }, + { + "epoch": 6.456739843051555, + "grad_norm": 0.171875, + "learning_rate": 0.00028346478593424063, + "loss": 0.488, + "step": 130000 + }, + { + "epoch": 6.457236515347174, + "grad_norm": 0.11767578125, + "learning_rate": 0.00028342505215059105, + "loss": 0.4937, + "step": 130010 + }, + { + "epoch": 6.457733187642793, + "grad_norm": 0.1416015625, + "learning_rate": 0.00028338531836694146, + "loss": 0.5094, + "step": 130020 + }, + { + "epoch": 6.458229859938412, + "grad_norm": 0.1376953125, + "learning_rate": 0.000283345584583292, + "loss": 0.4815, + "step": 130030 + }, + { + "epoch": 6.458726532234032, + "grad_norm": 0.130859375, + "learning_rate": 0.0002833058507996424, + "loss": 0.4699, + "step": 130040 + }, + { + "epoch": 6.459223204529652, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002832661170159929, + "loss": 0.4936, + "step": 130050 + }, + { + "epoch": 6.459719876825271, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002832263832323433, + "loss": 0.5095, + "step": 130060 + }, + { + "epoch": 6.46021654912089, + "grad_norm": 0.13671875, + "learning_rate": 0.00028318664944869377, + "loss": 0.4635, + "step": 130070 + }, + { + "epoch": 6.460713221416509, + "grad_norm": 0.1357421875, + "learning_rate": 0.00028314691566504424, + "loss": 0.5229, + "step": 130080 + }, + { + "epoch": 6.4612098937121285, + "grad_norm": 0.1181640625, + "learning_rate": 0.00028310718188139466, + "loss": 0.4524, + "step": 130090 + }, + { + "epoch": 6.461706566007748, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002830674480977451, + "loss": 0.4919, + "step": 130100 + }, + { + "epoch": 6.462203238303368, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002830277143140956, + "loss": 0.4804, + "step": 130110 + }, + { + "epoch": 6.462699910598987, + "grad_norm": 0.14453125, + "learning_rate": 0.000282987980530446, + "loss": 0.4807, + "step": 130120 + }, + { + "epoch": 6.463196582894606, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002829482467467965, + "loss": 0.4682, + "step": 130130 + }, + { + "epoch": 6.4636932551902255, + "grad_norm": 0.1416015625, + "learning_rate": 0.00028290851296314696, + "loss": 0.4976, + "step": 130140 + }, + { + "epoch": 6.464189927485845, + "grad_norm": 0.12890625, + "learning_rate": 0.0002828687791794974, + "loss": 0.5323, + "step": 130150 + }, + { + "epoch": 6.464686599781464, + "grad_norm": 0.1357421875, + "learning_rate": 0.00028282904539584785, + "loss": 0.483, + "step": 130160 + }, + { + "epoch": 6.465183272077083, + "grad_norm": 0.138671875, + "learning_rate": 0.00028278931161219826, + "loss": 0.5056, + "step": 130170 + }, + { + "epoch": 6.465679944372702, + "grad_norm": 0.1357421875, + "learning_rate": 0.00028274957782854873, + "loss": 0.5006, + "step": 130180 + }, + { + "epoch": 6.4661766166683226, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002827098440448992, + "loss": 0.4996, + "step": 130190 + }, + { + "epoch": 6.466673288963942, + "grad_norm": 0.2060546875, + "learning_rate": 0.0002826701102612496, + "loss": 0.5152, + "step": 130200 + }, + { + "epoch": 6.467169961259561, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002826303764776001, + "loss": 0.5057, + "step": 130210 + }, + { + "epoch": 6.46766663355518, + "grad_norm": 0.1552734375, + "learning_rate": 0.00028259064269395057, + "loss": 0.5236, + "step": 130220 + }, + { + "epoch": 6.4681633058507995, + "grad_norm": 0.12060546875, + "learning_rate": 0.000282550908910301, + "loss": 0.4987, + "step": 130230 + }, + { + "epoch": 6.468659978146419, + "grad_norm": 0.1318359375, + "learning_rate": 0.00028251117512665145, + "loss": 0.5053, + "step": 130240 + }, + { + "epoch": 6.469156650442038, + "grad_norm": 0.13671875, + "learning_rate": 0.00028247144134300187, + "loss": 0.4914, + "step": 130250 + }, + { + "epoch": 6.469653322737658, + "grad_norm": 0.1591796875, + "learning_rate": 0.00028243170755935234, + "loss": 0.4776, + "step": 130260 + }, + { + "epoch": 6.470149995033277, + "grad_norm": 0.134765625, + "learning_rate": 0.0002823919737757028, + "loss": 0.4943, + "step": 130270 + }, + { + "epoch": 6.4706466673288965, + "grad_norm": 0.154296875, + "learning_rate": 0.00028235223999205323, + "loss": 0.4928, + "step": 130280 + }, + { + "epoch": 6.471143339624516, + "grad_norm": 0.140625, + "learning_rate": 0.0002823125062084037, + "loss": 0.4775, + "step": 130290 + }, + { + "epoch": 6.471640011920135, + "grad_norm": 0.162109375, + "learning_rate": 0.0002822727724247542, + "loss": 0.4926, + "step": 130300 + }, + { + "epoch": 6.472136684215754, + "grad_norm": 0.119140625, + "learning_rate": 0.0002822330386411046, + "loss": 0.5075, + "step": 130310 + }, + { + "epoch": 6.472633356511373, + "grad_norm": 0.19140625, + "learning_rate": 0.00028219330485745506, + "loss": 0.5062, + "step": 130320 + }, + { + "epoch": 6.4731300288069935, + "grad_norm": 0.14453125, + "learning_rate": 0.00028215357107380553, + "loss": 0.4768, + "step": 130330 + }, + { + "epoch": 6.473626701102613, + "grad_norm": 0.1259765625, + "learning_rate": 0.00028211383729015595, + "loss": 0.4855, + "step": 130340 + }, + { + "epoch": 6.474123373398232, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002820741035065064, + "loss": 0.4859, + "step": 130350 + }, + { + "epoch": 6.474620045693851, + "grad_norm": 0.1220703125, + "learning_rate": 0.00028203436972285684, + "loss": 0.4693, + "step": 130360 + }, + { + "epoch": 6.47511671798947, + "grad_norm": 0.1259765625, + "learning_rate": 0.00028199463593920737, + "loss": 0.4863, + "step": 130370 + }, + { + "epoch": 6.47561339028509, + "grad_norm": 0.13671875, + "learning_rate": 0.0002819549021555578, + "loss": 0.5194, + "step": 130380 + }, + { + "epoch": 6.476110062580709, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002819151683719082, + "loss": 0.4944, + "step": 130390 + }, + { + "epoch": 6.476606734876329, + "grad_norm": 0.138671875, + "learning_rate": 0.00028187543458825867, + "loss": 0.4976, + "step": 130400 + }, + { + "epoch": 6.477103407171948, + "grad_norm": 0.130859375, + "learning_rate": 0.00028183570080460914, + "loss": 0.4932, + "step": 130410 + }, + { + "epoch": 6.477600079467567, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002817959670209596, + "loss": 0.4907, + "step": 130420 + }, + { + "epoch": 6.478096751763187, + "grad_norm": 0.154296875, + "learning_rate": 0.00028175623323731003, + "loss": 0.4963, + "step": 130430 + }, + { + "epoch": 6.478593424058806, + "grad_norm": 0.150390625, + "learning_rate": 0.0002817164994536605, + "loss": 0.4813, + "step": 130440 + }, + { + "epoch": 6.479090096354425, + "grad_norm": 0.12890625, + "learning_rate": 0.000281676765670011, + "loss": 0.4851, + "step": 130450 + }, + { + "epoch": 6.479586768650044, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002816370318863614, + "loss": 0.4949, + "step": 130460 + }, + { + "epoch": 6.4800834409456645, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002815972981027118, + "loss": 0.5014, + "step": 130470 + }, + { + "epoch": 6.480580113241284, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002815575643190623, + "loss": 0.4666, + "step": 130480 + }, + { + "epoch": 6.481076785536903, + "grad_norm": 0.15234375, + "learning_rate": 0.00028151783053541275, + "loss": 0.478, + "step": 130490 + }, + { + "epoch": 6.481573457832522, + "grad_norm": 0.11083984375, + "learning_rate": 0.0002814780967517632, + "loss": 0.4738, + "step": 130500 + }, + { + "epoch": 6.482070130128141, + "grad_norm": 0.1435546875, + "learning_rate": 0.00028143836296811364, + "loss": 0.4727, + "step": 130510 + }, + { + "epoch": 6.482566802423761, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002813986291844641, + "loss": 0.4706, + "step": 130520 + }, + { + "epoch": 6.48306347471938, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002813588954008146, + "loss": 0.5028, + "step": 130530 + }, + { + "epoch": 6.483560147015, + "grad_norm": 0.1337890625, + "learning_rate": 0.000281319161617165, + "loss": 0.4949, + "step": 130540 + }, + { + "epoch": 6.484056819310619, + "grad_norm": 0.11962890625, + "learning_rate": 0.0002812794278335154, + "loss": 0.4685, + "step": 130550 + }, + { + "epoch": 6.484553491606238, + "grad_norm": 0.1806640625, + "learning_rate": 0.00028123969404986594, + "loss": 0.4842, + "step": 130560 + }, + { + "epoch": 6.485050163901858, + "grad_norm": 0.150390625, + "learning_rate": 0.00028119996026621636, + "loss": 0.4985, + "step": 130570 + }, + { + "epoch": 6.485546836197477, + "grad_norm": 0.1396484375, + "learning_rate": 0.00028116022648256683, + "loss": 0.5261, + "step": 130580 + }, + { + "epoch": 6.486043508493096, + "grad_norm": 0.142578125, + "learning_rate": 0.00028112049269891725, + "loss": 0.5217, + "step": 130590 + }, + { + "epoch": 6.486540180788715, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002810807589152677, + "loss": 0.4858, + "step": 130600 + }, + { + "epoch": 6.487036853084335, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002810410251316182, + "loss": 0.4955, + "step": 130610 + }, + { + "epoch": 6.487533525379955, + "grad_norm": 0.138671875, + "learning_rate": 0.0002810012913479686, + "loss": 0.5115, + "step": 130620 + }, + { + "epoch": 6.488030197675574, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002809615575643191, + "loss": 0.5005, + "step": 130630 + }, + { + "epoch": 6.488526869971193, + "grad_norm": 0.1416015625, + "learning_rate": 0.00028092182378066955, + "loss": 0.5045, + "step": 130640 + }, + { + "epoch": 6.489023542266812, + "grad_norm": 0.162109375, + "learning_rate": 0.00028088208999701997, + "loss": 0.5191, + "step": 130650 + }, + { + "epoch": 6.4895202145624316, + "grad_norm": 0.1298828125, + "learning_rate": 0.00028084235621337044, + "loss": 0.5194, + "step": 130660 + }, + { + "epoch": 6.490016886858051, + "grad_norm": 0.140625, + "learning_rate": 0.0002808026224297209, + "loss": 0.5083, + "step": 130670 + }, + { + "epoch": 6.49051355915367, + "grad_norm": 0.140625, + "learning_rate": 0.0002807628886460713, + "loss": 0.4884, + "step": 130680 + }, + { + "epoch": 6.49101023144929, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002807231548624218, + "loss": 0.4933, + "step": 130690 + }, + { + "epoch": 6.491506903744909, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002806834210787722, + "loss": 0.5009, + "step": 130700 + }, + { + "epoch": 6.492003576040529, + "grad_norm": 0.1220703125, + "learning_rate": 0.0002806436872951227, + "loss": 0.4687, + "step": 130710 + }, + { + "epoch": 6.492500248336148, + "grad_norm": 0.1298828125, + "learning_rate": 0.00028060395351147316, + "loss": 0.485, + "step": 130720 + }, + { + "epoch": 6.492996920631767, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002805642197278236, + "loss": 0.4935, + "step": 130730 + }, + { + "epoch": 6.493493592927386, + "grad_norm": 0.1376953125, + "learning_rate": 0.00028052448594417405, + "loss": 0.502, + "step": 130740 + }, + { + "epoch": 6.4939902652230055, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002804847521605245, + "loss": 0.4907, + "step": 130750 + }, + { + "epoch": 6.494486937518626, + "grad_norm": 0.134765625, + "learning_rate": 0.00028044501837687493, + "loss": 0.5057, + "step": 130760 + }, + { + "epoch": 6.494983609814245, + "grad_norm": 0.15625, + "learning_rate": 0.0002804052845932254, + "loss": 0.5142, + "step": 130770 + }, + { + "epoch": 6.495480282109864, + "grad_norm": 0.185546875, + "learning_rate": 0.0002803655508095758, + "loss": 0.4872, + "step": 130780 + }, + { + "epoch": 6.495976954405483, + "grad_norm": 0.1279296875, + "learning_rate": 0.00028032581702592635, + "loss": 0.4561, + "step": 130790 + }, + { + "epoch": 6.4964736267011025, + "grad_norm": 0.12353515625, + "learning_rate": 0.00028028608324227677, + "loss": 0.5, + "step": 130800 + }, + { + "epoch": 6.496970298996722, + "grad_norm": 0.138671875, + "learning_rate": 0.0002802463494586272, + "loss": 0.4935, + "step": 130810 + }, + { + "epoch": 6.497466971292341, + "grad_norm": 0.119140625, + "learning_rate": 0.00028020661567497765, + "loss": 0.4962, + "step": 130820 + }, + { + "epoch": 6.49796364358796, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002801668818913281, + "loss": 0.4927, + "step": 130830 + }, + { + "epoch": 6.49846031588358, + "grad_norm": 0.15234375, + "learning_rate": 0.00028012714810767854, + "loss": 0.4761, + "step": 130840 + }, + { + "epoch": 6.4989569881791995, + "grad_norm": 0.1279296875, + "learning_rate": 0.000280087414324029, + "loss": 0.4948, + "step": 130850 + }, + { + "epoch": 6.499453660474819, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002800476805403795, + "loss": 0.4841, + "step": 130860 + }, + { + "epoch": 6.499950332770438, + "grad_norm": 0.1728515625, + "learning_rate": 0.00028000794675672996, + "loss": 0.5016, + "step": 130870 + }, + { + "epoch": 6.500447005066057, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002799682129730804, + "loss": 0.5021, + "step": 130880 + }, + { + "epoch": 6.500943677361676, + "grad_norm": 0.158203125, + "learning_rate": 0.0002799284791894308, + "loss": 0.4649, + "step": 130890 + }, + { + "epoch": 6.501440349657296, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002798887454057813, + "loss": 0.527, + "step": 130900 + }, + { + "epoch": 6.501937021952916, + "grad_norm": 0.126953125, + "learning_rate": 0.00027984901162213173, + "loss": 0.4981, + "step": 130910 + }, + { + "epoch": 6.502433694248535, + "grad_norm": 0.1328125, + "learning_rate": 0.00027980927783848215, + "loss": 0.4976, + "step": 130920 + }, + { + "epoch": 6.502930366544154, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002797695440548326, + "loss": 0.4933, + "step": 130930 + }, + { + "epoch": 6.5034270388397735, + "grad_norm": 0.1484375, + "learning_rate": 0.0002797298102711831, + "loss": 0.5107, + "step": 130940 + }, + { + "epoch": 6.503923711135393, + "grad_norm": 0.1416015625, + "learning_rate": 0.00027969007648753357, + "loss": 0.5004, + "step": 130950 + }, + { + "epoch": 6.504420383431012, + "grad_norm": 0.12255859375, + "learning_rate": 0.000279650342703884, + "loss": 0.4815, + "step": 130960 + }, + { + "epoch": 6.504917055726631, + "grad_norm": 0.1279296875, + "learning_rate": 0.00027961060892023445, + "loss": 0.4926, + "step": 130970 + }, + { + "epoch": 6.505413728022251, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002795708751365849, + "loss": 0.4914, + "step": 130980 + }, + { + "epoch": 6.5059104003178705, + "grad_norm": 0.11865234375, + "learning_rate": 0.00027953114135293534, + "loss": 0.4831, + "step": 130990 + }, + { + "epoch": 6.50640707261349, + "grad_norm": 0.14453125, + "learning_rate": 0.00027949140756928576, + "loss": 0.4801, + "step": 131000 + }, + { + "epoch": 6.506903744909109, + "grad_norm": 0.189453125, + "learning_rate": 0.0002794516737856363, + "loss": 0.5064, + "step": 131010 + }, + { + "epoch": 6.507400417204728, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002794119400019867, + "loss": 0.4795, + "step": 131020 + }, + { + "epoch": 6.507897089500347, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002793722062183372, + "loss": 0.4814, + "step": 131030 + }, + { + "epoch": 6.508393761795967, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002793324724346876, + "loss": 0.4836, + "step": 131040 + }, + { + "epoch": 6.508890434091587, + "grad_norm": 0.1357421875, + "learning_rate": 0.00027929273865103806, + "loss": 0.4854, + "step": 131050 + }, + { + "epoch": 6.509387106387206, + "grad_norm": 0.140625, + "learning_rate": 0.00027925300486738853, + "loss": 0.4736, + "step": 131060 + }, + { + "epoch": 6.509883778682825, + "grad_norm": 0.1337890625, + "learning_rate": 0.00027921327108373895, + "loss": 0.4926, + "step": 131070 + }, + { + "epoch": 6.510380450978444, + "grad_norm": 0.1484375, + "learning_rate": 0.00027917353730008937, + "loss": 0.487, + "step": 131080 + }, + { + "epoch": 6.510877123274064, + "grad_norm": 0.12890625, + "learning_rate": 0.0002791338035164399, + "loss": 0.4976, + "step": 131090 + }, + { + "epoch": 6.511373795569683, + "grad_norm": 0.130859375, + "learning_rate": 0.0002790940697327903, + "loss": 0.4842, + "step": 131100 + }, + { + "epoch": 6.511870467865302, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002790543359491408, + "loss": 0.4664, + "step": 131110 + }, + { + "epoch": 6.512367140160922, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002790146021654912, + "loss": 0.4964, + "step": 131120 + }, + { + "epoch": 6.5128638124565414, + "grad_norm": 0.1455078125, + "learning_rate": 0.00027897486838184167, + "loss": 0.5052, + "step": 131130 + }, + { + "epoch": 6.513360484752161, + "grad_norm": 0.1396484375, + "learning_rate": 0.00027893513459819214, + "loss": 0.4974, + "step": 131140 + }, + { + "epoch": 6.51385715704778, + "grad_norm": 0.13671875, + "learning_rate": 0.00027889540081454256, + "loss": 0.4822, + "step": 131150 + }, + { + "epoch": 6.514353829343399, + "grad_norm": 0.1435546875, + "learning_rate": 0.00027885566703089303, + "loss": 0.5037, + "step": 131160 + }, + { + "epoch": 6.514850501639018, + "grad_norm": 0.130859375, + "learning_rate": 0.0002788159332472435, + "loss": 0.4931, + "step": 131170 + }, + { + "epoch": 6.515347173934638, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002787761994635939, + "loss": 0.501, + "step": 131180 + }, + { + "epoch": 6.515843846230258, + "grad_norm": 0.13671875, + "learning_rate": 0.0002787364656799444, + "loss": 0.4913, + "step": 131190 + }, + { + "epoch": 6.516340518525877, + "grad_norm": 0.134765625, + "learning_rate": 0.00027869673189629486, + "loss": 0.5102, + "step": 131200 + }, + { + "epoch": 6.516837190821496, + "grad_norm": 0.130859375, + "learning_rate": 0.0002786569981126453, + "loss": 0.5017, + "step": 131210 + }, + { + "epoch": 6.517333863117115, + "grad_norm": 0.1337890625, + "learning_rate": 0.00027861726432899575, + "loss": 0.5112, + "step": 131220 + }, + { + "epoch": 6.517830535412735, + "grad_norm": 0.1328125, + "learning_rate": 0.00027857753054534617, + "loss": 0.4644, + "step": 131230 + }, + { + "epoch": 6.518327207708354, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002785377967616967, + "loss": 0.48, + "step": 131240 + }, + { + "epoch": 6.518823880003973, + "grad_norm": 0.130859375, + "learning_rate": 0.0002784980629780471, + "loss": 0.4718, + "step": 131250 + }, + { + "epoch": 6.519320552299593, + "grad_norm": 0.130859375, + "learning_rate": 0.0002784583291943975, + "loss": 0.4539, + "step": 131260 + }, + { + "epoch": 6.519817224595212, + "grad_norm": 0.1357421875, + "learning_rate": 0.000278418595410748, + "loss": 0.5051, + "step": 131270 + }, + { + "epoch": 6.520313896890832, + "grad_norm": 0.13671875, + "learning_rate": 0.00027837886162709847, + "loss": 0.5004, + "step": 131280 + }, + { + "epoch": 6.520810569186451, + "grad_norm": 0.14453125, + "learning_rate": 0.0002783391278434489, + "loss": 0.4924, + "step": 131290 + }, + { + "epoch": 6.52130724148207, + "grad_norm": 0.1279296875, + "learning_rate": 0.00027829939405979936, + "loss": 0.4745, + "step": 131300 + }, + { + "epoch": 6.521803913777689, + "grad_norm": 0.12890625, + "learning_rate": 0.00027825966027614983, + "loss": 0.489, + "step": 131310 + }, + { + "epoch": 6.5223005860733085, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002782199264925003, + "loss": 0.5139, + "step": 131320 + }, + { + "epoch": 6.522797258368929, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002781801927088507, + "loss": 0.5139, + "step": 131330 + }, + { + "epoch": 6.523293930664548, + "grad_norm": 0.12353515625, + "learning_rate": 0.00027814045892520114, + "loss": 0.4829, + "step": 131340 + }, + { + "epoch": 6.523790602960167, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002781007251415516, + "loss": 0.4999, + "step": 131350 + }, + { + "epoch": 6.524287275255786, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002780609913579021, + "loss": 0.4658, + "step": 131360 + }, + { + "epoch": 6.524783947551406, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002780212575742525, + "loss": 0.4999, + "step": 131370 + }, + { + "epoch": 6.525280619847025, + "grad_norm": 0.146484375, + "learning_rate": 0.00027798152379060297, + "loss": 0.4937, + "step": 131380 + }, + { + "epoch": 6.525777292142644, + "grad_norm": 0.15234375, + "learning_rate": 0.00027794179000695344, + "loss": 0.4633, + "step": 131390 + }, + { + "epoch": 6.526273964438264, + "grad_norm": 0.154296875, + "learning_rate": 0.0002779020562233039, + "loss": 0.4726, + "step": 131400 + }, + { + "epoch": 6.526770636733883, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002778623224396543, + "loss": 0.5023, + "step": 131410 + }, + { + "epoch": 6.527267309029503, + "grad_norm": 0.140625, + "learning_rate": 0.00027782258865600474, + "loss": 0.4984, + "step": 131420 + }, + { + "epoch": 6.527763981325122, + "grad_norm": 0.13671875, + "learning_rate": 0.00027778285487235527, + "loss": 0.5, + "step": 131430 + }, + { + "epoch": 6.528260653620741, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002777431210887057, + "loss": 0.4708, + "step": 131440 + }, + { + "epoch": 6.52875732591636, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002777033873050561, + "loss": 0.4609, + "step": 131450 + }, + { + "epoch": 6.5292539982119795, + "grad_norm": 0.1484375, + "learning_rate": 0.0002776636535214066, + "loss": 0.4748, + "step": 131460 + }, + { + "epoch": 6.529750670507599, + "grad_norm": 0.1318359375, + "learning_rate": 0.00027762391973775705, + "loss": 0.5029, + "step": 131470 + }, + { + "epoch": 6.530247342803218, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002775841859541075, + "loss": 0.4988, + "step": 131480 + }, + { + "epoch": 6.530744015098838, + "grad_norm": 0.1337890625, + "learning_rate": 0.00027754445217045793, + "loss": 0.5001, + "step": 131490 + }, + { + "epoch": 6.531240687394457, + "grad_norm": 0.134765625, + "learning_rate": 0.0002775047183868084, + "loss": 0.4659, + "step": 131500 + }, + { + "epoch": 6.5317373596900765, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002774649846031589, + "loss": 0.4866, + "step": 131510 + }, + { + "epoch": 6.532234031985696, + "grad_norm": 0.134765625, + "learning_rate": 0.0002774252508195093, + "loss": 0.4882, + "step": 131520 + }, + { + "epoch": 6.532730704281315, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002773855170358597, + "loss": 0.4618, + "step": 131530 + }, + { + "epoch": 6.533227376576934, + "grad_norm": 0.140625, + "learning_rate": 0.00027734578325221024, + "loss": 0.4812, + "step": 131540 + }, + { + "epoch": 6.533724048872553, + "grad_norm": 0.1552734375, + "learning_rate": 0.00027730604946856065, + "loss": 0.4814, + "step": 131550 + }, + { + "epoch": 6.5342207211681735, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002772663156849111, + "loss": 0.4608, + "step": 131560 + }, + { + "epoch": 6.534717393463793, + "grad_norm": 0.1240234375, + "learning_rate": 0.00027722658190126154, + "loss": 0.4921, + "step": 131570 + }, + { + "epoch": 6.535214065759412, + "grad_norm": 0.1259765625, + "learning_rate": 0.000277186848117612, + "loss": 0.482, + "step": 131580 + }, + { + "epoch": 6.535710738055031, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002771471143339625, + "loss": 0.4734, + "step": 131590 + }, + { + "epoch": 6.5362074103506504, + "grad_norm": 0.126953125, + "learning_rate": 0.0002771073805503129, + "loss": 0.4999, + "step": 131600 + }, + { + "epoch": 6.53670408264627, + "grad_norm": 0.1640625, + "learning_rate": 0.0002770676467666634, + "loss": 0.4949, + "step": 131610 + }, + { + "epoch": 6.537200754941889, + "grad_norm": 0.12158203125, + "learning_rate": 0.00027702791298301385, + "loss": 0.5104, + "step": 131620 + }, + { + "epoch": 6.537697427237509, + "grad_norm": 0.134765625, + "learning_rate": 0.00027698817919936426, + "loss": 0.5106, + "step": 131630 + }, + { + "epoch": 6.538194099533128, + "grad_norm": 0.1318359375, + "learning_rate": 0.00027694844541571473, + "loss": 0.4902, + "step": 131640 + }, + { + "epoch": 6.5386907718287475, + "grad_norm": 0.15234375, + "learning_rate": 0.00027690871163206515, + "loss": 0.4874, + "step": 131650 + }, + { + "epoch": 6.539187444124367, + "grad_norm": 0.134765625, + "learning_rate": 0.0002768689778484156, + "loss": 0.4876, + "step": 131660 + }, + { + "epoch": 6.539684116419986, + "grad_norm": 0.142578125, + "learning_rate": 0.0002768292440647661, + "loss": 0.5096, + "step": 131670 + }, + { + "epoch": 6.540180788715605, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002767895102811165, + "loss": 0.4685, + "step": 131680 + }, + { + "epoch": 6.540677461011224, + "grad_norm": 0.12451171875, + "learning_rate": 0.000276749776497467, + "loss": 0.5019, + "step": 131690 + }, + { + "epoch": 6.5411741333068445, + "grad_norm": 0.1357421875, + "learning_rate": 0.00027671004271381745, + "loss": 0.5112, + "step": 131700 + }, + { + "epoch": 6.541670805602464, + "grad_norm": 0.1357421875, + "learning_rate": 0.00027667030893016787, + "loss": 0.4696, + "step": 131710 + }, + { + "epoch": 6.542167477898083, + "grad_norm": 0.1435546875, + "learning_rate": 0.00027663057514651834, + "loss": 0.5079, + "step": 131720 + }, + { + "epoch": 6.542664150193702, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002765908413628688, + "loss": 0.4758, + "step": 131730 + }, + { + "epoch": 6.543160822489321, + "grad_norm": 0.12255859375, + "learning_rate": 0.00027655110757921923, + "loss": 0.5078, + "step": 131740 + }, + { + "epoch": 6.543657494784941, + "grad_norm": 0.12890625, + "learning_rate": 0.0002765113737955697, + "loss": 0.4915, + "step": 131750 + }, + { + "epoch": 6.54415416708056, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002764716400119201, + "loss": 0.4816, + "step": 131760 + }, + { + "epoch": 6.54465083937618, + "grad_norm": 0.1298828125, + "learning_rate": 0.00027643190622827064, + "loss": 0.4976, + "step": 131770 + }, + { + "epoch": 6.545147511671799, + "grad_norm": 0.1484375, + "learning_rate": 0.00027639217244462106, + "loss": 0.5056, + "step": 131780 + }, + { + "epoch": 6.545644183967418, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002763524386609715, + "loss": 0.4844, + "step": 131790 + }, + { + "epoch": 6.546140856263038, + "grad_norm": 0.138671875, + "learning_rate": 0.00027631270487732195, + "loss": 0.5009, + "step": 131800 + }, + { + "epoch": 6.546637528558657, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002762729710936724, + "loss": 0.5147, + "step": 131810 + }, + { + "epoch": 6.547134200854276, + "grad_norm": 0.1328125, + "learning_rate": 0.00027623323731002284, + "loss": 0.472, + "step": 131820 + }, + { + "epoch": 6.547630873149895, + "grad_norm": 0.15625, + "learning_rate": 0.0002761935035263733, + "loss": 0.512, + "step": 131830 + }, + { + "epoch": 6.5481275454455155, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002761537697427238, + "loss": 0.5148, + "step": 131840 + }, + { + "epoch": 6.548624217741135, + "grad_norm": 0.1318359375, + "learning_rate": 0.00027611403595907425, + "loss": 0.4977, + "step": 131850 + }, + { + "epoch": 6.549120890036754, + "grad_norm": 0.142578125, + "learning_rate": 0.00027607430217542467, + "loss": 0.4998, + "step": 131860 + }, + { + "epoch": 6.549617562332373, + "grad_norm": 0.12890625, + "learning_rate": 0.0002760345683917751, + "loss": 0.5116, + "step": 131870 + }, + { + "epoch": 6.550114234627992, + "grad_norm": 0.13671875, + "learning_rate": 0.00027599483460812556, + "loss": 0.5357, + "step": 131880 + }, + { + "epoch": 6.550610906923612, + "grad_norm": 0.1416015625, + "learning_rate": 0.00027595510082447603, + "loss": 0.4937, + "step": 131890 + }, + { + "epoch": 6.551107579219231, + "grad_norm": 0.1455078125, + "learning_rate": 0.00027591536704082645, + "loss": 0.5095, + "step": 131900 + }, + { + "epoch": 6.551604251514851, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002758756332571769, + "loss": 0.49, + "step": 131910 + }, + { + "epoch": 6.55210092381047, + "grad_norm": 0.12890625, + "learning_rate": 0.0002758358994735274, + "loss": 0.5151, + "step": 131920 + }, + { + "epoch": 6.552597596106089, + "grad_norm": 0.146484375, + "learning_rate": 0.00027579616568987786, + "loss": 0.4589, + "step": 131930 + }, + { + "epoch": 6.553094268401709, + "grad_norm": 0.111328125, + "learning_rate": 0.0002757564319062283, + "loss": 0.4864, + "step": 131940 + }, + { + "epoch": 6.553590940697328, + "grad_norm": 0.134765625, + "learning_rate": 0.0002757166981225787, + "loss": 0.4945, + "step": 131950 + }, + { + "epoch": 6.554087612992947, + "grad_norm": 0.11474609375, + "learning_rate": 0.0002756769643389292, + "loss": 0.4959, + "step": 131960 + }, + { + "epoch": 6.554584285288566, + "grad_norm": 0.140625, + "learning_rate": 0.00027563723055527964, + "loss": 0.4824, + "step": 131970 + }, + { + "epoch": 6.555080957584186, + "grad_norm": 0.1298828125, + "learning_rate": 0.00027559749677163005, + "loss": 0.5406, + "step": 131980 + }, + { + "epoch": 6.555577629879806, + "grad_norm": 0.140625, + "learning_rate": 0.0002755577629879805, + "loss": 0.4884, + "step": 131990 + }, + { + "epoch": 6.556074302175425, + "grad_norm": 0.1484375, + "learning_rate": 0.000275518029204331, + "loss": 0.4944, + "step": 132000 + }, + { + "epoch": 6.556570974471044, + "grad_norm": 0.1533203125, + "learning_rate": 0.00027547829542068147, + "loss": 0.49, + "step": 132010 + }, + { + "epoch": 6.557067646766663, + "grad_norm": 0.138671875, + "learning_rate": 0.0002754385616370319, + "loss": 0.5078, + "step": 132020 + }, + { + "epoch": 6.5575643190622825, + "grad_norm": 0.18359375, + "learning_rate": 0.00027539882785338236, + "loss": 0.4808, + "step": 132030 + }, + { + "epoch": 6.558060991357902, + "grad_norm": 0.140625, + "learning_rate": 0.00027535909406973283, + "loss": 0.5131, + "step": 132040 + }, + { + "epoch": 6.558557663653522, + "grad_norm": 0.1337890625, + "learning_rate": 0.00027531936028608325, + "loss": 0.5028, + "step": 132050 + }, + { + "epoch": 6.559054335949141, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002752796265024337, + "loss": 0.4891, + "step": 132060 + }, + { + "epoch": 6.55955100824476, + "grad_norm": 0.134765625, + "learning_rate": 0.0002752398927187842, + "loss": 0.4724, + "step": 132070 + }, + { + "epoch": 6.56004768054038, + "grad_norm": 0.13671875, + "learning_rate": 0.0002752001589351346, + "loss": 0.5097, + "step": 132080 + }, + { + "epoch": 6.560544352835999, + "grad_norm": 0.1328125, + "learning_rate": 0.0002751604251514851, + "loss": 0.4934, + "step": 132090 + }, + { + "epoch": 6.561041025131618, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002751206913678355, + "loss": 0.4868, + "step": 132100 + }, + { + "epoch": 6.561537697427237, + "grad_norm": 0.134765625, + "learning_rate": 0.00027508095758418597, + "loss": 0.4855, + "step": 132110 + }, + { + "epoch": 6.5620343697228565, + "grad_norm": 0.12890625, + "learning_rate": 0.00027504122380053644, + "loss": 0.4691, + "step": 132120 + }, + { + "epoch": 6.562531042018477, + "grad_norm": 0.12890625, + "learning_rate": 0.00027500149001688685, + "loss": 0.4996, + "step": 132130 + }, + { + "epoch": 6.563027714314096, + "grad_norm": 0.150390625, + "learning_rate": 0.0002749617562332373, + "loss": 0.5151, + "step": 132140 + }, + { + "epoch": 6.563524386609715, + "grad_norm": 0.13671875, + "learning_rate": 0.0002749220224495878, + "loss": 0.4846, + "step": 132150 + }, + { + "epoch": 6.564021058905334, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002748822886659382, + "loss": 0.493, + "step": 132160 + }, + { + "epoch": 6.5645177312009535, + "grad_norm": 0.154296875, + "learning_rate": 0.0002748425548822887, + "loss": 0.4981, + "step": 132170 + }, + { + "epoch": 6.565014403496573, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002748028210986391, + "loss": 0.4957, + "step": 132180 + }, + { + "epoch": 6.565511075792192, + "grad_norm": 0.1171875, + "learning_rate": 0.0002747630873149896, + "loss": 0.476, + "step": 132190 + }, + { + "epoch": 6.566007748087811, + "grad_norm": 0.1318359375, + "learning_rate": 0.00027472335353134005, + "loss": 0.4689, + "step": 132200 + }, + { + "epoch": 6.566504420383431, + "grad_norm": 0.1357421875, + "learning_rate": 0.00027468361974769046, + "loss": 0.5053, + "step": 132210 + }, + { + "epoch": 6.5670010926790505, + "grad_norm": 0.1259765625, + "learning_rate": 0.00027464388596404093, + "loss": 0.4794, + "step": 132220 + }, + { + "epoch": 6.56749776497467, + "grad_norm": 0.125, + "learning_rate": 0.0002746041521803914, + "loss": 0.4824, + "step": 132230 + }, + { + "epoch": 6.567994437270289, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002745644183967418, + "loss": 0.5269, + "step": 132240 + }, + { + "epoch": 6.568491109565908, + "grad_norm": 0.13671875, + "learning_rate": 0.0002745246846130923, + "loss": 0.4838, + "step": 132250 + }, + { + "epoch": 6.568987781861527, + "grad_norm": 0.142578125, + "learning_rate": 0.00027448495082944277, + "loss": 0.497, + "step": 132260 + }, + { + "epoch": 6.569484454157147, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002744452170457932, + "loss": 0.5044, + "step": 132270 + }, + { + "epoch": 6.569981126452767, + "grad_norm": 0.1630859375, + "learning_rate": 0.00027440548326214365, + "loss": 0.4951, + "step": 132280 + }, + { + "epoch": 6.570477798748386, + "grad_norm": 0.169921875, + "learning_rate": 0.00027436574947849407, + "loss": 0.5129, + "step": 132290 + }, + { + "epoch": 6.570974471044005, + "grad_norm": 0.138671875, + "learning_rate": 0.0002743260156948446, + "loss": 0.4742, + "step": 132300 + }, + { + "epoch": 6.5714711433396245, + "grad_norm": 0.150390625, + "learning_rate": 0.000274286281911195, + "loss": 0.4861, + "step": 132310 + }, + { + "epoch": 6.571967815635244, + "grad_norm": 0.1279296875, + "learning_rate": 0.00027424654812754543, + "loss": 0.4985, + "step": 132320 + }, + { + "epoch": 6.572464487930863, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002742068143438959, + "loss": 0.4778, + "step": 132330 + }, + { + "epoch": 6.572961160226482, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002741670805602464, + "loss": 0.5032, + "step": 132340 + }, + { + "epoch": 6.573457832522102, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002741273467765968, + "loss": 0.4898, + "step": 132350 + }, + { + "epoch": 6.5739545048177215, + "grad_norm": 0.140625, + "learning_rate": 0.00027408761299294726, + "loss": 0.5188, + "step": 132360 + }, + { + "epoch": 6.574451177113341, + "grad_norm": 0.125, + "learning_rate": 0.00027404787920929773, + "loss": 0.4875, + "step": 132370 + }, + { + "epoch": 6.57494784940896, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002740081454256482, + "loss": 0.5, + "step": 132380 + }, + { + "epoch": 6.575444521704579, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002739684116419986, + "loss": 0.4891, + "step": 132390 + }, + { + "epoch": 6.575941194000198, + "grad_norm": 0.1328125, + "learning_rate": 0.00027392867785834904, + "loss": 0.4886, + "step": 132400 + }, + { + "epoch": 6.576437866295818, + "grad_norm": 0.14453125, + "learning_rate": 0.00027388894407469956, + "loss": 0.4917, + "step": 132410 + }, + { + "epoch": 6.576934538591438, + "grad_norm": 0.125, + "learning_rate": 0.00027384921029105, + "loss": 0.4932, + "step": 132420 + }, + { + "epoch": 6.577431210887057, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002738094765074004, + "loss": 0.4747, + "step": 132430 + }, + { + "epoch": 6.577927883182676, + "grad_norm": 0.1259765625, + "learning_rate": 0.00027376974272375087, + "loss": 0.5104, + "step": 132440 + }, + { + "epoch": 6.578424555478295, + "grad_norm": 0.130859375, + "learning_rate": 0.00027373000894010134, + "loss": 0.4977, + "step": 132450 + }, + { + "epoch": 6.578921227773915, + "grad_norm": 0.13671875, + "learning_rate": 0.0002736902751564518, + "loss": 0.4797, + "step": 132460 + }, + { + "epoch": 6.579417900069534, + "grad_norm": 0.123046875, + "learning_rate": 0.00027365054137280223, + "loss": 0.4796, + "step": 132470 + }, + { + "epoch": 6.579914572365153, + "grad_norm": 0.1318359375, + "learning_rate": 0.00027361080758915265, + "loss": 0.4642, + "step": 132480 + }, + { + "epoch": 6.580411244660773, + "grad_norm": 0.1396484375, + "learning_rate": 0.00027357107380550317, + "loss": 0.4925, + "step": 132490 + }, + { + "epoch": 6.580907916956392, + "grad_norm": 0.12890625, + "learning_rate": 0.0002735313400218536, + "loss": 0.486, + "step": 132500 + }, + { + "epoch": 6.581404589252012, + "grad_norm": 0.1318359375, + "learning_rate": 0.00027349160623820406, + "loss": 0.4796, + "step": 132510 + }, + { + "epoch": 6.581901261547631, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002734518724545545, + "loss": 0.512, + "step": 132520 + }, + { + "epoch": 6.58239793384325, + "grad_norm": 0.12353515625, + "learning_rate": 0.00027341213867090495, + "loss": 0.4844, + "step": 132530 + }, + { + "epoch": 6.582894606138869, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002733724048872554, + "loss": 0.4681, + "step": 132540 + }, + { + "epoch": 6.583391278434489, + "grad_norm": 0.1279296875, + "learning_rate": 0.00027333267110360584, + "loss": 0.4842, + "step": 132550 + }, + { + "epoch": 6.583887950730109, + "grad_norm": 0.154296875, + "learning_rate": 0.0002732929373199563, + "loss": 0.4881, + "step": 132560 + }, + { + "epoch": 6.584384623025728, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002732532035363068, + "loss": 0.5233, + "step": 132570 + }, + { + "epoch": 6.584881295321347, + "grad_norm": 0.126953125, + "learning_rate": 0.0002732134697526572, + "loss": 0.4756, + "step": 132580 + }, + { + "epoch": 6.585377967616966, + "grad_norm": 0.12890625, + "learning_rate": 0.00027317373596900767, + "loss": 0.4881, + "step": 132590 + }, + { + "epoch": 6.585874639912586, + "grad_norm": 0.12109375, + "learning_rate": 0.00027313400218535814, + "loss": 0.4969, + "step": 132600 + }, + { + "epoch": 6.586371312208205, + "grad_norm": 0.1298828125, + "learning_rate": 0.00027309426840170856, + "loss": 0.504, + "step": 132610 + }, + { + "epoch": 6.586867984503824, + "grad_norm": 0.130859375, + "learning_rate": 0.00027305453461805903, + "loss": 0.4739, + "step": 132620 + }, + { + "epoch": 6.587364656799444, + "grad_norm": 0.1318359375, + "learning_rate": 0.00027301480083440945, + "loss": 0.5395, + "step": 132630 + }, + { + "epoch": 6.587861329095063, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002729750670507599, + "loss": 0.4975, + "step": 132640 + }, + { + "epoch": 6.588358001390683, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002729353332671104, + "loss": 0.4969, + "step": 132650 + }, + { + "epoch": 6.588854673686302, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002728955994834608, + "loss": 0.4937, + "step": 132660 + }, + { + "epoch": 6.589351345981921, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002728558656998113, + "loss": 0.4718, + "step": 132670 + }, + { + "epoch": 6.58984801827754, + "grad_norm": 0.142578125, + "learning_rate": 0.00027281613191616175, + "loss": 0.4972, + "step": 132680 + }, + { + "epoch": 6.5903446905731595, + "grad_norm": 0.1611328125, + "learning_rate": 0.00027277639813251217, + "loss": 0.4969, + "step": 132690 + }, + { + "epoch": 6.59084136286878, + "grad_norm": 0.138671875, + "learning_rate": 0.00027273666434886264, + "loss": 0.4912, + "step": 132700 + }, + { + "epoch": 6.591338035164399, + "grad_norm": 0.130859375, + "learning_rate": 0.0002726969305652131, + "loss": 0.5153, + "step": 132710 + }, + { + "epoch": 6.591834707460018, + "grad_norm": 0.130859375, + "learning_rate": 0.0002726571967815635, + "loss": 0.4904, + "step": 132720 + }, + { + "epoch": 6.592331379755637, + "grad_norm": 0.1455078125, + "learning_rate": 0.000272617462997914, + "loss": 0.49, + "step": 132730 + }, + { + "epoch": 6.5928280520512565, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002725777292142644, + "loss": 0.4925, + "step": 132740 + }, + { + "epoch": 6.593324724346876, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002725379954306149, + "loss": 0.5258, + "step": 132750 + }, + { + "epoch": 6.593821396642495, + "grad_norm": 0.1728515625, + "learning_rate": 0.00027249826164696536, + "loss": 0.4999, + "step": 132760 + }, + { + "epoch": 6.594318068938115, + "grad_norm": 0.1826171875, + "learning_rate": 0.0002724585278633158, + "loss": 0.4972, + "step": 132770 + }, + { + "epoch": 6.594814741233734, + "grad_norm": 0.1484375, + "learning_rate": 0.00027241879407966625, + "loss": 0.4713, + "step": 132780 + }, + { + "epoch": 6.595311413529354, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002723790602960167, + "loss": 0.49, + "step": 132790 + }, + { + "epoch": 6.595808085824973, + "grad_norm": 0.1494140625, + "learning_rate": 0.00027233932651236713, + "loss": 0.504, + "step": 132800 + }, + { + "epoch": 6.596304758120592, + "grad_norm": 0.1220703125, + "learning_rate": 0.0002722995927287176, + "loss": 0.4771, + "step": 132810 + }, + { + "epoch": 6.596801430416211, + "grad_norm": 0.130859375, + "learning_rate": 0.000272259858945068, + "loss": 0.4936, + "step": 132820 + }, + { + "epoch": 6.5972981027118305, + "grad_norm": 0.130859375, + "learning_rate": 0.00027222012516141855, + "loss": 0.4815, + "step": 132830 + }, + { + "epoch": 6.59779477500745, + "grad_norm": 0.146484375, + "learning_rate": 0.00027218039137776897, + "loss": 0.5094, + "step": 132840 + }, + { + "epoch": 6.59829144730307, + "grad_norm": 0.130859375, + "learning_rate": 0.0002721406575941194, + "loss": 0.5089, + "step": 132850 + }, + { + "epoch": 6.598788119598689, + "grad_norm": 0.13671875, + "learning_rate": 0.00027210092381046985, + "loss": 0.4632, + "step": 132860 + }, + { + "epoch": 6.599284791894308, + "grad_norm": 0.130859375, + "learning_rate": 0.0002720611900268203, + "loss": 0.483, + "step": 132870 + }, + { + "epoch": 6.5997814641899275, + "grad_norm": 0.1767578125, + "learning_rate": 0.00027202145624317074, + "loss": 0.4891, + "step": 132880 + }, + { + "epoch": 6.600278136485547, + "grad_norm": 0.138671875, + "learning_rate": 0.0002719817224595212, + "loss": 0.4684, + "step": 132890 + }, + { + "epoch": 6.600774808781166, + "grad_norm": 0.126953125, + "learning_rate": 0.0002719419886758717, + "loss": 0.4608, + "step": 132900 + }, + { + "epoch": 6.601271481076785, + "grad_norm": 0.2119140625, + "learning_rate": 0.00027190225489222216, + "loss": 0.485, + "step": 132910 + }, + { + "epoch": 6.601768153372404, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002718625211085726, + "loss": 0.4507, + "step": 132920 + }, + { + "epoch": 6.6022648256680245, + "grad_norm": 0.1376953125, + "learning_rate": 0.000271822787324923, + "loss": 0.4968, + "step": 132930 + }, + { + "epoch": 6.602761497963644, + "grad_norm": 0.1640625, + "learning_rate": 0.0002717830535412735, + "loss": 0.4773, + "step": 132940 + }, + { + "epoch": 6.603258170259263, + "grad_norm": 0.130859375, + "learning_rate": 0.00027174331975762393, + "loss": 0.4796, + "step": 132950 + }, + { + "epoch": 6.603754842554882, + "grad_norm": 0.1201171875, + "learning_rate": 0.0002717035859739744, + "loss": 0.488, + "step": 132960 + }, + { + "epoch": 6.604251514850501, + "grad_norm": 0.125, + "learning_rate": 0.0002716638521903248, + "loss": 0.4728, + "step": 132970 + }, + { + "epoch": 6.604748187146121, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002716241184066753, + "loss": 0.5108, + "step": 132980 + }, + { + "epoch": 6.60524485944174, + "grad_norm": 0.1376953125, + "learning_rate": 0.00027158438462302576, + "loss": 0.4891, + "step": 132990 + }, + { + "epoch": 6.60574153173736, + "grad_norm": 0.134765625, + "learning_rate": 0.0002715446508393762, + "loss": 0.512, + "step": 133000 + }, + { + "epoch": 6.606238204032979, + "grad_norm": 0.138671875, + "learning_rate": 0.00027150491705572665, + "loss": 0.4805, + "step": 133010 + }, + { + "epoch": 6.6067348763285985, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002714651832720771, + "loss": 0.4996, + "step": 133020 + }, + { + "epoch": 6.607231548624218, + "grad_norm": 0.1376953125, + "learning_rate": 0.00027142544948842754, + "loss": 0.5027, + "step": 133030 + }, + { + "epoch": 6.607728220919837, + "grad_norm": 0.1474609375, + "learning_rate": 0.000271385715704778, + "loss": 0.53, + "step": 133040 + }, + { + "epoch": 6.608224893215456, + "grad_norm": 0.1494140625, + "learning_rate": 0.00027134598192112843, + "loss": 0.507, + "step": 133050 + }, + { + "epoch": 6.608721565511075, + "grad_norm": 0.1328125, + "learning_rate": 0.0002713062481374789, + "loss": 0.4711, + "step": 133060 + }, + { + "epoch": 6.6092182378066955, + "grad_norm": 0.1337890625, + "learning_rate": 0.00027126651435382937, + "loss": 0.4811, + "step": 133070 + }, + { + "epoch": 6.609714910102315, + "grad_norm": 0.138671875, + "learning_rate": 0.0002712267805701798, + "loss": 0.4837, + "step": 133080 + }, + { + "epoch": 6.610211582397934, + "grad_norm": 0.1416015625, + "learning_rate": 0.00027118704678653026, + "loss": 0.5085, + "step": 133090 + }, + { + "epoch": 6.610708254693553, + "grad_norm": 0.1328125, + "learning_rate": 0.00027114731300288073, + "loss": 0.5287, + "step": 133100 + }, + { + "epoch": 6.611204926989172, + "grad_norm": 0.1435546875, + "learning_rate": 0.00027110757921923115, + "loss": 0.5062, + "step": 133110 + }, + { + "epoch": 6.611701599284792, + "grad_norm": 0.130859375, + "learning_rate": 0.0002710678454355816, + "loss": 0.4982, + "step": 133120 + }, + { + "epoch": 6.612198271580411, + "grad_norm": 0.13671875, + "learning_rate": 0.0002710281116519321, + "loss": 0.4991, + "step": 133130 + }, + { + "epoch": 6.612694943876031, + "grad_norm": 0.13671875, + "learning_rate": 0.0002709883778682825, + "loss": 0.5036, + "step": 133140 + }, + { + "epoch": 6.61319161617165, + "grad_norm": 0.12890625, + "learning_rate": 0.000270948644084633, + "loss": 0.4817, + "step": 133150 + }, + { + "epoch": 6.613688288467269, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002709089103009834, + "loss": 0.4849, + "step": 133160 + }, + { + "epoch": 6.614184960762889, + "grad_norm": 0.1396484375, + "learning_rate": 0.00027086917651733387, + "loss": 0.5049, + "step": 133170 + }, + { + "epoch": 6.614681633058508, + "grad_norm": 0.140625, + "learning_rate": 0.00027082944273368434, + "loss": 0.5086, + "step": 133180 + }, + { + "epoch": 6.615178305354127, + "grad_norm": 0.130859375, + "learning_rate": 0.00027078970895003476, + "loss": 0.4887, + "step": 133190 + }, + { + "epoch": 6.615674977649746, + "grad_norm": 0.12255859375, + "learning_rate": 0.00027074997516638523, + "loss": 0.4756, + "step": 133200 + }, + { + "epoch": 6.616171649945366, + "grad_norm": 0.1748046875, + "learning_rate": 0.0002707102413827357, + "loss": 0.4732, + "step": 133210 + }, + { + "epoch": 6.616668322240986, + "grad_norm": 0.146484375, + "learning_rate": 0.0002706705075990861, + "loss": 0.4917, + "step": 133220 + }, + { + "epoch": 6.617164994536605, + "grad_norm": 0.1240234375, + "learning_rate": 0.0002706307738154366, + "loss": 0.4943, + "step": 133230 + }, + { + "epoch": 6.617661666832224, + "grad_norm": 0.134765625, + "learning_rate": 0.00027059104003178706, + "loss": 0.4608, + "step": 133240 + }, + { + "epoch": 6.618158339127843, + "grad_norm": 0.171875, + "learning_rate": 0.0002705513062481375, + "loss": 0.4674, + "step": 133250 + }, + { + "epoch": 6.618655011423463, + "grad_norm": 0.1376953125, + "learning_rate": 0.00027051157246448795, + "loss": 0.4938, + "step": 133260 + }, + { + "epoch": 6.619151683719082, + "grad_norm": 0.1416015625, + "learning_rate": 0.00027047183868083837, + "loss": 0.4672, + "step": 133270 + }, + { + "epoch": 6.619648356014702, + "grad_norm": 0.138671875, + "learning_rate": 0.0002704321048971889, + "loss": 0.5048, + "step": 133280 + }, + { + "epoch": 6.620145028310321, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002703923711135393, + "loss": 0.4894, + "step": 133290 + }, + { + "epoch": 6.62064170060594, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002703526373298897, + "loss": 0.4835, + "step": 133300 + }, + { + "epoch": 6.62113837290156, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002703129035462402, + "loss": 0.5134, + "step": 133310 + }, + { + "epoch": 6.621635045197179, + "grad_norm": 0.1259765625, + "learning_rate": 0.00027027316976259067, + "loss": 0.5127, + "step": 133320 + }, + { + "epoch": 6.622131717492798, + "grad_norm": 0.14453125, + "learning_rate": 0.00027023343597894114, + "loss": 0.4695, + "step": 133330 + }, + { + "epoch": 6.622628389788417, + "grad_norm": 0.1396484375, + "learning_rate": 0.00027019370219529156, + "loss": 0.5054, + "step": 133340 + }, + { + "epoch": 6.623125062084037, + "grad_norm": 0.142578125, + "learning_rate": 0.000270153968411642, + "loss": 0.5493, + "step": 133350 + }, + { + "epoch": 6.623621734379657, + "grad_norm": 0.13671875, + "learning_rate": 0.0002701142346279925, + "loss": 0.4761, + "step": 133360 + }, + { + "epoch": 6.624118406675276, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002700745008443429, + "loss": 0.5189, + "step": 133370 + }, + { + "epoch": 6.624615078970895, + "grad_norm": 0.17578125, + "learning_rate": 0.00027003476706069333, + "loss": 0.4894, + "step": 133380 + }, + { + "epoch": 6.625111751266514, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002699950332770438, + "loss": 0.5142, + "step": 133390 + }, + { + "epoch": 6.6256084235621335, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002699552994933943, + "loss": 0.4884, + "step": 133400 + }, + { + "epoch": 6.626105095857753, + "grad_norm": 0.1796875, + "learning_rate": 0.00026991556570974475, + "loss": 0.4813, + "step": 133410 + }, + { + "epoch": 6.626601768153373, + "grad_norm": 0.18359375, + "learning_rate": 0.00026987583192609517, + "loss": 0.5112, + "step": 133420 + }, + { + "epoch": 6.627098440448992, + "grad_norm": 0.140625, + "learning_rate": 0.00026983609814244564, + "loss": 0.5361, + "step": 133430 + }, + { + "epoch": 6.627595112744611, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002697963643587961, + "loss": 0.4426, + "step": 133440 + }, + { + "epoch": 6.6280917850402306, + "grad_norm": 0.150390625, + "learning_rate": 0.0002697566305751465, + "loss": 0.4809, + "step": 133450 + }, + { + "epoch": 6.62858845733585, + "grad_norm": 0.13671875, + "learning_rate": 0.00026971689679149694, + "loss": 0.5134, + "step": 133460 + }, + { + "epoch": 6.629085129631469, + "grad_norm": 0.140625, + "learning_rate": 0.00026967716300784747, + "loss": 0.5242, + "step": 133470 + }, + { + "epoch": 6.629581801927088, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002696374292241979, + "loss": 0.4854, + "step": 133480 + }, + { + "epoch": 6.630078474222708, + "grad_norm": 0.130859375, + "learning_rate": 0.00026959769544054836, + "loss": 0.4724, + "step": 133490 + }, + { + "epoch": 6.630575146518328, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002695579616568988, + "loss": 0.5376, + "step": 133500 + }, + { + "epoch": 6.631071818813947, + "grad_norm": 0.134765625, + "learning_rate": 0.00026951822787324925, + "loss": 0.4984, + "step": 133510 + }, + { + "epoch": 6.631568491109566, + "grad_norm": 0.134765625, + "learning_rate": 0.0002694784940895997, + "loss": 0.5126, + "step": 133520 + }, + { + "epoch": 6.632065163405185, + "grad_norm": 0.140625, + "learning_rate": 0.00026943876030595013, + "loss": 0.511, + "step": 133530 + }, + { + "epoch": 6.6325618357008045, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002693990265223006, + "loss": 0.4826, + "step": 133540 + }, + { + "epoch": 6.633058507996424, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002693592927386511, + "loss": 0.4895, + "step": 133550 + }, + { + "epoch": 6.633555180292043, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002693195589550015, + "loss": 0.5184, + "step": 133560 + }, + { + "epoch": 6.634051852587662, + "grad_norm": 0.1259765625, + "learning_rate": 0.00026927982517135196, + "loss": 0.4698, + "step": 133570 + }, + { + "epoch": 6.634548524883282, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002692400913877024, + "loss": 0.5026, + "step": 133580 + }, + { + "epoch": 6.6350451971789015, + "grad_norm": 0.1474609375, + "learning_rate": 0.00026920035760405285, + "loss": 0.461, + "step": 133590 + }, + { + "epoch": 6.635541869474521, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002691606238204033, + "loss": 0.4852, + "step": 133600 + }, + { + "epoch": 6.63603854177014, + "grad_norm": 0.146484375, + "learning_rate": 0.00026912089003675374, + "loss": 0.5103, + "step": 133610 + }, + { + "epoch": 6.636535214065759, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002690811562531042, + "loss": 0.4986, + "step": 133620 + }, + { + "epoch": 6.637031886361378, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002690414224694547, + "loss": 0.483, + "step": 133630 + }, + { + "epoch": 6.637528558656998, + "grad_norm": 0.13671875, + "learning_rate": 0.0002690016886858051, + "loss": 0.5236, + "step": 133640 + }, + { + "epoch": 6.638025230952618, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002689619549021556, + "loss": 0.4868, + "step": 133650 + }, + { + "epoch": 6.638521903248237, + "grad_norm": 0.1318359375, + "learning_rate": 0.00026892222111850604, + "loss": 0.4662, + "step": 133660 + }, + { + "epoch": 6.639018575543856, + "grad_norm": 0.15234375, + "learning_rate": 0.00026888248733485646, + "loss": 0.4987, + "step": 133670 + }, + { + "epoch": 6.639515247839475, + "grad_norm": 0.125, + "learning_rate": 0.00026884275355120693, + "loss": 0.4798, + "step": 133680 + }, + { + "epoch": 6.640011920135095, + "grad_norm": 0.1484375, + "learning_rate": 0.00026880301976755735, + "loss": 0.5026, + "step": 133690 + }, + { + "epoch": 6.640508592430714, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002687632859839078, + "loss": 0.5088, + "step": 133700 + }, + { + "epoch": 6.641005264726333, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002687235522002583, + "loss": 0.4896, + "step": 133710 + }, + { + "epoch": 6.641501937021953, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002686838184166087, + "loss": 0.5207, + "step": 133720 + }, + { + "epoch": 6.6419986093175725, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002686440846329592, + "loss": 0.5255, + "step": 133730 + }, + { + "epoch": 6.642495281613192, + "grad_norm": 0.126953125, + "learning_rate": 0.00026860435084930965, + "loss": 0.4937, + "step": 133740 + }, + { + "epoch": 6.642991953908811, + "grad_norm": 0.15625, + "learning_rate": 0.00026856461706566007, + "loss": 0.4631, + "step": 133750 + }, + { + "epoch": 6.64348862620443, + "grad_norm": 0.1298828125, + "learning_rate": 0.00026852488328201054, + "loss": 0.5256, + "step": 133760 + }, + { + "epoch": 6.643985298500049, + "grad_norm": 0.1416015625, + "learning_rate": 0.000268485149498361, + "loss": 0.505, + "step": 133770 + }, + { + "epoch": 6.644481970795669, + "grad_norm": 0.1484375, + "learning_rate": 0.0002684454157147115, + "loss": 0.4995, + "step": 133780 + }, + { + "epoch": 6.644978643091289, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002684056819310619, + "loss": 0.5255, + "step": 133790 + }, + { + "epoch": 6.645475315386908, + "grad_norm": 0.1162109375, + "learning_rate": 0.0002683659481474123, + "loss": 0.4723, + "step": 133800 + }, + { + "epoch": 6.645971987682527, + "grad_norm": 0.1357421875, + "learning_rate": 0.00026832621436376284, + "loss": 0.5012, + "step": 133810 + }, + { + "epoch": 6.646468659978146, + "grad_norm": 0.169921875, + "learning_rate": 0.00026828648058011326, + "loss": 0.4947, + "step": 133820 + }, + { + "epoch": 6.646965332273766, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002682467467964637, + "loss": 0.5005, + "step": 133830 + }, + { + "epoch": 6.647462004569385, + "grad_norm": 0.1318359375, + "learning_rate": 0.00026820701301281415, + "loss": 0.5042, + "step": 133840 + }, + { + "epoch": 6.647958676865004, + "grad_norm": 0.12353515625, + "learning_rate": 0.0002681672792291646, + "loss": 0.4901, + "step": 133850 + }, + { + "epoch": 6.648455349160624, + "grad_norm": 0.138671875, + "learning_rate": 0.0002681275454455151, + "loss": 0.5007, + "step": 133860 + }, + { + "epoch": 6.648952021456243, + "grad_norm": 0.140625, + "learning_rate": 0.0002680878116618655, + "loss": 0.4806, + "step": 133870 + }, + { + "epoch": 6.649448693751863, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002680480778782159, + "loss": 0.4836, + "step": 133880 + }, + { + "epoch": 6.649945366047482, + "grad_norm": 0.134765625, + "learning_rate": 0.00026800834409456645, + "loss": 0.5106, + "step": 133890 + }, + { + "epoch": 6.650442038343101, + "grad_norm": 0.1201171875, + "learning_rate": 0.00026796861031091687, + "loss": 0.4862, + "step": 133900 + }, + { + "epoch": 6.65093871063872, + "grad_norm": 0.146484375, + "learning_rate": 0.0002679288765272673, + "loss": 0.4535, + "step": 133910 + }, + { + "epoch": 6.65143538293434, + "grad_norm": 0.15625, + "learning_rate": 0.00026788914274361776, + "loss": 0.5245, + "step": 133920 + }, + { + "epoch": 6.65193205522996, + "grad_norm": 0.142578125, + "learning_rate": 0.00026784940895996823, + "loss": 0.5196, + "step": 133930 + }, + { + "epoch": 6.652428727525579, + "grad_norm": 0.13671875, + "learning_rate": 0.0002678096751763187, + "loss": 0.5149, + "step": 133940 + }, + { + "epoch": 6.652925399821198, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002677699413926691, + "loss": 0.4959, + "step": 133950 + }, + { + "epoch": 6.653422072116817, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002677302076090196, + "loss": 0.4859, + "step": 133960 + }, + { + "epoch": 6.653918744412437, + "grad_norm": 0.1259765625, + "learning_rate": 0.00026769047382537006, + "loss": 0.518, + "step": 133970 + }, + { + "epoch": 6.654415416708056, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002676507400417205, + "loss": 0.477, + "step": 133980 + }, + { + "epoch": 6.654912089003675, + "grad_norm": 0.197265625, + "learning_rate": 0.0002676110062580709, + "loss": 0.483, + "step": 133990 + }, + { + "epoch": 6.655408761299295, + "grad_norm": 0.125, + "learning_rate": 0.0002675712724744214, + "loss": 0.4983, + "step": 134000 + }, + { + "epoch": 6.655905433594914, + "grad_norm": 0.126953125, + "learning_rate": 0.00026753153869077184, + "loss": 0.4666, + "step": 134010 + }, + { + "epoch": 6.656402105890534, + "grad_norm": 0.142578125, + "learning_rate": 0.0002674918049071223, + "loss": 0.5227, + "step": 134020 + }, + { + "epoch": 6.656898778186153, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002674520711234727, + "loss": 0.4894, + "step": 134030 + }, + { + "epoch": 6.657395450481772, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002674123373398232, + "loss": 0.4588, + "step": 134040 + }, + { + "epoch": 6.657892122777391, + "grad_norm": 0.140625, + "learning_rate": 0.00026737260355617367, + "loss": 0.4926, + "step": 134050 + }, + { + "epoch": 6.6583887950730105, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002673328697725241, + "loss": 0.4998, + "step": 134060 + }, + { + "epoch": 6.658885467368631, + "grad_norm": 0.12158203125, + "learning_rate": 0.00026729313598887456, + "loss": 0.5003, + "step": 134070 + }, + { + "epoch": 6.65938213966425, + "grad_norm": 0.1484375, + "learning_rate": 0.00026725340220522503, + "loss": 0.5129, + "step": 134080 + }, + { + "epoch": 6.659878811959869, + "grad_norm": 0.1328125, + "learning_rate": 0.00026721366842157545, + "loss": 0.4808, + "step": 134090 + }, + { + "epoch": 6.660375484255488, + "grad_norm": 0.140625, + "learning_rate": 0.0002671739346379259, + "loss": 0.4807, + "step": 134100 + }, + { + "epoch": 6.6608721565511075, + "grad_norm": 0.126953125, + "learning_rate": 0.0002671342008542764, + "loss": 0.5135, + "step": 134110 + }, + { + "epoch": 6.661368828846727, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002670944670706268, + "loss": 0.4837, + "step": 134120 + }, + { + "epoch": 6.661865501142346, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002670547332869773, + "loss": 0.478, + "step": 134130 + }, + { + "epoch": 6.662362173437966, + "grad_norm": 0.138671875, + "learning_rate": 0.0002670149995033277, + "loss": 0.4888, + "step": 134140 + }, + { + "epoch": 6.662858845733585, + "grad_norm": 0.171875, + "learning_rate": 0.00026697526571967816, + "loss": 0.4739, + "step": 134150 + }, + { + "epoch": 6.663355518029205, + "grad_norm": 0.1533203125, + "learning_rate": 0.00026693553193602864, + "loss": 0.5055, + "step": 134160 + }, + { + "epoch": 6.663852190324824, + "grad_norm": 0.1376953125, + "learning_rate": 0.00026689579815237905, + "loss": 0.4849, + "step": 134170 + }, + { + "epoch": 6.664348862620443, + "grad_norm": 0.1220703125, + "learning_rate": 0.0002668560643687295, + "loss": 0.4776, + "step": 134180 + }, + { + "epoch": 6.664845534916062, + "grad_norm": 0.154296875, + "learning_rate": 0.00026681633058508, + "loss": 0.5023, + "step": 134190 + }, + { + "epoch": 6.6653422072116815, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002667765968014304, + "loss": 0.4725, + "step": 134200 + }, + { + "epoch": 6.665838879507301, + "grad_norm": 0.130859375, + "learning_rate": 0.0002667368630177809, + "loss": 0.4813, + "step": 134210 + }, + { + "epoch": 6.666335551802921, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002666971292341313, + "loss": 0.4912, + "step": 134220 + }, + { + "epoch": 6.66683222409854, + "grad_norm": 0.2197265625, + "learning_rate": 0.00026665739545048183, + "loss": 0.5012, + "step": 134230 + }, + { + "epoch": 6.667328896394159, + "grad_norm": 0.1455078125, + "learning_rate": 0.00026661766166683224, + "loss": 0.4752, + "step": 134240 + }, + { + "epoch": 6.6678255686897785, + "grad_norm": 0.12890625, + "learning_rate": 0.00026657792788318266, + "loss": 0.4683, + "step": 134250 + }, + { + "epoch": 6.668322240985398, + "grad_norm": 0.13671875, + "learning_rate": 0.00026653819409953313, + "loss": 0.507, + "step": 134260 + }, + { + "epoch": 6.668818913281017, + "grad_norm": 0.158203125, + "learning_rate": 0.0002664984603158836, + "loss": 0.5005, + "step": 134270 + }, + { + "epoch": 6.669315585576636, + "grad_norm": 0.1484375, + "learning_rate": 0.000266458726532234, + "loss": 0.4964, + "step": 134280 + }, + { + "epoch": 6.669812257872255, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002664189927485845, + "loss": 0.5075, + "step": 134290 + }, + { + "epoch": 6.6703089301678755, + "grad_norm": 0.12890625, + "learning_rate": 0.00026637925896493496, + "loss": 0.4932, + "step": 134300 + }, + { + "epoch": 6.670805602463495, + "grad_norm": 0.1259765625, + "learning_rate": 0.00026633952518128544, + "loss": 0.5096, + "step": 134310 + }, + { + "epoch": 6.671302274759114, + "grad_norm": 0.125, + "learning_rate": 0.00026629979139763585, + "loss": 0.4881, + "step": 134320 + }, + { + "epoch": 6.671798947054733, + "grad_norm": 0.171875, + "learning_rate": 0.00026626005761398627, + "loss": 0.4874, + "step": 134330 + }, + { + "epoch": 6.672295619350352, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002662203238303368, + "loss": 0.486, + "step": 134340 + }, + { + "epoch": 6.672792291645972, + "grad_norm": 0.185546875, + "learning_rate": 0.0002661805900466872, + "loss": 0.5361, + "step": 134350 + }, + { + "epoch": 6.673288963941591, + "grad_norm": 0.158203125, + "learning_rate": 0.00026614085626303763, + "loss": 0.4694, + "step": 134360 + }, + { + "epoch": 6.673785636237211, + "grad_norm": 0.12890625, + "learning_rate": 0.0002661011224793881, + "loss": 0.4767, + "step": 134370 + }, + { + "epoch": 6.67428230853283, + "grad_norm": 0.138671875, + "learning_rate": 0.00026606138869573857, + "loss": 0.4853, + "step": 134380 + }, + { + "epoch": 6.6747789808284494, + "grad_norm": 0.142578125, + "learning_rate": 0.00026602165491208904, + "loss": 0.4636, + "step": 134390 + }, + { + "epoch": 6.675275653124069, + "grad_norm": 0.1201171875, + "learning_rate": 0.00026598192112843946, + "loss": 0.4859, + "step": 134400 + }, + { + "epoch": 6.675772325419688, + "grad_norm": 0.13671875, + "learning_rate": 0.00026594218734478993, + "loss": 0.5127, + "step": 134410 + }, + { + "epoch": 6.676268997715307, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002659024535611404, + "loss": 0.4887, + "step": 134420 + }, + { + "epoch": 6.676765670010926, + "grad_norm": 0.150390625, + "learning_rate": 0.0002658627197774908, + "loss": 0.5102, + "step": 134430 + }, + { + "epoch": 6.6772623423065465, + "grad_norm": 0.13671875, + "learning_rate": 0.00026582298599384124, + "loss": 0.4927, + "step": 134440 + }, + { + "epoch": 6.677759014602166, + "grad_norm": 0.12109375, + "learning_rate": 0.0002657832522101917, + "loss": 0.5023, + "step": 134450 + }, + { + "epoch": 6.678255686897785, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002657435184265422, + "loss": 0.4924, + "step": 134460 + }, + { + "epoch": 6.678752359193404, + "grad_norm": 0.1357421875, + "learning_rate": 0.00026570378464289265, + "loss": 0.4979, + "step": 134470 + }, + { + "epoch": 6.679249031489023, + "grad_norm": 0.13671875, + "learning_rate": 0.00026566405085924307, + "loss": 0.4738, + "step": 134480 + }, + { + "epoch": 6.679745703784643, + "grad_norm": 0.1337890625, + "learning_rate": 0.00026562431707559354, + "loss": 0.4945, + "step": 134490 + }, + { + "epoch": 6.680242376080262, + "grad_norm": 0.1357421875, + "learning_rate": 0.000265584583291944, + "loss": 0.4789, + "step": 134500 + }, + { + "epoch": 6.680739048375882, + "grad_norm": 0.140625, + "learning_rate": 0.00026554484950829443, + "loss": 0.4905, + "step": 134510 + }, + { + "epoch": 6.681235720671501, + "grad_norm": 0.1435546875, + "learning_rate": 0.00026550511572464485, + "loss": 0.4961, + "step": 134520 + }, + { + "epoch": 6.68173239296712, + "grad_norm": 0.1337890625, + "learning_rate": 0.00026546538194099537, + "loss": 0.5094, + "step": 134530 + }, + { + "epoch": 6.68222906526274, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002654256481573458, + "loss": 0.5259, + "step": 134540 + }, + { + "epoch": 6.682725737558359, + "grad_norm": 0.1357421875, + "learning_rate": 0.00026538591437369626, + "loss": 0.5188, + "step": 134550 + }, + { + "epoch": 6.683222409853978, + "grad_norm": 0.130859375, + "learning_rate": 0.0002653461805900467, + "loss": 0.4919, + "step": 134560 + }, + { + "epoch": 6.683719082149597, + "grad_norm": 0.134765625, + "learning_rate": 0.00026530644680639715, + "loss": 0.4937, + "step": 134570 + }, + { + "epoch": 6.684215754445217, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002652667130227476, + "loss": 0.4981, + "step": 134580 + }, + { + "epoch": 6.684712426740837, + "grad_norm": 0.1259765625, + "learning_rate": 0.00026522697923909804, + "loss": 0.478, + "step": 134590 + }, + { + "epoch": 6.685209099036456, + "grad_norm": 0.171875, + "learning_rate": 0.0002651872454554485, + "loss": 0.482, + "step": 134600 + }, + { + "epoch": 6.685705771332075, + "grad_norm": 0.1240234375, + "learning_rate": 0.000265147511671799, + "loss": 0.4786, + "step": 134610 + }, + { + "epoch": 6.686202443627694, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002651077778881494, + "loss": 0.5206, + "step": 134620 + }, + { + "epoch": 6.686699115923314, + "grad_norm": 0.134765625, + "learning_rate": 0.00026506804410449987, + "loss": 0.4924, + "step": 134630 + }, + { + "epoch": 6.687195788218933, + "grad_norm": 0.142578125, + "learning_rate": 0.00026502831032085034, + "loss": 0.487, + "step": 134640 + }, + { + "epoch": 6.687692460514553, + "grad_norm": 0.1435546875, + "learning_rate": 0.00026498857653720076, + "loss": 0.4893, + "step": 134650 + }, + { + "epoch": 6.688189132810172, + "grad_norm": 0.1826171875, + "learning_rate": 0.00026494884275355123, + "loss": 0.5016, + "step": 134660 + }, + { + "epoch": 6.688685805105791, + "grad_norm": 0.154296875, + "learning_rate": 0.00026490910896990165, + "loss": 0.48, + "step": 134670 + }, + { + "epoch": 6.689182477401411, + "grad_norm": 0.138671875, + "learning_rate": 0.00026486937518625217, + "loss": 0.4858, + "step": 134680 + }, + { + "epoch": 6.68967914969703, + "grad_norm": 0.12158203125, + "learning_rate": 0.0002648296414026026, + "loss": 0.502, + "step": 134690 + }, + { + "epoch": 6.690175821992649, + "grad_norm": 0.150390625, + "learning_rate": 0.000264789907618953, + "loss": 0.4786, + "step": 134700 + }, + { + "epoch": 6.690672494288268, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002647501738353035, + "loss": 0.5083, + "step": 134710 + }, + { + "epoch": 6.691169166583888, + "grad_norm": 0.1298828125, + "learning_rate": 0.00026471044005165395, + "loss": 0.5071, + "step": 134720 + }, + { + "epoch": 6.691665838879508, + "grad_norm": 0.158203125, + "learning_rate": 0.00026467070626800437, + "loss": 0.4903, + "step": 134730 + }, + { + "epoch": 6.692162511175127, + "grad_norm": 0.1630859375, + "learning_rate": 0.00026463097248435484, + "loss": 0.4844, + "step": 134740 + }, + { + "epoch": 6.692659183470746, + "grad_norm": 0.12109375, + "learning_rate": 0.00026459123870070525, + "loss": 0.5136, + "step": 134750 + }, + { + "epoch": 6.693155855766365, + "grad_norm": 0.142578125, + "learning_rate": 0.0002645515049170558, + "loss": 0.5069, + "step": 134760 + }, + { + "epoch": 6.6936525280619845, + "grad_norm": 0.130859375, + "learning_rate": 0.0002645117711334062, + "loss": 0.4855, + "step": 134770 + }, + { + "epoch": 6.694149200357604, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002644720373497566, + "loss": 0.4965, + "step": 134780 + }, + { + "epoch": 6.694645872653224, + "grad_norm": 0.142578125, + "learning_rate": 0.0002644323035661071, + "loss": 0.5055, + "step": 134790 + }, + { + "epoch": 6.695142544948843, + "grad_norm": 0.12255859375, + "learning_rate": 0.00026439256978245756, + "loss": 0.4824, + "step": 134800 + }, + { + "epoch": 6.695639217244462, + "grad_norm": 0.1416015625, + "learning_rate": 0.000264352835998808, + "loss": 0.4634, + "step": 134810 + }, + { + "epoch": 6.6961358895400815, + "grad_norm": 0.1337890625, + "learning_rate": 0.00026431310221515844, + "loss": 0.4952, + "step": 134820 + }, + { + "epoch": 6.696632561835701, + "grad_norm": 0.142578125, + "learning_rate": 0.0002642733684315089, + "loss": 0.4851, + "step": 134830 + }, + { + "epoch": 6.69712923413132, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002642336346478594, + "loss": 0.4957, + "step": 134840 + }, + { + "epoch": 6.697625906426939, + "grad_norm": 0.123046875, + "learning_rate": 0.0002641939008642098, + "loss": 0.5094, + "step": 134850 + }, + { + "epoch": 6.698122578722559, + "grad_norm": 0.1328125, + "learning_rate": 0.0002641541670805602, + "loss": 0.4804, + "step": 134860 + }, + { + "epoch": 6.698619251018179, + "grad_norm": 0.150390625, + "learning_rate": 0.00026411443329691075, + "loss": 0.4908, + "step": 134870 + }, + { + "epoch": 6.699115923313798, + "grad_norm": 0.1337890625, + "learning_rate": 0.00026407469951326116, + "loss": 0.4931, + "step": 134880 + }, + { + "epoch": 6.699612595609417, + "grad_norm": 0.126953125, + "learning_rate": 0.0002640349657296116, + "loss": 0.4713, + "step": 134890 + }, + { + "epoch": 6.700109267905036, + "grad_norm": 0.1689453125, + "learning_rate": 0.00026399523194596205, + "loss": 0.4666, + "step": 134900 + }, + { + "epoch": 6.7006059402006555, + "grad_norm": 0.1845703125, + "learning_rate": 0.0002639554981623125, + "loss": 0.4869, + "step": 134910 + }, + { + "epoch": 6.701102612496275, + "grad_norm": 0.142578125, + "learning_rate": 0.000263915764378663, + "loss": 0.5096, + "step": 134920 + }, + { + "epoch": 6.701599284791894, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002638760305950134, + "loss": 0.4614, + "step": 134930 + }, + { + "epoch": 6.702095957087514, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002638362968113639, + "loss": 0.4738, + "step": 134940 + }, + { + "epoch": 6.702592629383133, + "grad_norm": 0.1640625, + "learning_rate": 0.00026379656302771436, + "loss": 0.4953, + "step": 134950 + }, + { + "epoch": 6.7030893016787525, + "grad_norm": 0.1357421875, + "learning_rate": 0.00026375682924406477, + "loss": 0.5163, + "step": 134960 + }, + { + "epoch": 6.703585973974372, + "grad_norm": 0.15234375, + "learning_rate": 0.0002637170954604152, + "loss": 0.4771, + "step": 134970 + }, + { + "epoch": 6.704082646269991, + "grad_norm": 0.12890625, + "learning_rate": 0.0002636773616767657, + "loss": 0.5037, + "step": 134980 + }, + { + "epoch": 6.70457931856561, + "grad_norm": 0.1494140625, + "learning_rate": 0.00026363762789311613, + "loss": 0.4834, + "step": 134990 + }, + { + "epoch": 6.705075990861229, + "grad_norm": 0.1484375, + "learning_rate": 0.0002635978941094666, + "loss": 0.5574, + "step": 135000 + }, + { + "epoch": 6.705572663156849, + "grad_norm": 0.1591796875, + "learning_rate": 0.000263558160325817, + "loss": 0.5143, + "step": 135010 + }, + { + "epoch": 6.706069335452469, + "grad_norm": 0.1201171875, + "learning_rate": 0.0002635184265421675, + "loss": 0.5004, + "step": 135020 + }, + { + "epoch": 6.706566007748088, + "grad_norm": 0.1533203125, + "learning_rate": 0.00026347869275851796, + "loss": 0.5572, + "step": 135030 + }, + { + "epoch": 6.707062680043707, + "grad_norm": 0.13671875, + "learning_rate": 0.0002634389589748684, + "loss": 0.4847, + "step": 135040 + }, + { + "epoch": 6.707559352339326, + "grad_norm": 0.177734375, + "learning_rate": 0.00026339922519121885, + "loss": 0.4937, + "step": 135050 + }, + { + "epoch": 6.708056024634946, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002633594914075693, + "loss": 0.5069, + "step": 135060 + }, + { + "epoch": 6.708552696930565, + "grad_norm": 0.1767578125, + "learning_rate": 0.00026331975762391974, + "loss": 0.4741, + "step": 135070 + }, + { + "epoch": 6.709049369226184, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002632800238402702, + "loss": 0.4897, + "step": 135080 + }, + { + "epoch": 6.709546041521804, + "grad_norm": 0.158203125, + "learning_rate": 0.00026324029005662063, + "loss": 0.4932, + "step": 135090 + }, + { + "epoch": 6.7100427138174235, + "grad_norm": 0.1328125, + "learning_rate": 0.0002632005562729711, + "loss": 0.4921, + "step": 135100 + }, + { + "epoch": 6.710539386113043, + "grad_norm": 0.134765625, + "learning_rate": 0.00026316082248932157, + "loss": 0.4973, + "step": 135110 + }, + { + "epoch": 6.711036058408662, + "grad_norm": 0.14453125, + "learning_rate": 0.000263121088705672, + "loss": 0.5009, + "step": 135120 + }, + { + "epoch": 6.711532730704281, + "grad_norm": 0.1328125, + "learning_rate": 0.00026308135492202246, + "loss": 0.501, + "step": 135130 + }, + { + "epoch": 6.7120294029999, + "grad_norm": 0.1376953125, + "learning_rate": 0.00026304162113837293, + "loss": 0.498, + "step": 135140 + }, + { + "epoch": 6.71252607529552, + "grad_norm": 0.1494140625, + "learning_rate": 0.00026300188735472335, + "loss": 0.5147, + "step": 135150 + }, + { + "epoch": 6.71302274759114, + "grad_norm": 0.126953125, + "learning_rate": 0.0002629621535710738, + "loss": 0.4895, + "step": 135160 + }, + { + "epoch": 6.713519419886759, + "grad_norm": 0.14453125, + "learning_rate": 0.0002629224197874243, + "loss": 0.4905, + "step": 135170 + }, + { + "epoch": 6.714016092182378, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002628826860037747, + "loss": 0.462, + "step": 135180 + }, + { + "epoch": 6.714512764477997, + "grad_norm": 0.1181640625, + "learning_rate": 0.0002628429522201252, + "loss": 0.5165, + "step": 135190 + }, + { + "epoch": 6.715009436773617, + "grad_norm": 0.126953125, + "learning_rate": 0.0002628032184364756, + "loss": 0.5055, + "step": 135200 + }, + { + "epoch": 6.715506109069236, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002627634846528261, + "loss": 0.4938, + "step": 135210 + }, + { + "epoch": 6.716002781364855, + "grad_norm": 0.138671875, + "learning_rate": 0.00026272375086917654, + "loss": 0.4777, + "step": 135220 + }, + { + "epoch": 6.716499453660475, + "grad_norm": 0.1357421875, + "learning_rate": 0.00026268401708552696, + "loss": 0.4791, + "step": 135230 + }, + { + "epoch": 6.716996125956094, + "grad_norm": 0.146484375, + "learning_rate": 0.00026264428330187743, + "loss": 0.4691, + "step": 135240 + }, + { + "epoch": 6.717492798251714, + "grad_norm": 0.15234375, + "learning_rate": 0.0002626045495182279, + "loss": 0.4933, + "step": 135250 + }, + { + "epoch": 6.717989470547333, + "grad_norm": 0.126953125, + "learning_rate": 0.0002625648157345783, + "loss": 0.5045, + "step": 135260 + }, + { + "epoch": 6.718486142842952, + "grad_norm": 0.138671875, + "learning_rate": 0.0002625250819509288, + "loss": 0.4828, + "step": 135270 + }, + { + "epoch": 6.718982815138571, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002624853481672792, + "loss": 0.4916, + "step": 135280 + }, + { + "epoch": 6.7194794874341905, + "grad_norm": 0.1689453125, + "learning_rate": 0.00026244561438362973, + "loss": 0.5101, + "step": 135290 + }, + { + "epoch": 6.719976159729811, + "grad_norm": 0.1328125, + "learning_rate": 0.00026240588059998015, + "loss": 0.5071, + "step": 135300 + }, + { + "epoch": 6.72047283202543, + "grad_norm": 0.150390625, + "learning_rate": 0.00026236614681633057, + "loss": 0.5014, + "step": 135310 + }, + { + "epoch": 6.720969504321049, + "grad_norm": 0.1787109375, + "learning_rate": 0.00026232641303268104, + "loss": 0.4723, + "step": 135320 + }, + { + "epoch": 6.721466176616668, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002622866792490315, + "loss": 0.4911, + "step": 135330 + }, + { + "epoch": 6.721962848912288, + "grad_norm": 0.12109375, + "learning_rate": 0.0002622469454653819, + "loss": 0.5158, + "step": 135340 + }, + { + "epoch": 6.722459521207907, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002622072116817324, + "loss": 0.4811, + "step": 135350 + }, + { + "epoch": 6.722956193503526, + "grad_norm": 0.1337890625, + "learning_rate": 0.00026216747789808287, + "loss": 0.5049, + "step": 135360 + }, + { + "epoch": 6.723452865799146, + "grad_norm": 0.1416015625, + "learning_rate": 0.00026212774411443334, + "loss": 0.5028, + "step": 135370 + }, + { + "epoch": 6.723949538094765, + "grad_norm": 0.14453125, + "learning_rate": 0.00026208801033078376, + "loss": 0.5054, + "step": 135380 + }, + { + "epoch": 6.724446210390385, + "grad_norm": 0.13671875, + "learning_rate": 0.0002620482765471342, + "loss": 0.484, + "step": 135390 + }, + { + "epoch": 6.724942882686004, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002620085427634847, + "loss": 0.4932, + "step": 135400 + }, + { + "epoch": 6.725439554981623, + "grad_norm": 0.12890625, + "learning_rate": 0.0002619688089798351, + "loss": 0.525, + "step": 135410 + }, + { + "epoch": 6.725936227277242, + "grad_norm": 0.1279296875, + "learning_rate": 0.00026192907519618553, + "loss": 0.4812, + "step": 135420 + }, + { + "epoch": 6.7264328995728615, + "grad_norm": 0.1513671875, + "learning_rate": 0.000261889341412536, + "loss": 0.4896, + "step": 135430 + }, + { + "epoch": 6.726929571868482, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002618496076288865, + "loss": 0.488, + "step": 135440 + }, + { + "epoch": 6.727426244164101, + "grad_norm": 0.158203125, + "learning_rate": 0.00026180987384523695, + "loss": 0.5218, + "step": 135450 + }, + { + "epoch": 6.72792291645972, + "grad_norm": 0.1494140625, + "learning_rate": 0.00026177014006158736, + "loss": 0.4835, + "step": 135460 + }, + { + "epoch": 6.728419588755339, + "grad_norm": 0.1357421875, + "learning_rate": 0.00026173040627793784, + "loss": 0.5187, + "step": 135470 + }, + { + "epoch": 6.7289162610509585, + "grad_norm": 0.12890625, + "learning_rate": 0.0002616906724942883, + "loss": 0.4934, + "step": 135480 + }, + { + "epoch": 6.729412933346578, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002616509387106387, + "loss": 0.4806, + "step": 135490 + }, + { + "epoch": 6.729909605642197, + "grad_norm": 0.12890625, + "learning_rate": 0.0002616112049269892, + "loss": 0.496, + "step": 135500 + }, + { + "epoch": 6.730406277937817, + "grad_norm": 0.15234375, + "learning_rate": 0.00026157147114333967, + "loss": 0.5232, + "step": 135510 + }, + { + "epoch": 6.730902950233436, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002615317373596901, + "loss": 0.4889, + "step": 135520 + }, + { + "epoch": 6.7313996225290555, + "grad_norm": 0.15234375, + "learning_rate": 0.00026149200357604056, + "loss": 0.5107, + "step": 135530 + }, + { + "epoch": 6.731896294824675, + "grad_norm": 0.1767578125, + "learning_rate": 0.00026145226979239097, + "loss": 0.5284, + "step": 135540 + }, + { + "epoch": 6.732392967120294, + "grad_norm": 0.1455078125, + "learning_rate": 0.00026141253600874144, + "loss": 0.4982, + "step": 135550 + }, + { + "epoch": 6.732889639415913, + "grad_norm": 0.134765625, + "learning_rate": 0.0002613728022250919, + "loss": 0.5103, + "step": 135560 + }, + { + "epoch": 6.7333863117115325, + "grad_norm": 0.1279296875, + "learning_rate": 0.00026133306844144233, + "loss": 0.4855, + "step": 135570 + }, + { + "epoch": 6.733882984007153, + "grad_norm": 0.15625, + "learning_rate": 0.0002612933346577928, + "loss": 0.5072, + "step": 135580 + }, + { + "epoch": 6.734379656302772, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002612536008741433, + "loss": 0.4879, + "step": 135590 + }, + { + "epoch": 6.734876328598391, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002612138670904937, + "loss": 0.5167, + "step": 135600 + }, + { + "epoch": 6.73537300089401, + "grad_norm": 0.134765625, + "learning_rate": 0.00026117413330684416, + "loss": 0.4997, + "step": 135610 + }, + { + "epoch": 6.7358696731896295, + "grad_norm": 0.134765625, + "learning_rate": 0.0002611343995231946, + "loss": 0.5016, + "step": 135620 + }, + { + "epoch": 6.736366345485249, + "grad_norm": 0.1416015625, + "learning_rate": 0.00026109466573954505, + "loss": 0.5042, + "step": 135630 + }, + { + "epoch": 6.736863017780868, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002610549319558955, + "loss": 0.5122, + "step": 135640 + }, + { + "epoch": 6.737359690076487, + "grad_norm": 0.1318359375, + "learning_rate": 0.00026101519817224594, + "loss": 0.5026, + "step": 135650 + }, + { + "epoch": 6.737856362372106, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002609754643885964, + "loss": 0.4977, + "step": 135660 + }, + { + "epoch": 6.7383530346677265, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002609357306049469, + "loss": 0.452, + "step": 135670 + }, + { + "epoch": 6.738849706963346, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002608959968212973, + "loss": 0.4967, + "step": 135680 + }, + { + "epoch": 6.739346379258965, + "grad_norm": 0.1357421875, + "learning_rate": 0.00026085626303764777, + "loss": 0.5096, + "step": 135690 + }, + { + "epoch": 6.739843051554584, + "grad_norm": 0.1337890625, + "learning_rate": 0.00026081652925399824, + "loss": 0.5105, + "step": 135700 + }, + { + "epoch": 6.740339723850203, + "grad_norm": 0.1435546875, + "learning_rate": 0.00026077679547034866, + "loss": 0.4877, + "step": 135710 + }, + { + "epoch": 6.740836396145823, + "grad_norm": 0.150390625, + "learning_rate": 0.00026073706168669913, + "loss": 0.4725, + "step": 135720 + }, + { + "epoch": 6.741333068441442, + "grad_norm": 0.1416015625, + "learning_rate": 0.00026069732790304955, + "loss": 0.4899, + "step": 135730 + }, + { + "epoch": 6.741829740737062, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002606575941194001, + "loss": 0.5124, + "step": 135740 + }, + { + "epoch": 6.742326413032681, + "grad_norm": 0.1484375, + "learning_rate": 0.0002606178603357505, + "loss": 0.5151, + "step": 135750 + }, + { + "epoch": 6.7428230853283, + "grad_norm": 0.1171875, + "learning_rate": 0.0002605781265521009, + "loss": 0.4804, + "step": 135760 + }, + { + "epoch": 6.74331975762392, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002605383927684514, + "loss": 0.4874, + "step": 135770 + }, + { + "epoch": 6.743816429919539, + "grad_norm": 0.1533203125, + "learning_rate": 0.00026049865898480185, + "loss": 0.5055, + "step": 135780 + }, + { + "epoch": 6.744313102215158, + "grad_norm": 0.1611328125, + "learning_rate": 0.00026045892520115227, + "loss": 0.4592, + "step": 135790 + }, + { + "epoch": 6.744809774510777, + "grad_norm": 0.1435546875, + "learning_rate": 0.00026041919141750274, + "loss": 0.4691, + "step": 135800 + }, + { + "epoch": 6.7453064468063975, + "grad_norm": 0.142578125, + "learning_rate": 0.0002603794576338532, + "loss": 0.5086, + "step": 135810 + }, + { + "epoch": 6.745803119102017, + "grad_norm": 0.1220703125, + "learning_rate": 0.0002603397238502037, + "loss": 0.5025, + "step": 135820 + }, + { + "epoch": 6.746299791397636, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002602999900665541, + "loss": 0.5012, + "step": 135830 + }, + { + "epoch": 6.746796463693255, + "grad_norm": 0.158203125, + "learning_rate": 0.0002602602562829045, + "loss": 0.4888, + "step": 135840 + }, + { + "epoch": 6.747293135988874, + "grad_norm": 0.142578125, + "learning_rate": 0.000260220522499255, + "loss": 0.4955, + "step": 135850 + }, + { + "epoch": 6.747789808284494, + "grad_norm": 0.1328125, + "learning_rate": 0.00026018078871560546, + "loss": 0.5009, + "step": 135860 + }, + { + "epoch": 6.748286480580113, + "grad_norm": 0.1484375, + "learning_rate": 0.00026014105493195593, + "loss": 0.5116, + "step": 135870 + }, + { + "epoch": 6.748783152875733, + "grad_norm": 0.1484375, + "learning_rate": 0.00026010132114830635, + "loss": 0.5054, + "step": 135880 + }, + { + "epoch": 6.749279825171352, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002600615873646568, + "loss": 0.4901, + "step": 135890 + }, + { + "epoch": 6.749776497466971, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002600218535810073, + "loss": 0.5006, + "step": 135900 + }, + { + "epoch": 6.750273169762591, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002599821197973577, + "loss": 0.4814, + "step": 135910 + }, + { + "epoch": 6.75076984205821, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002599423860137081, + "loss": 0.4964, + "step": 135920 + }, + { + "epoch": 6.751266514353829, + "grad_norm": 0.1455078125, + "learning_rate": 0.00025990265223005865, + "loss": 0.5052, + "step": 135930 + }, + { + "epoch": 6.751763186649448, + "grad_norm": 0.126953125, + "learning_rate": 0.00025986291844640907, + "loss": 0.4758, + "step": 135940 + }, + { + "epoch": 6.752259858945068, + "grad_norm": 0.1611328125, + "learning_rate": 0.00025982318466275954, + "loss": 0.471, + "step": 135950 + }, + { + "epoch": 6.752756531240688, + "grad_norm": 0.138671875, + "learning_rate": 0.00025978345087910996, + "loss": 0.4947, + "step": 135960 + }, + { + "epoch": 6.753253203536307, + "grad_norm": 0.1533203125, + "learning_rate": 0.00025974371709546043, + "loss": 0.5145, + "step": 135970 + }, + { + "epoch": 6.753749875831926, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002597039833118109, + "loss": 0.5098, + "step": 135980 + }, + { + "epoch": 6.754246548127545, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002596642495281613, + "loss": 0.4997, + "step": 135990 + }, + { + "epoch": 6.7547432204231646, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002596245157445118, + "loss": 0.4737, + "step": 136000 + }, + { + "epoch": 6.755239892718784, + "grad_norm": 0.13671875, + "learning_rate": 0.00025958478196086226, + "loss": 0.5139, + "step": 136010 + }, + { + "epoch": 6.755736565014404, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002595450481772127, + "loss": 0.5038, + "step": 136020 + }, + { + "epoch": 6.756233237310023, + "grad_norm": 0.1572265625, + "learning_rate": 0.00025950531439356315, + "loss": 0.5041, + "step": 136030 + }, + { + "epoch": 6.756729909605642, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002594655806099136, + "loss": 0.5137, + "step": 136040 + }, + { + "epoch": 6.757226581901262, + "grad_norm": 0.1640625, + "learning_rate": 0.00025942584682626404, + "loss": 0.4855, + "step": 136050 + }, + { + "epoch": 6.757723254196881, + "grad_norm": 0.126953125, + "learning_rate": 0.0002593861130426145, + "loss": 0.4702, + "step": 136060 + }, + { + "epoch": 6.7582199264925, + "grad_norm": 0.16015625, + "learning_rate": 0.0002593463792589649, + "loss": 0.5077, + "step": 136070 + }, + { + "epoch": 6.758716598788119, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002593066454753154, + "loss": 0.4921, + "step": 136080 + }, + { + "epoch": 6.759213271083739, + "grad_norm": 0.1513671875, + "learning_rate": 0.00025926691169166587, + "loss": 0.494, + "step": 136090 + }, + { + "epoch": 6.759709943379359, + "grad_norm": 0.123046875, + "learning_rate": 0.0002592271779080163, + "loss": 0.4766, + "step": 136100 + }, + { + "epoch": 6.760206615674978, + "grad_norm": 0.150390625, + "learning_rate": 0.00025918744412436676, + "loss": 0.4798, + "step": 136110 + }, + { + "epoch": 6.760703287970597, + "grad_norm": 0.14453125, + "learning_rate": 0.00025914771034071723, + "loss": 0.4939, + "step": 136120 + }, + { + "epoch": 6.761199960266216, + "grad_norm": 0.1337890625, + "learning_rate": 0.00025910797655706764, + "loss": 0.5104, + "step": 136130 + }, + { + "epoch": 6.7616966325618355, + "grad_norm": 0.142578125, + "learning_rate": 0.0002590682427734181, + "loss": 0.5014, + "step": 136140 + }, + { + "epoch": 6.762193304857455, + "grad_norm": 0.146484375, + "learning_rate": 0.00025902850898976853, + "loss": 0.4916, + "step": 136150 + }, + { + "epoch": 6.762689977153075, + "grad_norm": 0.146484375, + "learning_rate": 0.000258988775206119, + "loss": 0.5142, + "step": 136160 + }, + { + "epoch": 6.763186649448694, + "grad_norm": 0.16796875, + "learning_rate": 0.0002589490414224695, + "loss": 0.4729, + "step": 136170 + }, + { + "epoch": 6.763683321744313, + "grad_norm": 0.126953125, + "learning_rate": 0.0002589093076388199, + "loss": 0.4935, + "step": 136180 + }, + { + "epoch": 6.7641799940399325, + "grad_norm": 0.1298828125, + "learning_rate": 0.00025886957385517036, + "loss": 0.4747, + "step": 136190 + }, + { + "epoch": 6.764676666335552, + "grad_norm": 0.1328125, + "learning_rate": 0.00025882984007152084, + "loss": 0.4835, + "step": 136200 + }, + { + "epoch": 6.765173338631171, + "grad_norm": 0.2373046875, + "learning_rate": 0.00025879010628787125, + "loss": 0.453, + "step": 136210 + }, + { + "epoch": 6.76567001092679, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002587503725042217, + "loss": 0.5033, + "step": 136220 + }, + { + "epoch": 6.76616668322241, + "grad_norm": 0.1328125, + "learning_rate": 0.0002587106387205722, + "loss": 0.5052, + "step": 136230 + }, + { + "epoch": 6.7666633555180296, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002586709049369226, + "loss": 0.4999, + "step": 136240 + }, + { + "epoch": 6.767160027813649, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002586311711532731, + "loss": 0.4879, + "step": 136250 + }, + { + "epoch": 6.767656700109268, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002585914373696235, + "loss": 0.5078, + "step": 136260 + }, + { + "epoch": 6.768153372404887, + "grad_norm": 0.1845703125, + "learning_rate": 0.000258551703585974, + "loss": 0.5187, + "step": 136270 + }, + { + "epoch": 6.7686500447005065, + "grad_norm": 0.1474609375, + "learning_rate": 0.00025851196980232444, + "loss": 0.4798, + "step": 136280 + }, + { + "epoch": 6.769146716996126, + "grad_norm": 0.1259765625, + "learning_rate": 0.00025847223601867486, + "loss": 0.4725, + "step": 136290 + }, + { + "epoch": 6.769643389291746, + "grad_norm": 0.158203125, + "learning_rate": 0.00025843250223502533, + "loss": 0.4921, + "step": 136300 + }, + { + "epoch": 6.770140061587365, + "grad_norm": 0.146484375, + "learning_rate": 0.0002583927684513758, + "loss": 0.5116, + "step": 136310 + }, + { + "epoch": 6.770636733882984, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002583530346677263, + "loss": 0.472, + "step": 136320 + }, + { + "epoch": 6.7711334061786035, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002583133008840767, + "loss": 0.4923, + "step": 136330 + }, + { + "epoch": 6.771630078474223, + "grad_norm": 0.1337890625, + "learning_rate": 0.00025827356710042716, + "loss": 0.4859, + "step": 136340 + }, + { + "epoch": 6.772126750769842, + "grad_norm": 0.146484375, + "learning_rate": 0.00025823383331677763, + "loss": 0.4872, + "step": 136350 + }, + { + "epoch": 6.772623423065461, + "grad_norm": 0.1455078125, + "learning_rate": 0.00025819409953312805, + "loss": 0.5, + "step": 136360 + }, + { + "epoch": 6.77312009536108, + "grad_norm": 0.1494140625, + "learning_rate": 0.00025815436574947847, + "loss": 0.4895, + "step": 136370 + }, + { + "epoch": 6.7736167676567, + "grad_norm": 0.134765625, + "learning_rate": 0.000258114631965829, + "loss": 0.494, + "step": 136380 + }, + { + "epoch": 6.77411343995232, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002580748981821794, + "loss": 0.47, + "step": 136390 + }, + { + "epoch": 6.774610112247939, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002580351643985299, + "loss": 0.4972, + "step": 136400 + }, + { + "epoch": 6.775106784543558, + "grad_norm": 0.171875, + "learning_rate": 0.0002579954306148803, + "loss": 0.495, + "step": 136410 + }, + { + "epoch": 6.775603456839177, + "grad_norm": 0.171875, + "learning_rate": 0.00025795569683123077, + "loss": 0.5207, + "step": 136420 + }, + { + "epoch": 6.776100129134797, + "grad_norm": 0.14453125, + "learning_rate": 0.00025791596304758124, + "loss": 0.5088, + "step": 136430 + }, + { + "epoch": 6.776596801430416, + "grad_norm": 0.1689453125, + "learning_rate": 0.00025787622926393166, + "loss": 0.5184, + "step": 136440 + }, + { + "epoch": 6.777093473726035, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002578364954802821, + "loss": 0.4857, + "step": 136450 + }, + { + "epoch": 6.777590146021655, + "grad_norm": 0.146484375, + "learning_rate": 0.0002577967616966326, + "loss": 0.4996, + "step": 136460 + }, + { + "epoch": 6.778086818317274, + "grad_norm": 0.1396484375, + "learning_rate": 0.000257757027912983, + "loss": 0.4963, + "step": 136470 + }, + { + "epoch": 6.778583490612894, + "grad_norm": 0.125, + "learning_rate": 0.0002577172941293335, + "loss": 0.4915, + "step": 136480 + }, + { + "epoch": 6.779080162908513, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002576775603456839, + "loss": 0.5114, + "step": 136490 + }, + { + "epoch": 6.779576835204132, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002576378265620344, + "loss": 0.4829, + "step": 136500 + }, + { + "epoch": 6.780073507499751, + "grad_norm": 0.16015625, + "learning_rate": 0.00025759809277838485, + "loss": 0.4805, + "step": 136510 + }, + { + "epoch": 6.780570179795371, + "grad_norm": 0.1328125, + "learning_rate": 0.00025755835899473527, + "loss": 0.4742, + "step": 136520 + }, + { + "epoch": 6.781066852090991, + "grad_norm": 0.1318359375, + "learning_rate": 0.00025751862521108574, + "loss": 0.4905, + "step": 136530 + }, + { + "epoch": 6.78156352438661, + "grad_norm": 0.1328125, + "learning_rate": 0.0002574788914274362, + "loss": 0.5023, + "step": 136540 + }, + { + "epoch": 6.782060196682229, + "grad_norm": 0.1494140625, + "learning_rate": 0.00025743915764378663, + "loss": 0.4985, + "step": 136550 + }, + { + "epoch": 6.782556868977848, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002573994238601371, + "loss": 0.5227, + "step": 136560 + }, + { + "epoch": 6.783053541273468, + "grad_norm": 0.1318359375, + "learning_rate": 0.00025735969007648757, + "loss": 0.494, + "step": 136570 + }, + { + "epoch": 6.783550213569087, + "grad_norm": 0.130859375, + "learning_rate": 0.000257319956292838, + "loss": 0.5016, + "step": 136580 + }, + { + "epoch": 6.784046885864706, + "grad_norm": 0.1650390625, + "learning_rate": 0.00025728022250918846, + "loss": 0.4944, + "step": 136590 + }, + { + "epoch": 6.784543558160326, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002572404887255389, + "loss": 0.4921, + "step": 136600 + }, + { + "epoch": 6.785040230455945, + "grad_norm": 0.140625, + "learning_rate": 0.00025720075494188935, + "loss": 0.4741, + "step": 136610 + }, + { + "epoch": 6.785536902751565, + "grad_norm": 0.134765625, + "learning_rate": 0.0002571610211582398, + "loss": 0.4933, + "step": 136620 + }, + { + "epoch": 6.786033575047184, + "grad_norm": 0.146484375, + "learning_rate": 0.00025712128737459024, + "loss": 0.4757, + "step": 136630 + }, + { + "epoch": 6.786530247342803, + "grad_norm": 0.130859375, + "learning_rate": 0.0002570815535909407, + "loss": 0.4882, + "step": 136640 + }, + { + "epoch": 6.787026919638422, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002570418198072912, + "loss": 0.4844, + "step": 136650 + }, + { + "epoch": 6.7875235919340415, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002570020860236416, + "loss": 0.5032, + "step": 136660 + }, + { + "epoch": 6.788020264229662, + "grad_norm": 0.1396484375, + "learning_rate": 0.00025696235223999207, + "loss": 0.4978, + "step": 136670 + }, + { + "epoch": 6.788516936525281, + "grad_norm": 0.142578125, + "learning_rate": 0.00025692261845634254, + "loss": 0.5073, + "step": 136680 + }, + { + "epoch": 6.7890136088209, + "grad_norm": 0.146484375, + "learning_rate": 0.000256882884672693, + "loss": 0.4952, + "step": 136690 + }, + { + "epoch": 6.789510281116519, + "grad_norm": 0.1484375, + "learning_rate": 0.00025684315088904343, + "loss": 0.5072, + "step": 136700 + }, + { + "epoch": 6.790006953412139, + "grad_norm": 0.13671875, + "learning_rate": 0.00025680341710539384, + "loss": 0.4873, + "step": 136710 + }, + { + "epoch": 6.790503625707758, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002567636833217443, + "loss": 0.5001, + "step": 136720 + }, + { + "epoch": 6.791000298003377, + "grad_norm": 0.125, + "learning_rate": 0.0002567239495380948, + "loss": 0.4856, + "step": 136730 + }, + { + "epoch": 6.791496970298997, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002566842157544452, + "loss": 0.4997, + "step": 136740 + }, + { + "epoch": 6.791993642594616, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002566444819707957, + "loss": 0.4973, + "step": 136750 + }, + { + "epoch": 6.792490314890236, + "grad_norm": 0.171875, + "learning_rate": 0.00025660474818714615, + "loss": 0.5042, + "step": 136760 + }, + { + "epoch": 6.792986987185855, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002565650144034966, + "loss": 0.5157, + "step": 136770 + }, + { + "epoch": 6.793483659481474, + "grad_norm": 0.1298828125, + "learning_rate": 0.00025652528061984704, + "loss": 0.5132, + "step": 136780 + }, + { + "epoch": 6.793980331777093, + "grad_norm": 0.1240234375, + "learning_rate": 0.00025648554683619745, + "loss": 0.4975, + "step": 136790 + }, + { + "epoch": 6.7944770040727125, + "grad_norm": 0.15625, + "learning_rate": 0.000256445813052548, + "loss": 0.5163, + "step": 136800 + }, + { + "epoch": 6.794973676368333, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002564060792688984, + "loss": 0.4743, + "step": 136810 + }, + { + "epoch": 6.795470348663952, + "grad_norm": 0.140625, + "learning_rate": 0.0002563663454852488, + "loss": 0.4745, + "step": 136820 + }, + { + "epoch": 6.795967020959571, + "grad_norm": 0.13671875, + "learning_rate": 0.0002563266117015993, + "loss": 0.5037, + "step": 136830 + }, + { + "epoch": 6.79646369325519, + "grad_norm": 0.1552734375, + "learning_rate": 0.00025628687791794976, + "loss": 0.453, + "step": 136840 + }, + { + "epoch": 6.7969603655508095, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002562471441343002, + "loss": 0.4808, + "step": 136850 + }, + { + "epoch": 6.797457037846429, + "grad_norm": 0.126953125, + "learning_rate": 0.00025620741035065064, + "loss": 0.4868, + "step": 136860 + }, + { + "epoch": 6.797953710142048, + "grad_norm": 0.15234375, + "learning_rate": 0.0002561676765670011, + "loss": 0.4915, + "step": 136870 + }, + { + "epoch": 6.798450382437668, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002561279427833516, + "loss": 0.4938, + "step": 136880 + }, + { + "epoch": 6.798947054733287, + "grad_norm": 0.1279296875, + "learning_rate": 0.000256088208999702, + "loss": 0.4555, + "step": 136890 + }, + { + "epoch": 6.7994437270289065, + "grad_norm": 0.15625, + "learning_rate": 0.0002560484752160524, + "loss": 0.5093, + "step": 136900 + }, + { + "epoch": 6.799940399324526, + "grad_norm": 0.134765625, + "learning_rate": 0.00025600874143240295, + "loss": 0.4832, + "step": 136910 + }, + { + "epoch": 6.800437071620145, + "grad_norm": 0.1455078125, + "learning_rate": 0.00025596900764875336, + "loss": 0.4586, + "step": 136920 + }, + { + "epoch": 6.800933743915764, + "grad_norm": 0.12890625, + "learning_rate": 0.00025592927386510384, + "loss": 0.4659, + "step": 136930 + }, + { + "epoch": 6.8014304162113834, + "grad_norm": 0.1298828125, + "learning_rate": 0.00025588954008145425, + "loss": 0.4829, + "step": 136940 + }, + { + "epoch": 6.801927088507004, + "grad_norm": 0.15625, + "learning_rate": 0.0002558498062978047, + "loss": 0.4846, + "step": 136950 + }, + { + "epoch": 6.802423760802623, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002558100725141552, + "loss": 0.5186, + "step": 136960 + }, + { + "epoch": 6.802920433098242, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002557703387305056, + "loss": 0.4911, + "step": 136970 + }, + { + "epoch": 6.803417105393861, + "grad_norm": 0.138671875, + "learning_rate": 0.00025573060494685603, + "loss": 0.4802, + "step": 136980 + }, + { + "epoch": 6.8039137776894805, + "grad_norm": 0.1376953125, + "learning_rate": 0.00025569087116320655, + "loss": 0.5097, + "step": 136990 + }, + { + "epoch": 6.8044104499851, + "grad_norm": 0.1357421875, + "learning_rate": 0.00025565113737955697, + "loss": 0.4759, + "step": 137000 + }, + { + "epoch": 6.804907122280719, + "grad_norm": 0.140625, + "learning_rate": 0.00025561140359590744, + "loss": 0.4875, + "step": 137010 + }, + { + "epoch": 6.805403794576338, + "grad_norm": 0.1904296875, + "learning_rate": 0.00025557166981225786, + "loss": 0.5124, + "step": 137020 + }, + { + "epoch": 6.805900466871958, + "grad_norm": 0.1318359375, + "learning_rate": 0.00025553193602860833, + "loss": 0.4782, + "step": 137030 + }, + { + "epoch": 6.8063971391675775, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002554922022449588, + "loss": 0.476, + "step": 137040 + }, + { + "epoch": 6.806893811463197, + "grad_norm": 0.1484375, + "learning_rate": 0.0002554524684613092, + "loss": 0.4942, + "step": 137050 + }, + { + "epoch": 6.807390483758816, + "grad_norm": 0.130859375, + "learning_rate": 0.0002554127346776597, + "loss": 0.5152, + "step": 137060 + }, + { + "epoch": 6.807887156054435, + "grad_norm": 0.1748046875, + "learning_rate": 0.00025537300089401016, + "loss": 0.5123, + "step": 137070 + }, + { + "epoch": 6.808383828350054, + "grad_norm": 0.17578125, + "learning_rate": 0.0002553332671103606, + "loss": 0.4966, + "step": 137080 + }, + { + "epoch": 6.808880500645674, + "grad_norm": 0.1416015625, + "learning_rate": 0.00025529353332671105, + "loss": 0.4836, + "step": 137090 + }, + { + "epoch": 6.809377172941293, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002552537995430615, + "loss": 0.4629, + "step": 137100 + }, + { + "epoch": 6.809873845236913, + "grad_norm": 0.126953125, + "learning_rate": 0.00025521406575941194, + "loss": 0.4719, + "step": 137110 + }, + { + "epoch": 6.810370517532532, + "grad_norm": 0.134765625, + "learning_rate": 0.0002551743319757624, + "loss": 0.4963, + "step": 137120 + }, + { + "epoch": 6.810867189828151, + "grad_norm": 0.1513671875, + "learning_rate": 0.00025513459819211283, + "loss": 0.4692, + "step": 137130 + }, + { + "epoch": 6.811363862123771, + "grad_norm": 0.15625, + "learning_rate": 0.00025509486440846335, + "loss": 0.5022, + "step": 137140 + }, + { + "epoch": 6.81186053441939, + "grad_norm": 0.1357421875, + "learning_rate": 0.00025505513062481377, + "loss": 0.463, + "step": 137150 + }, + { + "epoch": 6.812357206715009, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002550153968411642, + "loss": 0.5002, + "step": 137160 + }, + { + "epoch": 6.812853879010628, + "grad_norm": 0.1640625, + "learning_rate": 0.00025497566305751466, + "loss": 0.4814, + "step": 137170 + }, + { + "epoch": 6.8133505513062484, + "grad_norm": 0.1416015625, + "learning_rate": 0.00025493592927386513, + "loss": 0.5067, + "step": 137180 + }, + { + "epoch": 6.813847223601868, + "grad_norm": 0.1640625, + "learning_rate": 0.00025489619549021555, + "loss": 0.4942, + "step": 137190 + }, + { + "epoch": 6.814343895897487, + "grad_norm": 0.154296875, + "learning_rate": 0.000254856461706566, + "loss": 0.5138, + "step": 137200 + }, + { + "epoch": 6.814840568193106, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002548167279229165, + "loss": 0.4945, + "step": 137210 + }, + { + "epoch": 6.815337240488725, + "grad_norm": 0.1416015625, + "learning_rate": 0.00025477699413926696, + "loss": 0.4939, + "step": 137220 + }, + { + "epoch": 6.815833912784345, + "grad_norm": 0.134765625, + "learning_rate": 0.0002547372603556174, + "loss": 0.5035, + "step": 137230 + }, + { + "epoch": 6.816330585079964, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002546975265719678, + "loss": 0.4963, + "step": 137240 + }, + { + "epoch": 6.816827257375584, + "grad_norm": 0.130859375, + "learning_rate": 0.00025465779278831827, + "loss": 0.4881, + "step": 137250 + }, + { + "epoch": 6.817323929671203, + "grad_norm": 0.12158203125, + "learning_rate": 0.00025461805900466874, + "loss": 0.4813, + "step": 137260 + }, + { + "epoch": 6.817820601966822, + "grad_norm": 0.142578125, + "learning_rate": 0.00025457832522101916, + "loss": 0.5031, + "step": 137270 + }, + { + "epoch": 6.818317274262442, + "grad_norm": 0.130859375, + "learning_rate": 0.00025453859143736963, + "loss": 0.4995, + "step": 137280 + }, + { + "epoch": 6.818813946558061, + "grad_norm": 0.12353515625, + "learning_rate": 0.0002544988576537201, + "loss": 0.4715, + "step": 137290 + }, + { + "epoch": 6.81931061885368, + "grad_norm": 0.1298828125, + "learning_rate": 0.00025445912387007057, + "loss": 0.4713, + "step": 137300 + }, + { + "epoch": 6.819807291149299, + "grad_norm": 0.1337890625, + "learning_rate": 0.000254419390086421, + "loss": 0.4607, + "step": 137310 + }, + { + "epoch": 6.820303963444919, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002543796563027714, + "loss": 0.4896, + "step": 137320 + }, + { + "epoch": 6.820800635740539, + "grad_norm": 0.126953125, + "learning_rate": 0.00025433992251912193, + "loss": 0.4689, + "step": 137330 + }, + { + "epoch": 6.821297308036158, + "grad_norm": 0.1376953125, + "learning_rate": 0.00025430018873547235, + "loss": 0.51, + "step": 137340 + }, + { + "epoch": 6.821793980331777, + "grad_norm": 0.177734375, + "learning_rate": 0.00025426045495182276, + "loss": 0.4849, + "step": 137350 + }, + { + "epoch": 6.822290652627396, + "grad_norm": 0.13671875, + "learning_rate": 0.00025422072116817324, + "loss": 0.5056, + "step": 137360 + }, + { + "epoch": 6.8227873249230155, + "grad_norm": 0.125, + "learning_rate": 0.0002541809873845237, + "loss": 0.4617, + "step": 137370 + }, + { + "epoch": 6.823283997218635, + "grad_norm": 0.138671875, + "learning_rate": 0.0002541412536008742, + "loss": 0.5141, + "step": 137380 + }, + { + "epoch": 6.823780669514255, + "grad_norm": 0.13671875, + "learning_rate": 0.0002541015198172246, + "loss": 0.487, + "step": 137390 + }, + { + "epoch": 6.824277341809874, + "grad_norm": 0.130859375, + "learning_rate": 0.00025406178603357507, + "loss": 0.5098, + "step": 137400 + }, + { + "epoch": 6.824774014105493, + "grad_norm": 0.1328125, + "learning_rate": 0.00025402205224992554, + "loss": 0.5122, + "step": 137410 + }, + { + "epoch": 6.825270686401113, + "grad_norm": 0.13671875, + "learning_rate": 0.00025398231846627596, + "loss": 0.5249, + "step": 137420 + }, + { + "epoch": 6.825767358696732, + "grad_norm": 0.1318359375, + "learning_rate": 0.00025394258468262637, + "loss": 0.473, + "step": 137430 + }, + { + "epoch": 6.826264030992351, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002539028508989769, + "loss": 0.515, + "step": 137440 + }, + { + "epoch": 6.82676070328797, + "grad_norm": 0.1484375, + "learning_rate": 0.0002538631171153273, + "loss": 0.4566, + "step": 137450 + }, + { + "epoch": 6.82725737558359, + "grad_norm": 0.134765625, + "learning_rate": 0.0002538233833316778, + "loss": 0.47, + "step": 137460 + }, + { + "epoch": 6.82775404787921, + "grad_norm": 0.146484375, + "learning_rate": 0.0002537836495480282, + "loss": 0.5116, + "step": 137470 + }, + { + "epoch": 6.828250720174829, + "grad_norm": 0.138671875, + "learning_rate": 0.0002537439157643787, + "loss": 0.4773, + "step": 137480 + }, + { + "epoch": 6.828747392470448, + "grad_norm": 0.1396484375, + "learning_rate": 0.00025370418198072915, + "loss": 0.5096, + "step": 137490 + }, + { + "epoch": 6.829244064766067, + "grad_norm": 0.1455078125, + "learning_rate": 0.00025366444819707956, + "loss": 0.4903, + "step": 137500 + }, + { + "epoch": 6.8297407370616865, + "grad_norm": 0.158203125, + "learning_rate": 0.00025362471441343004, + "loss": 0.5225, + "step": 137510 + }, + { + "epoch": 6.830237409357306, + "grad_norm": 0.138671875, + "learning_rate": 0.0002535849806297805, + "loss": 0.5026, + "step": 137520 + }, + { + "epoch": 6.830734081652926, + "grad_norm": 0.14453125, + "learning_rate": 0.0002535452468461309, + "loss": 0.5412, + "step": 137530 + }, + { + "epoch": 6.831230753948545, + "grad_norm": 0.138671875, + "learning_rate": 0.0002535055130624814, + "loss": 0.5015, + "step": 137540 + }, + { + "epoch": 6.831727426244164, + "grad_norm": 0.134765625, + "learning_rate": 0.0002534657792788318, + "loss": 0.5025, + "step": 137550 + }, + { + "epoch": 6.8322240985397835, + "grad_norm": 0.142578125, + "learning_rate": 0.0002534260454951823, + "loss": 0.4904, + "step": 137560 + }, + { + "epoch": 6.832720770835403, + "grad_norm": 0.142578125, + "learning_rate": 0.00025338631171153275, + "loss": 0.4763, + "step": 137570 + }, + { + "epoch": 6.833217443131022, + "grad_norm": 0.12890625, + "learning_rate": 0.00025334657792788317, + "loss": 0.4824, + "step": 137580 + }, + { + "epoch": 6.833714115426641, + "grad_norm": 0.1474609375, + "learning_rate": 0.00025330684414423364, + "loss": 0.5054, + "step": 137590 + }, + { + "epoch": 6.834210787722261, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002532671103605841, + "loss": 0.4908, + "step": 137600 + }, + { + "epoch": 6.8347074600178805, + "grad_norm": 0.1455078125, + "learning_rate": 0.00025322737657693453, + "loss": 0.4919, + "step": 137610 + }, + { + "epoch": 6.8352041323135, + "grad_norm": 0.1640625, + "learning_rate": 0.000253187642793285, + "loss": 0.5096, + "step": 137620 + }, + { + "epoch": 6.835700804609119, + "grad_norm": 0.154296875, + "learning_rate": 0.0002531479090096355, + "loss": 0.4745, + "step": 137630 + }, + { + "epoch": 6.836197476904738, + "grad_norm": 0.138671875, + "learning_rate": 0.0002531081752259859, + "loss": 0.4789, + "step": 137640 + }, + { + "epoch": 6.8366941492003575, + "grad_norm": 0.1826171875, + "learning_rate": 0.00025306844144233636, + "loss": 0.5087, + "step": 137650 + }, + { + "epoch": 6.837190821495977, + "grad_norm": 0.171875, + "learning_rate": 0.0002530287076586868, + "loss": 0.5004, + "step": 137660 + }, + { + "epoch": 6.837687493791597, + "grad_norm": 0.126953125, + "learning_rate": 0.0002529889738750373, + "loss": 0.4793, + "step": 137670 + }, + { + "epoch": 6.838184166087216, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002529492400913877, + "loss": 0.5153, + "step": 137680 + }, + { + "epoch": 6.838680838382835, + "grad_norm": 0.1181640625, + "learning_rate": 0.00025290950630773814, + "loss": 0.4701, + "step": 137690 + }, + { + "epoch": 6.8391775106784545, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002528697725240886, + "loss": 0.4855, + "step": 137700 + }, + { + "epoch": 6.839674182974074, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002528300387404391, + "loss": 0.5083, + "step": 137710 + }, + { + "epoch": 6.840170855269693, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002527903049567895, + "loss": 0.4645, + "step": 137720 + }, + { + "epoch": 6.840667527565312, + "grad_norm": 0.150390625, + "learning_rate": 0.00025275057117313997, + "loss": 0.497, + "step": 137730 + }, + { + "epoch": 6.841164199860931, + "grad_norm": 0.244140625, + "learning_rate": 0.00025271083738949044, + "loss": 0.4876, + "step": 137740 + }, + { + "epoch": 6.841660872156551, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002526711036058409, + "loss": 0.488, + "step": 137750 + }, + { + "epoch": 6.842157544452171, + "grad_norm": 0.1650390625, + "learning_rate": 0.00025263136982219133, + "loss": 0.5133, + "step": 137760 + }, + { + "epoch": 6.84265421674779, + "grad_norm": 0.1865234375, + "learning_rate": 0.00025259163603854175, + "loss": 0.4872, + "step": 137770 + }, + { + "epoch": 6.843150889043409, + "grad_norm": 0.12890625, + "learning_rate": 0.0002525519022548923, + "loss": 0.5093, + "step": 137780 + }, + { + "epoch": 6.843647561339028, + "grad_norm": 0.138671875, + "learning_rate": 0.0002525121684712427, + "loss": 0.5031, + "step": 137790 + }, + { + "epoch": 6.844144233634648, + "grad_norm": 0.12890625, + "learning_rate": 0.0002524724346875931, + "loss": 0.4943, + "step": 137800 + }, + { + "epoch": 6.844640905930267, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002524327009039436, + "loss": 0.4916, + "step": 137810 + }, + { + "epoch": 6.845137578225886, + "grad_norm": 0.14453125, + "learning_rate": 0.00025239296712029405, + "loss": 0.5025, + "step": 137820 + }, + { + "epoch": 6.845634250521506, + "grad_norm": 0.13671875, + "learning_rate": 0.0002523532333366445, + "loss": 0.5072, + "step": 137830 + }, + { + "epoch": 6.846130922817125, + "grad_norm": 0.1259765625, + "learning_rate": 0.00025231349955299494, + "loss": 0.5001, + "step": 137840 + }, + { + "epoch": 6.846627595112745, + "grad_norm": 0.146484375, + "learning_rate": 0.00025227376576934536, + "loss": 0.4927, + "step": 137850 + }, + { + "epoch": 6.847124267408364, + "grad_norm": 0.158203125, + "learning_rate": 0.0002522340319856959, + "loss": 0.5367, + "step": 137860 + }, + { + "epoch": 6.847620939703983, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002521942982020463, + "loss": 0.4874, + "step": 137870 + }, + { + "epoch": 6.848117611999602, + "grad_norm": 0.130859375, + "learning_rate": 0.0002521545644183967, + "loss": 0.4807, + "step": 137880 + }, + { + "epoch": 6.848614284295222, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002521148306347472, + "loss": 0.4903, + "step": 137890 + }, + { + "epoch": 6.849110956590842, + "grad_norm": 0.1396484375, + "learning_rate": 0.00025207509685109766, + "loss": 0.5168, + "step": 137900 + }, + { + "epoch": 6.849607628886461, + "grad_norm": 0.166015625, + "learning_rate": 0.00025203536306744813, + "loss": 0.481, + "step": 137910 + }, + { + "epoch": 6.85010430118208, + "grad_norm": 0.1259765625, + "learning_rate": 0.00025199562928379855, + "loss": 0.4804, + "step": 137920 + }, + { + "epoch": 6.850600973477699, + "grad_norm": 0.146484375, + "learning_rate": 0.000251955895500149, + "loss": 0.5242, + "step": 137930 + }, + { + "epoch": 6.851097645773319, + "grad_norm": 0.154296875, + "learning_rate": 0.0002519161617164995, + "loss": 0.5137, + "step": 137940 + }, + { + "epoch": 6.851594318068938, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002518764279328499, + "loss": 0.516, + "step": 137950 + }, + { + "epoch": 6.852090990364557, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002518366941492004, + "loss": 0.5078, + "step": 137960 + }, + { + "epoch": 6.852587662660177, + "grad_norm": 0.1318359375, + "learning_rate": 0.00025179696036555085, + "loss": 0.4554, + "step": 137970 + }, + { + "epoch": 6.853084334955796, + "grad_norm": 0.1435546875, + "learning_rate": 0.00025175722658190127, + "loss": 0.5063, + "step": 137980 + }, + { + "epoch": 6.853581007251416, + "grad_norm": 0.1435546875, + "learning_rate": 0.00025171749279825174, + "loss": 0.4849, + "step": 137990 + }, + { + "epoch": 6.854077679547035, + "grad_norm": 0.1396484375, + "learning_rate": 0.00025167775901460216, + "loss": 0.4813, + "step": 138000 + }, + { + "epoch": 6.854574351842654, + "grad_norm": 0.138671875, + "learning_rate": 0.00025163802523095263, + "loss": 0.4909, + "step": 138010 + }, + { + "epoch": 6.855071024138273, + "grad_norm": 0.13671875, + "learning_rate": 0.0002515982914473031, + "loss": 0.4865, + "step": 138020 + }, + { + "epoch": 6.8555676964338925, + "grad_norm": 0.201171875, + "learning_rate": 0.0002515585576636535, + "loss": 0.4834, + "step": 138030 + }, + { + "epoch": 6.856064368729513, + "grad_norm": 0.1435546875, + "learning_rate": 0.000251518823880004, + "loss": 0.4737, + "step": 138040 + }, + { + "epoch": 6.856561041025132, + "grad_norm": 0.1572265625, + "learning_rate": 0.00025147909009635446, + "loss": 0.4934, + "step": 138050 + }, + { + "epoch": 6.857057713320751, + "grad_norm": 0.14453125, + "learning_rate": 0.0002514393563127049, + "loss": 0.4939, + "step": 138060 + }, + { + "epoch": 6.85755438561637, + "grad_norm": 0.1328125, + "learning_rate": 0.00025139962252905535, + "loss": 0.5089, + "step": 138070 + }, + { + "epoch": 6.8580510579119895, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002513598887454058, + "loss": 0.525, + "step": 138080 + }, + { + "epoch": 6.858547730207609, + "grad_norm": 0.1328125, + "learning_rate": 0.00025132015496175624, + "loss": 0.5115, + "step": 138090 + }, + { + "epoch": 6.859044402503228, + "grad_norm": 0.130859375, + "learning_rate": 0.0002512804211781067, + "loss": 0.481, + "step": 138100 + }, + { + "epoch": 6.859541074798848, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002512406873944571, + "loss": 0.502, + "step": 138110 + }, + { + "epoch": 6.860037747094467, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002512009536108076, + "loss": 0.5023, + "step": 138120 + }, + { + "epoch": 6.860534419390087, + "grad_norm": 0.15625, + "learning_rate": 0.00025116121982715807, + "loss": 0.5104, + "step": 138130 + }, + { + "epoch": 6.861031091685706, + "grad_norm": 0.126953125, + "learning_rate": 0.0002511214860435085, + "loss": 0.4992, + "step": 138140 + }, + { + "epoch": 6.861527763981325, + "grad_norm": 0.1748046875, + "learning_rate": 0.00025108175225985896, + "loss": 0.4933, + "step": 138150 + }, + { + "epoch": 6.862024436276944, + "grad_norm": 0.162109375, + "learning_rate": 0.0002510420184762094, + "loss": 0.4964, + "step": 138160 + }, + { + "epoch": 6.8625211085725635, + "grad_norm": 0.1435546875, + "learning_rate": 0.00025100228469255984, + "loss": 0.4791, + "step": 138170 + }, + { + "epoch": 6.863017780868184, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002509625509089103, + "loss": 0.5395, + "step": 138180 + }, + { + "epoch": 6.863514453163803, + "grad_norm": 0.158203125, + "learning_rate": 0.00025092281712526073, + "loss": 0.5023, + "step": 138190 + }, + { + "epoch": 6.864011125459422, + "grad_norm": 0.138671875, + "learning_rate": 0.00025088308334161126, + "loss": 0.5093, + "step": 138200 + }, + { + "epoch": 6.864507797755041, + "grad_norm": 0.1875, + "learning_rate": 0.0002508433495579617, + "loss": 0.5053, + "step": 138210 + }, + { + "epoch": 6.8650044700506605, + "grad_norm": 0.13671875, + "learning_rate": 0.0002508036157743121, + "loss": 0.4806, + "step": 138220 + }, + { + "epoch": 6.86550114234628, + "grad_norm": 0.1240234375, + "learning_rate": 0.00025076388199066256, + "loss": 0.4829, + "step": 138230 + }, + { + "epoch": 6.865997814641899, + "grad_norm": 0.1494140625, + "learning_rate": 0.00025072414820701303, + "loss": 0.5352, + "step": 138240 + }, + { + "epoch": 6.866494486937519, + "grad_norm": 0.140625, + "learning_rate": 0.00025068441442336345, + "loss": 0.4913, + "step": 138250 + }, + { + "epoch": 6.866991159233138, + "grad_norm": 0.20703125, + "learning_rate": 0.0002506446806397139, + "loss": 0.4969, + "step": 138260 + }, + { + "epoch": 6.8674878315287575, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002506049468560644, + "loss": 0.4607, + "step": 138270 + }, + { + "epoch": 6.867984503824377, + "grad_norm": 0.1484375, + "learning_rate": 0.00025056521307241487, + "loss": 0.5346, + "step": 138280 + }, + { + "epoch": 6.868481176119996, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002505254792887653, + "loss": 0.4662, + "step": 138290 + }, + { + "epoch": 6.868977848415615, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002504857455051157, + "loss": 0.5014, + "step": 138300 + }, + { + "epoch": 6.869474520711234, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002504460117214662, + "loss": 0.4955, + "step": 138310 + }, + { + "epoch": 6.8699711930068545, + "grad_norm": 0.138671875, + "learning_rate": 0.00025040627793781664, + "loss": 0.5091, + "step": 138320 + }, + { + "epoch": 6.870467865302474, + "grad_norm": 0.12890625, + "learning_rate": 0.00025036654415416706, + "loss": 0.4925, + "step": 138330 + }, + { + "epoch": 6.870964537598093, + "grad_norm": 0.1708984375, + "learning_rate": 0.00025032681037051753, + "loss": 0.4968, + "step": 138340 + }, + { + "epoch": 6.871461209893712, + "grad_norm": 0.13671875, + "learning_rate": 0.000250287076586868, + "loss": 0.5242, + "step": 138350 + }, + { + "epoch": 6.8719578821893315, + "grad_norm": 0.1875, + "learning_rate": 0.0002502473428032185, + "loss": 0.5039, + "step": 138360 + }, + { + "epoch": 6.872454554484951, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002502076090195689, + "loss": 0.5184, + "step": 138370 + }, + { + "epoch": 6.87295122678057, + "grad_norm": 0.142578125, + "learning_rate": 0.00025016787523591936, + "loss": 0.512, + "step": 138380 + }, + { + "epoch": 6.87344789907619, + "grad_norm": 0.1240234375, + "learning_rate": 0.00025012814145226983, + "loss": 0.476, + "step": 138390 + }, + { + "epoch": 6.873944571371809, + "grad_norm": 0.177734375, + "learning_rate": 0.00025008840766862025, + "loss": 0.4987, + "step": 138400 + }, + { + "epoch": 6.8744412436674285, + "grad_norm": 0.13671875, + "learning_rate": 0.0002500486738849707, + "loss": 0.4966, + "step": 138410 + }, + { + "epoch": 6.874937915963048, + "grad_norm": 0.201171875, + "learning_rate": 0.00025000894010132114, + "loss": 0.4859, + "step": 138420 + }, + { + "epoch": 6.875434588258667, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002499692063176716, + "loss": 0.5372, + "step": 138430 + }, + { + "epoch": 6.875931260554286, + "grad_norm": 0.18359375, + "learning_rate": 0.0002499294725340221, + "loss": 0.4881, + "step": 138440 + }, + { + "epoch": 6.876427932849905, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002498897387503725, + "loss": 0.4923, + "step": 138450 + }, + { + "epoch": 6.876924605145525, + "grad_norm": 0.1455078125, + "learning_rate": 0.00024985000496672297, + "loss": 0.5507, + "step": 138460 + }, + { + "epoch": 6.877421277441144, + "grad_norm": 0.130859375, + "learning_rate": 0.00024981027118307344, + "loss": 0.4744, + "step": 138470 + }, + { + "epoch": 6.877917949736764, + "grad_norm": 0.1416015625, + "learning_rate": 0.00024977053739942386, + "loss": 0.5066, + "step": 138480 + }, + { + "epoch": 6.878414622032383, + "grad_norm": 0.1328125, + "learning_rate": 0.00024973080361577433, + "loss": 0.4819, + "step": 138490 + }, + { + "epoch": 6.878911294328002, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002496910698321248, + "loss": 0.4991, + "step": 138500 + }, + { + "epoch": 6.879407966623622, + "grad_norm": 0.130859375, + "learning_rate": 0.0002496513360484752, + "loss": 0.4835, + "step": 138510 + }, + { + "epoch": 6.879904638919241, + "grad_norm": 0.138671875, + "learning_rate": 0.0002496116022648257, + "loss": 0.4923, + "step": 138520 + }, + { + "epoch": 6.88040131121486, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002495718684811761, + "loss": 0.502, + "step": 138530 + }, + { + "epoch": 6.880897983510479, + "grad_norm": 0.130859375, + "learning_rate": 0.0002495321346975266, + "loss": 0.5097, + "step": 138540 + }, + { + "epoch": 6.881394655806099, + "grad_norm": 0.1357421875, + "learning_rate": 0.00024949240091387705, + "loss": 0.5251, + "step": 138550 + }, + { + "epoch": 6.881891328101719, + "grad_norm": 0.162109375, + "learning_rate": 0.00024945266713022747, + "loss": 0.497, + "step": 138560 + }, + { + "epoch": 6.882388000397338, + "grad_norm": 0.1455078125, + "learning_rate": 0.00024941293334657794, + "loss": 0.5, + "step": 138570 + }, + { + "epoch": 6.882884672692957, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002493731995629284, + "loss": 0.5033, + "step": 138580 + }, + { + "epoch": 6.883381344988576, + "grad_norm": 0.1533203125, + "learning_rate": 0.00024933346577927883, + "loss": 0.4907, + "step": 138590 + }, + { + "epoch": 6.883878017284196, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002492937319956293, + "loss": 0.5065, + "step": 138600 + }, + { + "epoch": 6.884374689579815, + "grad_norm": 0.140625, + "learning_rate": 0.00024925399821197977, + "loss": 0.4797, + "step": 138610 + }, + { + "epoch": 6.884871361875435, + "grad_norm": 0.134765625, + "learning_rate": 0.0002492142644283302, + "loss": 0.4651, + "step": 138620 + }, + { + "epoch": 6.885368034171054, + "grad_norm": 0.1494140625, + "learning_rate": 0.00024917453064468066, + "loss": 0.5183, + "step": 138630 + }, + { + "epoch": 6.885864706466673, + "grad_norm": 0.146484375, + "learning_rate": 0.0002491347968610311, + "loss": 0.511, + "step": 138640 + }, + { + "epoch": 6.886361378762293, + "grad_norm": 0.158203125, + "learning_rate": 0.00024909506307738155, + "loss": 0.4801, + "step": 138650 + }, + { + "epoch": 6.886858051057912, + "grad_norm": 0.1337890625, + "learning_rate": 0.000249055329293732, + "loss": 0.4738, + "step": 138660 + }, + { + "epoch": 6.887354723353531, + "grad_norm": 0.1259765625, + "learning_rate": 0.00024901559551008244, + "loss": 0.4818, + "step": 138670 + }, + { + "epoch": 6.88785139564915, + "grad_norm": 0.125, + "learning_rate": 0.0002489758617264329, + "loss": 0.4866, + "step": 138680 + }, + { + "epoch": 6.88834806794477, + "grad_norm": 0.11962890625, + "learning_rate": 0.0002489361279427834, + "loss": 0.4494, + "step": 138690 + }, + { + "epoch": 6.88884474024039, + "grad_norm": 0.126953125, + "learning_rate": 0.0002488963941591338, + "loss": 0.5095, + "step": 138700 + }, + { + "epoch": 6.889341412536009, + "grad_norm": 0.130859375, + "learning_rate": 0.00024885666037548427, + "loss": 0.506, + "step": 138710 + }, + { + "epoch": 6.889838084831628, + "grad_norm": 0.1796875, + "learning_rate": 0.0002488169265918347, + "loss": 0.4734, + "step": 138720 + }, + { + "epoch": 6.890334757127247, + "grad_norm": 0.140625, + "learning_rate": 0.0002487771928081852, + "loss": 0.4911, + "step": 138730 + }, + { + "epoch": 6.8908314294228665, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002487374590245356, + "loss": 0.5237, + "step": 138740 + }, + { + "epoch": 6.891328101718486, + "grad_norm": 0.12451171875, + "learning_rate": 0.00024869772524088604, + "loss": 0.5002, + "step": 138750 + }, + { + "epoch": 6.891824774014106, + "grad_norm": 0.146484375, + "learning_rate": 0.0002486579914572365, + "loss": 0.4975, + "step": 138760 + }, + { + "epoch": 6.892321446309725, + "grad_norm": 0.1328125, + "learning_rate": 0.000248618257673587, + "loss": 0.5047, + "step": 138770 + }, + { + "epoch": 6.892818118605344, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002485785238899374, + "loss": 0.5074, + "step": 138780 + }, + { + "epoch": 6.8933147909009636, + "grad_norm": 0.1328125, + "learning_rate": 0.0002485387901062879, + "loss": 0.478, + "step": 138790 + }, + { + "epoch": 6.893811463196583, + "grad_norm": 0.138671875, + "learning_rate": 0.00024849905632263835, + "loss": 0.4823, + "step": 138800 + }, + { + "epoch": 6.894308135492202, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002484593225389888, + "loss": 0.4991, + "step": 138810 + }, + { + "epoch": 6.894804807787821, + "grad_norm": 0.13671875, + "learning_rate": 0.00024841958875533923, + "loss": 0.5135, + "step": 138820 + }, + { + "epoch": 6.895301480083441, + "grad_norm": 0.154296875, + "learning_rate": 0.00024837985497168965, + "loss": 0.5066, + "step": 138830 + }, + { + "epoch": 6.895798152379061, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002483401211880402, + "loss": 0.4946, + "step": 138840 + }, + { + "epoch": 6.89629482467468, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002483003874043906, + "loss": 0.4861, + "step": 138850 + }, + { + "epoch": 6.896791496970299, + "grad_norm": 0.1484375, + "learning_rate": 0.00024826065362074107, + "loss": 0.5071, + "step": 138860 + }, + { + "epoch": 6.897288169265918, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002482209198370915, + "loss": 0.4995, + "step": 138870 + }, + { + "epoch": 6.8977848415615375, + "grad_norm": 0.1484375, + "learning_rate": 0.00024818118605344195, + "loss": 0.4846, + "step": 138880 + }, + { + "epoch": 6.898281513857157, + "grad_norm": 0.15625, + "learning_rate": 0.0002481414522697924, + "loss": 0.4874, + "step": 138890 + }, + { + "epoch": 6.898778186152777, + "grad_norm": 0.1455078125, + "learning_rate": 0.00024810171848614284, + "loss": 0.5222, + "step": 138900 + }, + { + "epoch": 6.899274858448396, + "grad_norm": 0.1328125, + "learning_rate": 0.0002480619847024933, + "loss": 0.4916, + "step": 138910 + }, + { + "epoch": 6.899771530744015, + "grad_norm": 0.193359375, + "learning_rate": 0.0002480222509188438, + "loss": 0.4765, + "step": 138920 + }, + { + "epoch": 6.9002682030396345, + "grad_norm": 0.150390625, + "learning_rate": 0.0002479825171351942, + "loss": 0.5114, + "step": 138930 + }, + { + "epoch": 6.900764875335254, + "grad_norm": 0.126953125, + "learning_rate": 0.0002479427833515447, + "loss": 0.4979, + "step": 138940 + }, + { + "epoch": 6.901261547630873, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002479030495678951, + "loss": 0.4637, + "step": 138950 + }, + { + "epoch": 6.901758219926492, + "grad_norm": 0.1826171875, + "learning_rate": 0.00024786331578424556, + "loss": 0.5241, + "step": 138960 + }, + { + "epoch": 6.902254892222112, + "grad_norm": 0.1640625, + "learning_rate": 0.00024782358200059603, + "loss": 0.4974, + "step": 138970 + }, + { + "epoch": 6.9027515645177315, + "grad_norm": 0.205078125, + "learning_rate": 0.00024778384821694645, + "loss": 0.5086, + "step": 138980 + }, + { + "epoch": 6.903248236813351, + "grad_norm": 0.1796875, + "learning_rate": 0.0002477441144332969, + "loss": 0.4754, + "step": 138990 + }, + { + "epoch": 6.90374490910897, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002477043806496474, + "loss": 0.5033, + "step": 139000 + }, + { + "epoch": 6.904241581404589, + "grad_norm": 0.14453125, + "learning_rate": 0.0002476646468659978, + "loss": 0.5201, + "step": 139010 + }, + { + "epoch": 6.904738253700208, + "grad_norm": 0.134765625, + "learning_rate": 0.0002476249130823483, + "loss": 0.5056, + "step": 139020 + }, + { + "epoch": 6.905234925995828, + "grad_norm": 0.1298828125, + "learning_rate": 0.00024758517929869875, + "loss": 0.4932, + "step": 139030 + }, + { + "epoch": 6.905731598291448, + "grad_norm": 0.1337890625, + "learning_rate": 0.00024754544551504917, + "loss": 0.4779, + "step": 139040 + }, + { + "epoch": 6.906228270587067, + "grad_norm": 0.14453125, + "learning_rate": 0.00024750571173139964, + "loss": 0.5336, + "step": 139050 + }, + { + "epoch": 6.906724942882686, + "grad_norm": 0.1650390625, + "learning_rate": 0.00024746597794775006, + "loss": 0.5058, + "step": 139060 + }, + { + "epoch": 6.9072216151783055, + "grad_norm": 0.146484375, + "learning_rate": 0.00024742624416410053, + "loss": 0.5215, + "step": 139070 + }, + { + "epoch": 6.907718287473925, + "grad_norm": 0.1298828125, + "learning_rate": 0.000247386510380451, + "loss": 0.4882, + "step": 139080 + }, + { + "epoch": 6.908214959769544, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002473467765968014, + "loss": 0.4795, + "step": 139090 + }, + { + "epoch": 6.908711632065163, + "grad_norm": 0.13671875, + "learning_rate": 0.0002473070428131519, + "loss": 0.486, + "step": 139100 + }, + { + "epoch": 6.909208304360782, + "grad_norm": 0.13671875, + "learning_rate": 0.00024726730902950236, + "loss": 0.4911, + "step": 139110 + }, + { + "epoch": 6.9097049766564025, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002472275752458528, + "loss": 0.4978, + "step": 139120 + }, + { + "epoch": 6.910201648952022, + "grad_norm": 0.126953125, + "learning_rate": 0.00024718784146220325, + "loss": 0.4819, + "step": 139130 + }, + { + "epoch": 6.910698321247641, + "grad_norm": 0.197265625, + "learning_rate": 0.0002471481076785537, + "loss": 0.4984, + "step": 139140 + }, + { + "epoch": 6.91119499354326, + "grad_norm": 0.16796875, + "learning_rate": 0.00024710837389490414, + "loss": 0.5102, + "step": 139150 + }, + { + "epoch": 6.911691665838879, + "grad_norm": 0.138671875, + "learning_rate": 0.0002470686401112546, + "loss": 0.4817, + "step": 139160 + }, + { + "epoch": 6.912188338134499, + "grad_norm": 0.140625, + "learning_rate": 0.00024702890632760503, + "loss": 0.5105, + "step": 139170 + }, + { + "epoch": 6.912685010430118, + "grad_norm": 0.1259765625, + "learning_rate": 0.00024698917254395555, + "loss": 0.4879, + "step": 139180 + }, + { + "epoch": 6.913181682725737, + "grad_norm": 0.15234375, + "learning_rate": 0.00024694943876030597, + "loss": 0.507, + "step": 139190 + }, + { + "epoch": 6.913678355021357, + "grad_norm": 0.126953125, + "learning_rate": 0.0002469097049766564, + "loss": 0.5073, + "step": 139200 + }, + { + "epoch": 6.914175027316976, + "grad_norm": 0.13671875, + "learning_rate": 0.00024686997119300686, + "loss": 0.4896, + "step": 139210 + }, + { + "epoch": 6.914671699612596, + "grad_norm": 0.138671875, + "learning_rate": 0.00024683023740935733, + "loss": 0.4983, + "step": 139220 + }, + { + "epoch": 6.915168371908215, + "grad_norm": 0.12890625, + "learning_rate": 0.0002467905036257078, + "loss": 0.5106, + "step": 139230 + }, + { + "epoch": 6.915665044203834, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002467507698420582, + "loss": 0.4939, + "step": 139240 + }, + { + "epoch": 6.916161716499453, + "grad_norm": 0.1474609375, + "learning_rate": 0.00024671103605840864, + "loss": 0.541, + "step": 139250 + }, + { + "epoch": 6.9166583887950726, + "grad_norm": 0.134765625, + "learning_rate": 0.00024667130227475916, + "loss": 0.5167, + "step": 139260 + }, + { + "epoch": 6.917155061090693, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002466315684911096, + "loss": 0.5126, + "step": 139270 + }, + { + "epoch": 6.917651733386312, + "grad_norm": 0.150390625, + "learning_rate": 0.00024659183470746, + "loss": 0.5017, + "step": 139280 + }, + { + "epoch": 6.918148405681931, + "grad_norm": 0.16015625, + "learning_rate": 0.00024655210092381047, + "loss": 0.5083, + "step": 139290 + }, + { + "epoch": 6.91864507797755, + "grad_norm": 0.1474609375, + "learning_rate": 0.00024651236714016094, + "loss": 0.5055, + "step": 139300 + }, + { + "epoch": 6.91914175027317, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002464726333565114, + "loss": 0.5173, + "step": 139310 + }, + { + "epoch": 6.919638422568789, + "grad_norm": 0.12890625, + "learning_rate": 0.0002464328995728618, + "loss": 0.4768, + "step": 139320 + }, + { + "epoch": 6.920135094864408, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002463931657892123, + "loss": 0.4742, + "step": 139330 + }, + { + "epoch": 6.920631767160028, + "grad_norm": 0.140625, + "learning_rate": 0.00024635343200556277, + "loss": 0.4894, + "step": 139340 + }, + { + "epoch": 6.921128439455647, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002463136982219132, + "loss": 0.4996, + "step": 139350 + }, + { + "epoch": 6.921625111751267, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002462739644382636, + "loss": 0.5211, + "step": 139360 + }, + { + "epoch": 6.922121784046886, + "grad_norm": 0.1259765625, + "learning_rate": 0.00024623423065461413, + "loss": 0.4981, + "step": 139370 + }, + { + "epoch": 6.922618456342505, + "grad_norm": 0.1240234375, + "learning_rate": 0.00024619449687096455, + "loss": 0.5044, + "step": 139380 + }, + { + "epoch": 6.923115128638124, + "grad_norm": 0.1591796875, + "learning_rate": 0.000246154763087315, + "loss": 0.502, + "step": 139390 + }, + { + "epoch": 6.9236118009337435, + "grad_norm": 0.1376953125, + "learning_rate": 0.00024611502930366544, + "loss": 0.4946, + "step": 139400 + }, + { + "epoch": 6.924108473229364, + "grad_norm": 0.162109375, + "learning_rate": 0.0002460752955200159, + "loss": 0.5054, + "step": 139410 + }, + { + "epoch": 6.924605145524983, + "grad_norm": 0.1796875, + "learning_rate": 0.0002460355617363664, + "loss": 0.4877, + "step": 139420 + }, + { + "epoch": 6.925101817820602, + "grad_norm": 0.140625, + "learning_rate": 0.0002459958279527168, + "loss": 0.4919, + "step": 139430 + }, + { + "epoch": 6.925598490116221, + "grad_norm": 0.130859375, + "learning_rate": 0.00024595609416906727, + "loss": 0.4642, + "step": 139440 + }, + { + "epoch": 6.9260951624118405, + "grad_norm": 0.1669921875, + "learning_rate": 0.00024591636038541774, + "loss": 0.5025, + "step": 139450 + }, + { + "epoch": 6.92659183470746, + "grad_norm": 0.130859375, + "learning_rate": 0.00024587662660176815, + "loss": 0.4847, + "step": 139460 + }, + { + "epoch": 6.927088507003079, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002458368928181186, + "loss": 0.5113, + "step": 139470 + }, + { + "epoch": 6.927585179298699, + "grad_norm": 0.1484375, + "learning_rate": 0.0002457971590344691, + "loss": 0.4855, + "step": 139480 + }, + { + "epoch": 6.928081851594318, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002457574252508195, + "loss": 0.4971, + "step": 139490 + }, + { + "epoch": 6.928578523889938, + "grad_norm": 0.1435546875, + "learning_rate": 0.00024571769146717, + "loss": 0.5003, + "step": 139500 + }, + { + "epoch": 6.929075196185557, + "grad_norm": 0.126953125, + "learning_rate": 0.0002456779576835204, + "loss": 0.4825, + "step": 139510 + }, + { + "epoch": 6.929571868481176, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002456382238998709, + "loss": 0.501, + "step": 139520 + }, + { + "epoch": 6.930068540776795, + "grad_norm": 0.1787109375, + "learning_rate": 0.00024559849011622135, + "loss": 0.4937, + "step": 139530 + }, + { + "epoch": 6.9305652130724145, + "grad_norm": 0.1416015625, + "learning_rate": 0.00024555875633257176, + "loss": 0.4989, + "step": 139540 + }, + { + "epoch": 6.931061885368035, + "grad_norm": 0.1875, + "learning_rate": 0.00024551902254892223, + "loss": 0.4682, + "step": 139550 + }, + { + "epoch": 6.931558557663654, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002454792887652727, + "loss": 0.5281, + "step": 139560 + }, + { + "epoch": 6.932055229959273, + "grad_norm": 0.169921875, + "learning_rate": 0.0002454395549816231, + "loss": 0.5126, + "step": 139570 + }, + { + "epoch": 6.932551902254892, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002453998211979736, + "loss": 0.4697, + "step": 139580 + }, + { + "epoch": 6.9330485745505115, + "grad_norm": 0.1318359375, + "learning_rate": 0.000245360087414324, + "loss": 0.4833, + "step": 139590 + }, + { + "epoch": 6.933545246846131, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002453203536306745, + "loss": 0.4721, + "step": 139600 + }, + { + "epoch": 6.93404191914175, + "grad_norm": 0.14453125, + "learning_rate": 0.00024528061984702495, + "loss": 0.492, + "step": 139610 + }, + { + "epoch": 6.93453859143737, + "grad_norm": 0.13671875, + "learning_rate": 0.00024524088606337537, + "loss": 0.5299, + "step": 139620 + }, + { + "epoch": 6.935035263732989, + "grad_norm": 0.1396484375, + "learning_rate": 0.00024520115227972584, + "loss": 0.4887, + "step": 139630 + }, + { + "epoch": 6.9355319360286085, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002451614184960763, + "loss": 0.4836, + "step": 139640 + }, + { + "epoch": 6.936028608324228, + "grad_norm": 0.146484375, + "learning_rate": 0.00024512168471242673, + "loss": 0.4728, + "step": 139650 + }, + { + "epoch": 6.936525280619847, + "grad_norm": 0.1640625, + "learning_rate": 0.0002450819509287772, + "loss": 0.4863, + "step": 139660 + }, + { + "epoch": 6.937021952915466, + "grad_norm": 0.142578125, + "learning_rate": 0.0002450422171451277, + "loss": 0.4935, + "step": 139670 + }, + { + "epoch": 6.937518625211085, + "grad_norm": 0.1240234375, + "learning_rate": 0.00024500248336147815, + "loss": 0.486, + "step": 139680 + }, + { + "epoch": 6.9380152975067055, + "grad_norm": 0.142578125, + "learning_rate": 0.00024496274957782856, + "loss": 0.4928, + "step": 139690 + }, + { + "epoch": 6.938511969802325, + "grad_norm": 0.1337890625, + "learning_rate": 0.000244923015794179, + "loss": 0.4964, + "step": 139700 + }, + { + "epoch": 6.939008642097944, + "grad_norm": 0.134765625, + "learning_rate": 0.0002448832820105295, + "loss": 0.4789, + "step": 139710 + }, + { + "epoch": 6.939505314393563, + "grad_norm": 0.13671875, + "learning_rate": 0.0002448435482268799, + "loss": 0.4749, + "step": 139720 + }, + { + "epoch": 6.9400019866891824, + "grad_norm": 0.140625, + "learning_rate": 0.00024480381444323034, + "loss": 0.4678, + "step": 139730 + }, + { + "epoch": 6.940498658984802, + "grad_norm": 0.130859375, + "learning_rate": 0.0002447640806595808, + "loss": 0.4679, + "step": 139740 + }, + { + "epoch": 6.940995331280421, + "grad_norm": 0.14453125, + "learning_rate": 0.0002447243468759313, + "loss": 0.4952, + "step": 139750 + }, + { + "epoch": 6.941492003576041, + "grad_norm": 0.146484375, + "learning_rate": 0.00024468461309228175, + "loss": 0.4943, + "step": 139760 + }, + { + "epoch": 6.94198867587166, + "grad_norm": 0.12451171875, + "learning_rate": 0.00024464487930863217, + "loss": 0.4898, + "step": 139770 + }, + { + "epoch": 6.9424853481672795, + "grad_norm": 0.1435546875, + "learning_rate": 0.00024460514552498264, + "loss": 0.5022, + "step": 139780 + }, + { + "epoch": 6.942982020462899, + "grad_norm": 0.125, + "learning_rate": 0.0002445654117413331, + "loss": 0.5112, + "step": 139790 + }, + { + "epoch": 6.943478692758518, + "grad_norm": 0.1455078125, + "learning_rate": 0.00024452567795768353, + "loss": 0.4766, + "step": 139800 + }, + { + "epoch": 6.943975365054137, + "grad_norm": 0.169921875, + "learning_rate": 0.00024448594417403395, + "loss": 0.4633, + "step": 139810 + }, + { + "epoch": 6.944472037349756, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002444462103903844, + "loss": 0.4725, + "step": 139820 + }, + { + "epoch": 6.944968709645376, + "grad_norm": 0.1328125, + "learning_rate": 0.0002444064766067349, + "loss": 0.4767, + "step": 139830 + }, + { + "epoch": 6.945465381940996, + "grad_norm": 0.130859375, + "learning_rate": 0.00024436674282308536, + "loss": 0.5269, + "step": 139840 + }, + { + "epoch": 6.945962054236615, + "grad_norm": 0.142578125, + "learning_rate": 0.0002443270090394358, + "loss": 0.4814, + "step": 139850 + }, + { + "epoch": 6.946458726532234, + "grad_norm": 0.1376953125, + "learning_rate": 0.00024428727525578625, + "loss": 0.4934, + "step": 139860 + }, + { + "epoch": 6.946955398827853, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002442475414721367, + "loss": 0.4957, + "step": 139870 + }, + { + "epoch": 6.947452071123473, + "grad_norm": 0.158203125, + "learning_rate": 0.00024420780768848714, + "loss": 0.5135, + "step": 139880 + }, + { + "epoch": 6.947948743419092, + "grad_norm": 0.138671875, + "learning_rate": 0.00024416807390483756, + "loss": 0.4758, + "step": 139890 + }, + { + "epoch": 6.948445415714711, + "grad_norm": 0.1484375, + "learning_rate": 0.00024412834012118805, + "loss": 0.5051, + "step": 139900 + }, + { + "epoch": 6.94894208801033, + "grad_norm": 0.1328125, + "learning_rate": 0.0002440886063375385, + "loss": 0.4878, + "step": 139910 + }, + { + "epoch": 6.94943876030595, + "grad_norm": 0.14453125, + "learning_rate": 0.00024404887255388897, + "loss": 0.4885, + "step": 139920 + }, + { + "epoch": 6.94993543260157, + "grad_norm": 0.1298828125, + "learning_rate": 0.00024400913877023941, + "loss": 0.4816, + "step": 139930 + }, + { + "epoch": 6.950432104897189, + "grad_norm": 0.1396484375, + "learning_rate": 0.00024396940498658983, + "loss": 0.505, + "step": 139940 + }, + { + "epoch": 6.950928777192808, + "grad_norm": 0.134765625, + "learning_rate": 0.00024392967120294033, + "loss": 0.5088, + "step": 139950 + }, + { + "epoch": 6.951425449488427, + "grad_norm": 0.1435546875, + "learning_rate": 0.00024388993741929075, + "loss": 0.4806, + "step": 139960 + }, + { + "epoch": 6.951922121784047, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002438502036356412, + "loss": 0.4938, + "step": 139970 + }, + { + "epoch": 6.952418794079666, + "grad_norm": 0.1318359375, + "learning_rate": 0.00024381046985199166, + "loss": 0.4716, + "step": 139980 + }, + { + "epoch": 6.952915466375286, + "grad_norm": 0.138671875, + "learning_rate": 0.0002437707360683421, + "loss": 0.4989, + "step": 139990 + }, + { + "epoch": 6.953412138670905, + "grad_norm": 0.13671875, + "learning_rate": 0.00024373100228469258, + "loss": 0.4904, + "step": 140000 + }, + { + "epoch": 6.953908810966524, + "grad_norm": 0.14453125, + "learning_rate": 0.00024369126850104302, + "loss": 0.5107, + "step": 140010 + }, + { + "epoch": 6.954405483262144, + "grad_norm": 0.130859375, + "learning_rate": 0.00024365153471739347, + "loss": 0.5227, + "step": 140020 + }, + { + "epoch": 6.954902155557763, + "grad_norm": 0.140625, + "learning_rate": 0.00024361180093374394, + "loss": 0.4979, + "step": 140030 + }, + { + "epoch": 6.955398827853382, + "grad_norm": 0.1318359375, + "learning_rate": 0.00024357206715009438, + "loss": 0.4795, + "step": 140040 + }, + { + "epoch": 6.955895500149001, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002435323333664448, + "loss": 0.485, + "step": 140050 + }, + { + "epoch": 6.956392172444621, + "grad_norm": 0.12451171875, + "learning_rate": 0.0002434925995827953, + "loss": 0.484, + "step": 140060 + }, + { + "epoch": 6.956888844740241, + "grad_norm": 0.1552734375, + "learning_rate": 0.00024345286579914571, + "loss": 0.5158, + "step": 140070 + }, + { + "epoch": 6.95738551703586, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002434131320154962, + "loss": 0.4753, + "step": 140080 + }, + { + "epoch": 6.957882189331479, + "grad_norm": 0.1513671875, + "learning_rate": 0.00024337339823184663, + "loss": 0.4996, + "step": 140090 + }, + { + "epoch": 6.958378861627098, + "grad_norm": 0.173828125, + "learning_rate": 0.00024333366444819707, + "loss": 0.4916, + "step": 140100 + }, + { + "epoch": 6.9588755339227175, + "grad_norm": 0.1376953125, + "learning_rate": 0.00024329393066454755, + "loss": 0.4938, + "step": 140110 + }, + { + "epoch": 6.959372206218337, + "grad_norm": 0.138671875, + "learning_rate": 0.000243254196880898, + "loss": 0.5164, + "step": 140120 + }, + { + "epoch": 6.959868878513957, + "grad_norm": 0.1298828125, + "learning_rate": 0.00024321446309724846, + "loss": 0.4918, + "step": 140130 + }, + { + "epoch": 6.960365550809576, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002431747293135989, + "loss": 0.5013, + "step": 140140 + }, + { + "epoch": 6.960862223105195, + "grad_norm": 0.1474609375, + "learning_rate": 0.00024313499552994935, + "loss": 0.4931, + "step": 140150 + }, + { + "epoch": 6.9613588954008145, + "grad_norm": 0.1298828125, + "learning_rate": 0.00024309526174629982, + "loss": 0.4834, + "step": 140160 + }, + { + "epoch": 6.961855567696434, + "grad_norm": 0.1396484375, + "learning_rate": 0.00024305552796265027, + "loss": 0.4915, + "step": 140170 + }, + { + "epoch": 6.962352239992053, + "grad_norm": 0.1748046875, + "learning_rate": 0.00024301579417900068, + "loss": 0.4947, + "step": 140180 + }, + { + "epoch": 6.962848912287672, + "grad_norm": 0.1591796875, + "learning_rate": 0.00024297606039535118, + "loss": 0.5211, + "step": 140190 + }, + { + "epoch": 6.963345584583292, + "grad_norm": 0.150390625, + "learning_rate": 0.0002429363266117016, + "loss": 0.4868, + "step": 140200 + }, + { + "epoch": 6.963842256878912, + "grad_norm": 0.1376953125, + "learning_rate": 0.00024289659282805207, + "loss": 0.4687, + "step": 140210 + }, + { + "epoch": 6.964338929174531, + "grad_norm": 0.130859375, + "learning_rate": 0.00024285685904440251, + "loss": 0.4978, + "step": 140220 + }, + { + "epoch": 6.96483560147015, + "grad_norm": 0.138671875, + "learning_rate": 0.00024281712526075296, + "loss": 0.494, + "step": 140230 + }, + { + "epoch": 6.965332273765769, + "grad_norm": 0.158203125, + "learning_rate": 0.00024277739147710343, + "loss": 0.4935, + "step": 140240 + }, + { + "epoch": 6.9658289460613885, + "grad_norm": 0.177734375, + "learning_rate": 0.00024273765769345387, + "loss": 0.4853, + "step": 140250 + }, + { + "epoch": 6.966325618357008, + "grad_norm": 0.13671875, + "learning_rate": 0.0002426979239098043, + "loss": 0.4798, + "step": 140260 + }, + { + "epoch": 6.966822290652628, + "grad_norm": 0.134765625, + "learning_rate": 0.0002426581901261548, + "loss": 0.4909, + "step": 140270 + }, + { + "epoch": 6.967318962948247, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002426184563425052, + "loss": 0.5131, + "step": 140280 + }, + { + "epoch": 6.967815635243866, + "grad_norm": 0.17578125, + "learning_rate": 0.0002425787225588557, + "loss": 0.4682, + "step": 140290 + }, + { + "epoch": 6.9683123075394855, + "grad_norm": 0.1259765625, + "learning_rate": 0.00024253898877520612, + "loss": 0.4771, + "step": 140300 + }, + { + "epoch": 6.968808979835105, + "grad_norm": 0.1328125, + "learning_rate": 0.00024249925499155657, + "loss": 0.4993, + "step": 140310 + }, + { + "epoch": 6.969305652130724, + "grad_norm": 0.1455078125, + "learning_rate": 0.00024245952120790704, + "loss": 0.4717, + "step": 140320 + }, + { + "epoch": 6.969802324426343, + "grad_norm": 0.134765625, + "learning_rate": 0.00024241978742425748, + "loss": 0.4884, + "step": 140330 + }, + { + "epoch": 6.970298996721963, + "grad_norm": 0.1357421875, + "learning_rate": 0.00024238005364060793, + "loss": 0.4904, + "step": 140340 + }, + { + "epoch": 6.9707956690175825, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002423403198569584, + "loss": 0.4857, + "step": 140350 + }, + { + "epoch": 6.971292341313202, + "grad_norm": 0.158203125, + "learning_rate": 0.00024230058607330884, + "loss": 0.4607, + "step": 140360 + }, + { + "epoch": 6.971789013608821, + "grad_norm": 0.162109375, + "learning_rate": 0.0002422608522896593, + "loss": 0.4999, + "step": 140370 + }, + { + "epoch": 6.97228568590444, + "grad_norm": 0.140625, + "learning_rate": 0.00024222111850600976, + "loss": 0.4969, + "step": 140380 + }, + { + "epoch": 6.972782358200059, + "grad_norm": 0.1416015625, + "learning_rate": 0.00024218138472236017, + "loss": 0.5017, + "step": 140390 + }, + { + "epoch": 6.973279030495679, + "grad_norm": 0.15625, + "learning_rate": 0.00024214165093871067, + "loss": 0.54, + "step": 140400 + }, + { + "epoch": 6.973775702791299, + "grad_norm": 0.14453125, + "learning_rate": 0.0002421019171550611, + "loss": 0.4736, + "step": 140410 + }, + { + "epoch": 6.974272375086918, + "grad_norm": 0.134765625, + "learning_rate": 0.00024206218337141153, + "loss": 0.4898, + "step": 140420 + }, + { + "epoch": 6.974769047382537, + "grad_norm": 0.1435546875, + "learning_rate": 0.000242022449587762, + "loss": 0.5352, + "step": 140430 + }, + { + "epoch": 6.9752657196781565, + "grad_norm": 0.146484375, + "learning_rate": 0.00024198271580411245, + "loss": 0.4869, + "step": 140440 + }, + { + "epoch": 6.975762391973776, + "grad_norm": 0.15234375, + "learning_rate": 0.00024194298202046292, + "loss": 0.4762, + "step": 140450 + }, + { + "epoch": 6.976259064269395, + "grad_norm": 0.138671875, + "learning_rate": 0.00024190324823681337, + "loss": 0.4969, + "step": 140460 + }, + { + "epoch": 6.976755736565014, + "grad_norm": 0.138671875, + "learning_rate": 0.0002418635144531638, + "loss": 0.5136, + "step": 140470 + }, + { + "epoch": 6.977252408860634, + "grad_norm": 0.13671875, + "learning_rate": 0.00024182378066951428, + "loss": 0.514, + "step": 140480 + }, + { + "epoch": 6.9777490811562535, + "grad_norm": 0.138671875, + "learning_rate": 0.00024178404688586473, + "loss": 0.4833, + "step": 140490 + }, + { + "epoch": 6.978245753451873, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002417443131022152, + "loss": 0.4871, + "step": 140500 + }, + { + "epoch": 6.978742425747492, + "grad_norm": 0.1513671875, + "learning_rate": 0.00024170457931856561, + "loss": 0.5071, + "step": 140510 + }, + { + "epoch": 6.979239098043111, + "grad_norm": 0.15234375, + "learning_rate": 0.00024166484553491606, + "loss": 0.5003, + "step": 140520 + }, + { + "epoch": 6.97973577033873, + "grad_norm": 0.1318359375, + "learning_rate": 0.00024162511175126653, + "loss": 0.4776, + "step": 140530 + }, + { + "epoch": 6.98023244263435, + "grad_norm": 0.1337890625, + "learning_rate": 0.00024158537796761697, + "loss": 0.5007, + "step": 140540 + }, + { + "epoch": 6.980729114929969, + "grad_norm": 0.13671875, + "learning_rate": 0.00024154564418396742, + "loss": 0.4972, + "step": 140550 + }, + { + "epoch": 6.981225787225588, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002415059104003179, + "loss": 0.4996, + "step": 140560 + }, + { + "epoch": 6.981722459521208, + "grad_norm": 0.2138671875, + "learning_rate": 0.00024146617661666833, + "loss": 0.4959, + "step": 140570 + }, + { + "epoch": 6.982219131816827, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002414264428330188, + "loss": 0.5053, + "step": 140580 + }, + { + "epoch": 6.982715804112447, + "grad_norm": 0.142578125, + "learning_rate": 0.00024138670904936925, + "loss": 0.5081, + "step": 140590 + }, + { + "epoch": 6.983212476408066, + "grad_norm": 0.1376953125, + "learning_rate": 0.00024134697526571967, + "loss": 0.4818, + "step": 140600 + }, + { + "epoch": 6.983709148703685, + "grad_norm": 0.1552734375, + "learning_rate": 0.00024130724148207017, + "loss": 0.5042, + "step": 140610 + }, + { + "epoch": 6.984205820999304, + "grad_norm": 0.1328125, + "learning_rate": 0.00024126750769842058, + "loss": 0.5098, + "step": 140620 + }, + { + "epoch": 6.9847024932949235, + "grad_norm": 0.1318359375, + "learning_rate": 0.00024122777391477103, + "loss": 0.5089, + "step": 140630 + }, + { + "epoch": 6.985199165590544, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002411880401311215, + "loss": 0.4998, + "step": 140640 + }, + { + "epoch": 6.985695837886163, + "grad_norm": 0.1669921875, + "learning_rate": 0.00024114830634747194, + "loss": 0.5101, + "step": 140650 + }, + { + "epoch": 6.986192510181782, + "grad_norm": 0.162109375, + "learning_rate": 0.00024110857256382241, + "loss": 0.4872, + "step": 140660 + }, + { + "epoch": 6.986689182477401, + "grad_norm": 0.158203125, + "learning_rate": 0.00024106883878017286, + "loss": 0.4679, + "step": 140670 + }, + { + "epoch": 6.987185854773021, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002410291049965233, + "loss": 0.4831, + "step": 140680 + }, + { + "epoch": 6.98768252706864, + "grad_norm": 0.1533203125, + "learning_rate": 0.00024098937121287377, + "loss": 0.474, + "step": 140690 + }, + { + "epoch": 6.988179199364259, + "grad_norm": 0.126953125, + "learning_rate": 0.00024094963742922422, + "loss": 0.492, + "step": 140700 + }, + { + "epoch": 6.988675871659879, + "grad_norm": 0.1416015625, + "learning_rate": 0.00024090990364557463, + "loss": 0.5069, + "step": 140710 + }, + { + "epoch": 6.989172543955498, + "grad_norm": 0.1376953125, + "learning_rate": 0.00024087016986192513, + "loss": 0.5026, + "step": 140720 + }, + { + "epoch": 6.989669216251118, + "grad_norm": 0.1298828125, + "learning_rate": 0.00024083043607827555, + "loss": 0.484, + "step": 140730 + }, + { + "epoch": 6.990165888546737, + "grad_norm": 0.1669921875, + "learning_rate": 0.00024079070229462605, + "loss": 0.498, + "step": 140740 + }, + { + "epoch": 6.990662560842356, + "grad_norm": 0.189453125, + "learning_rate": 0.00024075096851097647, + "loss": 0.4986, + "step": 140750 + }, + { + "epoch": 6.991159233137975, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002407112347273269, + "loss": 0.5142, + "step": 140760 + }, + { + "epoch": 6.9916559054335945, + "grad_norm": 0.154296875, + "learning_rate": 0.00024067150094367738, + "loss": 0.4693, + "step": 140770 + }, + { + "epoch": 6.992152577729215, + "grad_norm": 0.1396484375, + "learning_rate": 0.00024063176716002783, + "loss": 0.522, + "step": 140780 + }, + { + "epoch": 6.992649250024834, + "grad_norm": 0.1728515625, + "learning_rate": 0.00024059203337637824, + "loss": 0.4882, + "step": 140790 + }, + { + "epoch": 6.993145922320453, + "grad_norm": 0.1474609375, + "learning_rate": 0.00024055229959272874, + "loss": 0.502, + "step": 140800 + }, + { + "epoch": 6.993642594616072, + "grad_norm": 0.146484375, + "learning_rate": 0.00024051256580907916, + "loss": 0.4914, + "step": 140810 + }, + { + "epoch": 6.9941392669116915, + "grad_norm": 0.15625, + "learning_rate": 0.00024047283202542966, + "loss": 0.4719, + "step": 140820 + }, + { + "epoch": 6.994635939207311, + "grad_norm": 0.1396484375, + "learning_rate": 0.00024043309824178007, + "loss": 0.472, + "step": 140830 + }, + { + "epoch": 6.99513261150293, + "grad_norm": 0.146484375, + "learning_rate": 0.00024039336445813052, + "loss": 0.4798, + "step": 140840 + }, + { + "epoch": 6.99562928379855, + "grad_norm": 0.13671875, + "learning_rate": 0.000240353630674481, + "loss": 0.5351, + "step": 140850 + }, + { + "epoch": 6.996125956094169, + "grad_norm": 0.1630859375, + "learning_rate": 0.00024031389689083143, + "loss": 0.5033, + "step": 140860 + }, + { + "epoch": 6.9966226283897885, + "grad_norm": 0.1494140625, + "learning_rate": 0.00024027416310718188, + "loss": 0.4998, + "step": 140870 + }, + { + "epoch": 6.997119300685408, + "grad_norm": 0.1328125, + "learning_rate": 0.00024023442932353235, + "loss": 0.4827, + "step": 140880 + }, + { + "epoch": 6.997615972981027, + "grad_norm": 0.15625, + "learning_rate": 0.0002401946955398828, + "loss": 0.5031, + "step": 140890 + }, + { + "epoch": 6.998112645276646, + "grad_norm": 0.13671875, + "learning_rate": 0.00024015496175623327, + "loss": 0.5173, + "step": 140900 + }, + { + "epoch": 6.9986093175722655, + "grad_norm": 0.171875, + "learning_rate": 0.0002401152279725837, + "loss": 0.482, + "step": 140910 + }, + { + "epoch": 6.999105989867886, + "grad_norm": 0.138671875, + "learning_rate": 0.00024007549418893413, + "loss": 0.4702, + "step": 140920 + }, + { + "epoch": 6.999602662163505, + "grad_norm": 0.150390625, + "learning_rate": 0.00024003576040528463, + "loss": 0.4797, + "step": 140930 + }, + { + "epoch": 7.000099334459124, + "grad_norm": 0.1630859375, + "learning_rate": 0.00023999602662163504, + "loss": 0.5283, + "step": 140940 + }, + { + "epoch": 7.000596006754743, + "grad_norm": 0.15234375, + "learning_rate": 0.00023995629283798554, + "loss": 0.4986, + "step": 140950 + }, + { + "epoch": 7.0010926790503625, + "grad_norm": 0.1328125, + "learning_rate": 0.00023991655905433596, + "loss": 0.4611, + "step": 140960 + }, + { + "epoch": 7.001589351345982, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002398768252706864, + "loss": 0.4936, + "step": 140970 + }, + { + "epoch": 7.002086023641601, + "grad_norm": 0.12890625, + "learning_rate": 0.00023983709148703687, + "loss": 0.4864, + "step": 140980 + }, + { + "epoch": 7.002582695937221, + "grad_norm": 0.162109375, + "learning_rate": 0.00023979735770338732, + "loss": 0.4895, + "step": 140990 + }, + { + "epoch": 7.00307936823284, + "grad_norm": 0.1357421875, + "learning_rate": 0.00023975762391973776, + "loss": 0.4758, + "step": 141000 + }, + { + "epoch": 7.0035760405284595, + "grad_norm": 0.13671875, + "learning_rate": 0.00023971789013608823, + "loss": 0.4665, + "step": 141010 + }, + { + "epoch": 7.004072712824079, + "grad_norm": 0.13671875, + "learning_rate": 0.00023967815635243868, + "loss": 0.4966, + "step": 141020 + }, + { + "epoch": 7.004569385119698, + "grad_norm": 0.1357421875, + "learning_rate": 0.00023963842256878915, + "loss": 0.4663, + "step": 141030 + }, + { + "epoch": 7.005066057415317, + "grad_norm": 0.1728515625, + "learning_rate": 0.0002395986887851396, + "loss": 0.5, + "step": 141040 + }, + { + "epoch": 7.005562729710936, + "grad_norm": 0.1474609375, + "learning_rate": 0.00023955895500149, + "loss": 0.5082, + "step": 141050 + }, + { + "epoch": 7.006059402006556, + "grad_norm": 0.1220703125, + "learning_rate": 0.00023951922121784048, + "loss": 0.4666, + "step": 141060 + }, + { + "epoch": 7.006556074302176, + "grad_norm": 0.142578125, + "learning_rate": 0.00023947948743419093, + "loss": 0.4773, + "step": 141070 + }, + { + "epoch": 7.007052746597795, + "grad_norm": 0.162109375, + "learning_rate": 0.00023943975365054137, + "loss": 0.4744, + "step": 141080 + }, + { + "epoch": 7.007549418893414, + "grad_norm": 0.1357421875, + "learning_rate": 0.00023940001986689184, + "loss": 0.4914, + "step": 141090 + }, + { + "epoch": 7.008046091189033, + "grad_norm": 0.138671875, + "learning_rate": 0.00023936028608324229, + "loss": 0.4801, + "step": 141100 + }, + { + "epoch": 7.008542763484653, + "grad_norm": 0.154296875, + "learning_rate": 0.00023932055229959276, + "loss": 0.4721, + "step": 141110 + }, + { + "epoch": 7.009039435780272, + "grad_norm": 0.14453125, + "learning_rate": 0.0002392808185159432, + "loss": 0.4794, + "step": 141120 + }, + { + "epoch": 7.009536108075891, + "grad_norm": 0.130859375, + "learning_rate": 0.00023924108473229362, + "loss": 0.5034, + "step": 141130 + }, + { + "epoch": 7.010032780371511, + "grad_norm": 0.2138671875, + "learning_rate": 0.00023920135094864412, + "loss": 0.4798, + "step": 141140 + }, + { + "epoch": 7.0105294526671305, + "grad_norm": 0.1474609375, + "learning_rate": 0.00023916161716499453, + "loss": 0.4586, + "step": 141150 + }, + { + "epoch": 7.01102612496275, + "grad_norm": 0.138671875, + "learning_rate": 0.00023912188338134498, + "loss": 0.4931, + "step": 141160 + }, + { + "epoch": 7.011522797258369, + "grad_norm": 0.1337890625, + "learning_rate": 0.00023908214959769545, + "loss": 0.4674, + "step": 141170 + }, + { + "epoch": 7.012019469553988, + "grad_norm": 0.140625, + "learning_rate": 0.0002390424158140459, + "loss": 0.496, + "step": 141180 + }, + { + "epoch": 7.012516141849607, + "grad_norm": 0.166015625, + "learning_rate": 0.00023900268203039637, + "loss": 0.4728, + "step": 141190 + }, + { + "epoch": 7.013012814145227, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002389629482467468, + "loss": 0.4576, + "step": 141200 + }, + { + "epoch": 7.013509486440847, + "grad_norm": 0.130859375, + "learning_rate": 0.00023892321446309725, + "loss": 0.4785, + "step": 141210 + }, + { + "epoch": 7.014006158736466, + "grad_norm": 0.134765625, + "learning_rate": 0.00023888348067944773, + "loss": 0.4936, + "step": 141220 + }, + { + "epoch": 7.014502831032085, + "grad_norm": 0.140625, + "learning_rate": 0.00023884374689579817, + "loss": 0.4896, + "step": 141230 + }, + { + "epoch": 7.014999503327704, + "grad_norm": 0.12890625, + "learning_rate": 0.0002388040131121486, + "loss": 0.4937, + "step": 141240 + }, + { + "epoch": 7.015496175623324, + "grad_norm": 0.1484375, + "learning_rate": 0.00023876427932849909, + "loss": 0.4706, + "step": 141250 + }, + { + "epoch": 7.015992847918943, + "grad_norm": 0.15625, + "learning_rate": 0.0002387245455448495, + "loss": 0.5109, + "step": 141260 + }, + { + "epoch": 7.016489520214562, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002386848117612, + "loss": 0.4743, + "step": 141270 + }, + { + "epoch": 7.016986192510182, + "grad_norm": 0.140625, + "learning_rate": 0.00023864507797755042, + "loss": 0.4832, + "step": 141280 + }, + { + "epoch": 7.017482864805801, + "grad_norm": 0.142578125, + "learning_rate": 0.00023860534419390086, + "loss": 0.5202, + "step": 141290 + }, + { + "epoch": 7.017979537101421, + "grad_norm": 0.1630859375, + "learning_rate": 0.00023856561041025133, + "loss": 0.519, + "step": 141300 + }, + { + "epoch": 7.01847620939704, + "grad_norm": 0.1484375, + "learning_rate": 0.00023852587662660178, + "loss": 0.4728, + "step": 141310 + }, + { + "epoch": 7.018972881692659, + "grad_norm": 0.1806640625, + "learning_rate": 0.00023848614284295225, + "loss": 0.4885, + "step": 141320 + }, + { + "epoch": 7.019469553988278, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002384464090593027, + "loss": 0.4653, + "step": 141330 + }, + { + "epoch": 7.0199662262838975, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002384066752756531, + "loss": 0.4746, + "step": 141340 + }, + { + "epoch": 7.020462898579518, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002383669414920036, + "loss": 0.4718, + "step": 141350 + }, + { + "epoch": 7.020959570875137, + "grad_norm": 0.1474609375, + "learning_rate": 0.00023832720770835403, + "loss": 0.4733, + "step": 141360 + }, + { + "epoch": 7.021456243170756, + "grad_norm": 0.13671875, + "learning_rate": 0.00023828747392470447, + "loss": 0.4683, + "step": 141370 + }, + { + "epoch": 7.021952915466375, + "grad_norm": 0.1455078125, + "learning_rate": 0.00023824774014105494, + "loss": 0.4735, + "step": 141380 + }, + { + "epoch": 7.022449587761995, + "grad_norm": 0.15234375, + "learning_rate": 0.00023820800635740539, + "loss": 0.4853, + "step": 141390 + }, + { + "epoch": 7.022946260057614, + "grad_norm": 0.1298828125, + "learning_rate": 0.00023816827257375586, + "loss": 0.4634, + "step": 141400 + }, + { + "epoch": 7.023442932353233, + "grad_norm": 0.14453125, + "learning_rate": 0.0002381285387901063, + "loss": 0.4686, + "step": 141410 + }, + { + "epoch": 7.023939604648852, + "grad_norm": 0.201171875, + "learning_rate": 0.00023808880500645675, + "loss": 0.5208, + "step": 141420 + }, + { + "epoch": 7.024436276944472, + "grad_norm": 0.1337890625, + "learning_rate": 0.00023804907122280722, + "loss": 0.4788, + "step": 141430 + }, + { + "epoch": 7.024932949240092, + "grad_norm": 0.15625, + "learning_rate": 0.00023800933743915766, + "loss": 0.5135, + "step": 141440 + }, + { + "epoch": 7.025429621535711, + "grad_norm": 0.154296875, + "learning_rate": 0.00023796960365550808, + "loss": 0.4973, + "step": 141450 + }, + { + "epoch": 7.02592629383133, + "grad_norm": 0.130859375, + "learning_rate": 0.00023792986987185858, + "loss": 0.4715, + "step": 141460 + }, + { + "epoch": 7.026422966126949, + "grad_norm": 0.1533203125, + "learning_rate": 0.000237890136088209, + "loss": 0.5018, + "step": 141470 + }, + { + "epoch": 7.0269196384225685, + "grad_norm": 0.201171875, + "learning_rate": 0.0002378504023045595, + "loss": 0.5192, + "step": 141480 + }, + { + "epoch": 7.027416310718188, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002378106685209099, + "loss": 0.5151, + "step": 141490 + }, + { + "epoch": 7.027912983013808, + "grad_norm": 0.1376953125, + "learning_rate": 0.00023777093473726035, + "loss": 0.4586, + "step": 141500 + }, + { + "epoch": 7.028409655309427, + "grad_norm": 0.1533203125, + "learning_rate": 0.00023773120095361083, + "loss": 0.4731, + "step": 141510 + }, + { + "epoch": 7.028906327605046, + "grad_norm": 0.1552734375, + "learning_rate": 0.00023769146716996127, + "loss": 0.4952, + "step": 141520 + }, + { + "epoch": 7.0294029999006655, + "grad_norm": 0.1884765625, + "learning_rate": 0.00023765173338631171, + "loss": 0.4799, + "step": 141530 + }, + { + "epoch": 7.029899672196285, + "grad_norm": 0.1357421875, + "learning_rate": 0.00023761199960266219, + "loss": 0.5019, + "step": 141540 + }, + { + "epoch": 7.030396344491904, + "grad_norm": 0.1494140625, + "learning_rate": 0.00023757226581901263, + "loss": 0.4547, + "step": 141550 + }, + { + "epoch": 7.030893016787523, + "grad_norm": 0.181640625, + "learning_rate": 0.0002375325320353631, + "loss": 0.502, + "step": 141560 + }, + { + "epoch": 7.031389689083143, + "grad_norm": 0.13671875, + "learning_rate": 0.00023749279825171355, + "loss": 0.5338, + "step": 141570 + }, + { + "epoch": 7.0318863613787626, + "grad_norm": 0.1875, + "learning_rate": 0.00023745306446806396, + "loss": 0.4831, + "step": 141580 + }, + { + "epoch": 7.032383033674382, + "grad_norm": 0.1435546875, + "learning_rate": 0.00023741333068441446, + "loss": 0.4868, + "step": 141590 + }, + { + "epoch": 7.032879705970001, + "grad_norm": 0.12890625, + "learning_rate": 0.00023737359690076488, + "loss": 0.482, + "step": 141600 + }, + { + "epoch": 7.03337637826562, + "grad_norm": 0.1435546875, + "learning_rate": 0.00023733386311711532, + "loss": 0.4651, + "step": 141610 + }, + { + "epoch": 7.0338730505612395, + "grad_norm": 0.1240234375, + "learning_rate": 0.0002372941293334658, + "loss": 0.47, + "step": 141620 + }, + { + "epoch": 7.034369722856859, + "grad_norm": 0.1337890625, + "learning_rate": 0.00023725439554981624, + "loss": 0.4866, + "step": 141630 + }, + { + "epoch": 7.034866395152479, + "grad_norm": 0.138671875, + "learning_rate": 0.0002372146617661667, + "loss": 0.4972, + "step": 141640 + }, + { + "epoch": 7.035363067448098, + "grad_norm": 0.1484375, + "learning_rate": 0.00023717492798251715, + "loss": 0.4595, + "step": 141650 + }, + { + "epoch": 7.035859739743717, + "grad_norm": 0.134765625, + "learning_rate": 0.00023713519419886757, + "loss": 0.459, + "step": 141660 + }, + { + "epoch": 7.0363564120393365, + "grad_norm": 0.1435546875, + "learning_rate": 0.00023709546041521807, + "loss": 0.491, + "step": 141670 + }, + { + "epoch": 7.036853084334956, + "grad_norm": 0.140625, + "learning_rate": 0.00023705572663156849, + "loss": 0.4738, + "step": 141680 + }, + { + "epoch": 7.037349756630575, + "grad_norm": 0.16015625, + "learning_rate": 0.00023701599284791893, + "loss": 0.4718, + "step": 141690 + }, + { + "epoch": 7.037846428926194, + "grad_norm": 0.16015625, + "learning_rate": 0.0002369762590642694, + "loss": 0.4733, + "step": 141700 + }, + { + "epoch": 7.038343101221814, + "grad_norm": 0.1494140625, + "learning_rate": 0.00023693652528061985, + "loss": 0.4742, + "step": 141710 + }, + { + "epoch": 7.0388397735174335, + "grad_norm": 0.1376953125, + "learning_rate": 0.00023689679149697032, + "loss": 0.4897, + "step": 141720 + }, + { + "epoch": 7.039336445813053, + "grad_norm": 0.15625, + "learning_rate": 0.00023685705771332076, + "loss": 0.5069, + "step": 141730 + }, + { + "epoch": 7.039833118108672, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002368173239296712, + "loss": 0.4781, + "step": 141740 + }, + { + "epoch": 7.040329790404291, + "grad_norm": 0.1298828125, + "learning_rate": 0.00023677759014602168, + "loss": 0.4808, + "step": 141750 + }, + { + "epoch": 7.04082646269991, + "grad_norm": 0.1318359375, + "learning_rate": 0.00023673785636237212, + "loss": 0.5024, + "step": 141760 + }, + { + "epoch": 7.04132313499553, + "grad_norm": 0.138671875, + "learning_rate": 0.0002366981225787226, + "loss": 0.4859, + "step": 141770 + }, + { + "epoch": 7.041819807291149, + "grad_norm": 0.1337890625, + "learning_rate": 0.00023665838879507304, + "loss": 0.4987, + "step": 141780 + }, + { + "epoch": 7.042316479586769, + "grad_norm": 0.1220703125, + "learning_rate": 0.00023661865501142345, + "loss": 0.458, + "step": 141790 + }, + { + "epoch": 7.042813151882388, + "grad_norm": 0.2109375, + "learning_rate": 0.00023657892122777395, + "loss": 0.5042, + "step": 141800 + }, + { + "epoch": 7.043309824178007, + "grad_norm": 0.1416015625, + "learning_rate": 0.00023653918744412437, + "loss": 0.4712, + "step": 141810 + }, + { + "epoch": 7.043806496473627, + "grad_norm": 0.1484375, + "learning_rate": 0.00023649945366047481, + "loss": 0.4868, + "step": 141820 + }, + { + "epoch": 7.044303168769246, + "grad_norm": 0.154296875, + "learning_rate": 0.00023645971987682529, + "loss": 0.4499, + "step": 141830 + }, + { + "epoch": 7.044799841064865, + "grad_norm": 0.1826171875, + "learning_rate": 0.00023641998609317573, + "loss": 0.4485, + "step": 141840 + }, + { + "epoch": 7.045296513360484, + "grad_norm": 0.140625, + "learning_rate": 0.0002363802523095262, + "loss": 0.4767, + "step": 141850 + }, + { + "epoch": 7.0457931856561045, + "grad_norm": 0.12890625, + "learning_rate": 0.00023634051852587665, + "loss": 0.485, + "step": 141860 + }, + { + "epoch": 7.046289857951724, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002363007847422271, + "loss": 0.4986, + "step": 141870 + }, + { + "epoch": 7.046786530247343, + "grad_norm": 0.13671875, + "learning_rate": 0.00023626105095857756, + "loss": 0.4705, + "step": 141880 + }, + { + "epoch": 7.047283202542962, + "grad_norm": 0.146484375, + "learning_rate": 0.000236221317174928, + "loss": 0.4768, + "step": 141890 + }, + { + "epoch": 7.047779874838581, + "grad_norm": 0.13671875, + "learning_rate": 0.00023618158339127842, + "loss": 0.4889, + "step": 141900 + }, + { + "epoch": 7.048276547134201, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002361418496076289, + "loss": 0.4702, + "step": 141910 + }, + { + "epoch": 7.04877321942982, + "grad_norm": 0.13671875, + "learning_rate": 0.00023610211582397934, + "loss": 0.4579, + "step": 141920 + }, + { + "epoch": 7.04926989172544, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002360623820403298, + "loss": 0.4773, + "step": 141930 + }, + { + "epoch": 7.049766564021059, + "grad_norm": 0.1474609375, + "learning_rate": 0.00023602264825668025, + "loss": 0.4548, + "step": 141940 + }, + { + "epoch": 7.050263236316678, + "grad_norm": 0.12890625, + "learning_rate": 0.0002359829144730307, + "loss": 0.4606, + "step": 141950 + }, + { + "epoch": 7.050759908612298, + "grad_norm": 0.130859375, + "learning_rate": 0.00023594318068938117, + "loss": 0.4856, + "step": 141960 + }, + { + "epoch": 7.051256580907917, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002359034469057316, + "loss": 0.5125, + "step": 141970 + }, + { + "epoch": 7.051753253203536, + "grad_norm": 0.1484375, + "learning_rate": 0.00023586371312208203, + "loss": 0.4888, + "step": 141980 + }, + { + "epoch": 7.052249925499155, + "grad_norm": 0.134765625, + "learning_rate": 0.00023582397933843253, + "loss": 0.4924, + "step": 141990 + }, + { + "epoch": 7.052746597794775, + "grad_norm": 0.14453125, + "learning_rate": 0.00023578424555478295, + "loss": 0.4722, + "step": 142000 + }, + { + "epoch": 7.053243270090395, + "grad_norm": 0.1416015625, + "learning_rate": 0.00023574451177113344, + "loss": 0.4774, + "step": 142010 + }, + { + "epoch": 7.053739942386014, + "grad_norm": 0.130859375, + "learning_rate": 0.00023570477798748386, + "loss": 0.4848, + "step": 142020 + }, + { + "epoch": 7.054236614681633, + "grad_norm": 0.1796875, + "learning_rate": 0.0002356650442038343, + "loss": 0.4656, + "step": 142030 + }, + { + "epoch": 7.054733286977252, + "grad_norm": 0.15234375, + "learning_rate": 0.00023562531042018478, + "loss": 0.5006, + "step": 142040 + }, + { + "epoch": 7.0552299592728716, + "grad_norm": 0.2041015625, + "learning_rate": 0.00023558557663653522, + "loss": 0.4561, + "step": 142050 + }, + { + "epoch": 7.055726631568491, + "grad_norm": 0.1337890625, + "learning_rate": 0.00023554584285288567, + "loss": 0.5041, + "step": 142060 + }, + { + "epoch": 7.056223303864111, + "grad_norm": 0.1376953125, + "learning_rate": 0.00023550610906923614, + "loss": 0.4918, + "step": 142070 + }, + { + "epoch": 7.05671997615973, + "grad_norm": 0.1474609375, + "learning_rate": 0.00023546637528558658, + "loss": 0.4901, + "step": 142080 + }, + { + "epoch": 7.057216648455349, + "grad_norm": 0.15625, + "learning_rate": 0.00023542664150193705, + "loss": 0.4613, + "step": 142090 + }, + { + "epoch": 7.057713320750969, + "grad_norm": 0.1328125, + "learning_rate": 0.0002353869077182875, + "loss": 0.4986, + "step": 142100 + }, + { + "epoch": 7.058209993046588, + "grad_norm": 0.150390625, + "learning_rate": 0.00023534717393463791, + "loss": 0.4867, + "step": 142110 + }, + { + "epoch": 7.058706665342207, + "grad_norm": 0.177734375, + "learning_rate": 0.0002353074401509884, + "loss": 0.4918, + "step": 142120 + }, + { + "epoch": 7.059203337637826, + "grad_norm": 0.1357421875, + "learning_rate": 0.00023526770636733883, + "loss": 0.4797, + "step": 142130 + }, + { + "epoch": 7.0597000099334455, + "grad_norm": 0.12890625, + "learning_rate": 0.00023522797258368927, + "loss": 0.4604, + "step": 142140 + }, + { + "epoch": 7.060196682229066, + "grad_norm": 0.1337890625, + "learning_rate": 0.00023518823880003975, + "loss": 0.4562, + "step": 142150 + }, + { + "epoch": 7.060693354524685, + "grad_norm": 0.177734375, + "learning_rate": 0.0002351485050163902, + "loss": 0.4999, + "step": 142160 + }, + { + "epoch": 7.061190026820304, + "grad_norm": 0.14453125, + "learning_rate": 0.00023510877123274066, + "loss": 0.5229, + "step": 142170 + }, + { + "epoch": 7.061686699115923, + "grad_norm": 0.150390625, + "learning_rate": 0.0002350690374490911, + "loss": 0.4936, + "step": 142180 + }, + { + "epoch": 7.0621833714115425, + "grad_norm": 0.1513671875, + "learning_rate": 0.00023502930366544152, + "loss": 0.4659, + "step": 142190 + }, + { + "epoch": 7.062680043707162, + "grad_norm": 0.150390625, + "learning_rate": 0.00023498956988179202, + "loss": 0.5114, + "step": 142200 + }, + { + "epoch": 7.063176716002781, + "grad_norm": 0.1611328125, + "learning_rate": 0.00023494983609814244, + "loss": 0.4818, + "step": 142210 + }, + { + "epoch": 7.063673388298401, + "grad_norm": 0.1357421875, + "learning_rate": 0.00023491010231449294, + "loss": 0.4942, + "step": 142220 + }, + { + "epoch": 7.06417006059402, + "grad_norm": 0.1640625, + "learning_rate": 0.00023487036853084335, + "loss": 0.4806, + "step": 142230 + }, + { + "epoch": 7.0646667328896395, + "grad_norm": 0.1328125, + "learning_rate": 0.0002348306347471938, + "loss": 0.5028, + "step": 142240 + }, + { + "epoch": 7.065163405185259, + "grad_norm": 0.146484375, + "learning_rate": 0.00023479090096354427, + "loss": 0.4712, + "step": 142250 + }, + { + "epoch": 7.065660077480878, + "grad_norm": 0.134765625, + "learning_rate": 0.0002347511671798947, + "loss": 0.4996, + "step": 142260 + }, + { + "epoch": 7.066156749776497, + "grad_norm": 0.1337890625, + "learning_rate": 0.00023471143339624516, + "loss": 0.4829, + "step": 142270 + }, + { + "epoch": 7.066653422072116, + "grad_norm": 0.1494140625, + "learning_rate": 0.00023467169961259563, + "loss": 0.5225, + "step": 142280 + }, + { + "epoch": 7.067150094367737, + "grad_norm": 0.13671875, + "learning_rate": 0.00023463196582894607, + "loss": 0.4617, + "step": 142290 + }, + { + "epoch": 7.067646766663356, + "grad_norm": 0.13671875, + "learning_rate": 0.00023459223204529654, + "loss": 0.4767, + "step": 142300 + }, + { + "epoch": 7.068143438958975, + "grad_norm": 0.1572265625, + "learning_rate": 0.000234552498261647, + "loss": 0.5274, + "step": 142310 + }, + { + "epoch": 7.068640111254594, + "grad_norm": 0.15625, + "learning_rate": 0.0002345127644779974, + "loss": 0.4848, + "step": 142320 + }, + { + "epoch": 7.0691367835502135, + "grad_norm": 0.140625, + "learning_rate": 0.0002344730306943479, + "loss": 0.4745, + "step": 142330 + }, + { + "epoch": 7.069633455845833, + "grad_norm": 0.146484375, + "learning_rate": 0.00023443329691069832, + "loss": 0.4779, + "step": 142340 + }, + { + "epoch": 7.070130128141452, + "grad_norm": 0.15625, + "learning_rate": 0.00023439356312704877, + "loss": 0.4938, + "step": 142350 + }, + { + "epoch": 7.070626800437072, + "grad_norm": 0.1630859375, + "learning_rate": 0.00023435382934339924, + "loss": 0.5046, + "step": 142360 + }, + { + "epoch": 7.071123472732691, + "grad_norm": 0.140625, + "learning_rate": 0.00023431409555974968, + "loss": 0.4688, + "step": 142370 + }, + { + "epoch": 7.0716201450283105, + "grad_norm": 0.154296875, + "learning_rate": 0.00023427436177610015, + "loss": 0.4591, + "step": 142380 + }, + { + "epoch": 7.07211681732393, + "grad_norm": 0.12158203125, + "learning_rate": 0.0002342346279924506, + "loss": 0.4907, + "step": 142390 + }, + { + "epoch": 7.072613489619549, + "grad_norm": 0.1318359375, + "learning_rate": 0.00023419489420880104, + "loss": 0.4604, + "step": 142400 + }, + { + "epoch": 7.073110161915168, + "grad_norm": 0.130859375, + "learning_rate": 0.0002341551604251515, + "loss": 0.503, + "step": 142410 + }, + { + "epoch": 7.073606834210787, + "grad_norm": 0.12451171875, + "learning_rate": 0.00023411542664150196, + "loss": 0.4991, + "step": 142420 + }, + { + "epoch": 7.0741035065064075, + "grad_norm": 0.1396484375, + "learning_rate": 0.00023407569285785237, + "loss": 0.472, + "step": 142430 + }, + { + "epoch": 7.074600178802027, + "grad_norm": 0.1357421875, + "learning_rate": 0.00023403595907420287, + "loss": 0.4888, + "step": 142440 + }, + { + "epoch": 7.075096851097646, + "grad_norm": 0.1328125, + "learning_rate": 0.0002339962252905533, + "loss": 0.4842, + "step": 142450 + }, + { + "epoch": 7.075593523393265, + "grad_norm": 0.146484375, + "learning_rate": 0.00023395649150690376, + "loss": 0.4966, + "step": 142460 + }, + { + "epoch": 7.076090195688884, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002339167577232542, + "loss": 0.4971, + "step": 142470 + }, + { + "epoch": 7.076586867984504, + "grad_norm": 0.169921875, + "learning_rate": 0.00023387702393960465, + "loss": 0.4729, + "step": 142480 + }, + { + "epoch": 7.077083540280123, + "grad_norm": 0.18359375, + "learning_rate": 0.00023383729015595512, + "loss": 0.5048, + "step": 142490 + }, + { + "epoch": 7.077580212575742, + "grad_norm": 0.140625, + "learning_rate": 0.00023379755637230557, + "loss": 0.4961, + "step": 142500 + }, + { + "epoch": 7.078076884871362, + "grad_norm": 0.14453125, + "learning_rate": 0.00023375782258865598, + "loss": 0.497, + "step": 142510 + }, + { + "epoch": 7.0785735571669814, + "grad_norm": 0.158203125, + "learning_rate": 0.00023371808880500648, + "loss": 0.4709, + "step": 142520 + }, + { + "epoch": 7.079070229462601, + "grad_norm": 0.140625, + "learning_rate": 0.0002336783550213569, + "loss": 0.514, + "step": 142530 + }, + { + "epoch": 7.07956690175822, + "grad_norm": 0.1328125, + "learning_rate": 0.0002336386212377074, + "loss": 0.4985, + "step": 142540 + }, + { + "epoch": 7.080063574053839, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002335988874540578, + "loss": 0.4723, + "step": 142550 + }, + { + "epoch": 7.080560246349458, + "grad_norm": 0.1533203125, + "learning_rate": 0.00023355915367040826, + "loss": 0.5067, + "step": 142560 + }, + { + "epoch": 7.081056918645078, + "grad_norm": 0.1376953125, + "learning_rate": 0.00023351941988675873, + "loss": 0.4608, + "step": 142570 + }, + { + "epoch": 7.081553590940698, + "grad_norm": 0.1552734375, + "learning_rate": 0.00023347968610310917, + "loss": 0.4775, + "step": 142580 + }, + { + "epoch": 7.082050263236317, + "grad_norm": 0.1611328125, + "learning_rate": 0.00023343995231945964, + "loss": 0.4837, + "step": 142590 + }, + { + "epoch": 7.082546935531936, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002334002185358101, + "loss": 0.4759, + "step": 142600 + }, + { + "epoch": 7.083043607827555, + "grad_norm": 0.1513671875, + "learning_rate": 0.00023336048475216053, + "loss": 0.4956, + "step": 142610 + }, + { + "epoch": 7.083540280123175, + "grad_norm": 0.1328125, + "learning_rate": 0.000233320750968511, + "loss": 0.4865, + "step": 142620 + }, + { + "epoch": 7.084036952418794, + "grad_norm": 0.130859375, + "learning_rate": 0.00023328101718486145, + "loss": 0.4843, + "step": 142630 + }, + { + "epoch": 7.084533624714413, + "grad_norm": 0.138671875, + "learning_rate": 0.00023324128340121187, + "loss": 0.4738, + "step": 142640 + }, + { + "epoch": 7.085030297010033, + "grad_norm": 0.1513671875, + "learning_rate": 0.00023320154961756236, + "loss": 0.4977, + "step": 142650 + }, + { + "epoch": 7.085526969305652, + "grad_norm": 0.125, + "learning_rate": 0.00023316181583391278, + "loss": 0.4792, + "step": 142660 + }, + { + "epoch": 7.086023641601272, + "grad_norm": 0.1513671875, + "learning_rate": 0.00023312208205026328, + "loss": 0.4879, + "step": 142670 + }, + { + "epoch": 7.086520313896891, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002330823482666137, + "loss": 0.5005, + "step": 142680 + }, + { + "epoch": 7.08701698619251, + "grad_norm": 0.1279296875, + "learning_rate": 0.00023304261448296414, + "loss": 0.4676, + "step": 142690 + }, + { + "epoch": 7.087513658488129, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002330028806993146, + "loss": 0.4853, + "step": 142700 + }, + { + "epoch": 7.0880103307837485, + "grad_norm": 0.1318359375, + "learning_rate": 0.00023296314691566506, + "loss": 0.5005, + "step": 142710 + }, + { + "epoch": 7.088507003079369, + "grad_norm": 0.158203125, + "learning_rate": 0.0002329234131320155, + "loss": 0.4629, + "step": 142720 + }, + { + "epoch": 7.089003675374988, + "grad_norm": 0.134765625, + "learning_rate": 0.00023288367934836597, + "loss": 0.4831, + "step": 142730 + }, + { + "epoch": 7.089500347670607, + "grad_norm": 0.1533203125, + "learning_rate": 0.00023284394556471642, + "loss": 0.5082, + "step": 142740 + }, + { + "epoch": 7.089997019966226, + "grad_norm": 0.126953125, + "learning_rate": 0.0002328042117810669, + "loss": 0.4694, + "step": 142750 + }, + { + "epoch": 7.090493692261846, + "grad_norm": 0.158203125, + "learning_rate": 0.0002327644779974173, + "loss": 0.4828, + "step": 142760 + }, + { + "epoch": 7.090990364557465, + "grad_norm": 0.15234375, + "learning_rate": 0.00023272474421376775, + "loss": 0.5281, + "step": 142770 + }, + { + "epoch": 7.091487036853084, + "grad_norm": 0.1376953125, + "learning_rate": 0.00023268501043011822, + "loss": 0.4916, + "step": 142780 + }, + { + "epoch": 7.091983709148703, + "grad_norm": 0.1748046875, + "learning_rate": 0.00023264527664646867, + "loss": 0.49, + "step": 142790 + }, + { + "epoch": 7.092480381444323, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002326055428628191, + "loss": 0.4367, + "step": 142800 + }, + { + "epoch": 7.092977053739943, + "grad_norm": 0.1474609375, + "learning_rate": 0.00023256580907916958, + "loss": 0.4794, + "step": 142810 + }, + { + "epoch": 7.093473726035562, + "grad_norm": 0.1533203125, + "learning_rate": 0.00023252607529552003, + "loss": 0.468, + "step": 142820 + }, + { + "epoch": 7.093970398331181, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002324863415118705, + "loss": 0.506, + "step": 142830 + }, + { + "epoch": 7.0944670706268, + "grad_norm": 0.1298828125, + "learning_rate": 0.00023244660772822094, + "loss": 0.4884, + "step": 142840 + }, + { + "epoch": 7.0949637429224195, + "grad_norm": 0.1484375, + "learning_rate": 0.00023240687394457136, + "loss": 0.5043, + "step": 142850 + }, + { + "epoch": 7.095460415218039, + "grad_norm": 0.1640625, + "learning_rate": 0.00023236714016092186, + "loss": 0.4999, + "step": 142860 + }, + { + "epoch": 7.095957087513659, + "grad_norm": 0.1533203125, + "learning_rate": 0.00023232740637727227, + "loss": 0.4508, + "step": 142870 + }, + { + "epoch": 7.096453759809278, + "grad_norm": 0.154296875, + "learning_rate": 0.00023228767259362272, + "loss": 0.4809, + "step": 142880 + }, + { + "epoch": 7.096950432104897, + "grad_norm": 0.14453125, + "learning_rate": 0.0002322479388099732, + "loss": 0.4774, + "step": 142890 + }, + { + "epoch": 7.0974471044005165, + "grad_norm": 0.154296875, + "learning_rate": 0.00023220820502632363, + "loss": 0.4958, + "step": 142900 + }, + { + "epoch": 7.097943776696136, + "grad_norm": 0.158203125, + "learning_rate": 0.0002321684712426741, + "loss": 0.5162, + "step": 142910 + }, + { + "epoch": 7.098440448991755, + "grad_norm": 0.1767578125, + "learning_rate": 0.00023212873745902455, + "loss": 0.4788, + "step": 142920 + }, + { + "epoch": 7.098937121287374, + "grad_norm": 0.1376953125, + "learning_rate": 0.000232089003675375, + "loss": 0.4658, + "step": 142930 + }, + { + "epoch": 7.099433793582994, + "grad_norm": 0.1533203125, + "learning_rate": 0.00023204926989172546, + "loss": 0.5278, + "step": 142940 + }, + { + "epoch": 7.0999304658786135, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002320095361080759, + "loss": 0.5104, + "step": 142950 + }, + { + "epoch": 7.100427138174233, + "grad_norm": 0.1943359375, + "learning_rate": 0.00023196980232442633, + "loss": 0.4983, + "step": 142960 + }, + { + "epoch": 7.100923810469852, + "grad_norm": 0.140625, + "learning_rate": 0.00023193006854077682, + "loss": 0.4831, + "step": 142970 + }, + { + "epoch": 7.101420482765471, + "grad_norm": 0.2216796875, + "learning_rate": 0.00023189033475712724, + "loss": 0.4977, + "step": 142980 + }, + { + "epoch": 7.1019171550610904, + "grad_norm": 0.1484375, + "learning_rate": 0.00023185060097347774, + "loss": 0.481, + "step": 142990 + }, + { + "epoch": 7.10241382735671, + "grad_norm": 0.134765625, + "learning_rate": 0.00023181086718982816, + "loss": 0.5166, + "step": 143000 + }, + { + "epoch": 7.10291049965233, + "grad_norm": 0.16015625, + "learning_rate": 0.0002317711334061786, + "loss": 0.5247, + "step": 143010 + }, + { + "epoch": 7.103407171947949, + "grad_norm": 0.12890625, + "learning_rate": 0.00023173139962252907, + "loss": 0.4746, + "step": 143020 + }, + { + "epoch": 7.103903844243568, + "grad_norm": 0.13671875, + "learning_rate": 0.00023169166583887952, + "loss": 0.5252, + "step": 143030 + }, + { + "epoch": 7.1044005165391875, + "grad_norm": 0.1416015625, + "learning_rate": 0.00023165193205523, + "loss": 0.4897, + "step": 143040 + }, + { + "epoch": 7.104897188834807, + "grad_norm": 0.189453125, + "learning_rate": 0.00023161219827158043, + "loss": 0.4883, + "step": 143050 + }, + { + "epoch": 7.105393861130426, + "grad_norm": 0.1357421875, + "learning_rate": 0.00023157246448793085, + "loss": 0.5, + "step": 143060 + }, + { + "epoch": 7.105890533426045, + "grad_norm": 0.1826171875, + "learning_rate": 0.00023153273070428135, + "loss": 0.4721, + "step": 143070 + }, + { + "epoch": 7.106387205721665, + "grad_norm": 0.1357421875, + "learning_rate": 0.00023149299692063177, + "loss": 0.4643, + "step": 143080 + }, + { + "epoch": 7.1068838780172845, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002314532631369822, + "loss": 0.461, + "step": 143090 + }, + { + "epoch": 7.107380550312904, + "grad_norm": 0.1298828125, + "learning_rate": 0.00023141352935333268, + "loss": 0.5495, + "step": 143100 + }, + { + "epoch": 7.107877222608523, + "grad_norm": 0.1396484375, + "learning_rate": 0.00023137379556968313, + "loss": 0.4675, + "step": 143110 + }, + { + "epoch": 7.108373894904142, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002313340617860336, + "loss": 0.4878, + "step": 143120 + }, + { + "epoch": 7.108870567199761, + "grad_norm": 0.1484375, + "learning_rate": 0.00023129432800238404, + "loss": 0.4986, + "step": 143130 + }, + { + "epoch": 7.109367239495381, + "grad_norm": 0.1455078125, + "learning_rate": 0.00023125459421873449, + "loss": 0.4905, + "step": 143140 + }, + { + "epoch": 7.109863911791001, + "grad_norm": 0.126953125, + "learning_rate": 0.00023121486043508496, + "loss": 0.4642, + "step": 143150 + }, + { + "epoch": 7.11036058408662, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002311751266514354, + "loss": 0.4544, + "step": 143160 + }, + { + "epoch": 7.110857256382239, + "grad_norm": 0.1455078125, + "learning_rate": 0.00023113539286778582, + "loss": 0.4911, + "step": 143170 + }, + { + "epoch": 7.111353928677858, + "grad_norm": 0.1337890625, + "learning_rate": 0.00023109565908413632, + "loss": 0.4653, + "step": 143180 + }, + { + "epoch": 7.111850600973478, + "grad_norm": 0.1337890625, + "learning_rate": 0.00023105592530048673, + "loss": 0.4804, + "step": 143190 + }, + { + "epoch": 7.112347273269097, + "grad_norm": 0.171875, + "learning_rate": 0.00023101619151683723, + "loss": 0.4988, + "step": 143200 + }, + { + "epoch": 7.112843945564716, + "grad_norm": 0.205078125, + "learning_rate": 0.00023097645773318765, + "loss": 0.4689, + "step": 143210 + }, + { + "epoch": 7.113340617860335, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002309367239495381, + "loss": 0.506, + "step": 143220 + }, + { + "epoch": 7.1138372901559555, + "grad_norm": 0.150390625, + "learning_rate": 0.00023089699016588856, + "loss": 0.5052, + "step": 143230 + }, + { + "epoch": 7.114333962451575, + "grad_norm": 0.1435546875, + "learning_rate": 0.000230857256382239, + "loss": 0.4921, + "step": 143240 + }, + { + "epoch": 7.114830634747194, + "grad_norm": 0.1591796875, + "learning_rate": 0.00023081752259858945, + "loss": 0.5034, + "step": 143250 + }, + { + "epoch": 7.115327307042813, + "grad_norm": 0.142578125, + "learning_rate": 0.00023077778881493992, + "loss": 0.4747, + "step": 143260 + }, + { + "epoch": 7.115823979338432, + "grad_norm": 0.142578125, + "learning_rate": 0.00023073805503129037, + "loss": 0.4833, + "step": 143270 + }, + { + "epoch": 7.116320651634052, + "grad_norm": 0.1328125, + "learning_rate": 0.00023069832124764084, + "loss": 0.4776, + "step": 143280 + }, + { + "epoch": 7.116817323929671, + "grad_norm": 0.126953125, + "learning_rate": 0.00023065858746399128, + "loss": 0.452, + "step": 143290 + }, + { + "epoch": 7.117313996225291, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002306188536803417, + "loss": 0.499, + "step": 143300 + }, + { + "epoch": 7.11781066852091, + "grad_norm": 0.15234375, + "learning_rate": 0.00023057911989669217, + "loss": 0.5051, + "step": 143310 + }, + { + "epoch": 7.118307340816529, + "grad_norm": 0.1357421875, + "learning_rate": 0.00023053938611304262, + "loss": 0.487, + "step": 143320 + }, + { + "epoch": 7.118804013112149, + "grad_norm": 0.1259765625, + "learning_rate": 0.00023049965232939306, + "loss": 0.505, + "step": 143330 + }, + { + "epoch": 7.119300685407768, + "grad_norm": 0.2021484375, + "learning_rate": 0.00023045991854574353, + "loss": 0.5393, + "step": 143340 + }, + { + "epoch": 7.119797357703387, + "grad_norm": 0.12353515625, + "learning_rate": 0.00023042018476209398, + "loss": 0.4801, + "step": 143350 + }, + { + "epoch": 7.120294029999006, + "grad_norm": 0.1708984375, + "learning_rate": 0.00023038045097844445, + "loss": 0.5147, + "step": 143360 + }, + { + "epoch": 7.120790702294626, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002303407171947949, + "loss": 0.4547, + "step": 143370 + }, + { + "epoch": 7.121287374590246, + "grad_norm": 0.1279296875, + "learning_rate": 0.0002303009834111453, + "loss": 0.4809, + "step": 143380 + }, + { + "epoch": 7.121784046885865, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002302612496274958, + "loss": 0.5019, + "step": 143390 + }, + { + "epoch": 7.122280719181484, + "grad_norm": 0.138671875, + "learning_rate": 0.00023022151584384623, + "loss": 0.5201, + "step": 143400 + }, + { + "epoch": 7.122777391477103, + "grad_norm": 0.1630859375, + "learning_rate": 0.00023018178206019667, + "loss": 0.4755, + "step": 143410 + }, + { + "epoch": 7.1232740637727225, + "grad_norm": 0.1357421875, + "learning_rate": 0.00023014204827654714, + "loss": 0.4496, + "step": 143420 + }, + { + "epoch": 7.123770736068342, + "grad_norm": 0.158203125, + "learning_rate": 0.00023010231449289759, + "loss": 0.5056, + "step": 143430 + }, + { + "epoch": 7.124267408363962, + "grad_norm": 0.1396484375, + "learning_rate": 0.00023006258070924806, + "loss": 0.4708, + "step": 143440 + }, + { + "epoch": 7.124764080659581, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002300228469255985, + "loss": 0.5275, + "step": 143450 + }, + { + "epoch": 7.1252607529552, + "grad_norm": 0.1396484375, + "learning_rate": 0.00022998311314194895, + "loss": 0.4782, + "step": 143460 + }, + { + "epoch": 7.12575742525082, + "grad_norm": 0.1669921875, + "learning_rate": 0.00022994337935829942, + "loss": 0.4861, + "step": 143470 + }, + { + "epoch": 7.126254097546439, + "grad_norm": 0.138671875, + "learning_rate": 0.00022990364557464986, + "loss": 0.5009, + "step": 143480 + }, + { + "epoch": 7.126750769842058, + "grad_norm": 0.1962890625, + "learning_rate": 0.00022986391179100033, + "loss": 0.4984, + "step": 143490 + }, + { + "epoch": 7.127247442137677, + "grad_norm": 0.171875, + "learning_rate": 0.00022982417800735078, + "loss": 0.4795, + "step": 143500 + }, + { + "epoch": 7.1277441144332965, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002297844442237012, + "loss": 0.498, + "step": 143510 + }, + { + "epoch": 7.128240786728917, + "grad_norm": 0.1640625, + "learning_rate": 0.0002297447104400517, + "loss": 0.4772, + "step": 143520 + }, + { + "epoch": 7.128737459024536, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002297049766564021, + "loss": 0.4932, + "step": 143530 + }, + { + "epoch": 7.129234131320155, + "grad_norm": 0.138671875, + "learning_rate": 0.00022966524287275255, + "loss": 0.5033, + "step": 143540 + }, + { + "epoch": 7.129730803615774, + "grad_norm": 0.134765625, + "learning_rate": 0.00022962550908910302, + "loss": 0.4864, + "step": 143550 + }, + { + "epoch": 7.1302274759113935, + "grad_norm": 0.169921875, + "learning_rate": 0.00022958577530545347, + "loss": 0.4603, + "step": 143560 + }, + { + "epoch": 7.130724148207013, + "grad_norm": 0.15625, + "learning_rate": 0.00022954604152180394, + "loss": 0.4982, + "step": 143570 + }, + { + "epoch": 7.131220820502632, + "grad_norm": 0.134765625, + "learning_rate": 0.00022950630773815438, + "loss": 0.4506, + "step": 143580 + }, + { + "epoch": 7.131717492798252, + "grad_norm": 0.1455078125, + "learning_rate": 0.00022946657395450483, + "loss": 0.5013, + "step": 143590 + }, + { + "epoch": 7.132214165093871, + "grad_norm": 0.146484375, + "learning_rate": 0.0002294268401708553, + "loss": 0.4878, + "step": 143600 + }, + { + "epoch": 7.1327108373894905, + "grad_norm": 0.1416015625, + "learning_rate": 0.00022938710638720572, + "loss": 0.5103, + "step": 143610 + }, + { + "epoch": 7.13320750968511, + "grad_norm": 0.18359375, + "learning_rate": 0.00022934737260355616, + "loss": 0.4955, + "step": 143620 + }, + { + "epoch": 7.133704181980729, + "grad_norm": 0.171875, + "learning_rate": 0.00022930763881990663, + "loss": 0.4691, + "step": 143630 + }, + { + "epoch": 7.134200854276348, + "grad_norm": 0.154296875, + "learning_rate": 0.00022926790503625708, + "loss": 0.4869, + "step": 143640 + }, + { + "epoch": 7.134697526571967, + "grad_norm": 0.1396484375, + "learning_rate": 0.00022922817125260755, + "loss": 0.5061, + "step": 143650 + }, + { + "epoch": 7.1351941988675875, + "grad_norm": 0.15625, + "learning_rate": 0.000229188437468958, + "loss": 0.4892, + "step": 143660 + }, + { + "epoch": 7.135690871163207, + "grad_norm": 0.142578125, + "learning_rate": 0.00022914870368530844, + "loss": 0.4848, + "step": 143670 + }, + { + "epoch": 7.136187543458826, + "grad_norm": 0.140625, + "learning_rate": 0.0002291089699016589, + "loss": 0.4725, + "step": 143680 + }, + { + "epoch": 7.136684215754445, + "grad_norm": 0.1435546875, + "learning_rate": 0.00022906923611800935, + "loss": 0.4675, + "step": 143690 + }, + { + "epoch": 7.1371808880500645, + "grad_norm": 0.138671875, + "learning_rate": 0.00022902950233435977, + "loss": 0.4967, + "step": 143700 + }, + { + "epoch": 7.137677560345684, + "grad_norm": 0.16796875, + "learning_rate": 0.00022898976855071027, + "loss": 0.486, + "step": 143710 + }, + { + "epoch": 7.138174232641303, + "grad_norm": 0.14453125, + "learning_rate": 0.00022895003476706069, + "loss": 0.5012, + "step": 143720 + }, + { + "epoch": 7.138670904936923, + "grad_norm": 0.1435546875, + "learning_rate": 0.00022891030098341118, + "loss": 0.492, + "step": 143730 + }, + { + "epoch": 7.139167577232542, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002288705671997616, + "loss": 0.462, + "step": 143740 + }, + { + "epoch": 7.1396642495281615, + "grad_norm": 0.2060546875, + "learning_rate": 0.00022883083341611205, + "loss": 0.4899, + "step": 143750 + }, + { + "epoch": 7.140160921823781, + "grad_norm": 0.1318359375, + "learning_rate": 0.00022879109963246252, + "loss": 0.4951, + "step": 143760 + }, + { + "epoch": 7.1406575941194, + "grad_norm": 0.1396484375, + "learning_rate": 0.00022875136584881296, + "loss": 0.4571, + "step": 143770 + }, + { + "epoch": 7.141154266415019, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002287116320651634, + "loss": 0.4952, + "step": 143780 + }, + { + "epoch": 7.141650938710638, + "grad_norm": 0.126953125, + "learning_rate": 0.00022867189828151388, + "loss": 0.4873, + "step": 143790 + }, + { + "epoch": 7.1421476110062585, + "grad_norm": 0.1474609375, + "learning_rate": 0.00022863216449786432, + "loss": 0.4599, + "step": 143800 + }, + { + "epoch": 7.142644283301878, + "grad_norm": 0.16796875, + "learning_rate": 0.0002285924307142148, + "loss": 0.4905, + "step": 143810 + }, + { + "epoch": 7.143140955597497, + "grad_norm": 0.1689453125, + "learning_rate": 0.00022855269693056524, + "loss": 0.4881, + "step": 143820 + }, + { + "epoch": 7.143637627893116, + "grad_norm": 0.1591796875, + "learning_rate": 0.00022851296314691565, + "loss": 0.4771, + "step": 143830 + }, + { + "epoch": 7.144134300188735, + "grad_norm": 0.150390625, + "learning_rate": 0.00022847322936326615, + "loss": 0.4965, + "step": 143840 + }, + { + "epoch": 7.144630972484355, + "grad_norm": 0.1357421875, + "learning_rate": 0.00022843349557961657, + "loss": 0.5024, + "step": 143850 + }, + { + "epoch": 7.145127644779974, + "grad_norm": 0.1474609375, + "learning_rate": 0.00022839376179596704, + "loss": 0.4866, + "step": 143860 + }, + { + "epoch": 7.145624317075594, + "grad_norm": 0.140625, + "learning_rate": 0.00022835402801231748, + "loss": 0.5024, + "step": 143870 + }, + { + "epoch": 7.146120989371213, + "grad_norm": 0.1455078125, + "learning_rate": 0.00022831429422866793, + "loss": 0.4544, + "step": 143880 + }, + { + "epoch": 7.146617661666832, + "grad_norm": 0.1484375, + "learning_rate": 0.0002282745604450184, + "loss": 0.4847, + "step": 143890 + }, + { + "epoch": 7.147114333962452, + "grad_norm": 0.1220703125, + "learning_rate": 0.00022823482666136884, + "loss": 0.4629, + "step": 143900 + }, + { + "epoch": 7.147611006258071, + "grad_norm": 0.1572265625, + "learning_rate": 0.00022819509287771926, + "loss": 0.5042, + "step": 143910 + }, + { + "epoch": 7.14810767855369, + "grad_norm": 0.140625, + "learning_rate": 0.00022815535909406976, + "loss": 0.4641, + "step": 143920 + }, + { + "epoch": 7.148604350849309, + "grad_norm": 0.15234375, + "learning_rate": 0.00022811562531042018, + "loss": 0.4813, + "step": 143930 + }, + { + "epoch": 7.149101023144929, + "grad_norm": 0.150390625, + "learning_rate": 0.00022807589152677068, + "loss": 0.4937, + "step": 143940 + }, + { + "epoch": 7.149597695440549, + "grad_norm": 0.15234375, + "learning_rate": 0.0002280361577431211, + "loss": 0.4746, + "step": 143950 + }, + { + "epoch": 7.150094367736168, + "grad_norm": 0.150390625, + "learning_rate": 0.00022799642395947154, + "loss": 0.4798, + "step": 143960 + }, + { + "epoch": 7.150591040031787, + "grad_norm": 0.134765625, + "learning_rate": 0.000227956690175822, + "loss": 0.5005, + "step": 143970 + }, + { + "epoch": 7.151087712327406, + "grad_norm": 0.12890625, + "learning_rate": 0.00022791695639217245, + "loss": 0.4803, + "step": 143980 + }, + { + "epoch": 7.151584384623026, + "grad_norm": 0.130859375, + "learning_rate": 0.0002278772226085229, + "loss": 0.4651, + "step": 143990 + }, + { + "epoch": 7.152081056918645, + "grad_norm": 0.140625, + "learning_rate": 0.00022783748882487337, + "loss": 0.4723, + "step": 144000 + }, + { + "epoch": 7.152577729214264, + "grad_norm": 0.17578125, + "learning_rate": 0.0002277977550412238, + "loss": 0.4977, + "step": 144010 + }, + { + "epoch": 7.153074401509884, + "grad_norm": 0.1494140625, + "learning_rate": 0.00022775802125757428, + "loss": 0.5051, + "step": 144020 + }, + { + "epoch": 7.153571073805503, + "grad_norm": 0.12890625, + "learning_rate": 0.00022771828747392473, + "loss": 0.5129, + "step": 144030 + }, + { + "epoch": 7.154067746101123, + "grad_norm": 0.1376953125, + "learning_rate": 0.00022767855369027515, + "loss": 0.4887, + "step": 144040 + }, + { + "epoch": 7.154564418396742, + "grad_norm": 0.17578125, + "learning_rate": 0.00022763881990662564, + "loss": 0.5253, + "step": 144050 + }, + { + "epoch": 7.155061090692361, + "grad_norm": 0.134765625, + "learning_rate": 0.00022759908612297606, + "loss": 0.4954, + "step": 144060 + }, + { + "epoch": 7.15555776298798, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002275593523393265, + "loss": 0.4925, + "step": 144070 + }, + { + "epoch": 7.1560544352835995, + "grad_norm": 0.1298828125, + "learning_rate": 0.00022751961855567698, + "loss": 0.4706, + "step": 144080 + }, + { + "epoch": 7.15655110757922, + "grad_norm": 0.1669921875, + "learning_rate": 0.00022747988477202742, + "loss": 0.4558, + "step": 144090 + }, + { + "epoch": 7.157047779874839, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002274401509883779, + "loss": 0.4933, + "step": 144100 + }, + { + "epoch": 7.157544452170458, + "grad_norm": 0.1572265625, + "learning_rate": 0.00022740041720472834, + "loss": 0.484, + "step": 144110 + }, + { + "epoch": 7.158041124466077, + "grad_norm": 0.1357421875, + "learning_rate": 0.00022736068342107878, + "loss": 0.5118, + "step": 144120 + }, + { + "epoch": 7.1585377967616965, + "grad_norm": 0.1328125, + "learning_rate": 0.00022732094963742925, + "loss": 0.4969, + "step": 144130 + }, + { + "epoch": 7.159034469057316, + "grad_norm": 0.16015625, + "learning_rate": 0.0002272812158537797, + "loss": 0.4972, + "step": 144140 + }, + { + "epoch": 7.159531141352935, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002272414820701301, + "loss": 0.5192, + "step": 144150 + }, + { + "epoch": 7.160027813648554, + "grad_norm": 0.146484375, + "learning_rate": 0.00022720174828648058, + "loss": 0.4866, + "step": 144160 + }, + { + "epoch": 7.160524485944174, + "grad_norm": 0.14453125, + "learning_rate": 0.00022716201450283103, + "loss": 0.4867, + "step": 144170 + }, + { + "epoch": 7.161021158239794, + "grad_norm": 0.1328125, + "learning_rate": 0.0002271222807191815, + "loss": 0.5067, + "step": 144180 + }, + { + "epoch": 7.161517830535413, + "grad_norm": 0.1474609375, + "learning_rate": 0.00022708254693553194, + "loss": 0.4791, + "step": 144190 + }, + { + "epoch": 7.162014502831032, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002270428131518824, + "loss": 0.4805, + "step": 144200 + }, + { + "epoch": 7.162511175126651, + "grad_norm": 0.1630859375, + "learning_rate": 0.00022700307936823286, + "loss": 0.4814, + "step": 144210 + }, + { + "epoch": 7.1630078474222705, + "grad_norm": 0.146484375, + "learning_rate": 0.0002269633455845833, + "loss": 0.4961, + "step": 144220 + }, + { + "epoch": 7.16350451971789, + "grad_norm": 0.150390625, + "learning_rate": 0.00022692361180093372, + "loss": 0.4874, + "step": 144230 + }, + { + "epoch": 7.16400119201351, + "grad_norm": 0.1591796875, + "learning_rate": 0.00022688387801728422, + "loss": 0.5099, + "step": 144240 + }, + { + "epoch": 7.164497864309129, + "grad_norm": 0.154296875, + "learning_rate": 0.00022684414423363464, + "loss": 0.5031, + "step": 144250 + }, + { + "epoch": 7.164994536604748, + "grad_norm": 0.14453125, + "learning_rate": 0.00022680441044998514, + "loss": 0.5086, + "step": 144260 + }, + { + "epoch": 7.1654912089003675, + "grad_norm": 0.138671875, + "learning_rate": 0.00022676467666633555, + "loss": 0.4491, + "step": 144270 + }, + { + "epoch": 7.165987881195987, + "grad_norm": 0.1923828125, + "learning_rate": 0.000226724942882686, + "loss": 0.4925, + "step": 144280 + }, + { + "epoch": 7.166484553491606, + "grad_norm": 0.15625, + "learning_rate": 0.00022668520909903647, + "loss": 0.4784, + "step": 144290 + }, + { + "epoch": 7.166981225787225, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002266454753153869, + "loss": 0.5344, + "step": 144300 + }, + { + "epoch": 7.167477898082845, + "grad_norm": 0.1435546875, + "learning_rate": 0.00022660574153173738, + "loss": 0.4703, + "step": 144310 + }, + { + "epoch": 7.1679745703784645, + "grad_norm": 0.13671875, + "learning_rate": 0.00022656600774808783, + "loss": 0.502, + "step": 144320 + }, + { + "epoch": 7.168471242674084, + "grad_norm": 0.169921875, + "learning_rate": 0.00022652627396443827, + "loss": 0.4975, + "step": 144330 + }, + { + "epoch": 7.168967914969703, + "grad_norm": 0.2041015625, + "learning_rate": 0.00022648654018078874, + "loss": 0.4867, + "step": 144340 + }, + { + "epoch": 7.169464587265322, + "grad_norm": 0.142578125, + "learning_rate": 0.0002264468063971392, + "loss": 0.4843, + "step": 144350 + }, + { + "epoch": 7.169961259560941, + "grad_norm": 0.181640625, + "learning_rate": 0.0002264070726134896, + "loss": 0.4771, + "step": 144360 + }, + { + "epoch": 7.170457931856561, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002263673388298401, + "loss": 0.4978, + "step": 144370 + }, + { + "epoch": 7.170954604152181, + "grad_norm": 0.146484375, + "learning_rate": 0.00022632760504619052, + "loss": 0.4906, + "step": 144380 + }, + { + "epoch": 7.1714512764478, + "grad_norm": 0.1337890625, + "learning_rate": 0.00022628787126254102, + "loss": 0.4696, + "step": 144390 + }, + { + "epoch": 7.171947948743419, + "grad_norm": 0.140625, + "learning_rate": 0.00022624813747889144, + "loss": 0.4589, + "step": 144400 + }, + { + "epoch": 7.1724446210390385, + "grad_norm": 0.1484375, + "learning_rate": 0.00022620840369524188, + "loss": 0.4837, + "step": 144410 + }, + { + "epoch": 7.172941293334658, + "grad_norm": 0.15234375, + "learning_rate": 0.00022616866991159235, + "loss": 0.4701, + "step": 144420 + }, + { + "epoch": 7.173437965630277, + "grad_norm": 0.142578125, + "learning_rate": 0.0002261289361279428, + "loss": 0.4935, + "step": 144430 + }, + { + "epoch": 7.173934637925896, + "grad_norm": 0.158203125, + "learning_rate": 0.00022608920234429324, + "loss": 0.5182, + "step": 144440 + }, + { + "epoch": 7.174431310221516, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002260494685606437, + "loss": 0.4686, + "step": 144450 + }, + { + "epoch": 7.1749279825171355, + "grad_norm": 0.1630859375, + "learning_rate": 0.00022600973477699413, + "loss": 0.4775, + "step": 144460 + }, + { + "epoch": 7.175424654812755, + "grad_norm": 0.14453125, + "learning_rate": 0.00022597000099334463, + "loss": 0.5098, + "step": 144470 + }, + { + "epoch": 7.175921327108374, + "grad_norm": 0.146484375, + "learning_rate": 0.00022593026720969504, + "loss": 0.4768, + "step": 144480 + }, + { + "epoch": 7.176417999403993, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002258905334260455, + "loss": 0.4488, + "step": 144490 + }, + { + "epoch": 7.176914671699612, + "grad_norm": 0.1669921875, + "learning_rate": 0.00022585079964239596, + "loss": 0.4852, + "step": 144500 + }, + { + "epoch": 7.177411343995232, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002258110658587464, + "loss": 0.4838, + "step": 144510 + }, + { + "epoch": 7.177908016290852, + "grad_norm": 0.1357421875, + "learning_rate": 0.00022577133207509685, + "loss": 0.4947, + "step": 144520 + }, + { + "epoch": 7.178404688586471, + "grad_norm": 0.130859375, + "learning_rate": 0.00022573159829144732, + "loss": 0.4788, + "step": 144530 + }, + { + "epoch": 7.17890136088209, + "grad_norm": 0.1796875, + "learning_rate": 0.00022569186450779776, + "loss": 0.4921, + "step": 144540 + }, + { + "epoch": 7.179398033177709, + "grad_norm": 0.1494140625, + "learning_rate": 0.00022565213072414824, + "loss": 0.473, + "step": 144550 + }, + { + "epoch": 7.179894705473329, + "grad_norm": 0.1396484375, + "learning_rate": 0.00022561239694049868, + "loss": 0.4832, + "step": 144560 + }, + { + "epoch": 7.180391377768948, + "grad_norm": 0.162109375, + "learning_rate": 0.0002255726631568491, + "loss": 0.4857, + "step": 144570 + }, + { + "epoch": 7.180888050064567, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002255329293731996, + "loss": 0.4534, + "step": 144580 + }, + { + "epoch": 7.181384722360186, + "grad_norm": 0.1435546875, + "learning_rate": 0.00022549319558955, + "loss": 0.5032, + "step": 144590 + }, + { + "epoch": 7.181881394655806, + "grad_norm": 0.130859375, + "learning_rate": 0.00022545346180590046, + "loss": 0.4603, + "step": 144600 + }, + { + "epoch": 7.182378066951426, + "grad_norm": 0.1416015625, + "learning_rate": 0.00022541372802225093, + "loss": 0.4819, + "step": 144610 + }, + { + "epoch": 7.182874739247045, + "grad_norm": 0.171875, + "learning_rate": 0.00022537399423860137, + "loss": 0.5002, + "step": 144620 + }, + { + "epoch": 7.183371411542664, + "grad_norm": 0.1396484375, + "learning_rate": 0.00022533426045495184, + "loss": 0.4795, + "step": 144630 + }, + { + "epoch": 7.183868083838283, + "grad_norm": 0.171875, + "learning_rate": 0.0002252945266713023, + "loss": 0.4916, + "step": 144640 + }, + { + "epoch": 7.184364756133903, + "grad_norm": 0.1435546875, + "learning_rate": 0.00022525479288765273, + "loss": 0.4951, + "step": 144650 + }, + { + "epoch": 7.184861428429522, + "grad_norm": 0.138671875, + "learning_rate": 0.0002252150591040032, + "loss": 0.5165, + "step": 144660 + }, + { + "epoch": 7.185358100725142, + "grad_norm": 0.140625, + "learning_rate": 0.00022517532532035365, + "loss": 0.4848, + "step": 144670 + }, + { + "epoch": 7.185854773020761, + "grad_norm": 0.1416015625, + "learning_rate": 0.00022513559153670407, + "loss": 0.4596, + "step": 144680 + }, + { + "epoch": 7.18635144531638, + "grad_norm": 0.140625, + "learning_rate": 0.00022509585775305456, + "loss": 0.4846, + "step": 144690 + }, + { + "epoch": 7.186848117612, + "grad_norm": 0.154296875, + "learning_rate": 0.00022505612396940498, + "loss": 0.4743, + "step": 144700 + }, + { + "epoch": 7.187344789907619, + "grad_norm": 0.140625, + "learning_rate": 0.00022501639018575545, + "loss": 0.5068, + "step": 144710 + }, + { + "epoch": 7.187841462203238, + "grad_norm": 0.142578125, + "learning_rate": 0.0002249766564021059, + "loss": 0.4759, + "step": 144720 + }, + { + "epoch": 7.188338134498857, + "grad_norm": 0.1328125, + "learning_rate": 0.00022493692261845634, + "loss": 0.4783, + "step": 144730 + }, + { + "epoch": 7.188834806794477, + "grad_norm": 0.12255859375, + "learning_rate": 0.0002248971888348068, + "loss": 0.4935, + "step": 144740 + }, + { + "epoch": 7.189331479090097, + "grad_norm": 0.150390625, + "learning_rate": 0.00022485745505115726, + "loss": 0.4897, + "step": 144750 + }, + { + "epoch": 7.189828151385716, + "grad_norm": 0.1337890625, + "learning_rate": 0.00022481772126750773, + "loss": 0.5137, + "step": 144760 + }, + { + "epoch": 7.190324823681335, + "grad_norm": 0.1552734375, + "learning_rate": 0.00022477798748385817, + "loss": 0.4782, + "step": 144770 + }, + { + "epoch": 7.190821495976954, + "grad_norm": 0.169921875, + "learning_rate": 0.0002247382537002086, + "loss": 0.4985, + "step": 144780 + }, + { + "epoch": 7.1913181682725735, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002246985199165591, + "loss": 0.4739, + "step": 144790 + }, + { + "epoch": 7.191814840568193, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002246587861329095, + "loss": 0.4992, + "step": 144800 + }, + { + "epoch": 7.192311512863813, + "grad_norm": 0.14453125, + "learning_rate": 0.00022461905234925995, + "loss": 0.4707, + "step": 144810 + }, + { + "epoch": 7.192808185159432, + "grad_norm": 0.1376953125, + "learning_rate": 0.00022457931856561042, + "loss": 0.4752, + "step": 144820 + }, + { + "epoch": 7.193304857455051, + "grad_norm": 0.142578125, + "learning_rate": 0.00022453958478196086, + "loss": 0.4861, + "step": 144830 + }, + { + "epoch": 7.1938015297506706, + "grad_norm": 0.1640625, + "learning_rate": 0.00022449985099831134, + "loss": 0.4754, + "step": 144840 + }, + { + "epoch": 7.19429820204629, + "grad_norm": 0.1884765625, + "learning_rate": 0.00022446011721466178, + "loss": 0.483, + "step": 144850 + }, + { + "epoch": 7.194794874341909, + "grad_norm": 0.150390625, + "learning_rate": 0.00022442038343101222, + "loss": 0.4937, + "step": 144860 + }, + { + "epoch": 7.195291546637528, + "grad_norm": 0.17578125, + "learning_rate": 0.0002243806496473627, + "loss": 0.4744, + "step": 144870 + }, + { + "epoch": 7.1957882189331475, + "grad_norm": 0.142578125, + "learning_rate": 0.00022434091586371314, + "loss": 0.502, + "step": 144880 + }, + { + "epoch": 7.196284891228768, + "grad_norm": 0.14453125, + "learning_rate": 0.00022430118208006356, + "loss": 0.4647, + "step": 144890 + }, + { + "epoch": 7.196781563524387, + "grad_norm": 0.1474609375, + "learning_rate": 0.00022426144829641406, + "loss": 0.5108, + "step": 144900 + }, + { + "epoch": 7.197278235820006, + "grad_norm": 0.146484375, + "learning_rate": 0.00022422171451276447, + "loss": 0.5119, + "step": 144910 + }, + { + "epoch": 7.197774908115625, + "grad_norm": 0.166015625, + "learning_rate": 0.00022418198072911497, + "loss": 0.4668, + "step": 144920 + }, + { + "epoch": 7.1982715804112445, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002241422469454654, + "loss": 0.513, + "step": 144930 + }, + { + "epoch": 7.198768252706864, + "grad_norm": 0.154296875, + "learning_rate": 0.00022410251316181583, + "loss": 0.4853, + "step": 144940 + }, + { + "epoch": 7.199264925002483, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002240627793781663, + "loss": 0.4552, + "step": 144950 + }, + { + "epoch": 7.199761597298103, + "grad_norm": 0.1572265625, + "learning_rate": 0.00022402304559451675, + "loss": 0.4684, + "step": 144960 + }, + { + "epoch": 7.200258269593722, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002239833118108672, + "loss": 0.4928, + "step": 144970 + }, + { + "epoch": 7.2007549418893415, + "grad_norm": 0.16015625, + "learning_rate": 0.00022394357802721766, + "loss": 0.5074, + "step": 144980 + }, + { + "epoch": 7.201251614184961, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002239038442435681, + "loss": 0.4706, + "step": 144990 + }, + { + "epoch": 7.20174828648058, + "grad_norm": 0.15625, + "learning_rate": 0.00022386411045991858, + "loss": 0.5032, + "step": 145000 + }, + { + "epoch": 7.202244958776199, + "grad_norm": 0.146484375, + "learning_rate": 0.000223824376676269, + "loss": 0.4878, + "step": 145010 + }, + { + "epoch": 7.202741631071818, + "grad_norm": 0.138671875, + "learning_rate": 0.00022378464289261944, + "loss": 0.4989, + "step": 145020 + }, + { + "epoch": 7.2032383033674385, + "grad_norm": 0.1357421875, + "learning_rate": 0.0002237449091089699, + "loss": 0.4722, + "step": 145030 + }, + { + "epoch": 7.203734975663058, + "grad_norm": 0.1474609375, + "learning_rate": 0.00022370517532532036, + "loss": 0.4766, + "step": 145040 + }, + { + "epoch": 7.204231647958677, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002236654415416708, + "loss": 0.5251, + "step": 145050 + }, + { + "epoch": 7.204728320254296, + "grad_norm": 0.140625, + "learning_rate": 0.00022362570775802127, + "loss": 0.4952, + "step": 145060 + }, + { + "epoch": 7.205224992549915, + "grad_norm": 0.189453125, + "learning_rate": 0.00022358597397437172, + "loss": 0.4873, + "step": 145070 + }, + { + "epoch": 7.205721664845535, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002235462401907222, + "loss": 0.4997, + "step": 145080 + }, + { + "epoch": 7.206218337141154, + "grad_norm": 0.1796875, + "learning_rate": 0.00022350650640707263, + "loss": 0.4896, + "step": 145090 + }, + { + "epoch": 7.206715009436774, + "grad_norm": 0.1455078125, + "learning_rate": 0.00022346677262342305, + "loss": 0.4994, + "step": 145100 + }, + { + "epoch": 7.207211681732393, + "grad_norm": 0.1328125, + "learning_rate": 0.00022342703883977355, + "loss": 0.5009, + "step": 145110 + }, + { + "epoch": 7.2077083540280125, + "grad_norm": 0.1474609375, + "learning_rate": 0.00022338730505612396, + "loss": 0.4934, + "step": 145120 + }, + { + "epoch": 7.208205026323632, + "grad_norm": 0.1298828125, + "learning_rate": 0.00022334757127247446, + "loss": 0.4681, + "step": 145130 + }, + { + "epoch": 7.208701698619251, + "grad_norm": 0.2001953125, + "learning_rate": 0.00022330783748882488, + "loss": 0.4923, + "step": 145140 + }, + { + "epoch": 7.20919837091487, + "grad_norm": 0.189453125, + "learning_rate": 0.00022326810370517532, + "loss": 0.4874, + "step": 145150 + }, + { + "epoch": 7.209695043210489, + "grad_norm": 0.142578125, + "learning_rate": 0.0002232283699215258, + "loss": 0.501, + "step": 145160 + }, + { + "epoch": 7.2101917155061095, + "grad_norm": 0.1455078125, + "learning_rate": 0.00022318863613787624, + "loss": 0.4875, + "step": 145170 + }, + { + "epoch": 7.210688387801729, + "grad_norm": 0.134765625, + "learning_rate": 0.00022314890235422668, + "loss": 0.4814, + "step": 145180 + }, + { + "epoch": 7.211185060097348, + "grad_norm": 0.1591796875, + "learning_rate": 0.00022310916857057716, + "loss": 0.4958, + "step": 145190 + }, + { + "epoch": 7.211681732392967, + "grad_norm": 0.142578125, + "learning_rate": 0.0002230694347869276, + "loss": 0.4566, + "step": 145200 + }, + { + "epoch": 7.212178404688586, + "grad_norm": 0.154296875, + "learning_rate": 0.00022302970100327807, + "loss": 0.5105, + "step": 145210 + }, + { + "epoch": 7.212675076984206, + "grad_norm": 0.1435546875, + "learning_rate": 0.00022298996721962852, + "loss": 0.4881, + "step": 145220 + }, + { + "epoch": 7.213171749279825, + "grad_norm": 0.1494140625, + "learning_rate": 0.00022295023343597893, + "loss": 0.4852, + "step": 145230 + }, + { + "epoch": 7.213668421575445, + "grad_norm": 0.1513671875, + "learning_rate": 0.00022291049965232943, + "loss": 0.4856, + "step": 145240 + }, + { + "epoch": 7.214165093871064, + "grad_norm": 0.1357421875, + "learning_rate": 0.00022287076586867985, + "loss": 0.5042, + "step": 145250 + }, + { + "epoch": 7.214661766166683, + "grad_norm": 0.16015625, + "learning_rate": 0.0002228310320850303, + "loss": 0.4929, + "step": 145260 + }, + { + "epoch": 7.215158438462303, + "grad_norm": 0.1513671875, + "learning_rate": 0.00022279129830138076, + "loss": 0.4727, + "step": 145270 + }, + { + "epoch": 7.215655110757922, + "grad_norm": 0.1875, + "learning_rate": 0.0002227515645177312, + "loss": 0.5022, + "step": 145280 + }, + { + "epoch": 7.216151783053541, + "grad_norm": 0.1513671875, + "learning_rate": 0.00022271183073408168, + "loss": 0.4897, + "step": 145290 + }, + { + "epoch": 7.21664845534916, + "grad_norm": 0.1455078125, + "learning_rate": 0.00022267209695043212, + "loss": 0.5057, + "step": 145300 + }, + { + "epoch": 7.21714512764478, + "grad_norm": 0.150390625, + "learning_rate": 0.00022263236316678254, + "loss": 0.464, + "step": 145310 + }, + { + "epoch": 7.2176417999404, + "grad_norm": 0.1357421875, + "learning_rate": 0.00022259262938313304, + "loss": 0.4611, + "step": 145320 + }, + { + "epoch": 7.218138472236019, + "grad_norm": 0.1376953125, + "learning_rate": 0.00022255289559948346, + "loss": 0.4799, + "step": 145330 + }, + { + "epoch": 7.218635144531638, + "grad_norm": 0.142578125, + "learning_rate": 0.0002225131618158339, + "loss": 0.4788, + "step": 145340 + }, + { + "epoch": 7.219131816827257, + "grad_norm": 0.146484375, + "learning_rate": 0.00022247342803218437, + "loss": 0.4758, + "step": 145350 + }, + { + "epoch": 7.219628489122877, + "grad_norm": 0.1650390625, + "learning_rate": 0.00022243369424853482, + "loss": 0.4607, + "step": 145360 + }, + { + "epoch": 7.220125161418496, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002223939604648853, + "loss": 0.512, + "step": 145370 + }, + { + "epoch": 7.220621833714115, + "grad_norm": 0.1806640625, + "learning_rate": 0.00022235422668123573, + "loss": 0.4601, + "step": 145380 + }, + { + "epoch": 7.221118506009735, + "grad_norm": 0.138671875, + "learning_rate": 0.00022231449289758618, + "loss": 0.4845, + "step": 145390 + }, + { + "epoch": 7.221615178305354, + "grad_norm": 0.150390625, + "learning_rate": 0.00022227475911393665, + "loss": 0.4872, + "step": 145400 + }, + { + "epoch": 7.222111850600974, + "grad_norm": 0.1640625, + "learning_rate": 0.0002222350253302871, + "loss": 0.4676, + "step": 145410 + }, + { + "epoch": 7.222608522896593, + "grad_norm": 0.1953125, + "learning_rate": 0.0002221952915466375, + "loss": 0.4826, + "step": 145420 + }, + { + "epoch": 7.223105195192212, + "grad_norm": 0.1572265625, + "learning_rate": 0.000222155557762988, + "loss": 0.5007, + "step": 145430 + }, + { + "epoch": 7.223601867487831, + "grad_norm": 0.1376953125, + "learning_rate": 0.00022211582397933842, + "loss": 0.4959, + "step": 145440 + }, + { + "epoch": 7.2240985397834505, + "grad_norm": 0.158203125, + "learning_rate": 0.00022207609019568892, + "loss": 0.5069, + "step": 145450 + }, + { + "epoch": 7.224595212079071, + "grad_norm": 0.166015625, + "learning_rate": 0.00022203635641203934, + "loss": 0.5187, + "step": 145460 + }, + { + "epoch": 7.22509188437469, + "grad_norm": 0.1376953125, + "learning_rate": 0.00022199662262838978, + "loss": 0.4887, + "step": 145470 + }, + { + "epoch": 7.225588556670309, + "grad_norm": 0.1513671875, + "learning_rate": 0.00022195688884474026, + "loss": 0.4891, + "step": 145480 + }, + { + "epoch": 7.226085228965928, + "grad_norm": 0.15234375, + "learning_rate": 0.0002219171550610907, + "loss": 0.5209, + "step": 145490 + }, + { + "epoch": 7.2265819012615475, + "grad_norm": 0.154296875, + "learning_rate": 0.00022187742127744114, + "loss": 0.4798, + "step": 145500 + }, + { + "epoch": 7.227078573557167, + "grad_norm": 0.1279296875, + "learning_rate": 0.00022183768749379162, + "loss": 0.529, + "step": 145510 + }, + { + "epoch": 7.227575245852786, + "grad_norm": 0.140625, + "learning_rate": 0.00022179795371014206, + "loss": 0.4911, + "step": 145520 + }, + { + "epoch": 7.228071918148406, + "grad_norm": 0.13671875, + "learning_rate": 0.00022175821992649253, + "loss": 0.499, + "step": 145530 + }, + { + "epoch": 7.228568590444025, + "grad_norm": 0.1708984375, + "learning_rate": 0.00022171848614284298, + "loss": 0.49, + "step": 145540 + }, + { + "epoch": 7.229065262739645, + "grad_norm": 0.134765625, + "learning_rate": 0.0002216787523591934, + "loss": 0.5261, + "step": 145550 + }, + { + "epoch": 7.229561935035264, + "grad_norm": 0.14453125, + "learning_rate": 0.00022163901857554386, + "loss": 0.4873, + "step": 145560 + }, + { + "epoch": 7.230058607330883, + "grad_norm": 0.220703125, + "learning_rate": 0.0002215992847918943, + "loss": 0.494, + "step": 145570 + }, + { + "epoch": 7.230555279626502, + "grad_norm": 0.1728515625, + "learning_rate": 0.00022155955100824478, + "loss": 0.4886, + "step": 145580 + }, + { + "epoch": 7.2310519519221215, + "grad_norm": 0.1416015625, + "learning_rate": 0.00022151981722459522, + "loss": 0.4963, + "step": 145590 + }, + { + "epoch": 7.231548624217741, + "grad_norm": 0.1328125, + "learning_rate": 0.00022148008344094567, + "loss": 0.4728, + "step": 145600 + }, + { + "epoch": 7.232045296513361, + "grad_norm": 0.1767578125, + "learning_rate": 0.00022144034965729614, + "loss": 0.4815, + "step": 145610 + }, + { + "epoch": 7.23254196880898, + "grad_norm": 0.173828125, + "learning_rate": 0.00022140061587364658, + "loss": 0.5064, + "step": 145620 + }, + { + "epoch": 7.233038641104599, + "grad_norm": 0.1435546875, + "learning_rate": 0.000221360882089997, + "loss": 0.4569, + "step": 145630 + }, + { + "epoch": 7.2335353134002185, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002213211483063475, + "loss": 0.4598, + "step": 145640 + }, + { + "epoch": 7.234031985695838, + "grad_norm": 0.1533203125, + "learning_rate": 0.00022128141452269792, + "loss": 0.4865, + "step": 145650 + }, + { + "epoch": 7.234528657991457, + "grad_norm": 0.138671875, + "learning_rate": 0.00022124168073904841, + "loss": 0.5023, + "step": 145660 + }, + { + "epoch": 7.235025330287076, + "grad_norm": 0.1552734375, + "learning_rate": 0.00022120194695539883, + "loss": 0.4864, + "step": 145670 + }, + { + "epoch": 7.235522002582696, + "grad_norm": 0.1591796875, + "learning_rate": 0.00022116221317174928, + "loss": 0.5131, + "step": 145680 + }, + { + "epoch": 7.2360186748783155, + "grad_norm": 0.1533203125, + "learning_rate": 0.00022112247938809975, + "loss": 0.4828, + "step": 145690 + }, + { + "epoch": 7.236515347173935, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002210827456044502, + "loss": 0.4809, + "step": 145700 + }, + { + "epoch": 7.237012019469554, + "grad_norm": 0.1396484375, + "learning_rate": 0.00022104301182080064, + "loss": 0.4749, + "step": 145710 + }, + { + "epoch": 7.237508691765173, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002210032780371511, + "loss": 0.5027, + "step": 145720 + }, + { + "epoch": 7.238005364060792, + "grad_norm": 0.154296875, + "learning_rate": 0.00022096354425350155, + "loss": 0.521, + "step": 145730 + }, + { + "epoch": 7.238502036356412, + "grad_norm": 0.1650390625, + "learning_rate": 0.00022092381046985202, + "loss": 0.4808, + "step": 145740 + }, + { + "epoch": 7.238998708652032, + "grad_norm": 0.15625, + "learning_rate": 0.00022088407668620247, + "loss": 0.5053, + "step": 145750 + }, + { + "epoch": 7.239495380947651, + "grad_norm": 0.1337890625, + "learning_rate": 0.00022084434290255288, + "loss": 0.4967, + "step": 145760 + }, + { + "epoch": 7.23999205324327, + "grad_norm": 0.134765625, + "learning_rate": 0.00022080460911890338, + "loss": 0.4675, + "step": 145770 + }, + { + "epoch": 7.2404887255388894, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002207648753352538, + "loss": 0.5069, + "step": 145780 + }, + { + "epoch": 7.240985397834509, + "grad_norm": 0.1357421875, + "learning_rate": 0.00022072514155160424, + "loss": 0.4736, + "step": 145790 + }, + { + "epoch": 7.241482070130128, + "grad_norm": 0.1357421875, + "learning_rate": 0.00022068540776795472, + "loss": 0.4937, + "step": 145800 + }, + { + "epoch": 7.241978742425747, + "grad_norm": 0.1328125, + "learning_rate": 0.00022064567398430516, + "loss": 0.4887, + "step": 145810 + }, + { + "epoch": 7.242475414721367, + "grad_norm": 0.1513671875, + "learning_rate": 0.00022060594020065563, + "loss": 0.5057, + "step": 145820 + }, + { + "epoch": 7.2429720870169865, + "grad_norm": 0.1416015625, + "learning_rate": 0.00022056620641700608, + "loss": 0.4826, + "step": 145830 + }, + { + "epoch": 7.243468759312606, + "grad_norm": 0.1494140625, + "learning_rate": 0.00022052647263335652, + "loss": 0.4793, + "step": 145840 + }, + { + "epoch": 7.243965431608225, + "grad_norm": 0.150390625, + "learning_rate": 0.000220486738849707, + "loss": 0.4725, + "step": 145850 + }, + { + "epoch": 7.244462103903844, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002204470050660574, + "loss": 0.5033, + "step": 145860 + }, + { + "epoch": 7.244958776199463, + "grad_norm": 0.1337890625, + "learning_rate": 0.00022040727128240785, + "loss": 0.4814, + "step": 145870 + }, + { + "epoch": 7.245455448495083, + "grad_norm": 0.1455078125, + "learning_rate": 0.00022036753749875832, + "loss": 0.4836, + "step": 145880 + }, + { + "epoch": 7.245952120790703, + "grad_norm": 0.1298828125, + "learning_rate": 0.00022032780371510877, + "loss": 0.4845, + "step": 145890 + }, + { + "epoch": 7.246448793086322, + "grad_norm": 0.142578125, + "learning_rate": 0.00022028806993145924, + "loss": 0.4973, + "step": 145900 + }, + { + "epoch": 7.246945465381941, + "grad_norm": 0.13671875, + "learning_rate": 0.00022024833614780968, + "loss": 0.4808, + "step": 145910 + }, + { + "epoch": 7.24744213767756, + "grad_norm": 0.1513671875, + "learning_rate": 0.00022020860236416013, + "loss": 0.494, + "step": 145920 + }, + { + "epoch": 7.24793880997318, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002201688685805106, + "loss": 0.4976, + "step": 145930 + }, + { + "epoch": 7.248435482268799, + "grad_norm": 0.146484375, + "learning_rate": 0.00022012913479686104, + "loss": 0.4904, + "step": 145940 + }, + { + "epoch": 7.248932154564418, + "grad_norm": 0.1611328125, + "learning_rate": 0.00022008940101321146, + "loss": 0.4855, + "step": 145950 + }, + { + "epoch": 7.249428826860038, + "grad_norm": 0.1376953125, + "learning_rate": 0.00022004966722956196, + "loss": 0.4814, + "step": 145960 + }, + { + "epoch": 7.249925499155657, + "grad_norm": 0.142578125, + "learning_rate": 0.00022000993344591238, + "loss": 0.4844, + "step": 145970 + }, + { + "epoch": 7.250422171451277, + "grad_norm": 0.1630859375, + "learning_rate": 0.00021997019966226287, + "loss": 0.4592, + "step": 145980 + }, + { + "epoch": 7.250918843746896, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002199304658786133, + "loss": 0.4822, + "step": 145990 + }, + { + "epoch": 7.251415516042515, + "grad_norm": 0.1357421875, + "learning_rate": 0.00021989073209496374, + "loss": 0.5044, + "step": 146000 + }, + { + "epoch": 7.251912188338134, + "grad_norm": 0.12890625, + "learning_rate": 0.0002198509983113142, + "loss": 0.4562, + "step": 146010 + }, + { + "epoch": 7.252408860633754, + "grad_norm": 0.1484375, + "learning_rate": 0.00021981126452766465, + "loss": 0.4825, + "step": 146020 + }, + { + "epoch": 7.252905532929373, + "grad_norm": 0.146484375, + "learning_rate": 0.00021977153074401512, + "loss": 0.4722, + "step": 146030 + }, + { + "epoch": 7.253402205224993, + "grad_norm": 0.140625, + "learning_rate": 0.00021973179696036557, + "loss": 0.5035, + "step": 146040 + }, + { + "epoch": 7.253898877520612, + "grad_norm": 0.1279296875, + "learning_rate": 0.000219692063176716, + "loss": 0.4719, + "step": 146050 + }, + { + "epoch": 7.254395549816231, + "grad_norm": 0.15625, + "learning_rate": 0.00021965232939306648, + "loss": 0.471, + "step": 146060 + }, + { + "epoch": 7.254892222111851, + "grad_norm": 0.16015625, + "learning_rate": 0.00021961259560941693, + "loss": 0.4632, + "step": 146070 + }, + { + "epoch": 7.25538889440747, + "grad_norm": 0.1826171875, + "learning_rate": 0.00021957286182576734, + "loss": 0.4776, + "step": 146080 + }, + { + "epoch": 7.255885566703089, + "grad_norm": 0.158203125, + "learning_rate": 0.00021953312804211784, + "loss": 0.4698, + "step": 146090 + }, + { + "epoch": 7.256382238998708, + "grad_norm": 0.1298828125, + "learning_rate": 0.00021949339425846826, + "loss": 0.4762, + "step": 146100 + }, + { + "epoch": 7.256878911294328, + "grad_norm": 0.1455078125, + "learning_rate": 0.00021945366047481876, + "loss": 0.4938, + "step": 146110 + }, + { + "epoch": 7.257375583589948, + "grad_norm": 0.142578125, + "learning_rate": 0.00021941392669116918, + "loss": 0.4769, + "step": 146120 + }, + { + "epoch": 7.257872255885567, + "grad_norm": 0.1708984375, + "learning_rate": 0.00021937419290751962, + "loss": 0.5028, + "step": 146130 + }, + { + "epoch": 7.258368928181186, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002193344591238701, + "loss": 0.5018, + "step": 146140 + }, + { + "epoch": 7.258865600476805, + "grad_norm": 0.1533203125, + "learning_rate": 0.00021929472534022054, + "loss": 0.4779, + "step": 146150 + }, + { + "epoch": 7.2593622727724245, + "grad_norm": 0.142578125, + "learning_rate": 0.00021925499155657095, + "loss": 0.4888, + "step": 146160 + }, + { + "epoch": 7.259858945068044, + "grad_norm": 0.1455078125, + "learning_rate": 0.00021921525777292145, + "loss": 0.5051, + "step": 146170 + }, + { + "epoch": 7.260355617363664, + "grad_norm": 0.1416015625, + "learning_rate": 0.00021917552398927187, + "loss": 0.5003, + "step": 146180 + }, + { + "epoch": 7.260852289659283, + "grad_norm": 0.134765625, + "learning_rate": 0.00021913579020562237, + "loss": 0.48, + "step": 146190 + }, + { + "epoch": 7.261348961954902, + "grad_norm": 0.16796875, + "learning_rate": 0.00021909605642197278, + "loss": 0.493, + "step": 146200 + }, + { + "epoch": 7.2618456342505215, + "grad_norm": 0.1396484375, + "learning_rate": 0.00021905632263832323, + "loss": 0.4842, + "step": 146210 + }, + { + "epoch": 7.262342306546141, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002190165888546737, + "loss": 0.493, + "step": 146220 + }, + { + "epoch": 7.26283897884176, + "grad_norm": 0.12353515625, + "learning_rate": 0.00021897685507102414, + "loss": 0.4798, + "step": 146230 + }, + { + "epoch": 7.263335651137379, + "grad_norm": 0.140625, + "learning_rate": 0.0002189371212873746, + "loss": 0.4905, + "step": 146240 + }, + { + "epoch": 7.2638323234329985, + "grad_norm": 0.1513671875, + "learning_rate": 0.00021889738750372506, + "loss": 0.4747, + "step": 146250 + }, + { + "epoch": 7.264328995728619, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002188576537200755, + "loss": 0.4507, + "step": 146260 + }, + { + "epoch": 7.264825668024238, + "grad_norm": 0.15234375, + "learning_rate": 0.00021881791993642598, + "loss": 0.4637, + "step": 146270 + }, + { + "epoch": 7.265322340319857, + "grad_norm": 0.140625, + "learning_rate": 0.00021877818615277642, + "loss": 0.4874, + "step": 146280 + }, + { + "epoch": 7.265819012615476, + "grad_norm": 0.1533203125, + "learning_rate": 0.00021873845236912684, + "loss": 0.5362, + "step": 146290 + }, + { + "epoch": 7.2663156849110955, + "grad_norm": 0.138671875, + "learning_rate": 0.00021869871858547733, + "loss": 0.477, + "step": 146300 + }, + { + "epoch": 7.266812357206715, + "grad_norm": 0.181640625, + "learning_rate": 0.00021865898480182775, + "loss": 0.4905, + "step": 146310 + }, + { + "epoch": 7.267309029502334, + "grad_norm": 0.1689453125, + "learning_rate": 0.0002186192510181782, + "loss": 0.4728, + "step": 146320 + }, + { + "epoch": 7.267805701797954, + "grad_norm": 0.13671875, + "learning_rate": 0.00021857951723452867, + "loss": 0.4751, + "step": 146330 + }, + { + "epoch": 7.268302374093573, + "grad_norm": 0.181640625, + "learning_rate": 0.0002185397834508791, + "loss": 0.5212, + "step": 146340 + }, + { + "epoch": 7.2687990463891925, + "grad_norm": 0.146484375, + "learning_rate": 0.00021850004966722958, + "loss": 0.4893, + "step": 146350 + }, + { + "epoch": 7.269295718684812, + "grad_norm": 0.1376953125, + "learning_rate": 0.00021846031588358003, + "loss": 0.4892, + "step": 146360 + }, + { + "epoch": 7.269792390980431, + "grad_norm": 0.12890625, + "learning_rate": 0.00021842058209993047, + "loss": 0.4533, + "step": 146370 + }, + { + "epoch": 7.27028906327605, + "grad_norm": 0.1513671875, + "learning_rate": 0.00021838084831628094, + "loss": 0.4917, + "step": 146380 + }, + { + "epoch": 7.270785735571669, + "grad_norm": 0.150390625, + "learning_rate": 0.0002183411145326314, + "loss": 0.4851, + "step": 146390 + }, + { + "epoch": 7.2712824078672895, + "grad_norm": 0.21875, + "learning_rate": 0.00021830138074898186, + "loss": 0.5024, + "step": 146400 + }, + { + "epoch": 7.271779080162909, + "grad_norm": 0.185546875, + "learning_rate": 0.00021826164696533228, + "loss": 0.5082, + "step": 146410 + }, + { + "epoch": 7.272275752458528, + "grad_norm": 0.2041015625, + "learning_rate": 0.00021822191318168272, + "loss": 0.465, + "step": 146420 + }, + { + "epoch": 7.272772424754147, + "grad_norm": 0.140625, + "learning_rate": 0.0002181821793980332, + "loss": 0.496, + "step": 146430 + }, + { + "epoch": 7.273269097049766, + "grad_norm": 0.1318359375, + "learning_rate": 0.00021814244561438364, + "loss": 0.4859, + "step": 146440 + }, + { + "epoch": 7.273765769345386, + "grad_norm": 0.1396484375, + "learning_rate": 0.00021810271183073408, + "loss": 0.502, + "step": 146450 + }, + { + "epoch": 7.274262441641005, + "grad_norm": 0.142578125, + "learning_rate": 0.00021806297804708455, + "loss": 0.5207, + "step": 146460 + }, + { + "epoch": 7.274759113936625, + "grad_norm": 0.15625, + "learning_rate": 0.000218023244263435, + "loss": 0.4643, + "step": 146470 + }, + { + "epoch": 7.275255786232244, + "grad_norm": 0.134765625, + "learning_rate": 0.00021798351047978547, + "loss": 0.4876, + "step": 146480 + }, + { + "epoch": 7.2757524585278635, + "grad_norm": 0.15234375, + "learning_rate": 0.0002179437766961359, + "loss": 0.4683, + "step": 146490 + }, + { + "epoch": 7.276249130823483, + "grad_norm": 0.166015625, + "learning_rate": 0.00021790404291248633, + "loss": 0.4881, + "step": 146500 + }, + { + "epoch": 7.276745803119102, + "grad_norm": 0.1455078125, + "learning_rate": 0.00021786430912883683, + "loss": 0.4642, + "step": 146510 + }, + { + "epoch": 7.277242475414721, + "grad_norm": 0.1455078125, + "learning_rate": 0.00021782457534518724, + "loss": 0.4739, + "step": 146520 + }, + { + "epoch": 7.27773914771034, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002177848415615377, + "loss": 0.467, + "step": 146530 + }, + { + "epoch": 7.2782358200059605, + "grad_norm": 0.1826171875, + "learning_rate": 0.00021774510777788816, + "loss": 0.5216, + "step": 146540 + }, + { + "epoch": 7.27873249230158, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002177053739942386, + "loss": 0.5084, + "step": 146550 + }, + { + "epoch": 7.279229164597199, + "grad_norm": 0.1630859375, + "learning_rate": 0.00021766564021058908, + "loss": 0.5143, + "step": 146560 + }, + { + "epoch": 7.279725836892818, + "grad_norm": 0.1513671875, + "learning_rate": 0.00021762590642693952, + "loss": 0.519, + "step": 146570 + }, + { + "epoch": 7.280222509188437, + "grad_norm": 0.1435546875, + "learning_rate": 0.00021758617264328996, + "loss": 0.4847, + "step": 146580 + }, + { + "epoch": 7.280719181484057, + "grad_norm": 0.14453125, + "learning_rate": 0.00021754643885964043, + "loss": 0.4794, + "step": 146590 + }, + { + "epoch": 7.281215853779676, + "grad_norm": 0.1328125, + "learning_rate": 0.00021750670507599088, + "loss": 0.4951, + "step": 146600 + }, + { + "epoch": 7.281712526075296, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002174669712923413, + "loss": 0.5, + "step": 146610 + }, + { + "epoch": 7.282209198370915, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002174272375086918, + "loss": 0.4903, + "step": 146620 + }, + { + "epoch": 7.282705870666534, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002173875037250422, + "loss": 0.4716, + "step": 146630 + }, + { + "epoch": 7.283202542962154, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002173477699413927, + "loss": 0.4708, + "step": 146640 + }, + { + "epoch": 7.283699215257773, + "grad_norm": 0.140625, + "learning_rate": 0.00021730803615774313, + "loss": 0.5213, + "step": 146650 + }, + { + "epoch": 7.284195887553392, + "grad_norm": 0.1630859375, + "learning_rate": 0.00021726830237409357, + "loss": 0.5167, + "step": 146660 + }, + { + "epoch": 7.284692559849011, + "grad_norm": 0.140625, + "learning_rate": 0.00021722856859044404, + "loss": 0.4967, + "step": 146670 + }, + { + "epoch": 7.285189232144631, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002171888348067945, + "loss": 0.5061, + "step": 146680 + }, + { + "epoch": 7.285685904440251, + "grad_norm": 0.142578125, + "learning_rate": 0.00021714910102314493, + "loss": 0.4926, + "step": 146690 + }, + { + "epoch": 7.28618257673587, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002171093672394954, + "loss": 0.4887, + "step": 146700 + }, + { + "epoch": 7.286679249031489, + "grad_norm": 0.1572265625, + "learning_rate": 0.00021706963345584582, + "loss": 0.5073, + "step": 146710 + }, + { + "epoch": 7.287175921327108, + "grad_norm": 0.16015625, + "learning_rate": 0.00021702989967219632, + "loss": 0.4765, + "step": 146720 + }, + { + "epoch": 7.287672593622728, + "grad_norm": 0.15234375, + "learning_rate": 0.00021699016588854674, + "loss": 0.4991, + "step": 146730 + }, + { + "epoch": 7.288169265918347, + "grad_norm": 0.140625, + "learning_rate": 0.00021695043210489718, + "loss": 0.4757, + "step": 146740 + }, + { + "epoch": 7.288665938213966, + "grad_norm": 0.15625, + "learning_rate": 0.00021691069832124765, + "loss": 0.4766, + "step": 146750 + }, + { + "epoch": 7.289162610509586, + "grad_norm": 0.146484375, + "learning_rate": 0.0002168709645375981, + "loss": 0.5002, + "step": 146760 + }, + { + "epoch": 7.289659282805205, + "grad_norm": 0.1357421875, + "learning_rate": 0.00021683123075394854, + "loss": 0.4682, + "step": 146770 + }, + { + "epoch": 7.290155955100825, + "grad_norm": 0.1513671875, + "learning_rate": 0.000216791496970299, + "loss": 0.4651, + "step": 146780 + }, + { + "epoch": 7.290652627396444, + "grad_norm": 0.1376953125, + "learning_rate": 0.00021675176318664946, + "loss": 0.4819, + "step": 146790 + }, + { + "epoch": 7.291149299692063, + "grad_norm": 0.1357421875, + "learning_rate": 0.00021671202940299993, + "loss": 0.4793, + "step": 146800 + }, + { + "epoch": 7.291645971987682, + "grad_norm": 0.185546875, + "learning_rate": 0.00021667229561935037, + "loss": 0.4844, + "step": 146810 + }, + { + "epoch": 7.2921426442833015, + "grad_norm": 0.154296875, + "learning_rate": 0.0002166325618357008, + "loss": 0.4949, + "step": 146820 + }, + { + "epoch": 7.292639316578922, + "grad_norm": 0.1484375, + "learning_rate": 0.0002165928280520513, + "loss": 0.4655, + "step": 146830 + }, + { + "epoch": 7.293135988874541, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002165530942684017, + "loss": 0.4366, + "step": 146840 + }, + { + "epoch": 7.29363266117016, + "grad_norm": 0.134765625, + "learning_rate": 0.0002165133604847522, + "loss": 0.4994, + "step": 146850 + }, + { + "epoch": 7.294129333465779, + "grad_norm": 0.14453125, + "learning_rate": 0.00021647362670110262, + "loss": 0.4936, + "step": 146860 + }, + { + "epoch": 7.2946260057613985, + "grad_norm": 0.14453125, + "learning_rate": 0.00021643389291745306, + "loss": 0.4645, + "step": 146870 + }, + { + "epoch": 7.295122678057018, + "grad_norm": 0.1630859375, + "learning_rate": 0.00021639415913380354, + "loss": 0.4965, + "step": 146880 + }, + { + "epoch": 7.295619350352637, + "grad_norm": 0.1455078125, + "learning_rate": 0.00021635442535015398, + "loss": 0.5206, + "step": 146890 + }, + { + "epoch": 7.296116022648257, + "grad_norm": 0.1416015625, + "learning_rate": 0.00021631469156650442, + "loss": 0.4899, + "step": 146900 + }, + { + "epoch": 7.296612694943876, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002162749577828549, + "loss": 0.4816, + "step": 146910 + }, + { + "epoch": 7.2971093672394955, + "grad_norm": 0.142578125, + "learning_rate": 0.00021623522399920534, + "loss": 0.4872, + "step": 146920 + }, + { + "epoch": 7.297606039535115, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002161954902155558, + "loss": 0.5007, + "step": 146930 + }, + { + "epoch": 7.298102711830734, + "grad_norm": 0.15625, + "learning_rate": 0.00021615575643190625, + "loss": 0.4545, + "step": 146940 + }, + { + "epoch": 7.298599384126353, + "grad_norm": 0.1572265625, + "learning_rate": 0.00021611602264825667, + "loss": 0.484, + "step": 146950 + }, + { + "epoch": 7.2990960564219725, + "grad_norm": 0.1474609375, + "learning_rate": 0.00021607628886460717, + "loss": 0.4755, + "step": 146960 + }, + { + "epoch": 7.299592728717592, + "grad_norm": 0.169921875, + "learning_rate": 0.0002160365550809576, + "loss": 0.4786, + "step": 146970 + }, + { + "epoch": 7.300089401013212, + "grad_norm": 0.1533203125, + "learning_rate": 0.00021599682129730803, + "loss": 0.4916, + "step": 146980 + }, + { + "epoch": 7.300586073308831, + "grad_norm": 0.158203125, + "learning_rate": 0.0002159570875136585, + "loss": 0.4628, + "step": 146990 + }, + { + "epoch": 7.30108274560445, + "grad_norm": 0.1474609375, + "learning_rate": 0.00021591735373000895, + "loss": 0.4911, + "step": 147000 + }, + { + "epoch": 7.3015794179000695, + "grad_norm": 0.134765625, + "learning_rate": 0.00021587761994635942, + "loss": 0.4863, + "step": 147010 + }, + { + "epoch": 7.302076090195689, + "grad_norm": 0.138671875, + "learning_rate": 0.00021583788616270986, + "loss": 0.4797, + "step": 147020 + }, + { + "epoch": 7.302572762491308, + "grad_norm": 0.1484375, + "learning_rate": 0.00021579815237906028, + "loss": 0.4951, + "step": 147030 + }, + { + "epoch": 7.303069434786927, + "grad_norm": 0.1630859375, + "learning_rate": 0.00021575841859541078, + "loss": 0.5233, + "step": 147040 + }, + { + "epoch": 7.303566107082547, + "grad_norm": 0.134765625, + "learning_rate": 0.0002157186848117612, + "loss": 0.4678, + "step": 147050 + }, + { + "epoch": 7.3040627793781665, + "grad_norm": 0.14453125, + "learning_rate": 0.00021567895102811164, + "loss": 0.4642, + "step": 147060 + }, + { + "epoch": 7.304559451673786, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002156392172444621, + "loss": 0.5134, + "step": 147070 + }, + { + "epoch": 7.305056123969405, + "grad_norm": 0.15234375, + "learning_rate": 0.00021559948346081256, + "loss": 0.5011, + "step": 147080 + }, + { + "epoch": 7.305552796265024, + "grad_norm": 0.142578125, + "learning_rate": 0.00021555974967716303, + "loss": 0.5014, + "step": 147090 + }, + { + "epoch": 7.306049468560643, + "grad_norm": 0.1689453125, + "learning_rate": 0.00021552001589351347, + "loss": 0.4747, + "step": 147100 + }, + { + "epoch": 7.306546140856263, + "grad_norm": 0.140625, + "learning_rate": 0.00021548028210986392, + "loss": 0.4914, + "step": 147110 + }, + { + "epoch": 7.307042813151883, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002154405483262144, + "loss": 0.4917, + "step": 147120 + }, + { + "epoch": 7.307539485447502, + "grad_norm": 0.181640625, + "learning_rate": 0.00021540081454256483, + "loss": 0.4962, + "step": 147130 + }, + { + "epoch": 7.308036157743121, + "grad_norm": 0.1337890625, + "learning_rate": 0.00021536108075891525, + "loss": 0.5183, + "step": 147140 + }, + { + "epoch": 7.30853283003874, + "grad_norm": 0.1474609375, + "learning_rate": 0.00021532134697526575, + "loss": 0.4937, + "step": 147150 + }, + { + "epoch": 7.30902950233436, + "grad_norm": 0.1494140625, + "learning_rate": 0.00021528161319161616, + "loss": 0.5082, + "step": 147160 + }, + { + "epoch": 7.309526174629979, + "grad_norm": 0.150390625, + "learning_rate": 0.00021524187940796666, + "loss": 0.4975, + "step": 147170 + }, + { + "epoch": 7.310022846925598, + "grad_norm": 0.1455078125, + "learning_rate": 0.00021520214562431708, + "loss": 0.4993, + "step": 147180 + }, + { + "epoch": 7.310519519221218, + "grad_norm": 0.140625, + "learning_rate": 0.00021516241184066752, + "loss": 0.4736, + "step": 147190 + }, + { + "epoch": 7.3110161915168375, + "grad_norm": 0.134765625, + "learning_rate": 0.000215122678057018, + "loss": 0.4645, + "step": 147200 + }, + { + "epoch": 7.311512863812457, + "grad_norm": 0.162109375, + "learning_rate": 0.00021508294427336844, + "loss": 0.4641, + "step": 147210 + }, + { + "epoch": 7.312009536108076, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002150432104897189, + "loss": 0.4738, + "step": 147220 + }, + { + "epoch": 7.312506208403695, + "grad_norm": 0.181640625, + "learning_rate": 0.00021500347670606935, + "loss": 0.5394, + "step": 147230 + }, + { + "epoch": 7.313002880699314, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002149637429224198, + "loss": 0.4848, + "step": 147240 + }, + { + "epoch": 7.313499552994934, + "grad_norm": 0.1591796875, + "learning_rate": 0.00021492400913877027, + "loss": 0.4947, + "step": 147250 + }, + { + "epoch": 7.313996225290554, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002148842753551207, + "loss": 0.4941, + "step": 147260 + }, + { + "epoch": 7.314492897586173, + "grad_norm": 0.1328125, + "learning_rate": 0.00021484454157147113, + "loss": 0.484, + "step": 147270 + }, + { + "epoch": 7.314989569881792, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002148048077878216, + "loss": 0.496, + "step": 147280 + }, + { + "epoch": 7.315486242177411, + "grad_norm": 0.134765625, + "learning_rate": 0.00021476507400417205, + "loss": 0.4739, + "step": 147290 + }, + { + "epoch": 7.315982914473031, + "grad_norm": 0.1337890625, + "learning_rate": 0.00021472534022052252, + "loss": 0.5054, + "step": 147300 + }, + { + "epoch": 7.31647958676865, + "grad_norm": 0.1416015625, + "learning_rate": 0.00021468560643687296, + "loss": 0.4693, + "step": 147310 + }, + { + "epoch": 7.316976259064269, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002146458726532234, + "loss": 0.4808, + "step": 147320 + }, + { + "epoch": 7.317472931359889, + "grad_norm": 0.15234375, + "learning_rate": 0.00021460613886957388, + "loss": 0.488, + "step": 147330 + }, + { + "epoch": 7.317969603655508, + "grad_norm": 0.1455078125, + "learning_rate": 0.00021456640508592432, + "loss": 0.5135, + "step": 147340 + }, + { + "epoch": 7.318466275951128, + "grad_norm": 0.1826171875, + "learning_rate": 0.00021452667130227474, + "loss": 0.4665, + "step": 147350 + }, + { + "epoch": 7.318962948246747, + "grad_norm": 0.1572265625, + "learning_rate": 0.00021448693751862524, + "loss": 0.4793, + "step": 147360 + }, + { + "epoch": 7.319459620542366, + "grad_norm": 0.1396484375, + "learning_rate": 0.00021444720373497566, + "loss": 0.5066, + "step": 147370 + }, + { + "epoch": 7.319956292837985, + "grad_norm": 0.171875, + "learning_rate": 0.00021440746995132615, + "loss": 0.4914, + "step": 147380 + }, + { + "epoch": 7.3204529651336046, + "grad_norm": 0.140625, + "learning_rate": 0.00021436773616767657, + "loss": 0.487, + "step": 147390 + }, + { + "epoch": 7.320949637429225, + "grad_norm": 0.14453125, + "learning_rate": 0.00021432800238402702, + "loss": 0.501, + "step": 147400 + }, + { + "epoch": 7.321446309724844, + "grad_norm": 0.16015625, + "learning_rate": 0.0002142882686003775, + "loss": 0.4784, + "step": 147410 + }, + { + "epoch": 7.321942982020463, + "grad_norm": 0.150390625, + "learning_rate": 0.00021424853481672793, + "loss": 0.5204, + "step": 147420 + }, + { + "epoch": 7.322439654316082, + "grad_norm": 0.150390625, + "learning_rate": 0.00021420880103307838, + "loss": 0.4816, + "step": 147430 + }, + { + "epoch": 7.322936326611702, + "grad_norm": 0.150390625, + "learning_rate": 0.00021416906724942885, + "loss": 0.4802, + "step": 147440 + }, + { + "epoch": 7.323432998907321, + "grad_norm": 0.1484375, + "learning_rate": 0.0002141293334657793, + "loss": 0.4924, + "step": 147450 + }, + { + "epoch": 7.32392967120294, + "grad_norm": 0.1416015625, + "learning_rate": 0.00021408959968212976, + "loss": 0.4798, + "step": 147460 + }, + { + "epoch": 7.324426343498559, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002140498658984802, + "loss": 0.4849, + "step": 147470 + }, + { + "epoch": 7.324923015794179, + "grad_norm": 0.134765625, + "learning_rate": 0.00021401013211483062, + "loss": 0.4835, + "step": 147480 + }, + { + "epoch": 7.325419688089799, + "grad_norm": 0.1396484375, + "learning_rate": 0.00021397039833118112, + "loss": 0.4794, + "step": 147490 + }, + { + "epoch": 7.325916360385418, + "grad_norm": 0.1552734375, + "learning_rate": 0.00021393066454753154, + "loss": 0.4891, + "step": 147500 + }, + { + "epoch": 7.326413032681037, + "grad_norm": 0.1708984375, + "learning_rate": 0.00021389093076388198, + "loss": 0.4946, + "step": 147510 + }, + { + "epoch": 7.326909704976656, + "grad_norm": 0.1640625, + "learning_rate": 0.00021385119698023246, + "loss": 0.509, + "step": 147520 + }, + { + "epoch": 7.3274063772722755, + "grad_norm": 0.15234375, + "learning_rate": 0.0002138114631965829, + "loss": 0.4729, + "step": 147530 + }, + { + "epoch": 7.327903049567895, + "grad_norm": 0.1787109375, + "learning_rate": 0.00021377172941293337, + "loss": 0.4801, + "step": 147540 + }, + { + "epoch": 7.328399721863515, + "grad_norm": 0.162109375, + "learning_rate": 0.00021373199562928381, + "loss": 0.5032, + "step": 147550 + }, + { + "epoch": 7.328896394159134, + "grad_norm": 0.1435546875, + "learning_rate": 0.00021369226184563423, + "loss": 0.4887, + "step": 147560 + }, + { + "epoch": 7.329393066454753, + "grad_norm": 0.1328125, + "learning_rate": 0.00021365252806198473, + "loss": 0.4697, + "step": 147570 + }, + { + "epoch": 7.3298897387503725, + "grad_norm": 0.1396484375, + "learning_rate": 0.00021361279427833515, + "loss": 0.4832, + "step": 147580 + }, + { + "epoch": 7.330386411045992, + "grad_norm": 0.12890625, + "learning_rate": 0.0002135730604946856, + "loss": 0.5004, + "step": 147590 + }, + { + "epoch": 7.330883083341611, + "grad_norm": 0.140625, + "learning_rate": 0.00021353332671103606, + "loss": 0.5154, + "step": 147600 + }, + { + "epoch": 7.33137975563723, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002134935929273865, + "loss": 0.4936, + "step": 147610 + }, + { + "epoch": 7.331876427932849, + "grad_norm": 0.1474609375, + "learning_rate": 0.00021345385914373698, + "loss": 0.4782, + "step": 147620 + }, + { + "epoch": 7.3323731002284696, + "grad_norm": 0.13671875, + "learning_rate": 0.00021341412536008742, + "loss": 0.4783, + "step": 147630 + }, + { + "epoch": 7.332869772524089, + "grad_norm": 0.1484375, + "learning_rate": 0.00021337439157643787, + "loss": 0.4588, + "step": 147640 + }, + { + "epoch": 7.333366444819708, + "grad_norm": 0.1396484375, + "learning_rate": 0.00021333465779278834, + "loss": 0.4956, + "step": 147650 + }, + { + "epoch": 7.333863117115327, + "grad_norm": 0.1552734375, + "learning_rate": 0.00021329492400913878, + "loss": 0.4903, + "step": 147660 + }, + { + "epoch": 7.3343597894109465, + "grad_norm": 0.1474609375, + "learning_rate": 0.00021325519022548925, + "loss": 0.4925, + "step": 147670 + }, + { + "epoch": 7.334856461706566, + "grad_norm": 0.140625, + "learning_rate": 0.0002132154564418397, + "loss": 0.4826, + "step": 147680 + }, + { + "epoch": 7.335353134002185, + "grad_norm": 0.1484375, + "learning_rate": 0.00021317572265819012, + "loss": 0.5, + "step": 147690 + }, + { + "epoch": 7.335849806297805, + "grad_norm": 0.1357421875, + "learning_rate": 0.00021313598887454061, + "loss": 0.4536, + "step": 147700 + }, + { + "epoch": 7.336346478593424, + "grad_norm": 0.158203125, + "learning_rate": 0.00021309625509089103, + "loss": 0.4697, + "step": 147710 + }, + { + "epoch": 7.3368431508890435, + "grad_norm": 0.1318359375, + "learning_rate": 0.00021305652130724148, + "loss": 0.4588, + "step": 147720 + }, + { + "epoch": 7.337339823184663, + "grad_norm": 0.142578125, + "learning_rate": 0.00021301678752359195, + "loss": 0.4895, + "step": 147730 + }, + { + "epoch": 7.337836495480282, + "grad_norm": 0.14453125, + "learning_rate": 0.0002129770537399424, + "loss": 0.4911, + "step": 147740 + }, + { + "epoch": 7.338333167775901, + "grad_norm": 0.193359375, + "learning_rate": 0.00021293731995629286, + "loss": 0.4846, + "step": 147750 + }, + { + "epoch": 7.33882984007152, + "grad_norm": 0.1318359375, + "learning_rate": 0.0002128975861726433, + "loss": 0.4762, + "step": 147760 + }, + { + "epoch": 7.3393265123671405, + "grad_norm": 0.1376953125, + "learning_rate": 0.00021285785238899375, + "loss": 0.4769, + "step": 147770 + }, + { + "epoch": 7.33982318466276, + "grad_norm": 0.14453125, + "learning_rate": 0.00021281811860534422, + "loss": 0.496, + "step": 147780 + }, + { + "epoch": 7.340319856958379, + "grad_norm": 0.146484375, + "learning_rate": 0.00021277838482169467, + "loss": 0.4716, + "step": 147790 + }, + { + "epoch": 7.340816529253998, + "grad_norm": 0.1484375, + "learning_rate": 0.00021273865103804508, + "loss": 0.4979, + "step": 147800 + }, + { + "epoch": 7.341313201549617, + "grad_norm": 0.146484375, + "learning_rate": 0.00021269891725439558, + "loss": 0.4991, + "step": 147810 + }, + { + "epoch": 7.341809873845237, + "grad_norm": 0.14453125, + "learning_rate": 0.000212659183470746, + "loss": 0.5062, + "step": 147820 + }, + { + "epoch": 7.342306546140856, + "grad_norm": 0.140625, + "learning_rate": 0.00021261944968709647, + "loss": 0.5022, + "step": 147830 + }, + { + "epoch": 7.342803218436476, + "grad_norm": 0.1494140625, + "learning_rate": 0.00021257971590344691, + "loss": 0.5222, + "step": 147840 + }, + { + "epoch": 7.343299890732095, + "grad_norm": 0.138671875, + "learning_rate": 0.00021253998211979736, + "loss": 0.4817, + "step": 147850 + }, + { + "epoch": 7.343796563027714, + "grad_norm": 0.1591796875, + "learning_rate": 0.00021250024833614783, + "loss": 0.4652, + "step": 147860 + }, + { + "epoch": 7.344293235323334, + "grad_norm": 0.14453125, + "learning_rate": 0.00021246051455249827, + "loss": 0.4908, + "step": 147870 + }, + { + "epoch": 7.344789907618953, + "grad_norm": 0.14453125, + "learning_rate": 0.0002124207807688487, + "loss": 0.4937, + "step": 147880 + }, + { + "epoch": 7.345286579914572, + "grad_norm": 0.1259765625, + "learning_rate": 0.0002123810469851992, + "loss": 0.4813, + "step": 147890 + }, + { + "epoch": 7.345783252210191, + "grad_norm": 0.150390625, + "learning_rate": 0.0002123413132015496, + "loss": 0.4756, + "step": 147900 + }, + { + "epoch": 7.3462799245058115, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002123015794179001, + "loss": 0.4978, + "step": 147910 + }, + { + "epoch": 7.346776596801431, + "grad_norm": 0.158203125, + "learning_rate": 0.00021226184563425052, + "loss": 0.4986, + "step": 147920 + }, + { + "epoch": 7.34727326909705, + "grad_norm": 0.134765625, + "learning_rate": 0.00021222211185060097, + "loss": 0.4819, + "step": 147930 + }, + { + "epoch": 7.347769941392669, + "grad_norm": 0.1513671875, + "learning_rate": 0.00021218237806695144, + "loss": 0.4643, + "step": 147940 + }, + { + "epoch": 7.348266613688288, + "grad_norm": 0.1474609375, + "learning_rate": 0.00021214264428330188, + "loss": 0.4582, + "step": 147950 + }, + { + "epoch": 7.348763285983908, + "grad_norm": 0.1484375, + "learning_rate": 0.00021210291049965233, + "loss": 0.4971, + "step": 147960 + }, + { + "epoch": 7.349259958279527, + "grad_norm": 0.1611328125, + "learning_rate": 0.0002120631767160028, + "loss": 0.4929, + "step": 147970 + }, + { + "epoch": 7.349756630575147, + "grad_norm": 0.1767578125, + "learning_rate": 0.00021202344293235324, + "loss": 0.4662, + "step": 147980 + }, + { + "epoch": 7.350253302870766, + "grad_norm": 0.12890625, + "learning_rate": 0.00021198370914870371, + "loss": 0.4468, + "step": 147990 + }, + { + "epoch": 7.350749975166385, + "grad_norm": 0.1435546875, + "learning_rate": 0.00021194397536505416, + "loss": 0.5046, + "step": 148000 + }, + { + "epoch": 7.351246647462005, + "grad_norm": 0.13671875, + "learning_rate": 0.00021190424158140458, + "loss": 0.4765, + "step": 148010 + }, + { + "epoch": 7.351743319757624, + "grad_norm": 0.1640625, + "learning_rate": 0.00021186450779775507, + "loss": 0.4758, + "step": 148020 + }, + { + "epoch": 7.352239992053243, + "grad_norm": 0.142578125, + "learning_rate": 0.0002118247740141055, + "loss": 0.4849, + "step": 148030 + }, + { + "epoch": 7.352736664348862, + "grad_norm": 0.1357421875, + "learning_rate": 0.00021178504023045594, + "loss": 0.4877, + "step": 148040 + }, + { + "epoch": 7.353233336644482, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002117453064468064, + "loss": 0.4915, + "step": 148050 + }, + { + "epoch": 7.353730008940102, + "grad_norm": 0.140625, + "learning_rate": 0.00021170557266315685, + "loss": 0.5046, + "step": 148060 + }, + { + "epoch": 7.354226681235721, + "grad_norm": 0.171875, + "learning_rate": 0.00021166583887950732, + "loss": 0.4907, + "step": 148070 + }, + { + "epoch": 7.35472335353134, + "grad_norm": 0.1484375, + "learning_rate": 0.00021162610509585777, + "loss": 0.4744, + "step": 148080 + }, + { + "epoch": 7.355220025826959, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002115863713122082, + "loss": 0.4648, + "step": 148090 + }, + { + "epoch": 7.355716698122579, + "grad_norm": 0.1533203125, + "learning_rate": 0.00021154663752855868, + "loss": 0.5027, + "step": 148100 + }, + { + "epoch": 7.356213370418198, + "grad_norm": 0.1455078125, + "learning_rate": 0.0002115069037449091, + "loss": 0.4799, + "step": 148110 + }, + { + "epoch": 7.356710042713817, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002114671699612596, + "loss": 0.4903, + "step": 148120 + }, + { + "epoch": 7.357206715009437, + "grad_norm": 0.1455078125, + "learning_rate": 0.00021142743617761002, + "loss": 0.4935, + "step": 148130 + }, + { + "epoch": 7.357703387305056, + "grad_norm": 0.154296875, + "learning_rate": 0.00021138770239396046, + "loss": 0.4986, + "step": 148140 + }, + { + "epoch": 7.358200059600676, + "grad_norm": 0.146484375, + "learning_rate": 0.00021134796861031093, + "loss": 0.4964, + "step": 148150 + }, + { + "epoch": 7.358696731896295, + "grad_norm": 0.1474609375, + "learning_rate": 0.00021130823482666137, + "loss": 0.4879, + "step": 148160 + }, + { + "epoch": 7.359193404191914, + "grad_norm": 0.1337890625, + "learning_rate": 0.00021126850104301182, + "loss": 0.5062, + "step": 148170 + }, + { + "epoch": 7.359690076487533, + "grad_norm": 0.138671875, + "learning_rate": 0.0002112287672593623, + "loss": 0.5151, + "step": 148180 + }, + { + "epoch": 7.3601867487831525, + "grad_norm": 0.158203125, + "learning_rate": 0.00021118903347571273, + "loss": 0.5111, + "step": 148190 + }, + { + "epoch": 7.360683421078773, + "grad_norm": 0.1328125, + "learning_rate": 0.0002111492996920632, + "loss": 0.5055, + "step": 148200 + }, + { + "epoch": 7.361180093374392, + "grad_norm": 0.154296875, + "learning_rate": 0.00021110956590841365, + "loss": 0.4919, + "step": 148210 + }, + { + "epoch": 7.361676765670011, + "grad_norm": 0.12255859375, + "learning_rate": 0.00021106983212476407, + "loss": 0.4405, + "step": 148220 + }, + { + "epoch": 7.36217343796563, + "grad_norm": 0.13671875, + "learning_rate": 0.00021103009834111457, + "loss": 0.4251, + "step": 148230 + }, + { + "epoch": 7.3626701102612495, + "grad_norm": 0.169921875, + "learning_rate": 0.00021099036455746498, + "loss": 0.5104, + "step": 148240 + }, + { + "epoch": 7.363166782556869, + "grad_norm": 0.1396484375, + "learning_rate": 0.00021095063077381543, + "loss": 0.4527, + "step": 148250 + }, + { + "epoch": 7.363663454852488, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002109108969901659, + "loss": 0.4996, + "step": 148260 + }, + { + "epoch": 7.364160127148108, + "grad_norm": 0.1416015625, + "learning_rate": 0.00021087116320651634, + "loss": 0.4782, + "step": 148270 + }, + { + "epoch": 7.364656799443727, + "grad_norm": 0.17578125, + "learning_rate": 0.00021083142942286681, + "loss": 0.5066, + "step": 148280 + }, + { + "epoch": 7.3651534717393465, + "grad_norm": 0.1416015625, + "learning_rate": 0.00021079169563921726, + "loss": 0.4703, + "step": 148290 + }, + { + "epoch": 7.365650144034966, + "grad_norm": 0.158203125, + "learning_rate": 0.0002107519618555677, + "loss": 0.4993, + "step": 148300 + }, + { + "epoch": 7.366146816330585, + "grad_norm": 0.1396484375, + "learning_rate": 0.00021071222807191817, + "loss": 0.4699, + "step": 148310 + }, + { + "epoch": 7.366643488626204, + "grad_norm": 0.16015625, + "learning_rate": 0.00021067249428826862, + "loss": 0.4557, + "step": 148320 + }, + { + "epoch": 7.367140160921823, + "grad_norm": 0.162109375, + "learning_rate": 0.00021063276050461904, + "loss": 0.4764, + "step": 148330 + }, + { + "epoch": 7.367636833217443, + "grad_norm": 0.1796875, + "learning_rate": 0.00021059302672096953, + "loss": 0.471, + "step": 148340 + }, + { + "epoch": 7.368133505513063, + "grad_norm": 0.14453125, + "learning_rate": 0.00021055329293731995, + "loss": 0.5214, + "step": 148350 + }, + { + "epoch": 7.368630177808682, + "grad_norm": 0.138671875, + "learning_rate": 0.00021051355915367045, + "loss": 0.4973, + "step": 148360 + }, + { + "epoch": 7.369126850104301, + "grad_norm": 0.1376953125, + "learning_rate": 0.00021047382537002087, + "loss": 0.4635, + "step": 148370 + }, + { + "epoch": 7.3696235223999205, + "grad_norm": 0.1884765625, + "learning_rate": 0.0002104340915863713, + "loss": 0.4943, + "step": 148380 + }, + { + "epoch": 7.37012019469554, + "grad_norm": 0.150390625, + "learning_rate": 0.00021039435780272178, + "loss": 0.5044, + "step": 148390 + }, + { + "epoch": 7.370616866991159, + "grad_norm": 0.1435546875, + "learning_rate": 0.00021035462401907223, + "loss": 0.492, + "step": 148400 + }, + { + "epoch": 7.371113539286778, + "grad_norm": 0.142578125, + "learning_rate": 0.00021031489023542264, + "loss": 0.4722, + "step": 148410 + }, + { + "epoch": 7.371610211582398, + "grad_norm": 0.14453125, + "learning_rate": 0.00021027515645177314, + "loss": 0.4814, + "step": 148420 + }, + { + "epoch": 7.3721068838780175, + "grad_norm": 0.1435546875, + "learning_rate": 0.00021023542266812356, + "loss": 0.4697, + "step": 148430 + }, + { + "epoch": 7.372603556173637, + "grad_norm": 0.1904296875, + "learning_rate": 0.00021019568888447406, + "loss": 0.4714, + "step": 148440 + }, + { + "epoch": 7.373100228469256, + "grad_norm": 0.15234375, + "learning_rate": 0.00021015595510082448, + "loss": 0.4722, + "step": 148450 + }, + { + "epoch": 7.373596900764875, + "grad_norm": 0.15234375, + "learning_rate": 0.00021011622131717492, + "loss": 0.4903, + "step": 148460 + }, + { + "epoch": 7.374093573060494, + "grad_norm": 0.150390625, + "learning_rate": 0.0002100764875335254, + "loss": 0.4621, + "step": 148470 + }, + { + "epoch": 7.374590245356114, + "grad_norm": 0.1640625, + "learning_rate": 0.00021003675374987583, + "loss": 0.4817, + "step": 148480 + }, + { + "epoch": 7.375086917651734, + "grad_norm": 0.1298828125, + "learning_rate": 0.0002099970199662263, + "loss": 0.5023, + "step": 148490 + }, + { + "epoch": 7.375583589947353, + "grad_norm": 0.166015625, + "learning_rate": 0.00020995728618257675, + "loss": 0.5172, + "step": 148500 + }, + { + "epoch": 7.376080262242972, + "grad_norm": 0.1787109375, + "learning_rate": 0.0002099175523989272, + "loss": 0.4956, + "step": 148510 + }, + { + "epoch": 7.376576934538591, + "grad_norm": 0.1806640625, + "learning_rate": 0.00020987781861527767, + "loss": 0.5087, + "step": 148520 + }, + { + "epoch": 7.377073606834211, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002098380848316281, + "loss": 0.5029, + "step": 148530 + }, + { + "epoch": 7.37757027912983, + "grad_norm": 0.16015625, + "learning_rate": 0.00020979835104797853, + "loss": 0.4689, + "step": 148540 + }, + { + "epoch": 7.378066951425449, + "grad_norm": 0.1474609375, + "learning_rate": 0.00020975861726432903, + "loss": 0.495, + "step": 148550 + }, + { + "epoch": 7.378563623721069, + "grad_norm": 0.15625, + "learning_rate": 0.00020971888348067944, + "loss": 0.4624, + "step": 148560 + }, + { + "epoch": 7.3790602960166884, + "grad_norm": 0.201171875, + "learning_rate": 0.00020967914969702994, + "loss": 0.4616, + "step": 148570 + }, + { + "epoch": 7.379556968312308, + "grad_norm": 0.1435546875, + "learning_rate": 0.00020963941591338036, + "loss": 0.4881, + "step": 148580 + }, + { + "epoch": 7.380053640607927, + "grad_norm": 0.130859375, + "learning_rate": 0.0002095996821297308, + "loss": 0.4747, + "step": 148590 + }, + { + "epoch": 7.380550312903546, + "grad_norm": 0.1796875, + "learning_rate": 0.00020955994834608127, + "loss": 0.5043, + "step": 148600 + }, + { + "epoch": 7.381046985199165, + "grad_norm": 0.1416015625, + "learning_rate": 0.00020952021456243172, + "loss": 0.4839, + "step": 148610 + }, + { + "epoch": 7.381543657494785, + "grad_norm": 0.154296875, + "learning_rate": 0.00020948048077878216, + "loss": 0.4851, + "step": 148620 + }, + { + "epoch": 7.382040329790405, + "grad_norm": 0.1640625, + "learning_rate": 0.00020944074699513263, + "loss": 0.478, + "step": 148630 + }, + { + "epoch": 7.382537002086024, + "grad_norm": 0.140625, + "learning_rate": 0.00020940101321148308, + "loss": 0.4842, + "step": 148640 + }, + { + "epoch": 7.383033674381643, + "grad_norm": 0.1552734375, + "learning_rate": 0.00020936127942783355, + "loss": 0.4903, + "step": 148650 + }, + { + "epoch": 7.383530346677262, + "grad_norm": 0.21875, + "learning_rate": 0.000209321545644184, + "loss": 0.5152, + "step": 148660 + }, + { + "epoch": 7.384027018972882, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002092818118605344, + "loss": 0.4867, + "step": 148670 + }, + { + "epoch": 7.384523691268501, + "grad_norm": 0.1337890625, + "learning_rate": 0.00020924207807688488, + "loss": 0.4878, + "step": 148680 + }, + { + "epoch": 7.38502036356412, + "grad_norm": 0.1533203125, + "learning_rate": 0.00020920234429323533, + "loss": 0.5052, + "step": 148690 + }, + { + "epoch": 7.38551703585974, + "grad_norm": 0.1748046875, + "learning_rate": 0.00020916261050958577, + "loss": 0.4869, + "step": 148700 + }, + { + "epoch": 7.386013708155359, + "grad_norm": 0.1396484375, + "learning_rate": 0.00020912287672593624, + "loss": 0.4599, + "step": 148710 + }, + { + "epoch": 7.386510380450979, + "grad_norm": 0.162109375, + "learning_rate": 0.0002090831429422867, + "loss": 0.4942, + "step": 148720 + }, + { + "epoch": 7.387007052746598, + "grad_norm": 0.1826171875, + "learning_rate": 0.00020904340915863716, + "loss": 0.5085, + "step": 148730 + }, + { + "epoch": 7.387503725042217, + "grad_norm": 0.15234375, + "learning_rate": 0.0002090036753749876, + "loss": 0.5101, + "step": 148740 + }, + { + "epoch": 7.388000397337836, + "grad_norm": 0.1572265625, + "learning_rate": 0.00020896394159133802, + "loss": 0.4933, + "step": 148750 + }, + { + "epoch": 7.3884970696334555, + "grad_norm": 0.142578125, + "learning_rate": 0.00020892420780768852, + "loss": 0.4913, + "step": 148760 + }, + { + "epoch": 7.388993741929076, + "grad_norm": 0.2431640625, + "learning_rate": 0.00020888447402403894, + "loss": 0.4553, + "step": 148770 + }, + { + "epoch": 7.389490414224695, + "grad_norm": 0.1494140625, + "learning_rate": 0.00020884474024038938, + "loss": 0.4928, + "step": 148780 + }, + { + "epoch": 7.389987086520314, + "grad_norm": 0.16796875, + "learning_rate": 0.00020880500645673985, + "loss": 0.4955, + "step": 148790 + }, + { + "epoch": 7.390483758815933, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002087652726730903, + "loss": 0.4831, + "step": 148800 + }, + { + "epoch": 7.390980431111553, + "grad_norm": 0.1474609375, + "learning_rate": 0.00020872553888944077, + "loss": 0.4747, + "step": 148810 + }, + { + "epoch": 7.391477103407172, + "grad_norm": 0.1376953125, + "learning_rate": 0.0002086858051057912, + "loss": 0.4693, + "step": 148820 + }, + { + "epoch": 7.391973775702791, + "grad_norm": 0.138671875, + "learning_rate": 0.00020864607132214165, + "loss": 0.4614, + "step": 148830 + }, + { + "epoch": 7.39247044799841, + "grad_norm": 0.1669921875, + "learning_rate": 0.00020860633753849213, + "loss": 0.4809, + "step": 148840 + }, + { + "epoch": 7.39296712029403, + "grad_norm": 0.140625, + "learning_rate": 0.00020856660375484257, + "loss": 0.4973, + "step": 148850 + }, + { + "epoch": 7.39346379258965, + "grad_norm": 0.1435546875, + "learning_rate": 0.000208526869971193, + "loss": 0.4956, + "step": 148860 + }, + { + "epoch": 7.393960464885269, + "grad_norm": 0.142578125, + "learning_rate": 0.00020848713618754349, + "loss": 0.4837, + "step": 148870 + }, + { + "epoch": 7.394457137180888, + "grad_norm": 0.1484375, + "learning_rate": 0.0002084474024038939, + "loss": 0.4962, + "step": 148880 + }, + { + "epoch": 7.394953809476507, + "grad_norm": 0.1767578125, + "learning_rate": 0.0002084076686202444, + "loss": 0.4933, + "step": 148890 + }, + { + "epoch": 7.3954504817721265, + "grad_norm": 0.16015625, + "learning_rate": 0.00020836793483659482, + "loss": 0.4801, + "step": 148900 + }, + { + "epoch": 7.395947154067746, + "grad_norm": 0.14453125, + "learning_rate": 0.00020832820105294526, + "loss": 0.5075, + "step": 148910 + }, + { + "epoch": 7.396443826363366, + "grad_norm": 0.150390625, + "learning_rate": 0.00020828846726929573, + "loss": 0.5015, + "step": 148920 + }, + { + "epoch": 7.396940498658985, + "grad_norm": 0.1474609375, + "learning_rate": 0.00020824873348564618, + "loss": 0.4751, + "step": 148930 + }, + { + "epoch": 7.397437170954604, + "grad_norm": 0.140625, + "learning_rate": 0.00020820899970199665, + "loss": 0.4797, + "step": 148940 + }, + { + "epoch": 7.3979338432502235, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002081692659183471, + "loss": 0.4828, + "step": 148950 + }, + { + "epoch": 7.398430515545843, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002081295321346975, + "loss": 0.5123, + "step": 148960 + }, + { + "epoch": 7.398927187841462, + "grad_norm": 0.158203125, + "learning_rate": 0.000208089798351048, + "loss": 0.4848, + "step": 148970 + }, + { + "epoch": 7.399423860137081, + "grad_norm": 0.142578125, + "learning_rate": 0.00020805006456739843, + "loss": 0.4856, + "step": 148980 + }, + { + "epoch": 7.399920532432701, + "grad_norm": 0.1455078125, + "learning_rate": 0.00020801033078374887, + "loss": 0.4928, + "step": 148990 + }, + { + "epoch": 7.4004172047283205, + "grad_norm": 0.1455078125, + "learning_rate": 0.00020797059700009934, + "loss": 0.5128, + "step": 149000 + }, + { + "epoch": 7.40091387702394, + "grad_norm": 0.146484375, + "learning_rate": 0.0002079308632164498, + "loss": 0.5278, + "step": 149010 + }, + { + "epoch": 7.401410549319559, + "grad_norm": 0.1357421875, + "learning_rate": 0.00020789112943280026, + "loss": 0.4889, + "step": 149020 + }, + { + "epoch": 7.401907221615178, + "grad_norm": 0.142578125, + "learning_rate": 0.0002078513956491507, + "loss": 0.4796, + "step": 149030 + }, + { + "epoch": 7.4024038939107974, + "grad_norm": 0.142578125, + "learning_rate": 0.00020781166186550115, + "loss": 0.4768, + "step": 149040 + }, + { + "epoch": 7.402900566206417, + "grad_norm": 0.1357421875, + "learning_rate": 0.00020777192808185162, + "loss": 0.4833, + "step": 149050 + }, + { + "epoch": 7.403397238502036, + "grad_norm": 0.1484375, + "learning_rate": 0.00020773219429820206, + "loss": 0.4826, + "step": 149060 + }, + { + "epoch": 7.403893910797656, + "grad_norm": 0.1533203125, + "learning_rate": 0.00020769246051455248, + "loss": 0.5, + "step": 149070 + }, + { + "epoch": 7.404390583093275, + "grad_norm": 0.1494140625, + "learning_rate": 0.00020765272673090298, + "loss": 0.479, + "step": 149080 + }, + { + "epoch": 7.4048872553888945, + "grad_norm": 0.1640625, + "learning_rate": 0.0002076129929472534, + "loss": 0.4818, + "step": 149090 + }, + { + "epoch": 7.405383927684514, + "grad_norm": 0.1640625, + "learning_rate": 0.0002075732591636039, + "loss": 0.498, + "step": 149100 + }, + { + "epoch": 7.405880599980133, + "grad_norm": 0.142578125, + "learning_rate": 0.0002075335253799543, + "loss": 0.4568, + "step": 149110 + }, + { + "epoch": 7.406377272275752, + "grad_norm": 0.1640625, + "learning_rate": 0.00020749379159630475, + "loss": 0.483, + "step": 149120 + }, + { + "epoch": 7.406873944571371, + "grad_norm": 0.13671875, + "learning_rate": 0.00020745405781265523, + "loss": 0.4718, + "step": 149130 + }, + { + "epoch": 7.4073706168669915, + "grad_norm": 0.1455078125, + "learning_rate": 0.00020741432402900567, + "loss": 0.5066, + "step": 149140 + }, + { + "epoch": 7.407867289162611, + "grad_norm": 0.1796875, + "learning_rate": 0.00020737459024535611, + "loss": 0.4941, + "step": 149150 + }, + { + "epoch": 7.40836396145823, + "grad_norm": 0.1376953125, + "learning_rate": 0.00020733485646170659, + "loss": 0.4757, + "step": 149160 + }, + { + "epoch": 7.408860633753849, + "grad_norm": 0.1474609375, + "learning_rate": 0.00020729512267805703, + "loss": 0.5042, + "step": 149170 + }, + { + "epoch": 7.409357306049468, + "grad_norm": 0.15625, + "learning_rate": 0.0002072553888944075, + "loss": 0.5143, + "step": 149180 + }, + { + "epoch": 7.409853978345088, + "grad_norm": 0.1630859375, + "learning_rate": 0.00020721565511075795, + "loss": 0.4808, + "step": 149190 + }, + { + "epoch": 7.410350650640707, + "grad_norm": 0.150390625, + "learning_rate": 0.00020717592132710836, + "loss": 0.5003, + "step": 149200 + }, + { + "epoch": 7.410847322936327, + "grad_norm": 0.1923828125, + "learning_rate": 0.00020713618754345886, + "loss": 0.4878, + "step": 149210 + }, + { + "epoch": 7.411343995231946, + "grad_norm": 0.1552734375, + "learning_rate": 0.00020709645375980928, + "loss": 0.504, + "step": 149220 + }, + { + "epoch": 7.411840667527565, + "grad_norm": 0.16015625, + "learning_rate": 0.00020705671997615972, + "loss": 0.481, + "step": 149230 + }, + { + "epoch": 7.412337339823185, + "grad_norm": 0.166015625, + "learning_rate": 0.0002070169861925102, + "loss": 0.4732, + "step": 149240 + }, + { + "epoch": 7.412834012118804, + "grad_norm": 0.1533203125, + "learning_rate": 0.00020697725240886064, + "loss": 0.5001, + "step": 149250 + }, + { + "epoch": 7.413330684414423, + "grad_norm": 0.1943359375, + "learning_rate": 0.0002069375186252111, + "loss": 0.4797, + "step": 149260 + }, + { + "epoch": 7.413827356710042, + "grad_norm": 0.162109375, + "learning_rate": 0.00020689778484156155, + "loss": 0.5091, + "step": 149270 + }, + { + "epoch": 7.4143240290056625, + "grad_norm": 0.1396484375, + "learning_rate": 0.00020685805105791197, + "loss": 0.4502, + "step": 149280 + }, + { + "epoch": 7.414820701301282, + "grad_norm": 0.1552734375, + "learning_rate": 0.00020681831727426247, + "loss": 0.4787, + "step": 149290 + }, + { + "epoch": 7.415317373596901, + "grad_norm": 0.142578125, + "learning_rate": 0.0002067785834906129, + "loss": 0.5072, + "step": 149300 + }, + { + "epoch": 7.41581404589252, + "grad_norm": 0.16015625, + "learning_rate": 0.00020673884970696333, + "loss": 0.5042, + "step": 149310 + }, + { + "epoch": 7.416310718188139, + "grad_norm": 0.1630859375, + "learning_rate": 0.0002066991159233138, + "loss": 0.4981, + "step": 149320 + }, + { + "epoch": 7.416807390483759, + "grad_norm": 0.150390625, + "learning_rate": 0.00020665938213966425, + "loss": 0.4738, + "step": 149330 + }, + { + "epoch": 7.417304062779378, + "grad_norm": 0.1357421875, + "learning_rate": 0.00020661964835601472, + "loss": 0.4689, + "step": 149340 + }, + { + "epoch": 7.417800735074998, + "grad_norm": 0.150390625, + "learning_rate": 0.00020657991457236516, + "loss": 0.4857, + "step": 149350 + }, + { + "epoch": 7.418297407370617, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002065401807887156, + "loss": 0.5077, + "step": 149360 + }, + { + "epoch": 7.418794079666236, + "grad_norm": 0.14453125, + "learning_rate": 0.00020650044700506608, + "loss": 0.4643, + "step": 149370 + }, + { + "epoch": 7.419290751961856, + "grad_norm": 0.150390625, + "learning_rate": 0.00020646071322141652, + "loss": 0.4682, + "step": 149380 + }, + { + "epoch": 7.419787424257475, + "grad_norm": 0.177734375, + "learning_rate": 0.000206420979437767, + "loss": 0.4662, + "step": 149390 + }, + { + "epoch": 7.420284096553094, + "grad_norm": 0.1474609375, + "learning_rate": 0.00020638124565411744, + "loss": 0.4841, + "step": 149400 + }, + { + "epoch": 7.420780768848713, + "grad_norm": 0.255859375, + "learning_rate": 0.00020634151187046785, + "loss": 0.4925, + "step": 149410 + }, + { + "epoch": 7.421277441144333, + "grad_norm": 0.1474609375, + "learning_rate": 0.00020630177808681835, + "loss": 0.4395, + "step": 149420 + }, + { + "epoch": 7.421774113439953, + "grad_norm": 0.13671875, + "learning_rate": 0.00020626204430316877, + "loss": 0.4531, + "step": 149430 + }, + { + "epoch": 7.422270785735572, + "grad_norm": 0.12890625, + "learning_rate": 0.00020622231051951921, + "loss": 0.4508, + "step": 149440 + }, + { + "epoch": 7.422767458031191, + "grad_norm": 0.1796875, + "learning_rate": 0.00020618257673586969, + "loss": 0.4889, + "step": 149450 + }, + { + "epoch": 7.42326413032681, + "grad_norm": 0.1796875, + "learning_rate": 0.00020614284295222013, + "loss": 0.5129, + "step": 149460 + }, + { + "epoch": 7.4237608026224295, + "grad_norm": 0.15234375, + "learning_rate": 0.0002061031091685706, + "loss": 0.5116, + "step": 149470 + }, + { + "epoch": 7.424257474918049, + "grad_norm": 0.1572265625, + "learning_rate": 0.00020606337538492105, + "loss": 0.4896, + "step": 149480 + }, + { + "epoch": 7.424754147213669, + "grad_norm": 0.1962890625, + "learning_rate": 0.0002060236416012715, + "loss": 0.5167, + "step": 149490 + }, + { + "epoch": 7.425250819509288, + "grad_norm": 0.1484375, + "learning_rate": 0.00020598390781762196, + "loss": 0.4773, + "step": 149500 + }, + { + "epoch": 7.425747491804907, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002059441740339724, + "loss": 0.4978, + "step": 149510 + }, + { + "epoch": 7.426244164100527, + "grad_norm": 0.1474609375, + "learning_rate": 0.00020590444025032282, + "loss": 0.4872, + "step": 149520 + }, + { + "epoch": 7.426740836396146, + "grad_norm": 0.16796875, + "learning_rate": 0.0002058647064666733, + "loss": 0.5036, + "step": 149530 + }, + { + "epoch": 7.427237508691765, + "grad_norm": 0.1328125, + "learning_rate": 0.00020582497268302374, + "loss": 0.4777, + "step": 149540 + }, + { + "epoch": 7.427734180987384, + "grad_norm": 0.15625, + "learning_rate": 0.0002057852388993742, + "loss": 0.4929, + "step": 149550 + }, + { + "epoch": 7.4282308532830035, + "grad_norm": 0.13671875, + "learning_rate": 0.00020574550511572465, + "loss": 0.503, + "step": 149560 + }, + { + "epoch": 7.428727525578624, + "grad_norm": 0.140625, + "learning_rate": 0.0002057057713320751, + "loss": 0.4705, + "step": 149570 + }, + { + "epoch": 7.429224197874243, + "grad_norm": 0.1689453125, + "learning_rate": 0.00020566603754842557, + "loss": 0.4755, + "step": 149580 + }, + { + "epoch": 7.429720870169862, + "grad_norm": 0.15234375, + "learning_rate": 0.00020562630376477601, + "loss": 0.4845, + "step": 149590 + }, + { + "epoch": 7.430217542465481, + "grad_norm": 0.142578125, + "learning_rate": 0.00020558656998112643, + "loss": 0.5069, + "step": 149600 + }, + { + "epoch": 7.4307142147611005, + "grad_norm": 0.220703125, + "learning_rate": 0.00020554683619747693, + "loss": 0.4874, + "step": 149610 + }, + { + "epoch": 7.43121088705672, + "grad_norm": 0.138671875, + "learning_rate": 0.00020550710241382735, + "loss": 0.482, + "step": 149620 + }, + { + "epoch": 7.431707559352339, + "grad_norm": 0.166015625, + "learning_rate": 0.00020546736863017785, + "loss": 0.4415, + "step": 149630 + }, + { + "epoch": 7.432204231647959, + "grad_norm": 0.140625, + "learning_rate": 0.00020542763484652826, + "loss": 0.4703, + "step": 149640 + }, + { + "epoch": 7.432700903943578, + "grad_norm": 0.140625, + "learning_rate": 0.0002053879010628787, + "loss": 0.4841, + "step": 149650 + }, + { + "epoch": 7.4331975762391975, + "grad_norm": 0.15234375, + "learning_rate": 0.00020534816727922918, + "loss": 0.5322, + "step": 149660 + }, + { + "epoch": 7.433694248534817, + "grad_norm": 0.140625, + "learning_rate": 0.00020530843349557962, + "loss": 0.4818, + "step": 149670 + }, + { + "epoch": 7.434190920830436, + "grad_norm": 0.1416015625, + "learning_rate": 0.00020526869971193007, + "loss": 0.4964, + "step": 149680 + }, + { + "epoch": 7.434687593126055, + "grad_norm": 0.2119140625, + "learning_rate": 0.00020522896592828054, + "loss": 0.4843, + "step": 149690 + }, + { + "epoch": 7.435184265421674, + "grad_norm": 0.1376953125, + "learning_rate": 0.00020518923214463098, + "loss": 0.4871, + "step": 149700 + }, + { + "epoch": 7.435680937717294, + "grad_norm": 0.15234375, + "learning_rate": 0.00020514949836098145, + "loss": 0.5375, + "step": 149710 + }, + { + "epoch": 7.436177610012914, + "grad_norm": 0.154296875, + "learning_rate": 0.0002051097645773319, + "loss": 0.4897, + "step": 149720 + }, + { + "epoch": 7.436674282308533, + "grad_norm": 0.1494140625, + "learning_rate": 0.00020507003079368231, + "loss": 0.4603, + "step": 149730 + }, + { + "epoch": 7.437170954604152, + "grad_norm": 0.162109375, + "learning_rate": 0.0002050302970100328, + "loss": 0.473, + "step": 149740 + }, + { + "epoch": 7.4376676268997715, + "grad_norm": 0.203125, + "learning_rate": 0.00020499056322638323, + "loss": 0.5023, + "step": 149750 + }, + { + "epoch": 7.438164299195391, + "grad_norm": 0.166015625, + "learning_rate": 0.00020495082944273373, + "loss": 0.475, + "step": 149760 + }, + { + "epoch": 7.43866097149101, + "grad_norm": 0.1640625, + "learning_rate": 0.00020491109565908415, + "loss": 0.4797, + "step": 149770 + }, + { + "epoch": 7.439157643786629, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002048713618754346, + "loss": 0.4915, + "step": 149780 + }, + { + "epoch": 7.439654316082249, + "grad_norm": 0.1708984375, + "learning_rate": 0.00020483162809178506, + "loss": 0.4932, + "step": 149790 + }, + { + "epoch": 7.4401509883778685, + "grad_norm": 0.1806640625, + "learning_rate": 0.0002047918943081355, + "loss": 0.4973, + "step": 149800 + }, + { + "epoch": 7.440647660673488, + "grad_norm": 0.15234375, + "learning_rate": 0.00020475216052448592, + "loss": 0.4873, + "step": 149810 + }, + { + "epoch": 7.441144332969107, + "grad_norm": 0.14453125, + "learning_rate": 0.00020471242674083642, + "loss": 0.474, + "step": 149820 + }, + { + "epoch": 7.441641005264726, + "grad_norm": 0.1669921875, + "learning_rate": 0.00020467269295718684, + "loss": 0.4897, + "step": 149830 + }, + { + "epoch": 7.442137677560345, + "grad_norm": 0.16015625, + "learning_rate": 0.00020463295917353734, + "loss": 0.4911, + "step": 149840 + }, + { + "epoch": 7.442634349855965, + "grad_norm": 0.16015625, + "learning_rate": 0.00020459322538988775, + "loss": 0.5272, + "step": 149850 + }, + { + "epoch": 7.443131022151585, + "grad_norm": 0.1572265625, + "learning_rate": 0.0002045534916062382, + "loss": 0.511, + "step": 149860 + }, + { + "epoch": 7.443627694447204, + "grad_norm": 0.1416015625, + "learning_rate": 0.00020451375782258867, + "loss": 0.5126, + "step": 149870 + }, + { + "epoch": 7.444124366742823, + "grad_norm": 0.1474609375, + "learning_rate": 0.00020447402403893911, + "loss": 0.4711, + "step": 149880 + }, + { + "epoch": 7.444621039038442, + "grad_norm": 0.1708984375, + "learning_rate": 0.00020443429025528956, + "loss": 0.4834, + "step": 149890 + }, + { + "epoch": 7.445117711334062, + "grad_norm": 0.1357421875, + "learning_rate": 0.00020439455647164003, + "loss": 0.5153, + "step": 149900 + }, + { + "epoch": 7.445614383629681, + "grad_norm": 0.150390625, + "learning_rate": 0.00020435482268799047, + "loss": 0.4782, + "step": 149910 + }, + { + "epoch": 7.4461110559253, + "grad_norm": 0.1484375, + "learning_rate": 0.00020431508890434095, + "loss": 0.468, + "step": 149920 + }, + { + "epoch": 7.44660772822092, + "grad_norm": 0.140625, + "learning_rate": 0.0002042753551206914, + "loss": 0.4964, + "step": 149930 + }, + { + "epoch": 7.447104400516539, + "grad_norm": 0.205078125, + "learning_rate": 0.0002042356213370418, + "loss": 0.5064, + "step": 149940 + }, + { + "epoch": 7.447601072812159, + "grad_norm": 0.158203125, + "learning_rate": 0.0002041958875533923, + "loss": 0.4847, + "step": 149950 + }, + { + "epoch": 7.448097745107778, + "grad_norm": 0.142578125, + "learning_rate": 0.00020415615376974272, + "loss": 0.4819, + "step": 149960 + }, + { + "epoch": 7.448594417403397, + "grad_norm": 0.162109375, + "learning_rate": 0.00020411641998609317, + "loss": 0.4937, + "step": 149970 + }, + { + "epoch": 7.449091089699016, + "grad_norm": 0.1455078125, + "learning_rate": 0.00020407668620244364, + "loss": 0.4924, + "step": 149980 + }, + { + "epoch": 7.449587761994636, + "grad_norm": 0.158203125, + "learning_rate": 0.00020403695241879408, + "loss": 0.4743, + "step": 149990 + }, + { + "epoch": 7.450084434290256, + "grad_norm": 0.1591796875, + "learning_rate": 0.00020399721863514455, + "loss": 0.4696, + "step": 150000 + }, + { + "epoch": 7.450581106585875, + "grad_norm": 0.1396484375, + "learning_rate": 0.000203957484851495, + "loss": 0.5106, + "step": 150010 + }, + { + "epoch": 7.451077778881494, + "grad_norm": 0.1572265625, + "learning_rate": 0.00020391775106784544, + "loss": 0.4905, + "step": 150020 + }, + { + "epoch": 7.451574451177113, + "grad_norm": 0.1669921875, + "learning_rate": 0.0002038780172841959, + "loss": 0.496, + "step": 150030 + }, + { + "epoch": 7.452071123472733, + "grad_norm": 0.1484375, + "learning_rate": 0.00020383828350054636, + "loss": 0.4744, + "step": 150040 + }, + { + "epoch": 7.452567795768352, + "grad_norm": 0.1806640625, + "learning_rate": 0.00020379854971689677, + "loss": 0.4868, + "step": 150050 + }, + { + "epoch": 7.453064468063971, + "grad_norm": 0.1396484375, + "learning_rate": 0.00020375881593324727, + "loss": 0.4942, + "step": 150060 + }, + { + "epoch": 7.453561140359591, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002037190821495977, + "loss": 0.497, + "step": 150070 + }, + { + "epoch": 7.45405781265521, + "grad_norm": 0.158203125, + "learning_rate": 0.00020367934836594816, + "loss": 0.4981, + "step": 150080 + }, + { + "epoch": 7.45455448495083, + "grad_norm": 0.15625, + "learning_rate": 0.0002036396145822986, + "loss": 0.4988, + "step": 150090 + }, + { + "epoch": 7.455051157246449, + "grad_norm": 0.1494140625, + "learning_rate": 0.00020359988079864905, + "loss": 0.479, + "step": 150100 + }, + { + "epoch": 7.455547829542068, + "grad_norm": 0.1416015625, + "learning_rate": 0.00020356014701499952, + "loss": 0.4834, + "step": 150110 + }, + { + "epoch": 7.456044501837687, + "grad_norm": 0.1708984375, + "learning_rate": 0.00020352041323134997, + "loss": 0.4881, + "step": 150120 + }, + { + "epoch": 7.4565411741333065, + "grad_norm": 0.140625, + "learning_rate": 0.00020348067944770038, + "loss": 0.4548, + "step": 150130 + }, + { + "epoch": 7.457037846428927, + "grad_norm": 0.1396484375, + "learning_rate": 0.00020344094566405088, + "loss": 0.4853, + "step": 150140 + }, + { + "epoch": 7.457534518724546, + "grad_norm": 0.1650390625, + "learning_rate": 0.0002034012118804013, + "loss": 0.5291, + "step": 150150 + }, + { + "epoch": 7.458031191020165, + "grad_norm": 0.154296875, + "learning_rate": 0.0002033614780967518, + "loss": 0.4977, + "step": 150160 + }, + { + "epoch": 7.458527863315784, + "grad_norm": 0.1669921875, + "learning_rate": 0.00020332174431310221, + "loss": 0.482, + "step": 150170 + }, + { + "epoch": 7.4590245356114036, + "grad_norm": 0.158203125, + "learning_rate": 0.00020328201052945266, + "loss": 0.463, + "step": 150180 + }, + { + "epoch": 7.459521207907023, + "grad_norm": 0.150390625, + "learning_rate": 0.00020324227674580313, + "loss": 0.504, + "step": 150190 + }, + { + "epoch": 7.460017880202642, + "grad_norm": 0.1513671875, + "learning_rate": 0.00020320254296215357, + "loss": 0.5334, + "step": 150200 + }, + { + "epoch": 7.460514552498261, + "grad_norm": 0.1435546875, + "learning_rate": 0.00020316280917850405, + "loss": 0.4615, + "step": 150210 + }, + { + "epoch": 7.461011224793881, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002031230753948545, + "loss": 0.4766, + "step": 150220 + }, + { + "epoch": 7.461507897089501, + "grad_norm": 0.1328125, + "learning_rate": 0.00020308334161120493, + "loss": 0.4841, + "step": 150230 + }, + { + "epoch": 7.46200456938512, + "grad_norm": 0.1591796875, + "learning_rate": 0.0002030436078275554, + "loss": 0.4853, + "step": 150240 + }, + { + "epoch": 7.462501241680739, + "grad_norm": 0.1513671875, + "learning_rate": 0.00020300387404390585, + "loss": 0.4894, + "step": 150250 + }, + { + "epoch": 7.462997913976358, + "grad_norm": 0.140625, + "learning_rate": 0.00020296414026025627, + "loss": 0.4651, + "step": 150260 + }, + { + "epoch": 7.4634945862719775, + "grad_norm": 0.1826171875, + "learning_rate": 0.00020292440647660677, + "loss": 0.4855, + "step": 150270 + }, + { + "epoch": 7.463991258567597, + "grad_norm": 0.1630859375, + "learning_rate": 0.00020288467269295718, + "loss": 0.4818, + "step": 150280 + }, + { + "epoch": 7.464487930863217, + "grad_norm": 0.14453125, + "learning_rate": 0.00020284493890930768, + "loss": 0.4988, + "step": 150290 + }, + { + "epoch": 7.464984603158836, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002028052051256581, + "loss": 0.487, + "step": 150300 + }, + { + "epoch": 7.465481275454455, + "grad_norm": 0.1611328125, + "learning_rate": 0.00020276547134200854, + "loss": 0.4859, + "step": 150310 + }, + { + "epoch": 7.4659779477500745, + "grad_norm": 0.1689453125, + "learning_rate": 0.00020272573755835901, + "loss": 0.4853, + "step": 150320 + }, + { + "epoch": 7.466474620045694, + "grad_norm": 0.13671875, + "learning_rate": 0.00020268600377470946, + "loss": 0.4654, + "step": 150330 + }, + { + "epoch": 7.466971292341313, + "grad_norm": 0.1328125, + "learning_rate": 0.0002026462699910599, + "loss": 0.4836, + "step": 150340 + }, + { + "epoch": 7.467467964636932, + "grad_norm": 0.1435546875, + "learning_rate": 0.00020260653620741037, + "loss": 0.4624, + "step": 150350 + }, + { + "epoch": 7.467964636932552, + "grad_norm": 0.138671875, + "learning_rate": 0.00020256680242376082, + "loss": 0.4759, + "step": 150360 + }, + { + "epoch": 7.4684613092281715, + "grad_norm": 0.150390625, + "learning_rate": 0.0002025270686401113, + "loss": 0.4909, + "step": 150370 + }, + { + "epoch": 7.468957981523791, + "grad_norm": 0.1416015625, + "learning_rate": 0.0002024873348564617, + "loss": 0.4793, + "step": 150380 + }, + { + "epoch": 7.46945465381941, + "grad_norm": 0.169921875, + "learning_rate": 0.00020244760107281215, + "loss": 0.5134, + "step": 150390 + }, + { + "epoch": 7.469951326115029, + "grad_norm": 0.13671875, + "learning_rate": 0.00020240786728916262, + "loss": 0.448, + "step": 150400 + }, + { + "epoch": 7.470447998410648, + "grad_norm": 0.1650390625, + "learning_rate": 0.00020236813350551307, + "loss": 0.4737, + "step": 150410 + }, + { + "epoch": 7.470944670706268, + "grad_norm": 0.1533203125, + "learning_rate": 0.0002023283997218635, + "loss": 0.4889, + "step": 150420 + }, + { + "epoch": 7.471441343001887, + "grad_norm": 0.138671875, + "learning_rate": 0.00020228866593821398, + "loss": 0.4734, + "step": 150430 + }, + { + "epoch": 7.471938015297507, + "grad_norm": 0.1650390625, + "learning_rate": 0.00020224893215456443, + "loss": 0.457, + "step": 150440 + }, + { + "epoch": 7.472434687593126, + "grad_norm": 0.150390625, + "learning_rate": 0.0002022091983709149, + "loss": 0.5119, + "step": 150450 + }, + { + "epoch": 7.4729313598887455, + "grad_norm": 0.1357421875, + "learning_rate": 0.00020216946458726534, + "loss": 0.4664, + "step": 150460 + }, + { + "epoch": 7.473428032184365, + "grad_norm": 0.1572265625, + "learning_rate": 0.00020212973080361576, + "loss": 0.4786, + "step": 150470 + }, + { + "epoch": 7.473924704479984, + "grad_norm": 0.154296875, + "learning_rate": 0.00020208999701996626, + "loss": 0.4814, + "step": 150480 + }, + { + "epoch": 7.474421376775603, + "grad_norm": 0.1435546875, + "learning_rate": 0.00020205026323631667, + "loss": 0.4903, + "step": 150490 + }, + { + "epoch": 7.474918049071222, + "grad_norm": 0.1591796875, + "learning_rate": 0.00020201052945266712, + "loss": 0.4863, + "step": 150500 + }, + { + "epoch": 7.4754147213668425, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002019707956690176, + "loss": 0.5055, + "step": 150510 + }, + { + "epoch": 7.475911393662462, + "grad_norm": 0.166015625, + "learning_rate": 0.00020193106188536803, + "loss": 0.5019, + "step": 150520 + }, + { + "epoch": 7.476408065958081, + "grad_norm": 0.1337890625, + "learning_rate": 0.0002018913281017185, + "loss": 0.4812, + "step": 150530 + }, + { + "epoch": 7.4769047382537, + "grad_norm": 0.15625, + "learning_rate": 0.00020185159431806895, + "loss": 0.5075, + "step": 150540 + }, + { + "epoch": 7.477401410549319, + "grad_norm": 0.1396484375, + "learning_rate": 0.0002018118605344194, + "loss": 0.4877, + "step": 150550 + }, + { + "epoch": 7.477898082844939, + "grad_norm": 0.1494140625, + "learning_rate": 0.00020177212675076987, + "loss": 0.4877, + "step": 150560 + }, + { + "epoch": 7.478394755140558, + "grad_norm": 0.1552734375, + "learning_rate": 0.0002017323929671203, + "loss": 0.4917, + "step": 150570 + }, + { + "epoch": 7.478891427436178, + "grad_norm": 0.1357421875, + "learning_rate": 0.00020169265918347073, + "loss": 0.4761, + "step": 150580 + }, + { + "epoch": 7.479388099731797, + "grad_norm": 0.13671875, + "learning_rate": 0.00020165292539982123, + "loss": 0.5167, + "step": 150590 + }, + { + "epoch": 7.479884772027416, + "grad_norm": 0.1669921875, + "learning_rate": 0.00020161319161617164, + "loss": 0.4985, + "step": 150600 + }, + { + "epoch": 7.480381444323036, + "grad_norm": 0.1689453125, + "learning_rate": 0.00020157345783252214, + "loss": 0.4823, + "step": 150610 + }, + { + "epoch": 7.480878116618655, + "grad_norm": 0.1416015625, + "learning_rate": 0.00020153372404887256, + "loss": 0.4774, + "step": 150620 + }, + { + "epoch": 7.481374788914274, + "grad_norm": 0.1513671875, + "learning_rate": 0.000201493990265223, + "loss": 0.5035, + "step": 150630 + }, + { + "epoch": 7.481871461209893, + "grad_norm": 0.1552734375, + "learning_rate": 0.00020145425648157347, + "loss": 0.4775, + "step": 150640 + }, + { + "epoch": 7.482368133505513, + "grad_norm": 0.158203125, + "learning_rate": 0.00020141452269792392, + "loss": 0.4709, + "step": 150650 + }, + { + "epoch": 7.482864805801133, + "grad_norm": 0.1474609375, + "learning_rate": 0.0002013747889142744, + "loss": 0.5085, + "step": 150660 + }, + { + "epoch": 7.483361478096752, + "grad_norm": 0.1435546875, + "learning_rate": 0.00020133505513062483, + "loss": 0.4913, + "step": 150670 + }, + { + "epoch": 7.483858150392371, + "grad_norm": 0.1484375, + "learning_rate": 0.00020129532134697525, + "loss": 0.5026, + "step": 150680 + }, + { + "epoch": 7.48435482268799, + "grad_norm": 0.1435546875, + "learning_rate": 0.00020125558756332575, + "loss": 0.5076, + "step": 150690 + }, + { + "epoch": 7.48485149498361, + "grad_norm": 0.15625, + "learning_rate": 0.00020121585377967617, + "loss": 0.4643, + "step": 150700 + }, + { + "epoch": 7.485348167279229, + "grad_norm": 0.1494140625, + "learning_rate": 0.0002011761199960266, + "loss": 0.4904, + "step": 150710 + }, + { + "epoch": 7.485844839574849, + "grad_norm": 0.166015625, + "learning_rate": 0.00020113638621237708, + "loss": 0.4684, + "step": 150720 + }, + { + "epoch": 7.486341511870468, + "grad_norm": 0.2265625, + "learning_rate": 0.00020109665242872753, + "loss": 0.4995, + "step": 150730 + }, + { + "epoch": 7.486838184166087, + "grad_norm": 0.15234375, + "learning_rate": 0.000201056918645078, + "loss": 0.4827, + "step": 150740 + }, + { + "epoch": 7.487334856461707, + "grad_norm": 0.1474609375, + "learning_rate": 0.00020101718486142844, + "loss": 0.4878, + "step": 150750 + }, + { + "epoch": 7.487831528757326, + "grad_norm": 0.1533203125, + "learning_rate": 0.00020097745107777889, + "loss": 0.5121, + "step": 150760 + }, + { + "epoch": 7.488328201052945, + "grad_norm": 0.158203125, + "learning_rate": 0.00020093771729412936, + "loss": 0.5013, + "step": 150770 + }, + { + "epoch": 7.488824873348564, + "grad_norm": 0.1435546875, + "learning_rate": 0.0002008979835104798, + "loss": 0.4898, + "step": 150780 + }, + { + "epoch": 7.489321545644184, + "grad_norm": 0.1611328125, + "learning_rate": 0.00020085824972683022, + "loss": 0.5036, + "step": 150790 + }, + { + "epoch": 7.489818217939804, + "grad_norm": 0.1396484375, + "learning_rate": 0.00020081851594318072, + "loss": 0.4944, + "step": 150800 + }, + { + "epoch": 7.490314890235423, + "grad_norm": 0.146484375, + "learning_rate": 0.00020077878215953113, + "loss": 0.5034, + "step": 150810 + }, + { + "epoch": 7.490811562531042, + "grad_norm": 0.16015625, + "learning_rate": 0.00020073904837588163, + "loss": 0.5013, + "step": 150820 + }, + { + "epoch": 7.491308234826661, + "grad_norm": 0.1708984375, + "learning_rate": 0.00020069931459223205, + "loss": 0.4785, + "step": 150830 + }, + { + "epoch": 7.4918049071222805, + "grad_norm": 0.146484375, + "learning_rate": 0.0002006595808085825, + "loss": 0.4658, + "step": 150840 + }, + { + "epoch": 7.4923015794179, + "grad_norm": 0.1533203125, + "learning_rate": 0.00020061984702493297, + "loss": 0.4785, + "step": 150850 + }, + { + "epoch": 7.49279825171352, + "grad_norm": 0.15625, + "learning_rate": 0.0002005801132412834, + "loss": 0.4904, + "step": 150860 + }, + { + "epoch": 7.493294924009139, + "grad_norm": 0.14453125, + "learning_rate": 0.00020054037945763385, + "loss": 0.4657, + "step": 150870 + }, + { + "epoch": 7.493791596304758, + "grad_norm": 0.16796875, + "learning_rate": 0.00020050064567398433, + "loss": 0.5018, + "step": 150880 + }, + { + "epoch": 7.494288268600378, + "grad_norm": 0.134765625, + "learning_rate": 0.00020046091189033477, + "loss": 0.5251, + "step": 150890 + }, + { + "epoch": 7.494784940895997, + "grad_norm": 0.1455078125, + "learning_rate": 0.00020042117810668524, + "loss": 0.481, + "step": 150900 + }, + { + "epoch": 7.495281613191616, + "grad_norm": 0.138671875, + "learning_rate": 0.00020038144432303569, + "loss": 0.4986, + "step": 150910 + }, + { + "epoch": 7.495778285487235, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002003417105393861, + "loss": 0.5211, + "step": 150920 + }, + { + "epoch": 7.4962749577828545, + "grad_norm": 0.15234375, + "learning_rate": 0.00020030197675573657, + "loss": 0.4996, + "step": 150930 + }, + { + "epoch": 7.496771630078475, + "grad_norm": 0.154296875, + "learning_rate": 0.00020026224297208702, + "loss": 0.4629, + "step": 150940 + }, + { + "epoch": 7.497268302374094, + "grad_norm": 0.1494140625, + "learning_rate": 0.00020022250918843746, + "loss": 0.5066, + "step": 150950 + }, + { + "epoch": 7.497764974669713, + "grad_norm": 0.1708984375, + "learning_rate": 0.00020018277540478793, + "loss": 0.4895, + "step": 150960 + }, + { + "epoch": 7.498261646965332, + "grad_norm": 0.1435546875, + "learning_rate": 0.00020014304162113838, + "loss": 0.4812, + "step": 150970 + }, + { + "epoch": 7.4987583192609515, + "grad_norm": 0.1416015625, + "learning_rate": 0.00020010330783748885, + "loss": 0.4761, + "step": 150980 + }, + { + "epoch": 7.499254991556571, + "grad_norm": 0.150390625, + "learning_rate": 0.0002000635740538393, + "loss": 0.5024, + "step": 150990 + }, + { + "epoch": 7.49975166385219, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002000238402701897, + "loss": 0.4823, + "step": 151000 + }, + { + "epoch": 7.50024833614781, + "grad_norm": 0.150390625, + "learning_rate": 0.0001999841064865402, + "loss": 0.4834, + "step": 151010 + }, + { + "epoch": 7.500745008443429, + "grad_norm": 0.150390625, + "learning_rate": 0.00019994437270289063, + "loss": 0.4755, + "step": 151020 + }, + { + "epoch": 7.5012416807390485, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001999046389192411, + "loss": 0.4558, + "step": 151030 + }, + { + "epoch": 7.501738353034668, + "grad_norm": 0.150390625, + "learning_rate": 0.00019986490513559154, + "loss": 0.5269, + "step": 151040 + }, + { + "epoch": 7.502235025330287, + "grad_norm": 0.1884765625, + "learning_rate": 0.000199825171351942, + "loss": 0.5016, + "step": 151050 + }, + { + "epoch": 7.502731697625906, + "grad_norm": 0.1796875, + "learning_rate": 0.00019978543756829246, + "loss": 0.4781, + "step": 151060 + }, + { + "epoch": 7.503228369921525, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001997457037846429, + "loss": 0.5139, + "step": 151070 + }, + { + "epoch": 7.503725042217145, + "grad_norm": 0.15625, + "learning_rate": 0.00019970597000099335, + "loss": 0.4934, + "step": 151080 + }, + { + "epoch": 7.504221714512765, + "grad_norm": 0.1357421875, + "learning_rate": 0.00019966623621734382, + "loss": 0.5015, + "step": 151090 + }, + { + "epoch": 7.504718386808384, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019962650243369426, + "loss": 0.4734, + "step": 151100 + }, + { + "epoch": 7.505215059104003, + "grad_norm": 0.146484375, + "learning_rate": 0.0001995867686500447, + "loss": 0.4978, + "step": 151110 + }, + { + "epoch": 7.505711731399622, + "grad_norm": 0.138671875, + "learning_rate": 0.00019954703486639518, + "loss": 0.5078, + "step": 151120 + }, + { + "epoch": 7.506208403695242, + "grad_norm": 0.158203125, + "learning_rate": 0.00019950730108274562, + "loss": 0.5149, + "step": 151130 + }, + { + "epoch": 7.506705075990861, + "grad_norm": 0.1455078125, + "learning_rate": 0.00019946756729909607, + "loss": 0.4712, + "step": 151140 + }, + { + "epoch": 7.50720174828648, + "grad_norm": 0.1484375, + "learning_rate": 0.0001994278335154465, + "loss": 0.4818, + "step": 151150 + }, + { + "epoch": 7.5076984205821, + "grad_norm": 0.171875, + "learning_rate": 0.00019938809973179698, + "loss": 0.5044, + "step": 151160 + }, + { + "epoch": 7.5081950928777195, + "grad_norm": 0.1435546875, + "learning_rate": 0.00019934836594814743, + "loss": 0.4519, + "step": 151170 + }, + { + "epoch": 7.508691765173339, + "grad_norm": 0.1484375, + "learning_rate": 0.00019930863216449787, + "loss": 0.5022, + "step": 151180 + }, + { + "epoch": 7.509188437468958, + "grad_norm": 0.16796875, + "learning_rate": 0.00019926889838084831, + "loss": 0.4742, + "step": 151190 + }, + { + "epoch": 7.509685109764577, + "grad_norm": 0.15234375, + "learning_rate": 0.00019922916459719879, + "loss": 0.5147, + "step": 151200 + }, + { + "epoch": 7.510181782060196, + "grad_norm": 0.1416015625, + "learning_rate": 0.00019918943081354923, + "loss": 0.5044, + "step": 151210 + }, + { + "epoch": 7.510678454355816, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019914969702989967, + "loss": 0.5168, + "step": 151220 + }, + { + "epoch": 7.511175126651436, + "grad_norm": 0.162109375, + "learning_rate": 0.00019910996324625012, + "loss": 0.4839, + "step": 151230 + }, + { + "epoch": 7.511671798947055, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001990702294626006, + "loss": 0.4637, + "step": 151240 + }, + { + "epoch": 7.512168471242674, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019903049567895103, + "loss": 0.4912, + "step": 151250 + }, + { + "epoch": 7.512665143538293, + "grad_norm": 0.140625, + "learning_rate": 0.00019899076189530148, + "loss": 0.4905, + "step": 151260 + }, + { + "epoch": 7.513161815833913, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019895102811165195, + "loss": 0.4811, + "step": 151270 + }, + { + "epoch": 7.513658488129532, + "grad_norm": 0.1484375, + "learning_rate": 0.0001989112943280024, + "loss": 0.4526, + "step": 151280 + }, + { + "epoch": 7.514155160425151, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019887156054435286, + "loss": 0.4655, + "step": 151290 + }, + { + "epoch": 7.514651832720771, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019883182676070328, + "loss": 0.4852, + "step": 151300 + }, + { + "epoch": 7.51514850501639, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019879209297705375, + "loss": 0.4661, + "step": 151310 + }, + { + "epoch": 7.51564517731201, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001987523591934042, + "loss": 0.4959, + "step": 151320 + }, + { + "epoch": 7.516141849607629, + "grad_norm": 0.140625, + "learning_rate": 0.00019871262540975467, + "loss": 0.4951, + "step": 151330 + }, + { + "epoch": 7.516638521903248, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019867289162610509, + "loss": 0.4933, + "step": 151340 + }, + { + "epoch": 7.517135194198867, + "grad_norm": 0.1416015625, + "learning_rate": 0.00019863315784245556, + "loss": 0.4918, + "step": 151350 + }, + { + "epoch": 7.517631866494487, + "grad_norm": 0.1513671875, + "learning_rate": 0.000198593424058806, + "loss": 0.4697, + "step": 151360 + }, + { + "epoch": 7.518128538790107, + "grad_norm": 0.14453125, + "learning_rate": 0.00019855369027515647, + "loss": 0.4969, + "step": 151370 + }, + { + "epoch": 7.518625211085726, + "grad_norm": 0.142578125, + "learning_rate": 0.0001985139564915069, + "loss": 0.5117, + "step": 151380 + }, + { + "epoch": 7.519121883381345, + "grad_norm": 0.150390625, + "learning_rate": 0.00019847422270785736, + "loss": 0.5019, + "step": 151390 + }, + { + "epoch": 7.519618555676964, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001984344889242078, + "loss": 0.5078, + "step": 151400 + }, + { + "epoch": 7.520115227972584, + "grad_norm": 0.150390625, + "learning_rate": 0.00019839475514055828, + "loss": 0.4668, + "step": 151410 + }, + { + "epoch": 7.520611900268203, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019835502135690872, + "loss": 0.4808, + "step": 151420 + }, + { + "epoch": 7.521108572563822, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019831528757325917, + "loss": 0.4847, + "step": 151430 + }, + { + "epoch": 7.521605244859442, + "grad_norm": 0.173828125, + "learning_rate": 0.00019827555378960964, + "loss": 0.4861, + "step": 151440 + }, + { + "epoch": 7.522101917155061, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019823582000596008, + "loss": 0.4836, + "step": 151450 + }, + { + "epoch": 7.522598589450681, + "grad_norm": 0.1357421875, + "learning_rate": 0.00019819608622231055, + "loss": 0.4662, + "step": 151460 + }, + { + "epoch": 7.5230952617463, + "grad_norm": 0.162109375, + "learning_rate": 0.00019815635243866097, + "loss": 0.5157, + "step": 151470 + }, + { + "epoch": 7.523591934041919, + "grad_norm": 0.16015625, + "learning_rate": 0.00019811661865501144, + "loss": 0.4875, + "step": 151480 + }, + { + "epoch": 7.524088606337538, + "grad_norm": 0.1396484375, + "learning_rate": 0.00019807688487136189, + "loss": 0.4465, + "step": 151490 + }, + { + "epoch": 7.5245852786331575, + "grad_norm": 0.150390625, + "learning_rate": 0.00019803715108771236, + "loss": 0.4683, + "step": 151500 + }, + { + "epoch": 7.525081950928778, + "grad_norm": 0.1376953125, + "learning_rate": 0.00019799741730406277, + "loss": 0.4887, + "step": 151510 + }, + { + "epoch": 7.525578623224397, + "grad_norm": 0.130859375, + "learning_rate": 0.00019795768352041325, + "loss": 0.5133, + "step": 151520 + }, + { + "epoch": 7.526075295520016, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001979179497367637, + "loss": 0.5277, + "step": 151530 + }, + { + "epoch": 7.526571967815635, + "grad_norm": 0.142578125, + "learning_rate": 0.00019787821595311416, + "loss": 0.4574, + "step": 151540 + }, + { + "epoch": 7.5270686401112545, + "grad_norm": 0.14453125, + "learning_rate": 0.00019783848216946458, + "loss": 0.5024, + "step": 151550 + }, + { + "epoch": 7.527565312406874, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019779874838581505, + "loss": 0.4973, + "step": 151560 + }, + { + "epoch": 7.528061984702493, + "grad_norm": 0.140625, + "learning_rate": 0.0001977590146021655, + "loss": 0.496, + "step": 151570 + }, + { + "epoch": 7.528558656998113, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019771928081851596, + "loss": 0.4705, + "step": 151580 + }, + { + "epoch": 7.529055329293732, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001976795470348664, + "loss": 0.4755, + "step": 151590 + }, + { + "epoch": 7.529552001589352, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019763981325121685, + "loss": 0.4684, + "step": 151600 + }, + { + "epoch": 7.530048673884971, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019760007946756732, + "loss": 0.4966, + "step": 151610 + }, + { + "epoch": 7.53054534618059, + "grad_norm": 0.16796875, + "learning_rate": 0.00019756034568391777, + "loss": 0.5127, + "step": 151620 + }, + { + "epoch": 7.531042018476209, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001975206119002682, + "loss": 0.4807, + "step": 151630 + }, + { + "epoch": 7.5315386907718285, + "grad_norm": 0.146484375, + "learning_rate": 0.00019748087811661866, + "loss": 0.4676, + "step": 151640 + }, + { + "epoch": 7.532035363067449, + "grad_norm": 0.15234375, + "learning_rate": 0.00019744114433296913, + "loss": 0.4945, + "step": 151650 + }, + { + "epoch": 7.532532035363068, + "grad_norm": 0.1435546875, + "learning_rate": 0.00019740141054931957, + "loss": 0.4862, + "step": 151660 + }, + { + "epoch": 7.533028707658687, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019736167676567002, + "loss": 0.4681, + "step": 151670 + }, + { + "epoch": 7.533525379954306, + "grad_norm": 0.15234375, + "learning_rate": 0.00019732194298202046, + "loss": 0.4649, + "step": 151680 + }, + { + "epoch": 7.5340220522499255, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019728220919837093, + "loss": 0.5175, + "step": 151690 + }, + { + "epoch": 7.534518724545545, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019724247541472138, + "loss": 0.4863, + "step": 151700 + }, + { + "epoch": 7.535015396841164, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019720274163107182, + "loss": 0.4879, + "step": 151710 + }, + { + "epoch": 7.535512069136783, + "grad_norm": 0.158203125, + "learning_rate": 0.00019716300784742227, + "loss": 0.4997, + "step": 151720 + }, + { + "epoch": 7.536008741432402, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019712327406377274, + "loss": 0.4874, + "step": 151730 + }, + { + "epoch": 7.5365054137280225, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019708354028012318, + "loss": 0.4761, + "step": 151740 + }, + { + "epoch": 7.537002086023642, + "grad_norm": 0.224609375, + "learning_rate": 0.00019704380649647363, + "loss": 0.4938, + "step": 151750 + }, + { + "epoch": 7.537498758319261, + "grad_norm": 0.19140625, + "learning_rate": 0.0001970040727128241, + "loss": 0.4834, + "step": 151760 + }, + { + "epoch": 7.53799543061488, + "grad_norm": 0.1435546875, + "learning_rate": 0.00019696433892917454, + "loss": 0.4996, + "step": 151770 + }, + { + "epoch": 7.538492102910499, + "grad_norm": 0.1484375, + "learning_rate": 0.00019692460514552499, + "loss": 0.4935, + "step": 151780 + }, + { + "epoch": 7.538988775206119, + "grad_norm": 0.1484375, + "learning_rate": 0.00019688487136187543, + "loss": 0.4975, + "step": 151790 + }, + { + "epoch": 7.539485447501738, + "grad_norm": 0.150390625, + "learning_rate": 0.0001968451375782259, + "loss": 0.4957, + "step": 151800 + }, + { + "epoch": 7.539982119797358, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019680540379457635, + "loss": 0.484, + "step": 151810 + }, + { + "epoch": 7.540478792092977, + "grad_norm": 0.1337890625, + "learning_rate": 0.00019676567001092682, + "loss": 0.5024, + "step": 151820 + }, + { + "epoch": 7.5409754643885964, + "grad_norm": 0.17578125, + "learning_rate": 0.00019672593622727723, + "loss": 0.4973, + "step": 151830 + }, + { + "epoch": 7.541472136684216, + "grad_norm": 0.16796875, + "learning_rate": 0.0001966862024436277, + "loss": 0.5094, + "step": 151840 + }, + { + "epoch": 7.541968808979835, + "grad_norm": 0.1318359375, + "learning_rate": 0.00019664646865997815, + "loss": 0.4825, + "step": 151850 + }, + { + "epoch": 7.542465481275454, + "grad_norm": 0.177734375, + "learning_rate": 0.00019660673487632862, + "loss": 0.4917, + "step": 151860 + }, + { + "epoch": 7.542962153571073, + "grad_norm": 0.15234375, + "learning_rate": 0.00019656700109267907, + "loss": 0.4823, + "step": 151870 + }, + { + "epoch": 7.5434588258666935, + "grad_norm": 0.1328125, + "learning_rate": 0.0001965272673090295, + "loss": 0.4685, + "step": 151880 + }, + { + "epoch": 7.543955498162313, + "grad_norm": 0.14453125, + "learning_rate": 0.00019648753352537995, + "loss": 0.4929, + "step": 151890 + }, + { + "epoch": 7.544452170457932, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019644779974173042, + "loss": 0.498, + "step": 151900 + }, + { + "epoch": 7.544948842753551, + "grad_norm": 0.21484375, + "learning_rate": 0.00019640806595808087, + "loss": 0.5111, + "step": 151910 + }, + { + "epoch": 7.54544551504917, + "grad_norm": 0.16796875, + "learning_rate": 0.0001963683321744313, + "loss": 0.4959, + "step": 151920 + }, + { + "epoch": 7.54594218734479, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019632859839078176, + "loss": 0.4935, + "step": 151930 + }, + { + "epoch": 7.546438859640409, + "grad_norm": 0.158203125, + "learning_rate": 0.00019628886460713223, + "loss": 0.5074, + "step": 151940 + }, + { + "epoch": 7.546935531936029, + "grad_norm": 0.142578125, + "learning_rate": 0.00019624913082348267, + "loss": 0.4886, + "step": 151950 + }, + { + "epoch": 7.547432204231648, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019620939703983312, + "loss": 0.4911, + "step": 151960 + }, + { + "epoch": 7.547928876527267, + "grad_norm": 0.166015625, + "learning_rate": 0.0001961696632561836, + "loss": 0.4812, + "step": 151970 + }, + { + "epoch": 7.548425548822887, + "grad_norm": 0.14453125, + "learning_rate": 0.00019612992947253403, + "loss": 0.498, + "step": 151980 + }, + { + "epoch": 7.548922221118506, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001960901956888845, + "loss": 0.4915, + "step": 151990 + }, + { + "epoch": 7.549418893414125, + "grad_norm": 0.134765625, + "learning_rate": 0.00019605046190523492, + "loss": 0.5126, + "step": 152000 + }, + { + "epoch": 7.549915565709744, + "grad_norm": 0.1484375, + "learning_rate": 0.0001960107281215854, + "loss": 0.4934, + "step": 152010 + }, + { + "epoch": 7.550412238005364, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019597099433793584, + "loss": 0.5036, + "step": 152020 + }, + { + "epoch": 7.550908910300984, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001959312605542863, + "loss": 0.5045, + "step": 152030 + }, + { + "epoch": 7.551405582596603, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019589152677063673, + "loss": 0.5075, + "step": 152040 + }, + { + "epoch": 7.551902254892222, + "grad_norm": 0.21875, + "learning_rate": 0.0001958517929869872, + "loss": 0.4903, + "step": 152050 + }, + { + "epoch": 7.552398927187841, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019581205920333764, + "loss": 0.5088, + "step": 152060 + }, + { + "epoch": 7.552895599483461, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001957723254196881, + "loss": 0.4865, + "step": 152070 + }, + { + "epoch": 7.55339227177908, + "grad_norm": 0.17578125, + "learning_rate": 0.00019573259163603853, + "loss": 0.4959, + "step": 152080 + }, + { + "epoch": 7.5538889440747, + "grad_norm": 0.1318359375, + "learning_rate": 0.000195692857852389, + "loss": 0.4927, + "step": 152090 + }, + { + "epoch": 7.554385616370319, + "grad_norm": 0.15625, + "learning_rate": 0.00019565312406873945, + "loss": 0.4903, + "step": 152100 + }, + { + "epoch": 7.554882288665938, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019561339028508992, + "loss": 0.4768, + "step": 152110 + }, + { + "epoch": 7.555378960961558, + "grad_norm": 0.1396484375, + "learning_rate": 0.00019557365650144036, + "loss": 0.466, + "step": 152120 + }, + { + "epoch": 7.555875633257177, + "grad_norm": 0.158203125, + "learning_rate": 0.0001955339227177908, + "loss": 0.4701, + "step": 152130 + }, + { + "epoch": 7.556372305552796, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019549418893414128, + "loss": 0.4693, + "step": 152140 + }, + { + "epoch": 7.556868977848415, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019545445515049172, + "loss": 0.5133, + "step": 152150 + }, + { + "epoch": 7.557365650144035, + "grad_norm": 0.14453125, + "learning_rate": 0.00019541472136684217, + "loss": 0.4872, + "step": 152160 + }, + { + "epoch": 7.557862322439655, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001953749875831926, + "loss": 0.4954, + "step": 152170 + }, + { + "epoch": 7.558358994735274, + "grad_norm": 0.15234375, + "learning_rate": 0.00019533525379954308, + "loss": 0.486, + "step": 152180 + }, + { + "epoch": 7.558855667030893, + "grad_norm": 0.1396484375, + "learning_rate": 0.00019529552001589353, + "loss": 0.5108, + "step": 152190 + }, + { + "epoch": 7.559352339326512, + "grad_norm": 0.1396484375, + "learning_rate": 0.00019525578623224397, + "loss": 0.4682, + "step": 152200 + }, + { + "epoch": 7.5598490116221315, + "grad_norm": 0.1357421875, + "learning_rate": 0.0001952160524485944, + "loss": 0.4867, + "step": 152210 + }, + { + "epoch": 7.560345683917751, + "grad_norm": 0.1279296875, + "learning_rate": 0.00019517631866494488, + "loss": 0.4885, + "step": 152220 + }, + { + "epoch": 7.560842356213371, + "grad_norm": 0.146484375, + "learning_rate": 0.00019513658488129533, + "loss": 0.5012, + "step": 152230 + }, + { + "epoch": 7.56133902850899, + "grad_norm": 0.1435546875, + "learning_rate": 0.00019509685109764577, + "loss": 0.4948, + "step": 152240 + }, + { + "epoch": 7.561835700804609, + "grad_norm": 0.166015625, + "learning_rate": 0.00019505711731399622, + "loss": 0.5236, + "step": 152250 + }, + { + "epoch": 7.5623323731002285, + "grad_norm": 0.1416015625, + "learning_rate": 0.0001950173835303467, + "loss": 0.4801, + "step": 152260 + }, + { + "epoch": 7.562829045395848, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019497764974669713, + "loss": 0.5022, + "step": 152270 + }, + { + "epoch": 7.563325717691467, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001949379159630476, + "loss": 0.4811, + "step": 152280 + }, + { + "epoch": 7.563822389987086, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019489818217939805, + "loss": 0.5052, + "step": 152290 + }, + { + "epoch": 7.564319062282706, + "grad_norm": 0.15625, + "learning_rate": 0.0001948584483957485, + "loss": 0.5147, + "step": 152300 + }, + { + "epoch": 7.564815734578326, + "grad_norm": 0.150390625, + "learning_rate": 0.00019481871461209896, + "loss": 0.4936, + "step": 152310 + }, + { + "epoch": 7.565312406873945, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001947789808284494, + "loss": 0.4627, + "step": 152320 + }, + { + "epoch": 7.565809079169564, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019473924704479985, + "loss": 0.4492, + "step": 152330 + }, + { + "epoch": 7.566305751465183, + "grad_norm": 0.162109375, + "learning_rate": 0.0001946995132611503, + "loss": 0.4764, + "step": 152340 + }, + { + "epoch": 7.5668024237608025, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019465977947750077, + "loss": 0.5023, + "step": 152350 + }, + { + "epoch": 7.567299096056422, + "grad_norm": 0.14453125, + "learning_rate": 0.0001946200456938512, + "loss": 0.5026, + "step": 152360 + }, + { + "epoch": 7.567795768352042, + "grad_norm": 0.19140625, + "learning_rate": 0.00019458031191020166, + "loss": 0.4972, + "step": 152370 + }, + { + "epoch": 7.568292440647661, + "grad_norm": 0.1376953125, + "learning_rate": 0.0001945405781265521, + "loss": 0.4831, + "step": 152380 + }, + { + "epoch": 7.56878911294328, + "grad_norm": 0.158203125, + "learning_rate": 0.00019450084434290257, + "loss": 0.4975, + "step": 152390 + }, + { + "epoch": 7.5692857852388995, + "grad_norm": 0.140625, + "learning_rate": 0.00019446111055925302, + "loss": 0.4622, + "step": 152400 + }, + { + "epoch": 7.569782457534519, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019442137677560346, + "loss": 0.5062, + "step": 152410 + }, + { + "epoch": 7.570279129830138, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001943816429919539, + "loss": 0.4771, + "step": 152420 + }, + { + "epoch": 7.570775802125757, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019434190920830438, + "loss": 0.4967, + "step": 152430 + }, + { + "epoch": 7.571272474421376, + "grad_norm": 0.13671875, + "learning_rate": 0.00019430217542465482, + "loss": 0.492, + "step": 152440 + }, + { + "epoch": 7.571769146716996, + "grad_norm": 0.1337890625, + "learning_rate": 0.00019426244164100527, + "loss": 0.5033, + "step": 152450 + }, + { + "epoch": 7.572265819012616, + "grad_norm": 0.1455078125, + "learning_rate": 0.00019422270785735574, + "loss": 0.4958, + "step": 152460 + }, + { + "epoch": 7.572762491308235, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019418297407370618, + "loss": 0.4741, + "step": 152470 + }, + { + "epoch": 7.573259163603854, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019414324029005663, + "loss": 0.4976, + "step": 152480 + }, + { + "epoch": 7.573755835899473, + "grad_norm": 0.169921875, + "learning_rate": 0.00019410350650640707, + "loss": 0.4931, + "step": 152490 + }, + { + "epoch": 7.574252508195093, + "grad_norm": 0.140625, + "learning_rate": 0.00019406377272275754, + "loss": 0.487, + "step": 152500 + }, + { + "epoch": 7.574749180490712, + "grad_norm": 0.15625, + "learning_rate": 0.00019402403893910798, + "loss": 0.4996, + "step": 152510 + }, + { + "epoch": 7.575245852786331, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019398430515545846, + "loss": 0.4514, + "step": 152520 + }, + { + "epoch": 7.575742525081951, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019394457137180887, + "loss": 0.4785, + "step": 152530 + }, + { + "epoch": 7.5762391973775705, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019390483758815934, + "loss": 0.499, + "step": 152540 + }, + { + "epoch": 7.57673586967319, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001938651038045098, + "loss": 0.4918, + "step": 152550 + }, + { + "epoch": 7.577232541968809, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019382537002086026, + "loss": 0.47, + "step": 152560 + }, + { + "epoch": 7.577729214264428, + "grad_norm": 0.162109375, + "learning_rate": 0.00019378563623721068, + "loss": 0.4788, + "step": 152570 + }, + { + "epoch": 7.578225886560047, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019374590245356115, + "loss": 0.4927, + "step": 152580 + }, + { + "epoch": 7.578722558855667, + "grad_norm": 0.166015625, + "learning_rate": 0.0001937061686699116, + "loss": 0.4955, + "step": 152590 + }, + { + "epoch": 7.579219231151287, + "grad_norm": 0.14453125, + "learning_rate": 0.00019366643488626206, + "loss": 0.4595, + "step": 152600 + }, + { + "epoch": 7.579715903446906, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001936267011026125, + "loss": 0.482, + "step": 152610 + }, + { + "epoch": 7.580212575742525, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019358696731896295, + "loss": 0.4559, + "step": 152620 + }, + { + "epoch": 7.580709248038144, + "grad_norm": 0.1640625, + "learning_rate": 0.0001935472335353134, + "loss": 0.4868, + "step": 152630 + }, + { + "epoch": 7.581205920333764, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019350749975166387, + "loss": 0.498, + "step": 152640 + }, + { + "epoch": 7.581702592629383, + "grad_norm": 0.1796875, + "learning_rate": 0.0001934677659680143, + "loss": 0.5297, + "step": 152650 + }, + { + "epoch": 7.582199264925002, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019342803218436476, + "loss": 0.4542, + "step": 152660 + }, + { + "epoch": 7.582695937220622, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019338829840071523, + "loss": 0.4785, + "step": 152670 + }, + { + "epoch": 7.583192609516241, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019334856461706567, + "loss": 0.4759, + "step": 152680 + }, + { + "epoch": 7.583689281811861, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019330883083341612, + "loss": 0.5101, + "step": 152690 + }, + { + "epoch": 7.58418595410748, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019326909704976656, + "loss": 0.4911, + "step": 152700 + }, + { + "epoch": 7.584682626403099, + "grad_norm": 0.1416015625, + "learning_rate": 0.00019322936326611703, + "loss": 0.4462, + "step": 152710 + }, + { + "epoch": 7.585179298698718, + "grad_norm": 0.171875, + "learning_rate": 0.00019318962948246748, + "loss": 0.4964, + "step": 152720 + }, + { + "epoch": 7.5856759709943375, + "grad_norm": 0.18359375, + "learning_rate": 0.00019314989569881795, + "loss": 0.4997, + "step": 152730 + }, + { + "epoch": 7.586172643289958, + "grad_norm": 0.158203125, + "learning_rate": 0.00019311016191516837, + "loss": 0.4711, + "step": 152740 + }, + { + "epoch": 7.586669315585577, + "grad_norm": 0.169921875, + "learning_rate": 0.00019307042813151884, + "loss": 0.4773, + "step": 152750 + }, + { + "epoch": 7.587165987881196, + "grad_norm": 0.1416015625, + "learning_rate": 0.00019303069434786928, + "loss": 0.4628, + "step": 152760 + }, + { + "epoch": 7.587662660176815, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019299096056421975, + "loss": 0.4686, + "step": 152770 + }, + { + "epoch": 7.588159332472435, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019295122678057017, + "loss": 0.4697, + "step": 152780 + }, + { + "epoch": 7.588656004768054, + "grad_norm": 0.1357421875, + "learning_rate": 0.00019291149299692064, + "loss": 0.4738, + "step": 152790 + }, + { + "epoch": 7.589152677063673, + "grad_norm": 0.1455078125, + "learning_rate": 0.00019287175921327109, + "loss": 0.4983, + "step": 152800 + }, + { + "epoch": 7.589649349359293, + "grad_norm": 0.14453125, + "learning_rate": 0.00019283202542962156, + "loss": 0.4926, + "step": 152810 + }, + { + "epoch": 7.590146021654912, + "grad_norm": 0.15234375, + "learning_rate": 0.000192792291645972, + "loss": 0.4742, + "step": 152820 + }, + { + "epoch": 7.590642693950532, + "grad_norm": 0.154296875, + "learning_rate": 0.00019275255786232244, + "loss": 0.4904, + "step": 152830 + }, + { + "epoch": 7.591139366246151, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019271282407867292, + "loss": 0.4825, + "step": 152840 + }, + { + "epoch": 7.59163603854177, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019267309029502336, + "loss": 0.4604, + "step": 152850 + }, + { + "epoch": 7.592132710837389, + "grad_norm": 0.1484375, + "learning_rate": 0.0001926333565113738, + "loss": 0.4852, + "step": 152860 + }, + { + "epoch": 7.5926293831330085, + "grad_norm": 0.1640625, + "learning_rate": 0.00019259362272772425, + "loss": 0.4775, + "step": 152870 + }, + { + "epoch": 7.593126055428629, + "grad_norm": 0.177734375, + "learning_rate": 0.00019255388894407472, + "loss": 0.4789, + "step": 152880 + }, + { + "epoch": 7.593622727724248, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019251415516042516, + "loss": 0.4874, + "step": 152890 + }, + { + "epoch": 7.594119400019867, + "grad_norm": 0.134765625, + "learning_rate": 0.0001924744213767756, + "loss": 0.4828, + "step": 152900 + }, + { + "epoch": 7.594616072315486, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019243468759312605, + "loss": 0.4898, + "step": 152910 + }, + { + "epoch": 7.5951127446111055, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019239495380947652, + "loss": 0.4894, + "step": 152920 + }, + { + "epoch": 7.595609416906725, + "grad_norm": 0.150390625, + "learning_rate": 0.00019235522002582697, + "loss": 0.5232, + "step": 152930 + }, + { + "epoch": 7.596106089202344, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001923154862421774, + "loss": 0.4549, + "step": 152940 + }, + { + "epoch": 7.596602761497964, + "grad_norm": 0.1455078125, + "learning_rate": 0.00019227575245852786, + "loss": 0.469, + "step": 152950 + }, + { + "epoch": 7.597099433793583, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019223601867487833, + "loss": 0.517, + "step": 152960 + }, + { + "epoch": 7.5975961060892026, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019219628489122877, + "loss": 0.4966, + "step": 152970 + }, + { + "epoch": 7.598092778384822, + "grad_norm": 0.1396484375, + "learning_rate": 0.00019215655110757922, + "loss": 0.4698, + "step": 152980 + }, + { + "epoch": 7.598589450680441, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001921168173239297, + "loss": 0.4923, + "step": 152990 + }, + { + "epoch": 7.59908612297606, + "grad_norm": 0.154296875, + "learning_rate": 0.00019207708354028013, + "loss": 0.4954, + "step": 153000 + }, + { + "epoch": 7.5995827952716795, + "grad_norm": 0.138671875, + "learning_rate": 0.0001920373497566306, + "loss": 0.5022, + "step": 153010 + }, + { + "epoch": 7.6000794675673, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019199761597298102, + "loss": 0.4913, + "step": 153020 + }, + { + "epoch": 7.600576139862919, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001919578821893315, + "loss": 0.5118, + "step": 153030 + }, + { + "epoch": 7.601072812158538, + "grad_norm": 0.1484375, + "learning_rate": 0.00019191814840568194, + "loss": 0.4902, + "step": 153040 + }, + { + "epoch": 7.601569484454157, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001918784146220324, + "loss": 0.4935, + "step": 153050 + }, + { + "epoch": 7.6020661567497765, + "grad_norm": 0.14453125, + "learning_rate": 0.00019183868083838283, + "loss": 0.498, + "step": 153060 + }, + { + "epoch": 7.602562829045396, + "grad_norm": 0.14453125, + "learning_rate": 0.0001917989470547333, + "loss": 0.4913, + "step": 153070 + }, + { + "epoch": 7.603059501341015, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019175921327108374, + "loss": 0.477, + "step": 153080 + }, + { + "epoch": 7.603556173636634, + "grad_norm": 0.16796875, + "learning_rate": 0.0001917194794874342, + "loss": 0.4654, + "step": 153090 + }, + { + "epoch": 7.604052845932254, + "grad_norm": 0.142578125, + "learning_rate": 0.00019167974570378463, + "loss": 0.4991, + "step": 153100 + }, + { + "epoch": 7.6045495182278735, + "grad_norm": 0.146484375, + "learning_rate": 0.0001916400119201351, + "loss": 0.4697, + "step": 153110 + }, + { + "epoch": 7.605046190523493, + "grad_norm": 0.154296875, + "learning_rate": 0.00019160027813648555, + "loss": 0.4818, + "step": 153120 + }, + { + "epoch": 7.605542862819112, + "grad_norm": 0.154296875, + "learning_rate": 0.00019156054435283602, + "loss": 0.4834, + "step": 153130 + }, + { + "epoch": 7.606039535114731, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019152081056918646, + "loss": 0.4731, + "step": 153140 + }, + { + "epoch": 7.60653620741035, + "grad_norm": 0.1298828125, + "learning_rate": 0.0001914810767855369, + "loss": 0.4613, + "step": 153150 + }, + { + "epoch": 7.60703287970597, + "grad_norm": 0.1435546875, + "learning_rate": 0.00019144134300188738, + "loss": 0.5042, + "step": 153160 + }, + { + "epoch": 7.607529552001589, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019140160921823782, + "loss": 0.5001, + "step": 153170 + }, + { + "epoch": 7.608026224297209, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019136187543458826, + "loss": 0.484, + "step": 153180 + }, + { + "epoch": 7.608522896592828, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001913221416509387, + "loss": 0.4745, + "step": 153190 + }, + { + "epoch": 7.609019568888447, + "grad_norm": 0.1435546875, + "learning_rate": 0.00019128240786728918, + "loss": 0.4809, + "step": 153200 + }, + { + "epoch": 7.609516241184067, + "grad_norm": 0.16796875, + "learning_rate": 0.00019124267408363962, + "loss": 0.4721, + "step": 153210 + }, + { + "epoch": 7.610012913479686, + "grad_norm": 0.140625, + "learning_rate": 0.0001912029402999901, + "loss": 0.4792, + "step": 153220 + }, + { + "epoch": 7.610509585775305, + "grad_norm": 0.146484375, + "learning_rate": 0.0001911632065163405, + "loss": 0.4953, + "step": 153230 + }, + { + "epoch": 7.611006258070924, + "grad_norm": 0.1357421875, + "learning_rate": 0.00019112347273269098, + "loss": 0.4718, + "step": 153240 + }, + { + "epoch": 7.6115029303665445, + "grad_norm": 0.15625, + "learning_rate": 0.00019108373894904143, + "loss": 0.5329, + "step": 153250 + }, + { + "epoch": 7.611999602662164, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001910440051653919, + "loss": 0.5006, + "step": 153260 + }, + { + "epoch": 7.612496274957783, + "grad_norm": 0.158203125, + "learning_rate": 0.00019100427138174232, + "loss": 0.4735, + "step": 153270 + }, + { + "epoch": 7.612992947253402, + "grad_norm": 0.14453125, + "learning_rate": 0.0001909645375980928, + "loss": 0.5047, + "step": 153280 + }, + { + "epoch": 7.613489619549021, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019092480381444323, + "loss": 0.5039, + "step": 153290 + }, + { + "epoch": 7.613986291844641, + "grad_norm": 0.1376953125, + "learning_rate": 0.0001908850700307937, + "loss": 0.4499, + "step": 153300 + }, + { + "epoch": 7.61448296414026, + "grad_norm": 0.1416015625, + "learning_rate": 0.00019084533624714415, + "loss": 0.5066, + "step": 153310 + }, + { + "epoch": 7.61497963643588, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001908056024634946, + "loss": 0.4999, + "step": 153320 + }, + { + "epoch": 7.615476308731499, + "grad_norm": 0.15234375, + "learning_rate": 0.00019076586867984504, + "loss": 0.5119, + "step": 153330 + }, + { + "epoch": 7.615972981027118, + "grad_norm": 0.16796875, + "learning_rate": 0.0001907261348961955, + "loss": 0.489, + "step": 153340 + }, + { + "epoch": 7.616469653322738, + "grad_norm": 0.146484375, + "learning_rate": 0.00019068640111254595, + "loss": 0.4963, + "step": 153350 + }, + { + "epoch": 7.616966325618357, + "grad_norm": 0.162109375, + "learning_rate": 0.0001906466673288964, + "loss": 0.5032, + "step": 153360 + }, + { + "epoch": 7.617462997913976, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019060693354524687, + "loss": 0.4976, + "step": 153370 + }, + { + "epoch": 7.617959670209595, + "grad_norm": 0.17578125, + "learning_rate": 0.0001905671997615973, + "loss": 0.4896, + "step": 153380 + }, + { + "epoch": 7.618456342505215, + "grad_norm": 0.1640625, + "learning_rate": 0.00019052746597794776, + "loss": 0.5082, + "step": 153390 + }, + { + "epoch": 7.618953014800835, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001904877321942982, + "loss": 0.495, + "step": 153400 + }, + { + "epoch": 7.619449687096454, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019044799841064867, + "loss": 0.5106, + "step": 153410 + }, + { + "epoch": 7.619946359392073, + "grad_norm": 0.14453125, + "learning_rate": 0.00019040826462699912, + "loss": 0.4946, + "step": 153420 + }, + { + "epoch": 7.620443031687692, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019036853084334956, + "loss": 0.4776, + "step": 153430 + }, + { + "epoch": 7.6209397039833116, + "grad_norm": 0.1484375, + "learning_rate": 0.0001903287970597, + "loss": 0.5085, + "step": 153440 + }, + { + "epoch": 7.621436376278931, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019028906327605048, + "loss": 0.5323, + "step": 153450 + }, + { + "epoch": 7.621933048574551, + "grad_norm": 0.158203125, + "learning_rate": 0.00019024932949240092, + "loss": 0.4859, + "step": 153460 + }, + { + "epoch": 7.62242972087017, + "grad_norm": 0.1455078125, + "learning_rate": 0.00019020959570875136, + "loss": 0.5257, + "step": 153470 + }, + { + "epoch": 7.622926393165789, + "grad_norm": 0.1953125, + "learning_rate": 0.0001901698619251018, + "loss": 0.4878, + "step": 153480 + }, + { + "epoch": 7.623423065461409, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019013012814145228, + "loss": 0.4682, + "step": 153490 + }, + { + "epoch": 7.623919737757028, + "grad_norm": 0.1376953125, + "learning_rate": 0.00019009039435780272, + "loss": 0.4855, + "step": 153500 + }, + { + "epoch": 7.624416410052647, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019005066057415317, + "loss": 0.4881, + "step": 153510 + }, + { + "epoch": 7.624913082348266, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019001092679050364, + "loss": 0.4935, + "step": 153520 + }, + { + "epoch": 7.625409754643886, + "grad_norm": 0.166015625, + "learning_rate": 0.00018997119300685408, + "loss": 0.4882, + "step": 153530 + }, + { + "epoch": 7.625906426939506, + "grad_norm": 0.1533203125, + "learning_rate": 0.00018993145922320456, + "loss": 0.5026, + "step": 153540 + }, + { + "epoch": 7.626403099235125, + "grad_norm": 0.1865234375, + "learning_rate": 0.000189891725439555, + "loss": 0.4697, + "step": 153550 + }, + { + "epoch": 7.626899771530744, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018985199165590544, + "loss": 0.493, + "step": 153560 + }, + { + "epoch": 7.627396443826363, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001898122578722559, + "loss": 0.5156, + "step": 153570 + }, + { + "epoch": 7.6278931161219825, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018977252408860636, + "loss": 0.5061, + "step": 153580 + }, + { + "epoch": 7.628389788417602, + "grad_norm": 0.1640625, + "learning_rate": 0.0001897327903049568, + "loss": 0.4553, + "step": 153590 + }, + { + "epoch": 7.628886460713222, + "grad_norm": 0.1396484375, + "learning_rate": 0.00018969305652130725, + "loss": 0.4785, + "step": 153600 + }, + { + "epoch": 7.629383133008841, + "grad_norm": 0.154296875, + "learning_rate": 0.0001896533227376577, + "loss": 0.4759, + "step": 153610 + }, + { + "epoch": 7.62987980530446, + "grad_norm": 0.1787109375, + "learning_rate": 0.00018961358895400816, + "loss": 0.4862, + "step": 153620 + }, + { + "epoch": 7.6303764776000795, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001895738551703586, + "loss": 0.5018, + "step": 153630 + }, + { + "epoch": 7.630873149895699, + "grad_norm": 0.138671875, + "learning_rate": 0.00018953412138670905, + "loss": 0.484, + "step": 153640 + }, + { + "epoch": 7.631369822191318, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001894943876030595, + "loss": 0.5266, + "step": 153650 + }, + { + "epoch": 7.631866494486937, + "grad_norm": 0.15234375, + "learning_rate": 0.00018945465381940997, + "loss": 0.4716, + "step": 153660 + }, + { + "epoch": 7.632363166782557, + "grad_norm": 0.138671875, + "learning_rate": 0.0001894149200357604, + "loss": 0.4972, + "step": 153670 + }, + { + "epoch": 7.632859839078177, + "grad_norm": 0.1357421875, + "learning_rate": 0.00018937518625211086, + "loss": 0.4969, + "step": 153680 + }, + { + "epoch": 7.633356511373796, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018933545246846133, + "loss": 0.5109, + "step": 153690 + }, + { + "epoch": 7.633853183669415, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018929571868481177, + "loss": 0.4701, + "step": 153700 + }, + { + "epoch": 7.634349855965034, + "grad_norm": 0.1396484375, + "learning_rate": 0.00018925598490116224, + "loss": 0.4861, + "step": 153710 + }, + { + "epoch": 7.6348465282606535, + "grad_norm": 0.1396484375, + "learning_rate": 0.00018921625111751266, + "loss": 0.508, + "step": 153720 + }, + { + "epoch": 7.635343200556273, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018917651733386313, + "loss": 0.4445, + "step": 153730 + }, + { + "epoch": 7.635839872851893, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018913678355021358, + "loss": 0.5065, + "step": 153740 + }, + { + "epoch": 7.636336545147512, + "grad_norm": 0.1533203125, + "learning_rate": 0.00018909704976656405, + "loss": 0.5, + "step": 153750 + }, + { + "epoch": 7.636833217443131, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018905731598291446, + "loss": 0.4926, + "step": 153760 + }, + { + "epoch": 7.6373298897387505, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018901758219926494, + "loss": 0.501, + "step": 153770 + }, + { + "epoch": 7.63782656203437, + "grad_norm": 0.1787109375, + "learning_rate": 0.00018897784841561538, + "loss": 0.4738, + "step": 153780 + }, + { + "epoch": 7.638323234329989, + "grad_norm": 0.15625, + "learning_rate": 0.00018893811463196585, + "loss": 0.4654, + "step": 153790 + }, + { + "epoch": 7.638819906625608, + "grad_norm": 0.1396484375, + "learning_rate": 0.00018889838084831627, + "loss": 0.4611, + "step": 153800 + }, + { + "epoch": 7.639316578921227, + "grad_norm": 0.1875, + "learning_rate": 0.00018885864706466674, + "loss": 0.5073, + "step": 153810 + }, + { + "epoch": 7.6398132512168475, + "grad_norm": 0.1435546875, + "learning_rate": 0.00018881891328101718, + "loss": 0.5262, + "step": 153820 + }, + { + "epoch": 7.640309923512467, + "grad_norm": 0.1865234375, + "learning_rate": 0.00018877917949736766, + "loss": 0.4801, + "step": 153830 + }, + { + "epoch": 7.640806595808086, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001887394457137181, + "loss": 0.4817, + "step": 153840 + }, + { + "epoch": 7.641303268103705, + "grad_norm": 0.1396484375, + "learning_rate": 0.00018869971193006854, + "loss": 0.4774, + "step": 153850 + }, + { + "epoch": 7.641799940399324, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018865997814641902, + "loss": 0.5228, + "step": 153860 + }, + { + "epoch": 7.642296612694944, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018862024436276946, + "loss": 0.4823, + "step": 153870 + }, + { + "epoch": 7.642793284990563, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001885805105791199, + "loss": 0.491, + "step": 153880 + }, + { + "epoch": 7.643289957286182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018854077679547035, + "loss": 0.4891, + "step": 153890 + }, + { + "epoch": 7.643786629581802, + "grad_norm": 0.1474609375, + "learning_rate": 0.00018850104301182082, + "loss": 0.4896, + "step": 153900 + }, + { + "epoch": 7.644283301877421, + "grad_norm": 0.1796875, + "learning_rate": 0.00018846130922817126, + "loss": 0.4967, + "step": 153910 + }, + { + "epoch": 7.644779974173041, + "grad_norm": 0.1484375, + "learning_rate": 0.0001884215754445217, + "loss": 0.4747, + "step": 153920 + }, + { + "epoch": 7.64527664646866, + "grad_norm": 0.169921875, + "learning_rate": 0.00018838184166087215, + "loss": 0.4724, + "step": 153930 + }, + { + "epoch": 7.645773318764279, + "grad_norm": 0.1689453125, + "learning_rate": 0.00018834210787722262, + "loss": 0.5126, + "step": 153940 + }, + { + "epoch": 7.646269991059898, + "grad_norm": 0.1513671875, + "learning_rate": 0.00018830237409357307, + "loss": 0.4926, + "step": 153950 + }, + { + "epoch": 7.646766663355518, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018826264030992354, + "loss": 0.5141, + "step": 153960 + }, + { + "epoch": 7.647263335651138, + "grad_norm": 0.1669921875, + "learning_rate": 0.00018822290652627396, + "loss": 0.4958, + "step": 153970 + }, + { + "epoch": 7.647760007946757, + "grad_norm": 0.154296875, + "learning_rate": 0.00018818317274262443, + "loss": 0.488, + "step": 153980 + }, + { + "epoch": 7.648256680242376, + "grad_norm": 0.158203125, + "learning_rate": 0.00018814343895897487, + "loss": 0.5178, + "step": 153990 + }, + { + "epoch": 7.648753352537995, + "grad_norm": 0.17578125, + "learning_rate": 0.00018810370517532534, + "loss": 0.4964, + "step": 154000 + }, + { + "epoch": 7.649250024833615, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001880639713916758, + "loss": 0.4906, + "step": 154010 + }, + { + "epoch": 7.649746697129234, + "grad_norm": 0.158203125, + "learning_rate": 0.00018802423760802623, + "loss": 0.4911, + "step": 154020 + }, + { + "epoch": 7.650243369424853, + "grad_norm": 0.1513671875, + "learning_rate": 0.00018798450382437668, + "loss": 0.5231, + "step": 154030 + }, + { + "epoch": 7.650740041720473, + "grad_norm": 0.138671875, + "learning_rate": 0.00018794477004072715, + "loss": 0.485, + "step": 154040 + }, + { + "epoch": 7.651236714016092, + "grad_norm": 0.15234375, + "learning_rate": 0.0001879050362570776, + "loss": 0.4854, + "step": 154050 + }, + { + "epoch": 7.651733386311712, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018786530247342804, + "loss": 0.5198, + "step": 154060 + }, + { + "epoch": 7.652230058607331, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001878255686897785, + "loss": 0.5184, + "step": 154070 + }, + { + "epoch": 7.65272673090295, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018778583490612895, + "loss": 0.5059, + "step": 154080 + }, + { + "epoch": 7.653223403198569, + "grad_norm": 0.1484375, + "learning_rate": 0.0001877461011224794, + "loss": 0.4949, + "step": 154090 + }, + { + "epoch": 7.6537200754941885, + "grad_norm": 0.181640625, + "learning_rate": 0.00018770636733882984, + "loss": 0.4948, + "step": 154100 + }, + { + "epoch": 7.654216747789809, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001876666335551803, + "loss": 0.462, + "step": 154110 + }, + { + "epoch": 7.654713420085428, + "grad_norm": 0.166015625, + "learning_rate": 0.00018762689977153076, + "loss": 0.4503, + "step": 154120 + }, + { + "epoch": 7.655210092381047, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001875871659878812, + "loss": 0.4983, + "step": 154130 + }, + { + "epoch": 7.655706764676666, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018754743220423164, + "loss": 0.4772, + "step": 154140 + }, + { + "epoch": 7.656203436972286, + "grad_norm": 0.1826171875, + "learning_rate": 0.00018750769842058212, + "loss": 0.5236, + "step": 154150 + }, + { + "epoch": 7.656700109267905, + "grad_norm": 0.173828125, + "learning_rate": 0.00018746796463693256, + "loss": 0.4789, + "step": 154160 + }, + { + "epoch": 7.657196781563524, + "grad_norm": 0.1572265625, + "learning_rate": 0.000187428230853283, + "loss": 0.4866, + "step": 154170 + }, + { + "epoch": 7.657693453859144, + "grad_norm": 0.1484375, + "learning_rate": 0.00018738849706963345, + "loss": 0.4892, + "step": 154180 + }, + { + "epoch": 7.658190126154763, + "grad_norm": 0.1513671875, + "learning_rate": 0.00018734876328598392, + "loss": 0.506, + "step": 154190 + }, + { + "epoch": 7.658686798450383, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018730902950233436, + "loss": 0.4922, + "step": 154200 + }, + { + "epoch": 7.659183470746002, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001872692957186848, + "loss": 0.4795, + "step": 154210 + }, + { + "epoch": 7.659680143041621, + "grad_norm": 0.1357421875, + "learning_rate": 0.00018722956193503528, + "loss": 0.5183, + "step": 154220 + }, + { + "epoch": 7.66017681533724, + "grad_norm": 0.16015625, + "learning_rate": 0.00018718982815138572, + "loss": 0.4634, + "step": 154230 + }, + { + "epoch": 7.6606734876328595, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001871500943677362, + "loss": 0.5115, + "step": 154240 + }, + { + "epoch": 7.66117015992848, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001871103605840866, + "loss": 0.5029, + "step": 154250 + }, + { + "epoch": 7.661666832224099, + "grad_norm": 0.169921875, + "learning_rate": 0.00018707062680043708, + "loss": 0.5018, + "step": 154260 + }, + { + "epoch": 7.662163504519718, + "grad_norm": 0.1357421875, + "learning_rate": 0.00018703089301678753, + "loss": 0.4781, + "step": 154270 + }, + { + "epoch": 7.662660176815337, + "grad_norm": 0.15234375, + "learning_rate": 0.000186991159233138, + "loss": 0.4766, + "step": 154280 + }, + { + "epoch": 7.6631568491109565, + "grad_norm": 0.146484375, + "learning_rate": 0.00018695142544948842, + "loss": 0.4757, + "step": 154290 + }, + { + "epoch": 7.663653521406576, + "grad_norm": 0.14453125, + "learning_rate": 0.0001869116916658389, + "loss": 0.5078, + "step": 154300 + }, + { + "epoch": 7.664150193702195, + "grad_norm": 0.12890625, + "learning_rate": 0.00018687195788218933, + "loss": 0.4786, + "step": 154310 + }, + { + "epoch": 7.664646865997815, + "grad_norm": 0.16015625, + "learning_rate": 0.0001868322240985398, + "loss": 0.5029, + "step": 154320 + }, + { + "epoch": 7.665143538293434, + "grad_norm": 0.19921875, + "learning_rate": 0.00018679249031489022, + "loss": 0.4848, + "step": 154330 + }, + { + "epoch": 7.6656402105890535, + "grad_norm": 0.158203125, + "learning_rate": 0.0001867527565312407, + "loss": 0.499, + "step": 154340 + }, + { + "epoch": 7.666136882884673, + "grad_norm": 0.154296875, + "learning_rate": 0.00018671302274759114, + "loss": 0.4852, + "step": 154350 + }, + { + "epoch": 7.666633555180292, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001866732889639416, + "loss": 0.4741, + "step": 154360 + }, + { + "epoch": 7.667130227475911, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018663355518029205, + "loss": 0.4989, + "step": 154370 + }, + { + "epoch": 7.6676268997715304, + "grad_norm": 0.1435546875, + "learning_rate": 0.0001865938213966425, + "loss": 0.5169, + "step": 154380 + }, + { + "epoch": 7.668123572067151, + "grad_norm": 0.150390625, + "learning_rate": 0.00018655408761299297, + "loss": 0.4676, + "step": 154390 + }, + { + "epoch": 7.66862024436277, + "grad_norm": 0.1376953125, + "learning_rate": 0.0001865143538293434, + "loss": 0.4761, + "step": 154400 + }, + { + "epoch": 7.669116916658389, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018647462004569388, + "loss": 0.5073, + "step": 154410 + }, + { + "epoch": 7.669613588954008, + "grad_norm": 0.1416015625, + "learning_rate": 0.0001864348862620443, + "loss": 0.4982, + "step": 154420 + }, + { + "epoch": 7.6701102612496275, + "grad_norm": 0.14453125, + "learning_rate": 0.00018639515247839477, + "loss": 0.4747, + "step": 154430 + }, + { + "epoch": 7.670606933545247, + "grad_norm": 0.1474609375, + "learning_rate": 0.00018635541869474522, + "loss": 0.4954, + "step": 154440 + }, + { + "epoch": 7.671103605840866, + "grad_norm": 0.154296875, + "learning_rate": 0.0001863156849110957, + "loss": 0.4677, + "step": 154450 + }, + { + "epoch": 7.671600278136486, + "grad_norm": 0.171875, + "learning_rate": 0.0001862759511274461, + "loss": 0.5054, + "step": 154460 + }, + { + "epoch": 7.672096950432105, + "grad_norm": 0.1396484375, + "learning_rate": 0.00018623621734379658, + "loss": 0.4732, + "step": 154470 + }, + { + "epoch": 7.6725936227277245, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018619648356014702, + "loss": 0.486, + "step": 154480 + }, + { + "epoch": 7.673090295023344, + "grad_norm": 0.146484375, + "learning_rate": 0.0001861567497764975, + "loss": 0.5007, + "step": 154490 + }, + { + "epoch": 7.673586967318963, + "grad_norm": 0.162109375, + "learning_rate": 0.0001861170159928479, + "loss": 0.4551, + "step": 154500 + }, + { + "epoch": 7.674083639614582, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018607728220919838, + "loss": 0.512, + "step": 154510 + }, + { + "epoch": 7.674580311910201, + "grad_norm": 0.1416015625, + "learning_rate": 0.00018603754842554882, + "loss": 0.5094, + "step": 154520 + }, + { + "epoch": 7.675076984205821, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001859978146418993, + "loss": 0.4922, + "step": 154530 + }, + { + "epoch": 7.67557365650144, + "grad_norm": 0.1669921875, + "learning_rate": 0.00018595808085824974, + "loss": 0.4668, + "step": 154540 + }, + { + "epoch": 7.67607032879706, + "grad_norm": 0.1396484375, + "learning_rate": 0.00018591834707460018, + "loss": 0.4956, + "step": 154550 + }, + { + "epoch": 7.676567001092679, + "grad_norm": 0.166015625, + "learning_rate": 0.00018587861329095066, + "loss": 0.5197, + "step": 154560 + }, + { + "epoch": 7.677063673388298, + "grad_norm": 0.15625, + "learning_rate": 0.0001858388795073011, + "loss": 0.466, + "step": 154570 + }, + { + "epoch": 7.677560345683918, + "grad_norm": 0.1435546875, + "learning_rate": 0.00018579914572365154, + "loss": 0.4589, + "step": 154580 + }, + { + "epoch": 7.678057017979537, + "grad_norm": 0.181640625, + "learning_rate": 0.000185759411940002, + "loss": 0.4978, + "step": 154590 + }, + { + "epoch": 7.678553690275156, + "grad_norm": 0.1396484375, + "learning_rate": 0.00018571967815635246, + "loss": 0.4784, + "step": 154600 + }, + { + "epoch": 7.679050362570775, + "grad_norm": 0.17578125, + "learning_rate": 0.0001856799443727029, + "loss": 0.4704, + "step": 154610 + }, + { + "epoch": 7.6795470348663954, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018564021058905335, + "loss": 0.4839, + "step": 154620 + }, + { + "epoch": 7.680043707162015, + "grad_norm": 0.154296875, + "learning_rate": 0.0001856004768054038, + "loss": 0.4889, + "step": 154630 + }, + { + "epoch": 7.680540379457634, + "grad_norm": 0.1513671875, + "learning_rate": 0.00018556074302175426, + "loss": 0.4979, + "step": 154640 + }, + { + "epoch": 7.681037051753253, + "grad_norm": 0.150390625, + "learning_rate": 0.0001855210092381047, + "loss": 0.5, + "step": 154650 + }, + { + "epoch": 7.681533724048872, + "grad_norm": 0.1376953125, + "learning_rate": 0.00018548127545445515, + "loss": 0.504, + "step": 154660 + }, + { + "epoch": 7.682030396344492, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001854415416708056, + "loss": 0.4719, + "step": 154670 + }, + { + "epoch": 7.682527068640111, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018540180788715607, + "loss": 0.4961, + "step": 154680 + }, + { + "epoch": 7.683023740935731, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001853620741035065, + "loss": 0.4992, + "step": 154690 + }, + { + "epoch": 7.68352041323135, + "grad_norm": 0.1923828125, + "learning_rate": 0.00018532234031985696, + "loss": 0.4729, + "step": 154700 + }, + { + "epoch": 7.684017085526969, + "grad_norm": 0.1533203125, + "learning_rate": 0.00018528260653620743, + "loss": 0.5064, + "step": 154710 + }, + { + "epoch": 7.684513757822589, + "grad_norm": 0.158203125, + "learning_rate": 0.00018524287275255787, + "loss": 0.4831, + "step": 154720 + }, + { + "epoch": 7.685010430118208, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018520313896890832, + "loss": 0.4868, + "step": 154730 + }, + { + "epoch": 7.685507102413827, + "grad_norm": 0.16015625, + "learning_rate": 0.00018516340518525876, + "loss": 0.4677, + "step": 154740 + }, + { + "epoch": 7.686003774709446, + "grad_norm": 0.15625, + "learning_rate": 0.00018512367140160923, + "loss": 0.5095, + "step": 154750 + }, + { + "epoch": 7.686500447005066, + "grad_norm": 0.1640625, + "learning_rate": 0.00018508393761795968, + "loss": 0.5201, + "step": 154760 + }, + { + "epoch": 7.686997119300686, + "grad_norm": 0.150390625, + "learning_rate": 0.00018504420383431015, + "loss": 0.4974, + "step": 154770 + }, + { + "epoch": 7.687493791596305, + "grad_norm": 0.1474609375, + "learning_rate": 0.00018500447005066056, + "loss": 0.479, + "step": 154780 + }, + { + "epoch": 7.687990463891924, + "grad_norm": 0.1904296875, + "learning_rate": 0.00018496473626701104, + "loss": 0.5237, + "step": 154790 + }, + { + "epoch": 7.688487136187543, + "grad_norm": 0.154296875, + "learning_rate": 0.00018492500248336148, + "loss": 0.4972, + "step": 154800 + }, + { + "epoch": 7.6889838084831625, + "grad_norm": 0.166015625, + "learning_rate": 0.00018488526869971195, + "loss": 0.5134, + "step": 154810 + }, + { + "epoch": 7.689480480778782, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001848455349160624, + "loss": 0.4614, + "step": 154820 + }, + { + "epoch": 7.689977153074402, + "grad_norm": 0.1484375, + "learning_rate": 0.00018480580113241284, + "loss": 0.508, + "step": 154830 + }, + { + "epoch": 7.690473825370021, + "grad_norm": 0.1435546875, + "learning_rate": 0.00018476606734876328, + "loss": 0.4976, + "step": 154840 + }, + { + "epoch": 7.69097049766564, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018472633356511376, + "loss": 0.4854, + "step": 154850 + }, + { + "epoch": 7.69146716996126, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001846865997814642, + "loss": 0.4516, + "step": 154860 + }, + { + "epoch": 7.691963842256879, + "grad_norm": 0.1416015625, + "learning_rate": 0.00018464686599781464, + "loss": 0.4901, + "step": 154870 + }, + { + "epoch": 7.692460514552498, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001846071322141651, + "loss": 0.484, + "step": 154880 + }, + { + "epoch": 7.692957186848117, + "grad_norm": 0.16015625, + "learning_rate": 0.00018456739843051556, + "loss": 0.446, + "step": 154890 + }, + { + "epoch": 7.693453859143737, + "grad_norm": 0.2041015625, + "learning_rate": 0.000184527664646866, + "loss": 0.4732, + "step": 154900 + }, + { + "epoch": 7.693950531439357, + "grad_norm": 0.1484375, + "learning_rate": 0.00018448793086321645, + "loss": 0.518, + "step": 154910 + }, + { + "epoch": 7.694447203734976, + "grad_norm": 0.150390625, + "learning_rate": 0.00018444819707956692, + "loss": 0.4795, + "step": 154920 + }, + { + "epoch": 7.694943876030595, + "grad_norm": 0.150390625, + "learning_rate": 0.00018440846329591736, + "loss": 0.5062, + "step": 154930 + }, + { + "epoch": 7.695440548326214, + "grad_norm": 0.1435546875, + "learning_rate": 0.00018436872951226784, + "loss": 0.4908, + "step": 154940 + }, + { + "epoch": 7.6959372206218335, + "grad_norm": 0.13671875, + "learning_rate": 0.00018432899572861825, + "loss": 0.4745, + "step": 154950 + }, + { + "epoch": 7.696433892917453, + "grad_norm": 0.146484375, + "learning_rate": 0.00018428926194496872, + "loss": 0.5064, + "step": 154960 + }, + { + "epoch": 7.696930565213073, + "grad_norm": 0.16796875, + "learning_rate": 0.00018424952816131917, + "loss": 0.4763, + "step": 154970 + }, + { + "epoch": 7.697427237508692, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018420979437766964, + "loss": 0.5162, + "step": 154980 + }, + { + "epoch": 7.697923909804311, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018417006059402006, + "loss": 0.4906, + "step": 154990 + }, + { + "epoch": 7.6984205820999305, + "grad_norm": 0.14453125, + "learning_rate": 0.00018413032681037053, + "loss": 0.4674, + "step": 155000 + }, + { + "epoch": 7.69891725439555, + "grad_norm": 0.1748046875, + "learning_rate": 0.00018409059302672097, + "loss": 0.5068, + "step": 155010 + }, + { + "epoch": 7.699413926691169, + "grad_norm": 0.1796875, + "learning_rate": 0.00018405085924307144, + "loss": 0.4977, + "step": 155020 + }, + { + "epoch": 7.699910598986788, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018401112545942186, + "loss": 0.4931, + "step": 155030 + }, + { + "epoch": 7.700407271282408, + "grad_norm": 0.189453125, + "learning_rate": 0.00018397139167577233, + "loss": 0.5055, + "step": 155040 + }, + { + "epoch": 7.7009039435780275, + "grad_norm": 0.171875, + "learning_rate": 0.00018393165789212278, + "loss": 0.4862, + "step": 155050 + }, + { + "epoch": 7.701400615873647, + "grad_norm": 0.1533203125, + "learning_rate": 0.00018389192410847325, + "loss": 0.4901, + "step": 155060 + }, + { + "epoch": 7.701897288169266, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001838521903248237, + "loss": 0.4955, + "step": 155070 + }, + { + "epoch": 7.702393960464885, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018381245654117414, + "loss": 0.4724, + "step": 155080 + }, + { + "epoch": 7.7028906327605045, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001837727227575246, + "loss": 0.4718, + "step": 155090 + }, + { + "epoch": 7.703387305056124, + "grad_norm": 0.158203125, + "learning_rate": 0.00018373298897387505, + "loss": 0.484, + "step": 155100 + }, + { + "epoch": 7.703883977351744, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001836932551902255, + "loss": 0.4619, + "step": 155110 + }, + { + "epoch": 7.704380649647363, + "grad_norm": 0.162109375, + "learning_rate": 0.00018365352140657594, + "loss": 0.5065, + "step": 155120 + }, + { + "epoch": 7.704877321942982, + "grad_norm": 0.146484375, + "learning_rate": 0.0001836137876229264, + "loss": 0.4808, + "step": 155130 + }, + { + "epoch": 7.7053739942386015, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018357405383927686, + "loss": 0.4847, + "step": 155140 + }, + { + "epoch": 7.705870666534221, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001835343200556273, + "loss": 0.4682, + "step": 155150 + }, + { + "epoch": 7.70636733882984, + "grad_norm": 0.1865234375, + "learning_rate": 0.00018349458627197774, + "loss": 0.504, + "step": 155160 + }, + { + "epoch": 7.706864011125459, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018345485248832822, + "loss": 0.4837, + "step": 155170 + }, + { + "epoch": 7.707360683421078, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018341511870467866, + "loss": 0.4956, + "step": 155180 + }, + { + "epoch": 7.7078573557166985, + "grad_norm": 0.16015625, + "learning_rate": 0.0001833753849210291, + "loss": 0.5038, + "step": 155190 + }, + { + "epoch": 7.708354028012318, + "grad_norm": 0.13671875, + "learning_rate": 0.00018333565113737955, + "loss": 0.4812, + "step": 155200 + }, + { + "epoch": 7.708850700307937, + "grad_norm": 0.1484375, + "learning_rate": 0.00018329591735373002, + "loss": 0.5041, + "step": 155210 + }, + { + "epoch": 7.709347372603556, + "grad_norm": 0.142578125, + "learning_rate": 0.00018325618357008046, + "loss": 0.4985, + "step": 155220 + }, + { + "epoch": 7.709844044899175, + "grad_norm": 0.140625, + "learning_rate": 0.00018321644978643094, + "loss": 0.4971, + "step": 155230 + }, + { + "epoch": 7.710340717194795, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018317671600278138, + "loss": 0.481, + "step": 155240 + }, + { + "epoch": 7.710837389490414, + "grad_norm": 0.154296875, + "learning_rate": 0.00018313698221913182, + "loss": 0.4365, + "step": 155250 + }, + { + "epoch": 7.711334061786033, + "grad_norm": 0.15625, + "learning_rate": 0.0001830972484354823, + "loss": 0.4678, + "step": 155260 + }, + { + "epoch": 7.711830734081653, + "grad_norm": 0.1416015625, + "learning_rate": 0.00018305751465183274, + "loss": 0.466, + "step": 155270 + }, + { + "epoch": 7.712327406377272, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018301778086818318, + "loss": 0.4843, + "step": 155280 + }, + { + "epoch": 7.712824078672892, + "grad_norm": 0.1416015625, + "learning_rate": 0.00018297804708453363, + "loss": 0.4702, + "step": 155290 + }, + { + "epoch": 7.713320750968511, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001829383133008841, + "loss": 0.4972, + "step": 155300 + }, + { + "epoch": 7.71381742326413, + "grad_norm": 0.1484375, + "learning_rate": 0.00018289857951723454, + "loss": 0.5, + "step": 155310 + }, + { + "epoch": 7.714314095559749, + "grad_norm": 0.1435546875, + "learning_rate": 0.000182858845733585, + "loss": 0.5008, + "step": 155320 + }, + { + "epoch": 7.714810767855369, + "grad_norm": 0.138671875, + "learning_rate": 0.00018281911194993543, + "loss": 0.4864, + "step": 155330 + }, + { + "epoch": 7.715307440150989, + "grad_norm": 0.158203125, + "learning_rate": 0.0001827793781662859, + "loss": 0.4841, + "step": 155340 + }, + { + "epoch": 7.715804112446608, + "grad_norm": 0.1376953125, + "learning_rate": 0.00018273964438263635, + "loss": 0.4886, + "step": 155350 + }, + { + "epoch": 7.716300784742227, + "grad_norm": 0.16796875, + "learning_rate": 0.0001826999105989868, + "loss": 0.5069, + "step": 155360 + }, + { + "epoch": 7.716797457037846, + "grad_norm": 0.1513671875, + "learning_rate": 0.00018266017681533724, + "loss": 0.4672, + "step": 155370 + }, + { + "epoch": 7.717294129333466, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001826204430316877, + "loss": 0.4969, + "step": 155380 + }, + { + "epoch": 7.717790801629085, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018258070924803815, + "loss": 0.5053, + "step": 155390 + }, + { + "epoch": 7.718287473924704, + "grad_norm": 0.1416015625, + "learning_rate": 0.0001825409754643886, + "loss": 0.4804, + "step": 155400 + }, + { + "epoch": 7.718784146220324, + "grad_norm": 0.14453125, + "learning_rate": 0.00018250124168073907, + "loss": 0.4871, + "step": 155410 + }, + { + "epoch": 7.719280818515943, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001824615078970895, + "loss": 0.5047, + "step": 155420 + }, + { + "epoch": 7.719777490811563, + "grad_norm": 0.146484375, + "learning_rate": 0.00018242177411343996, + "loss": 0.4768, + "step": 155430 + }, + { + "epoch": 7.720274163107182, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001823820403297904, + "loss": 0.4859, + "step": 155440 + }, + { + "epoch": 7.720770835402801, + "grad_norm": 0.150390625, + "learning_rate": 0.00018234230654614087, + "loss": 0.4843, + "step": 155450 + }, + { + "epoch": 7.72126750769842, + "grad_norm": 0.15625, + "learning_rate": 0.00018230257276249132, + "loss": 0.4914, + "step": 155460 + }, + { + "epoch": 7.7217641799940395, + "grad_norm": 0.169921875, + "learning_rate": 0.0001822628389788418, + "loss": 0.4985, + "step": 155470 + }, + { + "epoch": 7.72226085228966, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001822231051951922, + "loss": 0.4893, + "step": 155480 + }, + { + "epoch": 7.722757524585279, + "grad_norm": 0.146484375, + "learning_rate": 0.00018218337141154268, + "loss": 0.5087, + "step": 155490 + }, + { + "epoch": 7.723254196880898, + "grad_norm": 0.150390625, + "learning_rate": 0.00018214363762789312, + "loss": 0.4684, + "step": 155500 + }, + { + "epoch": 7.723750869176517, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001821039038442436, + "loss": 0.492, + "step": 155510 + }, + { + "epoch": 7.7242475414721365, + "grad_norm": 0.1533203125, + "learning_rate": 0.000182064170060594, + "loss": 0.4771, + "step": 155520 + }, + { + "epoch": 7.724744213767756, + "grad_norm": 0.16015625, + "learning_rate": 0.00018202443627694448, + "loss": 0.4561, + "step": 155530 + }, + { + "epoch": 7.725240886063375, + "grad_norm": 0.169921875, + "learning_rate": 0.00018198470249329492, + "loss": 0.4831, + "step": 155540 + }, + { + "epoch": 7.725737558358995, + "grad_norm": 0.16015625, + "learning_rate": 0.0001819449687096454, + "loss": 0.4935, + "step": 155550 + }, + { + "epoch": 7.726234230654614, + "grad_norm": 0.1376953125, + "learning_rate": 0.00018190523492599584, + "loss": 0.4966, + "step": 155560 + }, + { + "epoch": 7.726730902950234, + "grad_norm": 0.16015625, + "learning_rate": 0.00018186550114234628, + "loss": 0.48, + "step": 155570 + }, + { + "epoch": 7.727227575245853, + "grad_norm": 0.2060546875, + "learning_rate": 0.00018182576735869673, + "loss": 0.5067, + "step": 155580 + }, + { + "epoch": 7.727724247541472, + "grad_norm": 0.166015625, + "learning_rate": 0.0001817860335750472, + "loss": 0.4902, + "step": 155590 + }, + { + "epoch": 7.728220919837091, + "grad_norm": 0.146484375, + "learning_rate": 0.00018174629979139764, + "loss": 0.4832, + "step": 155600 + }, + { + "epoch": 7.7287175921327105, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001817065660077481, + "loss": 0.5113, + "step": 155610 + }, + { + "epoch": 7.729214264428331, + "grad_norm": 0.21484375, + "learning_rate": 0.00018166683222409856, + "loss": 0.4901, + "step": 155620 + }, + { + "epoch": 7.72971093672395, + "grad_norm": 0.162109375, + "learning_rate": 0.000181627098440449, + "loss": 0.4785, + "step": 155630 + }, + { + "epoch": 7.730207609019569, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018158736465679947, + "loss": 0.4777, + "step": 155640 + }, + { + "epoch": 7.730704281315188, + "grad_norm": 0.138671875, + "learning_rate": 0.0001815476308731499, + "loss": 0.4751, + "step": 155650 + }, + { + "epoch": 7.7312009536108075, + "grad_norm": 0.177734375, + "learning_rate": 0.00018150789708950036, + "loss": 0.4931, + "step": 155660 + }, + { + "epoch": 7.731697625906427, + "grad_norm": 0.177734375, + "learning_rate": 0.0001814681633058508, + "loss": 0.5069, + "step": 155670 + }, + { + "epoch": 7.732194298202046, + "grad_norm": 0.15234375, + "learning_rate": 0.00018142842952220128, + "loss": 0.5336, + "step": 155680 + }, + { + "epoch": 7.732690970497666, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001813886957385517, + "loss": 0.4794, + "step": 155690 + }, + { + "epoch": 7.733187642793285, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018134896195490217, + "loss": 0.4496, + "step": 155700 + }, + { + "epoch": 7.7336843150889045, + "grad_norm": 0.140625, + "learning_rate": 0.0001813092281712526, + "loss": 0.506, + "step": 155710 + }, + { + "epoch": 7.734180987384524, + "grad_norm": 0.1474609375, + "learning_rate": 0.00018126949438760308, + "loss": 0.4618, + "step": 155720 + }, + { + "epoch": 7.734677659680143, + "grad_norm": 0.154296875, + "learning_rate": 0.0001812297606039535, + "loss": 0.4909, + "step": 155730 + }, + { + "epoch": 7.735174331975762, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018119002682030397, + "loss": 0.5024, + "step": 155740 + }, + { + "epoch": 7.735671004271381, + "grad_norm": 0.1533203125, + "learning_rate": 0.00018115029303665442, + "loss": 0.5077, + "step": 155750 + }, + { + "epoch": 7.7361676765670015, + "grad_norm": 0.15234375, + "learning_rate": 0.0001811105592530049, + "loss": 0.4669, + "step": 155760 + }, + { + "epoch": 7.736664348862621, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018107082546935533, + "loss": 0.5059, + "step": 155770 + }, + { + "epoch": 7.73716102115824, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018103109168570578, + "loss": 0.49, + "step": 155780 + }, + { + "epoch": 7.737657693453859, + "grad_norm": 0.1474609375, + "learning_rate": 0.00018099135790205625, + "loss": 0.5105, + "step": 155790 + }, + { + "epoch": 7.7381543657494785, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001809516241184067, + "loss": 0.4722, + "step": 155800 + }, + { + "epoch": 7.738651038045098, + "grad_norm": 0.15234375, + "learning_rate": 0.00018091189033475714, + "loss": 0.484, + "step": 155810 + }, + { + "epoch": 7.739147710340717, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018087215655110758, + "loss": 0.4742, + "step": 155820 + }, + { + "epoch": 7.739644382636337, + "grad_norm": 0.1474609375, + "learning_rate": 0.00018083242276745805, + "loss": 0.5169, + "step": 155830 + }, + { + "epoch": 7.740141054931956, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001807926889838085, + "loss": 0.462, + "step": 155840 + }, + { + "epoch": 7.7406377272275755, + "grad_norm": 0.146484375, + "learning_rate": 0.00018075295520015894, + "loss": 0.4676, + "step": 155850 + }, + { + "epoch": 7.741134399523195, + "grad_norm": 0.205078125, + "learning_rate": 0.00018071322141650938, + "loss": 0.5185, + "step": 155860 + }, + { + "epoch": 7.741631071818814, + "grad_norm": 0.1484375, + "learning_rate": 0.00018067348763285986, + "loss": 0.4933, + "step": 155870 + }, + { + "epoch": 7.742127744114433, + "grad_norm": 0.154296875, + "learning_rate": 0.0001806337538492103, + "loss": 0.4851, + "step": 155880 + }, + { + "epoch": 7.742624416410052, + "grad_norm": 0.1376953125, + "learning_rate": 0.00018059402006556074, + "loss": 0.498, + "step": 155890 + }, + { + "epoch": 7.743121088705672, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001805542862819112, + "loss": 0.4972, + "step": 155900 + }, + { + "epoch": 7.743617761001292, + "grad_norm": 0.1474609375, + "learning_rate": 0.00018051455249826166, + "loss": 0.4893, + "step": 155910 + }, + { + "epoch": 7.744114433296911, + "grad_norm": 0.146484375, + "learning_rate": 0.0001804748187146121, + "loss": 0.491, + "step": 155920 + }, + { + "epoch": 7.74461110559253, + "grad_norm": 0.171875, + "learning_rate": 0.00018043508493096255, + "loss": 0.5067, + "step": 155930 + }, + { + "epoch": 7.745107777888149, + "grad_norm": 0.150390625, + "learning_rate": 0.00018039535114731302, + "loss": 0.4909, + "step": 155940 + }, + { + "epoch": 7.745604450183769, + "grad_norm": 0.14453125, + "learning_rate": 0.00018035561736366346, + "loss": 0.5073, + "step": 155950 + }, + { + "epoch": 7.746101122479388, + "grad_norm": 0.134765625, + "learning_rate": 0.00018031588358001393, + "loss": 0.4952, + "step": 155960 + }, + { + "epoch": 7.746597794775007, + "grad_norm": 0.146484375, + "learning_rate": 0.00018027614979636435, + "loss": 0.485, + "step": 155970 + }, + { + "epoch": 7.747094467070626, + "grad_norm": 0.1474609375, + "learning_rate": 0.00018023641601271482, + "loss": 0.4798, + "step": 155980 + }, + { + "epoch": 7.747591139366246, + "grad_norm": 0.138671875, + "learning_rate": 0.00018019668222906527, + "loss": 0.4683, + "step": 155990 + }, + { + "epoch": 7.748087811661866, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018015694844541574, + "loss": 0.4895, + "step": 156000 + }, + { + "epoch": 7.748584483957485, + "grad_norm": 0.16015625, + "learning_rate": 0.00018011721466176616, + "loss": 0.4628, + "step": 156010 + }, + { + "epoch": 7.749081156253104, + "grad_norm": 0.1826171875, + "learning_rate": 0.00018007748087811663, + "loss": 0.4879, + "step": 156020 + }, + { + "epoch": 7.749577828548723, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018003774709446707, + "loss": 0.4913, + "step": 156030 + }, + { + "epoch": 7.750074500844343, + "grad_norm": 0.150390625, + "learning_rate": 0.00017999801331081754, + "loss": 0.4876, + "step": 156040 + }, + { + "epoch": 7.750571173139962, + "grad_norm": 0.1552734375, + "learning_rate": 0.00017995827952716796, + "loss": 0.5229, + "step": 156050 + }, + { + "epoch": 7.751067845435582, + "grad_norm": 0.1728515625, + "learning_rate": 0.00017991854574351843, + "loss": 0.4982, + "step": 156060 + }, + { + "epoch": 7.751564517731201, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017987881195986888, + "loss": 0.4882, + "step": 156070 + }, + { + "epoch": 7.75206119002682, + "grad_norm": 0.1640625, + "learning_rate": 0.00017983907817621935, + "loss": 0.5045, + "step": 156080 + }, + { + "epoch": 7.75255786232244, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001797993443925698, + "loss": 0.456, + "step": 156090 + }, + { + "epoch": 7.753054534618059, + "grad_norm": 0.162109375, + "learning_rate": 0.00017975961060892024, + "loss": 0.5028, + "step": 156100 + }, + { + "epoch": 7.753551206913678, + "grad_norm": 0.15625, + "learning_rate": 0.0001797198768252707, + "loss": 0.5156, + "step": 156110 + }, + { + "epoch": 7.754047879209297, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017968014304162115, + "loss": 0.4691, + "step": 156120 + }, + { + "epoch": 7.754544551504917, + "grad_norm": 0.142578125, + "learning_rate": 0.00017964040925797162, + "loss": 0.4989, + "step": 156130 + }, + { + "epoch": 7.755041223800537, + "grad_norm": 0.15234375, + "learning_rate": 0.00017960067547432204, + "loss": 0.4972, + "step": 156140 + }, + { + "epoch": 7.755537896096156, + "grad_norm": 0.169921875, + "learning_rate": 0.0001795609416906725, + "loss": 0.4985, + "step": 156150 + }, + { + "epoch": 7.756034568391775, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017952120790702296, + "loss": 0.5022, + "step": 156160 + }, + { + "epoch": 7.756531240687394, + "grad_norm": 0.1455078125, + "learning_rate": 0.00017948147412337343, + "loss": 0.5003, + "step": 156170 + }, + { + "epoch": 7.7570279129830135, + "grad_norm": 0.1533203125, + "learning_rate": 0.00017944174033972384, + "loss": 0.5014, + "step": 156180 + }, + { + "epoch": 7.757524585278633, + "grad_norm": 0.1474609375, + "learning_rate": 0.00017940200655607432, + "loss": 0.4945, + "step": 156190 + }, + { + "epoch": 7.758021257574253, + "grad_norm": 0.1640625, + "learning_rate": 0.00017936227277242476, + "loss": 0.5087, + "step": 156200 + }, + { + "epoch": 7.758517929869872, + "grad_norm": 0.1513671875, + "learning_rate": 0.00017932253898877523, + "loss": 0.4933, + "step": 156210 + }, + { + "epoch": 7.759014602165491, + "grad_norm": 0.1474609375, + "learning_rate": 0.00017928280520512565, + "loss": 0.5283, + "step": 156220 + }, + { + "epoch": 7.7595112744611106, + "grad_norm": 0.1572265625, + "learning_rate": 0.00017924307142147612, + "loss": 0.4793, + "step": 156230 + }, + { + "epoch": 7.76000794675673, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017920333763782656, + "loss": 0.4575, + "step": 156240 + }, + { + "epoch": 7.760504619052349, + "grad_norm": 0.1640625, + "learning_rate": 0.00017916360385417703, + "loss": 0.5082, + "step": 156250 + }, + { + "epoch": 7.761001291347968, + "grad_norm": 0.1806640625, + "learning_rate": 0.00017912387007052748, + "loss": 0.4818, + "step": 156260 + }, + { + "epoch": 7.761497963643588, + "grad_norm": 0.14453125, + "learning_rate": 0.00017908413628687792, + "loss": 0.4817, + "step": 156270 + }, + { + "epoch": 7.761994635939208, + "grad_norm": 0.1640625, + "learning_rate": 0.00017904440250322837, + "loss": 0.4939, + "step": 156280 + }, + { + "epoch": 7.762491308234827, + "grad_norm": 0.1572265625, + "learning_rate": 0.00017900466871957884, + "loss": 0.4987, + "step": 156290 + }, + { + "epoch": 7.762987980530446, + "grad_norm": 0.1376953125, + "learning_rate": 0.00017896493493592928, + "loss": 0.4516, + "step": 156300 + }, + { + "epoch": 7.763484652826065, + "grad_norm": 0.16015625, + "learning_rate": 0.00017892520115227973, + "loss": 0.4825, + "step": 156310 + }, + { + "epoch": 7.7639813251216845, + "grad_norm": 0.173828125, + "learning_rate": 0.0001788854673686302, + "loss": 0.5048, + "step": 156320 + }, + { + "epoch": 7.764477997417304, + "grad_norm": 0.1767578125, + "learning_rate": 0.00017884573358498064, + "loss": 0.5065, + "step": 156330 + }, + { + "epoch": 7.764974669712924, + "grad_norm": 0.150390625, + "learning_rate": 0.0001788059998013311, + "loss": 0.4905, + "step": 156340 + }, + { + "epoch": 7.765471342008543, + "grad_norm": 0.177734375, + "learning_rate": 0.00017876626601768153, + "loss": 0.4914, + "step": 156350 + }, + { + "epoch": 7.765968014304162, + "grad_norm": 0.1611328125, + "learning_rate": 0.000178726532234032, + "loss": 0.5184, + "step": 156360 + }, + { + "epoch": 7.7664646865997815, + "grad_norm": 0.16015625, + "learning_rate": 0.00017868679845038245, + "loss": 0.4882, + "step": 156370 + }, + { + "epoch": 7.766961358895401, + "grad_norm": 0.154296875, + "learning_rate": 0.0001786470646667329, + "loss": 0.501, + "step": 156380 + }, + { + "epoch": 7.76745803119102, + "grad_norm": 0.1513671875, + "learning_rate": 0.00017860733088308334, + "loss": 0.5198, + "step": 156390 + }, + { + "epoch": 7.767954703486639, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001785675970994338, + "loss": 0.4973, + "step": 156400 + }, + { + "epoch": 7.768451375782259, + "grad_norm": 0.1552734375, + "learning_rate": 0.00017852786331578425, + "loss": 0.5214, + "step": 156410 + }, + { + "epoch": 7.7689480480778785, + "grad_norm": 0.146484375, + "learning_rate": 0.0001784881295321347, + "loss": 0.4826, + "step": 156420 + }, + { + "epoch": 7.769444720373498, + "grad_norm": 0.1484375, + "learning_rate": 0.00017844839574848514, + "loss": 0.4648, + "step": 156430 + }, + { + "epoch": 7.769941392669117, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001784086619648356, + "loss": 0.5546, + "step": 156440 + }, + { + "epoch": 7.770438064964736, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017836892818118606, + "loss": 0.502, + "step": 156450 + }, + { + "epoch": 7.770934737260355, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001783291943975365, + "loss": 0.4913, + "step": 156460 + }, + { + "epoch": 7.771431409555975, + "grad_norm": 0.158203125, + "learning_rate": 0.00017828946061388697, + "loss": 0.4959, + "step": 156470 + }, + { + "epoch": 7.771928081851595, + "grad_norm": 0.15234375, + "learning_rate": 0.00017824972683023742, + "loss": 0.4986, + "step": 156480 + }, + { + "epoch": 7.772424754147214, + "grad_norm": 0.154296875, + "learning_rate": 0.0001782099930465879, + "loss": 0.4853, + "step": 156490 + }, + { + "epoch": 7.772921426442833, + "grad_norm": 0.16015625, + "learning_rate": 0.00017817025926293833, + "loss": 0.4826, + "step": 156500 + }, + { + "epoch": 7.7734180987384525, + "grad_norm": 0.15234375, + "learning_rate": 0.00017813052547928878, + "loss": 0.5112, + "step": 156510 + }, + { + "epoch": 7.773914771034072, + "grad_norm": 0.17578125, + "learning_rate": 0.00017809079169563922, + "loss": 0.4555, + "step": 156520 + }, + { + "epoch": 7.774411443329691, + "grad_norm": 0.154296875, + "learning_rate": 0.0001780510579119897, + "loss": 0.5022, + "step": 156530 + }, + { + "epoch": 7.77490811562531, + "grad_norm": 0.1572265625, + "learning_rate": 0.00017801132412834014, + "loss": 0.4864, + "step": 156540 + }, + { + "epoch": 7.77540478792093, + "grad_norm": 0.1767578125, + "learning_rate": 0.00017797159034469058, + "loss": 0.5203, + "step": 156550 + }, + { + "epoch": 7.7759014602165495, + "grad_norm": 0.150390625, + "learning_rate": 0.00017793185656104102, + "loss": 0.4664, + "step": 156560 + }, + { + "epoch": 7.776398132512169, + "grad_norm": 0.134765625, + "learning_rate": 0.0001778921227773915, + "loss": 0.4759, + "step": 156570 + }, + { + "epoch": 7.776894804807788, + "grad_norm": 0.1435546875, + "learning_rate": 0.00017785238899374194, + "loss": 0.5151, + "step": 156580 + }, + { + "epoch": 7.777391477103407, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017781265521009238, + "loss": 0.5092, + "step": 156590 + }, + { + "epoch": 7.777888149399026, + "grad_norm": 0.1728515625, + "learning_rate": 0.00017777292142644283, + "loss": 0.4774, + "step": 156600 + }, + { + "epoch": 7.778384821694646, + "grad_norm": 0.1455078125, + "learning_rate": 0.0001777331876427933, + "loss": 0.481, + "step": 156610 + }, + { + "epoch": 7.778881493990265, + "grad_norm": 0.169921875, + "learning_rate": 0.00017769345385914374, + "loss": 0.4945, + "step": 156620 + }, + { + "epoch": 7.779378166285884, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001776537200754942, + "loss": 0.4764, + "step": 156630 + }, + { + "epoch": 7.779874838581504, + "grad_norm": 0.1884765625, + "learning_rate": 0.00017761398629184466, + "loss": 0.4602, + "step": 156640 + }, + { + "epoch": 7.780371510877123, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001775742525081951, + "loss": 0.5032, + "step": 156650 + }, + { + "epoch": 7.780868183172743, + "grad_norm": 0.162109375, + "learning_rate": 0.00017753451872454557, + "loss": 0.4916, + "step": 156660 + }, + { + "epoch": 7.781364855468362, + "grad_norm": 0.1650390625, + "learning_rate": 0.000177494784940896, + "loss": 0.4989, + "step": 156670 + }, + { + "epoch": 7.781861527763981, + "grad_norm": 0.17578125, + "learning_rate": 0.00017745505115724646, + "loss": 0.5072, + "step": 156680 + }, + { + "epoch": 7.7823582000596, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001774153173735969, + "loss": 0.4669, + "step": 156690 + }, + { + "epoch": 7.7828548723552196, + "grad_norm": 0.16015625, + "learning_rate": 0.00017737558358994738, + "loss": 0.4916, + "step": 156700 + }, + { + "epoch": 7.78335154465084, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001773358498062978, + "loss": 0.5019, + "step": 156710 + }, + { + "epoch": 7.783848216946459, + "grad_norm": 0.18359375, + "learning_rate": 0.00017729611602264827, + "loss": 0.5173, + "step": 156720 + }, + { + "epoch": 7.784344889242078, + "grad_norm": 0.1416015625, + "learning_rate": 0.0001772563822389987, + "loss": 0.4668, + "step": 156730 + }, + { + "epoch": 7.784841561537697, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017721664845534918, + "loss": 0.4851, + "step": 156740 + }, + { + "epoch": 7.785338233833317, + "grad_norm": 0.162109375, + "learning_rate": 0.0001771769146716996, + "loss": 0.4818, + "step": 156750 + }, + { + "epoch": 7.785834906128936, + "grad_norm": 0.1396484375, + "learning_rate": 0.00017713718088805007, + "loss": 0.5166, + "step": 156760 + }, + { + "epoch": 7.786331578424555, + "grad_norm": 0.1640625, + "learning_rate": 0.00017709744710440052, + "loss": 0.4916, + "step": 156770 + }, + { + "epoch": 7.786828250720175, + "grad_norm": 0.1494140625, + "learning_rate": 0.000177057713320751, + "loss": 0.483, + "step": 156780 + }, + { + "epoch": 7.787324923015794, + "grad_norm": 0.171875, + "learning_rate": 0.00017701797953710143, + "loss": 0.4881, + "step": 156790 + }, + { + "epoch": 7.787821595311414, + "grad_norm": 0.162109375, + "learning_rate": 0.00017697824575345188, + "loss": 0.493, + "step": 156800 + }, + { + "epoch": 7.788318267607033, + "grad_norm": 0.15234375, + "learning_rate": 0.00017693851196980235, + "loss": 0.4836, + "step": 156810 + }, + { + "epoch": 7.788814939902652, + "grad_norm": 0.1884765625, + "learning_rate": 0.0001768987781861528, + "loss": 0.4688, + "step": 156820 + }, + { + "epoch": 7.789311612198271, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017685904440250324, + "loss": 0.5103, + "step": 156830 + }, + { + "epoch": 7.7898082844938905, + "grad_norm": 0.150390625, + "learning_rate": 0.00017681931061885368, + "loss": 0.5353, + "step": 156840 + }, + { + "epoch": 7.790304956789511, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017677957683520415, + "loss": 0.493, + "step": 156850 + }, + { + "epoch": 7.79080162908513, + "grad_norm": 0.14453125, + "learning_rate": 0.0001767398430515546, + "loss": 0.488, + "step": 156860 + }, + { + "epoch": 7.791298301380749, + "grad_norm": 0.138671875, + "learning_rate": 0.00017670010926790504, + "loss": 0.4948, + "step": 156870 + }, + { + "epoch": 7.791794973676368, + "grad_norm": 0.1435546875, + "learning_rate": 0.00017666037548425548, + "loss": 0.5081, + "step": 156880 + }, + { + "epoch": 7.7922916459719875, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017662064170060595, + "loss": 0.483, + "step": 156890 + }, + { + "epoch": 7.792788318267607, + "grad_norm": 0.142578125, + "learning_rate": 0.0001765809079169564, + "loss": 0.5121, + "step": 156900 + }, + { + "epoch": 7.793284990563226, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017654117413330687, + "loss": 0.4898, + "step": 156910 + }, + { + "epoch": 7.793781662858846, + "grad_norm": 0.16796875, + "learning_rate": 0.0001765014403496573, + "loss": 0.4897, + "step": 156920 + }, + { + "epoch": 7.794278335154465, + "grad_norm": 0.1552734375, + "learning_rate": 0.00017646170656600776, + "loss": 0.4827, + "step": 156930 + }, + { + "epoch": 7.794775007450085, + "grad_norm": 0.150390625, + "learning_rate": 0.0001764219727823582, + "loss": 0.4754, + "step": 156940 + }, + { + "epoch": 7.795271679745704, + "grad_norm": 0.17578125, + "learning_rate": 0.00017638223899870867, + "loss": 0.5168, + "step": 156950 + }, + { + "epoch": 7.795768352041323, + "grad_norm": 0.142578125, + "learning_rate": 0.00017634250521505912, + "loss": 0.4789, + "step": 156960 + }, + { + "epoch": 7.796265024336942, + "grad_norm": 0.162109375, + "learning_rate": 0.00017630277143140956, + "loss": 0.4859, + "step": 156970 + }, + { + "epoch": 7.7967616966325615, + "grad_norm": 0.1396484375, + "learning_rate": 0.00017626303764776003, + "loss": 0.5107, + "step": 156980 + }, + { + "epoch": 7.797258368928182, + "grad_norm": 0.16015625, + "learning_rate": 0.00017622330386411048, + "loss": 0.4651, + "step": 156990 + }, + { + "epoch": 7.797755041223801, + "grad_norm": 0.1845703125, + "learning_rate": 0.00017618357008046092, + "loss": 0.468, + "step": 157000 + }, + { + "epoch": 7.79825171351942, + "grad_norm": 0.162109375, + "learning_rate": 0.00017614383629681137, + "loss": 0.4886, + "step": 157010 + }, + { + "epoch": 7.798748385815039, + "grad_norm": 0.1416015625, + "learning_rate": 0.00017610410251316184, + "loss": 0.4947, + "step": 157020 + }, + { + "epoch": 7.7992450581106585, + "grad_norm": 0.158203125, + "learning_rate": 0.00017606436872951228, + "loss": 0.5017, + "step": 157030 + }, + { + "epoch": 7.799741730406278, + "grad_norm": 0.177734375, + "learning_rate": 0.00017602463494586273, + "loss": 0.4941, + "step": 157040 + }, + { + "epoch": 7.800238402701897, + "grad_norm": 0.15234375, + "learning_rate": 0.00017598490116221317, + "loss": 0.4735, + "step": 157050 + }, + { + "epoch": 7.800735074997517, + "grad_norm": 0.1708984375, + "learning_rate": 0.00017594516737856364, + "loss": 0.4988, + "step": 157060 + }, + { + "epoch": 7.801231747293136, + "grad_norm": 0.158203125, + "learning_rate": 0.0001759054335949141, + "loss": 0.4839, + "step": 157070 + }, + { + "epoch": 7.8017284195887555, + "grad_norm": 0.201171875, + "learning_rate": 0.00017586569981126453, + "loss": 0.4763, + "step": 157080 + }, + { + "epoch": 7.802225091884375, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017582596602761498, + "loss": 0.5064, + "step": 157090 + }, + { + "epoch": 7.802721764179994, + "grad_norm": 0.16796875, + "learning_rate": 0.00017578623224396545, + "loss": 0.4735, + "step": 157100 + }, + { + "epoch": 7.803218436475613, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001757464984603159, + "loss": 0.483, + "step": 157110 + }, + { + "epoch": 7.803715108771232, + "grad_norm": 0.154296875, + "learning_rate": 0.00017570676467666634, + "loss": 0.4756, + "step": 157120 + }, + { + "epoch": 7.8042117810668525, + "grad_norm": 0.171875, + "learning_rate": 0.00017566703089301678, + "loss": 0.4639, + "step": 157130 + }, + { + "epoch": 7.804708453362472, + "grad_norm": 0.1767578125, + "learning_rate": 0.00017562729710936725, + "loss": 0.4743, + "step": 157140 + }, + { + "epoch": 7.805205125658091, + "grad_norm": 0.173828125, + "learning_rate": 0.0001755875633257177, + "loss": 0.5498, + "step": 157150 + }, + { + "epoch": 7.80570179795371, + "grad_norm": 0.1455078125, + "learning_rate": 0.00017554782954206814, + "loss": 0.4827, + "step": 157160 + }, + { + "epoch": 7.8061984702493294, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001755080957584186, + "loss": 0.4982, + "step": 157170 + }, + { + "epoch": 7.806695142544949, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017546836197476905, + "loss": 0.5006, + "step": 157180 + }, + { + "epoch": 7.807191814840568, + "grad_norm": 0.1552734375, + "learning_rate": 0.00017542862819111953, + "loss": 0.4829, + "step": 157190 + }, + { + "epoch": 7.807688487136188, + "grad_norm": 0.14453125, + "learning_rate": 0.00017538889440746994, + "loss": 0.4748, + "step": 157200 + }, + { + "epoch": 7.808185159431807, + "grad_norm": 0.2060546875, + "learning_rate": 0.00017534916062382041, + "loss": 0.4986, + "step": 157210 + }, + { + "epoch": 7.8086818317274265, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017530942684017086, + "loss": 0.4668, + "step": 157220 + }, + { + "epoch": 7.809178504023046, + "grad_norm": 0.173828125, + "learning_rate": 0.00017526969305652133, + "loss": 0.484, + "step": 157230 + }, + { + "epoch": 7.809675176318665, + "grad_norm": 0.1513671875, + "learning_rate": 0.00017522995927287175, + "loss": 0.4552, + "step": 157240 + }, + { + "epoch": 7.810171848614284, + "grad_norm": 0.1796875, + "learning_rate": 0.00017519022548922222, + "loss": 0.4988, + "step": 157250 + }, + { + "epoch": 7.810668520909903, + "grad_norm": 0.16015625, + "learning_rate": 0.00017515049170557266, + "loss": 0.4594, + "step": 157260 + }, + { + "epoch": 7.8111651932055235, + "grad_norm": 0.1396484375, + "learning_rate": 0.00017511075792192313, + "loss": 0.4824, + "step": 157270 + }, + { + "epoch": 7.811661865501143, + "grad_norm": 0.16015625, + "learning_rate": 0.00017507102413827355, + "loss": 0.5047, + "step": 157280 + }, + { + "epoch": 7.812158537796762, + "grad_norm": 0.1474609375, + "learning_rate": 0.00017503129035462402, + "loss": 0.5078, + "step": 157290 + }, + { + "epoch": 7.812655210092381, + "grad_norm": 0.169921875, + "learning_rate": 0.00017499155657097447, + "loss": 0.4975, + "step": 157300 + }, + { + "epoch": 7.813151882388, + "grad_norm": 0.138671875, + "learning_rate": 0.00017495182278732494, + "loss": 0.4623, + "step": 157310 + }, + { + "epoch": 7.81364855468362, + "grad_norm": 0.15234375, + "learning_rate": 0.00017491208900367538, + "loss": 0.4954, + "step": 157320 + }, + { + "epoch": 7.814145226979239, + "grad_norm": 0.17578125, + "learning_rate": 0.00017487235522002583, + "loss": 0.4881, + "step": 157330 + }, + { + "epoch": 7.814641899274858, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001748326214363763, + "loss": 0.4988, + "step": 157340 + }, + { + "epoch": 7.815138571570477, + "grad_norm": 0.1435546875, + "learning_rate": 0.00017479288765272674, + "loss": 0.4848, + "step": 157350 + }, + { + "epoch": 7.815635243866097, + "grad_norm": 0.18359375, + "learning_rate": 0.00017475315386907721, + "loss": 0.4845, + "step": 157360 + }, + { + "epoch": 7.816131916161717, + "grad_norm": 0.162109375, + "learning_rate": 0.00017471342008542763, + "loss": 0.503, + "step": 157370 + }, + { + "epoch": 7.816628588457336, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001746736863017781, + "loss": 0.478, + "step": 157380 + }, + { + "epoch": 7.817125260752955, + "grad_norm": 0.1513671875, + "learning_rate": 0.00017463395251812855, + "loss": 0.4855, + "step": 157390 + }, + { + "epoch": 7.817621933048574, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017459421873447902, + "loss": 0.4919, + "step": 157400 + }, + { + "epoch": 7.818118605344194, + "grad_norm": 0.171875, + "learning_rate": 0.00017455448495082944, + "loss": 0.493, + "step": 157410 + }, + { + "epoch": 7.818615277639813, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001745147511671799, + "loss": 0.4962, + "step": 157420 + }, + { + "epoch": 7.819111949935433, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017447501738353035, + "loss": 0.4884, + "step": 157430 + }, + { + "epoch": 7.819608622231052, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017443528359988082, + "loss": 0.5151, + "step": 157440 + }, + { + "epoch": 7.820105294526671, + "grad_norm": 0.1572265625, + "learning_rate": 0.00017439554981623124, + "loss": 0.4866, + "step": 157450 + }, + { + "epoch": 7.820601966822291, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001743558160325817, + "loss": 0.5049, + "step": 157460 + }, + { + "epoch": 7.82109863911791, + "grad_norm": 0.2138671875, + "learning_rate": 0.00017431608224893216, + "loss": 0.4865, + "step": 157470 + }, + { + "epoch": 7.821595311413529, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017427634846528263, + "loss": 0.5304, + "step": 157480 + }, + { + "epoch": 7.822091983709148, + "grad_norm": 0.1396484375, + "learning_rate": 0.00017423661468163307, + "loss": 0.4809, + "step": 157490 + }, + { + "epoch": 7.822588656004768, + "grad_norm": 0.150390625, + "learning_rate": 0.00017419688089798351, + "loss": 0.5039, + "step": 157500 + }, + { + "epoch": 7.823085328300388, + "grad_norm": 0.146484375, + "learning_rate": 0.00017415714711433399, + "loss": 0.4733, + "step": 157510 + }, + { + "epoch": 7.823582000596007, + "grad_norm": 0.173828125, + "learning_rate": 0.00017411741333068443, + "loss": 0.4945, + "step": 157520 + }, + { + "epoch": 7.824078672891626, + "grad_norm": 0.1474609375, + "learning_rate": 0.00017407767954703487, + "loss": 0.4967, + "step": 157530 + }, + { + "epoch": 7.824575345187245, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017403794576338532, + "loss": 0.4992, + "step": 157540 + }, + { + "epoch": 7.8250720174828645, + "grad_norm": 0.15625, + "learning_rate": 0.0001739982119797358, + "loss": 0.4455, + "step": 157550 + }, + { + "epoch": 7.825568689778484, + "grad_norm": 0.16796875, + "learning_rate": 0.00017395847819608623, + "loss": 0.4939, + "step": 157560 + }, + { + "epoch": 7.826065362074104, + "grad_norm": 0.1728515625, + "learning_rate": 0.00017391874441243668, + "loss": 0.4764, + "step": 157570 + }, + { + "epoch": 7.826562034369723, + "grad_norm": 0.1474609375, + "learning_rate": 0.00017387901062878712, + "loss": 0.4906, + "step": 157580 + }, + { + "epoch": 7.827058706665342, + "grad_norm": 0.166015625, + "learning_rate": 0.0001738392768451376, + "loss": 0.5121, + "step": 157590 + }, + { + "epoch": 7.8275553789609615, + "grad_norm": 0.16015625, + "learning_rate": 0.00017379954306148804, + "loss": 0.4899, + "step": 157600 + }, + { + "epoch": 7.828052051256581, + "grad_norm": 0.16015625, + "learning_rate": 0.00017375980927783848, + "loss": 0.5032, + "step": 157610 + }, + { + "epoch": 7.8285487235522, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017372007549418893, + "loss": 0.5003, + "step": 157620 + }, + { + "epoch": 7.829045395847819, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001736803417105394, + "loss": 0.5219, + "step": 157630 + }, + { + "epoch": 7.829542068143439, + "grad_norm": 0.1962890625, + "learning_rate": 0.00017364060792688984, + "loss": 0.4936, + "step": 157640 + }, + { + "epoch": 7.830038740439059, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001736008741432403, + "loss": 0.4803, + "step": 157650 + }, + { + "epoch": 7.830535412734678, + "grad_norm": 0.16015625, + "learning_rate": 0.00017356114035959076, + "loss": 0.5206, + "step": 157660 + }, + { + "epoch": 7.831032085030297, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001735214065759412, + "loss": 0.5155, + "step": 157670 + }, + { + "epoch": 7.831528757325916, + "grad_norm": 0.1796875, + "learning_rate": 0.00017348167279229167, + "loss": 0.4948, + "step": 157680 + }, + { + "epoch": 7.8320254296215355, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001734419390086421, + "loss": 0.4792, + "step": 157690 + }, + { + "epoch": 7.832522101917155, + "grad_norm": 0.16796875, + "learning_rate": 0.00017340220522499256, + "loss": 0.4773, + "step": 157700 + }, + { + "epoch": 7.833018774212775, + "grad_norm": 0.1494140625, + "learning_rate": 0.000173362471441343, + "loss": 0.4869, + "step": 157710 + }, + { + "epoch": 7.833515446508394, + "grad_norm": 0.17578125, + "learning_rate": 0.00017332273765769348, + "loss": 0.4808, + "step": 157720 + }, + { + "epoch": 7.834012118804013, + "grad_norm": 0.2109375, + "learning_rate": 0.0001732830038740439, + "loss": 0.4866, + "step": 157730 + }, + { + "epoch": 7.8345087910996325, + "grad_norm": 0.1787109375, + "learning_rate": 0.00017324327009039437, + "loss": 0.4477, + "step": 157740 + }, + { + "epoch": 7.835005463395252, + "grad_norm": 0.162109375, + "learning_rate": 0.0001732035363067448, + "loss": 0.483, + "step": 157750 + }, + { + "epoch": 7.835502135690871, + "grad_norm": 0.169921875, + "learning_rate": 0.00017316380252309528, + "loss": 0.5277, + "step": 157760 + }, + { + "epoch": 7.83599880798649, + "grad_norm": 0.1494140625, + "learning_rate": 0.00017312406873944573, + "loss": 0.4935, + "step": 157770 + }, + { + "epoch": 7.83649548028211, + "grad_norm": 0.1708984375, + "learning_rate": 0.00017308433495579617, + "loss": 0.4675, + "step": 157780 + }, + { + "epoch": 7.8369921525777295, + "grad_norm": 0.1435546875, + "learning_rate": 0.00017304460117214662, + "loss": 0.4856, + "step": 157790 + }, + { + "epoch": 7.837488824873349, + "grad_norm": 0.177734375, + "learning_rate": 0.00017300486738849709, + "loss": 0.4605, + "step": 157800 + }, + { + "epoch": 7.837985497168968, + "grad_norm": 0.1748046875, + "learning_rate": 0.00017296513360484753, + "loss": 0.4991, + "step": 157810 + }, + { + "epoch": 7.838482169464587, + "grad_norm": 0.154296875, + "learning_rate": 0.00017292539982119797, + "loss": 0.5, + "step": 157820 + }, + { + "epoch": 7.838978841760206, + "grad_norm": 0.1435546875, + "learning_rate": 0.00017288566603754845, + "loss": 0.4949, + "step": 157830 + }, + { + "epoch": 7.839475514055826, + "grad_norm": 0.17578125, + "learning_rate": 0.0001728459322538989, + "loss": 0.4982, + "step": 157840 + }, + { + "epoch": 7.839972186351446, + "grad_norm": 0.1513671875, + "learning_rate": 0.00017280619847024933, + "loss": 0.4761, + "step": 157850 + }, + { + "epoch": 7.840468858647065, + "grad_norm": 0.1796875, + "learning_rate": 0.00017276646468659978, + "loss": 0.4898, + "step": 157860 + }, + { + "epoch": 7.840965530942684, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017272673090295025, + "loss": 0.5283, + "step": 157870 + }, + { + "epoch": 7.8414622032383035, + "grad_norm": 0.146484375, + "learning_rate": 0.0001726869971193007, + "loss": 0.4675, + "step": 157880 + }, + { + "epoch": 7.841958875533923, + "grad_norm": 0.140625, + "learning_rate": 0.00017264726333565117, + "loss": 0.4695, + "step": 157890 + }, + { + "epoch": 7.842455547829542, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017260752955200158, + "loss": 0.4885, + "step": 157900 + }, + { + "epoch": 7.842952220125161, + "grad_norm": 0.1484375, + "learning_rate": 0.00017256779576835205, + "loss": 0.4774, + "step": 157910 + }, + { + "epoch": 7.843448892420781, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001725280619847025, + "loss": 0.489, + "step": 157920 + }, + { + "epoch": 7.8439455647164005, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017248832820105297, + "loss": 0.4955, + "step": 157930 + }, + { + "epoch": 7.84444223701202, + "grad_norm": 0.16015625, + "learning_rate": 0.0001724485944174034, + "loss": 0.4899, + "step": 157940 + }, + { + "epoch": 7.844938909307639, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017240886063375386, + "loss": 0.5077, + "step": 157950 + }, + { + "epoch": 7.845435581603258, + "grad_norm": 0.146484375, + "learning_rate": 0.0001723691268501043, + "loss": 0.5141, + "step": 157960 + }, + { + "epoch": 7.845932253898877, + "grad_norm": 0.1826171875, + "learning_rate": 0.00017232939306645477, + "loss": 0.486, + "step": 157970 + }, + { + "epoch": 7.846428926194497, + "grad_norm": 0.169921875, + "learning_rate": 0.0001722896592828052, + "loss": 0.4688, + "step": 157980 + }, + { + "epoch": 7.846925598490116, + "grad_norm": 0.146484375, + "learning_rate": 0.00017224992549915566, + "loss": 0.4759, + "step": 157990 + }, + { + "epoch": 7.847422270785736, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001722101917155061, + "loss": 0.5069, + "step": 158000 + }, + { + "epoch": 7.847918943081355, + "grad_norm": 0.173828125, + "learning_rate": 0.00017217045793185658, + "loss": 0.4946, + "step": 158010 + }, + { + "epoch": 7.848415615376974, + "grad_norm": 0.189453125, + "learning_rate": 0.00017213072414820702, + "loss": 0.515, + "step": 158020 + }, + { + "epoch": 7.848912287672594, + "grad_norm": 0.1875, + "learning_rate": 0.00017209099036455747, + "loss": 0.4999, + "step": 158030 + }, + { + "epoch": 7.849408959968213, + "grad_norm": 0.15625, + "learning_rate": 0.00017205125658090794, + "loss": 0.4776, + "step": 158040 + }, + { + "epoch": 7.849905632263832, + "grad_norm": 0.1640625, + "learning_rate": 0.00017201152279725838, + "loss": 0.487, + "step": 158050 + }, + { + "epoch": 7.850402304559451, + "grad_norm": 0.203125, + "learning_rate": 0.00017197178901360883, + "loss": 0.5035, + "step": 158060 + }, + { + "epoch": 7.8508989768550705, + "grad_norm": 0.1376953125, + "learning_rate": 0.00017193205522995927, + "loss": 0.488, + "step": 158070 + }, + { + "epoch": 7.851395649150691, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017189232144630974, + "loss": 0.5068, + "step": 158080 + }, + { + "epoch": 7.85189232144631, + "grad_norm": 0.140625, + "learning_rate": 0.0001718525876626602, + "loss": 0.4852, + "step": 158090 + }, + { + "epoch": 7.852388993741929, + "grad_norm": 0.1513671875, + "learning_rate": 0.00017181285387901063, + "loss": 0.5238, + "step": 158100 + }, + { + "epoch": 7.852885666037548, + "grad_norm": 0.15625, + "learning_rate": 0.00017177312009536107, + "loss": 0.498, + "step": 158110 + }, + { + "epoch": 7.853382338333168, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017173338631171155, + "loss": 0.485, + "step": 158120 + }, + { + "epoch": 7.853879010628787, + "grad_norm": 0.173828125, + "learning_rate": 0.000171693652528062, + "loss": 0.4411, + "step": 158130 + }, + { + "epoch": 7.854375682924406, + "grad_norm": 0.1552734375, + "learning_rate": 0.00017165391874441243, + "loss": 0.5216, + "step": 158140 + }, + { + "epoch": 7.854872355220026, + "grad_norm": 0.1513671875, + "learning_rate": 0.00017161418496076288, + "loss": 0.494, + "step": 158150 + }, + { + "epoch": 7.855369027515645, + "grad_norm": 0.1396484375, + "learning_rate": 0.00017157445117711335, + "loss": 0.5004, + "step": 158160 + }, + { + "epoch": 7.855865699811265, + "grad_norm": 0.162109375, + "learning_rate": 0.0001715347173934638, + "loss": 0.4854, + "step": 158170 + }, + { + "epoch": 7.856362372106884, + "grad_norm": 0.158203125, + "learning_rate": 0.00017149498360981427, + "loss": 0.467, + "step": 158180 + }, + { + "epoch": 7.856859044402503, + "grad_norm": 0.162109375, + "learning_rate": 0.0001714552498261647, + "loss": 0.4838, + "step": 158190 + }, + { + "epoch": 7.857355716698122, + "grad_norm": 0.14453125, + "learning_rate": 0.00017141551604251515, + "loss": 0.5075, + "step": 158200 + }, + { + "epoch": 7.8578523889937415, + "grad_norm": 0.171875, + "learning_rate": 0.00017137578225886563, + "loss": 0.4666, + "step": 158210 + }, + { + "epoch": 7.858349061289362, + "grad_norm": 0.1865234375, + "learning_rate": 0.00017133604847521607, + "loss": 0.495, + "step": 158220 + }, + { + "epoch": 7.858845733584981, + "grad_norm": 0.1513671875, + "learning_rate": 0.00017129631469156651, + "loss": 0.4623, + "step": 158230 + }, + { + "epoch": 7.8593424058806, + "grad_norm": 0.1513671875, + "learning_rate": 0.00017125658090791696, + "loss": 0.4929, + "step": 158240 + }, + { + "epoch": 7.859839078176219, + "grad_norm": 0.1513671875, + "learning_rate": 0.00017121684712426743, + "loss": 0.4998, + "step": 158250 + }, + { + "epoch": 7.8603357504718385, + "grad_norm": 0.1494140625, + "learning_rate": 0.00017117711334061787, + "loss": 0.465, + "step": 158260 + }, + { + "epoch": 7.860832422767458, + "grad_norm": 0.162109375, + "learning_rate": 0.00017113737955696832, + "loss": 0.5087, + "step": 158270 + }, + { + "epoch": 7.861329095063077, + "grad_norm": 0.169921875, + "learning_rate": 0.00017109764577331876, + "loss": 0.4851, + "step": 158280 + }, + { + "epoch": 7.861825767358697, + "grad_norm": 0.171875, + "learning_rate": 0.00017105791198966923, + "loss": 0.4744, + "step": 158290 + }, + { + "epoch": 7.862322439654316, + "grad_norm": 0.1572265625, + "learning_rate": 0.00017101817820601968, + "loss": 0.505, + "step": 158300 + }, + { + "epoch": 7.8628191119499355, + "grad_norm": 0.1484375, + "learning_rate": 0.00017097844442237012, + "loss": 0.4845, + "step": 158310 + }, + { + "epoch": 7.863315784245555, + "grad_norm": 0.150390625, + "learning_rate": 0.00017093871063872057, + "loss": 0.5029, + "step": 158320 + }, + { + "epoch": 7.863812456541174, + "grad_norm": 0.18359375, + "learning_rate": 0.00017089897685507104, + "loss": 0.4545, + "step": 158330 + }, + { + "epoch": 7.864309128836793, + "grad_norm": 0.16796875, + "learning_rate": 0.00017085924307142148, + "loss": 0.4649, + "step": 158340 + }, + { + "epoch": 7.8648058011324125, + "grad_norm": 0.1533203125, + "learning_rate": 0.00017081950928777193, + "loss": 0.5021, + "step": 158350 + }, + { + "epoch": 7.865302473428033, + "grad_norm": 0.17578125, + "learning_rate": 0.0001707797755041224, + "loss": 0.4561, + "step": 158360 + }, + { + "epoch": 7.865799145723652, + "grad_norm": 0.181640625, + "learning_rate": 0.00017074004172047284, + "loss": 0.5059, + "step": 158370 + }, + { + "epoch": 7.866295818019271, + "grad_norm": 0.150390625, + "learning_rate": 0.00017070030793682331, + "loss": 0.5086, + "step": 158380 + }, + { + "epoch": 7.86679249031489, + "grad_norm": 0.1455078125, + "learning_rate": 0.00017066057415317373, + "loss": 0.4865, + "step": 158390 + }, + { + "epoch": 7.8672891626105095, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001706208403695242, + "loss": 0.4936, + "step": 158400 + }, + { + "epoch": 7.867785834906129, + "grad_norm": 0.14453125, + "learning_rate": 0.00017058110658587465, + "loss": 0.4558, + "step": 158410 + }, + { + "epoch": 7.868282507201748, + "grad_norm": 0.173828125, + "learning_rate": 0.00017054137280222512, + "loss": 0.5057, + "step": 158420 + }, + { + "epoch": 7.868779179497368, + "grad_norm": 0.169921875, + "learning_rate": 0.00017050163901857553, + "loss": 0.4906, + "step": 158430 + }, + { + "epoch": 7.869275851792987, + "grad_norm": 0.1474609375, + "learning_rate": 0.000170461905234926, + "loss": 0.5208, + "step": 158440 + }, + { + "epoch": 7.8697725240886065, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017042217145127645, + "loss": 0.4733, + "step": 158450 + }, + { + "epoch": 7.870269196384226, + "grad_norm": 0.1865234375, + "learning_rate": 0.00017038243766762692, + "loss": 0.4874, + "step": 158460 + }, + { + "epoch": 7.870765868679845, + "grad_norm": 0.158203125, + "learning_rate": 0.00017034270388397734, + "loss": 0.4597, + "step": 158470 + }, + { + "epoch": 7.871262540975464, + "grad_norm": 0.150390625, + "learning_rate": 0.0001703029701003278, + "loss": 0.5403, + "step": 158480 + }, + { + "epoch": 7.871759213271083, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017026323631667825, + "loss": 0.525, + "step": 158490 + }, + { + "epoch": 7.8722558855667035, + "grad_norm": 0.173828125, + "learning_rate": 0.00017022350253302873, + "loss": 0.4927, + "step": 158500 + }, + { + "epoch": 7.872752557862323, + "grad_norm": 0.15234375, + "learning_rate": 0.00017018376874937917, + "loss": 0.5222, + "step": 158510 + }, + { + "epoch": 7.873249230157942, + "grad_norm": 0.14453125, + "learning_rate": 0.00017014403496572961, + "loss": 0.4882, + "step": 158520 + }, + { + "epoch": 7.873745902453561, + "grad_norm": 0.1845703125, + "learning_rate": 0.00017010430118208009, + "loss": 0.4862, + "step": 158530 + }, + { + "epoch": 7.87424257474918, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017006456739843053, + "loss": 0.4564, + "step": 158540 + }, + { + "epoch": 7.8747392470448, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017002483361478097, + "loss": 0.5056, + "step": 158550 + }, + { + "epoch": 7.875235919340419, + "grad_norm": 0.140625, + "learning_rate": 0.00016998509983113142, + "loss": 0.49, + "step": 158560 + }, + { + "epoch": 7.875732591636039, + "grad_norm": 0.1640625, + "learning_rate": 0.0001699453660474819, + "loss": 0.4968, + "step": 158570 + }, + { + "epoch": 7.876229263931658, + "grad_norm": 0.15234375, + "learning_rate": 0.00016990563226383233, + "loss": 0.4868, + "step": 158580 + }, + { + "epoch": 7.8767259362272775, + "grad_norm": 0.177734375, + "learning_rate": 0.0001698658984801828, + "loss": 0.4979, + "step": 158590 + }, + { + "epoch": 7.877222608522897, + "grad_norm": 0.1708984375, + "learning_rate": 0.00016982616469653322, + "loss": 0.4879, + "step": 158600 + }, + { + "epoch": 7.877719280818516, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001697864309128837, + "loss": 0.5487, + "step": 158610 + }, + { + "epoch": 7.878215953114135, + "grad_norm": 0.150390625, + "learning_rate": 0.00016974669712923414, + "loss": 0.4763, + "step": 158620 + }, + { + "epoch": 7.878712625409754, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001697069633455846, + "loss": 0.4907, + "step": 158630 + }, + { + "epoch": 7.8792092977053745, + "grad_norm": 0.166015625, + "learning_rate": 0.00016966722956193503, + "loss": 0.4819, + "step": 158640 + }, + { + "epoch": 7.879705970000994, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001696274957782855, + "loss": 0.4651, + "step": 158650 + }, + { + "epoch": 7.880202642296613, + "grad_norm": 0.1708984375, + "learning_rate": 0.00016958776199463594, + "loss": 0.5158, + "step": 158660 + }, + { + "epoch": 7.880699314592232, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016954802821098641, + "loss": 0.4985, + "step": 158670 + }, + { + "epoch": 7.881195986887851, + "grad_norm": 0.154296875, + "learning_rate": 0.00016950829442733683, + "loss": 0.4626, + "step": 158680 + }, + { + "epoch": 7.881692659183471, + "grad_norm": 0.15625, + "learning_rate": 0.0001694685606436873, + "loss": 0.4984, + "step": 158690 + }, + { + "epoch": 7.88218933147909, + "grad_norm": 0.1572265625, + "learning_rate": 0.00016942882686003775, + "loss": 0.4647, + "step": 158700 + }, + { + "epoch": 7.882686003774709, + "grad_norm": 0.1513671875, + "learning_rate": 0.00016938909307638822, + "loss": 0.4911, + "step": 158710 + }, + { + "epoch": 7.883182676070328, + "grad_norm": 0.1787109375, + "learning_rate": 0.00016934935929273866, + "loss": 0.5046, + "step": 158720 + }, + { + "epoch": 7.883679348365948, + "grad_norm": 0.154296875, + "learning_rate": 0.0001693096255090891, + "loss": 0.4652, + "step": 158730 + }, + { + "epoch": 7.884176020661568, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016926989172543958, + "loss": 0.5216, + "step": 158740 + }, + { + "epoch": 7.884672692957187, + "grad_norm": 0.15625, + "learning_rate": 0.00016923015794179002, + "loss": 0.4934, + "step": 158750 + }, + { + "epoch": 7.885169365252806, + "grad_norm": 0.14453125, + "learning_rate": 0.00016919042415814047, + "loss": 0.5015, + "step": 158760 + }, + { + "epoch": 7.885666037548425, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001691506903744909, + "loss": 0.5578, + "step": 158770 + }, + { + "epoch": 7.8861627098440445, + "grad_norm": 0.1455078125, + "learning_rate": 0.00016911095659084138, + "loss": 0.4995, + "step": 158780 + }, + { + "epoch": 7.886659382139664, + "grad_norm": 0.1806640625, + "learning_rate": 0.00016907122280719183, + "loss": 0.5216, + "step": 158790 + }, + { + "epoch": 7.887156054435284, + "grad_norm": 0.1435546875, + "learning_rate": 0.00016903148902354227, + "loss": 0.4803, + "step": 158800 + }, + { + "epoch": 7.887652726730903, + "grad_norm": 0.1435546875, + "learning_rate": 0.00016899175523989271, + "loss": 0.4816, + "step": 158810 + }, + { + "epoch": 7.888149399026522, + "grad_norm": 0.1611328125, + "learning_rate": 0.00016895202145624319, + "loss": 0.4879, + "step": 158820 + }, + { + "epoch": 7.888646071322142, + "grad_norm": 0.1728515625, + "learning_rate": 0.00016891228767259363, + "loss": 0.4714, + "step": 158830 + }, + { + "epoch": 7.889142743617761, + "grad_norm": 0.158203125, + "learning_rate": 0.00016887255388894407, + "loss": 0.4949, + "step": 158840 + }, + { + "epoch": 7.88963941591338, + "grad_norm": 0.15625, + "learning_rate": 0.00016883282010529452, + "loss": 0.5038, + "step": 158850 + }, + { + "epoch": 7.890136088208999, + "grad_norm": 0.1513671875, + "learning_rate": 0.000168793086321645, + "loss": 0.4899, + "step": 158860 + }, + { + "epoch": 7.890632760504619, + "grad_norm": 0.1552734375, + "learning_rate": 0.00016875335253799543, + "loss": 0.5325, + "step": 158870 + }, + { + "epoch": 7.891129432800239, + "grad_norm": 0.1982421875, + "learning_rate": 0.00016871361875434588, + "loss": 0.4795, + "step": 158880 + }, + { + "epoch": 7.891626105095858, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016867388497069635, + "loss": 0.5102, + "step": 158890 + }, + { + "epoch": 7.892122777391477, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001686341511870468, + "loss": 0.4802, + "step": 158900 + }, + { + "epoch": 7.892619449687096, + "grad_norm": 0.1552734375, + "learning_rate": 0.00016859441740339727, + "loss": 0.4854, + "step": 158910 + }, + { + "epoch": 7.8931161219827155, + "grad_norm": 0.158203125, + "learning_rate": 0.00016855468361974768, + "loss": 0.4677, + "step": 158920 + }, + { + "epoch": 7.893612794278335, + "grad_norm": 0.1474609375, + "learning_rate": 0.00016851494983609815, + "loss": 0.4833, + "step": 158930 + }, + { + "epoch": 7.894109466573955, + "grad_norm": 0.169921875, + "learning_rate": 0.0001684752160524486, + "loss": 0.4998, + "step": 158940 + }, + { + "epoch": 7.894606138869574, + "grad_norm": 0.138671875, + "learning_rate": 0.00016843548226879907, + "loss": 0.4769, + "step": 158950 + }, + { + "epoch": 7.895102811165193, + "grad_norm": 0.146484375, + "learning_rate": 0.0001683957484851495, + "loss": 0.4937, + "step": 158960 + }, + { + "epoch": 7.8955994834608125, + "grad_norm": 0.1796875, + "learning_rate": 0.00016835601470149996, + "loss": 0.487, + "step": 158970 + }, + { + "epoch": 7.896096155756432, + "grad_norm": 0.1484375, + "learning_rate": 0.0001683162809178504, + "loss": 0.5119, + "step": 158980 + }, + { + "epoch": 7.896592828052051, + "grad_norm": 0.15625, + "learning_rate": 0.00016827654713420087, + "loss": 0.4928, + "step": 158990 + }, + { + "epoch": 7.89708950034767, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001682368133505513, + "loss": 0.4702, + "step": 159000 + }, + { + "epoch": 7.89758617264329, + "grad_norm": 0.1904296875, + "learning_rate": 0.00016819707956690176, + "loss": 0.481, + "step": 159010 + }, + { + "epoch": 7.8980828449389096, + "grad_norm": 0.1357421875, + "learning_rate": 0.0001681573457832522, + "loss": 0.5237, + "step": 159020 + }, + { + "epoch": 7.898579517234529, + "grad_norm": 0.1806640625, + "learning_rate": 0.00016811761199960268, + "loss": 0.4982, + "step": 159030 + }, + { + "epoch": 7.899076189530148, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016807787821595312, + "loss": 0.4826, + "step": 159040 + }, + { + "epoch": 7.899572861825767, + "grad_norm": 0.154296875, + "learning_rate": 0.00016803814443230357, + "loss": 0.4898, + "step": 159050 + }, + { + "epoch": 7.9000695341213865, + "grad_norm": 0.166015625, + "learning_rate": 0.00016799841064865404, + "loss": 0.479, + "step": 159060 + }, + { + "epoch": 7.900566206417006, + "grad_norm": 0.17578125, + "learning_rate": 0.00016795867686500448, + "loss": 0.4809, + "step": 159070 + }, + { + "epoch": 7.901062878712626, + "grad_norm": 0.1806640625, + "learning_rate": 0.00016791894308135495, + "loss": 0.4751, + "step": 159080 + }, + { + "epoch": 7.901559551008245, + "grad_norm": 0.171875, + "learning_rate": 0.00016787920929770537, + "loss": 0.5325, + "step": 159090 + }, + { + "epoch": 7.902056223303864, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016783947551405584, + "loss": 0.4959, + "step": 159100 + }, + { + "epoch": 7.9025528955994835, + "grad_norm": 0.154296875, + "learning_rate": 0.00016779974173040629, + "loss": 0.4888, + "step": 159110 + }, + { + "epoch": 7.903049567895103, + "grad_norm": 0.1796875, + "learning_rate": 0.00016776000794675676, + "loss": 0.5317, + "step": 159120 + }, + { + "epoch": 7.903546240190722, + "grad_norm": 0.1630859375, + "learning_rate": 0.00016772027416310717, + "loss": 0.4846, + "step": 159130 + }, + { + "epoch": 7.904042912486341, + "grad_norm": 0.1787109375, + "learning_rate": 0.00016768054037945765, + "loss": 0.5025, + "step": 159140 + }, + { + "epoch": 7.904539584781961, + "grad_norm": 0.171875, + "learning_rate": 0.0001676408065958081, + "loss": 0.5048, + "step": 159150 + }, + { + "epoch": 7.9050362570775805, + "grad_norm": 0.146484375, + "learning_rate": 0.00016760107281215856, + "loss": 0.5299, + "step": 159160 + }, + { + "epoch": 7.9055329293732, + "grad_norm": 0.173828125, + "learning_rate": 0.00016756133902850898, + "loss": 0.4928, + "step": 159170 + }, + { + "epoch": 7.906029601668819, + "grad_norm": 0.1513671875, + "learning_rate": 0.00016752160524485945, + "loss": 0.491, + "step": 159180 + }, + { + "epoch": 7.906526273964438, + "grad_norm": 0.1796875, + "learning_rate": 0.0001674818714612099, + "loss": 0.4857, + "step": 159190 + }, + { + "epoch": 7.907022946260057, + "grad_norm": 0.1474609375, + "learning_rate": 0.00016744213767756037, + "loss": 0.4391, + "step": 159200 + }, + { + "epoch": 7.907519618555677, + "grad_norm": 0.154296875, + "learning_rate": 0.0001674024038939108, + "loss": 0.5006, + "step": 159210 + }, + { + "epoch": 7.908016290851297, + "grad_norm": 0.1689453125, + "learning_rate": 0.00016736267011026125, + "loss": 0.5145, + "step": 159220 + }, + { + "epoch": 7.908512963146916, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016732293632661173, + "loss": 0.4822, + "step": 159230 + }, + { + "epoch": 7.909009635442535, + "grad_norm": 0.1552734375, + "learning_rate": 0.00016728320254296217, + "loss": 0.4696, + "step": 159240 + }, + { + "epoch": 7.909506307738154, + "grad_norm": 0.1533203125, + "learning_rate": 0.00016724346875931261, + "loss": 0.4848, + "step": 159250 + }, + { + "epoch": 7.910002980033774, + "grad_norm": 0.1416015625, + "learning_rate": 0.00016720373497566306, + "loss": 0.4845, + "step": 159260 + }, + { + "epoch": 7.910499652329393, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016716400119201353, + "loss": 0.5213, + "step": 159270 + }, + { + "epoch": 7.910996324625012, + "grad_norm": 0.15625, + "learning_rate": 0.00016712426740836397, + "loss": 0.4884, + "step": 159280 + }, + { + "epoch": 7.911492996920632, + "grad_norm": 0.1435546875, + "learning_rate": 0.00016708453362471442, + "loss": 0.4857, + "step": 159290 + }, + { + "epoch": 7.9119896692162515, + "grad_norm": 0.150390625, + "learning_rate": 0.00016704479984106486, + "loss": 0.4944, + "step": 159300 + }, + { + "epoch": 7.912486341511871, + "grad_norm": 0.1640625, + "learning_rate": 0.00016700506605741533, + "loss": 0.5093, + "step": 159310 + }, + { + "epoch": 7.91298301380749, + "grad_norm": 0.1533203125, + "learning_rate": 0.00016696533227376578, + "loss": 0.4598, + "step": 159320 + }, + { + "epoch": 7.913479686103109, + "grad_norm": 0.1689453125, + "learning_rate": 0.00016692559849011622, + "loss": 0.5011, + "step": 159330 + }, + { + "epoch": 7.913976358398728, + "grad_norm": 0.1494140625, + "learning_rate": 0.00016688586470646667, + "loss": 0.4991, + "step": 159340 + }, + { + "epoch": 7.914473030694348, + "grad_norm": 0.1689453125, + "learning_rate": 0.00016684613092281714, + "loss": 0.4935, + "step": 159350 + }, + { + "epoch": 7.914969702989968, + "grad_norm": 0.166015625, + "learning_rate": 0.00016680639713916758, + "loss": 0.4924, + "step": 159360 + }, + { + "epoch": 7.915466375285587, + "grad_norm": 0.166015625, + "learning_rate": 0.00016676666335551803, + "loss": 0.503, + "step": 159370 + }, + { + "epoch": 7.915963047581206, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001667269295718685, + "loss": 0.4942, + "step": 159380 + }, + { + "epoch": 7.916459719876825, + "grad_norm": 0.1611328125, + "learning_rate": 0.00016668719578821894, + "loss": 0.4976, + "step": 159390 + }, + { + "epoch": 7.916956392172445, + "grad_norm": 0.146484375, + "learning_rate": 0.00016664746200456939, + "loss": 0.4774, + "step": 159400 + }, + { + "epoch": 7.917453064468064, + "grad_norm": 0.1435546875, + "learning_rate": 0.00016660772822091983, + "loss": 0.4851, + "step": 159410 + }, + { + "epoch": 7.917949736763683, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001665679944372703, + "loss": 0.4923, + "step": 159420 + }, + { + "epoch": 7.918446409059302, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016652826065362075, + "loss": 0.4775, + "step": 159430 + }, + { + "epoch": 7.9189430813549215, + "grad_norm": 0.169921875, + "learning_rate": 0.00016648852686997122, + "loss": 0.5103, + "step": 159440 + }, + { + "epoch": 7.919439753650542, + "grad_norm": 0.1572265625, + "learning_rate": 0.00016644879308632166, + "loss": 0.5057, + "step": 159450 + }, + { + "epoch": 7.919936425946161, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001664090593026721, + "loss": 0.4869, + "step": 159460 + }, + { + "epoch": 7.92043309824178, + "grad_norm": 0.158203125, + "learning_rate": 0.00016636932551902255, + "loss": 0.4947, + "step": 159470 + }, + { + "epoch": 7.920929770537399, + "grad_norm": 0.1474609375, + "learning_rate": 0.00016632959173537302, + "loss": 0.4713, + "step": 159480 + }, + { + "epoch": 7.9214264428330186, + "grad_norm": 0.15625, + "learning_rate": 0.00016628985795172347, + "loss": 0.4921, + "step": 159490 + }, + { + "epoch": 7.921923115128638, + "grad_norm": 0.15234375, + "learning_rate": 0.0001662501241680739, + "loss": 0.4989, + "step": 159500 + }, + { + "epoch": 7.922419787424257, + "grad_norm": 0.1884765625, + "learning_rate": 0.00016621039038442435, + "loss": 0.5064, + "step": 159510 + }, + { + "epoch": 7.922916459719877, + "grad_norm": 0.1796875, + "learning_rate": 0.00016617065660077483, + "loss": 0.4692, + "step": 159520 + }, + { + "epoch": 7.923413132015496, + "grad_norm": 0.150390625, + "learning_rate": 0.00016613092281712527, + "loss": 0.5094, + "step": 159530 + }, + { + "epoch": 7.923909804311116, + "grad_norm": 0.1416015625, + "learning_rate": 0.00016609118903347571, + "loss": 0.5177, + "step": 159540 + }, + { + "epoch": 7.924406476606735, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016605145524982616, + "loss": 0.5124, + "step": 159550 + }, + { + "epoch": 7.924903148902354, + "grad_norm": 0.2041015625, + "learning_rate": 0.00016601172146617663, + "loss": 0.5021, + "step": 159560 + }, + { + "epoch": 7.925399821197973, + "grad_norm": 0.154296875, + "learning_rate": 0.00016597198768252707, + "loss": 0.5097, + "step": 159570 + }, + { + "epoch": 7.9258964934935925, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016593225389887752, + "loss": 0.4946, + "step": 159580 + }, + { + "epoch": 7.926393165789213, + "grad_norm": 0.1787109375, + "learning_rate": 0.000165892520115228, + "loss": 0.5066, + "step": 159590 + }, + { + "epoch": 7.926889838084832, + "grad_norm": 0.181640625, + "learning_rate": 0.00016585278633157843, + "loss": 0.5075, + "step": 159600 + }, + { + "epoch": 7.927386510380451, + "grad_norm": 0.1884765625, + "learning_rate": 0.0001658130525479289, + "loss": 0.4872, + "step": 159610 + }, + { + "epoch": 7.92788318267607, + "grad_norm": 0.185546875, + "learning_rate": 0.00016577331876427932, + "loss": 0.4864, + "step": 159620 + }, + { + "epoch": 7.9283798549716895, + "grad_norm": 0.181640625, + "learning_rate": 0.0001657335849806298, + "loss": 0.4786, + "step": 159630 + }, + { + "epoch": 7.928876527267309, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016569385119698024, + "loss": 0.511, + "step": 159640 + }, + { + "epoch": 7.929373199562928, + "grad_norm": 0.15234375, + "learning_rate": 0.0001656541174133307, + "loss": 0.4909, + "step": 159650 + }, + { + "epoch": 7.929869871858548, + "grad_norm": 0.1640625, + "learning_rate": 0.00016561438362968113, + "loss": 0.4978, + "step": 159660 + }, + { + "epoch": 7.930366544154167, + "grad_norm": 0.169921875, + "learning_rate": 0.0001655746498460316, + "loss": 0.475, + "step": 159670 + }, + { + "epoch": 7.9308632164497865, + "grad_norm": 0.158203125, + "learning_rate": 0.00016553491606238204, + "loss": 0.4994, + "step": 159680 + }, + { + "epoch": 7.931359888745406, + "grad_norm": 0.189453125, + "learning_rate": 0.0001654951822787325, + "loss": 0.5013, + "step": 159690 + }, + { + "epoch": 7.931856561041025, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016545544849508293, + "loss": 0.5227, + "step": 159700 + }, + { + "epoch": 7.932353233336644, + "grad_norm": 0.19921875, + "learning_rate": 0.0001654157147114334, + "loss": 0.4901, + "step": 159710 + }, + { + "epoch": 7.932849905632263, + "grad_norm": 0.146484375, + "learning_rate": 0.00016537598092778385, + "loss": 0.5077, + "step": 159720 + }, + { + "epoch": 7.933346577927884, + "grad_norm": 0.162109375, + "learning_rate": 0.00016533624714413432, + "loss": 0.4844, + "step": 159730 + }, + { + "epoch": 7.933843250223503, + "grad_norm": 0.1796875, + "learning_rate": 0.00016529651336048476, + "loss": 0.4831, + "step": 159740 + }, + { + "epoch": 7.934339922519122, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001652567795768352, + "loss": 0.4687, + "step": 159750 + }, + { + "epoch": 7.934836594814741, + "grad_norm": 0.1611328125, + "learning_rate": 0.00016521704579318568, + "loss": 0.4882, + "step": 159760 + }, + { + "epoch": 7.9353332671103605, + "grad_norm": 0.1513671875, + "learning_rate": 0.00016517731200953612, + "loss": 0.5135, + "step": 159770 + }, + { + "epoch": 7.93582993940598, + "grad_norm": 0.16015625, + "learning_rate": 0.00016513757822588657, + "loss": 0.5015, + "step": 159780 + }, + { + "epoch": 7.936326611701599, + "grad_norm": 0.15234375, + "learning_rate": 0.000165097844442237, + "loss": 0.4614, + "step": 159790 + }, + { + "epoch": 7.936823283997219, + "grad_norm": 0.1572265625, + "learning_rate": 0.00016505811065858748, + "loss": 0.4798, + "step": 159800 + }, + { + "epoch": 7.937319956292838, + "grad_norm": 0.1787109375, + "learning_rate": 0.00016501837687493793, + "loss": 0.4945, + "step": 159810 + }, + { + "epoch": 7.9378166285884575, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016497864309128837, + "loss": 0.4811, + "step": 159820 + }, + { + "epoch": 7.938313300884077, + "grad_norm": 0.1533203125, + "learning_rate": 0.00016493890930763881, + "loss": 0.497, + "step": 159830 + }, + { + "epoch": 7.938809973179696, + "grad_norm": 0.15625, + "learning_rate": 0.00016489917552398929, + "loss": 0.5042, + "step": 159840 + }, + { + "epoch": 7.939306645475315, + "grad_norm": 0.1484375, + "learning_rate": 0.00016485944174033973, + "loss": 0.496, + "step": 159850 + }, + { + "epoch": 7.939803317770934, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001648197079566902, + "loss": 0.5029, + "step": 159860 + }, + { + "epoch": 7.9402999900665545, + "grad_norm": 0.20703125, + "learning_rate": 0.00016477997417304062, + "loss": 0.479, + "step": 159870 + }, + { + "epoch": 7.940796662362174, + "grad_norm": 0.158203125, + "learning_rate": 0.0001647402403893911, + "loss": 0.5002, + "step": 159880 + }, + { + "epoch": 7.941293334657793, + "grad_norm": 0.1826171875, + "learning_rate": 0.00016470050660574153, + "loss": 0.4777, + "step": 159890 + }, + { + "epoch": 7.941790006953412, + "grad_norm": 0.203125, + "learning_rate": 0.000164660772822092, + "loss": 0.4911, + "step": 159900 + }, + { + "epoch": 7.942286679249031, + "grad_norm": 0.1904296875, + "learning_rate": 0.00016462103903844245, + "loss": 0.4854, + "step": 159910 + }, + { + "epoch": 7.942783351544651, + "grad_norm": 0.1484375, + "learning_rate": 0.0001645813052547929, + "loss": 0.5135, + "step": 159920 + }, + { + "epoch": 7.94328002384027, + "grad_norm": 0.14453125, + "learning_rate": 0.00016454157147114337, + "loss": 0.4886, + "step": 159930 + }, + { + "epoch": 7.94377669613589, + "grad_norm": 0.162109375, + "learning_rate": 0.0001645018376874938, + "loss": 0.4596, + "step": 159940 + }, + { + "epoch": 7.944273368431509, + "grad_norm": 0.173828125, + "learning_rate": 0.00016446210390384425, + "loss": 0.4739, + "step": 159950 + }, + { + "epoch": 7.9447700407271284, + "grad_norm": 0.158203125, + "learning_rate": 0.0001644223701201947, + "loss": 0.4765, + "step": 159960 + }, + { + "epoch": 7.945266713022748, + "grad_norm": 0.1416015625, + "learning_rate": 0.00016438263633654517, + "loss": 0.4598, + "step": 159970 + }, + { + "epoch": 7.945763385318367, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001643429025528956, + "loss": 0.5043, + "step": 159980 + }, + { + "epoch": 7.946260057613986, + "grad_norm": 0.1708984375, + "learning_rate": 0.00016430316876924606, + "loss": 0.514, + "step": 159990 + }, + { + "epoch": 7.946756729909605, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001642634349855965, + "loss": 0.4984, + "step": 160000 + }, + { + "epoch": 7.9472534022052255, + "grad_norm": 0.142578125, + "learning_rate": 0.00016422370120194697, + "loss": 0.498, + "step": 160010 + }, + { + "epoch": 7.947750074500845, + "grad_norm": 0.16796875, + "learning_rate": 0.00016418396741829742, + "loss": 0.51, + "step": 160020 + }, + { + "epoch": 7.948246746796464, + "grad_norm": 0.1875, + "learning_rate": 0.00016414423363464786, + "loss": 0.5087, + "step": 160030 + }, + { + "epoch": 7.948743419092083, + "grad_norm": 0.150390625, + "learning_rate": 0.0001641044998509983, + "loss": 0.5029, + "step": 160040 + }, + { + "epoch": 7.949240091387702, + "grad_norm": 0.1455078125, + "learning_rate": 0.00016406476606734878, + "loss": 0.5312, + "step": 160050 + }, + { + "epoch": 7.949736763683322, + "grad_norm": 0.16015625, + "learning_rate": 0.00016402503228369922, + "loss": 0.4919, + "step": 160060 + }, + { + "epoch": 7.950233435978941, + "grad_norm": 0.181640625, + "learning_rate": 0.00016398529850004967, + "loss": 0.4937, + "step": 160070 + }, + { + "epoch": 7.95073010827456, + "grad_norm": 0.1416015625, + "learning_rate": 0.00016394556471640014, + "loss": 0.475, + "step": 160080 + }, + { + "epoch": 7.95122678057018, + "grad_norm": 0.14453125, + "learning_rate": 0.00016390583093275058, + "loss": 0.4837, + "step": 160090 + }, + { + "epoch": 7.951723452865799, + "grad_norm": 0.1572265625, + "learning_rate": 0.00016386609714910103, + "loss": 0.4909, + "step": 160100 + }, + { + "epoch": 7.952220125161419, + "grad_norm": 0.2080078125, + "learning_rate": 0.00016382636336545147, + "loss": 0.5178, + "step": 160110 + }, + { + "epoch": 7.952716797457038, + "grad_norm": 0.15234375, + "learning_rate": 0.00016378662958180194, + "loss": 0.5154, + "step": 160120 + }, + { + "epoch": 7.953213469752657, + "grad_norm": 0.142578125, + "learning_rate": 0.00016374689579815239, + "loss": 0.4721, + "step": 160130 + }, + { + "epoch": 7.953710142048276, + "grad_norm": 0.158203125, + "learning_rate": 0.00016370716201450286, + "loss": 0.5122, + "step": 160140 + }, + { + "epoch": 7.9542068143438955, + "grad_norm": 0.154296875, + "learning_rate": 0.00016366742823085327, + "loss": 0.4571, + "step": 160150 + }, + { + "epoch": 7.954703486639515, + "grad_norm": 0.1689453125, + "learning_rate": 0.00016362769444720375, + "loss": 0.4744, + "step": 160160 + }, + { + "epoch": 7.955200158935135, + "grad_norm": 0.158203125, + "learning_rate": 0.0001635879606635542, + "loss": 0.498, + "step": 160170 + }, + { + "epoch": 7.955696831230754, + "grad_norm": 0.146484375, + "learning_rate": 0.00016354822687990466, + "loss": 0.4578, + "step": 160180 + }, + { + "epoch": 7.956193503526373, + "grad_norm": 0.14453125, + "learning_rate": 0.00016350849309625508, + "loss": 0.4896, + "step": 160190 + }, + { + "epoch": 7.956690175821993, + "grad_norm": 0.1455078125, + "learning_rate": 0.00016346875931260555, + "loss": 0.4738, + "step": 160200 + }, + { + "epoch": 7.957186848117612, + "grad_norm": 0.1591796875, + "learning_rate": 0.000163429025528956, + "loss": 0.5022, + "step": 160210 + }, + { + "epoch": 7.957683520413231, + "grad_norm": 0.158203125, + "learning_rate": 0.00016338929174530647, + "loss": 0.4943, + "step": 160220 + }, + { + "epoch": 7.95818019270885, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001633495579616569, + "loss": 0.4896, + "step": 160230 + }, + { + "epoch": 7.95867686500447, + "grad_norm": 0.1640625, + "learning_rate": 0.00016330982417800735, + "loss": 0.492, + "step": 160240 + }, + { + "epoch": 7.95917353730009, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001632700903943578, + "loss": 0.5101, + "step": 160250 + }, + { + "epoch": 7.959670209595709, + "grad_norm": 0.2001953125, + "learning_rate": 0.00016323035661070827, + "loss": 0.5329, + "step": 160260 + }, + { + "epoch": 7.960166881891328, + "grad_norm": 0.15234375, + "learning_rate": 0.00016319062282705871, + "loss": 0.476, + "step": 160270 + }, + { + "epoch": 7.960663554186947, + "grad_norm": 0.1552734375, + "learning_rate": 0.00016315088904340916, + "loss": 0.4737, + "step": 160280 + }, + { + "epoch": 7.9611602264825665, + "grad_norm": 0.1572265625, + "learning_rate": 0.00016311115525975963, + "loss": 0.4942, + "step": 160290 + }, + { + "epoch": 7.961656898778186, + "grad_norm": 0.15625, + "learning_rate": 0.00016307142147611007, + "loss": 0.5531, + "step": 160300 + }, + { + "epoch": 7.962153571073806, + "grad_norm": 0.1533203125, + "learning_rate": 0.00016303168769246054, + "loss": 0.5119, + "step": 160310 + }, + { + "epoch": 7.962650243369425, + "grad_norm": 0.146484375, + "learning_rate": 0.00016299195390881096, + "loss": 0.5132, + "step": 160320 + }, + { + "epoch": 7.963146915665044, + "grad_norm": 0.1767578125, + "learning_rate": 0.00016295222012516143, + "loss": 0.479, + "step": 160330 + }, + { + "epoch": 7.9636435879606635, + "grad_norm": 0.1611328125, + "learning_rate": 0.00016291248634151188, + "loss": 0.5006, + "step": 160340 + }, + { + "epoch": 7.964140260256283, + "grad_norm": 0.1484375, + "learning_rate": 0.00016287275255786235, + "loss": 0.4917, + "step": 160350 + }, + { + "epoch": 7.964636932551902, + "grad_norm": 0.169921875, + "learning_rate": 0.00016283301877421277, + "loss": 0.5246, + "step": 160360 + }, + { + "epoch": 7.965133604847521, + "grad_norm": 0.1953125, + "learning_rate": 0.00016279328499056324, + "loss": 0.4925, + "step": 160370 + }, + { + "epoch": 7.965630277143141, + "grad_norm": 0.16015625, + "learning_rate": 0.00016275355120691368, + "loss": 0.474, + "step": 160380 + }, + { + "epoch": 7.9661269494387605, + "grad_norm": 0.1708984375, + "learning_rate": 0.00016271381742326415, + "loss": 0.4957, + "step": 160390 + }, + { + "epoch": 7.96662362173438, + "grad_norm": 0.16796875, + "learning_rate": 0.00016267408363961457, + "loss": 0.4688, + "step": 160400 + }, + { + "epoch": 7.967120294029999, + "grad_norm": 0.166015625, + "learning_rate": 0.00016263434985596504, + "loss": 0.4886, + "step": 160410 + }, + { + "epoch": 7.967616966325618, + "grad_norm": 0.1552734375, + "learning_rate": 0.00016259461607231549, + "loss": 0.4947, + "step": 160420 + }, + { + "epoch": 7.9681136386212374, + "grad_norm": 0.166015625, + "learning_rate": 0.00016255488228866596, + "loss": 0.4888, + "step": 160430 + }, + { + "epoch": 7.968610310916857, + "grad_norm": 0.166015625, + "learning_rate": 0.0001625151485050164, + "loss": 0.4831, + "step": 160440 + }, + { + "epoch": 7.969106983212477, + "grad_norm": 0.16015625, + "learning_rate": 0.00016247541472136685, + "loss": 0.5021, + "step": 160450 + }, + { + "epoch": 7.969603655508096, + "grad_norm": 0.16796875, + "learning_rate": 0.00016243568093771732, + "loss": 0.4852, + "step": 160460 + }, + { + "epoch": 7.970100327803715, + "grad_norm": 0.1552734375, + "learning_rate": 0.00016239594715406776, + "loss": 0.4923, + "step": 160470 + }, + { + "epoch": 7.9705970000993345, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001623562133704182, + "loss": 0.4684, + "step": 160480 + }, + { + "epoch": 7.971093672394954, + "grad_norm": 0.166015625, + "learning_rate": 0.00016231647958676865, + "loss": 0.4737, + "step": 160490 + }, + { + "epoch": 7.971590344690573, + "grad_norm": 0.1689453125, + "learning_rate": 0.00016227674580311912, + "loss": 0.521, + "step": 160500 + }, + { + "epoch": 7.972087016986192, + "grad_norm": 0.1474609375, + "learning_rate": 0.00016223701201946957, + "loss": 0.468, + "step": 160510 + }, + { + "epoch": 7.972583689281812, + "grad_norm": 0.1552734375, + "learning_rate": 0.00016219727823582, + "loss": 0.5087, + "step": 160520 + }, + { + "epoch": 7.9730803615774315, + "grad_norm": 0.15625, + "learning_rate": 0.00016215754445217045, + "loss": 0.4756, + "step": 160530 + }, + { + "epoch": 7.973577033873051, + "grad_norm": 0.2099609375, + "learning_rate": 0.00016211781066852093, + "loss": 0.5339, + "step": 160540 + }, + { + "epoch": 7.97407370616867, + "grad_norm": 0.1484375, + "learning_rate": 0.00016207807688487137, + "loss": 0.4721, + "step": 160550 + }, + { + "epoch": 7.974570378464289, + "grad_norm": 0.14453125, + "learning_rate": 0.00016203834310122181, + "loss": 0.5024, + "step": 160560 + }, + { + "epoch": 7.975067050759908, + "grad_norm": 0.1416015625, + "learning_rate": 0.00016199860931757226, + "loss": 0.4412, + "step": 160570 + }, + { + "epoch": 7.975563723055528, + "grad_norm": 0.189453125, + "learning_rate": 0.00016195887553392273, + "loss": 0.4727, + "step": 160580 + }, + { + "epoch": 7.976060395351148, + "grad_norm": 0.138671875, + "learning_rate": 0.00016191914175027317, + "loss": 0.5233, + "step": 160590 + }, + { + "epoch": 7.976557067646767, + "grad_norm": 0.1943359375, + "learning_rate": 0.00016187940796662362, + "loss": 0.4815, + "step": 160600 + }, + { + "epoch": 7.977053739942386, + "grad_norm": 0.16796875, + "learning_rate": 0.0001618396741829741, + "loss": 0.4791, + "step": 160610 + }, + { + "epoch": 7.977550412238005, + "grad_norm": 0.154296875, + "learning_rate": 0.00016179994039932453, + "loss": 0.5128, + "step": 160620 + }, + { + "epoch": 7.978047084533625, + "grad_norm": 0.1591796875, + "learning_rate": 0.000161760206615675, + "loss": 0.4934, + "step": 160630 + }, + { + "epoch": 7.978543756829244, + "grad_norm": 0.15234375, + "learning_rate": 0.00016172047283202542, + "loss": 0.5397, + "step": 160640 + }, + { + "epoch": 7.979040429124863, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001616807390483759, + "loss": 0.4757, + "step": 160650 + }, + { + "epoch": 7.979537101420483, + "grad_norm": 0.1494140625, + "learning_rate": 0.00016164100526472634, + "loss": 0.4689, + "step": 160660 + }, + { + "epoch": 7.9800337737161025, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001616012714810768, + "loss": 0.5168, + "step": 160670 + }, + { + "epoch": 7.980530446011722, + "grad_norm": 0.1572265625, + "learning_rate": 0.00016156153769742723, + "loss": 0.4812, + "step": 160680 + }, + { + "epoch": 7.981027118307341, + "grad_norm": 0.15234375, + "learning_rate": 0.0001615218039137777, + "loss": 0.5067, + "step": 160690 + }, + { + "epoch": 7.98152379060296, + "grad_norm": 0.1455078125, + "learning_rate": 0.00016148207013012814, + "loss": 0.4903, + "step": 160700 + }, + { + "epoch": 7.982020462898579, + "grad_norm": 0.142578125, + "learning_rate": 0.0001614423363464786, + "loss": 0.4981, + "step": 160710 + }, + { + "epoch": 7.982517135194199, + "grad_norm": 0.1962890625, + "learning_rate": 0.00016140260256282906, + "loss": 0.5052, + "step": 160720 + }, + { + "epoch": 7.983013807489819, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001613628687791795, + "loss": 0.4815, + "step": 160730 + }, + { + "epoch": 7.983510479785438, + "grad_norm": 0.15625, + "learning_rate": 0.00016132313499552995, + "loss": 0.5113, + "step": 160740 + }, + { + "epoch": 7.984007152081057, + "grad_norm": 0.171875, + "learning_rate": 0.00016128340121188042, + "loss": 0.492, + "step": 160750 + }, + { + "epoch": 7.984503824376676, + "grad_norm": 0.1689453125, + "learning_rate": 0.00016124366742823086, + "loss": 0.5066, + "step": 160760 + }, + { + "epoch": 7.985000496672296, + "grad_norm": 0.19140625, + "learning_rate": 0.0001612039336445813, + "loss": 0.497, + "step": 160770 + }, + { + "epoch": 7.985497168967915, + "grad_norm": 0.1513671875, + "learning_rate": 0.00016116419986093178, + "loss": 0.5048, + "step": 160780 + }, + { + "epoch": 7.985993841263534, + "grad_norm": 0.224609375, + "learning_rate": 0.00016112446607728222, + "loss": 0.4885, + "step": 160790 + }, + { + "epoch": 7.986490513559153, + "grad_norm": 0.1611328125, + "learning_rate": 0.00016108473229363267, + "loss": 0.4969, + "step": 160800 + }, + { + "epoch": 7.986987185854773, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001610449985099831, + "loss": 0.4893, + "step": 160810 + }, + { + "epoch": 7.987483858150393, + "grad_norm": 0.1708984375, + "learning_rate": 0.00016100526472633358, + "loss": 0.4677, + "step": 160820 + }, + { + "epoch": 7.987980530446012, + "grad_norm": 0.1796875, + "learning_rate": 0.00016096553094268403, + "loss": 0.4925, + "step": 160830 + }, + { + "epoch": 7.988477202741631, + "grad_norm": 0.1640625, + "learning_rate": 0.0001609257971590345, + "loss": 0.489, + "step": 160840 + }, + { + "epoch": 7.98897387503725, + "grad_norm": 0.17578125, + "learning_rate": 0.00016088606337538491, + "loss": 0.4792, + "step": 160850 + }, + { + "epoch": 7.9894705473328695, + "grad_norm": 0.1708984375, + "learning_rate": 0.00016084632959173539, + "loss": 0.5002, + "step": 160860 + }, + { + "epoch": 7.989967219628489, + "grad_norm": 0.16015625, + "learning_rate": 0.00016080659580808583, + "loss": 0.4733, + "step": 160870 + }, + { + "epoch": 7.990463891924108, + "grad_norm": 0.166015625, + "learning_rate": 0.0001607668620244363, + "loss": 0.4713, + "step": 160880 + }, + { + "epoch": 7.990960564219728, + "grad_norm": 0.1533203125, + "learning_rate": 0.00016072712824078672, + "loss": 0.5031, + "step": 160890 + }, + { + "epoch": 7.991457236515347, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001606873944571372, + "loss": 0.4917, + "step": 160900 + }, + { + "epoch": 7.991953908810967, + "grad_norm": 0.146484375, + "learning_rate": 0.00016064766067348763, + "loss": 0.4995, + "step": 160910 + }, + { + "epoch": 7.992450581106586, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001606079268898381, + "loss": 0.5008, + "step": 160920 + }, + { + "epoch": 7.992947253402205, + "grad_norm": 0.1904296875, + "learning_rate": 0.00016056819310618855, + "loss": 0.4829, + "step": 160930 + }, + { + "epoch": 7.993443925697824, + "grad_norm": 0.1455078125, + "learning_rate": 0.000160528459322539, + "loss": 0.5061, + "step": 160940 + }, + { + "epoch": 7.9939405979934435, + "grad_norm": 0.1484375, + "learning_rate": 0.00016048872553888944, + "loss": 0.4793, + "step": 160950 + }, + { + "epoch": 7.994437270289064, + "grad_norm": 0.15625, + "learning_rate": 0.0001604489917552399, + "loss": 0.4777, + "step": 160960 + }, + { + "epoch": 7.994933942584683, + "grad_norm": 0.166015625, + "learning_rate": 0.00016040925797159035, + "loss": 0.4421, + "step": 160970 + }, + { + "epoch": 7.995430614880302, + "grad_norm": 0.150390625, + "learning_rate": 0.0001603695241879408, + "loss": 0.4834, + "step": 160980 + }, + { + "epoch": 7.995927287175921, + "grad_norm": 0.20703125, + "learning_rate": 0.00016032979040429127, + "loss": 0.5118, + "step": 160990 + }, + { + "epoch": 7.9964239594715405, + "grad_norm": 0.1796875, + "learning_rate": 0.0001602900566206417, + "loss": 0.4914, + "step": 161000 + }, + { + "epoch": 7.99692063176716, + "grad_norm": 0.150390625, + "learning_rate": 0.00016025032283699216, + "loss": 0.4723, + "step": 161010 + }, + { + "epoch": 7.997417304062779, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001602105890533426, + "loss": 0.5193, + "step": 161020 + }, + { + "epoch": 7.997913976358399, + "grad_norm": 0.1884765625, + "learning_rate": 0.00016017085526969307, + "loss": 0.5055, + "step": 161030 + }, + { + "epoch": 7.998410648654018, + "grad_norm": 0.1552734375, + "learning_rate": 0.00016013112148604352, + "loss": 0.454, + "step": 161040 + }, + { + "epoch": 7.9989073209496375, + "grad_norm": 0.1484375, + "learning_rate": 0.00016009138770239396, + "loss": 0.4973, + "step": 161050 + }, + { + "epoch": 7.999403993245257, + "grad_norm": 0.166015625, + "learning_rate": 0.0001600516539187444, + "loss": 0.4694, + "step": 161060 + }, + { + "epoch": 7.999900665540876, + "grad_norm": 0.173828125, + "learning_rate": 0.00016001192013509488, + "loss": 0.5157, + "step": 161070 + }, + { + "epoch": 8.000397337836496, + "grad_norm": 0.162109375, + "learning_rate": 0.00015997218635144532, + "loss": 0.5245, + "step": 161080 + }, + { + "epoch": 8.000894010132114, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015993245256779577, + "loss": 0.4829, + "step": 161090 + }, + { + "epoch": 8.001390682427735, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001598927187841462, + "loss": 0.4883, + "step": 161100 + }, + { + "epoch": 8.001887354723353, + "grad_norm": 0.1767578125, + "learning_rate": 0.00015985298500049668, + "loss": 0.478, + "step": 161110 + }, + { + "epoch": 8.002384027018973, + "grad_norm": 0.150390625, + "learning_rate": 0.00015981325121684713, + "loss": 0.508, + "step": 161120 + }, + { + "epoch": 8.002880699314593, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001597735174331976, + "loss": 0.4908, + "step": 161130 + }, + { + "epoch": 8.003377371610211, + "grad_norm": 0.146484375, + "learning_rate": 0.00015973378364954804, + "loss": 0.4701, + "step": 161140 + }, + { + "epoch": 8.003874043905832, + "grad_norm": 0.1552734375, + "learning_rate": 0.00015969404986589849, + "loss": 0.459, + "step": 161150 + }, + { + "epoch": 8.00437071620145, + "grad_norm": 0.1591796875, + "learning_rate": 0.00015965431608224896, + "loss": 0.4498, + "step": 161160 + }, + { + "epoch": 8.00486738849707, + "grad_norm": 0.177734375, + "learning_rate": 0.0001596145822985994, + "loss": 0.4693, + "step": 161170 + }, + { + "epoch": 8.005364060792688, + "grad_norm": 0.15625, + "learning_rate": 0.00015957484851494985, + "loss": 0.4906, + "step": 161180 + }, + { + "epoch": 8.005860733088308, + "grad_norm": 0.15625, + "learning_rate": 0.0001595351147313003, + "loss": 0.478, + "step": 161190 + }, + { + "epoch": 8.006357405383927, + "grad_norm": 0.142578125, + "learning_rate": 0.00015949538094765076, + "loss": 0.4693, + "step": 161200 + }, + { + "epoch": 8.006854077679547, + "grad_norm": 0.158203125, + "learning_rate": 0.0001594556471640012, + "loss": 0.4263, + "step": 161210 + }, + { + "epoch": 8.007350749975167, + "grad_norm": 0.1708984375, + "learning_rate": 0.00015941591338035165, + "loss": 0.4807, + "step": 161220 + }, + { + "epoch": 8.007847422270785, + "grad_norm": 0.169921875, + "learning_rate": 0.0001593761795967021, + "loss": 0.541, + "step": 161230 + }, + { + "epoch": 8.008344094566406, + "grad_norm": 0.1875, + "learning_rate": 0.00015933644581305256, + "loss": 0.4734, + "step": 161240 + }, + { + "epoch": 8.008840766862024, + "grad_norm": 0.18359375, + "learning_rate": 0.000159296712029403, + "loss": 0.4799, + "step": 161250 + }, + { + "epoch": 8.009337439157644, + "grad_norm": 0.142578125, + "learning_rate": 0.00015925697824575345, + "loss": 0.4768, + "step": 161260 + }, + { + "epoch": 8.009834111453262, + "grad_norm": 0.15625, + "learning_rate": 0.0001592172444621039, + "loss": 0.5115, + "step": 161270 + }, + { + "epoch": 8.010330783748882, + "grad_norm": 0.154296875, + "learning_rate": 0.00015917751067845437, + "loss": 0.4876, + "step": 161280 + }, + { + "epoch": 8.010827456044503, + "grad_norm": 0.15234375, + "learning_rate": 0.0001591377768948048, + "loss": 0.4816, + "step": 161290 + }, + { + "epoch": 8.01132412834012, + "grad_norm": 0.14453125, + "learning_rate": 0.00015909804311115526, + "loss": 0.4597, + "step": 161300 + }, + { + "epoch": 8.011820800635741, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015905830932750573, + "loss": 0.4957, + "step": 161310 + }, + { + "epoch": 8.01231747293136, + "grad_norm": 0.185546875, + "learning_rate": 0.00015901857554385617, + "loss": 0.4762, + "step": 161320 + }, + { + "epoch": 8.01281414522698, + "grad_norm": 0.1806640625, + "learning_rate": 0.00015897884176020664, + "loss": 0.4954, + "step": 161330 + }, + { + "epoch": 8.013310817522598, + "grad_norm": 0.154296875, + "learning_rate": 0.00015893910797655706, + "loss": 0.4877, + "step": 161340 + }, + { + "epoch": 8.013807489818218, + "grad_norm": 0.1806640625, + "learning_rate": 0.00015889937419290753, + "loss": 0.4988, + "step": 161350 + }, + { + "epoch": 8.014304162113838, + "grad_norm": 0.158203125, + "learning_rate": 0.00015885964040925798, + "loss": 0.4862, + "step": 161360 + }, + { + "epoch": 8.014800834409456, + "grad_norm": 0.1552734375, + "learning_rate": 0.00015881990662560845, + "loss": 0.4652, + "step": 161370 + }, + { + "epoch": 8.015297506705076, + "grad_norm": 0.162109375, + "learning_rate": 0.00015878017284195887, + "loss": 0.4944, + "step": 161380 + }, + { + "epoch": 8.015794179000695, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015874043905830934, + "loss": 0.4868, + "step": 161390 + }, + { + "epoch": 8.016290851296315, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015870070527465978, + "loss": 0.527, + "step": 161400 + }, + { + "epoch": 8.016787523591933, + "grad_norm": 0.158203125, + "learning_rate": 0.00015866097149101025, + "loss": 0.4963, + "step": 161410 + }, + { + "epoch": 8.017284195887553, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015862123770736067, + "loss": 0.4732, + "step": 161420 + }, + { + "epoch": 8.017780868183173, + "grad_norm": 0.1806640625, + "learning_rate": 0.00015858150392371114, + "loss": 0.4751, + "step": 161430 + }, + { + "epoch": 8.018277540478792, + "grad_norm": 0.1728515625, + "learning_rate": 0.00015854177014006159, + "loss": 0.45, + "step": 161440 + }, + { + "epoch": 8.018774212774412, + "grad_norm": 0.1591796875, + "learning_rate": 0.00015850203635641206, + "loss": 0.495, + "step": 161450 + }, + { + "epoch": 8.01927088507003, + "grad_norm": 0.1875, + "learning_rate": 0.0001584623025727625, + "loss": 0.4826, + "step": 161460 + }, + { + "epoch": 8.01976755736565, + "grad_norm": 0.171875, + "learning_rate": 0.00015842256878911295, + "loss": 0.5109, + "step": 161470 + }, + { + "epoch": 8.020264229661269, + "grad_norm": 0.16015625, + "learning_rate": 0.00015838283500546342, + "loss": 0.4613, + "step": 161480 + }, + { + "epoch": 8.020760901956889, + "grad_norm": 0.15234375, + "learning_rate": 0.00015834310122181386, + "loss": 0.4482, + "step": 161490 + }, + { + "epoch": 8.021257574252509, + "grad_norm": 0.1953125, + "learning_rate": 0.0001583033674381643, + "loss": 0.4715, + "step": 161500 + }, + { + "epoch": 8.021754246548127, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015826363365451475, + "loss": 0.5069, + "step": 161510 + }, + { + "epoch": 8.022250918843747, + "grad_norm": 0.1533203125, + "learning_rate": 0.00015822389987086522, + "loss": 0.4852, + "step": 161520 + }, + { + "epoch": 8.022747591139366, + "grad_norm": 0.1455078125, + "learning_rate": 0.00015818416608721566, + "loss": 0.4645, + "step": 161530 + }, + { + "epoch": 8.023244263434986, + "grad_norm": 0.1845703125, + "learning_rate": 0.00015814443230356614, + "loss": 0.4513, + "step": 161540 + }, + { + "epoch": 8.023740935730604, + "grad_norm": 0.2060546875, + "learning_rate": 0.00015810469851991655, + "loss": 0.4544, + "step": 161550 + }, + { + "epoch": 8.024237608026224, + "grad_norm": 0.1865234375, + "learning_rate": 0.00015806496473626702, + "loss": 0.4841, + "step": 161560 + }, + { + "epoch": 8.024734280321844, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015802523095261747, + "loss": 0.5216, + "step": 161570 + }, + { + "epoch": 8.025230952617463, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015798549716896794, + "loss": 0.5066, + "step": 161580 + }, + { + "epoch": 8.025727624913083, + "grad_norm": 0.1591796875, + "learning_rate": 0.00015794576338531836, + "loss": 0.4658, + "step": 161590 + }, + { + "epoch": 8.026224297208701, + "grad_norm": 0.162109375, + "learning_rate": 0.00015790602960166883, + "loss": 0.4869, + "step": 161600 + }, + { + "epoch": 8.026720969504321, + "grad_norm": 0.1591796875, + "learning_rate": 0.00015786629581801927, + "loss": 0.4999, + "step": 161610 + }, + { + "epoch": 8.02721764179994, + "grad_norm": 0.181640625, + "learning_rate": 0.00015782656203436974, + "loss": 0.483, + "step": 161620 + }, + { + "epoch": 8.02771431409556, + "grad_norm": 0.185546875, + "learning_rate": 0.0001577868282507202, + "loss": 0.491, + "step": 161630 + }, + { + "epoch": 8.02821098639118, + "grad_norm": 0.15625, + "learning_rate": 0.00015774709446707063, + "loss": 0.4676, + "step": 161640 + }, + { + "epoch": 8.028707658686798, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015770736068342108, + "loss": 0.466, + "step": 161650 + }, + { + "epoch": 8.029204330982418, + "grad_norm": 0.173828125, + "learning_rate": 0.00015766762689977155, + "loss": 0.5024, + "step": 161660 + }, + { + "epoch": 8.029701003278037, + "grad_norm": 0.201171875, + "learning_rate": 0.000157627893116122, + "loss": 0.4902, + "step": 161670 + }, + { + "epoch": 8.030197675573657, + "grad_norm": 0.189453125, + "learning_rate": 0.00015758815933247244, + "loss": 0.4683, + "step": 161680 + }, + { + "epoch": 8.030694347869275, + "grad_norm": 0.1640625, + "learning_rate": 0.0001575484255488229, + "loss": 0.484, + "step": 161690 + }, + { + "epoch": 8.031191020164895, + "grad_norm": 0.16015625, + "learning_rate": 0.00015750869176517335, + "loss": 0.5135, + "step": 161700 + }, + { + "epoch": 8.031687692460515, + "grad_norm": 0.1484375, + "learning_rate": 0.0001574689579815238, + "loss": 0.4741, + "step": 161710 + }, + { + "epoch": 8.032184364756134, + "grad_norm": 0.1826171875, + "learning_rate": 0.00015742922419787424, + "loss": 0.4908, + "step": 161720 + }, + { + "epoch": 8.032681037051754, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001573894904142247, + "loss": 0.4432, + "step": 161730 + }, + { + "epoch": 8.033177709347372, + "grad_norm": 0.1728515625, + "learning_rate": 0.00015734975663057516, + "loss": 0.4724, + "step": 161740 + }, + { + "epoch": 8.033674381642992, + "grad_norm": 0.158203125, + "learning_rate": 0.0001573100228469256, + "loss": 0.4892, + "step": 161750 + }, + { + "epoch": 8.03417105393861, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015727028906327605, + "loss": 0.4908, + "step": 161760 + }, + { + "epoch": 8.03466772623423, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015723055527962652, + "loss": 0.493, + "step": 161770 + }, + { + "epoch": 8.03516439852985, + "grad_norm": 0.201171875, + "learning_rate": 0.00015719082149597696, + "loss": 0.4561, + "step": 161780 + }, + { + "epoch": 8.03566107082547, + "grad_norm": 0.162109375, + "learning_rate": 0.0001571510877123274, + "loss": 0.5212, + "step": 161790 + }, + { + "epoch": 8.03615774312109, + "grad_norm": 0.1708984375, + "learning_rate": 0.00015711135392867785, + "loss": 0.4831, + "step": 161800 + }, + { + "epoch": 8.036654415416708, + "grad_norm": 0.1865234375, + "learning_rate": 0.00015707162014502832, + "loss": 0.4874, + "step": 161810 + }, + { + "epoch": 8.037151087712328, + "grad_norm": 0.1484375, + "learning_rate": 0.00015703188636137877, + "loss": 0.4625, + "step": 161820 + }, + { + "epoch": 8.037647760007946, + "grad_norm": 0.171875, + "learning_rate": 0.0001569921525777292, + "loss": 0.4806, + "step": 161830 + }, + { + "epoch": 8.038144432303566, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015695241879407968, + "loss": 0.4679, + "step": 161840 + }, + { + "epoch": 8.038641104599186, + "grad_norm": 0.171875, + "learning_rate": 0.00015691268501043012, + "loss": 0.4581, + "step": 161850 + }, + { + "epoch": 8.039137776894805, + "grad_norm": 0.1455078125, + "learning_rate": 0.0001568729512267806, + "loss": 0.4933, + "step": 161860 + }, + { + "epoch": 8.039634449190425, + "grad_norm": 0.1669921875, + "learning_rate": 0.000156833217443131, + "loss": 0.4792, + "step": 161870 + }, + { + "epoch": 8.040131121486043, + "grad_norm": 0.1708984375, + "learning_rate": 0.00015679348365948148, + "loss": 0.4653, + "step": 161880 + }, + { + "epoch": 8.040627793781663, + "grad_norm": 0.181640625, + "learning_rate": 0.00015675374987583193, + "loss": 0.4661, + "step": 161890 + }, + { + "epoch": 8.041124466077282, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001567140160921824, + "loss": 0.4943, + "step": 161900 + }, + { + "epoch": 8.041621138372902, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015667428230853282, + "loss": 0.5065, + "step": 161910 + }, + { + "epoch": 8.04211781066852, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001566345485248833, + "loss": 0.4755, + "step": 161920 + }, + { + "epoch": 8.04261448296414, + "grad_norm": 0.15234375, + "learning_rate": 0.00015659481474123373, + "loss": 0.4787, + "step": 161930 + }, + { + "epoch": 8.04311115525976, + "grad_norm": 0.15625, + "learning_rate": 0.0001565550809575842, + "loss": 0.4992, + "step": 161940 + }, + { + "epoch": 8.043607827555379, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015651534717393462, + "loss": 0.4444, + "step": 161950 + }, + { + "epoch": 8.044104499850999, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001564756133902851, + "loss": 0.4849, + "step": 161960 + }, + { + "epoch": 8.044601172146617, + "grad_norm": 0.2138671875, + "learning_rate": 0.00015643587960663554, + "loss": 0.5151, + "step": 161970 + }, + { + "epoch": 8.045097844442237, + "grad_norm": 0.162109375, + "learning_rate": 0.000156396145822986, + "loss": 0.4721, + "step": 161980 + }, + { + "epoch": 8.045594516737856, + "grad_norm": 0.1640625, + "learning_rate": 0.00015635641203933645, + "loss": 0.4897, + "step": 161990 + }, + { + "epoch": 8.046091189033476, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001563166782556869, + "loss": 0.4776, + "step": 162000 + }, + { + "epoch": 8.046587861329096, + "grad_norm": 0.1396484375, + "learning_rate": 0.00015627694447203737, + "loss": 0.4532, + "step": 162010 + }, + { + "epoch": 8.047084533624714, + "grad_norm": 0.177734375, + "learning_rate": 0.0001562372106883878, + "loss": 0.4664, + "step": 162020 + }, + { + "epoch": 8.047581205920334, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015619747690473828, + "loss": 0.4775, + "step": 162030 + }, + { + "epoch": 8.048077878215953, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001561577431210887, + "loss": 0.4718, + "step": 162040 + }, + { + "epoch": 8.048574550511573, + "grad_norm": 0.1494140625, + "learning_rate": 0.00015611800933743917, + "loss": 0.47, + "step": 162050 + }, + { + "epoch": 8.049071222807191, + "grad_norm": 0.1796875, + "learning_rate": 0.00015607827555378962, + "loss": 0.5064, + "step": 162060 + }, + { + "epoch": 8.049567895102811, + "grad_norm": 0.189453125, + "learning_rate": 0.0001560385417701401, + "loss": 0.4787, + "step": 162070 + }, + { + "epoch": 8.050064567398431, + "grad_norm": 0.203125, + "learning_rate": 0.0001559988079864905, + "loss": 0.4881, + "step": 162080 + }, + { + "epoch": 8.05056123969405, + "grad_norm": 0.1552734375, + "learning_rate": 0.00015595907420284098, + "loss": 0.4707, + "step": 162090 + }, + { + "epoch": 8.05105791198967, + "grad_norm": 0.15625, + "learning_rate": 0.00015591934041919142, + "loss": 0.4533, + "step": 162100 + }, + { + "epoch": 8.051554584285288, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001558796066355419, + "loss": 0.4785, + "step": 162110 + }, + { + "epoch": 8.052051256580908, + "grad_norm": 0.1884765625, + "learning_rate": 0.0001558398728518923, + "loss": 0.4745, + "step": 162120 + }, + { + "epoch": 8.052547928876526, + "grad_norm": 0.15625, + "learning_rate": 0.00015580013906824278, + "loss": 0.4967, + "step": 162130 + }, + { + "epoch": 8.053044601172147, + "grad_norm": 0.166015625, + "learning_rate": 0.00015576040528459323, + "loss": 0.4866, + "step": 162140 + }, + { + "epoch": 8.053541273467767, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001557206715009437, + "loss": 0.4914, + "step": 162150 + }, + { + "epoch": 8.054037945763385, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015568093771729414, + "loss": 0.4741, + "step": 162160 + }, + { + "epoch": 8.054534618059005, + "grad_norm": 0.16796875, + "learning_rate": 0.00015564120393364458, + "loss": 0.4544, + "step": 162170 + }, + { + "epoch": 8.055031290354624, + "grad_norm": 0.1669921875, + "learning_rate": 0.00015560147014999506, + "loss": 0.5047, + "step": 162180 + }, + { + "epoch": 8.055527962650244, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001555617363663455, + "loss": 0.4983, + "step": 162190 + }, + { + "epoch": 8.056024634945862, + "grad_norm": 0.1494140625, + "learning_rate": 0.00015552200258269594, + "loss": 0.4744, + "step": 162200 + }, + { + "epoch": 8.056521307241482, + "grad_norm": 0.15625, + "learning_rate": 0.0001554822687990464, + "loss": 0.5062, + "step": 162210 + }, + { + "epoch": 8.057017979537102, + "grad_norm": 0.1494140625, + "learning_rate": 0.00015544253501539686, + "loss": 0.4831, + "step": 162220 + }, + { + "epoch": 8.05751465183272, + "grad_norm": 0.154296875, + "learning_rate": 0.0001554028012317473, + "loss": 0.4767, + "step": 162230 + }, + { + "epoch": 8.05801132412834, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015536306744809775, + "loss": 0.457, + "step": 162240 + }, + { + "epoch": 8.058507996423959, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001553233336644482, + "loss": 0.4757, + "step": 162250 + }, + { + "epoch": 8.059004668719579, + "grad_norm": 0.177734375, + "learning_rate": 0.00015528359988079866, + "loss": 0.486, + "step": 162260 + }, + { + "epoch": 8.059501341015197, + "grad_norm": 0.1640625, + "learning_rate": 0.0001552438660971491, + "loss": 0.4749, + "step": 162270 + }, + { + "epoch": 8.059998013310818, + "grad_norm": 0.158203125, + "learning_rate": 0.00015520413231349955, + "loss": 0.4829, + "step": 162280 + }, + { + "epoch": 8.060494685606438, + "grad_norm": 0.1787109375, + "learning_rate": 0.00015516439852985, + "loss": 0.4866, + "step": 162290 + }, + { + "epoch": 8.060991357902056, + "grad_norm": 0.1455078125, + "learning_rate": 0.00015512466474620047, + "loss": 0.4831, + "step": 162300 + }, + { + "epoch": 8.061488030197676, + "grad_norm": 0.173828125, + "learning_rate": 0.0001550849309625509, + "loss": 0.4949, + "step": 162310 + }, + { + "epoch": 8.061984702493294, + "grad_norm": 0.18359375, + "learning_rate": 0.00015504519717890136, + "loss": 0.4942, + "step": 162320 + }, + { + "epoch": 8.062481374788915, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015500546339525183, + "loss": 0.4613, + "step": 162330 + }, + { + "epoch": 8.062978047084533, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015496572961160227, + "loss": 0.4588, + "step": 162340 + }, + { + "epoch": 8.063474719380153, + "grad_norm": 0.1513671875, + "learning_rate": 0.00015492599582795272, + "loss": 0.4524, + "step": 162350 + }, + { + "epoch": 8.063971391675773, + "grad_norm": 0.1708984375, + "learning_rate": 0.00015488626204430316, + "loss": 0.4873, + "step": 162360 + }, + { + "epoch": 8.064468063971391, + "grad_norm": 0.1962890625, + "learning_rate": 0.00015484652826065363, + "loss": 0.523, + "step": 162370 + }, + { + "epoch": 8.064964736267012, + "grad_norm": 0.1728515625, + "learning_rate": 0.00015480679447700408, + "loss": 0.4733, + "step": 162380 + }, + { + "epoch": 8.06546140856263, + "grad_norm": 0.15234375, + "learning_rate": 0.00015476706069335455, + "loss": 0.4955, + "step": 162390 + }, + { + "epoch": 8.06595808085825, + "grad_norm": 0.1875, + "learning_rate": 0.000154727326909705, + "loss": 0.5039, + "step": 162400 + }, + { + "epoch": 8.066454753153868, + "grad_norm": 0.142578125, + "learning_rate": 0.00015468759312605544, + "loss": 0.489, + "step": 162410 + }, + { + "epoch": 8.066951425449489, + "grad_norm": 0.16015625, + "learning_rate": 0.00015464785934240588, + "loss": 0.479, + "step": 162420 + }, + { + "epoch": 8.067448097745109, + "grad_norm": 0.1669921875, + "learning_rate": 0.00015460812555875635, + "loss": 0.4693, + "step": 162430 + }, + { + "epoch": 8.067944770040727, + "grad_norm": 0.1640625, + "learning_rate": 0.0001545683917751068, + "loss": 0.4643, + "step": 162440 + }, + { + "epoch": 8.068441442336347, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015452865799145724, + "loss": 0.4647, + "step": 162450 + }, + { + "epoch": 8.068938114631965, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015448892420780769, + "loss": 0.482, + "step": 162460 + }, + { + "epoch": 8.069434786927586, + "grad_norm": 0.138671875, + "learning_rate": 0.00015444919042415816, + "loss": 0.4824, + "step": 162470 + }, + { + "epoch": 8.069931459223204, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001544094566405086, + "loss": 0.4639, + "step": 162480 + }, + { + "epoch": 8.070428131518824, + "grad_norm": 0.154296875, + "learning_rate": 0.00015436972285685904, + "loss": 0.4406, + "step": 162490 + }, + { + "epoch": 8.070924803814444, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001543299890732095, + "loss": 0.478, + "step": 162500 + }, + { + "epoch": 8.071421476110062, + "grad_norm": 0.154296875, + "learning_rate": 0.00015429025528955996, + "loss": 0.4737, + "step": 162510 + }, + { + "epoch": 8.071918148405683, + "grad_norm": 0.203125, + "learning_rate": 0.0001542505215059104, + "loss": 0.4339, + "step": 162520 + }, + { + "epoch": 8.072414820701301, + "grad_norm": 0.1708984375, + "learning_rate": 0.00015421078772226085, + "loss": 0.498, + "step": 162530 + }, + { + "epoch": 8.072911492996921, + "grad_norm": 0.1591796875, + "learning_rate": 0.00015417105393861132, + "loss": 0.4988, + "step": 162540 + }, + { + "epoch": 8.07340816529254, + "grad_norm": 0.158203125, + "learning_rate": 0.00015413132015496176, + "loss": 0.4765, + "step": 162550 + }, + { + "epoch": 8.07390483758816, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015409158637131224, + "loss": 0.4536, + "step": 162560 + }, + { + "epoch": 8.074401509883778, + "grad_norm": 0.1494140625, + "learning_rate": 0.00015405185258766265, + "loss": 0.5135, + "step": 162570 + }, + { + "epoch": 8.074898182179398, + "grad_norm": 0.1533203125, + "learning_rate": 0.00015401211880401312, + "loss": 0.4666, + "step": 162580 + }, + { + "epoch": 8.075394854475018, + "grad_norm": 0.1591796875, + "learning_rate": 0.00015397238502036357, + "loss": 0.4725, + "step": 162590 + }, + { + "epoch": 8.075891526770636, + "grad_norm": 0.16796875, + "learning_rate": 0.00015393265123671404, + "loss": 0.4554, + "step": 162600 + }, + { + "epoch": 8.076388199066256, + "grad_norm": 0.15625, + "learning_rate": 0.00015389291745306446, + "loss": 0.4599, + "step": 162610 + }, + { + "epoch": 8.076884871361875, + "grad_norm": 0.1533203125, + "learning_rate": 0.00015385318366941493, + "loss": 0.4925, + "step": 162620 + }, + { + "epoch": 8.077381543657495, + "grad_norm": 0.1796875, + "learning_rate": 0.00015381344988576537, + "loss": 0.4482, + "step": 162630 + }, + { + "epoch": 8.077878215953113, + "grad_norm": 0.1728515625, + "learning_rate": 0.00015377371610211584, + "loss": 0.4978, + "step": 162640 + }, + { + "epoch": 8.078374888248733, + "grad_norm": 0.2041015625, + "learning_rate": 0.00015373398231846626, + "loss": 0.469, + "step": 162650 + }, + { + "epoch": 8.078871560544354, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015369424853481673, + "loss": 0.4678, + "step": 162660 + }, + { + "epoch": 8.079368232839972, + "grad_norm": 0.1669921875, + "learning_rate": 0.00015365451475116718, + "loss": 0.4798, + "step": 162670 + }, + { + "epoch": 8.079864905135592, + "grad_norm": 0.169921875, + "learning_rate": 0.00015361478096751765, + "loss": 0.4581, + "step": 162680 + }, + { + "epoch": 8.08036157743121, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001535750471838681, + "loss": 0.4758, + "step": 162690 + }, + { + "epoch": 8.08085824972683, + "grad_norm": 0.1826171875, + "learning_rate": 0.00015353531340021854, + "loss": 0.5031, + "step": 162700 + }, + { + "epoch": 8.081354922022449, + "grad_norm": 0.1611328125, + "learning_rate": 0.000153495579616569, + "loss": 0.4785, + "step": 162710 + }, + { + "epoch": 8.081851594318069, + "grad_norm": 0.16015625, + "learning_rate": 0.00015345584583291945, + "loss": 0.4417, + "step": 162720 + }, + { + "epoch": 8.082348266613689, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001534161120492699, + "loss": 0.5085, + "step": 162730 + }, + { + "epoch": 8.082844938909307, + "grad_norm": 0.158203125, + "learning_rate": 0.00015337637826562034, + "loss": 0.4757, + "step": 162740 + }, + { + "epoch": 8.083341611204927, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001533366444819708, + "loss": 0.4616, + "step": 162750 + }, + { + "epoch": 8.083838283500546, + "grad_norm": 0.1748046875, + "learning_rate": 0.00015329691069832126, + "loss": 0.4805, + "step": 162760 + }, + { + "epoch": 8.084334955796166, + "grad_norm": 0.169921875, + "learning_rate": 0.0001532571769146717, + "loss": 0.5201, + "step": 162770 + }, + { + "epoch": 8.084831628091784, + "grad_norm": 0.138671875, + "learning_rate": 0.00015321744313102214, + "loss": 0.4898, + "step": 162780 + }, + { + "epoch": 8.085328300387404, + "grad_norm": 0.15625, + "learning_rate": 0.00015317770934737262, + "loss": 0.5156, + "step": 162790 + }, + { + "epoch": 8.085824972683024, + "grad_norm": 0.1904296875, + "learning_rate": 0.00015313797556372306, + "loss": 0.5232, + "step": 162800 + }, + { + "epoch": 8.086321644978643, + "grad_norm": 0.150390625, + "learning_rate": 0.00015309824178007353, + "loss": 0.5013, + "step": 162810 + }, + { + "epoch": 8.086818317274263, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015305850799642395, + "loss": 0.5049, + "step": 162820 + }, + { + "epoch": 8.087314989569881, + "grad_norm": 0.1923828125, + "learning_rate": 0.00015301877421277442, + "loss": 0.4755, + "step": 162830 + }, + { + "epoch": 8.087811661865501, + "grad_norm": 0.1591796875, + "learning_rate": 0.00015297904042912486, + "loss": 0.4835, + "step": 162840 + }, + { + "epoch": 8.08830833416112, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015293930664547534, + "loss": 0.5212, + "step": 162850 + }, + { + "epoch": 8.08880500645674, + "grad_norm": 0.1513671875, + "learning_rate": 0.00015289957286182578, + "loss": 0.488, + "step": 162860 + }, + { + "epoch": 8.08930167875236, + "grad_norm": 0.16796875, + "learning_rate": 0.00015285983907817622, + "loss": 0.5045, + "step": 162870 + }, + { + "epoch": 8.089798351047978, + "grad_norm": 0.16015625, + "learning_rate": 0.0001528201052945267, + "loss": 0.4862, + "step": 162880 + }, + { + "epoch": 8.090295023343598, + "grad_norm": 0.1484375, + "learning_rate": 0.00015278037151087714, + "loss": 0.4815, + "step": 162890 + }, + { + "epoch": 8.090791695639217, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015274063772722758, + "loss": 0.5271, + "step": 162900 + }, + { + "epoch": 8.091288367934837, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015270090394357803, + "loss": 0.5063, + "step": 162910 + }, + { + "epoch": 8.091785040230455, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001526611701599285, + "loss": 0.4708, + "step": 162920 + }, + { + "epoch": 8.092281712526075, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015262143637627894, + "loss": 0.4968, + "step": 162930 + }, + { + "epoch": 8.092778384821695, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001525817025926294, + "loss": 0.5011, + "step": 162940 + }, + { + "epoch": 8.093275057117314, + "grad_norm": 0.1669921875, + "learning_rate": 0.00015254196880897983, + "loss": 0.488, + "step": 162950 + }, + { + "epoch": 8.093771729412934, + "grad_norm": 0.154296875, + "learning_rate": 0.0001525022350253303, + "loss": 0.4811, + "step": 162960 + }, + { + "epoch": 8.094268401708552, + "grad_norm": 0.169921875, + "learning_rate": 0.00015246250124168075, + "loss": 0.4724, + "step": 162970 + }, + { + "epoch": 8.094765074004172, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001524227674580312, + "loss": 0.5173, + "step": 162980 + }, + { + "epoch": 8.09526174629979, + "grad_norm": 0.1396484375, + "learning_rate": 0.00015238303367438164, + "loss": 0.4809, + "step": 162990 + }, + { + "epoch": 8.09575841859541, + "grad_norm": 0.166015625, + "learning_rate": 0.0001523432998907321, + "loss": 0.5081, + "step": 163000 + }, + { + "epoch": 8.096255090891031, + "grad_norm": 0.1591796875, + "learning_rate": 0.00015230356610708255, + "loss": 0.4998, + "step": 163010 + }, + { + "epoch": 8.09675176318665, + "grad_norm": 0.1533203125, + "learning_rate": 0.000152263832323433, + "loss": 0.4874, + "step": 163020 + }, + { + "epoch": 8.09724843548227, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015222409853978347, + "loss": 0.4991, + "step": 163030 + }, + { + "epoch": 8.097745107777888, + "grad_norm": 0.197265625, + "learning_rate": 0.0001521843647561339, + "loss": 0.4939, + "step": 163040 + }, + { + "epoch": 8.098241780073508, + "grad_norm": 0.1552734375, + "learning_rate": 0.00015214463097248436, + "loss": 0.4714, + "step": 163050 + }, + { + "epoch": 8.098738452369126, + "grad_norm": 0.150390625, + "learning_rate": 0.0001521048971888348, + "loss": 0.4791, + "step": 163060 + }, + { + "epoch": 8.099235124664746, + "grad_norm": 0.171875, + "learning_rate": 0.00015206516340518527, + "loss": 0.4741, + "step": 163070 + }, + { + "epoch": 8.099731796960366, + "grad_norm": 0.146484375, + "learning_rate": 0.00015202542962153572, + "loss": 0.5234, + "step": 163080 + }, + { + "epoch": 8.100228469255985, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001519856958378862, + "loss": 0.5092, + "step": 163090 + }, + { + "epoch": 8.100725141551605, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001519459620542366, + "loss": 0.4809, + "step": 163100 + }, + { + "epoch": 8.101221813847223, + "grad_norm": 0.1767578125, + "learning_rate": 0.00015190622827058708, + "loss": 0.4739, + "step": 163110 + }, + { + "epoch": 8.101718486142843, + "grad_norm": 0.1806640625, + "learning_rate": 0.00015186649448693752, + "loss": 0.5185, + "step": 163120 + }, + { + "epoch": 8.102215158438462, + "grad_norm": 0.1591796875, + "learning_rate": 0.000151826760703288, + "loss": 0.4776, + "step": 163130 + }, + { + "epoch": 8.102711830734082, + "grad_norm": 0.1455078125, + "learning_rate": 0.0001517870269196384, + "loss": 0.4871, + "step": 163140 + }, + { + "epoch": 8.103208503029702, + "grad_norm": 0.19140625, + "learning_rate": 0.00015174729313598888, + "loss": 0.4446, + "step": 163150 + }, + { + "epoch": 8.10370517532532, + "grad_norm": 0.154296875, + "learning_rate": 0.00015170755935233932, + "loss": 0.4787, + "step": 163160 + }, + { + "epoch": 8.10420184762094, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001516678255686898, + "loss": 0.4722, + "step": 163170 + }, + { + "epoch": 8.104698519916559, + "grad_norm": 0.1591796875, + "learning_rate": 0.00015162809178504024, + "loss": 0.5063, + "step": 163180 + }, + { + "epoch": 8.105195192212179, + "grad_norm": 0.177734375, + "learning_rate": 0.00015158835800139068, + "loss": 0.5127, + "step": 163190 + }, + { + "epoch": 8.105691864507797, + "grad_norm": 0.15625, + "learning_rate": 0.00015154862421774113, + "loss": 0.4697, + "step": 163200 + }, + { + "epoch": 8.106188536803417, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001515088904340916, + "loss": 0.5029, + "step": 163210 + }, + { + "epoch": 8.106685209099037, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015146915665044204, + "loss": 0.4469, + "step": 163220 + }, + { + "epoch": 8.107181881394656, + "grad_norm": 0.15234375, + "learning_rate": 0.0001514294228667925, + "loss": 0.4893, + "step": 163230 + }, + { + "epoch": 8.107678553690276, + "grad_norm": 0.150390625, + "learning_rate": 0.00015138968908314296, + "loss": 0.5028, + "step": 163240 + }, + { + "epoch": 8.108175225985894, + "grad_norm": 0.1953125, + "learning_rate": 0.0001513499552994934, + "loss": 0.5145, + "step": 163250 + }, + { + "epoch": 8.108671898281514, + "grad_norm": 0.1533203125, + "learning_rate": 0.00015131022151584388, + "loss": 0.4838, + "step": 163260 + }, + { + "epoch": 8.109168570577133, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001512704877321943, + "loss": 0.4886, + "step": 163270 + }, + { + "epoch": 8.109665242872753, + "grad_norm": 0.15625, + "learning_rate": 0.00015123075394854476, + "loss": 0.4737, + "step": 163280 + }, + { + "epoch": 8.110161915168371, + "grad_norm": 0.19140625, + "learning_rate": 0.0001511910201648952, + "loss": 0.5064, + "step": 163290 + }, + { + "epoch": 8.110658587463991, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015115128638124568, + "loss": 0.4509, + "step": 163300 + }, + { + "epoch": 8.111155259759611, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001511115525975961, + "loss": 0.4886, + "step": 163310 + }, + { + "epoch": 8.11165193205523, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015107181881394657, + "loss": 0.4764, + "step": 163320 + }, + { + "epoch": 8.11214860435085, + "grad_norm": 0.1435546875, + "learning_rate": 0.000151032085030297, + "loss": 0.4824, + "step": 163330 + }, + { + "epoch": 8.112645276646468, + "grad_norm": 0.1708984375, + "learning_rate": 0.00015099235124664748, + "loss": 0.4774, + "step": 163340 + }, + { + "epoch": 8.113141948942088, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001509526174629979, + "loss": 0.4948, + "step": 163350 + }, + { + "epoch": 8.113638621237707, + "grad_norm": 0.1494140625, + "learning_rate": 0.00015091288367934837, + "loss": 0.4568, + "step": 163360 + }, + { + "epoch": 8.114135293533327, + "grad_norm": 0.158203125, + "learning_rate": 0.00015087314989569882, + "loss": 0.4688, + "step": 163370 + }, + { + "epoch": 8.114631965828947, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001508334161120493, + "loss": 0.4855, + "step": 163380 + }, + { + "epoch": 8.115128638124565, + "grad_norm": 0.16015625, + "learning_rate": 0.00015079368232839973, + "loss": 0.4662, + "step": 163390 + }, + { + "epoch": 8.115625310420185, + "grad_norm": 0.1552734375, + "learning_rate": 0.00015075394854475018, + "loss": 0.491, + "step": 163400 + }, + { + "epoch": 8.116121982715804, + "grad_norm": 0.1806640625, + "learning_rate": 0.00015071421476110065, + "loss": 0.4785, + "step": 163410 + }, + { + "epoch": 8.116618655011424, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001506744809774511, + "loss": 0.4968, + "step": 163420 + }, + { + "epoch": 8.117115327307042, + "grad_norm": 0.1640625, + "learning_rate": 0.00015063474719380154, + "loss": 0.4812, + "step": 163430 + }, + { + "epoch": 8.117611999602662, + "grad_norm": 0.15625, + "learning_rate": 0.00015059501341015198, + "loss": 0.5179, + "step": 163440 + }, + { + "epoch": 8.118108671898282, + "grad_norm": 0.1708984375, + "learning_rate": 0.00015055527962650245, + "loss": 0.5029, + "step": 163450 + }, + { + "epoch": 8.1186053441939, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001505155458428529, + "loss": 0.5008, + "step": 163460 + }, + { + "epoch": 8.11910201648952, + "grad_norm": 0.1748046875, + "learning_rate": 0.00015047581205920334, + "loss": 0.5056, + "step": 163470 + }, + { + "epoch": 8.119598688785139, + "grad_norm": 0.154296875, + "learning_rate": 0.00015043607827555378, + "loss": 0.4666, + "step": 163480 + }, + { + "epoch": 8.12009536108076, + "grad_norm": 0.150390625, + "learning_rate": 0.00015039634449190426, + "loss": 0.5242, + "step": 163490 + }, + { + "epoch": 8.120592033376377, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001503566107082547, + "loss": 0.4416, + "step": 163500 + }, + { + "epoch": 8.121088705671998, + "grad_norm": 0.1591796875, + "learning_rate": 0.00015031687692460514, + "loss": 0.5143, + "step": 163510 + }, + { + "epoch": 8.121585377967618, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001502771431409556, + "loss": 0.4992, + "step": 163520 + }, + { + "epoch": 8.122082050263236, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015023740935730606, + "loss": 0.4799, + "step": 163530 + }, + { + "epoch": 8.122578722558856, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001501976755736565, + "loss": 0.4654, + "step": 163540 + }, + { + "epoch": 8.123075394854474, + "grad_norm": 0.1884765625, + "learning_rate": 0.00015015794179000695, + "loss": 0.5201, + "step": 163550 + }, + { + "epoch": 8.123572067150095, + "grad_norm": 0.162109375, + "learning_rate": 0.00015011820800635742, + "loss": 0.4963, + "step": 163560 + }, + { + "epoch": 8.124068739445713, + "grad_norm": 0.1494140625, + "learning_rate": 0.00015007847422270786, + "loss": 0.477, + "step": 163570 + }, + { + "epoch": 8.124565411741333, + "grad_norm": 0.185546875, + "learning_rate": 0.00015003874043905834, + "loss": 0.4793, + "step": 163580 + }, + { + "epoch": 8.125062084036953, + "grad_norm": 0.1591796875, + "learning_rate": 0.00014999900665540875, + "loss": 0.4878, + "step": 163590 + }, + { + "epoch": 8.125558756332572, + "grad_norm": 0.1787109375, + "learning_rate": 0.00014995927287175922, + "loss": 0.4903, + "step": 163600 + }, + { + "epoch": 8.126055428628192, + "grad_norm": 0.1513671875, + "learning_rate": 0.00014991953908810967, + "loss": 0.4818, + "step": 163610 + }, + { + "epoch": 8.12655210092381, + "grad_norm": 0.1796875, + "learning_rate": 0.00014987980530446014, + "loss": 0.48, + "step": 163620 + }, + { + "epoch": 8.12704877321943, + "grad_norm": 0.19140625, + "learning_rate": 0.00014984007152081056, + "loss": 0.4938, + "step": 163630 + }, + { + "epoch": 8.127545445515048, + "grad_norm": 0.16796875, + "learning_rate": 0.00014980033773716103, + "loss": 0.484, + "step": 163640 + }, + { + "epoch": 8.128042117810669, + "grad_norm": 0.158203125, + "learning_rate": 0.00014976060395351147, + "loss": 0.4536, + "step": 163650 + }, + { + "epoch": 8.128538790106289, + "grad_norm": 0.1533203125, + "learning_rate": 0.00014972087016986194, + "loss": 0.4506, + "step": 163660 + }, + { + "epoch": 8.129035462401907, + "grad_norm": 0.185546875, + "learning_rate": 0.0001496811363862124, + "loss": 0.4585, + "step": 163670 + }, + { + "epoch": 8.129532134697527, + "grad_norm": 0.169921875, + "learning_rate": 0.00014964140260256283, + "loss": 0.4976, + "step": 163680 + }, + { + "epoch": 8.130028806993145, + "grad_norm": 0.1611328125, + "learning_rate": 0.00014960166881891328, + "loss": 0.4749, + "step": 163690 + }, + { + "epoch": 8.130525479288766, + "grad_norm": 0.1591796875, + "learning_rate": 0.00014956193503526375, + "loss": 0.4705, + "step": 163700 + }, + { + "epoch": 8.131022151584384, + "grad_norm": 0.14453125, + "learning_rate": 0.0001495222012516142, + "loss": 0.4701, + "step": 163710 + }, + { + "epoch": 8.131518823880004, + "grad_norm": 0.169921875, + "learning_rate": 0.00014948246746796464, + "loss": 0.5111, + "step": 163720 + }, + { + "epoch": 8.132015496175624, + "grad_norm": 0.154296875, + "learning_rate": 0.0001494427336843151, + "loss": 0.4588, + "step": 163730 + }, + { + "epoch": 8.132512168471242, + "grad_norm": 0.1826171875, + "learning_rate": 0.00014940299990066555, + "loss": 0.4914, + "step": 163740 + }, + { + "epoch": 8.133008840766863, + "grad_norm": 0.1611328125, + "learning_rate": 0.000149363266117016, + "loss": 0.5111, + "step": 163750 + }, + { + "epoch": 8.133505513062481, + "grad_norm": 0.1806640625, + "learning_rate": 0.00014932353233336644, + "loss": 0.529, + "step": 163760 + }, + { + "epoch": 8.134002185358101, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001492837985497169, + "loss": 0.4805, + "step": 163770 + }, + { + "epoch": 8.13449885765372, + "grad_norm": 0.1474609375, + "learning_rate": 0.00014924406476606736, + "loss": 0.4843, + "step": 163780 + }, + { + "epoch": 8.13499552994934, + "grad_norm": 0.15234375, + "learning_rate": 0.00014920433098241783, + "loss": 0.4816, + "step": 163790 + }, + { + "epoch": 8.13549220224496, + "grad_norm": 0.1474609375, + "learning_rate": 0.00014916459719876824, + "loss": 0.5106, + "step": 163800 + }, + { + "epoch": 8.135988874540578, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014912486341511872, + "loss": 0.4676, + "step": 163810 + }, + { + "epoch": 8.136485546836198, + "grad_norm": 0.193359375, + "learning_rate": 0.00014908512963146916, + "loss": 0.4662, + "step": 163820 + }, + { + "epoch": 8.136982219131816, + "grad_norm": 0.2001953125, + "learning_rate": 0.00014904539584781963, + "loss": 0.4769, + "step": 163830 + }, + { + "epoch": 8.137478891427437, + "grad_norm": 0.173828125, + "learning_rate": 0.00014900566206417005, + "loss": 0.4501, + "step": 163840 + }, + { + "epoch": 8.137975563723055, + "grad_norm": 0.177734375, + "learning_rate": 0.00014896592828052052, + "loss": 0.4867, + "step": 163850 + }, + { + "epoch": 8.138472236018675, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014892619449687096, + "loss": 0.4927, + "step": 163860 + }, + { + "epoch": 8.138968908314295, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014888646071322144, + "loss": 0.4846, + "step": 163870 + }, + { + "epoch": 8.139465580609913, + "grad_norm": 0.171875, + "learning_rate": 0.00014884672692957188, + "loss": 0.4848, + "step": 163880 + }, + { + "epoch": 8.139962252905534, + "grad_norm": 0.16796875, + "learning_rate": 0.00014880699314592232, + "loss": 0.5082, + "step": 163890 + }, + { + "epoch": 8.140458925201152, + "grad_norm": 0.1591796875, + "learning_rate": 0.00014876725936227277, + "loss": 0.4832, + "step": 163900 + }, + { + "epoch": 8.140955597496772, + "grad_norm": 0.150390625, + "learning_rate": 0.00014872752557862324, + "loss": 0.4672, + "step": 163910 + }, + { + "epoch": 8.14145226979239, + "grad_norm": 0.1611328125, + "learning_rate": 0.00014868779179497368, + "loss": 0.4833, + "step": 163920 + }, + { + "epoch": 8.14194894208801, + "grad_norm": 0.16015625, + "learning_rate": 0.00014864805801132413, + "loss": 0.484, + "step": 163930 + }, + { + "epoch": 8.142445614383629, + "grad_norm": 0.15625, + "learning_rate": 0.0001486083242276746, + "loss": 0.498, + "step": 163940 + }, + { + "epoch": 8.142942286679249, + "grad_norm": 0.15625, + "learning_rate": 0.00014856859044402504, + "loss": 0.4554, + "step": 163950 + }, + { + "epoch": 8.143438958974869, + "grad_norm": 0.154296875, + "learning_rate": 0.0001485288566603755, + "loss": 0.4805, + "step": 163960 + }, + { + "epoch": 8.143935631270487, + "grad_norm": 0.171875, + "learning_rate": 0.00014848912287672593, + "loss": 0.446, + "step": 163970 + }, + { + "epoch": 8.144432303566107, + "grad_norm": 0.158203125, + "learning_rate": 0.0001484493890930764, + "loss": 0.4562, + "step": 163980 + }, + { + "epoch": 8.144928975861726, + "grad_norm": 0.1591796875, + "learning_rate": 0.00014840965530942685, + "loss": 0.4906, + "step": 163990 + }, + { + "epoch": 8.145425648157346, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001483699215257773, + "loss": 0.4687, + "step": 164000 + }, + { + "epoch": 8.145922320452964, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014833018774212774, + "loss": 0.4506, + "step": 164010 + }, + { + "epoch": 8.146418992748584, + "grad_norm": 0.171875, + "learning_rate": 0.0001482904539584782, + "loss": 0.4859, + "step": 164020 + }, + { + "epoch": 8.146915665044205, + "grad_norm": 0.1572265625, + "learning_rate": 0.00014825072017482865, + "loss": 0.493, + "step": 164030 + }, + { + "epoch": 8.147412337339823, + "grad_norm": 0.16015625, + "learning_rate": 0.0001482109863911791, + "loss": 0.4749, + "step": 164040 + }, + { + "epoch": 8.147909009635443, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014817125260752954, + "loss": 0.4579, + "step": 164050 + }, + { + "epoch": 8.148405681931061, + "grad_norm": 0.16015625, + "learning_rate": 0.00014813151882388, + "loss": 0.5064, + "step": 164060 + }, + { + "epoch": 8.148902354226681, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014809178504023046, + "loss": 0.4737, + "step": 164070 + }, + { + "epoch": 8.1493990265223, + "grad_norm": 0.162109375, + "learning_rate": 0.00014805205125658093, + "loss": 0.5041, + "step": 164080 + }, + { + "epoch": 8.14989569881792, + "grad_norm": 0.1591796875, + "learning_rate": 0.00014801231747293137, + "loss": 0.49, + "step": 164090 + }, + { + "epoch": 8.15039237111354, + "grad_norm": 0.158203125, + "learning_rate": 0.00014797258368928182, + "loss": 0.4585, + "step": 164100 + }, + { + "epoch": 8.150889043409158, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001479328499056323, + "loss": 0.4613, + "step": 164110 + }, + { + "epoch": 8.151385715704778, + "grad_norm": 0.197265625, + "learning_rate": 0.00014789311612198273, + "loss": 0.4807, + "step": 164120 + }, + { + "epoch": 8.151882388000397, + "grad_norm": 0.15234375, + "learning_rate": 0.00014785338233833318, + "loss": 0.4522, + "step": 164130 + }, + { + "epoch": 8.152379060296017, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014781364855468362, + "loss": 0.4512, + "step": 164140 + }, + { + "epoch": 8.152875732591635, + "grad_norm": 0.169921875, + "learning_rate": 0.0001477739147710341, + "loss": 0.4485, + "step": 164150 + }, + { + "epoch": 8.153372404887255, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014773418098738454, + "loss": 0.5078, + "step": 164160 + }, + { + "epoch": 8.153869077182875, + "grad_norm": 0.1640625, + "learning_rate": 0.00014769444720373498, + "loss": 0.4984, + "step": 164170 + }, + { + "epoch": 8.154365749478494, + "grad_norm": 0.142578125, + "learning_rate": 0.00014765471342008542, + "loss": 0.5128, + "step": 164180 + }, + { + "epoch": 8.154862421774114, + "grad_norm": 0.1640625, + "learning_rate": 0.0001476149796364359, + "loss": 0.4821, + "step": 164190 + }, + { + "epoch": 8.155359094069732, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014757524585278634, + "loss": 0.4764, + "step": 164200 + }, + { + "epoch": 8.155855766365352, + "grad_norm": 0.1591796875, + "learning_rate": 0.00014753551206913678, + "loss": 0.4622, + "step": 164210 + }, + { + "epoch": 8.15635243866097, + "grad_norm": 0.173828125, + "learning_rate": 0.00014749577828548723, + "loss": 0.4858, + "step": 164220 + }, + { + "epoch": 8.15684911095659, + "grad_norm": 0.169921875, + "learning_rate": 0.0001474560445018377, + "loss": 0.4987, + "step": 164230 + }, + { + "epoch": 8.157345783252211, + "grad_norm": 0.1904296875, + "learning_rate": 0.00014741631071818814, + "loss": 0.4995, + "step": 164240 + }, + { + "epoch": 8.15784245554783, + "grad_norm": 0.154296875, + "learning_rate": 0.0001473765769345386, + "loss": 0.4922, + "step": 164250 + }, + { + "epoch": 8.15833912784345, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014733684315088906, + "loss": 0.5072, + "step": 164260 + }, + { + "epoch": 8.158835800139068, + "grad_norm": 0.177734375, + "learning_rate": 0.0001472971093672395, + "loss": 0.4662, + "step": 164270 + }, + { + "epoch": 8.159332472434688, + "grad_norm": 0.1962890625, + "learning_rate": 0.00014725737558358998, + "loss": 0.4912, + "step": 164280 + }, + { + "epoch": 8.159829144730306, + "grad_norm": 0.1875, + "learning_rate": 0.0001472176417999404, + "loss": 0.4743, + "step": 164290 + }, + { + "epoch": 8.160325817025926, + "grad_norm": 0.1728515625, + "learning_rate": 0.00014717790801629086, + "loss": 0.5222, + "step": 164300 + }, + { + "epoch": 8.160822489321546, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001471381742326413, + "loss": 0.4899, + "step": 164310 + }, + { + "epoch": 8.161319161617165, + "grad_norm": 0.1552734375, + "learning_rate": 0.00014709844044899178, + "loss": 0.5119, + "step": 164320 + }, + { + "epoch": 8.161815833912785, + "grad_norm": 0.1796875, + "learning_rate": 0.0001470587066653422, + "loss": 0.4994, + "step": 164330 + }, + { + "epoch": 8.162312506208403, + "grad_norm": 0.15625, + "learning_rate": 0.00014701897288169267, + "loss": 0.4725, + "step": 164340 + }, + { + "epoch": 8.162809178504023, + "grad_norm": 0.166015625, + "learning_rate": 0.0001469792390980431, + "loss": 0.4845, + "step": 164350 + }, + { + "epoch": 8.163305850799642, + "grad_norm": 0.166015625, + "learning_rate": 0.00014693950531439358, + "loss": 0.5011, + "step": 164360 + }, + { + "epoch": 8.163802523095262, + "grad_norm": 0.1787109375, + "learning_rate": 0.000146899771530744, + "loss": 0.4875, + "step": 164370 + }, + { + "epoch": 8.164299195390882, + "grad_norm": 0.1533203125, + "learning_rate": 0.00014686003774709447, + "loss": 0.4793, + "step": 164380 + }, + { + "epoch": 8.1647958676865, + "grad_norm": 0.1708984375, + "learning_rate": 0.00014682030396344492, + "loss": 0.4925, + "step": 164390 + }, + { + "epoch": 8.16529253998212, + "grad_norm": 0.169921875, + "learning_rate": 0.0001467805701797954, + "loss": 0.5141, + "step": 164400 + }, + { + "epoch": 8.165789212277739, + "grad_norm": 0.15234375, + "learning_rate": 0.00014674083639614583, + "loss": 0.4963, + "step": 164410 + }, + { + "epoch": 8.166285884573359, + "grad_norm": 0.173828125, + "learning_rate": 0.00014670110261249628, + "loss": 0.4683, + "step": 164420 + }, + { + "epoch": 8.166782556868977, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014666136882884675, + "loss": 0.4896, + "step": 164430 + }, + { + "epoch": 8.167279229164597, + "grad_norm": 0.15625, + "learning_rate": 0.0001466216350451972, + "loss": 0.4534, + "step": 164440 + }, + { + "epoch": 8.167775901460217, + "grad_norm": 0.1611328125, + "learning_rate": 0.00014658190126154764, + "loss": 0.4682, + "step": 164450 + }, + { + "epoch": 8.168272573755836, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014654216747789808, + "loss": 0.4751, + "step": 164460 + }, + { + "epoch": 8.168769246051456, + "grad_norm": 0.158203125, + "learning_rate": 0.00014650243369424855, + "loss": 0.4642, + "step": 164470 + }, + { + "epoch": 8.169265918347074, + "grad_norm": 0.1611328125, + "learning_rate": 0.000146462699910599, + "loss": 0.4879, + "step": 164480 + }, + { + "epoch": 8.169762590642694, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014642296612694947, + "loss": 0.497, + "step": 164490 + }, + { + "epoch": 8.170259262938313, + "grad_norm": 0.1494140625, + "learning_rate": 0.00014638323234329988, + "loss": 0.4932, + "step": 164500 + }, + { + "epoch": 8.170755935233933, + "grad_norm": 0.16796875, + "learning_rate": 0.00014634349855965036, + "loss": 0.5398, + "step": 164510 + }, + { + "epoch": 8.171252607529553, + "grad_norm": 0.16796875, + "learning_rate": 0.0001463037647760008, + "loss": 0.4897, + "step": 164520 + }, + { + "epoch": 8.171749279825171, + "grad_norm": 0.1796875, + "learning_rate": 0.00014626403099235127, + "loss": 0.4901, + "step": 164530 + }, + { + "epoch": 8.172245952120791, + "grad_norm": 0.15234375, + "learning_rate": 0.0001462242972087017, + "loss": 0.446, + "step": 164540 + }, + { + "epoch": 8.17274262441641, + "grad_norm": 0.15625, + "learning_rate": 0.00014618456342505216, + "loss": 0.4869, + "step": 164550 + }, + { + "epoch": 8.17323929671203, + "grad_norm": 0.1875, + "learning_rate": 0.0001461448296414026, + "loss": 0.4803, + "step": 164560 + }, + { + "epoch": 8.173735969007648, + "grad_norm": 0.17578125, + "learning_rate": 0.00014610509585775308, + "loss": 0.4773, + "step": 164570 + }, + { + "epoch": 8.174232641303268, + "grad_norm": 0.1416015625, + "learning_rate": 0.00014606536207410352, + "loss": 0.4384, + "step": 164580 + }, + { + "epoch": 8.174729313598888, + "grad_norm": 0.189453125, + "learning_rate": 0.00014602562829045396, + "loss": 0.4849, + "step": 164590 + }, + { + "epoch": 8.175225985894507, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001459858945068044, + "loss": 0.4766, + "step": 164600 + }, + { + "epoch": 8.175722658190127, + "grad_norm": 0.154296875, + "learning_rate": 0.00014594616072315488, + "loss": 0.4943, + "step": 164610 + }, + { + "epoch": 8.176219330485745, + "grad_norm": 0.1572265625, + "learning_rate": 0.00014590642693950532, + "loss": 0.4399, + "step": 164620 + }, + { + "epoch": 8.176716002781365, + "grad_norm": 0.158203125, + "learning_rate": 0.00014586669315585577, + "loss": 0.4539, + "step": 164630 + }, + { + "epoch": 8.177212675076984, + "grad_norm": 0.17578125, + "learning_rate": 0.00014582695937220624, + "loss": 0.5025, + "step": 164640 + }, + { + "epoch": 8.177709347372604, + "grad_norm": 0.251953125, + "learning_rate": 0.00014578722558855668, + "loss": 0.5087, + "step": 164650 + }, + { + "epoch": 8.178206019668224, + "grad_norm": 0.169921875, + "learning_rate": 0.00014574749180490713, + "loss": 0.5102, + "step": 164660 + }, + { + "epoch": 8.178702691963842, + "grad_norm": 0.1728515625, + "learning_rate": 0.00014570775802125757, + "loss": 0.4862, + "step": 164670 + }, + { + "epoch": 8.179199364259462, + "grad_norm": 0.2021484375, + "learning_rate": 0.00014566802423760804, + "loss": 0.4795, + "step": 164680 + }, + { + "epoch": 8.17969603655508, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001456282904539585, + "loss": 0.5009, + "step": 164690 + }, + { + "epoch": 8.1801927088507, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014558855667030893, + "loss": 0.4772, + "step": 164700 + }, + { + "epoch": 8.180689381146319, + "grad_norm": 0.166015625, + "learning_rate": 0.00014554882288665938, + "loss": 0.5138, + "step": 164710 + }, + { + "epoch": 8.18118605344194, + "grad_norm": 0.16796875, + "learning_rate": 0.00014550908910300985, + "loss": 0.4987, + "step": 164720 + }, + { + "epoch": 8.181682725737558, + "grad_norm": 0.16015625, + "learning_rate": 0.0001454693553193603, + "loss": 0.4644, + "step": 164730 + }, + { + "epoch": 8.182179398033178, + "grad_norm": 0.1708984375, + "learning_rate": 0.00014542962153571074, + "loss": 0.481, + "step": 164740 + }, + { + "epoch": 8.182676070328798, + "grad_norm": 0.158203125, + "learning_rate": 0.00014538988775206118, + "loss": 0.4189, + "step": 164750 + }, + { + "epoch": 8.183172742624416, + "grad_norm": 0.16796875, + "learning_rate": 0.00014535015396841165, + "loss": 0.4587, + "step": 164760 + }, + { + "epoch": 8.183669414920036, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001453104201847621, + "loss": 0.4845, + "step": 164770 + }, + { + "epoch": 8.184166087215655, + "grad_norm": 0.2080078125, + "learning_rate": 0.00014527068640111254, + "loss": 0.4905, + "step": 164780 + }, + { + "epoch": 8.184662759511275, + "grad_norm": 0.1611328125, + "learning_rate": 0.000145230952617463, + "loss": 0.4633, + "step": 164790 + }, + { + "epoch": 8.185159431806893, + "grad_norm": 0.16796875, + "learning_rate": 0.00014519121883381346, + "loss": 0.4864, + "step": 164800 + }, + { + "epoch": 8.185656104102513, + "grad_norm": 0.1767578125, + "learning_rate": 0.00014515148505016393, + "loss": 0.501, + "step": 164810 + }, + { + "epoch": 8.186152776398133, + "grad_norm": 0.1748046875, + "learning_rate": 0.00014511175126651434, + "loss": 0.4567, + "step": 164820 + }, + { + "epoch": 8.186649448693752, + "grad_norm": 0.177734375, + "learning_rate": 0.00014507201748286482, + "loss": 0.4716, + "step": 164830 + }, + { + "epoch": 8.187146120989372, + "grad_norm": 0.1767578125, + "learning_rate": 0.00014503228369921526, + "loss": 0.4943, + "step": 164840 + }, + { + "epoch": 8.18764279328499, + "grad_norm": 0.1552734375, + "learning_rate": 0.00014499254991556573, + "loss": 0.4988, + "step": 164850 + }, + { + "epoch": 8.18813946558061, + "grad_norm": 0.14453125, + "learning_rate": 0.00014495281613191615, + "loss": 0.4557, + "step": 164860 + }, + { + "epoch": 8.188636137876228, + "grad_norm": 0.15625, + "learning_rate": 0.00014491308234826662, + "loss": 0.4808, + "step": 164870 + }, + { + "epoch": 8.189132810171849, + "grad_norm": 0.1728515625, + "learning_rate": 0.00014487334856461706, + "loss": 0.4897, + "step": 164880 + }, + { + "epoch": 8.189629482467469, + "grad_norm": 0.19140625, + "learning_rate": 0.00014483361478096754, + "loss": 0.4593, + "step": 164890 + }, + { + "epoch": 8.190126154763087, + "grad_norm": 0.189453125, + "learning_rate": 0.00014479388099731795, + "loss": 0.5139, + "step": 164900 + }, + { + "epoch": 8.190622827058707, + "grad_norm": 0.189453125, + "learning_rate": 0.00014475414721366842, + "loss": 0.4925, + "step": 164910 + }, + { + "epoch": 8.191119499354325, + "grad_norm": 0.162109375, + "learning_rate": 0.00014471441343001887, + "loss": 0.4642, + "step": 164920 + }, + { + "epoch": 8.191616171649946, + "grad_norm": 0.1533203125, + "learning_rate": 0.00014467467964636934, + "loss": 0.4647, + "step": 164930 + }, + { + "epoch": 8.192112843945564, + "grad_norm": 0.1767578125, + "learning_rate": 0.00014463494586271978, + "loss": 0.4486, + "step": 164940 + }, + { + "epoch": 8.192609516241184, + "grad_norm": 0.16796875, + "learning_rate": 0.00014459521207907023, + "loss": 0.5145, + "step": 164950 + }, + { + "epoch": 8.193106188536804, + "grad_norm": 0.16796875, + "learning_rate": 0.0001445554782954207, + "loss": 0.4644, + "step": 164960 + }, + { + "epoch": 8.193602860832423, + "grad_norm": 0.1611328125, + "learning_rate": 0.00014451574451177114, + "loss": 0.4735, + "step": 164970 + }, + { + "epoch": 8.194099533128043, + "grad_norm": 0.1572265625, + "learning_rate": 0.00014447601072812161, + "loss": 0.4492, + "step": 164980 + }, + { + "epoch": 8.194596205423661, + "grad_norm": 0.1767578125, + "learning_rate": 0.00014443627694447203, + "loss": 0.4675, + "step": 164990 + }, + { + "epoch": 8.195092877719281, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001443965431608225, + "loss": 0.4816, + "step": 165000 + }, + { + "epoch": 8.1955895500149, + "grad_norm": 0.158203125, + "learning_rate": 0.00014435680937717295, + "loss": 0.4675, + "step": 165010 + }, + { + "epoch": 8.19608622231052, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014431707559352342, + "loss": 0.4893, + "step": 165020 + }, + { + "epoch": 8.19658289460614, + "grad_norm": 0.1591796875, + "learning_rate": 0.00014427734180987384, + "loss": 0.4923, + "step": 165030 + }, + { + "epoch": 8.197079566901758, + "grad_norm": 0.15234375, + "learning_rate": 0.0001442376080262243, + "loss": 0.5033, + "step": 165040 + }, + { + "epoch": 8.197576239197378, + "grad_norm": 0.1923828125, + "learning_rate": 0.00014419787424257475, + "loss": 0.454, + "step": 165050 + }, + { + "epoch": 8.198072911492996, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014415814045892522, + "loss": 0.4845, + "step": 165060 + }, + { + "epoch": 8.198569583788617, + "grad_norm": 0.177734375, + "learning_rate": 0.00014411840667527564, + "loss": 0.4828, + "step": 165070 + }, + { + "epoch": 8.199066256084235, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001440786728916261, + "loss": 0.4931, + "step": 165080 + }, + { + "epoch": 8.199562928379855, + "grad_norm": 0.169921875, + "learning_rate": 0.00014403893910797656, + "loss": 0.5006, + "step": 165090 + }, + { + "epoch": 8.200059600675475, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014399920532432703, + "loss": 0.4702, + "step": 165100 + }, + { + "epoch": 8.200556272971093, + "grad_norm": 0.1572265625, + "learning_rate": 0.00014395947154067747, + "loss": 0.4977, + "step": 165110 + }, + { + "epoch": 8.201052945266714, + "grad_norm": 0.16796875, + "learning_rate": 0.00014391973775702792, + "loss": 0.4561, + "step": 165120 + }, + { + "epoch": 8.201549617562332, + "grad_norm": 0.189453125, + "learning_rate": 0.0001438800039733784, + "loss": 0.5116, + "step": 165130 + }, + { + "epoch": 8.202046289857952, + "grad_norm": 0.1513671875, + "learning_rate": 0.00014384027018972883, + "loss": 0.4462, + "step": 165140 + }, + { + "epoch": 8.20254296215357, + "grad_norm": 0.2119140625, + "learning_rate": 0.00014380053640607928, + "loss": 0.4528, + "step": 165150 + }, + { + "epoch": 8.20303963444919, + "grad_norm": 0.15625, + "learning_rate": 0.00014376080262242972, + "loss": 0.4725, + "step": 165160 + }, + { + "epoch": 8.20353630674481, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001437210688387802, + "loss": 0.483, + "step": 165170 + }, + { + "epoch": 8.204032979040429, + "grad_norm": 0.1611328125, + "learning_rate": 0.00014368133505513064, + "loss": 0.4928, + "step": 165180 + }, + { + "epoch": 8.204529651336049, + "grad_norm": 0.173828125, + "learning_rate": 0.00014364160127148108, + "loss": 0.4657, + "step": 165190 + }, + { + "epoch": 8.205026323631667, + "grad_norm": 0.171875, + "learning_rate": 0.00014360186748783152, + "loss": 0.4717, + "step": 165200 + }, + { + "epoch": 8.205522995927288, + "grad_norm": 0.162109375, + "learning_rate": 0.000143562133704182, + "loss": 0.457, + "step": 165210 + }, + { + "epoch": 8.206019668222906, + "grad_norm": 0.16015625, + "learning_rate": 0.00014352239992053244, + "loss": 0.5057, + "step": 165220 + }, + { + "epoch": 8.206516340518526, + "grad_norm": 0.1513671875, + "learning_rate": 0.00014348266613688288, + "loss": 0.4999, + "step": 165230 + }, + { + "epoch": 8.207013012814146, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014344293235323333, + "loss": 0.4944, + "step": 165240 + }, + { + "epoch": 8.207509685109764, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001434031985695838, + "loss": 0.4968, + "step": 165250 + }, + { + "epoch": 8.208006357405385, + "grad_norm": 0.16796875, + "learning_rate": 0.00014336346478593424, + "loss": 0.4919, + "step": 165260 + }, + { + "epoch": 8.208503029701003, + "grad_norm": 0.208984375, + "learning_rate": 0.0001433237310022847, + "loss": 0.5149, + "step": 165270 + }, + { + "epoch": 8.208999701996623, + "grad_norm": 0.1513671875, + "learning_rate": 0.00014328399721863516, + "loss": 0.4786, + "step": 165280 + }, + { + "epoch": 8.209496374292241, + "grad_norm": 0.158203125, + "learning_rate": 0.0001432442634349856, + "loss": 0.5123, + "step": 165290 + }, + { + "epoch": 8.209993046587861, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014320452965133605, + "loss": 0.4706, + "step": 165300 + }, + { + "epoch": 8.21048971888348, + "grad_norm": 0.1796875, + "learning_rate": 0.0001431647958676865, + "loss": 0.5235, + "step": 165310 + }, + { + "epoch": 8.2109863911791, + "grad_norm": 0.2041015625, + "learning_rate": 0.00014312506208403696, + "loss": 0.4814, + "step": 165320 + }, + { + "epoch": 8.21148306347472, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001430853283003874, + "loss": 0.464, + "step": 165330 + }, + { + "epoch": 8.211979735770338, + "grad_norm": 0.1767578125, + "learning_rate": 0.00014304559451673788, + "loss": 0.5068, + "step": 165340 + }, + { + "epoch": 8.212476408065958, + "grad_norm": 0.1796875, + "learning_rate": 0.00014300586073308832, + "loss": 0.5117, + "step": 165350 + }, + { + "epoch": 8.212973080361577, + "grad_norm": 0.16796875, + "learning_rate": 0.00014296612694943877, + "loss": 0.4758, + "step": 165360 + }, + { + "epoch": 8.213469752657197, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001429263931657892, + "loss": 0.4912, + "step": 165370 + }, + { + "epoch": 8.213966424952815, + "grad_norm": 0.1513671875, + "learning_rate": 0.00014288665938213968, + "loss": 0.476, + "step": 165380 + }, + { + "epoch": 8.214463097248435, + "grad_norm": 0.1806640625, + "learning_rate": 0.00014284692559849013, + "loss": 0.5403, + "step": 165390 + }, + { + "epoch": 8.214959769544055, + "grad_norm": 0.1572265625, + "learning_rate": 0.00014280719181484057, + "loss": 0.4758, + "step": 165400 + }, + { + "epoch": 8.215456441839674, + "grad_norm": 0.2080078125, + "learning_rate": 0.00014276745803119102, + "loss": 0.4988, + "step": 165410 + }, + { + "epoch": 8.215953114135294, + "grad_norm": 0.166015625, + "learning_rate": 0.0001427277242475415, + "loss": 0.4871, + "step": 165420 + }, + { + "epoch": 8.216449786430912, + "grad_norm": 0.15234375, + "learning_rate": 0.00014268799046389193, + "loss": 0.4574, + "step": 165430 + }, + { + "epoch": 8.216946458726532, + "grad_norm": 0.189453125, + "learning_rate": 0.00014264825668024238, + "loss": 0.4665, + "step": 165440 + }, + { + "epoch": 8.21744313102215, + "grad_norm": 0.1708984375, + "learning_rate": 0.00014260852289659282, + "loss": 0.4975, + "step": 165450 + }, + { + "epoch": 8.21793980331777, + "grad_norm": 0.17578125, + "learning_rate": 0.0001425687891129433, + "loss": 0.479, + "step": 165460 + }, + { + "epoch": 8.218436475613391, + "grad_norm": 0.1748046875, + "learning_rate": 0.00014252905532929374, + "loss": 0.4817, + "step": 165470 + }, + { + "epoch": 8.21893314790901, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014248932154564418, + "loss": 0.4735, + "step": 165480 + }, + { + "epoch": 8.21942982020463, + "grad_norm": 0.150390625, + "learning_rate": 0.00014244958776199465, + "loss": 0.4687, + "step": 165490 + }, + { + "epoch": 8.219926492500248, + "grad_norm": 0.208984375, + "learning_rate": 0.0001424098539783451, + "loss": 0.4996, + "step": 165500 + }, + { + "epoch": 8.220423164795868, + "grad_norm": 0.1474609375, + "learning_rate": 0.00014237012019469557, + "loss": 0.4872, + "step": 165510 + }, + { + "epoch": 8.220919837091486, + "grad_norm": 0.166015625, + "learning_rate": 0.00014233038641104598, + "loss": 0.5011, + "step": 165520 + }, + { + "epoch": 8.221416509387106, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014229065262739646, + "loss": 0.4914, + "step": 165530 + }, + { + "epoch": 8.221913181682726, + "grad_norm": 0.185546875, + "learning_rate": 0.0001422509188437469, + "loss": 0.4813, + "step": 165540 + }, + { + "epoch": 8.222409853978345, + "grad_norm": 0.1611328125, + "learning_rate": 0.00014221118506009737, + "loss": 0.4656, + "step": 165550 + }, + { + "epoch": 8.222906526273965, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001421714512764478, + "loss": 0.4844, + "step": 165560 + }, + { + "epoch": 8.223403198569583, + "grad_norm": 0.240234375, + "learning_rate": 0.00014213171749279826, + "loss": 0.5205, + "step": 165570 + }, + { + "epoch": 8.223899870865203, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001420919837091487, + "loss": 0.4786, + "step": 165580 + }, + { + "epoch": 8.224396543160822, + "grad_norm": 0.169921875, + "learning_rate": 0.00014205224992549917, + "loss": 0.5083, + "step": 165590 + }, + { + "epoch": 8.224893215456442, + "grad_norm": 0.154296875, + "learning_rate": 0.0001420125161418496, + "loss": 0.4834, + "step": 165600 + }, + { + "epoch": 8.225389887752062, + "grad_norm": 0.181640625, + "learning_rate": 0.00014197278235820006, + "loss": 0.4658, + "step": 165610 + }, + { + "epoch": 8.22588656004768, + "grad_norm": 0.181640625, + "learning_rate": 0.0001419330485745505, + "loss": 0.5276, + "step": 165620 + }, + { + "epoch": 8.2263832323433, + "grad_norm": 0.1533203125, + "learning_rate": 0.00014189331479090098, + "loss": 0.4855, + "step": 165630 + }, + { + "epoch": 8.226879904638919, + "grad_norm": 0.162109375, + "learning_rate": 0.00014185358100725142, + "loss": 0.4836, + "step": 165640 + }, + { + "epoch": 8.227376576934539, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014181384722360187, + "loss": 0.4809, + "step": 165650 + }, + { + "epoch": 8.227873249230157, + "grad_norm": 0.1513671875, + "learning_rate": 0.00014177411343995234, + "loss": 0.498, + "step": 165660 + }, + { + "epoch": 8.228369921525777, + "grad_norm": 0.1748046875, + "learning_rate": 0.00014173437965630278, + "loss": 0.5017, + "step": 165670 + }, + { + "epoch": 8.228866593821397, + "grad_norm": 0.1796875, + "learning_rate": 0.00014169464587265323, + "loss": 0.4874, + "step": 165680 + }, + { + "epoch": 8.229363266117016, + "grad_norm": 0.169921875, + "learning_rate": 0.00014165491208900367, + "loss": 0.4943, + "step": 165690 + }, + { + "epoch": 8.229859938412636, + "grad_norm": 0.1552734375, + "learning_rate": 0.00014161517830535414, + "loss": 0.4968, + "step": 165700 + }, + { + "epoch": 8.230356610708254, + "grad_norm": 0.1796875, + "learning_rate": 0.0001415754445217046, + "loss": 0.4797, + "step": 165710 + }, + { + "epoch": 8.230853283003874, + "grad_norm": 0.1884765625, + "learning_rate": 0.00014153571073805503, + "loss": 0.4862, + "step": 165720 + }, + { + "epoch": 8.231349955299493, + "grad_norm": 0.171875, + "learning_rate": 0.00014149597695440548, + "loss": 0.4464, + "step": 165730 + }, + { + "epoch": 8.231846627595113, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014145624317075595, + "loss": 0.4743, + "step": 165740 + }, + { + "epoch": 8.232343299890733, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001414165093871064, + "loss": 0.5065, + "step": 165750 + }, + { + "epoch": 8.232839972186351, + "grad_norm": 0.1796875, + "learning_rate": 0.00014137677560345686, + "loss": 0.4617, + "step": 165760 + }, + { + "epoch": 8.233336644481971, + "grad_norm": 0.16015625, + "learning_rate": 0.00014133704181980728, + "loss": 0.494, + "step": 165770 + }, + { + "epoch": 8.23383331677759, + "grad_norm": 0.16796875, + "learning_rate": 0.00014129730803615775, + "loss": 0.4749, + "step": 165780 + }, + { + "epoch": 8.23432998907321, + "grad_norm": 0.150390625, + "learning_rate": 0.0001412575742525082, + "loss": 0.467, + "step": 165790 + }, + { + "epoch": 8.234826661368828, + "grad_norm": 0.1845703125, + "learning_rate": 0.00014121784046885867, + "loss": 0.4945, + "step": 165800 + }, + { + "epoch": 8.235323333664448, + "grad_norm": 0.166015625, + "learning_rate": 0.0001411781066852091, + "loss": 0.5129, + "step": 165810 + }, + { + "epoch": 8.235820005960068, + "grad_norm": 0.22265625, + "learning_rate": 0.00014113837290155956, + "loss": 0.5185, + "step": 165820 + }, + { + "epoch": 8.236316678255687, + "grad_norm": 0.150390625, + "learning_rate": 0.00014109863911791003, + "loss": 0.5067, + "step": 165830 + }, + { + "epoch": 8.236813350551307, + "grad_norm": 0.1533203125, + "learning_rate": 0.00014105890533426047, + "loss": 0.4612, + "step": 165840 + }, + { + "epoch": 8.237310022846925, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014101917155061092, + "loss": 0.5043, + "step": 165850 + }, + { + "epoch": 8.237806695142545, + "grad_norm": 0.1640625, + "learning_rate": 0.00014097943776696136, + "loss": 0.4934, + "step": 165860 + }, + { + "epoch": 8.238303367438164, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014093970398331183, + "loss": 0.495, + "step": 165870 + }, + { + "epoch": 8.238800039733784, + "grad_norm": 0.1611328125, + "learning_rate": 0.00014089997019966228, + "loss": 0.4677, + "step": 165880 + }, + { + "epoch": 8.239296712029404, + "grad_norm": 0.1953125, + "learning_rate": 0.00014086023641601272, + "loss": 0.4895, + "step": 165890 + }, + { + "epoch": 8.239793384325022, + "grad_norm": 0.1787109375, + "learning_rate": 0.00014082050263236316, + "loss": 0.4675, + "step": 165900 + }, + { + "epoch": 8.240290056620642, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014078076884871363, + "loss": 0.4758, + "step": 165910 + }, + { + "epoch": 8.24078672891626, + "grad_norm": 0.162109375, + "learning_rate": 0.00014074103506506408, + "loss": 0.4697, + "step": 165920 + }, + { + "epoch": 8.24128340121188, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014070130128141452, + "loss": 0.4968, + "step": 165930 + }, + { + "epoch": 8.241780073507499, + "grad_norm": 0.1640625, + "learning_rate": 0.00014066156749776497, + "loss": 0.477, + "step": 165940 + }, + { + "epoch": 8.24227674580312, + "grad_norm": 0.171875, + "learning_rate": 0.00014062183371411544, + "loss": 0.4564, + "step": 165950 + }, + { + "epoch": 8.24277341809874, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014058209993046588, + "loss": 0.4658, + "step": 165960 + }, + { + "epoch": 8.243270090394358, + "grad_norm": 0.1572265625, + "learning_rate": 0.00014054236614681633, + "loss": 0.4952, + "step": 165970 + }, + { + "epoch": 8.243766762689978, + "grad_norm": 0.203125, + "learning_rate": 0.0001405026323631668, + "loss": 0.4651, + "step": 165980 + }, + { + "epoch": 8.244263434985596, + "grad_norm": 0.2138671875, + "learning_rate": 0.00014046289857951724, + "loss": 0.4815, + "step": 165990 + }, + { + "epoch": 8.244760107281216, + "grad_norm": 0.171875, + "learning_rate": 0.00014042316479586771, + "loss": 0.5055, + "step": 166000 + }, + { + "epoch": 8.245256779576835, + "grad_norm": 0.1552734375, + "learning_rate": 0.00014038343101221813, + "loss": 0.5135, + "step": 166010 + }, + { + "epoch": 8.245753451872455, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001403436972285686, + "loss": 0.52, + "step": 166020 + }, + { + "epoch": 8.246250124168075, + "grad_norm": 0.1875, + "learning_rate": 0.00014030396344491905, + "loss": 0.4857, + "step": 166030 + }, + { + "epoch": 8.246746796463693, + "grad_norm": 0.18359375, + "learning_rate": 0.00014026422966126952, + "loss": 0.5122, + "step": 166040 + }, + { + "epoch": 8.247243468759313, + "grad_norm": 0.1845703125, + "learning_rate": 0.00014022449587761994, + "loss": 0.489, + "step": 166050 + }, + { + "epoch": 8.247740141054932, + "grad_norm": 0.166015625, + "learning_rate": 0.0001401847620939704, + "loss": 0.4951, + "step": 166060 + }, + { + "epoch": 8.248236813350552, + "grad_norm": 0.1875, + "learning_rate": 0.00014014502831032085, + "loss": 0.5143, + "step": 166070 + }, + { + "epoch": 8.24873348564617, + "grad_norm": 0.1826171875, + "learning_rate": 0.00014010529452667132, + "loss": 0.4649, + "step": 166080 + }, + { + "epoch": 8.24923015794179, + "grad_norm": 0.19921875, + "learning_rate": 0.00014006556074302174, + "loss": 0.4765, + "step": 166090 + }, + { + "epoch": 8.249726830237408, + "grad_norm": 0.154296875, + "learning_rate": 0.0001400258269593722, + "loss": 0.4906, + "step": 166100 + }, + { + "epoch": 8.250223502533029, + "grad_norm": 0.15234375, + "learning_rate": 0.00013998609317572266, + "loss": 0.461, + "step": 166110 + }, + { + "epoch": 8.250720174828649, + "grad_norm": 0.1572265625, + "learning_rate": 0.00013994635939207313, + "loss": 0.4916, + "step": 166120 + }, + { + "epoch": 8.251216847124267, + "grad_norm": 0.173828125, + "learning_rate": 0.00013990662560842357, + "loss": 0.478, + "step": 166130 + }, + { + "epoch": 8.251713519419887, + "grad_norm": 0.2021484375, + "learning_rate": 0.00013986689182477402, + "loss": 0.5184, + "step": 166140 + }, + { + "epoch": 8.252210191715506, + "grad_norm": 0.15625, + "learning_rate": 0.00013982715804112446, + "loss": 0.4736, + "step": 166150 + }, + { + "epoch": 8.252706864011126, + "grad_norm": 0.1748046875, + "learning_rate": 0.00013978742425747493, + "loss": 0.4896, + "step": 166160 + }, + { + "epoch": 8.253203536306744, + "grad_norm": 0.166015625, + "learning_rate": 0.00013974769047382538, + "loss": 0.4874, + "step": 166170 + }, + { + "epoch": 8.253700208602364, + "grad_norm": 0.1611328125, + "learning_rate": 0.00013970795669017582, + "loss": 0.46, + "step": 166180 + }, + { + "epoch": 8.254196880897984, + "grad_norm": 0.197265625, + "learning_rate": 0.0001396682229065263, + "loss": 0.486, + "step": 166190 + }, + { + "epoch": 8.254693553193603, + "grad_norm": 0.1611328125, + "learning_rate": 0.00013962848912287673, + "loss": 0.4872, + "step": 166200 + }, + { + "epoch": 8.255190225489223, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001395887553392272, + "loss": 0.5024, + "step": 166210 + }, + { + "epoch": 8.255686897784841, + "grad_norm": 0.158203125, + "learning_rate": 0.00013954902155557762, + "loss": 0.4826, + "step": 166220 + }, + { + "epoch": 8.256183570080461, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001395092877719281, + "loss": 0.4846, + "step": 166230 + }, + { + "epoch": 8.25668024237608, + "grad_norm": 0.1513671875, + "learning_rate": 0.00013946955398827854, + "loss": 0.4903, + "step": 166240 + }, + { + "epoch": 8.2571769146717, + "grad_norm": 0.1767578125, + "learning_rate": 0.000139429820204629, + "loss": 0.5027, + "step": 166250 + }, + { + "epoch": 8.25767358696732, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013939008642097943, + "loss": 0.4871, + "step": 166260 + }, + { + "epoch": 8.258170259262938, + "grad_norm": 0.162109375, + "learning_rate": 0.0001393503526373299, + "loss": 0.481, + "step": 166270 + }, + { + "epoch": 8.258666931558558, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013931061885368034, + "loss": 0.4921, + "step": 166280 + }, + { + "epoch": 8.259163603854176, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013927088507003081, + "loss": 0.491, + "step": 166290 + }, + { + "epoch": 8.259660276149797, + "grad_norm": 0.1572265625, + "learning_rate": 0.00013923115128638123, + "loss": 0.4899, + "step": 166300 + }, + { + "epoch": 8.260156948445415, + "grad_norm": 0.177734375, + "learning_rate": 0.0001391914175027317, + "loss": 0.4881, + "step": 166310 + }, + { + "epoch": 8.260653620741035, + "grad_norm": 0.1513671875, + "learning_rate": 0.00013915168371908215, + "loss": 0.4838, + "step": 166320 + }, + { + "epoch": 8.261150293036655, + "grad_norm": 0.216796875, + "learning_rate": 0.00013911194993543262, + "loss": 0.4698, + "step": 166330 + }, + { + "epoch": 8.261646965332273, + "grad_norm": 0.1904296875, + "learning_rate": 0.00013907221615178306, + "loss": 0.4901, + "step": 166340 + }, + { + "epoch": 8.262143637627894, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001390324823681335, + "loss": 0.46, + "step": 166350 + }, + { + "epoch": 8.262640309923512, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013899274858448398, + "loss": 0.5303, + "step": 166360 + }, + { + "epoch": 8.263136982219132, + "grad_norm": 0.173828125, + "learning_rate": 0.00013895301480083442, + "loss": 0.5084, + "step": 166370 + }, + { + "epoch": 8.26363365451475, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013891328101718487, + "loss": 0.5177, + "step": 166380 + }, + { + "epoch": 8.26413032681037, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001388735472335353, + "loss": 0.4443, + "step": 166390 + }, + { + "epoch": 8.26462699910599, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013883381344988578, + "loss": 0.4598, + "step": 166400 + }, + { + "epoch": 8.265123671401609, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013879407966623623, + "loss": 0.482, + "step": 166410 + }, + { + "epoch": 8.265620343697229, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013875434588258667, + "loss": 0.5358, + "step": 166420 + }, + { + "epoch": 8.266117015992847, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013871461209893712, + "loss": 0.4901, + "step": 166430 + }, + { + "epoch": 8.266613688288468, + "grad_norm": 0.166015625, + "learning_rate": 0.0001386748783152876, + "loss": 0.5012, + "step": 166440 + }, + { + "epoch": 8.267110360584086, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013863514453163803, + "loss": 0.5218, + "step": 166450 + }, + { + "epoch": 8.267607032879706, + "grad_norm": 0.171875, + "learning_rate": 0.00013859541074798848, + "loss": 0.4715, + "step": 166460 + }, + { + "epoch": 8.268103705175326, + "grad_norm": 0.162109375, + "learning_rate": 0.00013855567696433892, + "loss": 0.4567, + "step": 166470 + }, + { + "epoch": 8.268600377470944, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001385159431806894, + "loss": 0.4753, + "step": 166480 + }, + { + "epoch": 8.269097049766565, + "grad_norm": 0.1640625, + "learning_rate": 0.00013847620939703984, + "loss": 0.4756, + "step": 166490 + }, + { + "epoch": 8.269593722062183, + "grad_norm": 0.1748046875, + "learning_rate": 0.00013843647561339028, + "loss": 0.4921, + "step": 166500 + }, + { + "epoch": 8.270090394357803, + "grad_norm": 0.1591796875, + "learning_rate": 0.00013839674182974075, + "loss": 0.4789, + "step": 166510 + }, + { + "epoch": 8.270587066653421, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001383570080460912, + "loss": 0.4963, + "step": 166520 + }, + { + "epoch": 8.271083738949041, + "grad_norm": 0.185546875, + "learning_rate": 0.00013831727426244167, + "loss": 0.485, + "step": 166530 + }, + { + "epoch": 8.271580411244662, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013827754047879208, + "loss": 0.4849, + "step": 166540 + }, + { + "epoch": 8.27207708354028, + "grad_norm": 0.15625, + "learning_rate": 0.00013823780669514255, + "loss": 0.4716, + "step": 166550 + }, + { + "epoch": 8.2725737558359, + "grad_norm": 0.18359375, + "learning_rate": 0.000138198072911493, + "loss": 0.5257, + "step": 166560 + }, + { + "epoch": 8.273070428131518, + "grad_norm": 0.1826171875, + "learning_rate": 0.00013815833912784347, + "loss": 0.5066, + "step": 166570 + }, + { + "epoch": 8.273567100427138, + "grad_norm": 0.166015625, + "learning_rate": 0.0001381186053441939, + "loss": 0.4887, + "step": 166580 + }, + { + "epoch": 8.274063772722757, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013807887156054436, + "loss": 0.4754, + "step": 166590 + }, + { + "epoch": 8.274560445018377, + "grad_norm": 0.16015625, + "learning_rate": 0.0001380391377768948, + "loss": 0.4981, + "step": 166600 + }, + { + "epoch": 8.275057117313997, + "grad_norm": 0.2158203125, + "learning_rate": 0.00013799940399324527, + "loss": 0.4992, + "step": 166610 + }, + { + "epoch": 8.275553789609615, + "grad_norm": 0.193359375, + "learning_rate": 0.00013795967020959572, + "loss": 0.4977, + "step": 166620 + }, + { + "epoch": 8.276050461905236, + "grad_norm": 0.158203125, + "learning_rate": 0.00013791993642594616, + "loss": 0.464, + "step": 166630 + }, + { + "epoch": 8.276547134200854, + "grad_norm": 0.15234375, + "learning_rate": 0.0001378802026422966, + "loss": 0.475, + "step": 166640 + }, + { + "epoch": 8.277043806496474, + "grad_norm": 0.197265625, + "learning_rate": 0.00013784046885864708, + "loss": 0.5013, + "step": 166650 + }, + { + "epoch": 8.277540478792092, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013780073507499752, + "loss": 0.4751, + "step": 166660 + }, + { + "epoch": 8.278037151087712, + "grad_norm": 0.1533203125, + "learning_rate": 0.00013776100129134797, + "loss": 0.4773, + "step": 166670 + }, + { + "epoch": 8.27853382338333, + "grad_norm": 0.1767578125, + "learning_rate": 0.00013772126750769844, + "loss": 0.4894, + "step": 166680 + }, + { + "epoch": 8.27903049567895, + "grad_norm": 0.193359375, + "learning_rate": 0.00013768153372404888, + "loss": 0.4941, + "step": 166690 + }, + { + "epoch": 8.279527167974571, + "grad_norm": 0.181640625, + "learning_rate": 0.00013764179994039935, + "loss": 0.5007, + "step": 166700 + }, + { + "epoch": 8.28002384027019, + "grad_norm": 0.166015625, + "learning_rate": 0.00013760206615674977, + "loss": 0.4881, + "step": 166710 + }, + { + "epoch": 8.28052051256581, + "grad_norm": 0.1591796875, + "learning_rate": 0.00013756233237310024, + "loss": 0.4957, + "step": 166720 + }, + { + "epoch": 8.281017184861428, + "grad_norm": 0.1640625, + "learning_rate": 0.0001375225985894507, + "loss": 0.4816, + "step": 166730 + }, + { + "epoch": 8.281513857157048, + "grad_norm": 0.173828125, + "learning_rate": 0.00013748286480580116, + "loss": 0.4573, + "step": 166740 + }, + { + "epoch": 8.282010529452666, + "grad_norm": 0.16015625, + "learning_rate": 0.00013744313102215158, + "loss": 0.4782, + "step": 166750 + }, + { + "epoch": 8.282507201748286, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013740339723850205, + "loss": 0.508, + "step": 166760 + }, + { + "epoch": 8.283003874043906, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001373636634548525, + "loss": 0.486, + "step": 166770 + }, + { + "epoch": 8.283500546339525, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013732392967120296, + "loss": 0.4679, + "step": 166780 + }, + { + "epoch": 8.283997218635145, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013728419588755338, + "loss": 0.4881, + "step": 166790 + }, + { + "epoch": 8.284493890930763, + "grad_norm": 0.171875, + "learning_rate": 0.00013724446210390385, + "loss": 0.4946, + "step": 166800 + }, + { + "epoch": 8.284990563226383, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001372047283202543, + "loss": 0.5021, + "step": 166810 + }, + { + "epoch": 8.285487235522002, + "grad_norm": 0.16015625, + "learning_rate": 0.00013716499453660477, + "loss": 0.4723, + "step": 166820 + }, + { + "epoch": 8.285983907817622, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001371252607529552, + "loss": 0.4937, + "step": 166830 + }, + { + "epoch": 8.286480580113242, + "grad_norm": 0.1552734375, + "learning_rate": 0.00013708552696930565, + "loss": 0.4536, + "step": 166840 + }, + { + "epoch": 8.28697725240886, + "grad_norm": 0.169921875, + "learning_rate": 0.00013704579318565613, + "loss": 0.5034, + "step": 166850 + }, + { + "epoch": 8.28747392470448, + "grad_norm": 0.185546875, + "learning_rate": 0.00013700605940200657, + "loss": 0.475, + "step": 166860 + }, + { + "epoch": 8.287970597000099, + "grad_norm": 0.1513671875, + "learning_rate": 0.00013696632561835701, + "loss": 0.4466, + "step": 166870 + }, + { + "epoch": 8.288467269295719, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013692659183470746, + "loss": 0.4629, + "step": 166880 + }, + { + "epoch": 8.288963941591337, + "grad_norm": 0.1796875, + "learning_rate": 0.00013688685805105793, + "loss": 0.4828, + "step": 166890 + }, + { + "epoch": 8.289460613886957, + "grad_norm": 0.1513671875, + "learning_rate": 0.00013684712426740837, + "loss": 0.4674, + "step": 166900 + }, + { + "epoch": 8.289957286182577, + "grad_norm": 0.18359375, + "learning_rate": 0.00013680739048375882, + "loss": 0.4985, + "step": 166910 + }, + { + "epoch": 8.290453958478196, + "grad_norm": 0.1748046875, + "learning_rate": 0.00013676765670010926, + "loss": 0.462, + "step": 166920 + }, + { + "epoch": 8.290950630773816, + "grad_norm": 0.17578125, + "learning_rate": 0.00013672792291645973, + "loss": 0.4734, + "step": 166930 + }, + { + "epoch": 8.291447303069434, + "grad_norm": 0.177734375, + "learning_rate": 0.00013668818913281018, + "loss": 0.4651, + "step": 166940 + }, + { + "epoch": 8.291943975365054, + "grad_norm": 0.154296875, + "learning_rate": 0.00013664845534916062, + "loss": 0.473, + "step": 166950 + }, + { + "epoch": 8.292440647660673, + "grad_norm": 0.181640625, + "learning_rate": 0.00013660872156551107, + "loss": 0.5003, + "step": 166960 + }, + { + "epoch": 8.292937319956293, + "grad_norm": 0.162109375, + "learning_rate": 0.00013656898778186154, + "loss": 0.4831, + "step": 166970 + }, + { + "epoch": 8.293433992251913, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013652925399821198, + "loss": 0.4685, + "step": 166980 + }, + { + "epoch": 8.293930664547531, + "grad_norm": 0.16796875, + "learning_rate": 0.00013648952021456243, + "loss": 0.4819, + "step": 166990 + }, + { + "epoch": 8.294427336843151, + "grad_norm": 0.169921875, + "learning_rate": 0.00013644978643091287, + "loss": 0.4569, + "step": 167000 + }, + { + "epoch": 8.29492400913877, + "grad_norm": 0.189453125, + "learning_rate": 0.00013641005264726334, + "loss": 0.4882, + "step": 167010 + }, + { + "epoch": 8.29542068143439, + "grad_norm": 0.173828125, + "learning_rate": 0.0001363703188636138, + "loss": 0.4702, + "step": 167020 + }, + { + "epoch": 8.295917353730008, + "grad_norm": 0.189453125, + "learning_rate": 0.00013633058507996426, + "loss": 0.4986, + "step": 167030 + }, + { + "epoch": 8.296414026025628, + "grad_norm": 0.181640625, + "learning_rate": 0.0001362908512963147, + "loss": 0.4749, + "step": 167040 + }, + { + "epoch": 8.296910698321248, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013625111751266515, + "loss": 0.4875, + "step": 167050 + }, + { + "epoch": 8.297407370616867, + "grad_norm": 0.1484375, + "learning_rate": 0.00013621138372901562, + "loss": 0.4534, + "step": 167060 + }, + { + "epoch": 8.297904042912487, + "grad_norm": 0.1904296875, + "learning_rate": 0.00013617164994536606, + "loss": 0.5031, + "step": 167070 + }, + { + "epoch": 8.298400715208105, + "grad_norm": 0.166015625, + "learning_rate": 0.0001361319161617165, + "loss": 0.4927, + "step": 167080 + }, + { + "epoch": 8.298897387503725, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013609218237806695, + "loss": 0.4837, + "step": 167090 + }, + { + "epoch": 8.299394059799344, + "grad_norm": 0.150390625, + "learning_rate": 0.00013605244859441742, + "loss": 0.4649, + "step": 167100 + }, + { + "epoch": 8.299890732094964, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013601271481076787, + "loss": 0.4737, + "step": 167110 + }, + { + "epoch": 8.300387404390584, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001359729810271183, + "loss": 0.4946, + "step": 167120 + }, + { + "epoch": 8.300884076686202, + "grad_norm": 0.1943359375, + "learning_rate": 0.00013593324724346876, + "loss": 0.4786, + "step": 167130 + }, + { + "epoch": 8.301380748981822, + "grad_norm": 0.1591796875, + "learning_rate": 0.00013589351345981923, + "loss": 0.4839, + "step": 167140 + }, + { + "epoch": 8.30187742127744, + "grad_norm": 0.1796875, + "learning_rate": 0.00013585377967616967, + "loss": 0.4802, + "step": 167150 + }, + { + "epoch": 8.30237409357306, + "grad_norm": 0.1533203125, + "learning_rate": 0.00013581404589252011, + "loss": 0.5023, + "step": 167160 + }, + { + "epoch": 8.30287076586868, + "grad_norm": 0.201171875, + "learning_rate": 0.00013577431210887056, + "loss": 0.497, + "step": 167170 + }, + { + "epoch": 8.3033674381643, + "grad_norm": 0.1904296875, + "learning_rate": 0.00013573457832522103, + "loss": 0.4474, + "step": 167180 + }, + { + "epoch": 8.30386411045992, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013569484454157147, + "loss": 0.5103, + "step": 167190 + }, + { + "epoch": 8.304360782755538, + "grad_norm": 0.197265625, + "learning_rate": 0.00013565511075792192, + "loss": 0.503, + "step": 167200 + }, + { + "epoch": 8.304857455051158, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001356153769742724, + "loss": 0.5081, + "step": 167210 + }, + { + "epoch": 8.305354127346776, + "grad_norm": 0.162109375, + "learning_rate": 0.00013557564319062283, + "loss": 0.494, + "step": 167220 + }, + { + "epoch": 8.305850799642396, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001355359094069733, + "loss": 0.4958, + "step": 167230 + }, + { + "epoch": 8.306347471938015, + "grad_norm": 0.1552734375, + "learning_rate": 0.00013549617562332372, + "loss": 0.4953, + "step": 167240 + }, + { + "epoch": 8.306844144233635, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001354564418396742, + "loss": 0.5154, + "step": 167250 + }, + { + "epoch": 8.307340816529255, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013541670805602464, + "loss": 0.4744, + "step": 167260 + }, + { + "epoch": 8.307837488824873, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001353769742723751, + "loss": 0.4729, + "step": 167270 + }, + { + "epoch": 8.308334161120493, + "grad_norm": 0.169921875, + "learning_rate": 0.00013533724048872553, + "loss": 0.5144, + "step": 167280 + }, + { + "epoch": 8.308830833416112, + "grad_norm": 0.17578125, + "learning_rate": 0.000135297506705076, + "loss": 0.4924, + "step": 167290 + }, + { + "epoch": 8.309327505711732, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013525777292142644, + "loss": 0.4912, + "step": 167300 + }, + { + "epoch": 8.30982417800735, + "grad_norm": 0.16796875, + "learning_rate": 0.00013521803913777691, + "loss": 0.4705, + "step": 167310 + }, + { + "epoch": 8.31032085030297, + "grad_norm": 0.166015625, + "learning_rate": 0.00013517830535412733, + "loss": 0.5122, + "step": 167320 + }, + { + "epoch": 8.31081752259859, + "grad_norm": 0.154296875, + "learning_rate": 0.0001351385715704778, + "loss": 0.4926, + "step": 167330 + }, + { + "epoch": 8.311314194894209, + "grad_norm": 0.171875, + "learning_rate": 0.00013509883778682825, + "loss": 0.4612, + "step": 167340 + }, + { + "epoch": 8.311810867189829, + "grad_norm": 0.18359375, + "learning_rate": 0.00013505910400317872, + "loss": 0.4623, + "step": 167350 + }, + { + "epoch": 8.312307539485447, + "grad_norm": 0.171875, + "learning_rate": 0.00013501937021952916, + "loss": 0.4782, + "step": 167360 + }, + { + "epoch": 8.312804211781067, + "grad_norm": 0.158203125, + "learning_rate": 0.0001349796364358796, + "loss": 0.4786, + "step": 167370 + }, + { + "epoch": 8.313300884076686, + "grad_norm": 0.1572265625, + "learning_rate": 0.00013493990265223008, + "loss": 0.4938, + "step": 167380 + }, + { + "epoch": 8.313797556372306, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013490016886858052, + "loss": 0.4889, + "step": 167390 + }, + { + "epoch": 8.314294228667926, + "grad_norm": 0.2001953125, + "learning_rate": 0.00013486043508493097, + "loss": 0.528, + "step": 167400 + }, + { + "epoch": 8.314790900963544, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001348207013012814, + "loss": 0.4834, + "step": 167410 + }, + { + "epoch": 8.315287573259164, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013478096751763188, + "loss": 0.4942, + "step": 167420 + }, + { + "epoch": 8.315784245554783, + "grad_norm": 0.1796875, + "learning_rate": 0.00013474123373398233, + "loss": 0.4927, + "step": 167430 + }, + { + "epoch": 8.316280917850403, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001347014999503328, + "loss": 0.482, + "step": 167440 + }, + { + "epoch": 8.316777590146021, + "grad_norm": 0.154296875, + "learning_rate": 0.00013466176616668321, + "loss": 0.4717, + "step": 167450 + }, + { + "epoch": 8.317274262441641, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013462203238303369, + "loss": 0.4825, + "step": 167460 + }, + { + "epoch": 8.317770934737261, + "grad_norm": 0.16796875, + "learning_rate": 0.00013458229859938413, + "loss": 0.4978, + "step": 167470 + }, + { + "epoch": 8.31826760703288, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001345425648157346, + "loss": 0.5176, + "step": 167480 + }, + { + "epoch": 8.3187642793285, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013450283103208502, + "loss": 0.4729, + "step": 167490 + }, + { + "epoch": 8.319260951624118, + "grad_norm": 0.1640625, + "learning_rate": 0.0001344630972484355, + "loss": 0.5029, + "step": 167500 + }, + { + "epoch": 8.319757623919738, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013442336346478593, + "loss": 0.4453, + "step": 167510 + }, + { + "epoch": 8.320254296215357, + "grad_norm": 0.19140625, + "learning_rate": 0.0001343836296811364, + "loss": 0.5059, + "step": 167520 + }, + { + "epoch": 8.320750968510977, + "grad_norm": 0.171875, + "learning_rate": 0.00013434389589748685, + "loss": 0.4966, + "step": 167530 + }, + { + "epoch": 8.321247640806595, + "grad_norm": 0.185546875, + "learning_rate": 0.0001343041621138373, + "loss": 0.4385, + "step": 167540 + }, + { + "epoch": 8.321744313102215, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013426442833018777, + "loss": 0.4618, + "step": 167550 + }, + { + "epoch": 8.322240985397835, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001342246945465382, + "loss": 0.4689, + "step": 167560 + }, + { + "epoch": 8.322737657693454, + "grad_norm": 0.1611328125, + "learning_rate": 0.00013418496076288865, + "loss": 0.4645, + "step": 167570 + }, + { + "epoch": 8.323234329989074, + "grad_norm": 0.1640625, + "learning_rate": 0.0001341452269792391, + "loss": 0.4748, + "step": 167580 + }, + { + "epoch": 8.323731002284692, + "grad_norm": 0.1748046875, + "learning_rate": 0.00013410549319558957, + "loss": 0.4818, + "step": 167590 + }, + { + "epoch": 8.324227674580312, + "grad_norm": 0.1806640625, + "learning_rate": 0.00013406575941194001, + "loss": 0.4575, + "step": 167600 + }, + { + "epoch": 8.32472434687593, + "grad_norm": 0.1845703125, + "learning_rate": 0.00013402602562829046, + "loss": 0.4976, + "step": 167610 + }, + { + "epoch": 8.32522101917155, + "grad_norm": 0.1875, + "learning_rate": 0.0001339862918446409, + "loss": 0.4791, + "step": 167620 + }, + { + "epoch": 8.32571769146717, + "grad_norm": 0.2001953125, + "learning_rate": 0.00013394655806099137, + "loss": 0.4636, + "step": 167630 + }, + { + "epoch": 8.326214363762789, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013390682427734182, + "loss": 0.4777, + "step": 167640 + }, + { + "epoch": 8.32671103605841, + "grad_norm": 0.158203125, + "learning_rate": 0.00013386709049369226, + "loss": 0.4811, + "step": 167650 + }, + { + "epoch": 8.327207708354027, + "grad_norm": 0.1796875, + "learning_rate": 0.0001338273567100427, + "loss": 0.4824, + "step": 167660 + }, + { + "epoch": 8.327704380649648, + "grad_norm": 0.1591796875, + "learning_rate": 0.00013378762292639318, + "loss": 0.464, + "step": 167670 + }, + { + "epoch": 8.328201052945266, + "grad_norm": 0.189453125, + "learning_rate": 0.00013374788914274362, + "loss": 0.4885, + "step": 167680 + }, + { + "epoch": 8.328697725240886, + "grad_norm": 0.169921875, + "learning_rate": 0.00013370815535909407, + "loss": 0.4901, + "step": 167690 + }, + { + "epoch": 8.329194397536506, + "grad_norm": 0.19921875, + "learning_rate": 0.00013366842157544454, + "loss": 0.4973, + "step": 167700 + }, + { + "epoch": 8.329691069832124, + "grad_norm": 0.1767578125, + "learning_rate": 0.00013362868779179498, + "loss": 0.4965, + "step": 167710 + }, + { + "epoch": 8.330187742127745, + "grad_norm": 0.1953125, + "learning_rate": 0.00013358895400814543, + "loss": 0.4656, + "step": 167720 + }, + { + "epoch": 8.330684414423363, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013354922022449587, + "loss": 0.4857, + "step": 167730 + }, + { + "epoch": 8.331181086718983, + "grad_norm": 0.1748046875, + "learning_rate": 0.00013350948644084634, + "loss": 0.494, + "step": 167740 + }, + { + "epoch": 8.331677759014601, + "grad_norm": 0.185546875, + "learning_rate": 0.0001334697526571968, + "loss": 0.4538, + "step": 167750 + }, + { + "epoch": 8.332174431310222, + "grad_norm": 0.1572265625, + "learning_rate": 0.00013343001887354726, + "loss": 0.48, + "step": 167760 + }, + { + "epoch": 8.332671103605842, + "grad_norm": 0.2216796875, + "learning_rate": 0.00013339028508989767, + "loss": 0.4763, + "step": 167770 + }, + { + "epoch": 8.33316777590146, + "grad_norm": 0.201171875, + "learning_rate": 0.00013335055130624815, + "loss": 0.4868, + "step": 167780 + }, + { + "epoch": 8.33366444819708, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001333108175225986, + "loss": 0.4968, + "step": 167790 + }, + { + "epoch": 8.334161120492698, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013327108373894906, + "loss": 0.4734, + "step": 167800 + }, + { + "epoch": 8.334657792788319, + "grad_norm": 0.154296875, + "learning_rate": 0.00013323134995529948, + "loss": 0.5021, + "step": 167810 + }, + { + "epoch": 8.335154465083937, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013319161617164995, + "loss": 0.4863, + "step": 167820 + }, + { + "epoch": 8.335651137379557, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001331518823880004, + "loss": 0.5015, + "step": 167830 + }, + { + "epoch": 8.336147809675177, + "grad_norm": 0.1552734375, + "learning_rate": 0.00013311214860435087, + "loss": 0.5054, + "step": 167840 + }, + { + "epoch": 8.336644481970795, + "grad_norm": 0.169921875, + "learning_rate": 0.00013307241482070128, + "loss": 0.4841, + "step": 167850 + }, + { + "epoch": 8.337141154266416, + "grad_norm": 0.181640625, + "learning_rate": 0.00013303268103705175, + "loss": 0.4526, + "step": 167860 + }, + { + "epoch": 8.337637826562034, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001329929472534022, + "loss": 0.508, + "step": 167870 + }, + { + "epoch": 8.338134498857654, + "grad_norm": 0.169921875, + "learning_rate": 0.00013295321346975267, + "loss": 0.5082, + "step": 167880 + }, + { + "epoch": 8.338631171153272, + "grad_norm": 0.173828125, + "learning_rate": 0.00013291347968610311, + "loss": 0.5039, + "step": 167890 + }, + { + "epoch": 8.339127843448892, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013287374590245356, + "loss": 0.4792, + "step": 167900 + }, + { + "epoch": 8.339624515744513, + "grad_norm": 0.197265625, + "learning_rate": 0.00013283401211880403, + "loss": 0.4819, + "step": 167910 + }, + { + "epoch": 8.340121188040131, + "grad_norm": 0.17578125, + "learning_rate": 0.00013279427833515447, + "loss": 0.5087, + "step": 167920 + }, + { + "epoch": 8.340617860335751, + "grad_norm": 0.20703125, + "learning_rate": 0.00013275454455150495, + "loss": 0.4872, + "step": 167930 + }, + { + "epoch": 8.34111453263137, + "grad_norm": 0.1826171875, + "learning_rate": 0.00013271481076785536, + "loss": 0.4954, + "step": 167940 + }, + { + "epoch": 8.34161120492699, + "grad_norm": 0.1640625, + "learning_rate": 0.00013267507698420583, + "loss": 0.4834, + "step": 167950 + }, + { + "epoch": 8.342107877222608, + "grad_norm": 0.1865234375, + "learning_rate": 0.00013263534320055628, + "loss": 0.4951, + "step": 167960 + }, + { + "epoch": 8.342604549518228, + "grad_norm": 0.1806640625, + "learning_rate": 0.00013259560941690675, + "loss": 0.464, + "step": 167970 + }, + { + "epoch": 8.343101221813848, + "grad_norm": 0.1611328125, + "learning_rate": 0.00013255587563325717, + "loss": 0.4987, + "step": 167980 + }, + { + "epoch": 8.343597894109466, + "grad_norm": 0.1611328125, + "learning_rate": 0.00013251614184960764, + "loss": 0.5149, + "step": 167990 + }, + { + "epoch": 8.344094566405087, + "grad_norm": 0.162109375, + "learning_rate": 0.00013247640806595808, + "loss": 0.4995, + "step": 168000 + }, + { + "epoch": 8.344591238700705, + "grad_norm": 0.19140625, + "learning_rate": 0.00013243667428230855, + "loss": 0.4931, + "step": 168010 + }, + { + "epoch": 8.345087910996325, + "grad_norm": 0.1884765625, + "learning_rate": 0.00013239694049865897, + "loss": 0.5091, + "step": 168020 + }, + { + "epoch": 8.345584583291943, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013235720671500944, + "loss": 0.4792, + "step": 168030 + }, + { + "epoch": 8.346081255587563, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001323174729313599, + "loss": 0.4816, + "step": 168040 + }, + { + "epoch": 8.346577927883184, + "grad_norm": 0.1875, + "learning_rate": 0.00013227773914771036, + "loss": 0.483, + "step": 168050 + }, + { + "epoch": 8.347074600178802, + "grad_norm": 0.173828125, + "learning_rate": 0.0001322380053640608, + "loss": 0.4836, + "step": 168060 + }, + { + "epoch": 8.347571272474422, + "grad_norm": 0.169921875, + "learning_rate": 0.00013219827158041125, + "loss": 0.4806, + "step": 168070 + }, + { + "epoch": 8.34806794477004, + "grad_norm": 0.197265625, + "learning_rate": 0.00013215853779676172, + "loss": 0.4986, + "step": 168080 + }, + { + "epoch": 8.34856461706566, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013211880401311216, + "loss": 0.49, + "step": 168090 + }, + { + "epoch": 8.349061289361279, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001320790702294626, + "loss": 0.4752, + "step": 168100 + }, + { + "epoch": 8.349557961656899, + "grad_norm": 0.1865234375, + "learning_rate": 0.00013203933644581305, + "loss": 0.4681, + "step": 168110 + }, + { + "epoch": 8.350054633952517, + "grad_norm": 0.1923828125, + "learning_rate": 0.00013199960266216352, + "loss": 0.4882, + "step": 168120 + }, + { + "epoch": 8.350551306248137, + "grad_norm": 0.1845703125, + "learning_rate": 0.00013195986887851397, + "loss": 0.4639, + "step": 168130 + }, + { + "epoch": 8.351047978543757, + "grad_norm": 0.171875, + "learning_rate": 0.0001319201350948644, + "loss": 0.5005, + "step": 168140 + }, + { + "epoch": 8.351544650839376, + "grad_norm": 0.166015625, + "learning_rate": 0.00013188040131121485, + "loss": 0.4698, + "step": 168150 + }, + { + "epoch": 8.352041323134996, + "grad_norm": 0.1982421875, + "learning_rate": 0.00013184066752756533, + "loss": 0.4761, + "step": 168160 + }, + { + "epoch": 8.352537995430614, + "grad_norm": 0.1748046875, + "learning_rate": 0.00013180093374391577, + "loss": 0.4914, + "step": 168170 + }, + { + "epoch": 8.353034667726234, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013176119996026621, + "loss": 0.4655, + "step": 168180 + }, + { + "epoch": 8.353531340021853, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013172146617661666, + "loss": 0.4935, + "step": 168190 + }, + { + "epoch": 8.354028012317473, + "grad_norm": 0.1533203125, + "learning_rate": 0.00013168173239296713, + "loss": 0.4744, + "step": 168200 + }, + { + "epoch": 8.354524684613093, + "grad_norm": 0.1865234375, + "learning_rate": 0.00013164199860931757, + "loss": 0.4751, + "step": 168210 + }, + { + "epoch": 8.355021356908711, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013160226482566802, + "loss": 0.4717, + "step": 168220 + }, + { + "epoch": 8.355518029204331, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001315625310420185, + "loss": 0.4954, + "step": 168230 + }, + { + "epoch": 8.35601470149995, + "grad_norm": 0.1845703125, + "learning_rate": 0.00013152279725836893, + "loss": 0.5185, + "step": 168240 + }, + { + "epoch": 8.35651137379557, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001314830634747194, + "loss": 0.4877, + "step": 168250 + }, + { + "epoch": 8.357008046091188, + "grad_norm": 0.1611328125, + "learning_rate": 0.00013144332969106982, + "loss": 0.4824, + "step": 168260 + }, + { + "epoch": 8.357504718386808, + "grad_norm": 0.17578125, + "learning_rate": 0.0001314035959074203, + "loss": 0.5182, + "step": 168270 + }, + { + "epoch": 8.358001390682428, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013136386212377074, + "loss": 0.4792, + "step": 168280 + }, + { + "epoch": 8.358498062978047, + "grad_norm": 0.162109375, + "learning_rate": 0.0001313241283401212, + "loss": 0.479, + "step": 168290 + }, + { + "epoch": 8.358994735273667, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013128439455647165, + "loss": 0.513, + "step": 168300 + }, + { + "epoch": 8.359491407569285, + "grad_norm": 0.189453125, + "learning_rate": 0.0001312446607728221, + "loss": 0.481, + "step": 168310 + }, + { + "epoch": 8.359988079864905, + "grad_norm": 0.1806640625, + "learning_rate": 0.00013120492698917254, + "loss": 0.462, + "step": 168320 + }, + { + "epoch": 8.360484752160524, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013116519320552301, + "loss": 0.5018, + "step": 168330 + }, + { + "epoch": 8.360981424456144, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013112545942187346, + "loss": 0.4758, + "step": 168340 + }, + { + "epoch": 8.361478096751764, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001310857256382239, + "loss": 0.4626, + "step": 168350 + }, + { + "epoch": 8.361974769047382, + "grad_norm": 0.16015625, + "learning_rate": 0.00013104599185457435, + "loss": 0.4703, + "step": 168360 + }, + { + "epoch": 8.362471441343002, + "grad_norm": 0.1845703125, + "learning_rate": 0.00013100625807092482, + "loss": 0.478, + "step": 168370 + }, + { + "epoch": 8.36296811363862, + "grad_norm": 0.154296875, + "learning_rate": 0.00013096652428727526, + "loss": 0.4709, + "step": 168380 + }, + { + "epoch": 8.36346478593424, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001309267905036257, + "loss": 0.4842, + "step": 168390 + }, + { + "epoch": 8.36396145822986, + "grad_norm": 0.2021484375, + "learning_rate": 0.00013088705671997618, + "loss": 0.5029, + "step": 168400 + }, + { + "epoch": 8.36445813052548, + "grad_norm": 0.2177734375, + "learning_rate": 0.00013084732293632662, + "loss": 0.4755, + "step": 168410 + }, + { + "epoch": 8.3649548028211, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013080758915267707, + "loss": 0.4617, + "step": 168420 + }, + { + "epoch": 8.365451475116718, + "grad_norm": 0.19140625, + "learning_rate": 0.0001307678553690275, + "loss": 0.4698, + "step": 168430 + }, + { + "epoch": 8.365948147412338, + "grad_norm": 0.185546875, + "learning_rate": 0.00013072812158537798, + "loss": 0.4857, + "step": 168440 + }, + { + "epoch": 8.366444819707956, + "grad_norm": 0.1611328125, + "learning_rate": 0.00013068838780172843, + "loss": 0.4522, + "step": 168450 + }, + { + "epoch": 8.366941492003576, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001306486540180789, + "loss": 0.5127, + "step": 168460 + }, + { + "epoch": 8.367438164299195, + "grad_norm": 0.171875, + "learning_rate": 0.00013060892023442931, + "loss": 0.4891, + "step": 168470 + }, + { + "epoch": 8.367934836594815, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013056918645077979, + "loss": 0.4853, + "step": 168480 + }, + { + "epoch": 8.368431508890435, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013052945266713023, + "loss": 0.4682, + "step": 168490 + }, + { + "epoch": 8.368928181186053, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001304897188834807, + "loss": 0.4787, + "step": 168500 + }, + { + "epoch": 8.369424853481673, + "grad_norm": 0.162109375, + "learning_rate": 0.00013044998509983112, + "loss": 0.4798, + "step": 168510 + }, + { + "epoch": 8.369921525777292, + "grad_norm": 0.193359375, + "learning_rate": 0.0001304102513161816, + "loss": 0.4655, + "step": 168520 + }, + { + "epoch": 8.370418198072912, + "grad_norm": 0.185546875, + "learning_rate": 0.00013037051753253203, + "loss": 0.4892, + "step": 168530 + }, + { + "epoch": 8.37091487036853, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001303307837488825, + "loss": 0.5185, + "step": 168540 + }, + { + "epoch": 8.37141154266415, + "grad_norm": 0.16796875, + "learning_rate": 0.00013029104996523295, + "loss": 0.4781, + "step": 168550 + }, + { + "epoch": 8.37190821495977, + "grad_norm": 0.16796875, + "learning_rate": 0.0001302513161815834, + "loss": 0.4573, + "step": 168560 + }, + { + "epoch": 8.372404887255389, + "grad_norm": 0.17578125, + "learning_rate": 0.00013021158239793384, + "loss": 0.4944, + "step": 168570 + }, + { + "epoch": 8.372901559551009, + "grad_norm": 0.2109375, + "learning_rate": 0.0001301718486142843, + "loss": 0.4696, + "step": 168580 + }, + { + "epoch": 8.373398231846627, + "grad_norm": 0.1845703125, + "learning_rate": 0.00013013211483063475, + "loss": 0.4993, + "step": 168590 + }, + { + "epoch": 8.373894904142247, + "grad_norm": 0.166015625, + "learning_rate": 0.0001300923810469852, + "loss": 0.4979, + "step": 168600 + }, + { + "epoch": 8.374391576437866, + "grad_norm": 0.171875, + "learning_rate": 0.00013005264726333567, + "loss": 0.4934, + "step": 168610 + }, + { + "epoch": 8.374888248733486, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013001291347968611, + "loss": 0.4886, + "step": 168620 + }, + { + "epoch": 8.375384921029106, + "grad_norm": 0.177734375, + "learning_rate": 0.00012997317969603656, + "loss": 0.4948, + "step": 168630 + }, + { + "epoch": 8.375881593324724, + "grad_norm": 0.1787109375, + "learning_rate": 0.000129933445912387, + "loss": 0.4783, + "step": 168640 + }, + { + "epoch": 8.376378265620344, + "grad_norm": 0.1640625, + "learning_rate": 0.00012989371212873747, + "loss": 0.4894, + "step": 168650 + }, + { + "epoch": 8.376874937915963, + "grad_norm": 0.181640625, + "learning_rate": 0.00012985397834508792, + "loss": 0.4993, + "step": 168660 + }, + { + "epoch": 8.377371610211583, + "grad_norm": 0.18359375, + "learning_rate": 0.00012981424456143836, + "loss": 0.4976, + "step": 168670 + }, + { + "epoch": 8.377868282507201, + "grad_norm": 0.16015625, + "learning_rate": 0.0001297745107777888, + "loss": 0.4744, + "step": 168680 + }, + { + "epoch": 8.378364954802821, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012973477699413928, + "loss": 0.4681, + "step": 168690 + }, + { + "epoch": 8.378861627098441, + "grad_norm": 0.16796875, + "learning_rate": 0.00012969504321048972, + "loss": 0.5058, + "step": 168700 + }, + { + "epoch": 8.37935829939406, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001296553094268402, + "loss": 0.5096, + "step": 168710 + }, + { + "epoch": 8.37985497168968, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001296155756431906, + "loss": 0.4912, + "step": 168720 + }, + { + "epoch": 8.380351643985298, + "grad_norm": 0.1748046875, + "learning_rate": 0.00012957584185954108, + "loss": 0.5029, + "step": 168730 + }, + { + "epoch": 8.380848316280918, + "grad_norm": 0.1875, + "learning_rate": 0.00012953610807589153, + "loss": 0.5057, + "step": 168740 + }, + { + "epoch": 8.381344988576537, + "grad_norm": 0.1845703125, + "learning_rate": 0.000129496374292242, + "loss": 0.4688, + "step": 168750 + }, + { + "epoch": 8.381841660872157, + "grad_norm": 0.19140625, + "learning_rate": 0.00012945664050859244, + "loss": 0.4389, + "step": 168760 + }, + { + "epoch": 8.382338333167777, + "grad_norm": 0.1611328125, + "learning_rate": 0.00012941690672494289, + "loss": 0.4644, + "step": 168770 + }, + { + "epoch": 8.382835005463395, + "grad_norm": 0.173828125, + "learning_rate": 0.00012937717294129336, + "loss": 0.5107, + "step": 168780 + }, + { + "epoch": 8.383331677759015, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001293374391576438, + "loss": 0.4563, + "step": 168790 + }, + { + "epoch": 8.383828350054634, + "grad_norm": 0.173828125, + "learning_rate": 0.00012929770537399425, + "loss": 0.5044, + "step": 168800 + }, + { + "epoch": 8.384325022350254, + "grad_norm": 0.1923828125, + "learning_rate": 0.0001292579715903447, + "loss": 0.4981, + "step": 168810 + }, + { + "epoch": 8.384821694645872, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012921823780669516, + "loss": 0.4397, + "step": 168820 + }, + { + "epoch": 8.385318366941492, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001291785040230456, + "loss": 0.4958, + "step": 168830 + }, + { + "epoch": 8.385815039237112, + "grad_norm": 0.171875, + "learning_rate": 0.00012913877023939605, + "loss": 0.4877, + "step": 168840 + }, + { + "epoch": 8.38631171153273, + "grad_norm": 0.1640625, + "learning_rate": 0.0001290990364557465, + "loss": 0.5225, + "step": 168850 + }, + { + "epoch": 8.38680838382835, + "grad_norm": 0.162109375, + "learning_rate": 0.00012905930267209697, + "loss": 0.4748, + "step": 168860 + }, + { + "epoch": 8.387305056123969, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001290195688884474, + "loss": 0.4867, + "step": 168870 + }, + { + "epoch": 8.38780172841959, + "grad_norm": 0.166015625, + "learning_rate": 0.00012897983510479785, + "loss": 0.4634, + "step": 168880 + }, + { + "epoch": 8.388298400715207, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001289401013211483, + "loss": 0.4952, + "step": 168890 + }, + { + "epoch": 8.388795073010828, + "grad_norm": 0.1708984375, + "learning_rate": 0.00012890036753749877, + "loss": 0.5218, + "step": 168900 + }, + { + "epoch": 8.389291745306446, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012886063375384921, + "loss": 0.4845, + "step": 168910 + }, + { + "epoch": 8.389788417602066, + "grad_norm": 0.1796875, + "learning_rate": 0.00012882089997019966, + "loss": 0.488, + "step": 168920 + }, + { + "epoch": 8.390285089897686, + "grad_norm": 0.169921875, + "learning_rate": 0.00012878116618655013, + "loss": 0.539, + "step": 168930 + }, + { + "epoch": 8.390781762193305, + "grad_norm": 0.1806640625, + "learning_rate": 0.00012874143240290057, + "loss": 0.4901, + "step": 168940 + }, + { + "epoch": 8.391278434488925, + "grad_norm": 0.166015625, + "learning_rate": 0.00012870169861925105, + "loss": 0.4447, + "step": 168950 + }, + { + "epoch": 8.391775106784543, + "grad_norm": 0.1611328125, + "learning_rate": 0.00012866196483560146, + "loss": 0.5132, + "step": 168960 + }, + { + "epoch": 8.392271779080163, + "grad_norm": 0.1787109375, + "learning_rate": 0.00012862223105195193, + "loss": 0.4866, + "step": 168970 + }, + { + "epoch": 8.392768451375781, + "grad_norm": 0.1875, + "learning_rate": 0.00012858249726830238, + "loss": 0.4917, + "step": 168980 + }, + { + "epoch": 8.393265123671402, + "grad_norm": 0.158203125, + "learning_rate": 0.00012854276348465285, + "loss": 0.4911, + "step": 168990 + }, + { + "epoch": 8.393761795967022, + "grad_norm": 0.16015625, + "learning_rate": 0.00012850302970100327, + "loss": 0.4666, + "step": 169000 + }, + { + "epoch": 8.39425846826264, + "grad_norm": 0.1796875, + "learning_rate": 0.00012846329591735374, + "loss": 0.4556, + "step": 169010 + }, + { + "epoch": 8.39475514055826, + "grad_norm": 0.18359375, + "learning_rate": 0.00012842356213370418, + "loss": 0.5018, + "step": 169020 + }, + { + "epoch": 8.395251812853878, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012838382835005465, + "loss": 0.4684, + "step": 169030 + }, + { + "epoch": 8.395748485149499, + "grad_norm": 0.181640625, + "learning_rate": 0.00012834409456640507, + "loss": 0.4969, + "step": 169040 + }, + { + "epoch": 8.396245157445117, + "grad_norm": 0.1611328125, + "learning_rate": 0.00012830436078275554, + "loss": 0.4415, + "step": 169050 + }, + { + "epoch": 8.396741829740737, + "grad_norm": 0.1845703125, + "learning_rate": 0.00012826462699910599, + "loss": 0.486, + "step": 169060 + }, + { + "epoch": 8.397238502036357, + "grad_norm": 0.1494140625, + "learning_rate": 0.00012822489321545646, + "loss": 0.472, + "step": 169070 + }, + { + "epoch": 8.397735174331975, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001281851594318069, + "loss": 0.5017, + "step": 169080 + }, + { + "epoch": 8.398231846627596, + "grad_norm": 0.1552734375, + "learning_rate": 0.00012814542564815735, + "loss": 0.4763, + "step": 169090 + }, + { + "epoch": 8.398728518923214, + "grad_norm": 0.1748046875, + "learning_rate": 0.00012810569186450782, + "loss": 0.498, + "step": 169100 + }, + { + "epoch": 8.399225191218834, + "grad_norm": 0.189453125, + "learning_rate": 0.00012806595808085826, + "loss": 0.5043, + "step": 169110 + }, + { + "epoch": 8.399721863514452, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001280262242972087, + "loss": 0.4787, + "step": 169120 + }, + { + "epoch": 8.400218535810072, + "grad_norm": 0.19140625, + "learning_rate": 0.00012798649051355915, + "loss": 0.4647, + "step": 169130 + }, + { + "epoch": 8.400715208105693, + "grad_norm": 0.173828125, + "learning_rate": 0.00012794675672990962, + "loss": 0.4939, + "step": 169140 + }, + { + "epoch": 8.401211880401311, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012790702294626007, + "loss": 0.473, + "step": 169150 + }, + { + "epoch": 8.401708552696931, + "grad_norm": 0.1865234375, + "learning_rate": 0.00012786728916261054, + "loss": 0.4581, + "step": 169160 + }, + { + "epoch": 8.40220522499255, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012782755537896095, + "loss": 0.4621, + "step": 169170 + }, + { + "epoch": 8.40270189728817, + "grad_norm": 0.1884765625, + "learning_rate": 0.00012778782159531143, + "loss": 0.4803, + "step": 169180 + }, + { + "epoch": 8.403198569583788, + "grad_norm": 0.166015625, + "learning_rate": 0.00012774808781166187, + "loss": 0.4912, + "step": 169190 + }, + { + "epoch": 8.403695241879408, + "grad_norm": 0.166015625, + "learning_rate": 0.00012770835402801234, + "loss": 0.4673, + "step": 169200 + }, + { + "epoch": 8.404191914175028, + "grad_norm": 0.158203125, + "learning_rate": 0.00012766862024436276, + "loss": 0.5012, + "step": 169210 + }, + { + "epoch": 8.404688586470646, + "grad_norm": 0.1640625, + "learning_rate": 0.00012762888646071323, + "loss": 0.4878, + "step": 169220 + }, + { + "epoch": 8.405185258766267, + "grad_norm": 0.205078125, + "learning_rate": 0.00012758915267706367, + "loss": 0.5079, + "step": 169230 + }, + { + "epoch": 8.405681931061885, + "grad_norm": 0.17578125, + "learning_rate": 0.00012754941889341415, + "loss": 0.4728, + "step": 169240 + }, + { + "epoch": 8.406178603357505, + "grad_norm": 0.185546875, + "learning_rate": 0.0001275096851097646, + "loss": 0.4736, + "step": 169250 + }, + { + "epoch": 8.406675275653123, + "grad_norm": 0.181640625, + "learning_rate": 0.00012746995132611503, + "loss": 0.4654, + "step": 169260 + }, + { + "epoch": 8.407171947948743, + "grad_norm": 0.2041015625, + "learning_rate": 0.00012743021754246548, + "loss": 0.4865, + "step": 169270 + }, + { + "epoch": 8.407668620244364, + "grad_norm": 0.16796875, + "learning_rate": 0.00012739048375881595, + "loss": 0.4786, + "step": 169280 + }, + { + "epoch": 8.408165292539982, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001273507499751664, + "loss": 0.4763, + "step": 169290 + }, + { + "epoch": 8.408661964835602, + "grad_norm": 0.19921875, + "learning_rate": 0.00012731101619151684, + "loss": 0.4851, + "step": 169300 + }, + { + "epoch": 8.40915863713122, + "grad_norm": 0.16015625, + "learning_rate": 0.0001272712824078673, + "loss": 0.4838, + "step": 169310 + }, + { + "epoch": 8.40965530942684, + "grad_norm": 0.173828125, + "learning_rate": 0.00012723154862421775, + "loss": 0.4796, + "step": 169320 + }, + { + "epoch": 8.410151981722459, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001271918148405682, + "loss": 0.5066, + "step": 169330 + }, + { + "epoch": 8.410648654018079, + "grad_norm": 0.17578125, + "learning_rate": 0.00012715208105691864, + "loss": 0.5034, + "step": 169340 + }, + { + "epoch": 8.411145326313699, + "grad_norm": 0.1884765625, + "learning_rate": 0.0001271123472732691, + "loss": 0.4853, + "step": 169350 + }, + { + "epoch": 8.411641998609317, + "grad_norm": 0.171875, + "learning_rate": 0.00012707261348961956, + "loss": 0.4663, + "step": 169360 + }, + { + "epoch": 8.412138670904937, + "grad_norm": 0.16796875, + "learning_rate": 0.00012703287970597, + "loss": 0.4583, + "step": 169370 + }, + { + "epoch": 8.412635343200556, + "grad_norm": 0.1708984375, + "learning_rate": 0.00012699314592232045, + "loss": 0.5136, + "step": 169380 + }, + { + "epoch": 8.413132015496176, + "grad_norm": 0.17578125, + "learning_rate": 0.00012695341213867092, + "loss": 0.5158, + "step": 169390 + }, + { + "epoch": 8.413628687791794, + "grad_norm": 0.181640625, + "learning_rate": 0.00012691367835502136, + "loss": 0.4755, + "step": 169400 + }, + { + "epoch": 8.414125360087414, + "grad_norm": 0.181640625, + "learning_rate": 0.0001268739445713718, + "loss": 0.4769, + "step": 169410 + }, + { + "epoch": 8.414622032383035, + "grad_norm": 0.1748046875, + "learning_rate": 0.00012683421078772225, + "loss": 0.4683, + "step": 169420 + }, + { + "epoch": 8.415118704678653, + "grad_norm": 0.16015625, + "learning_rate": 0.00012679447700407272, + "loss": 0.4729, + "step": 169430 + }, + { + "epoch": 8.415615376974273, + "grad_norm": 0.1748046875, + "learning_rate": 0.00012675474322042317, + "loss": 0.4856, + "step": 169440 + }, + { + "epoch": 8.416112049269891, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001267150094367736, + "loss": 0.5044, + "step": 169450 + }, + { + "epoch": 8.416608721565511, + "grad_norm": 0.1591796875, + "learning_rate": 0.00012667527565312408, + "loss": 0.4702, + "step": 169460 + }, + { + "epoch": 8.41710539386113, + "grad_norm": 0.1787109375, + "learning_rate": 0.00012663554186947453, + "loss": 0.4685, + "step": 169470 + }, + { + "epoch": 8.41760206615675, + "grad_norm": 0.185546875, + "learning_rate": 0.000126595808085825, + "loss": 0.4905, + "step": 169480 + }, + { + "epoch": 8.418098738452368, + "grad_norm": 0.177734375, + "learning_rate": 0.00012655607430217541, + "loss": 0.4802, + "step": 169490 + }, + { + "epoch": 8.418595410747988, + "grad_norm": 0.169921875, + "learning_rate": 0.00012651634051852589, + "loss": 0.4804, + "step": 169500 + }, + { + "epoch": 8.419092083043608, + "grad_norm": 0.19921875, + "learning_rate": 0.00012647660673487633, + "loss": 0.4988, + "step": 169510 + }, + { + "epoch": 8.419588755339227, + "grad_norm": 0.162109375, + "learning_rate": 0.0001264368729512268, + "loss": 0.5153, + "step": 169520 + }, + { + "epoch": 8.420085427634847, + "grad_norm": 0.1826171875, + "learning_rate": 0.00012639713916757722, + "loss": 0.4659, + "step": 169530 + }, + { + "epoch": 8.420582099930465, + "grad_norm": 0.181640625, + "learning_rate": 0.0001263574053839277, + "loss": 0.4871, + "step": 169540 + }, + { + "epoch": 8.421078772226085, + "grad_norm": 0.193359375, + "learning_rate": 0.00012631767160027813, + "loss": 0.4974, + "step": 169550 + }, + { + "epoch": 8.421575444521704, + "grad_norm": 0.1640625, + "learning_rate": 0.0001262779378166286, + "loss": 0.4921, + "step": 169560 + }, + { + "epoch": 8.422072116817324, + "grad_norm": 0.162109375, + "learning_rate": 0.00012623820403297905, + "loss": 0.5091, + "step": 169570 + }, + { + "epoch": 8.422568789112944, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001261984702493295, + "loss": 0.472, + "step": 169580 + }, + { + "epoch": 8.423065461408562, + "grad_norm": 0.2080078125, + "learning_rate": 0.00012615873646567994, + "loss": 0.4821, + "step": 169590 + }, + { + "epoch": 8.423562133704182, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001261190026820304, + "loss": 0.5078, + "step": 169600 + }, + { + "epoch": 8.4240588059998, + "grad_norm": 0.1767578125, + "learning_rate": 0.00012607926889838085, + "loss": 0.4889, + "step": 169610 + }, + { + "epoch": 8.42455547829542, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001260395351147313, + "loss": 0.4557, + "step": 169620 + }, + { + "epoch": 8.42505215059104, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012599980133108177, + "loss": 0.4891, + "step": 169630 + }, + { + "epoch": 8.42554882288666, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001259600675474322, + "loss": 0.4704, + "step": 169640 + }, + { + "epoch": 8.42604549518228, + "grad_norm": 0.1748046875, + "learning_rate": 0.00012592033376378268, + "loss": 0.4891, + "step": 169650 + }, + { + "epoch": 8.426542167477898, + "grad_norm": 0.169921875, + "learning_rate": 0.0001258805999801331, + "loss": 0.5038, + "step": 169660 + }, + { + "epoch": 8.427038839773518, + "grad_norm": 0.181640625, + "learning_rate": 0.00012584086619648357, + "loss": 0.5051, + "step": 169670 + }, + { + "epoch": 8.427535512069136, + "grad_norm": 0.17578125, + "learning_rate": 0.00012580113241283402, + "loss": 0.4965, + "step": 169680 + }, + { + "epoch": 8.428032184364756, + "grad_norm": 0.248046875, + "learning_rate": 0.0001257613986291845, + "loss": 0.4932, + "step": 169690 + }, + { + "epoch": 8.428528856660375, + "grad_norm": 0.1875, + "learning_rate": 0.0001257216648455349, + "loss": 0.5121, + "step": 169700 + }, + { + "epoch": 8.429025528955995, + "grad_norm": 0.169921875, + "learning_rate": 0.00012568193106188538, + "loss": 0.4726, + "step": 169710 + }, + { + "epoch": 8.429522201251615, + "grad_norm": 0.18359375, + "learning_rate": 0.00012564219727823582, + "loss": 0.4629, + "step": 169720 + }, + { + "epoch": 8.430018873547233, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001256024634945863, + "loss": 0.4679, + "step": 169730 + }, + { + "epoch": 8.430515545842853, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001255627297109367, + "loss": 0.4935, + "step": 169740 + }, + { + "epoch": 8.431012218138472, + "grad_norm": 0.1796875, + "learning_rate": 0.00012552299592728718, + "loss": 0.4959, + "step": 169750 + }, + { + "epoch": 8.431508890434092, + "grad_norm": 0.1865234375, + "learning_rate": 0.00012548326214363763, + "loss": 0.4851, + "step": 169760 + }, + { + "epoch": 8.43200556272971, + "grad_norm": 0.1953125, + "learning_rate": 0.0001254435283599881, + "loss": 0.4833, + "step": 169770 + }, + { + "epoch": 8.43250223502533, + "grad_norm": 0.1552734375, + "learning_rate": 0.00012540379457633854, + "loss": 0.4903, + "step": 169780 + }, + { + "epoch": 8.43299890732095, + "grad_norm": 0.1787109375, + "learning_rate": 0.00012536406079268899, + "loss": 0.4767, + "step": 169790 + }, + { + "epoch": 8.433495579616569, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012532432700903946, + "loss": 0.4849, + "step": 169800 + }, + { + "epoch": 8.433992251912189, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001252845932253899, + "loss": 0.5229, + "step": 169810 + }, + { + "epoch": 8.434488924207807, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012524485944174035, + "loss": 0.4598, + "step": 169820 + }, + { + "epoch": 8.434985596503427, + "grad_norm": 0.1875, + "learning_rate": 0.0001252051256580908, + "loss": 0.5054, + "step": 169830 + }, + { + "epoch": 8.435482268799046, + "grad_norm": 0.158203125, + "learning_rate": 0.00012516539187444126, + "loss": 0.459, + "step": 169840 + }, + { + "epoch": 8.435978941094666, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001251256580907917, + "loss": 0.491, + "step": 169850 + }, + { + "epoch": 8.436475613390286, + "grad_norm": 0.1806640625, + "learning_rate": 0.00012508592430714215, + "loss": 0.4843, + "step": 169860 + }, + { + "epoch": 8.436972285685904, + "grad_norm": 0.189453125, + "learning_rate": 0.0001250461905234926, + "loss": 0.5034, + "step": 169870 + }, + { + "epoch": 8.437468957981524, + "grad_norm": 0.1845703125, + "learning_rate": 0.00012500645673984307, + "loss": 0.4848, + "step": 169880 + }, + { + "epoch": 8.437965630277143, + "grad_norm": 0.25, + "learning_rate": 0.0001249667229561935, + "loss": 0.5074, + "step": 169890 + }, + { + "epoch": 8.438462302572763, + "grad_norm": 0.1884765625, + "learning_rate": 0.00012492698917254395, + "loss": 0.5188, + "step": 169900 + }, + { + "epoch": 8.438958974868381, + "grad_norm": 0.203125, + "learning_rate": 0.0001248872553888944, + "loss": 0.5135, + "step": 169910 + }, + { + "epoch": 8.439455647164001, + "grad_norm": 0.15625, + "learning_rate": 0.00012484752160524487, + "loss": 0.457, + "step": 169920 + }, + { + "epoch": 8.439952319459621, + "grad_norm": 0.208984375, + "learning_rate": 0.00012480778782159531, + "loss": 0.5279, + "step": 169930 + }, + { + "epoch": 8.44044899175524, + "grad_norm": 0.1923828125, + "learning_rate": 0.00012476805403794576, + "loss": 0.4814, + "step": 169940 + }, + { + "epoch": 8.44094566405086, + "grad_norm": 0.1572265625, + "learning_rate": 0.00012472832025429623, + "loss": 0.485, + "step": 169950 + }, + { + "epoch": 8.441442336346478, + "grad_norm": 0.1708984375, + "learning_rate": 0.00012468858647064667, + "loss": 0.5091, + "step": 169960 + }, + { + "epoch": 8.441939008642098, + "grad_norm": 0.19140625, + "learning_rate": 0.00012464885268699712, + "loss": 0.4557, + "step": 169970 + }, + { + "epoch": 8.442435680937717, + "grad_norm": 0.173828125, + "learning_rate": 0.0001246091189033476, + "loss": 0.4626, + "step": 169980 + }, + { + "epoch": 8.442932353233337, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012456938511969803, + "loss": 0.4495, + "step": 169990 + }, + { + "epoch": 8.443429025528957, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012452965133604848, + "loss": 0.4806, + "step": 170000 + }, + { + "epoch": 8.443925697824575, + "grad_norm": 0.189453125, + "learning_rate": 0.00012448991755239895, + "loss": 0.4817, + "step": 170010 + }, + { + "epoch": 8.444422370120195, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001244501837687494, + "loss": 0.4995, + "step": 170020 + }, + { + "epoch": 8.444919042415814, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012441044998509984, + "loss": 0.4927, + "step": 170030 + }, + { + "epoch": 8.445415714711434, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012437071620145028, + "loss": 0.5012, + "step": 170040 + }, + { + "epoch": 8.445912387007052, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012433098241780075, + "loss": 0.493, + "step": 170050 + }, + { + "epoch": 8.446409059302672, + "grad_norm": 0.166015625, + "learning_rate": 0.0001242912486341512, + "loss": 0.525, + "step": 170060 + }, + { + "epoch": 8.446905731598292, + "grad_norm": 0.1748046875, + "learning_rate": 0.00012425151485050164, + "loss": 0.4894, + "step": 170070 + }, + { + "epoch": 8.44740240389391, + "grad_norm": 0.1708984375, + "learning_rate": 0.00012421178106685209, + "loss": 0.4814, + "step": 170080 + }, + { + "epoch": 8.44789907618953, + "grad_norm": 0.173828125, + "learning_rate": 0.00012417204728320256, + "loss": 0.4779, + "step": 170090 + }, + { + "epoch": 8.448395748485149, + "grad_norm": 0.1650390625, + "learning_rate": 0.000124132313499553, + "loss": 0.4791, + "step": 170100 + }, + { + "epoch": 8.44889242078077, + "grad_norm": 0.1767578125, + "learning_rate": 0.00012409257971590345, + "loss": 0.4647, + "step": 170110 + }, + { + "epoch": 8.449389093076388, + "grad_norm": 0.1875, + "learning_rate": 0.0001240528459322539, + "loss": 0.5275, + "step": 170120 + }, + { + "epoch": 8.449885765372008, + "grad_norm": 0.16015625, + "learning_rate": 0.00012401311214860436, + "loss": 0.4688, + "step": 170130 + }, + { + "epoch": 8.450382437667628, + "grad_norm": 0.17578125, + "learning_rate": 0.0001239733783649548, + "loss": 0.458, + "step": 170140 + }, + { + "epoch": 8.450879109963246, + "grad_norm": 0.224609375, + "learning_rate": 0.00012393364458130525, + "loss": 0.4909, + "step": 170150 + }, + { + "epoch": 8.451375782258866, + "grad_norm": 0.1591796875, + "learning_rate": 0.00012389391079765572, + "loss": 0.5083, + "step": 170160 + }, + { + "epoch": 8.451872454554485, + "grad_norm": 0.169921875, + "learning_rate": 0.00012385417701400617, + "loss": 0.5134, + "step": 170170 + }, + { + "epoch": 8.452369126850105, + "grad_norm": 0.181640625, + "learning_rate": 0.00012381444323035664, + "loss": 0.4914, + "step": 170180 + }, + { + "epoch": 8.452865799145723, + "grad_norm": 0.1640625, + "learning_rate": 0.00012377470944670705, + "loss": 0.4646, + "step": 170190 + }, + { + "epoch": 8.453362471441343, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012373497566305753, + "loss": 0.4769, + "step": 170200 + }, + { + "epoch": 8.453859143736963, + "grad_norm": 0.1669921875, + "learning_rate": 0.00012369524187940797, + "loss": 0.4946, + "step": 170210 + }, + { + "epoch": 8.454355816032582, + "grad_norm": 0.185546875, + "learning_rate": 0.00012365550809575844, + "loss": 0.503, + "step": 170220 + }, + { + "epoch": 8.454852488328202, + "grad_norm": 0.181640625, + "learning_rate": 0.00012361577431210886, + "loss": 0.5124, + "step": 170230 + }, + { + "epoch": 8.45534916062382, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012357604052845933, + "loss": 0.4763, + "step": 170240 + }, + { + "epoch": 8.45584583291944, + "grad_norm": 0.166015625, + "learning_rate": 0.00012353630674480977, + "loss": 0.4672, + "step": 170250 + }, + { + "epoch": 8.456342505215058, + "grad_norm": 0.162109375, + "learning_rate": 0.00012349657296116024, + "loss": 0.5051, + "step": 170260 + }, + { + "epoch": 8.456839177510679, + "grad_norm": 0.1748046875, + "learning_rate": 0.00012345683917751066, + "loss": 0.4574, + "step": 170270 + }, + { + "epoch": 8.457335849806299, + "grad_norm": 0.16015625, + "learning_rate": 0.00012341710539386113, + "loss": 0.4711, + "step": 170280 + }, + { + "epoch": 8.457832522101917, + "grad_norm": 0.1982421875, + "learning_rate": 0.00012337737161021158, + "loss": 0.4805, + "step": 170290 + }, + { + "epoch": 8.458329194397537, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012333763782656205, + "loss": 0.4692, + "step": 170300 + }, + { + "epoch": 8.458825866693156, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001232979040429125, + "loss": 0.4767, + "step": 170310 + }, + { + "epoch": 8.459322538988776, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012325817025926294, + "loss": 0.4833, + "step": 170320 + }, + { + "epoch": 8.459819211284394, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001232184364756134, + "loss": 0.488, + "step": 170330 + }, + { + "epoch": 8.460315883580014, + "grad_norm": 0.1796875, + "learning_rate": 0.00012317870269196385, + "loss": 0.529, + "step": 170340 + }, + { + "epoch": 8.460812555875632, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001231389689083143, + "loss": 0.4887, + "step": 170350 + }, + { + "epoch": 8.461309228171253, + "grad_norm": 0.1767578125, + "learning_rate": 0.00012309923512466474, + "loss": 0.5137, + "step": 170360 + }, + { + "epoch": 8.461805900466873, + "grad_norm": 0.197265625, + "learning_rate": 0.0001230595013410152, + "loss": 0.499, + "step": 170370 + }, + { + "epoch": 8.462302572762491, + "grad_norm": 0.150390625, + "learning_rate": 0.00012301976755736566, + "loss": 0.5015, + "step": 170380 + }, + { + "epoch": 8.462799245058111, + "grad_norm": 0.162109375, + "learning_rate": 0.00012298003377371613, + "loss": 0.4844, + "step": 170390 + }, + { + "epoch": 8.46329591735373, + "grad_norm": 0.1796875, + "learning_rate": 0.00012294029999006655, + "loss": 0.4932, + "step": 170400 + }, + { + "epoch": 8.46379258964935, + "grad_norm": 0.1669921875, + "learning_rate": 0.00012290056620641702, + "loss": 0.4661, + "step": 170410 + }, + { + "epoch": 8.464289261944968, + "grad_norm": 0.185546875, + "learning_rate": 0.00012286083242276746, + "loss": 0.504, + "step": 170420 + }, + { + "epoch": 8.464785934240588, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012282109863911793, + "loss": 0.4773, + "step": 170430 + }, + { + "epoch": 8.465282606536208, + "grad_norm": 0.16015625, + "learning_rate": 0.00012278136485546835, + "loss": 0.4944, + "step": 170440 + }, + { + "epoch": 8.465779278831826, + "grad_norm": 0.1904296875, + "learning_rate": 0.00012274163107181882, + "loss": 0.4932, + "step": 170450 + }, + { + "epoch": 8.466275951127447, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012270189728816927, + "loss": 0.4765, + "step": 170460 + }, + { + "epoch": 8.466772623423065, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012266216350451974, + "loss": 0.487, + "step": 170470 + }, + { + "epoch": 8.467269295718685, + "grad_norm": 0.2119140625, + "learning_rate": 0.00012262242972087018, + "loss": 0.4692, + "step": 170480 + }, + { + "epoch": 8.467765968014303, + "grad_norm": 0.171875, + "learning_rate": 0.00012258269593722063, + "loss": 0.4835, + "step": 170490 + }, + { + "epoch": 8.468262640309923, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001225429621535711, + "loss": 0.4907, + "step": 170500 + }, + { + "epoch": 8.468759312605544, + "grad_norm": 0.17578125, + "learning_rate": 0.00012250322836992154, + "loss": 0.4705, + "step": 170510 + }, + { + "epoch": 8.469255984901162, + "grad_norm": 0.1982421875, + "learning_rate": 0.00012246349458627199, + "loss": 0.4721, + "step": 170520 + }, + { + "epoch": 8.469752657196782, + "grad_norm": 0.16796875, + "learning_rate": 0.00012242376080262243, + "loss": 0.4743, + "step": 170530 + }, + { + "epoch": 8.4702493294924, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001223840270189729, + "loss": 0.4876, + "step": 170540 + }, + { + "epoch": 8.47074600178802, + "grad_norm": 0.1640625, + "learning_rate": 0.00012234429323532335, + "loss": 0.5025, + "step": 170550 + }, + { + "epoch": 8.471242674083639, + "grad_norm": 0.1923828125, + "learning_rate": 0.0001223045594516738, + "loss": 0.4845, + "step": 170560 + }, + { + "epoch": 8.471739346379259, + "grad_norm": 0.1640625, + "learning_rate": 0.00012226482566802423, + "loss": 0.5168, + "step": 170570 + }, + { + "epoch": 8.472236018674879, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001222250918843747, + "loss": 0.498, + "step": 170580 + }, + { + "epoch": 8.472732690970497, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012218535810072515, + "loss": 0.4583, + "step": 170590 + }, + { + "epoch": 8.473229363266118, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001221456243170756, + "loss": 0.4809, + "step": 170600 + }, + { + "epoch": 8.473726035561736, + "grad_norm": 0.1845703125, + "learning_rate": 0.00012210589053342604, + "loss": 0.5027, + "step": 170610 + }, + { + "epoch": 8.474222707857356, + "grad_norm": 0.16015625, + "learning_rate": 0.00012206615674977651, + "loss": 0.459, + "step": 170620 + }, + { + "epoch": 8.474719380152974, + "grad_norm": 0.1669921875, + "learning_rate": 0.00012202642296612697, + "loss": 0.4806, + "step": 170630 + }, + { + "epoch": 8.475216052448594, + "grad_norm": 0.1923828125, + "learning_rate": 0.0001219866891824774, + "loss": 0.4887, + "step": 170640 + }, + { + "epoch": 8.475712724744215, + "grad_norm": 0.1865234375, + "learning_rate": 0.00012194695539882786, + "loss": 0.4781, + "step": 170650 + }, + { + "epoch": 8.476209397039833, + "grad_norm": 0.177734375, + "learning_rate": 0.00012190722161517831, + "loss": 0.4878, + "step": 170660 + }, + { + "epoch": 8.476706069335453, + "grad_norm": 0.19921875, + "learning_rate": 0.00012186748783152877, + "loss": 0.5021, + "step": 170670 + }, + { + "epoch": 8.477202741631071, + "grad_norm": 0.17578125, + "learning_rate": 0.0001218277540478792, + "loss": 0.4751, + "step": 170680 + }, + { + "epoch": 8.477699413926691, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012178802026422966, + "loss": 0.4795, + "step": 170690 + }, + { + "epoch": 8.47819608622231, + "grad_norm": 0.197265625, + "learning_rate": 0.00012174828648058012, + "loss": 0.5012, + "step": 170700 + }, + { + "epoch": 8.47869275851793, + "grad_norm": 0.1953125, + "learning_rate": 0.00012170855269693058, + "loss": 0.5053, + "step": 170710 + }, + { + "epoch": 8.47918943081355, + "grad_norm": 0.1640625, + "learning_rate": 0.00012166881891328102, + "loss": 0.4818, + "step": 170720 + }, + { + "epoch": 8.479686103109168, + "grad_norm": 0.173828125, + "learning_rate": 0.00012162908512963148, + "loss": 0.4805, + "step": 170730 + }, + { + "epoch": 8.480182775404788, + "grad_norm": 0.171875, + "learning_rate": 0.00012158935134598192, + "loss": 0.5127, + "step": 170740 + }, + { + "epoch": 8.480679447700407, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012154961756233238, + "loss": 0.5017, + "step": 170750 + }, + { + "epoch": 8.481176119996027, + "grad_norm": 0.1787109375, + "learning_rate": 0.00012150988377868282, + "loss": 0.4945, + "step": 170760 + }, + { + "epoch": 8.481672792291645, + "grad_norm": 0.185546875, + "learning_rate": 0.00012147014999503328, + "loss": 0.4775, + "step": 170770 + }, + { + "epoch": 8.482169464587265, + "grad_norm": 0.197265625, + "learning_rate": 0.00012143041621138374, + "loss": 0.4812, + "step": 170780 + }, + { + "epoch": 8.482666136882886, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001213906824277342, + "loss": 0.4761, + "step": 170790 + }, + { + "epoch": 8.483162809178504, + "grad_norm": 0.173828125, + "learning_rate": 0.00012135094864408463, + "loss": 0.4856, + "step": 170800 + }, + { + "epoch": 8.483659481474124, + "grad_norm": 0.1611328125, + "learning_rate": 0.00012131121486043509, + "loss": 0.4723, + "step": 170810 + }, + { + "epoch": 8.484156153769742, + "grad_norm": 0.189453125, + "learning_rate": 0.00012127148107678554, + "loss": 0.5, + "step": 170820 + }, + { + "epoch": 8.484652826065362, + "grad_norm": 0.1796875, + "learning_rate": 0.000121231747293136, + "loss": 0.4597, + "step": 170830 + }, + { + "epoch": 8.48514949836098, + "grad_norm": 0.1572265625, + "learning_rate": 0.00012119201350948646, + "loss": 0.4632, + "step": 170840 + }, + { + "epoch": 8.4856461706566, + "grad_norm": 0.1591796875, + "learning_rate": 0.00012115227972583689, + "loss": 0.4383, + "step": 170850 + }, + { + "epoch": 8.48614284295222, + "grad_norm": 0.19140625, + "learning_rate": 0.00012111254594218735, + "loss": 0.4637, + "step": 170860 + }, + { + "epoch": 8.48663951524784, + "grad_norm": 0.150390625, + "learning_rate": 0.0001210728121585378, + "loss": 0.485, + "step": 170870 + }, + { + "epoch": 8.48713618754346, + "grad_norm": 0.1884765625, + "learning_rate": 0.00012103307837488826, + "loss": 0.4719, + "step": 170880 + }, + { + "epoch": 8.487632859839078, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001209933445912387, + "loss": 0.4889, + "step": 170890 + }, + { + "epoch": 8.488129532134698, + "grad_norm": 0.1767578125, + "learning_rate": 0.00012095361080758915, + "loss": 0.5083, + "step": 170900 + }, + { + "epoch": 8.488626204430316, + "grad_norm": 0.1640625, + "learning_rate": 0.00012091387702393961, + "loss": 0.4648, + "step": 170910 + }, + { + "epoch": 8.489122876725936, + "grad_norm": 0.1884765625, + "learning_rate": 0.00012087414324029007, + "loss": 0.462, + "step": 170920 + }, + { + "epoch": 8.489619549021555, + "grad_norm": 0.17578125, + "learning_rate": 0.00012083440945664051, + "loss": 0.4874, + "step": 170930 + }, + { + "epoch": 8.490116221317175, + "grad_norm": 0.2099609375, + "learning_rate": 0.00012079467567299097, + "loss": 0.4802, + "step": 170940 + }, + { + "epoch": 8.490612893612795, + "grad_norm": 0.17578125, + "learning_rate": 0.00012075494188934143, + "loss": 0.5076, + "step": 170950 + }, + { + "epoch": 8.491109565908413, + "grad_norm": 0.15234375, + "learning_rate": 0.00012071520810569188, + "loss": 0.4954, + "step": 170960 + }, + { + "epoch": 8.491606238204033, + "grad_norm": 0.1845703125, + "learning_rate": 0.00012067547432204232, + "loss": 0.497, + "step": 170970 + }, + { + "epoch": 8.492102910499652, + "grad_norm": 0.169921875, + "learning_rate": 0.00012063574053839277, + "loss": 0.5424, + "step": 170980 + }, + { + "epoch": 8.492599582795272, + "grad_norm": 0.1611328125, + "learning_rate": 0.00012059600675474323, + "loss": 0.5002, + "step": 170990 + }, + { + "epoch": 8.49309625509089, + "grad_norm": 0.1767578125, + "learning_rate": 0.00012055627297109369, + "loss": 0.4813, + "step": 171000 + }, + { + "epoch": 8.49359292738651, + "grad_norm": 0.16796875, + "learning_rate": 0.00012051653918744412, + "loss": 0.5072, + "step": 171010 + }, + { + "epoch": 8.49408959968213, + "grad_norm": 0.185546875, + "learning_rate": 0.00012047680540379458, + "loss": 0.5017, + "step": 171020 + }, + { + "epoch": 8.494586271977749, + "grad_norm": 0.169921875, + "learning_rate": 0.00012043707162014503, + "loss": 0.4959, + "step": 171030 + }, + { + "epoch": 8.495082944273369, + "grad_norm": 0.18359375, + "learning_rate": 0.00012039733783649549, + "loss": 0.4379, + "step": 171040 + }, + { + "epoch": 8.495579616568987, + "grad_norm": 0.158203125, + "learning_rate": 0.00012035760405284592, + "loss": 0.4611, + "step": 171050 + }, + { + "epoch": 8.496076288864607, + "grad_norm": 0.1884765625, + "learning_rate": 0.00012031787026919638, + "loss": 0.5006, + "step": 171060 + }, + { + "epoch": 8.496572961160226, + "grad_norm": 0.1708984375, + "learning_rate": 0.00012027813648554684, + "loss": 0.4515, + "step": 171070 + }, + { + "epoch": 8.497069633455846, + "grad_norm": 0.197265625, + "learning_rate": 0.0001202384027018973, + "loss": 0.487, + "step": 171080 + }, + { + "epoch": 8.497566305751466, + "grad_norm": 0.2041015625, + "learning_rate": 0.00012019866891824774, + "loss": 0.484, + "step": 171090 + }, + { + "epoch": 8.498062978047084, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001201589351345982, + "loss": 0.4772, + "step": 171100 + }, + { + "epoch": 8.498559650342704, + "grad_norm": 0.16796875, + "learning_rate": 0.00012011920135094866, + "loss": 0.4699, + "step": 171110 + }, + { + "epoch": 8.499056322638323, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012007946756729911, + "loss": 0.4888, + "step": 171120 + }, + { + "epoch": 8.499552994933943, + "grad_norm": 0.1787109375, + "learning_rate": 0.00012003973378364955, + "loss": 0.461, + "step": 171130 + }, + { + "epoch": 8.500049667229561, + "grad_norm": 0.1787109375, + "learning_rate": 0.00012, + "loss": 0.5063, + "step": 171140 + }, + { + "epoch": 8.500546339525181, + "grad_norm": 0.1845703125, + "learning_rate": 0.00011996026621635046, + "loss": 0.4832, + "step": 171150 + }, + { + "epoch": 8.501043011820801, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011992053243270092, + "loss": 0.4892, + "step": 171160 + }, + { + "epoch": 8.50153968411642, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011988079864905135, + "loss": 0.4947, + "step": 171170 + }, + { + "epoch": 8.50203635641204, + "grad_norm": 0.17578125, + "learning_rate": 0.00011984106486540181, + "loss": 0.4856, + "step": 171180 + }, + { + "epoch": 8.502533028707658, + "grad_norm": 0.1845703125, + "learning_rate": 0.00011980133108175226, + "loss": 0.4755, + "step": 171190 + }, + { + "epoch": 8.503029701003278, + "grad_norm": 0.171875, + "learning_rate": 0.00011976159729810272, + "loss": 0.5107, + "step": 171200 + }, + { + "epoch": 8.503526373298897, + "grad_norm": 0.17578125, + "learning_rate": 0.00011972186351445315, + "loss": 0.5227, + "step": 171210 + }, + { + "epoch": 8.504023045594517, + "grad_norm": 0.1552734375, + "learning_rate": 0.00011968212973080361, + "loss": 0.4864, + "step": 171220 + }, + { + "epoch": 8.504519717890137, + "grad_norm": 0.1748046875, + "learning_rate": 0.00011964239594715407, + "loss": 0.4398, + "step": 171230 + }, + { + "epoch": 8.505016390185755, + "grad_norm": 0.1875, + "learning_rate": 0.00011960266216350453, + "loss": 0.4832, + "step": 171240 + }, + { + "epoch": 8.505513062481375, + "grad_norm": 0.1796875, + "learning_rate": 0.00011956292837985498, + "loss": 0.4942, + "step": 171250 + }, + { + "epoch": 8.506009734776994, + "grad_norm": 0.1611328125, + "learning_rate": 0.00011952319459620543, + "loss": 0.4764, + "step": 171260 + }, + { + "epoch": 8.506506407072614, + "grad_norm": 0.18359375, + "learning_rate": 0.00011948346081255589, + "loss": 0.4805, + "step": 171270 + }, + { + "epoch": 8.507003079368232, + "grad_norm": 0.1943359375, + "learning_rate": 0.00011944372702890634, + "loss": 0.5, + "step": 171280 + }, + { + "epoch": 8.507499751663852, + "grad_norm": 0.15234375, + "learning_rate": 0.00011940399324525679, + "loss": 0.4862, + "step": 171290 + }, + { + "epoch": 8.507996423959472, + "grad_norm": 0.197265625, + "learning_rate": 0.00011936425946160723, + "loss": 0.4842, + "step": 171300 + }, + { + "epoch": 8.50849309625509, + "grad_norm": 0.169921875, + "learning_rate": 0.00011932452567795769, + "loss": 0.4655, + "step": 171310 + }, + { + "epoch": 8.50898976855071, + "grad_norm": 0.166015625, + "learning_rate": 0.00011928479189430815, + "loss": 0.4868, + "step": 171320 + }, + { + "epoch": 8.509486440846329, + "grad_norm": 0.177734375, + "learning_rate": 0.0001192450581106586, + "loss": 0.4836, + "step": 171330 + }, + { + "epoch": 8.50998311314195, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011920532432700904, + "loss": 0.4904, + "step": 171340 + }, + { + "epoch": 8.510479785437568, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001191655905433595, + "loss": 0.4697, + "step": 171350 + }, + { + "epoch": 8.510976457733188, + "grad_norm": 0.181640625, + "learning_rate": 0.00011912585675970995, + "loss": 0.5178, + "step": 171360 + }, + { + "epoch": 8.511473130028808, + "grad_norm": 0.1748046875, + "learning_rate": 0.00011908612297606041, + "loss": 0.4777, + "step": 171370 + }, + { + "epoch": 8.511969802324426, + "grad_norm": 0.197265625, + "learning_rate": 0.00011904638919241084, + "loss": 0.5055, + "step": 171380 + }, + { + "epoch": 8.512466474620046, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001190066554087613, + "loss": 0.4759, + "step": 171390 + }, + { + "epoch": 8.512963146915665, + "grad_norm": 0.177734375, + "learning_rate": 0.00011896692162511176, + "loss": 0.4742, + "step": 171400 + }, + { + "epoch": 8.513459819211285, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011892718784146221, + "loss": 0.4928, + "step": 171410 + }, + { + "epoch": 8.513956491506903, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011888745405781266, + "loss": 0.4833, + "step": 171420 + }, + { + "epoch": 8.514453163802523, + "grad_norm": 0.171875, + "learning_rate": 0.00011884772027416312, + "loss": 0.514, + "step": 171430 + }, + { + "epoch": 8.514949836098143, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011880798649051356, + "loss": 0.4703, + "step": 171440 + }, + { + "epoch": 8.515446508393762, + "grad_norm": 0.1611328125, + "learning_rate": 0.00011876825270686402, + "loss": 0.4729, + "step": 171450 + }, + { + "epoch": 8.515943180689382, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011872851892321446, + "loss": 0.4881, + "step": 171460 + }, + { + "epoch": 8.516439852985, + "grad_norm": 0.208984375, + "learning_rate": 0.00011868878513956492, + "loss": 0.4682, + "step": 171470 + }, + { + "epoch": 8.51693652528062, + "grad_norm": 0.162109375, + "learning_rate": 0.00011864905135591538, + "loss": 0.4637, + "step": 171480 + }, + { + "epoch": 8.517433197576239, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011860931757226584, + "loss": 0.4956, + "step": 171490 + }, + { + "epoch": 8.517929869871859, + "grad_norm": 0.1806640625, + "learning_rate": 0.00011856958378861627, + "loss": 0.4511, + "step": 171500 + }, + { + "epoch": 8.518426542167479, + "grad_norm": 0.1962890625, + "learning_rate": 0.00011852985000496672, + "loss": 0.4889, + "step": 171510 + }, + { + "epoch": 8.518923214463097, + "grad_norm": 0.166015625, + "learning_rate": 0.00011849011622131718, + "loss": 0.45, + "step": 171520 + }, + { + "epoch": 8.519419886758717, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011845038243766764, + "loss": 0.4617, + "step": 171530 + }, + { + "epoch": 8.519916559054336, + "grad_norm": 0.181640625, + "learning_rate": 0.00011841064865401807, + "loss": 0.5195, + "step": 171540 + }, + { + "epoch": 8.520413231349956, + "grad_norm": 0.17578125, + "learning_rate": 0.00011837091487036853, + "loss": 0.4709, + "step": 171550 + }, + { + "epoch": 8.520909903645574, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011833118108671899, + "loss": 0.4762, + "step": 171560 + }, + { + "epoch": 8.521406575941194, + "grad_norm": 0.16796875, + "learning_rate": 0.00011829144730306944, + "loss": 0.4603, + "step": 171570 + }, + { + "epoch": 8.521903248236814, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011825171351941989, + "loss": 0.4745, + "step": 171580 + }, + { + "epoch": 8.522399920532433, + "grad_norm": 0.19140625, + "learning_rate": 0.00011821197973577033, + "loss": 0.501, + "step": 171590 + }, + { + "epoch": 8.522896592828053, + "grad_norm": 0.1796875, + "learning_rate": 0.00011817224595212079, + "loss": 0.4991, + "step": 171600 + }, + { + "epoch": 8.523393265123671, + "grad_norm": 0.1748046875, + "learning_rate": 0.00011813251216847125, + "loss": 0.4872, + "step": 171610 + }, + { + "epoch": 8.523889937419291, + "grad_norm": 0.166015625, + "learning_rate": 0.00011809277838482169, + "loss": 0.4998, + "step": 171620 + }, + { + "epoch": 8.52438660971491, + "grad_norm": 0.1787109375, + "learning_rate": 0.00011805304460117215, + "loss": 0.4902, + "step": 171630 + }, + { + "epoch": 8.52488328201053, + "grad_norm": 0.1630859375, + "learning_rate": 0.00011801331081752261, + "loss": 0.4906, + "step": 171640 + }, + { + "epoch": 8.52537995430615, + "grad_norm": 0.169921875, + "learning_rate": 0.00011797357703387307, + "loss": 0.4801, + "step": 171650 + }, + { + "epoch": 8.525876626601768, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011793384325022352, + "loss": 0.4798, + "step": 171660 + }, + { + "epoch": 8.526373298897388, + "grad_norm": 0.1884765625, + "learning_rate": 0.00011789410946657395, + "loss": 0.4599, + "step": 171670 + }, + { + "epoch": 8.526869971193006, + "grad_norm": 0.197265625, + "learning_rate": 0.00011785437568292441, + "loss": 0.4866, + "step": 171680 + }, + { + "epoch": 8.527366643488627, + "grad_norm": 0.177734375, + "learning_rate": 0.00011781464189927487, + "loss": 0.5076, + "step": 171690 + }, + { + "epoch": 8.527863315784245, + "grad_norm": 0.1630859375, + "learning_rate": 0.00011777490811562533, + "loss": 0.4867, + "step": 171700 + }, + { + "epoch": 8.528359988079865, + "grad_norm": 0.16796875, + "learning_rate": 0.00011773517433197576, + "loss": 0.4963, + "step": 171710 + }, + { + "epoch": 8.528856660375485, + "grad_norm": 0.1845703125, + "learning_rate": 0.00011769544054832622, + "loss": 0.5126, + "step": 171720 + }, + { + "epoch": 8.529353332671104, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011765570676467667, + "loss": 0.4964, + "step": 171730 + }, + { + "epoch": 8.529850004966724, + "grad_norm": 0.1611328125, + "learning_rate": 0.00011761597298102713, + "loss": 0.4977, + "step": 171740 + }, + { + "epoch": 8.530346677262342, + "grad_norm": 0.1640625, + "learning_rate": 0.00011757623919737756, + "loss": 0.5222, + "step": 171750 + }, + { + "epoch": 8.530843349557962, + "grad_norm": 0.1845703125, + "learning_rate": 0.00011753650541372802, + "loss": 0.4701, + "step": 171760 + }, + { + "epoch": 8.53134002185358, + "grad_norm": 0.171875, + "learning_rate": 0.00011749677163007848, + "loss": 0.4865, + "step": 171770 + }, + { + "epoch": 8.5318366941492, + "grad_norm": 0.193359375, + "learning_rate": 0.00011745703784642894, + "loss": 0.4622, + "step": 171780 + }, + { + "epoch": 8.532333366444819, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011741730406277938, + "loss": 0.4803, + "step": 171790 + }, + { + "epoch": 8.532830038740439, + "grad_norm": 0.16796875, + "learning_rate": 0.00011737757027912984, + "loss": 0.4632, + "step": 171800 + }, + { + "epoch": 8.533326711036059, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001173378364954803, + "loss": 0.4871, + "step": 171810 + }, + { + "epoch": 8.533823383331677, + "grad_norm": 0.185546875, + "learning_rate": 0.00011729810271183075, + "loss": 0.5087, + "step": 171820 + }, + { + "epoch": 8.534320055627298, + "grad_norm": 0.1640625, + "learning_rate": 0.00011725836892818118, + "loss": 0.4876, + "step": 171830 + }, + { + "epoch": 8.534816727922916, + "grad_norm": 0.2021484375, + "learning_rate": 0.00011721863514453164, + "loss": 0.4843, + "step": 171840 + }, + { + "epoch": 8.535313400218536, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001171789013608821, + "loss": 0.4838, + "step": 171850 + }, + { + "epoch": 8.535810072514154, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011713916757723256, + "loss": 0.463, + "step": 171860 + }, + { + "epoch": 8.536306744809774, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011709943379358299, + "loss": 0.4703, + "step": 171870 + }, + { + "epoch": 8.536803417105395, + "grad_norm": 0.16796875, + "learning_rate": 0.00011705970000993345, + "loss": 0.4724, + "step": 171880 + }, + { + "epoch": 8.537300089401013, + "grad_norm": 0.189453125, + "learning_rate": 0.0001170199662262839, + "loss": 0.5123, + "step": 171890 + }, + { + "epoch": 8.537796761696633, + "grad_norm": 0.1884765625, + "learning_rate": 0.00011698023244263436, + "loss": 0.4887, + "step": 171900 + }, + { + "epoch": 8.538293433992251, + "grad_norm": 0.17578125, + "learning_rate": 0.00011694049865898479, + "loss": 0.4584, + "step": 171910 + }, + { + "epoch": 8.538790106287871, + "grad_norm": 0.1826171875, + "learning_rate": 0.00011690076487533525, + "loss": 0.4788, + "step": 171920 + }, + { + "epoch": 8.53928677858349, + "grad_norm": 0.171875, + "learning_rate": 0.00011686103109168571, + "loss": 0.5075, + "step": 171930 + }, + { + "epoch": 8.53978345087911, + "grad_norm": 0.16796875, + "learning_rate": 0.00011682129730803617, + "loss": 0.5154, + "step": 171940 + }, + { + "epoch": 8.54028012317473, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011678156352438661, + "loss": 0.4921, + "step": 171950 + }, + { + "epoch": 8.540776795470348, + "grad_norm": 0.1787109375, + "learning_rate": 0.00011674182974073707, + "loss": 0.4844, + "step": 171960 + }, + { + "epoch": 8.541273467765969, + "grad_norm": 0.1943359375, + "learning_rate": 0.00011670209595708753, + "loss": 0.4855, + "step": 171970 + }, + { + "epoch": 8.541770140061587, + "grad_norm": 0.1796875, + "learning_rate": 0.00011666236217343798, + "loss": 0.4719, + "step": 171980 + }, + { + "epoch": 8.542266812357207, + "grad_norm": 0.2373046875, + "learning_rate": 0.00011662262838978841, + "loss": 0.4725, + "step": 171990 + }, + { + "epoch": 8.542763484652825, + "grad_norm": 0.177734375, + "learning_rate": 0.00011658289460613887, + "loss": 0.4709, + "step": 172000 + }, + { + "epoch": 8.543260156948445, + "grad_norm": 0.1865234375, + "learning_rate": 0.00011654316082248933, + "loss": 0.4989, + "step": 172010 + }, + { + "epoch": 8.543756829244066, + "grad_norm": 0.1796875, + "learning_rate": 0.00011650342703883979, + "loss": 0.4963, + "step": 172020 + }, + { + "epoch": 8.544253501539684, + "grad_norm": 0.1748046875, + "learning_rate": 0.00011646369325519022, + "loss": 0.4863, + "step": 172030 + }, + { + "epoch": 8.544750173835304, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011642395947154068, + "loss": 0.4976, + "step": 172040 + }, + { + "epoch": 8.545246846130922, + "grad_norm": 0.1884765625, + "learning_rate": 0.00011638422568789113, + "loss": 0.4842, + "step": 172050 + }, + { + "epoch": 8.545743518426542, + "grad_norm": 0.1611328125, + "learning_rate": 0.00011634449190424159, + "loss": 0.4863, + "step": 172060 + }, + { + "epoch": 8.54624019072216, + "grad_norm": 0.2041015625, + "learning_rate": 0.00011630475812059205, + "loss": 0.4896, + "step": 172070 + }, + { + "epoch": 8.546736863017781, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011626502433694248, + "loss": 0.495, + "step": 172080 + }, + { + "epoch": 8.547233535313401, + "grad_norm": 0.18359375, + "learning_rate": 0.00011622529055329294, + "loss": 0.4976, + "step": 172090 + }, + { + "epoch": 8.54773020760902, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001161855567696434, + "loss": 0.5086, + "step": 172100 + }, + { + "epoch": 8.54822687990464, + "grad_norm": 0.1962890625, + "learning_rate": 0.00011614582298599385, + "loss": 0.4871, + "step": 172110 + }, + { + "epoch": 8.548723552200258, + "grad_norm": 0.171875, + "learning_rate": 0.0001161060892023443, + "loss": 0.4809, + "step": 172120 + }, + { + "epoch": 8.549220224495878, + "grad_norm": 0.1806640625, + "learning_rate": 0.00011606635541869476, + "loss": 0.4747, + "step": 172130 + }, + { + "epoch": 8.549716896791496, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001160266216350452, + "loss": 0.5212, + "step": 172140 + }, + { + "epoch": 8.550213569087116, + "grad_norm": 0.1865234375, + "learning_rate": 0.00011598688785139566, + "loss": 0.4953, + "step": 172150 + }, + { + "epoch": 8.550710241382736, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001159471540677461, + "loss": 0.4812, + "step": 172160 + }, + { + "epoch": 8.551206913678355, + "grad_norm": 0.1591796875, + "learning_rate": 0.00011590742028409656, + "loss": 0.4604, + "step": 172170 + }, + { + "epoch": 8.551703585973975, + "grad_norm": 0.177734375, + "learning_rate": 0.00011586768650044702, + "loss": 0.4723, + "step": 172180 + }, + { + "epoch": 8.552200258269593, + "grad_norm": 0.2021484375, + "learning_rate": 0.00011582795271679748, + "loss": 0.4686, + "step": 172190 + }, + { + "epoch": 8.552696930565213, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001157882189331479, + "loss": 0.4577, + "step": 172200 + }, + { + "epoch": 8.553193602860832, + "grad_norm": 0.197265625, + "learning_rate": 0.00011574848514949836, + "loss": 0.4925, + "step": 172210 + }, + { + "epoch": 8.553690275156452, + "grad_norm": 0.1796875, + "learning_rate": 0.00011570875136584882, + "loss": 0.5066, + "step": 172220 + }, + { + "epoch": 8.55418694745207, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011566901758219928, + "loss": 0.4873, + "step": 172230 + }, + { + "epoch": 8.55468361974769, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011562928379854971, + "loss": 0.4874, + "step": 172240 + }, + { + "epoch": 8.55518029204331, + "grad_norm": 0.1748046875, + "learning_rate": 0.00011558955001490017, + "loss": 0.4744, + "step": 172250 + }, + { + "epoch": 8.555676964338929, + "grad_norm": 0.16796875, + "learning_rate": 0.00011554981623125063, + "loss": 0.4727, + "step": 172260 + }, + { + "epoch": 8.556173636634549, + "grad_norm": 0.1591796875, + "learning_rate": 0.00011551008244760108, + "loss": 0.4891, + "step": 172270 + }, + { + "epoch": 8.556670308930167, + "grad_norm": 0.1806640625, + "learning_rate": 0.00011547034866395153, + "loss": 0.4895, + "step": 172280 + }, + { + "epoch": 8.557166981225787, + "grad_norm": 0.1796875, + "learning_rate": 0.00011543061488030197, + "loss": 0.4906, + "step": 172290 + }, + { + "epoch": 8.557663653521406, + "grad_norm": 0.2041015625, + "learning_rate": 0.00011539088109665243, + "loss": 0.493, + "step": 172300 + }, + { + "epoch": 8.558160325817026, + "grad_norm": 0.1806640625, + "learning_rate": 0.00011535114731300289, + "loss": 0.5026, + "step": 172310 + }, + { + "epoch": 8.558656998112646, + "grad_norm": 0.1826171875, + "learning_rate": 0.00011531141352935333, + "loss": 0.4587, + "step": 172320 + }, + { + "epoch": 8.559153670408264, + "grad_norm": 0.1572265625, + "learning_rate": 0.00011527167974570379, + "loss": 0.4745, + "step": 172330 + }, + { + "epoch": 8.559650342703884, + "grad_norm": 0.19921875, + "learning_rate": 0.00011523194596205425, + "loss": 0.4961, + "step": 172340 + }, + { + "epoch": 8.560147014999503, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001151922121784047, + "loss": 0.5004, + "step": 172350 + }, + { + "epoch": 8.560643687295123, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011515247839475514, + "loss": 0.4838, + "step": 172360 + }, + { + "epoch": 8.561140359590741, + "grad_norm": 0.19140625, + "learning_rate": 0.0001151127446111056, + "loss": 0.5345, + "step": 172370 + }, + { + "epoch": 8.561637031886361, + "grad_norm": 0.16796875, + "learning_rate": 0.00011507301082745605, + "loss": 0.4883, + "step": 172380 + }, + { + "epoch": 8.562133704181981, + "grad_norm": 0.19140625, + "learning_rate": 0.00011503327704380651, + "loss": 0.4802, + "step": 172390 + }, + { + "epoch": 8.5626303764776, + "grad_norm": 0.1787109375, + "learning_rate": 0.00011499354326015694, + "loss": 0.477, + "step": 172400 + }, + { + "epoch": 8.56312704877322, + "grad_norm": 0.18359375, + "learning_rate": 0.0001149538094765074, + "loss": 0.4884, + "step": 172410 + }, + { + "epoch": 8.563623721068838, + "grad_norm": 0.177734375, + "learning_rate": 0.00011491407569285786, + "loss": 0.4693, + "step": 172420 + }, + { + "epoch": 8.564120393364458, + "grad_norm": 0.1865234375, + "learning_rate": 0.00011487434190920831, + "loss": 0.4957, + "step": 172430 + }, + { + "epoch": 8.564617065660077, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011483460812555874, + "loss": 0.4737, + "step": 172440 + }, + { + "epoch": 8.565113737955697, + "grad_norm": 0.1640625, + "learning_rate": 0.0001147948743419092, + "loss": 0.5143, + "step": 172450 + }, + { + "epoch": 8.565610410251317, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011475514055825966, + "loss": 0.4957, + "step": 172460 + }, + { + "epoch": 8.566107082546935, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011471540677461012, + "loss": 0.475, + "step": 172470 + }, + { + "epoch": 8.566603754842555, + "grad_norm": 0.1591796875, + "learning_rate": 0.00011467567299096056, + "loss": 0.4944, + "step": 172480 + }, + { + "epoch": 8.567100427138174, + "grad_norm": 0.1865234375, + "learning_rate": 0.00011463593920731102, + "loss": 0.4913, + "step": 172490 + }, + { + "epoch": 8.567597099433794, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011459620542366148, + "loss": 0.4978, + "step": 172500 + }, + { + "epoch": 8.568093771729412, + "grad_norm": 0.17578125, + "learning_rate": 0.00011455647164001194, + "loss": 0.4725, + "step": 172510 + }, + { + "epoch": 8.568590444025032, + "grad_norm": 0.19140625, + "learning_rate": 0.0001145167378563624, + "loss": 0.5416, + "step": 172520 + }, + { + "epoch": 8.569087116320652, + "grad_norm": 0.1787109375, + "learning_rate": 0.00011447700407271282, + "loss": 0.4676, + "step": 172530 + }, + { + "epoch": 8.56958378861627, + "grad_norm": 0.1640625, + "learning_rate": 0.00011443727028906328, + "loss": 0.4885, + "step": 172540 + }, + { + "epoch": 8.57008046091189, + "grad_norm": 0.2158203125, + "learning_rate": 0.00011439753650541374, + "loss": 0.4661, + "step": 172550 + }, + { + "epoch": 8.57057713320751, + "grad_norm": 0.169921875, + "learning_rate": 0.0001143578027217642, + "loss": 0.4737, + "step": 172560 + }, + { + "epoch": 8.57107380550313, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011431806893811463, + "loss": 0.481, + "step": 172570 + }, + { + "epoch": 8.571570477798748, + "grad_norm": 0.1591796875, + "learning_rate": 0.00011427833515446509, + "loss": 0.4818, + "step": 172580 + }, + { + "epoch": 8.572067150094368, + "grad_norm": 0.197265625, + "learning_rate": 0.00011423860137081554, + "loss": 0.4792, + "step": 172590 + }, + { + "epoch": 8.572563822389988, + "grad_norm": 0.171875, + "learning_rate": 0.000114198867587166, + "loss": 0.5084, + "step": 172600 + }, + { + "epoch": 8.573060494685606, + "grad_norm": 0.189453125, + "learning_rate": 0.00011415913380351643, + "loss": 0.5065, + "step": 172610 + }, + { + "epoch": 8.573557166981226, + "grad_norm": 0.1640625, + "learning_rate": 0.00011411940001986689, + "loss": 0.5042, + "step": 172620 + }, + { + "epoch": 8.574053839276845, + "grad_norm": 0.17578125, + "learning_rate": 0.00011407966623621735, + "loss": 0.5009, + "step": 172630 + }, + { + "epoch": 8.574550511572465, + "grad_norm": 0.169921875, + "learning_rate": 0.0001140399324525678, + "loss": 0.4828, + "step": 172640 + }, + { + "epoch": 8.575047183868083, + "grad_norm": 0.19140625, + "learning_rate": 0.00011400019866891825, + "loss": 0.4809, + "step": 172650 + }, + { + "epoch": 8.575543856163703, + "grad_norm": 0.16796875, + "learning_rate": 0.00011396046488526871, + "loss": 0.4985, + "step": 172660 + }, + { + "epoch": 8.576040528459323, + "grad_norm": 0.1611328125, + "learning_rate": 0.00011392073110161917, + "loss": 0.4727, + "step": 172670 + }, + { + "epoch": 8.576537200754942, + "grad_norm": 0.181640625, + "learning_rate": 0.00011388099731796962, + "loss": 0.4763, + "step": 172680 + }, + { + "epoch": 8.577033873050562, + "grad_norm": 0.1923828125, + "learning_rate": 0.00011384126353432005, + "loss": 0.4663, + "step": 172690 + }, + { + "epoch": 8.57753054534618, + "grad_norm": 0.2001953125, + "learning_rate": 0.00011380152975067051, + "loss": 0.4856, + "step": 172700 + }, + { + "epoch": 8.5780272176418, + "grad_norm": 0.1572265625, + "learning_rate": 0.00011376179596702097, + "loss": 0.4837, + "step": 172710 + }, + { + "epoch": 8.578523889937419, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011372206218337143, + "loss": 0.49, + "step": 172720 + }, + { + "epoch": 8.579020562233039, + "grad_norm": 0.181640625, + "learning_rate": 0.00011368232839972186, + "loss": 0.4778, + "step": 172730 + }, + { + "epoch": 8.579517234528659, + "grad_norm": 0.1943359375, + "learning_rate": 0.00011364259461607232, + "loss": 0.5232, + "step": 172740 + }, + { + "epoch": 8.580013906824277, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011360286083242277, + "loss": 0.5044, + "step": 172750 + }, + { + "epoch": 8.580510579119897, + "grad_norm": 0.177734375, + "learning_rate": 0.00011356312704877323, + "loss": 0.5027, + "step": 172760 + }, + { + "epoch": 8.581007251415516, + "grad_norm": 0.2080078125, + "learning_rate": 0.00011352339326512366, + "loss": 0.473, + "step": 172770 + }, + { + "epoch": 8.581503923711136, + "grad_norm": 0.1630859375, + "learning_rate": 0.00011348365948147412, + "loss": 0.4845, + "step": 172780 + }, + { + "epoch": 8.582000596006754, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011344392569782458, + "loss": 0.4811, + "step": 172790 + }, + { + "epoch": 8.582497268302374, + "grad_norm": 0.16015625, + "learning_rate": 0.00011340419191417504, + "loss": 0.4984, + "step": 172800 + }, + { + "epoch": 8.582993940597994, + "grad_norm": 0.1826171875, + "learning_rate": 0.00011336445813052548, + "loss": 0.4727, + "step": 172810 + }, + { + "epoch": 8.583490612893613, + "grad_norm": 0.1943359375, + "learning_rate": 0.00011332472434687594, + "loss": 0.5024, + "step": 172820 + }, + { + "epoch": 8.583987285189233, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001132849905632264, + "loss": 0.4802, + "step": 172830 + }, + { + "epoch": 8.584483957484851, + "grad_norm": 0.171875, + "learning_rate": 0.00011324525677957685, + "loss": 0.4615, + "step": 172840 + }, + { + "epoch": 8.584980629780471, + "grad_norm": 0.1904296875, + "learning_rate": 0.00011320552299592728, + "loss": 0.5001, + "step": 172850 + }, + { + "epoch": 8.58547730207609, + "grad_norm": 0.1640625, + "learning_rate": 0.00011316578921227774, + "loss": 0.48, + "step": 172860 + }, + { + "epoch": 8.58597397437171, + "grad_norm": 0.177734375, + "learning_rate": 0.0001131260554286282, + "loss": 0.4833, + "step": 172870 + }, + { + "epoch": 8.58647064666733, + "grad_norm": 0.2177734375, + "learning_rate": 0.00011308632164497866, + "loss": 0.4868, + "step": 172880 + }, + { + "epoch": 8.586967318962948, + "grad_norm": 0.1845703125, + "learning_rate": 0.00011304658786132909, + "loss": 0.4963, + "step": 172890 + }, + { + "epoch": 8.587463991258568, + "grad_norm": 0.2080078125, + "learning_rate": 0.00011300685407767955, + "loss": 0.4841, + "step": 172900 + }, + { + "epoch": 8.587960663554187, + "grad_norm": 0.1884765625, + "learning_rate": 0.00011296712029403, + "loss": 0.5264, + "step": 172910 + }, + { + "epoch": 8.588457335849807, + "grad_norm": 0.16796875, + "learning_rate": 0.00011292738651038046, + "loss": 0.4982, + "step": 172920 + }, + { + "epoch": 8.588954008145425, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011288765272673092, + "loss": 0.4825, + "step": 172930 + }, + { + "epoch": 8.589450680441045, + "grad_norm": 0.193359375, + "learning_rate": 0.00011284791894308135, + "loss": 0.5479, + "step": 172940 + }, + { + "epoch": 8.589947352736665, + "grad_norm": 0.1796875, + "learning_rate": 0.00011280818515943181, + "loss": 0.4673, + "step": 172950 + }, + { + "epoch": 8.590444025032284, + "grad_norm": 0.197265625, + "learning_rate": 0.00011276845137578227, + "loss": 0.4555, + "step": 172960 + }, + { + "epoch": 8.590940697327904, + "grad_norm": 0.1884765625, + "learning_rate": 0.00011272871759213272, + "loss": 0.501, + "step": 172970 + }, + { + "epoch": 8.591437369623522, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011268898380848317, + "loss": 0.4642, + "step": 172980 + }, + { + "epoch": 8.591934041919142, + "grad_norm": 0.17578125, + "learning_rate": 0.00011264925002483361, + "loss": 0.4943, + "step": 172990 + }, + { + "epoch": 8.59243071421476, + "grad_norm": 0.2060546875, + "learning_rate": 0.00011260951624118407, + "loss": 0.4543, + "step": 173000 + }, + { + "epoch": 8.59292738651038, + "grad_norm": 0.158203125, + "learning_rate": 0.00011256978245753453, + "loss": 0.4762, + "step": 173010 + }, + { + "epoch": 8.593424058806, + "grad_norm": 0.181640625, + "learning_rate": 0.00011253004867388497, + "loss": 0.4986, + "step": 173020 + }, + { + "epoch": 8.593920731101619, + "grad_norm": 0.2392578125, + "learning_rate": 0.00011249031489023543, + "loss": 0.5028, + "step": 173030 + }, + { + "epoch": 8.59441740339724, + "grad_norm": 0.1806640625, + "learning_rate": 0.00011245058110658589, + "loss": 0.5066, + "step": 173040 + }, + { + "epoch": 8.594914075692857, + "grad_norm": 0.1865234375, + "learning_rate": 0.00011241084732293635, + "loss": 0.4852, + "step": 173050 + }, + { + "epoch": 8.595410747988478, + "grad_norm": 0.17578125, + "learning_rate": 0.00011237111353928678, + "loss": 0.4797, + "step": 173060 + }, + { + "epoch": 8.595907420284096, + "grad_norm": 0.20703125, + "learning_rate": 0.00011233137975563723, + "loss": 0.4802, + "step": 173070 + }, + { + "epoch": 8.596404092579716, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011229164597198769, + "loss": 0.5015, + "step": 173080 + }, + { + "epoch": 8.596900764875336, + "grad_norm": 0.234375, + "learning_rate": 0.00011225191218833815, + "loss": 0.4566, + "step": 173090 + }, + { + "epoch": 8.597397437170955, + "grad_norm": 0.251953125, + "learning_rate": 0.00011221217840468858, + "loss": 0.4922, + "step": 173100 + }, + { + "epoch": 8.597894109466575, + "grad_norm": 0.1796875, + "learning_rate": 0.00011217244462103904, + "loss": 0.5129, + "step": 173110 + }, + { + "epoch": 8.598390781762193, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001121327108373895, + "loss": 0.5091, + "step": 173120 + }, + { + "epoch": 8.598887454057813, + "grad_norm": 0.1845703125, + "learning_rate": 0.00011209297705373995, + "loss": 0.5011, + "step": 173130 + }, + { + "epoch": 8.599384126353431, + "grad_norm": 0.1875, + "learning_rate": 0.00011205324327009038, + "loss": 0.4902, + "step": 173140 + }, + { + "epoch": 8.599880798649052, + "grad_norm": 0.166015625, + "learning_rate": 0.00011201350948644084, + "loss": 0.4722, + "step": 173150 + }, + { + "epoch": 8.600377470944672, + "grad_norm": 0.224609375, + "learning_rate": 0.0001119737757027913, + "loss": 0.4924, + "step": 173160 + }, + { + "epoch": 8.60087414324029, + "grad_norm": 0.1796875, + "learning_rate": 0.00011193404191914176, + "loss": 0.5015, + "step": 173170 + }, + { + "epoch": 8.60137081553591, + "grad_norm": 0.19921875, + "learning_rate": 0.0001118943081354922, + "loss": 0.4771, + "step": 173180 + }, + { + "epoch": 8.601867487831528, + "grad_norm": 0.181640625, + "learning_rate": 0.00011185457435184266, + "loss": 0.4567, + "step": 173190 + }, + { + "epoch": 8.602364160127149, + "grad_norm": 0.15625, + "learning_rate": 0.00011181484056819312, + "loss": 0.5131, + "step": 173200 + }, + { + "epoch": 8.602860832422767, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011177510678454358, + "loss": 0.4573, + "step": 173210 + }, + { + "epoch": 8.603357504718387, + "grad_norm": 0.1728515625, + "learning_rate": 0.000111735373000894, + "loss": 0.4729, + "step": 173220 + }, + { + "epoch": 8.603854177014005, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011169563921724446, + "loss": 0.5176, + "step": 173230 + }, + { + "epoch": 8.604350849309625, + "grad_norm": 0.2041015625, + "learning_rate": 0.00011165590543359492, + "loss": 0.4943, + "step": 173240 + }, + { + "epoch": 8.604847521605246, + "grad_norm": 0.1904296875, + "learning_rate": 0.00011161617164994538, + "loss": 0.4649, + "step": 173250 + }, + { + "epoch": 8.605344193900864, + "grad_norm": 0.1982421875, + "learning_rate": 0.00011157643786629581, + "loss": 0.5047, + "step": 173260 + }, + { + "epoch": 8.605840866196484, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011153670408264627, + "loss": 0.4832, + "step": 173270 + }, + { + "epoch": 8.606337538492102, + "grad_norm": 0.189453125, + "learning_rate": 0.00011149697029899673, + "loss": 0.4853, + "step": 173280 + }, + { + "epoch": 8.606834210787722, + "grad_norm": 0.1865234375, + "learning_rate": 0.00011145723651534718, + "loss": 0.4814, + "step": 173290 + }, + { + "epoch": 8.60733088308334, + "grad_norm": 0.15625, + "learning_rate": 0.00011141750273169761, + "loss": 0.4646, + "step": 173300 + }, + { + "epoch": 8.607827555378961, + "grad_norm": 0.189453125, + "learning_rate": 0.00011137776894804807, + "loss": 0.4631, + "step": 173310 + }, + { + "epoch": 8.608324227674581, + "grad_norm": 0.1806640625, + "learning_rate": 0.00011133803516439853, + "loss": 0.4459, + "step": 173320 + }, + { + "epoch": 8.6088208999702, + "grad_norm": 0.181640625, + "learning_rate": 0.00011129830138074899, + "loss": 0.4805, + "step": 173330 + }, + { + "epoch": 8.60931757226582, + "grad_norm": 0.1943359375, + "learning_rate": 0.00011125856759709945, + "loss": 0.4772, + "step": 173340 + }, + { + "epoch": 8.609814244561438, + "grad_norm": 0.1875, + "learning_rate": 0.00011121883381344989, + "loss": 0.4769, + "step": 173350 + }, + { + "epoch": 8.610310916857058, + "grad_norm": 0.162109375, + "learning_rate": 0.00011117910002980035, + "loss": 0.4761, + "step": 173360 + }, + { + "epoch": 8.610807589152676, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001111393662461508, + "loss": 0.4986, + "step": 173370 + }, + { + "epoch": 8.611304261448296, + "grad_norm": 0.234375, + "learning_rate": 0.00011109963246250126, + "loss": 0.4688, + "step": 173380 + }, + { + "epoch": 8.611800933743917, + "grad_norm": 0.197265625, + "learning_rate": 0.0001110598986788517, + "loss": 0.4831, + "step": 173390 + }, + { + "epoch": 8.612297606039535, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011102016489520215, + "loss": 0.5004, + "step": 173400 + }, + { + "epoch": 8.612794278335155, + "grad_norm": 0.189453125, + "learning_rate": 0.00011098043111155261, + "loss": 0.4872, + "step": 173410 + }, + { + "epoch": 8.613290950630773, + "grad_norm": 0.1611328125, + "learning_rate": 0.00011094069732790307, + "loss": 0.4881, + "step": 173420 + }, + { + "epoch": 8.613787622926393, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001109009635442535, + "loss": 0.4759, + "step": 173430 + }, + { + "epoch": 8.614284295222012, + "grad_norm": 0.18359375, + "learning_rate": 0.00011086122976060396, + "loss": 0.489, + "step": 173440 + }, + { + "epoch": 8.614780967517632, + "grad_norm": 0.20703125, + "learning_rate": 0.00011082149597695441, + "loss": 0.5264, + "step": 173450 + }, + { + "epoch": 8.615277639813252, + "grad_norm": 0.16796875, + "learning_rate": 0.00011078176219330487, + "loss": 0.4752, + "step": 173460 + }, + { + "epoch": 8.61577431210887, + "grad_norm": 0.1796875, + "learning_rate": 0.0001107420284096553, + "loss": 0.508, + "step": 173470 + }, + { + "epoch": 8.61627098440449, + "grad_norm": 0.1806640625, + "learning_rate": 0.00011070229462600576, + "loss": 0.4833, + "step": 173480 + }, + { + "epoch": 8.616767656700109, + "grad_norm": 0.2236328125, + "learning_rate": 0.00011066256084235622, + "loss": 0.4774, + "step": 173490 + }, + { + "epoch": 8.617264328995729, + "grad_norm": 0.1630859375, + "learning_rate": 0.00011062282705870668, + "loss": 0.4662, + "step": 173500 + }, + { + "epoch": 8.617761001291347, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011058309327505712, + "loss": 0.5317, + "step": 173510 + }, + { + "epoch": 8.618257673586967, + "grad_norm": 0.1962890625, + "learning_rate": 0.00011054335949140758, + "loss": 0.4799, + "step": 173520 + }, + { + "epoch": 8.618754345882587, + "grad_norm": 0.1826171875, + "learning_rate": 0.00011050362570775804, + "loss": 0.4866, + "step": 173530 + }, + { + "epoch": 8.619251018178206, + "grad_norm": 0.1796875, + "learning_rate": 0.0001104638919241085, + "loss": 0.4851, + "step": 173540 + }, + { + "epoch": 8.619747690473826, + "grad_norm": 0.171875, + "learning_rate": 0.00011042415814045892, + "loss": 0.4892, + "step": 173550 + }, + { + "epoch": 8.620244362769444, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011038442435680938, + "loss": 0.5096, + "step": 173560 + }, + { + "epoch": 8.620741035065064, + "grad_norm": 0.2265625, + "learning_rate": 0.00011034469057315984, + "loss": 0.457, + "step": 173570 + }, + { + "epoch": 8.621237707360683, + "grad_norm": 0.1640625, + "learning_rate": 0.0001103049567895103, + "loss": 0.5146, + "step": 173580 + }, + { + "epoch": 8.621734379656303, + "grad_norm": 0.1796875, + "learning_rate": 0.00011026522300586073, + "loss": 0.4439, + "step": 173590 + }, + { + "epoch": 8.622231051951921, + "grad_norm": 0.166015625, + "learning_rate": 0.00011022548922221119, + "loss": 0.5048, + "step": 173600 + }, + { + "epoch": 8.622727724247541, + "grad_norm": 0.177734375, + "learning_rate": 0.00011018575543856164, + "loss": 0.4622, + "step": 173610 + }, + { + "epoch": 8.623224396543161, + "grad_norm": 0.177734375, + "learning_rate": 0.0001101460216549121, + "loss": 0.503, + "step": 173620 + }, + { + "epoch": 8.62372106883878, + "grad_norm": 0.1796875, + "learning_rate": 0.00011010628787126253, + "loss": 0.5009, + "step": 173630 + }, + { + "epoch": 8.6242177411344, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011006655408761299, + "loss": 0.4883, + "step": 173640 + }, + { + "epoch": 8.624714413430018, + "grad_norm": 0.16796875, + "learning_rate": 0.00011002682030396345, + "loss": 0.4946, + "step": 173650 + }, + { + "epoch": 8.625211085725638, + "grad_norm": 0.197265625, + "learning_rate": 0.0001099870865203139, + "loss": 0.5035, + "step": 173660 + }, + { + "epoch": 8.625707758021257, + "grad_norm": 0.17578125, + "learning_rate": 0.00010994735273666435, + "loss": 0.4953, + "step": 173670 + }, + { + "epoch": 8.626204430316877, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010990761895301481, + "loss": 0.5011, + "step": 173680 + }, + { + "epoch": 8.626701102612497, + "grad_norm": 0.162109375, + "learning_rate": 0.00010986788516936527, + "loss": 0.4745, + "step": 173690 + }, + { + "epoch": 8.627197774908115, + "grad_norm": 0.1640625, + "learning_rate": 0.00010982815138571571, + "loss": 0.4817, + "step": 173700 + }, + { + "epoch": 8.627694447203735, + "grad_norm": 0.1865234375, + "learning_rate": 0.00010978841760206615, + "loss": 0.5043, + "step": 173710 + }, + { + "epoch": 8.628191119499354, + "grad_norm": 0.171875, + "learning_rate": 0.00010974868381841661, + "loss": 0.4751, + "step": 173720 + }, + { + "epoch": 8.628687791794974, + "grad_norm": 0.1640625, + "learning_rate": 0.00010970895003476707, + "loss": 0.478, + "step": 173730 + }, + { + "epoch": 8.629184464090592, + "grad_norm": 0.1943359375, + "learning_rate": 0.00010966921625111753, + "loss": 0.4573, + "step": 173740 + }, + { + "epoch": 8.629681136386212, + "grad_norm": 0.1640625, + "learning_rate": 0.00010962948246746796, + "loss": 0.504, + "step": 173750 + }, + { + "epoch": 8.630177808681832, + "grad_norm": 0.1533203125, + "learning_rate": 0.00010958974868381842, + "loss": 0.4997, + "step": 173760 + }, + { + "epoch": 8.63067448097745, + "grad_norm": 0.1875, + "learning_rate": 0.00010955001490016887, + "loss": 0.4746, + "step": 173770 + }, + { + "epoch": 8.63117115327307, + "grad_norm": 0.21484375, + "learning_rate": 0.00010951028111651933, + "loss": 0.4846, + "step": 173780 + }, + { + "epoch": 8.63166782556869, + "grad_norm": 0.1845703125, + "learning_rate": 0.00010947054733286979, + "loss": 0.4894, + "step": 173790 + }, + { + "epoch": 8.63216449786431, + "grad_norm": 0.201171875, + "learning_rate": 0.00010943081354922022, + "loss": 0.485, + "step": 173800 + }, + { + "epoch": 8.632661170159928, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010939107976557068, + "loss": 0.4527, + "step": 173810 + }, + { + "epoch": 8.633157842455548, + "grad_norm": 0.18359375, + "learning_rate": 0.00010935134598192114, + "loss": 0.51, + "step": 173820 + }, + { + "epoch": 8.633654514751168, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001093116121982716, + "loss": 0.4759, + "step": 173830 + }, + { + "epoch": 8.634151187046786, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010927187841462202, + "loss": 0.4881, + "step": 173840 + }, + { + "epoch": 8.634647859342406, + "grad_norm": 0.23046875, + "learning_rate": 0.00010923214463097248, + "loss": 0.5048, + "step": 173850 + }, + { + "epoch": 8.635144531638025, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010919241084732294, + "loss": 0.522, + "step": 173860 + }, + { + "epoch": 8.635641203933645, + "grad_norm": 0.197265625, + "learning_rate": 0.0001091526770636734, + "loss": 0.4871, + "step": 173870 + }, + { + "epoch": 8.636137876229263, + "grad_norm": 0.1787109375, + "learning_rate": 0.00010911294328002384, + "loss": 0.4856, + "step": 173880 + }, + { + "epoch": 8.636634548524883, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001090732094963743, + "loss": 0.5151, + "step": 173890 + }, + { + "epoch": 8.637131220820503, + "grad_norm": 0.1982421875, + "learning_rate": 0.00010903347571272476, + "loss": 0.4928, + "step": 173900 + }, + { + "epoch": 8.637627893116122, + "grad_norm": 0.16015625, + "learning_rate": 0.00010899374192907522, + "loss": 0.4525, + "step": 173910 + }, + { + "epoch": 8.638124565411742, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010895400814542565, + "loss": 0.4856, + "step": 173920 + }, + { + "epoch": 8.63862123770736, + "grad_norm": 0.1640625, + "learning_rate": 0.0001089142743617761, + "loss": 0.4888, + "step": 173930 + }, + { + "epoch": 8.63911791000298, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010887454057812656, + "loss": 0.4846, + "step": 173940 + }, + { + "epoch": 8.639614582298599, + "grad_norm": 0.1962890625, + "learning_rate": 0.00010883480679447702, + "loss": 0.4762, + "step": 173950 + }, + { + "epoch": 8.640111254594219, + "grad_norm": 0.173828125, + "learning_rate": 0.00010879507301082745, + "loss": 0.4901, + "step": 173960 + }, + { + "epoch": 8.640607926889839, + "grad_norm": 0.1826171875, + "learning_rate": 0.00010875533922717791, + "loss": 0.4973, + "step": 173970 + }, + { + "epoch": 8.641104599185457, + "grad_norm": 0.201171875, + "learning_rate": 0.00010871560544352837, + "loss": 0.4961, + "step": 173980 + }, + { + "epoch": 8.641601271481077, + "grad_norm": 0.1640625, + "learning_rate": 0.00010867587165987882, + "loss": 0.484, + "step": 173990 + }, + { + "epoch": 8.642097943776696, + "grad_norm": 0.169921875, + "learning_rate": 0.00010863613787622925, + "loss": 0.4745, + "step": 174000 + }, + { + "epoch": 8.642594616072316, + "grad_norm": 0.162109375, + "learning_rate": 0.00010859640409257971, + "loss": 0.4634, + "step": 174010 + }, + { + "epoch": 8.643091288367934, + "grad_norm": 0.16796875, + "learning_rate": 0.00010855667030893017, + "loss": 0.4951, + "step": 174020 + }, + { + "epoch": 8.643587960663554, + "grad_norm": 0.173828125, + "learning_rate": 0.00010851693652528063, + "loss": 0.4963, + "step": 174030 + }, + { + "epoch": 8.644084632959174, + "grad_norm": 0.19140625, + "learning_rate": 0.00010847720274163107, + "loss": 0.4981, + "step": 174040 + }, + { + "epoch": 8.644581305254793, + "grad_norm": 0.1845703125, + "learning_rate": 0.00010843746895798153, + "loss": 0.5023, + "step": 174050 + }, + { + "epoch": 8.645077977550413, + "grad_norm": 0.1650390625, + "learning_rate": 0.00010839773517433199, + "loss": 0.509, + "step": 174060 + }, + { + "epoch": 8.645574649846031, + "grad_norm": 0.2001953125, + "learning_rate": 0.00010835800139068245, + "loss": 0.465, + "step": 174070 + }, + { + "epoch": 8.646071322141651, + "grad_norm": 0.177734375, + "learning_rate": 0.00010831826760703288, + "loss": 0.496, + "step": 174080 + }, + { + "epoch": 8.64656799443727, + "grad_norm": 0.1826171875, + "learning_rate": 0.00010827853382338333, + "loss": 0.4552, + "step": 174090 + }, + { + "epoch": 8.64706466673289, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010823880003973379, + "loss": 0.4964, + "step": 174100 + }, + { + "epoch": 8.64756133902851, + "grad_norm": 0.193359375, + "learning_rate": 0.00010819906625608425, + "loss": 0.4778, + "step": 174110 + }, + { + "epoch": 8.648058011324128, + "grad_norm": 0.20703125, + "learning_rate": 0.00010815933247243468, + "loss": 0.4903, + "step": 174120 + }, + { + "epoch": 8.648554683619748, + "grad_norm": 0.19140625, + "learning_rate": 0.00010811959868878514, + "loss": 0.4899, + "step": 174130 + }, + { + "epoch": 8.649051355915367, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001080798649051356, + "loss": 0.5264, + "step": 174140 + }, + { + "epoch": 8.649548028210987, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010804013112148605, + "loss": 0.4519, + "step": 174150 + }, + { + "epoch": 8.650044700506605, + "grad_norm": 0.1806640625, + "learning_rate": 0.00010800039733783648, + "loss": 0.4579, + "step": 174160 + }, + { + "epoch": 8.650541372802225, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010796066355418694, + "loss": 0.4924, + "step": 174170 + }, + { + "epoch": 8.651038045097845, + "grad_norm": 0.171875, + "learning_rate": 0.0001079209297705374, + "loss": 0.4829, + "step": 174180 + }, + { + "epoch": 8.651534717393464, + "grad_norm": 0.1923828125, + "learning_rate": 0.00010788119598688786, + "loss": 0.495, + "step": 174190 + }, + { + "epoch": 8.652031389689084, + "grad_norm": 0.19921875, + "learning_rate": 0.00010784146220323832, + "loss": 0.496, + "step": 174200 + }, + { + "epoch": 8.652528061984702, + "grad_norm": 0.1826171875, + "learning_rate": 0.00010780172841958876, + "loss": 0.4862, + "step": 174210 + }, + { + "epoch": 8.653024734280322, + "grad_norm": 0.171875, + "learning_rate": 0.00010776199463593922, + "loss": 0.4759, + "step": 174220 + }, + { + "epoch": 8.65352140657594, + "grad_norm": 0.169921875, + "learning_rate": 0.00010772226085228968, + "loss": 0.4728, + "step": 174230 + }, + { + "epoch": 8.65401807887156, + "grad_norm": 0.1611328125, + "learning_rate": 0.00010768252706864013, + "loss": 0.4325, + "step": 174240 + }, + { + "epoch": 8.65451475116718, + "grad_norm": 0.18359375, + "learning_rate": 0.00010764279328499056, + "loss": 0.5091, + "step": 174250 + }, + { + "epoch": 8.655011423462799, + "grad_norm": 0.169921875, + "learning_rate": 0.00010760305950134102, + "loss": 0.5027, + "step": 174260 + }, + { + "epoch": 8.65550809575842, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010756332571769148, + "loss": 0.4376, + "step": 174270 + }, + { + "epoch": 8.656004768054038, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010752359193404194, + "loss": 0.4668, + "step": 174280 + }, + { + "epoch": 8.656501440349658, + "grad_norm": 0.171875, + "learning_rate": 0.00010748385815039237, + "loss": 0.4832, + "step": 174290 + }, + { + "epoch": 8.656998112645276, + "grad_norm": 0.2060546875, + "learning_rate": 0.00010744412436674283, + "loss": 0.4939, + "step": 174300 + }, + { + "epoch": 8.657494784940896, + "grad_norm": 0.1904296875, + "learning_rate": 0.00010740439058309328, + "loss": 0.4924, + "step": 174310 + }, + { + "epoch": 8.657991457236516, + "grad_norm": 0.1787109375, + "learning_rate": 0.00010736465679944374, + "loss": 0.4954, + "step": 174320 + }, + { + "epoch": 8.658488129532135, + "grad_norm": 0.185546875, + "learning_rate": 0.00010732492301579417, + "loss": 0.4583, + "step": 174330 + }, + { + "epoch": 8.658984801827755, + "grad_norm": 0.16796875, + "learning_rate": 0.00010728518923214463, + "loss": 0.4932, + "step": 174340 + }, + { + "epoch": 8.659481474123373, + "grad_norm": 0.18359375, + "learning_rate": 0.00010724545544849509, + "loss": 0.4873, + "step": 174350 + }, + { + "epoch": 8.659978146418993, + "grad_norm": 0.177734375, + "learning_rate": 0.00010720572166484555, + "loss": 0.4775, + "step": 174360 + }, + { + "epoch": 8.660474818714611, + "grad_norm": 0.1796875, + "learning_rate": 0.00010716598788119599, + "loss": 0.4945, + "step": 174370 + }, + { + "epoch": 8.660971491010232, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010712625409754645, + "loss": 0.4903, + "step": 174380 + }, + { + "epoch": 8.661468163305852, + "grad_norm": 0.1640625, + "learning_rate": 0.0001070865203138969, + "loss": 0.4777, + "step": 174390 + }, + { + "epoch": 8.66196483560147, + "grad_norm": 0.2060546875, + "learning_rate": 0.00010704678653024735, + "loss": 0.4987, + "step": 174400 + }, + { + "epoch": 8.66246150789709, + "grad_norm": 0.173828125, + "learning_rate": 0.0001070070527465978, + "loss": 0.4975, + "step": 174410 + }, + { + "epoch": 8.662958180192708, + "grad_norm": 0.1787109375, + "learning_rate": 0.00010696731896294825, + "loss": 0.4891, + "step": 174420 + }, + { + "epoch": 8.663454852488329, + "grad_norm": 0.2041015625, + "learning_rate": 0.00010692758517929871, + "loss": 0.463, + "step": 174430 + }, + { + "epoch": 8.663951524783947, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010688785139564917, + "loss": 0.4858, + "step": 174440 + }, + { + "epoch": 8.664448197079567, + "grad_norm": 0.16796875, + "learning_rate": 0.0001068481176119996, + "loss": 0.45, + "step": 174450 + }, + { + "epoch": 8.664944869375187, + "grad_norm": 0.216796875, + "learning_rate": 0.00010680838382835006, + "loss": 0.4699, + "step": 174460 + }, + { + "epoch": 8.665441541670805, + "grad_norm": 0.169921875, + "learning_rate": 0.00010676865004470051, + "loss": 0.4936, + "step": 174470 + }, + { + "epoch": 8.665938213966426, + "grad_norm": 0.1611328125, + "learning_rate": 0.00010672891626105097, + "loss": 0.4886, + "step": 174480 + }, + { + "epoch": 8.666434886262044, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001066891824774014, + "loss": 0.4716, + "step": 174490 + }, + { + "epoch": 8.666931558557664, + "grad_norm": 0.1923828125, + "learning_rate": 0.00010664944869375186, + "loss": 0.4806, + "step": 174500 + }, + { + "epoch": 8.667428230853282, + "grad_norm": 0.1845703125, + "learning_rate": 0.00010660971491010232, + "loss": 0.4962, + "step": 174510 + }, + { + "epoch": 8.667924903148903, + "grad_norm": 0.189453125, + "learning_rate": 0.00010656998112645278, + "loss": 0.4882, + "step": 174520 + }, + { + "epoch": 8.668421575444523, + "grad_norm": 0.1787109375, + "learning_rate": 0.00010653024734280322, + "loss": 0.5009, + "step": 174530 + }, + { + "epoch": 8.668918247740141, + "grad_norm": 0.1826171875, + "learning_rate": 0.00010649051355915368, + "loss": 0.5219, + "step": 174540 + }, + { + "epoch": 8.669414920035761, + "grad_norm": 0.1904296875, + "learning_rate": 0.00010645077977550412, + "loss": 0.5139, + "step": 174550 + }, + { + "epoch": 8.66991159233138, + "grad_norm": 0.197265625, + "learning_rate": 0.00010641104599185458, + "loss": 0.4798, + "step": 174560 + }, + { + "epoch": 8.670408264627, + "grad_norm": 0.189453125, + "learning_rate": 0.00010637131220820502, + "loss": 0.481, + "step": 174570 + }, + { + "epoch": 8.670904936922618, + "grad_norm": 0.177734375, + "learning_rate": 0.00010633157842455548, + "loss": 0.4926, + "step": 174580 + }, + { + "epoch": 8.671401609218238, + "grad_norm": 0.173828125, + "learning_rate": 0.00010629184464090594, + "loss": 0.4913, + "step": 174590 + }, + { + "epoch": 8.671898281513856, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001062521108572564, + "loss": 0.4824, + "step": 174600 + }, + { + "epoch": 8.672394953809476, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010621237707360685, + "loss": 0.4928, + "step": 174610 + }, + { + "epoch": 8.672891626105097, + "grad_norm": 0.169921875, + "learning_rate": 0.00010617264328995729, + "loss": 0.5017, + "step": 174620 + }, + { + "epoch": 8.673388298400715, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010613290950630774, + "loss": 0.5175, + "step": 174630 + }, + { + "epoch": 8.673884970696335, + "grad_norm": 0.169921875, + "learning_rate": 0.0001060931757226582, + "loss": 0.5026, + "step": 174640 + }, + { + "epoch": 8.674381642991953, + "grad_norm": 0.17578125, + "learning_rate": 0.00010605344193900866, + "loss": 0.4927, + "step": 174650 + }, + { + "epoch": 8.674878315287573, + "grad_norm": 0.171875, + "learning_rate": 0.00010601370815535909, + "loss": 0.4765, + "step": 174660 + }, + { + "epoch": 8.675374987583192, + "grad_norm": 0.181640625, + "learning_rate": 0.00010597397437170955, + "loss": 0.4778, + "step": 174670 + }, + { + "epoch": 8.675871659878812, + "grad_norm": 0.1953125, + "learning_rate": 0.00010593424058806, + "loss": 0.4864, + "step": 174680 + }, + { + "epoch": 8.676368332174432, + "grad_norm": 0.16796875, + "learning_rate": 0.00010589450680441046, + "loss": 0.5022, + "step": 174690 + }, + { + "epoch": 8.67686500447005, + "grad_norm": 0.1875, + "learning_rate": 0.0001058547730207609, + "loss": 0.4873, + "step": 174700 + }, + { + "epoch": 8.67736167676567, + "grad_norm": 0.1865234375, + "learning_rate": 0.00010581503923711135, + "loss": 0.4798, + "step": 174710 + }, + { + "epoch": 8.677858349061289, + "grad_norm": 0.181640625, + "learning_rate": 0.00010577530545346181, + "loss": 0.4645, + "step": 174720 + }, + { + "epoch": 8.678355021356909, + "grad_norm": 0.205078125, + "learning_rate": 0.00010573557166981227, + "loss": 0.498, + "step": 174730 + }, + { + "epoch": 8.678851693652527, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010569583788616271, + "loss": 0.474, + "step": 174740 + }, + { + "epoch": 8.679348365948147, + "grad_norm": 0.1806640625, + "learning_rate": 0.00010565610410251317, + "loss": 0.4782, + "step": 174750 + }, + { + "epoch": 8.679845038243768, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010561637031886363, + "loss": 0.4497, + "step": 174760 + }, + { + "epoch": 8.680341710539386, + "grad_norm": 0.2001953125, + "learning_rate": 0.00010557663653521408, + "loss": 0.4833, + "step": 174770 + }, + { + "epoch": 8.680838382835006, + "grad_norm": 0.185546875, + "learning_rate": 0.00010553690275156452, + "loss": 0.4923, + "step": 174780 + }, + { + "epoch": 8.681335055130624, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010549716896791497, + "loss": 0.4748, + "step": 174790 + }, + { + "epoch": 8.681831727426244, + "grad_norm": 0.201171875, + "learning_rate": 0.00010545743518426543, + "loss": 0.4766, + "step": 174800 + }, + { + "epoch": 8.682328399721863, + "grad_norm": 0.17578125, + "learning_rate": 0.00010541770140061589, + "loss": 0.5165, + "step": 174810 + }, + { + "epoch": 8.682825072017483, + "grad_norm": 0.16796875, + "learning_rate": 0.00010537796761696632, + "loss": 0.4782, + "step": 174820 + }, + { + "epoch": 8.683321744313103, + "grad_norm": 0.2158203125, + "learning_rate": 0.00010533823383331678, + "loss": 0.482, + "step": 174830 + }, + { + "epoch": 8.683818416608721, + "grad_norm": 0.1826171875, + "learning_rate": 0.00010529850004966724, + "loss": 0.508, + "step": 174840 + }, + { + "epoch": 8.684315088904341, + "grad_norm": 0.1845703125, + "learning_rate": 0.00010525876626601769, + "loss": 0.4965, + "step": 174850 + }, + { + "epoch": 8.68481176119996, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010521903248236812, + "loss": 0.4568, + "step": 174860 + }, + { + "epoch": 8.68530843349558, + "grad_norm": 0.1845703125, + "learning_rate": 0.00010517929869871858, + "loss": 0.4496, + "step": 174870 + }, + { + "epoch": 8.685805105791198, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010513956491506904, + "loss": 0.4925, + "step": 174880 + }, + { + "epoch": 8.686301778086818, + "grad_norm": 0.171875, + "learning_rate": 0.0001050998311314195, + "loss": 0.4353, + "step": 174890 + }, + { + "epoch": 8.686798450382438, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010506009734776994, + "loss": 0.4739, + "step": 174900 + }, + { + "epoch": 8.687295122678057, + "grad_norm": 0.185546875, + "learning_rate": 0.0001050203635641204, + "loss": 0.4828, + "step": 174910 + }, + { + "epoch": 8.687791794973677, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010498062978047086, + "loss": 0.4693, + "step": 174920 + }, + { + "epoch": 8.688288467269295, + "grad_norm": 0.18359375, + "learning_rate": 0.00010494089599682131, + "loss": 0.5162, + "step": 174930 + }, + { + "epoch": 8.688785139564915, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010490116221317175, + "loss": 0.5078, + "step": 174940 + }, + { + "epoch": 8.689281811860534, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001048614284295222, + "loss": 0.5057, + "step": 174950 + }, + { + "epoch": 8.689778484156154, + "grad_norm": 0.17578125, + "learning_rate": 0.00010482169464587266, + "loss": 0.4784, + "step": 174960 + }, + { + "epoch": 8.690275156451774, + "grad_norm": 0.197265625, + "learning_rate": 0.00010478196086222312, + "loss": 0.5044, + "step": 174970 + }, + { + "epoch": 8.690771828747392, + "grad_norm": 0.197265625, + "learning_rate": 0.00010474222707857355, + "loss": 0.4781, + "step": 174980 + }, + { + "epoch": 8.691268501043012, + "grad_norm": 0.1845703125, + "learning_rate": 0.00010470249329492401, + "loss": 0.473, + "step": 174990 + }, + { + "epoch": 8.69176517333863, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010466275951127447, + "loss": 0.4808, + "step": 175000 + }, + { + "epoch": 8.69226184563425, + "grad_norm": 0.171875, + "learning_rate": 0.00010462302572762492, + "loss": 0.4748, + "step": 175010 + }, + { + "epoch": 8.69275851792987, + "grad_norm": 0.171875, + "learning_rate": 0.00010458329194397538, + "loss": 0.4727, + "step": 175020 + }, + { + "epoch": 8.69325519022549, + "grad_norm": 0.1865234375, + "learning_rate": 0.00010454355816032581, + "loss": 0.4716, + "step": 175030 + }, + { + "epoch": 8.693751862521108, + "grad_norm": 0.1826171875, + "learning_rate": 0.00010450382437667627, + "loss": 0.4809, + "step": 175040 + }, + { + "epoch": 8.694248534816728, + "grad_norm": 0.16796875, + "learning_rate": 0.00010446409059302673, + "loss": 0.492, + "step": 175050 + }, + { + "epoch": 8.694745207112348, + "grad_norm": 0.1650390625, + "learning_rate": 0.00010442435680937719, + "loss": 0.4875, + "step": 175060 + }, + { + "epoch": 8.695241879407966, + "grad_norm": 0.208984375, + "learning_rate": 0.00010438462302572763, + "loss": 0.4855, + "step": 175070 + }, + { + "epoch": 8.695738551703586, + "grad_norm": 0.19921875, + "learning_rate": 0.00010434488924207809, + "loss": 0.4954, + "step": 175080 + }, + { + "epoch": 8.696235223999205, + "grad_norm": 0.16015625, + "learning_rate": 0.00010430515545842854, + "loss": 0.4874, + "step": 175090 + }, + { + "epoch": 8.696731896294825, + "grad_norm": 0.177734375, + "learning_rate": 0.00010426542167477899, + "loss": 0.4979, + "step": 175100 + }, + { + "epoch": 8.697228568590443, + "grad_norm": 0.1982421875, + "learning_rate": 0.00010422568789112943, + "loss": 0.5075, + "step": 175110 + }, + { + "epoch": 8.697725240886063, + "grad_norm": 0.2060546875, + "learning_rate": 0.00010418595410747989, + "loss": 0.4886, + "step": 175120 + }, + { + "epoch": 8.698221913181683, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010414622032383035, + "loss": 0.4603, + "step": 175130 + }, + { + "epoch": 8.698718585477302, + "grad_norm": 0.193359375, + "learning_rate": 0.00010410648654018081, + "loss": 0.4983, + "step": 175140 + }, + { + "epoch": 8.699215257772922, + "grad_norm": 0.1787109375, + "learning_rate": 0.00010406675275653124, + "loss": 0.4894, + "step": 175150 + }, + { + "epoch": 8.69971193006854, + "grad_norm": 0.19921875, + "learning_rate": 0.0001040270189728817, + "loss": 0.4874, + "step": 175160 + }, + { + "epoch": 8.70020860236416, + "grad_norm": 0.1875, + "learning_rate": 0.00010398728518923215, + "loss": 0.4918, + "step": 175170 + }, + { + "epoch": 8.700705274659779, + "grad_norm": 0.1796875, + "learning_rate": 0.00010394755140558261, + "loss": 0.5007, + "step": 175180 + }, + { + "epoch": 8.701201946955399, + "grad_norm": 0.203125, + "learning_rate": 0.00010390781762193304, + "loss": 0.478, + "step": 175190 + }, + { + "epoch": 8.701698619251019, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001038680838382835, + "loss": 0.4908, + "step": 175200 + }, + { + "epoch": 8.702195291546637, + "grad_norm": 0.1650390625, + "learning_rate": 0.00010382835005463396, + "loss": 0.4689, + "step": 175210 + }, + { + "epoch": 8.702691963842257, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010378861627098442, + "loss": 0.4518, + "step": 175220 + }, + { + "epoch": 8.703188636137876, + "grad_norm": 0.169921875, + "learning_rate": 0.00010374888248733486, + "loss": 0.4886, + "step": 175230 + }, + { + "epoch": 8.703685308433496, + "grad_norm": 0.1630859375, + "learning_rate": 0.00010370914870368532, + "loss": 0.5028, + "step": 175240 + }, + { + "epoch": 8.704181980729114, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010366941492003576, + "loss": 0.4998, + "step": 175250 + }, + { + "epoch": 8.704678653024734, + "grad_norm": 0.1943359375, + "learning_rate": 0.00010362968113638622, + "loss": 0.4962, + "step": 175260 + }, + { + "epoch": 8.705175325320354, + "grad_norm": 0.177734375, + "learning_rate": 0.00010358994735273666, + "loss": 0.4612, + "step": 175270 + }, + { + "epoch": 8.705671997615973, + "grad_norm": 0.2001953125, + "learning_rate": 0.00010355021356908712, + "loss": 0.4968, + "step": 175280 + }, + { + "epoch": 8.706168669911593, + "grad_norm": 0.166015625, + "learning_rate": 0.00010351047978543758, + "loss": 0.4972, + "step": 175290 + }, + { + "epoch": 8.706665342207211, + "grad_norm": 0.18359375, + "learning_rate": 0.00010347074600178804, + "loss": 0.4681, + "step": 175300 + }, + { + "epoch": 8.707162014502831, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010343101221813847, + "loss": 0.459, + "step": 175310 + }, + { + "epoch": 8.70765868679845, + "grad_norm": 0.1865234375, + "learning_rate": 0.00010339127843448893, + "loss": 0.4841, + "step": 175320 + }, + { + "epoch": 8.70815535909407, + "grad_norm": 0.185546875, + "learning_rate": 0.00010335154465083938, + "loss": 0.4949, + "step": 175330 + }, + { + "epoch": 8.70865203138969, + "grad_norm": 0.1611328125, + "learning_rate": 0.00010331181086718984, + "loss": 0.4655, + "step": 175340 + }, + { + "epoch": 8.709148703685308, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010327207708354027, + "loss": 0.4858, + "step": 175350 + }, + { + "epoch": 8.709645375980928, + "grad_norm": 0.1943359375, + "learning_rate": 0.00010323234329989073, + "loss": 0.4825, + "step": 175360 + }, + { + "epoch": 8.710142048276547, + "grad_norm": 0.1591796875, + "learning_rate": 0.00010319260951624119, + "loss": 0.4749, + "step": 175370 + }, + { + "epoch": 8.710638720572167, + "grad_norm": 0.1826171875, + "learning_rate": 0.00010315287573259164, + "loss": 0.5096, + "step": 175380 + }, + { + "epoch": 8.711135392867785, + "grad_norm": 0.197265625, + "learning_rate": 0.00010311314194894208, + "loss": 0.5066, + "step": 175390 + }, + { + "epoch": 8.711632065163405, + "grad_norm": 0.19921875, + "learning_rate": 0.00010307340816529253, + "loss": 0.4953, + "step": 175400 + }, + { + "epoch": 8.712128737459025, + "grad_norm": 0.2041015625, + "learning_rate": 0.00010303367438164299, + "loss": 0.4979, + "step": 175410 + }, + { + "epoch": 8.712625409754644, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010299394059799345, + "loss": 0.5207, + "step": 175420 + }, + { + "epoch": 8.713122082050264, + "grad_norm": 0.18359375, + "learning_rate": 0.0001029542068143439, + "loss": 0.4455, + "step": 175430 + }, + { + "epoch": 8.713618754345882, + "grad_norm": 0.1904296875, + "learning_rate": 0.00010291447303069435, + "loss": 0.4792, + "step": 175440 + }, + { + "epoch": 8.714115426641502, + "grad_norm": 0.1796875, + "learning_rate": 0.00010287473924704481, + "loss": 0.4731, + "step": 175450 + }, + { + "epoch": 8.71461209893712, + "grad_norm": 0.2158203125, + "learning_rate": 0.00010283500546339527, + "loss": 0.4873, + "step": 175460 + }, + { + "epoch": 8.71510877123274, + "grad_norm": 0.193359375, + "learning_rate": 0.00010279527167974572, + "loss": 0.4752, + "step": 175470 + }, + { + "epoch": 8.71560544352836, + "grad_norm": 0.21484375, + "learning_rate": 0.00010275553789609616, + "loss": 0.5031, + "step": 175480 + }, + { + "epoch": 8.716102115823979, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010271580411244661, + "loss": 0.458, + "step": 175490 + }, + { + "epoch": 8.7165987881196, + "grad_norm": 0.2109375, + "learning_rate": 0.00010267607032879707, + "loss": 0.4615, + "step": 175500 + }, + { + "epoch": 8.717095460415218, + "grad_norm": 0.18359375, + "learning_rate": 0.00010263633654514753, + "loss": 0.4954, + "step": 175510 + }, + { + "epoch": 8.717592132710838, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010259660276149796, + "loss": 0.4803, + "step": 175520 + }, + { + "epoch": 8.718088805006456, + "grad_norm": 0.2021484375, + "learning_rate": 0.00010255686897784842, + "loss": 0.4735, + "step": 175530 + }, + { + "epoch": 8.718585477302076, + "grad_norm": 0.203125, + "learning_rate": 0.00010251713519419887, + "loss": 0.4805, + "step": 175540 + }, + { + "epoch": 8.719082149597696, + "grad_norm": 0.1826171875, + "learning_rate": 0.00010247740141054933, + "loss": 0.47, + "step": 175550 + }, + { + "epoch": 8.719578821893315, + "grad_norm": 0.171875, + "learning_rate": 0.00010243766762689976, + "loss": 0.4884, + "step": 175560 + }, + { + "epoch": 8.720075494188935, + "grad_norm": 0.1875, + "learning_rate": 0.00010239793384325022, + "loss": 0.4689, + "step": 175570 + }, + { + "epoch": 8.720572166484553, + "grad_norm": 0.205078125, + "learning_rate": 0.00010235820005960068, + "loss": 0.5375, + "step": 175580 + }, + { + "epoch": 8.721068838780173, + "grad_norm": 0.1875, + "learning_rate": 0.00010231846627595114, + "loss": 0.4968, + "step": 175590 + }, + { + "epoch": 8.721565511075791, + "grad_norm": 0.1865234375, + "learning_rate": 0.00010227873249230158, + "loss": 0.47, + "step": 175600 + }, + { + "epoch": 8.722062183371412, + "grad_norm": 0.2099609375, + "learning_rate": 0.00010223899870865204, + "loss": 0.5143, + "step": 175610 + }, + { + "epoch": 8.722558855667032, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001021992649250025, + "loss": 0.4678, + "step": 175620 + }, + { + "epoch": 8.72305552796265, + "grad_norm": 0.212890625, + "learning_rate": 0.00010215953114135295, + "loss": 0.5128, + "step": 175630 + }, + { + "epoch": 8.72355220025827, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010211979735770339, + "loss": 0.5179, + "step": 175640 + }, + { + "epoch": 8.724048872553889, + "grad_norm": 0.1884765625, + "learning_rate": 0.00010208006357405384, + "loss": 0.4985, + "step": 175650 + }, + { + "epoch": 8.724545544849509, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001020403297904043, + "loss": 0.4765, + "step": 175660 + }, + { + "epoch": 8.725042217145127, + "grad_norm": 0.171875, + "learning_rate": 0.00010200059600675476, + "loss": 0.4867, + "step": 175670 + }, + { + "epoch": 8.725538889440747, + "grad_norm": 0.181640625, + "learning_rate": 0.00010196086222310519, + "loss": 0.4989, + "step": 175680 + }, + { + "epoch": 8.726035561736367, + "grad_norm": 0.1826171875, + "learning_rate": 0.00010192112843945565, + "loss": 0.4549, + "step": 175690 + }, + { + "epoch": 8.726532234031986, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001018813946558061, + "loss": 0.4934, + "step": 175700 + }, + { + "epoch": 8.727028906327606, + "grad_norm": 0.1806640625, + "learning_rate": 0.00010184166087215656, + "loss": 0.4957, + "step": 175710 + }, + { + "epoch": 8.727525578623224, + "grad_norm": 0.181640625, + "learning_rate": 0.000101801927088507, + "loss": 0.4774, + "step": 175720 + }, + { + "epoch": 8.728022250918844, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010176219330485745, + "loss": 0.4926, + "step": 175730 + }, + { + "epoch": 8.728518923214462, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010172245952120791, + "loss": 0.4866, + "step": 175740 + }, + { + "epoch": 8.729015595510083, + "grad_norm": 0.1796875, + "learning_rate": 0.00010168272573755837, + "loss": 0.4714, + "step": 175750 + }, + { + "epoch": 8.729512267805703, + "grad_norm": 0.173828125, + "learning_rate": 0.00010164299195390881, + "loss": 0.4632, + "step": 175760 + }, + { + "epoch": 8.730008940101321, + "grad_norm": 0.197265625, + "learning_rate": 0.00010160325817025927, + "loss": 0.5331, + "step": 175770 + }, + { + "epoch": 8.730505612396941, + "grad_norm": 0.1943359375, + "learning_rate": 0.00010156352438660973, + "loss": 0.5019, + "step": 175780 + }, + { + "epoch": 8.73100228469256, + "grad_norm": 0.1845703125, + "learning_rate": 0.00010152379060296018, + "loss": 0.4965, + "step": 175790 + }, + { + "epoch": 8.73149895698818, + "grad_norm": 0.17578125, + "learning_rate": 0.00010148405681931062, + "loss": 0.4733, + "step": 175800 + }, + { + "epoch": 8.731995629283798, + "grad_norm": 0.2001953125, + "learning_rate": 0.00010144432303566107, + "loss": 0.4864, + "step": 175810 + }, + { + "epoch": 8.732492301579418, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010140458925201153, + "loss": 0.5009, + "step": 175820 + }, + { + "epoch": 8.732988973875038, + "grad_norm": 0.1982421875, + "learning_rate": 0.00010136485546836199, + "loss": 0.5191, + "step": 175830 + }, + { + "epoch": 8.733485646170656, + "grad_norm": 0.169921875, + "learning_rate": 0.00010132512168471242, + "loss": 0.4973, + "step": 175840 + }, + { + "epoch": 8.733982318466277, + "grad_norm": 0.17578125, + "learning_rate": 0.00010128538790106288, + "loss": 0.4668, + "step": 175850 + }, + { + "epoch": 8.734478990761895, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010124565411741333, + "loss": 0.4487, + "step": 175860 + }, + { + "epoch": 8.734975663057515, + "grad_norm": 0.203125, + "learning_rate": 0.00010120592033376379, + "loss": 0.5054, + "step": 175870 + }, + { + "epoch": 8.735472335353133, + "grad_norm": 0.1953125, + "learning_rate": 0.00010116618655011425, + "loss": 0.4952, + "step": 175880 + }, + { + "epoch": 8.735969007648754, + "grad_norm": 0.162109375, + "learning_rate": 0.00010112645276646468, + "loss": 0.4726, + "step": 175890 + }, + { + "epoch": 8.736465679944374, + "grad_norm": 0.208984375, + "learning_rate": 0.00010108671898281514, + "loss": 0.4965, + "step": 175900 + }, + { + "epoch": 8.736962352239992, + "grad_norm": 0.16796875, + "learning_rate": 0.0001010469851991656, + "loss": 0.4819, + "step": 175910 + }, + { + "epoch": 8.737459024535612, + "grad_norm": 0.17578125, + "learning_rate": 0.00010100725141551605, + "loss": 0.5017, + "step": 175920 + }, + { + "epoch": 8.73795569683123, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001009675176318665, + "loss": 0.4949, + "step": 175930 + }, + { + "epoch": 8.73845236912685, + "grad_norm": 0.201171875, + "learning_rate": 0.00010092778384821696, + "loss": 0.4852, + "step": 175940 + }, + { + "epoch": 8.738949041422469, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001008880500645674, + "loss": 0.4902, + "step": 175950 + }, + { + "epoch": 8.739445713718089, + "grad_norm": 0.1826171875, + "learning_rate": 0.00010084831628091786, + "loss": 0.4961, + "step": 175960 + }, + { + "epoch": 8.739942386013709, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001008085824972683, + "loss": 0.5117, + "step": 175970 + }, + { + "epoch": 8.740439058309327, + "grad_norm": 0.2216796875, + "learning_rate": 0.00010076884871361876, + "loss": 0.4918, + "step": 175980 + }, + { + "epoch": 8.740935730604948, + "grad_norm": 0.1796875, + "learning_rate": 0.00010072911492996922, + "loss": 0.4626, + "step": 175990 + }, + { + "epoch": 8.741432402900566, + "grad_norm": 0.173828125, + "learning_rate": 0.00010068938114631968, + "loss": 0.4798, + "step": 176000 + }, + { + "epoch": 8.741929075196186, + "grad_norm": 0.1806640625, + "learning_rate": 0.00010064964736267011, + "loss": 0.4802, + "step": 176010 + }, + { + "epoch": 8.742425747491804, + "grad_norm": 0.1865234375, + "learning_rate": 0.00010060991357902056, + "loss": 0.4873, + "step": 176020 + }, + { + "epoch": 8.742922419787424, + "grad_norm": 0.18359375, + "learning_rate": 0.00010057017979537102, + "loss": 0.4846, + "step": 176030 + }, + { + "epoch": 8.743419092083043, + "grad_norm": 0.169921875, + "learning_rate": 0.00010053044601172148, + "loss": 0.4633, + "step": 176040 + }, + { + "epoch": 8.743915764378663, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010049071222807191, + "loss": 0.4753, + "step": 176050 + }, + { + "epoch": 8.744412436674283, + "grad_norm": 0.181640625, + "learning_rate": 0.00010045097844442237, + "loss": 0.49, + "step": 176060 + }, + { + "epoch": 8.744909108969901, + "grad_norm": 0.1953125, + "learning_rate": 0.00010041124466077283, + "loss": 0.5021, + "step": 176070 + }, + { + "epoch": 8.745405781265521, + "grad_norm": 0.19921875, + "learning_rate": 0.00010037151087712328, + "loss": 0.4969, + "step": 176080 + }, + { + "epoch": 8.74590245356114, + "grad_norm": 0.169921875, + "learning_rate": 0.00010033177709347373, + "loss": 0.4906, + "step": 176090 + }, + { + "epoch": 8.74639912585676, + "grad_norm": 0.171875, + "learning_rate": 0.00010029204330982417, + "loss": 0.4859, + "step": 176100 + }, + { + "epoch": 8.746895798152378, + "grad_norm": 0.1611328125, + "learning_rate": 0.00010025230952617463, + "loss": 0.4801, + "step": 176110 + }, + { + "epoch": 8.747392470447998, + "grad_norm": 0.17578125, + "learning_rate": 0.00010021257574252509, + "loss": 0.4762, + "step": 176120 + }, + { + "epoch": 8.747889142743619, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010017284195887553, + "loss": 0.4778, + "step": 176130 + }, + { + "epoch": 8.748385815039237, + "grad_norm": 0.212890625, + "learning_rate": 0.00010013310817522599, + "loss": 0.5307, + "step": 176140 + }, + { + "epoch": 8.748882487334857, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010009337439157645, + "loss": 0.4938, + "step": 176150 + }, + { + "epoch": 8.749379159630475, + "grad_norm": 0.1953125, + "learning_rate": 0.0001000536406079269, + "loss": 0.4661, + "step": 176160 + }, + { + "epoch": 8.749875831926095, + "grad_norm": 0.185546875, + "learning_rate": 0.00010001390682427734, + "loss": 0.483, + "step": 176170 + }, + { + "epoch": 8.750372504221714, + "grad_norm": 0.1748046875, + "learning_rate": 9.99741730406278e-05, + "loss": 0.4883, + "step": 176180 + }, + { + "epoch": 8.750869176517334, + "grad_norm": 0.1865234375, + "learning_rate": 9.993443925697825e-05, + "loss": 0.4923, + "step": 176190 + }, + { + "epoch": 8.751365848812954, + "grad_norm": 0.193359375, + "learning_rate": 9.98947054733287e-05, + "loss": 0.4881, + "step": 176200 + }, + { + "epoch": 8.751862521108572, + "grad_norm": 0.1845703125, + "learning_rate": 9.985497168967915e-05, + "loss": 0.4714, + "step": 176210 + }, + { + "epoch": 8.752359193404192, + "grad_norm": 0.197265625, + "learning_rate": 9.98152379060296e-05, + "loss": 0.4527, + "step": 176220 + }, + { + "epoch": 8.75285586569981, + "grad_norm": 0.18359375, + "learning_rate": 9.977550412238006e-05, + "loss": 0.4658, + "step": 176230 + }, + { + "epoch": 8.753352537995431, + "grad_norm": 0.1904296875, + "learning_rate": 9.97357703387305e-05, + "loss": 0.4686, + "step": 176240 + }, + { + "epoch": 8.75384921029105, + "grad_norm": 0.201171875, + "learning_rate": 9.969603655508096e-05, + "loss": 0.4648, + "step": 176250 + }, + { + "epoch": 8.75434588258667, + "grad_norm": 0.189453125, + "learning_rate": 9.96563027714314e-05, + "loss": 0.4743, + "step": 176260 + }, + { + "epoch": 8.75484255488229, + "grad_norm": 0.166015625, + "learning_rate": 9.961656898778186e-05, + "loss": 0.4739, + "step": 176270 + }, + { + "epoch": 8.755339227177908, + "grad_norm": 0.1826171875, + "learning_rate": 9.957683520413232e-05, + "loss": 0.5123, + "step": 176280 + }, + { + "epoch": 8.755835899473528, + "grad_norm": 0.1865234375, + "learning_rate": 9.953710142048278e-05, + "loss": 0.4968, + "step": 176290 + }, + { + "epoch": 8.756332571769146, + "grad_norm": 0.1923828125, + "learning_rate": 9.949736763683322e-05, + "loss": 0.4837, + "step": 176300 + }, + { + "epoch": 8.756829244064766, + "grad_norm": 0.212890625, + "learning_rate": 9.945763385318368e-05, + "loss": 0.5075, + "step": 176310 + }, + { + "epoch": 8.757325916360385, + "grad_norm": 0.1923828125, + "learning_rate": 9.941790006953414e-05, + "loss": 0.4982, + "step": 176320 + }, + { + "epoch": 8.757822588656005, + "grad_norm": 0.169921875, + "learning_rate": 9.937816628588458e-05, + "loss": 0.4884, + "step": 176330 + }, + { + "epoch": 8.758319260951625, + "grad_norm": 0.1728515625, + "learning_rate": 9.933843250223504e-05, + "loss": 0.495, + "step": 176340 + }, + { + "epoch": 8.758815933247243, + "grad_norm": 0.166015625, + "learning_rate": 9.929869871858548e-05, + "loss": 0.4847, + "step": 176350 + }, + { + "epoch": 8.759312605542863, + "grad_norm": 0.1943359375, + "learning_rate": 9.925896493493594e-05, + "loss": 0.4984, + "step": 176360 + }, + { + "epoch": 8.759809277838482, + "grad_norm": 0.1787109375, + "learning_rate": 9.921923115128638e-05, + "loss": 0.4804, + "step": 176370 + }, + { + "epoch": 8.760305950134102, + "grad_norm": 0.1787109375, + "learning_rate": 9.917949736763684e-05, + "loss": 0.5331, + "step": 176380 + }, + { + "epoch": 8.76080262242972, + "grad_norm": 0.193359375, + "learning_rate": 9.913976358398729e-05, + "loss": 0.5116, + "step": 176390 + }, + { + "epoch": 8.76129929472534, + "grad_norm": 0.1943359375, + "learning_rate": 9.910002980033774e-05, + "loss": 0.4954, + "step": 176400 + }, + { + "epoch": 8.761795967020959, + "grad_norm": 0.1884765625, + "learning_rate": 9.906029601668819e-05, + "loss": 0.5152, + "step": 176410 + }, + { + "epoch": 8.762292639316579, + "grad_norm": 0.1845703125, + "learning_rate": 9.902056223303865e-05, + "loss": 0.4826, + "step": 176420 + }, + { + "epoch": 8.762789311612199, + "grad_norm": 0.181640625, + "learning_rate": 9.898082844938909e-05, + "loss": 0.4719, + "step": 176430 + }, + { + "epoch": 8.763285983907817, + "grad_norm": 0.1787109375, + "learning_rate": 9.894109466573955e-05, + "loss": 0.4763, + "step": 176440 + }, + { + "epoch": 8.763782656203437, + "grad_norm": 0.1826171875, + "learning_rate": 9.890136088208999e-05, + "loss": 0.4884, + "step": 176450 + }, + { + "epoch": 8.764279328499056, + "grad_norm": 0.1787109375, + "learning_rate": 9.886162709844045e-05, + "loss": 0.4742, + "step": 176460 + }, + { + "epoch": 8.764776000794676, + "grad_norm": 0.197265625, + "learning_rate": 9.882189331479091e-05, + "loss": 0.4734, + "step": 176470 + }, + { + "epoch": 8.765272673090294, + "grad_norm": 0.171875, + "learning_rate": 9.878215953114137e-05, + "loss": 0.5066, + "step": 176480 + }, + { + "epoch": 8.765769345385914, + "grad_norm": 0.169921875, + "learning_rate": 9.874242574749181e-05, + "loss": 0.4681, + "step": 176490 + }, + { + "epoch": 8.766266017681534, + "grad_norm": 0.1806640625, + "learning_rate": 9.870269196384227e-05, + "loss": 0.4684, + "step": 176500 + }, + { + "epoch": 8.766762689977153, + "grad_norm": 0.1796875, + "learning_rate": 9.866295818019271e-05, + "loss": 0.4754, + "step": 176510 + }, + { + "epoch": 8.767259362272773, + "grad_norm": 0.1806640625, + "learning_rate": 9.862322439654317e-05, + "loss": 0.4733, + "step": 176520 + }, + { + "epoch": 8.767756034568391, + "grad_norm": 0.177734375, + "learning_rate": 9.858349061289361e-05, + "loss": 0.4888, + "step": 176530 + }, + { + "epoch": 8.768252706864011, + "grad_norm": 0.189453125, + "learning_rate": 9.854375682924407e-05, + "loss": 0.5491, + "step": 176540 + }, + { + "epoch": 8.76874937915963, + "grad_norm": 0.1708984375, + "learning_rate": 9.850402304559452e-05, + "loss": 0.48, + "step": 176550 + }, + { + "epoch": 8.76924605145525, + "grad_norm": 0.1826171875, + "learning_rate": 9.846428926194497e-05, + "loss": 0.4741, + "step": 176560 + }, + { + "epoch": 8.76974272375087, + "grad_norm": 0.1953125, + "learning_rate": 9.842455547829542e-05, + "loss": 0.4764, + "step": 176570 + }, + { + "epoch": 8.770239396046488, + "grad_norm": 0.2431640625, + "learning_rate": 9.838482169464588e-05, + "loss": 0.51, + "step": 176580 + }, + { + "epoch": 8.770736068342108, + "grad_norm": 0.166015625, + "learning_rate": 9.834508791099632e-05, + "loss": 0.466, + "step": 176590 + }, + { + "epoch": 8.771232740637727, + "grad_norm": 0.1943359375, + "learning_rate": 9.830535412734678e-05, + "loss": 0.4746, + "step": 176600 + }, + { + "epoch": 8.771729412933347, + "grad_norm": 0.1650390625, + "learning_rate": 9.826562034369722e-05, + "loss": 0.5056, + "step": 176610 + }, + { + "epoch": 8.772226085228965, + "grad_norm": 0.173828125, + "learning_rate": 9.822588656004768e-05, + "loss": 0.4754, + "step": 176620 + }, + { + "epoch": 8.772722757524585, + "grad_norm": 0.2373046875, + "learning_rate": 9.818615277639814e-05, + "loss": 0.4696, + "step": 176630 + }, + { + "epoch": 8.773219429820205, + "grad_norm": 0.171875, + "learning_rate": 9.81464189927486e-05, + "loss": 0.5043, + "step": 176640 + }, + { + "epoch": 8.773716102115824, + "grad_norm": 0.1611328125, + "learning_rate": 9.810668520909904e-05, + "loss": 0.4881, + "step": 176650 + }, + { + "epoch": 8.774212774411444, + "grad_norm": 0.19921875, + "learning_rate": 9.80669514254495e-05, + "loss": 0.4989, + "step": 176660 + }, + { + "epoch": 8.774709446707062, + "grad_norm": 0.171875, + "learning_rate": 9.802721764179994e-05, + "loss": 0.472, + "step": 176670 + }, + { + "epoch": 8.775206119002682, + "grad_norm": 0.177734375, + "learning_rate": 9.79874838581504e-05, + "loss": 0.4874, + "step": 176680 + }, + { + "epoch": 8.7757027912983, + "grad_norm": 0.1640625, + "learning_rate": 9.794775007450084e-05, + "loss": 0.4736, + "step": 176690 + }, + { + "epoch": 8.77619946359392, + "grad_norm": 0.166015625, + "learning_rate": 9.79080162908513e-05, + "loss": 0.481, + "step": 176700 + }, + { + "epoch": 8.77669613588954, + "grad_norm": 0.16796875, + "learning_rate": 9.786828250720176e-05, + "loss": 0.4719, + "step": 176710 + }, + { + "epoch": 8.77719280818516, + "grad_norm": 0.1982421875, + "learning_rate": 9.78285487235522e-05, + "loss": 0.481, + "step": 176720 + }, + { + "epoch": 8.77768948048078, + "grad_norm": 0.2060546875, + "learning_rate": 9.778881493990266e-05, + "loss": 0.4933, + "step": 176730 + }, + { + "epoch": 8.778186152776398, + "grad_norm": 0.181640625, + "learning_rate": 9.77490811562531e-05, + "loss": 0.4894, + "step": 176740 + }, + { + "epoch": 8.778682825072018, + "grad_norm": 0.2001953125, + "learning_rate": 9.770934737260356e-05, + "loss": 0.4862, + "step": 176750 + }, + { + "epoch": 8.779179497367636, + "grad_norm": 0.177734375, + "learning_rate": 9.766961358895401e-05, + "loss": 0.4877, + "step": 176760 + }, + { + "epoch": 8.779676169663256, + "grad_norm": 0.1865234375, + "learning_rate": 9.762987980530447e-05, + "loss": 0.5012, + "step": 176770 + }, + { + "epoch": 8.780172841958876, + "grad_norm": 0.22265625, + "learning_rate": 9.759014602165491e-05, + "loss": 0.4863, + "step": 176780 + }, + { + "epoch": 8.780669514254495, + "grad_norm": 0.1884765625, + "learning_rate": 9.755041223800537e-05, + "loss": 0.4905, + "step": 176790 + }, + { + "epoch": 8.781166186550115, + "grad_norm": 0.1796875, + "learning_rate": 9.751067845435581e-05, + "loss": 0.5004, + "step": 176800 + }, + { + "epoch": 8.781662858845733, + "grad_norm": 0.171875, + "learning_rate": 9.747094467070627e-05, + "loss": 0.469, + "step": 176810 + }, + { + "epoch": 8.782159531141353, + "grad_norm": 0.197265625, + "learning_rate": 9.743121088705673e-05, + "loss": 0.5133, + "step": 176820 + }, + { + "epoch": 8.782656203436972, + "grad_norm": 0.1884765625, + "learning_rate": 9.739147710340719e-05, + "loss": 0.4958, + "step": 176830 + }, + { + "epoch": 8.783152875732592, + "grad_norm": 0.1884765625, + "learning_rate": 9.735174331975763e-05, + "loss": 0.4747, + "step": 176840 + }, + { + "epoch": 8.783649548028212, + "grad_norm": 0.1865234375, + "learning_rate": 9.731200953610809e-05, + "loss": 0.4967, + "step": 176850 + }, + { + "epoch": 8.78414622032383, + "grad_norm": 0.2236328125, + "learning_rate": 9.727227575245853e-05, + "loss": 0.5081, + "step": 176860 + }, + { + "epoch": 8.78464289261945, + "grad_norm": 0.181640625, + "learning_rate": 9.723254196880899e-05, + "loss": 0.4796, + "step": 176870 + }, + { + "epoch": 8.785139564915069, + "grad_norm": 0.1728515625, + "learning_rate": 9.719280818515943e-05, + "loss": 0.4937, + "step": 176880 + }, + { + "epoch": 8.785636237210689, + "grad_norm": 0.2470703125, + "learning_rate": 9.715307440150989e-05, + "loss": 0.4887, + "step": 176890 + }, + { + "epoch": 8.786132909506307, + "grad_norm": 0.1904296875, + "learning_rate": 9.711334061786034e-05, + "loss": 0.4763, + "step": 176900 + }, + { + "epoch": 8.786629581801927, + "grad_norm": 0.2001953125, + "learning_rate": 9.70736068342108e-05, + "loss": 0.475, + "step": 176910 + }, + { + "epoch": 8.787126254097547, + "grad_norm": 0.1748046875, + "learning_rate": 9.703387305056124e-05, + "loss": 0.4647, + "step": 176920 + }, + { + "epoch": 8.787622926393166, + "grad_norm": 0.1611328125, + "learning_rate": 9.69941392669117e-05, + "loss": 0.4669, + "step": 176930 + }, + { + "epoch": 8.788119598688786, + "grad_norm": 0.17578125, + "learning_rate": 9.695440548326214e-05, + "loss": 0.5126, + "step": 176940 + }, + { + "epoch": 8.788616270984404, + "grad_norm": 0.16796875, + "learning_rate": 9.69146716996126e-05, + "loss": 0.473, + "step": 176950 + }, + { + "epoch": 8.789112943280024, + "grad_norm": 0.1767578125, + "learning_rate": 9.687493791596304e-05, + "loss": 0.4998, + "step": 176960 + }, + { + "epoch": 8.789609615575642, + "grad_norm": 0.1669921875, + "learning_rate": 9.68352041323135e-05, + "loss": 0.4981, + "step": 176970 + }, + { + "epoch": 8.790106287871263, + "grad_norm": 0.203125, + "learning_rate": 9.679547034866396e-05, + "loss": 0.4983, + "step": 176980 + }, + { + "epoch": 8.790602960166883, + "grad_norm": 0.21484375, + "learning_rate": 9.675573656501442e-05, + "loss": 0.4855, + "step": 176990 + }, + { + "epoch": 8.791099632462501, + "grad_norm": 0.1787109375, + "learning_rate": 9.671600278136486e-05, + "loss": 0.5087, + "step": 177000 + }, + { + "epoch": 8.791596304758121, + "grad_norm": 0.1728515625, + "learning_rate": 9.667626899771532e-05, + "loss": 0.4864, + "step": 177010 + }, + { + "epoch": 8.79209297705374, + "grad_norm": 0.1923828125, + "learning_rate": 9.663653521406576e-05, + "loss": 0.5112, + "step": 177020 + }, + { + "epoch": 8.79258964934936, + "grad_norm": 0.18359375, + "learning_rate": 9.659680143041622e-05, + "loss": 0.4765, + "step": 177030 + }, + { + "epoch": 8.793086321644978, + "grad_norm": 0.208984375, + "learning_rate": 9.655706764676666e-05, + "loss": 0.4844, + "step": 177040 + }, + { + "epoch": 8.793582993940598, + "grad_norm": 0.2001953125, + "learning_rate": 9.651733386311712e-05, + "loss": 0.4911, + "step": 177050 + }, + { + "epoch": 8.794079666236218, + "grad_norm": 0.1845703125, + "learning_rate": 9.647760007946757e-05, + "loss": 0.499, + "step": 177060 + }, + { + "epoch": 8.794576338531837, + "grad_norm": 0.177734375, + "learning_rate": 9.643786629581802e-05, + "loss": 0.4567, + "step": 177070 + }, + { + "epoch": 8.795073010827457, + "grad_norm": 0.201171875, + "learning_rate": 9.639813251216847e-05, + "loss": 0.4706, + "step": 177080 + }, + { + "epoch": 8.795569683123075, + "grad_norm": 0.189453125, + "learning_rate": 9.635839872851893e-05, + "loss": 0.5288, + "step": 177090 + }, + { + "epoch": 8.796066355418695, + "grad_norm": 0.181640625, + "learning_rate": 9.631866494486937e-05, + "loss": 0.503, + "step": 177100 + }, + { + "epoch": 8.796563027714313, + "grad_norm": 0.2109375, + "learning_rate": 9.627893116121983e-05, + "loss": 0.514, + "step": 177110 + }, + { + "epoch": 8.797059700009934, + "grad_norm": 0.169921875, + "learning_rate": 9.623919737757027e-05, + "loss": 0.5233, + "step": 177120 + }, + { + "epoch": 8.797556372305554, + "grad_norm": 0.2041015625, + "learning_rate": 9.619946359392073e-05, + "loss": 0.504, + "step": 177130 + }, + { + "epoch": 8.798053044601172, + "grad_norm": 0.1875, + "learning_rate": 9.615972981027119e-05, + "loss": 0.4765, + "step": 177140 + }, + { + "epoch": 8.798549716896792, + "grad_norm": 0.18359375, + "learning_rate": 9.611999602662163e-05, + "loss": 0.4937, + "step": 177150 + }, + { + "epoch": 8.79904638919241, + "grad_norm": 0.19140625, + "learning_rate": 9.608026224297209e-05, + "loss": 0.4746, + "step": 177160 + }, + { + "epoch": 8.79954306148803, + "grad_norm": 0.1728515625, + "learning_rate": 9.604052845932255e-05, + "loss": 0.4774, + "step": 177170 + }, + { + "epoch": 8.800039733783649, + "grad_norm": 0.193359375, + "learning_rate": 9.6000794675673e-05, + "loss": 0.4536, + "step": 177180 + }, + { + "epoch": 8.800536406079269, + "grad_norm": 0.2080078125, + "learning_rate": 9.596106089202345e-05, + "loss": 0.4693, + "step": 177190 + }, + { + "epoch": 8.80103307837489, + "grad_norm": 0.181640625, + "learning_rate": 9.592132710837391e-05, + "loss": 0.4815, + "step": 177200 + }, + { + "epoch": 8.801529750670507, + "grad_norm": 0.21875, + "learning_rate": 9.588159332472435e-05, + "loss": 0.4595, + "step": 177210 + }, + { + "epoch": 8.802026422966128, + "grad_norm": 0.203125, + "learning_rate": 9.584185954107481e-05, + "loss": 0.4572, + "step": 177220 + }, + { + "epoch": 8.802523095261746, + "grad_norm": 0.16015625, + "learning_rate": 9.580212575742525e-05, + "loss": 0.5092, + "step": 177230 + }, + { + "epoch": 8.803019767557366, + "grad_norm": 0.18359375, + "learning_rate": 9.576239197377571e-05, + "loss": 0.4826, + "step": 177240 + }, + { + "epoch": 8.803516439852984, + "grad_norm": 0.1806640625, + "learning_rate": 9.572265819012616e-05, + "loss": 0.4733, + "step": 177250 + }, + { + "epoch": 8.804013112148604, + "grad_norm": 0.205078125, + "learning_rate": 9.568292440647661e-05, + "loss": 0.4817, + "step": 177260 + }, + { + "epoch": 8.804509784444225, + "grad_norm": 0.181640625, + "learning_rate": 9.564319062282706e-05, + "loss": 0.5005, + "step": 177270 + }, + { + "epoch": 8.805006456739843, + "grad_norm": 0.2158203125, + "learning_rate": 9.560345683917752e-05, + "loss": 0.4755, + "step": 177280 + }, + { + "epoch": 8.805503129035463, + "grad_norm": 0.166015625, + "learning_rate": 9.556372305552796e-05, + "loss": 0.486, + "step": 177290 + }, + { + "epoch": 8.805999801331081, + "grad_norm": 0.1953125, + "learning_rate": 9.552398927187842e-05, + "loss": 0.4747, + "step": 177300 + }, + { + "epoch": 8.806496473626702, + "grad_norm": 0.181640625, + "learning_rate": 9.548425548822886e-05, + "loss": 0.5076, + "step": 177310 + }, + { + "epoch": 8.80699314592232, + "grad_norm": 0.1806640625, + "learning_rate": 9.544452170457932e-05, + "loss": 0.5059, + "step": 177320 + }, + { + "epoch": 8.80748981821794, + "grad_norm": 0.16796875, + "learning_rate": 9.540478792092978e-05, + "loss": 0.4922, + "step": 177330 + }, + { + "epoch": 8.80798649051356, + "grad_norm": 0.1923828125, + "learning_rate": 9.536505413728024e-05, + "loss": 0.4997, + "step": 177340 + }, + { + "epoch": 8.808483162809178, + "grad_norm": 0.197265625, + "learning_rate": 9.532532035363068e-05, + "loss": 0.4665, + "step": 177350 + }, + { + "epoch": 8.808979835104799, + "grad_norm": 0.2138671875, + "learning_rate": 9.528558656998114e-05, + "loss": 0.4884, + "step": 177360 + }, + { + "epoch": 8.809476507400417, + "grad_norm": 0.2138671875, + "learning_rate": 9.524585278633158e-05, + "loss": 0.512, + "step": 177370 + }, + { + "epoch": 8.809973179696037, + "grad_norm": 0.1904296875, + "learning_rate": 9.520611900268204e-05, + "loss": 0.5118, + "step": 177380 + }, + { + "epoch": 8.810469851991655, + "grad_norm": 0.1767578125, + "learning_rate": 9.516638521903248e-05, + "loss": 0.4607, + "step": 177390 + }, + { + "epoch": 8.810966524287275, + "grad_norm": 0.220703125, + "learning_rate": 9.512665143538294e-05, + "loss": 0.5227, + "step": 177400 + }, + { + "epoch": 8.811463196582894, + "grad_norm": 0.1904296875, + "learning_rate": 9.508691765173339e-05, + "loss": 0.4998, + "step": 177410 + }, + { + "epoch": 8.811959868878514, + "grad_norm": 0.18359375, + "learning_rate": 9.504718386808384e-05, + "loss": 0.4814, + "step": 177420 + }, + { + "epoch": 8.812456541174134, + "grad_norm": 0.197265625, + "learning_rate": 9.500745008443429e-05, + "loss": 0.4896, + "step": 177430 + }, + { + "epoch": 8.812953213469752, + "grad_norm": 0.1796875, + "learning_rate": 9.496771630078475e-05, + "loss": 0.4792, + "step": 177440 + }, + { + "epoch": 8.813449885765372, + "grad_norm": 0.185546875, + "learning_rate": 9.492798251713519e-05, + "loss": 0.4921, + "step": 177450 + }, + { + "epoch": 8.81394655806099, + "grad_norm": 0.185546875, + "learning_rate": 9.488824873348565e-05, + "loss": 0.5142, + "step": 177460 + }, + { + "epoch": 8.814443230356611, + "grad_norm": 0.1689453125, + "learning_rate": 9.484851494983609e-05, + "loss": 0.4699, + "step": 177470 + }, + { + "epoch": 8.81493990265223, + "grad_norm": 0.171875, + "learning_rate": 9.480878116618655e-05, + "loss": 0.4891, + "step": 177480 + }, + { + "epoch": 8.81543657494785, + "grad_norm": 0.1826171875, + "learning_rate": 9.476904738253701e-05, + "loss": 0.4795, + "step": 177490 + }, + { + "epoch": 8.81593324724347, + "grad_norm": 0.1884765625, + "learning_rate": 9.472931359888745e-05, + "loss": 0.4878, + "step": 177500 + }, + { + "epoch": 8.816429919539088, + "grad_norm": 0.2197265625, + "learning_rate": 9.468957981523791e-05, + "loss": 0.4549, + "step": 177510 + }, + { + "epoch": 8.816926591834708, + "grad_norm": 0.1884765625, + "learning_rate": 9.464984603158837e-05, + "loss": 0.5013, + "step": 177520 + }, + { + "epoch": 8.817423264130326, + "grad_norm": 0.1708984375, + "learning_rate": 9.461011224793881e-05, + "loss": 0.4684, + "step": 177530 + }, + { + "epoch": 8.817919936425946, + "grad_norm": 0.185546875, + "learning_rate": 9.457037846428927e-05, + "loss": 0.4496, + "step": 177540 + }, + { + "epoch": 8.818416608721565, + "grad_norm": 0.177734375, + "learning_rate": 9.453064468063973e-05, + "loss": 0.4796, + "step": 177550 + }, + { + "epoch": 8.818913281017185, + "grad_norm": 0.1875, + "learning_rate": 9.449091089699017e-05, + "loss": 0.4595, + "step": 177560 + }, + { + "epoch": 8.819409953312805, + "grad_norm": 0.173828125, + "learning_rate": 9.445117711334063e-05, + "loss": 0.4822, + "step": 177570 + }, + { + "epoch": 8.819906625608423, + "grad_norm": 0.1748046875, + "learning_rate": 9.441144332969107e-05, + "loss": 0.4814, + "step": 177580 + }, + { + "epoch": 8.820403297904043, + "grad_norm": 0.208984375, + "learning_rate": 9.437170954604153e-05, + "loss": 0.4958, + "step": 177590 + }, + { + "epoch": 8.820899970199662, + "grad_norm": 0.1650390625, + "learning_rate": 9.433197576239198e-05, + "loss": 0.4911, + "step": 177600 + }, + { + "epoch": 8.821396642495282, + "grad_norm": 0.1728515625, + "learning_rate": 9.429224197874243e-05, + "loss": 0.4948, + "step": 177610 + }, + { + "epoch": 8.8218933147909, + "grad_norm": 0.1865234375, + "learning_rate": 9.425250819509288e-05, + "loss": 0.5156, + "step": 177620 + }, + { + "epoch": 8.82238998708652, + "grad_norm": 0.185546875, + "learning_rate": 9.421277441144334e-05, + "loss": 0.4728, + "step": 177630 + }, + { + "epoch": 8.82288665938214, + "grad_norm": 0.193359375, + "learning_rate": 9.417304062779378e-05, + "loss": 0.4963, + "step": 177640 + }, + { + "epoch": 8.823383331677759, + "grad_norm": 0.1669921875, + "learning_rate": 9.413330684414424e-05, + "loss": 0.4614, + "step": 177650 + }, + { + "epoch": 8.823880003973379, + "grad_norm": 0.20703125, + "learning_rate": 9.409357306049468e-05, + "loss": 0.4823, + "step": 177660 + }, + { + "epoch": 8.824376676268997, + "grad_norm": 0.1796875, + "learning_rate": 9.405383927684514e-05, + "loss": 0.4824, + "step": 177670 + }, + { + "epoch": 8.824873348564617, + "grad_norm": 0.171875, + "learning_rate": 9.40141054931956e-05, + "loss": 0.4904, + "step": 177680 + }, + { + "epoch": 8.825370020860236, + "grad_norm": 0.166015625, + "learning_rate": 9.397437170954606e-05, + "loss": 0.5075, + "step": 177690 + }, + { + "epoch": 8.825866693155856, + "grad_norm": 0.220703125, + "learning_rate": 9.39346379258965e-05, + "loss": 0.5011, + "step": 177700 + }, + { + "epoch": 8.826363365451476, + "grad_norm": 0.2412109375, + "learning_rate": 9.389490414224696e-05, + "loss": 0.5239, + "step": 177710 + }, + { + "epoch": 8.826860037747094, + "grad_norm": 0.197265625, + "learning_rate": 9.38551703585974e-05, + "loss": 0.4759, + "step": 177720 + }, + { + "epoch": 8.827356710042714, + "grad_norm": 0.17578125, + "learning_rate": 9.381543657494786e-05, + "loss": 0.5076, + "step": 177730 + }, + { + "epoch": 8.827853382338333, + "grad_norm": 0.17578125, + "learning_rate": 9.37757027912983e-05, + "loss": 0.44, + "step": 177740 + }, + { + "epoch": 8.828350054633953, + "grad_norm": 0.2060546875, + "learning_rate": 9.373596900764876e-05, + "loss": 0.5002, + "step": 177750 + }, + { + "epoch": 8.828846726929571, + "grad_norm": 0.18359375, + "learning_rate": 9.36962352239992e-05, + "loss": 0.4905, + "step": 177760 + }, + { + "epoch": 8.829343399225191, + "grad_norm": 0.1943359375, + "learning_rate": 9.365650144034966e-05, + "loss": 0.4814, + "step": 177770 + }, + { + "epoch": 8.82984007152081, + "grad_norm": 0.181640625, + "learning_rate": 9.361676765670011e-05, + "loss": 0.4869, + "step": 177780 + }, + { + "epoch": 8.83033674381643, + "grad_norm": 0.181640625, + "learning_rate": 9.357703387305057e-05, + "loss": 0.4872, + "step": 177790 + }, + { + "epoch": 8.83083341611205, + "grad_norm": 0.166015625, + "learning_rate": 9.353730008940101e-05, + "loss": 0.4742, + "step": 177800 + }, + { + "epoch": 8.831330088407668, + "grad_norm": 0.1728515625, + "learning_rate": 9.349756630575147e-05, + "loss": 0.4995, + "step": 177810 + }, + { + "epoch": 8.831826760703288, + "grad_norm": 0.1767578125, + "learning_rate": 9.345783252210191e-05, + "loss": 0.501, + "step": 177820 + }, + { + "epoch": 8.832323432998907, + "grad_norm": 0.189453125, + "learning_rate": 9.341809873845237e-05, + "loss": 0.4939, + "step": 177830 + }, + { + "epoch": 8.832820105294527, + "grad_norm": 0.1787109375, + "learning_rate": 9.337836495480283e-05, + "loss": 0.514, + "step": 177840 + }, + { + "epoch": 8.833316777590145, + "grad_norm": 0.1650390625, + "learning_rate": 9.333863117115329e-05, + "loss": 0.4908, + "step": 177850 + }, + { + "epoch": 8.833813449885765, + "grad_norm": 0.2021484375, + "learning_rate": 9.329889738750373e-05, + "loss": 0.4792, + "step": 177860 + }, + { + "epoch": 8.834310122181385, + "grad_norm": 0.1650390625, + "learning_rate": 9.325916360385419e-05, + "loss": 0.4731, + "step": 177870 + }, + { + "epoch": 8.834806794477004, + "grad_norm": 0.173828125, + "learning_rate": 9.321942982020463e-05, + "loss": 0.4529, + "step": 177880 + }, + { + "epoch": 8.835303466772624, + "grad_norm": 0.1962890625, + "learning_rate": 9.317969603655509e-05, + "loss": 0.4739, + "step": 177890 + }, + { + "epoch": 8.835800139068242, + "grad_norm": 0.185546875, + "learning_rate": 9.313996225290553e-05, + "loss": 0.4991, + "step": 177900 + }, + { + "epoch": 8.836296811363862, + "grad_norm": 0.1787109375, + "learning_rate": 9.310022846925599e-05, + "loss": 0.4774, + "step": 177910 + }, + { + "epoch": 8.83679348365948, + "grad_norm": 0.1923828125, + "learning_rate": 9.306049468560644e-05, + "loss": 0.5063, + "step": 177920 + }, + { + "epoch": 8.8372901559551, + "grad_norm": 0.1806640625, + "learning_rate": 9.30207609019569e-05, + "loss": 0.5064, + "step": 177930 + }, + { + "epoch": 8.83778682825072, + "grad_norm": 0.1884765625, + "learning_rate": 9.298102711830734e-05, + "loss": 0.5055, + "step": 177940 + }, + { + "epoch": 8.83828350054634, + "grad_norm": 0.1787109375, + "learning_rate": 9.29412933346578e-05, + "loss": 0.5141, + "step": 177950 + }, + { + "epoch": 8.83878017284196, + "grad_norm": 0.1748046875, + "learning_rate": 9.290155955100824e-05, + "loss": 0.4801, + "step": 177960 + }, + { + "epoch": 8.839276845137578, + "grad_norm": 0.220703125, + "learning_rate": 9.28618257673587e-05, + "loss": 0.4867, + "step": 177970 + }, + { + "epoch": 8.839773517433198, + "grad_norm": 0.19140625, + "learning_rate": 9.282209198370916e-05, + "loss": 0.502, + "step": 177980 + }, + { + "epoch": 8.840270189728816, + "grad_norm": 0.1806640625, + "learning_rate": 9.27823582000596e-05, + "loss": 0.4746, + "step": 177990 + }, + { + "epoch": 8.840766862024436, + "grad_norm": 0.228515625, + "learning_rate": 9.274262441641006e-05, + "loss": 0.4901, + "step": 178000 + }, + { + "epoch": 8.841263534320056, + "grad_norm": 0.212890625, + "learning_rate": 9.27028906327605e-05, + "loss": 0.4745, + "step": 178010 + }, + { + "epoch": 8.841760206615675, + "grad_norm": 0.1669921875, + "learning_rate": 9.266315684911096e-05, + "loss": 0.4777, + "step": 178020 + }, + { + "epoch": 8.842256878911295, + "grad_norm": 0.1806640625, + "learning_rate": 9.262342306546142e-05, + "loss": 0.4853, + "step": 178030 + }, + { + "epoch": 8.842753551206913, + "grad_norm": 0.177734375, + "learning_rate": 9.258368928181188e-05, + "loss": 0.4787, + "step": 178040 + }, + { + "epoch": 8.843250223502533, + "grad_norm": 0.189453125, + "learning_rate": 9.254395549816232e-05, + "loss": 0.4891, + "step": 178050 + }, + { + "epoch": 8.843746895798152, + "grad_norm": 0.2041015625, + "learning_rate": 9.250422171451278e-05, + "loss": 0.508, + "step": 178060 + }, + { + "epoch": 8.844243568093772, + "grad_norm": 0.201171875, + "learning_rate": 9.246448793086322e-05, + "loss": 0.4948, + "step": 178070 + }, + { + "epoch": 8.844740240389392, + "grad_norm": 0.1767578125, + "learning_rate": 9.242475414721368e-05, + "loss": 0.4857, + "step": 178080 + }, + { + "epoch": 8.84523691268501, + "grad_norm": 0.17578125, + "learning_rate": 9.238502036356412e-05, + "loss": 0.4985, + "step": 178090 + }, + { + "epoch": 8.84573358498063, + "grad_norm": 0.17578125, + "learning_rate": 9.234528657991458e-05, + "loss": 0.4708, + "step": 178100 + }, + { + "epoch": 8.846230257276249, + "grad_norm": 0.2275390625, + "learning_rate": 9.230555279626503e-05, + "loss": 0.4864, + "step": 178110 + }, + { + "epoch": 8.846726929571869, + "grad_norm": 0.181640625, + "learning_rate": 9.226581901261548e-05, + "loss": 0.4788, + "step": 178120 + }, + { + "epoch": 8.847223601867487, + "grad_norm": 0.1826171875, + "learning_rate": 9.222608522896593e-05, + "loss": 0.536, + "step": 178130 + }, + { + "epoch": 8.847720274163107, + "grad_norm": 0.1962890625, + "learning_rate": 9.218635144531639e-05, + "loss": 0.487, + "step": 178140 + }, + { + "epoch": 8.848216946458727, + "grad_norm": 0.173828125, + "learning_rate": 9.214661766166683e-05, + "loss": 0.5071, + "step": 178150 + }, + { + "epoch": 8.848713618754346, + "grad_norm": 0.1953125, + "learning_rate": 9.210688387801729e-05, + "loss": 0.4701, + "step": 178160 + }, + { + "epoch": 8.849210291049966, + "grad_norm": 0.1923828125, + "learning_rate": 9.206715009436773e-05, + "loss": 0.4669, + "step": 178170 + }, + { + "epoch": 8.849706963345584, + "grad_norm": 0.17578125, + "learning_rate": 9.202741631071819e-05, + "loss": 0.4869, + "step": 178180 + }, + { + "epoch": 8.850203635641204, + "grad_norm": 0.1787109375, + "learning_rate": 9.198768252706865e-05, + "loss": 0.4847, + "step": 178190 + }, + { + "epoch": 8.850700307936822, + "grad_norm": 0.19921875, + "learning_rate": 9.19479487434191e-05, + "loss": 0.5305, + "step": 178200 + }, + { + "epoch": 8.851196980232443, + "grad_norm": 0.24609375, + "learning_rate": 9.190821495976955e-05, + "loss": 0.4845, + "step": 178210 + }, + { + "epoch": 8.851693652528063, + "grad_norm": 0.1943359375, + "learning_rate": 9.186848117612001e-05, + "loss": 0.4765, + "step": 178220 + }, + { + "epoch": 8.852190324823681, + "grad_norm": 0.2109375, + "learning_rate": 9.182874739247045e-05, + "loss": 0.4727, + "step": 178230 + }, + { + "epoch": 8.852686997119301, + "grad_norm": 0.1826171875, + "learning_rate": 9.178901360882091e-05, + "loss": 0.4844, + "step": 178240 + }, + { + "epoch": 8.85318366941492, + "grad_norm": 0.1845703125, + "learning_rate": 9.174927982517135e-05, + "loss": 0.4649, + "step": 178250 + }, + { + "epoch": 8.85368034171054, + "grad_norm": 0.1669921875, + "learning_rate": 9.170954604152181e-05, + "loss": 0.5175, + "step": 178260 + }, + { + "epoch": 8.854177014006158, + "grad_norm": 0.2001953125, + "learning_rate": 9.166981225787226e-05, + "loss": 0.5003, + "step": 178270 + }, + { + "epoch": 8.854673686301778, + "grad_norm": 0.220703125, + "learning_rate": 9.163007847422271e-05, + "loss": 0.4849, + "step": 178280 + }, + { + "epoch": 8.855170358597398, + "grad_norm": 0.169921875, + "learning_rate": 9.159034469057316e-05, + "loss": 0.4885, + "step": 178290 + }, + { + "epoch": 8.855667030893017, + "grad_norm": 0.193359375, + "learning_rate": 9.155061090692362e-05, + "loss": 0.4961, + "step": 178300 + }, + { + "epoch": 8.856163703188637, + "grad_norm": 0.1689453125, + "learning_rate": 9.151087712327406e-05, + "loss": 0.4878, + "step": 178310 + }, + { + "epoch": 8.856660375484255, + "grad_norm": 0.1884765625, + "learning_rate": 9.147114333962452e-05, + "loss": 0.523, + "step": 178320 + }, + { + "epoch": 8.857157047779875, + "grad_norm": 0.18359375, + "learning_rate": 9.143140955597496e-05, + "loss": 0.4829, + "step": 178330 + }, + { + "epoch": 8.857653720075493, + "grad_norm": 0.185546875, + "learning_rate": 9.139167577232542e-05, + "loss": 0.4897, + "step": 178340 + }, + { + "epoch": 8.858150392371114, + "grad_norm": 0.19140625, + "learning_rate": 9.135194198867586e-05, + "loss": 0.4924, + "step": 178350 + }, + { + "epoch": 8.858647064666734, + "grad_norm": 0.1923828125, + "learning_rate": 9.131220820502632e-05, + "loss": 0.4894, + "step": 178360 + }, + { + "epoch": 8.859143736962352, + "grad_norm": 0.2080078125, + "learning_rate": 9.127247442137678e-05, + "loss": 0.4551, + "step": 178370 + }, + { + "epoch": 8.859640409257972, + "grad_norm": 0.1904296875, + "learning_rate": 9.123274063772724e-05, + "loss": 0.4811, + "step": 178380 + }, + { + "epoch": 8.86013708155359, + "grad_norm": 0.1953125, + "learning_rate": 9.11930068540777e-05, + "loss": 0.4676, + "step": 178390 + }, + { + "epoch": 8.86063375384921, + "grad_norm": 0.1845703125, + "learning_rate": 9.115327307042814e-05, + "loss": 0.4773, + "step": 178400 + }, + { + "epoch": 8.861130426144829, + "grad_norm": 0.2001953125, + "learning_rate": 9.11135392867786e-05, + "loss": 0.4545, + "step": 178410 + }, + { + "epoch": 8.861627098440449, + "grad_norm": 0.185546875, + "learning_rate": 9.107380550312904e-05, + "loss": 0.4835, + "step": 178420 + }, + { + "epoch": 8.86212377073607, + "grad_norm": 0.1806640625, + "learning_rate": 9.10340717194795e-05, + "loss": 0.466, + "step": 178430 + }, + { + "epoch": 8.862620443031688, + "grad_norm": 0.1728515625, + "learning_rate": 9.099433793582994e-05, + "loss": 0.4657, + "step": 178440 + }, + { + "epoch": 8.863117115327308, + "grad_norm": 0.2001953125, + "learning_rate": 9.09546041521804e-05, + "loss": 0.5077, + "step": 178450 + }, + { + "epoch": 8.863613787622926, + "grad_norm": 0.251953125, + "learning_rate": 9.091487036853085e-05, + "loss": 0.4968, + "step": 178460 + }, + { + "epoch": 8.864110459918546, + "grad_norm": 0.189453125, + "learning_rate": 9.08751365848813e-05, + "loss": 0.508, + "step": 178470 + }, + { + "epoch": 8.864607132214164, + "grad_norm": 0.189453125, + "learning_rate": 9.083540280123175e-05, + "loss": 0.4876, + "step": 178480 + }, + { + "epoch": 8.865103804509785, + "grad_norm": 0.181640625, + "learning_rate": 9.07956690175822e-05, + "loss": 0.4909, + "step": 178490 + }, + { + "epoch": 8.865600476805405, + "grad_norm": 0.2470703125, + "learning_rate": 9.075593523393265e-05, + "loss": 0.4989, + "step": 178500 + }, + { + "epoch": 8.866097149101023, + "grad_norm": 0.19921875, + "learning_rate": 9.071620145028311e-05, + "loss": 0.5079, + "step": 178510 + }, + { + "epoch": 8.866593821396643, + "grad_norm": 0.1806640625, + "learning_rate": 9.067646766663355e-05, + "loss": 0.4881, + "step": 178520 + }, + { + "epoch": 8.867090493692261, + "grad_norm": 0.212890625, + "learning_rate": 9.063673388298401e-05, + "loss": 0.5281, + "step": 178530 + }, + { + "epoch": 8.867587165987882, + "grad_norm": 0.1875, + "learning_rate": 9.059700009933447e-05, + "loss": 0.4902, + "step": 178540 + }, + { + "epoch": 8.8680838382835, + "grad_norm": 0.171875, + "learning_rate": 9.055726631568493e-05, + "loss": 0.5051, + "step": 178550 + }, + { + "epoch": 8.86858051057912, + "grad_norm": 0.1728515625, + "learning_rate": 9.051753253203537e-05, + "loss": 0.5191, + "step": 178560 + }, + { + "epoch": 8.86907718287474, + "grad_norm": 0.1826171875, + "learning_rate": 9.047779874838583e-05, + "loss": 0.5161, + "step": 178570 + }, + { + "epoch": 8.869573855170358, + "grad_norm": 0.1923828125, + "learning_rate": 9.043806496473627e-05, + "loss": 0.4767, + "step": 178580 + }, + { + "epoch": 8.870070527465979, + "grad_norm": 0.1962890625, + "learning_rate": 9.039833118108673e-05, + "loss": 0.5076, + "step": 178590 + }, + { + "epoch": 8.870567199761597, + "grad_norm": 0.1826171875, + "learning_rate": 9.035859739743717e-05, + "loss": 0.4949, + "step": 178600 + }, + { + "epoch": 8.871063872057217, + "grad_norm": 0.197265625, + "learning_rate": 9.031886361378763e-05, + "loss": 0.5077, + "step": 178610 + }, + { + "epoch": 8.871560544352835, + "grad_norm": 0.208984375, + "learning_rate": 9.027912983013808e-05, + "loss": 0.4732, + "step": 178620 + }, + { + "epoch": 8.872057216648455, + "grad_norm": 0.16796875, + "learning_rate": 9.023939604648853e-05, + "loss": 0.4695, + "step": 178630 + }, + { + "epoch": 8.872553888944076, + "grad_norm": 0.19921875, + "learning_rate": 9.019966226283898e-05, + "loss": 0.4374, + "step": 178640 + }, + { + "epoch": 8.873050561239694, + "grad_norm": 0.2333984375, + "learning_rate": 9.015992847918944e-05, + "loss": 0.4909, + "step": 178650 + }, + { + "epoch": 8.873547233535314, + "grad_norm": 0.1826171875, + "learning_rate": 9.012019469553988e-05, + "loss": 0.4758, + "step": 178660 + }, + { + "epoch": 8.874043905830932, + "grad_norm": 0.197265625, + "learning_rate": 9.008046091189034e-05, + "loss": 0.4834, + "step": 178670 + }, + { + "epoch": 8.874540578126553, + "grad_norm": 0.1884765625, + "learning_rate": 9.004072712824078e-05, + "loss": 0.5022, + "step": 178680 + }, + { + "epoch": 8.87503725042217, + "grad_norm": 0.18359375, + "learning_rate": 9.000099334459124e-05, + "loss": 0.4888, + "step": 178690 + }, + { + "epoch": 8.875533922717791, + "grad_norm": 0.2236328125, + "learning_rate": 8.996125956094168e-05, + "loss": 0.4553, + "step": 178700 + }, + { + "epoch": 8.876030595013411, + "grad_norm": 0.1982421875, + "learning_rate": 8.992152577729214e-05, + "loss": 0.4859, + "step": 178710 + }, + { + "epoch": 8.87652726730903, + "grad_norm": 0.216796875, + "learning_rate": 8.98817919936426e-05, + "loss": 0.5082, + "step": 178720 + }, + { + "epoch": 8.87702393960465, + "grad_norm": 0.1845703125, + "learning_rate": 8.984205820999306e-05, + "loss": 0.508, + "step": 178730 + }, + { + "epoch": 8.877520611900268, + "grad_norm": 0.2041015625, + "learning_rate": 8.98023244263435e-05, + "loss": 0.4816, + "step": 178740 + }, + { + "epoch": 8.878017284195888, + "grad_norm": 0.1787109375, + "learning_rate": 8.976259064269396e-05, + "loss": 0.4698, + "step": 178750 + }, + { + "epoch": 8.878513956491506, + "grad_norm": 0.181640625, + "learning_rate": 8.97228568590444e-05, + "loss": 0.4767, + "step": 178760 + }, + { + "epoch": 8.879010628787126, + "grad_norm": 0.205078125, + "learning_rate": 8.968312307539486e-05, + "loss": 0.4936, + "step": 178770 + }, + { + "epoch": 8.879507301082745, + "grad_norm": 0.1689453125, + "learning_rate": 8.96433892917453e-05, + "loss": 0.4932, + "step": 178780 + }, + { + "epoch": 8.880003973378365, + "grad_norm": 0.216796875, + "learning_rate": 8.960365550809576e-05, + "loss": 0.5014, + "step": 178790 + }, + { + "epoch": 8.880500645673985, + "grad_norm": 0.1865234375, + "learning_rate": 8.956392172444621e-05, + "loss": 0.4737, + "step": 178800 + }, + { + "epoch": 8.880997317969603, + "grad_norm": 0.1845703125, + "learning_rate": 8.952418794079667e-05, + "loss": 0.4558, + "step": 178810 + }, + { + "epoch": 8.881493990265223, + "grad_norm": 0.1865234375, + "learning_rate": 8.948445415714712e-05, + "loss": 0.511, + "step": 178820 + }, + { + "epoch": 8.881990662560842, + "grad_norm": 0.2265625, + "learning_rate": 8.944472037349757e-05, + "loss": 0.4751, + "step": 178830 + }, + { + "epoch": 8.882487334856462, + "grad_norm": 0.1640625, + "learning_rate": 8.940498658984803e-05, + "loss": 0.4934, + "step": 178840 + }, + { + "epoch": 8.88298400715208, + "grad_norm": 0.2021484375, + "learning_rate": 8.936525280619847e-05, + "loss": 0.5163, + "step": 178850 + }, + { + "epoch": 8.8834806794477, + "grad_norm": 0.1767578125, + "learning_rate": 8.932551902254893e-05, + "loss": 0.5046, + "step": 178860 + }, + { + "epoch": 8.88397735174332, + "grad_norm": 0.1962890625, + "learning_rate": 8.928578523889937e-05, + "loss": 0.4613, + "step": 178870 + }, + { + "epoch": 8.884474024038939, + "grad_norm": 0.177734375, + "learning_rate": 8.924605145524983e-05, + "loss": 0.5034, + "step": 178880 + }, + { + "epoch": 8.884970696334559, + "grad_norm": 0.19921875, + "learning_rate": 8.920631767160029e-05, + "loss": 0.5281, + "step": 178890 + }, + { + "epoch": 8.885467368630177, + "grad_norm": 0.177734375, + "learning_rate": 8.916658388795075e-05, + "loss": 0.5001, + "step": 178900 + }, + { + "epoch": 8.885964040925797, + "grad_norm": 0.173828125, + "learning_rate": 8.912685010430119e-05, + "loss": 0.4436, + "step": 178910 + }, + { + "epoch": 8.886460713221416, + "grad_norm": 0.2001953125, + "learning_rate": 8.908711632065165e-05, + "loss": 0.4834, + "step": 178920 + }, + { + "epoch": 8.886957385517036, + "grad_norm": 0.1884765625, + "learning_rate": 8.904738253700209e-05, + "loss": 0.4922, + "step": 178930 + }, + { + "epoch": 8.887454057812656, + "grad_norm": 0.1845703125, + "learning_rate": 8.900764875335255e-05, + "loss": 0.4558, + "step": 178940 + }, + { + "epoch": 8.887950730108274, + "grad_norm": 0.19921875, + "learning_rate": 8.8967914969703e-05, + "loss": 0.5002, + "step": 178950 + }, + { + "epoch": 8.888447402403894, + "grad_norm": 0.1748046875, + "learning_rate": 8.892818118605345e-05, + "loss": 0.4759, + "step": 178960 + }, + { + "epoch": 8.888944074699513, + "grad_norm": 0.1689453125, + "learning_rate": 8.88884474024039e-05, + "loss": 0.4968, + "step": 178970 + }, + { + "epoch": 8.889440746995133, + "grad_norm": 0.2041015625, + "learning_rate": 8.884871361875435e-05, + "loss": 0.4812, + "step": 178980 + }, + { + "epoch": 8.889937419290751, + "grad_norm": 0.177734375, + "learning_rate": 8.88089798351048e-05, + "loss": 0.4873, + "step": 178990 + }, + { + "epoch": 8.890434091586371, + "grad_norm": 0.177734375, + "learning_rate": 8.876924605145526e-05, + "loss": 0.4781, + "step": 179000 + }, + { + "epoch": 8.890930763881991, + "grad_norm": 0.1904296875, + "learning_rate": 8.87295122678057e-05, + "loss": 0.4674, + "step": 179010 + }, + { + "epoch": 8.89142743617761, + "grad_norm": 0.1904296875, + "learning_rate": 8.868977848415616e-05, + "loss": 0.4771, + "step": 179020 + }, + { + "epoch": 8.89192410847323, + "grad_norm": 0.1953125, + "learning_rate": 8.86500447005066e-05, + "loss": 0.4913, + "step": 179030 + }, + { + "epoch": 8.892420780768848, + "grad_norm": 0.1962890625, + "learning_rate": 8.861031091685706e-05, + "loss": 0.5114, + "step": 179040 + }, + { + "epoch": 8.892917453064468, + "grad_norm": 0.1943359375, + "learning_rate": 8.857057713320752e-05, + "loss": 0.4954, + "step": 179050 + }, + { + "epoch": 8.893414125360087, + "grad_norm": 0.17578125, + "learning_rate": 8.853084334955796e-05, + "loss": 0.5121, + "step": 179060 + }, + { + "epoch": 8.893910797655707, + "grad_norm": 0.1787109375, + "learning_rate": 8.849110956590842e-05, + "loss": 0.5159, + "step": 179070 + }, + { + "epoch": 8.894407469951327, + "grad_norm": 0.181640625, + "learning_rate": 8.845137578225888e-05, + "loss": 0.4863, + "step": 179080 + }, + { + "epoch": 8.894904142246945, + "grad_norm": 0.1875, + "learning_rate": 8.841164199860932e-05, + "loss": 0.4863, + "step": 179090 + }, + { + "epoch": 8.895400814542565, + "grad_norm": 0.177734375, + "learning_rate": 8.837190821495978e-05, + "loss": 0.516, + "step": 179100 + }, + { + "epoch": 8.895897486838184, + "grad_norm": 0.1767578125, + "learning_rate": 8.833217443131022e-05, + "loss": 0.5159, + "step": 179110 + }, + { + "epoch": 8.896394159133804, + "grad_norm": 0.201171875, + "learning_rate": 8.829244064766068e-05, + "loss": 0.4884, + "step": 179120 + }, + { + "epoch": 8.896890831429422, + "grad_norm": 0.173828125, + "learning_rate": 8.825270686401113e-05, + "loss": 0.5105, + "step": 179130 + }, + { + "epoch": 8.897387503725042, + "grad_norm": 0.1728515625, + "learning_rate": 8.821297308036158e-05, + "loss": 0.5287, + "step": 179140 + }, + { + "epoch": 8.897884176020662, + "grad_norm": 0.1748046875, + "learning_rate": 8.817323929671203e-05, + "loss": 0.4891, + "step": 179150 + }, + { + "epoch": 8.89838084831628, + "grad_norm": 0.1787109375, + "learning_rate": 8.813350551306249e-05, + "loss": 0.488, + "step": 179160 + }, + { + "epoch": 8.8988775206119, + "grad_norm": 0.2099609375, + "learning_rate": 8.809377172941293e-05, + "loss": 0.4921, + "step": 179170 + }, + { + "epoch": 8.89937419290752, + "grad_norm": 0.1962890625, + "learning_rate": 8.805403794576339e-05, + "loss": 0.5335, + "step": 179180 + }, + { + "epoch": 8.89987086520314, + "grad_norm": 0.1796875, + "learning_rate": 8.801430416211383e-05, + "loss": 0.5144, + "step": 179190 + }, + { + "epoch": 8.900367537498758, + "grad_norm": 0.17578125, + "learning_rate": 8.797457037846429e-05, + "loss": 0.5105, + "step": 179200 + }, + { + "epoch": 8.900864209794378, + "grad_norm": 0.189453125, + "learning_rate": 8.793483659481473e-05, + "loss": 0.4774, + "step": 179210 + }, + { + "epoch": 8.901360882089996, + "grad_norm": 0.2021484375, + "learning_rate": 8.789510281116519e-05, + "loss": 0.5074, + "step": 179220 + }, + { + "epoch": 8.901857554385616, + "grad_norm": 0.1669921875, + "learning_rate": 8.785536902751565e-05, + "loss": 0.4918, + "step": 179230 + }, + { + "epoch": 8.902354226681236, + "grad_norm": 0.177734375, + "learning_rate": 8.781563524386611e-05, + "loss": 0.4707, + "step": 179240 + }, + { + "epoch": 8.902850898976855, + "grad_norm": 0.1982421875, + "learning_rate": 8.777590146021657e-05, + "loss": 0.4949, + "step": 179250 + }, + { + "epoch": 8.903347571272475, + "grad_norm": 0.2138671875, + "learning_rate": 8.773616767656701e-05, + "loss": 0.4835, + "step": 179260 + }, + { + "epoch": 8.903844243568093, + "grad_norm": 0.1962890625, + "learning_rate": 8.769643389291747e-05, + "loss": 0.4933, + "step": 179270 + }, + { + "epoch": 8.904340915863713, + "grad_norm": 0.177734375, + "learning_rate": 8.765670010926791e-05, + "loss": 0.4601, + "step": 179280 + }, + { + "epoch": 8.904837588159332, + "grad_norm": 0.17578125, + "learning_rate": 8.761696632561837e-05, + "loss": 0.4871, + "step": 179290 + }, + { + "epoch": 8.905334260454952, + "grad_norm": 0.1826171875, + "learning_rate": 8.757723254196881e-05, + "loss": 0.4939, + "step": 179300 + }, + { + "epoch": 8.905830932750572, + "grad_norm": 0.1904296875, + "learning_rate": 8.753749875831927e-05, + "loss": 0.4796, + "step": 179310 + }, + { + "epoch": 8.90632760504619, + "grad_norm": 0.2001953125, + "learning_rate": 8.749776497466972e-05, + "loss": 0.4798, + "step": 179320 + }, + { + "epoch": 8.90682427734181, + "grad_norm": 0.20703125, + "learning_rate": 8.745803119102017e-05, + "loss": 0.4824, + "step": 179330 + }, + { + "epoch": 8.907320949637429, + "grad_norm": 0.166015625, + "learning_rate": 8.741829740737062e-05, + "loss": 0.5187, + "step": 179340 + }, + { + "epoch": 8.907817621933049, + "grad_norm": 0.1748046875, + "learning_rate": 8.737856362372108e-05, + "loss": 0.5074, + "step": 179350 + }, + { + "epoch": 8.908314294228667, + "grad_norm": 0.18359375, + "learning_rate": 8.733882984007152e-05, + "loss": 0.4614, + "step": 179360 + }, + { + "epoch": 8.908810966524287, + "grad_norm": 0.1796875, + "learning_rate": 8.729909605642198e-05, + "loss": 0.4654, + "step": 179370 + }, + { + "epoch": 8.909307638819907, + "grad_norm": 0.2021484375, + "learning_rate": 8.725936227277242e-05, + "loss": 0.4884, + "step": 179380 + }, + { + "epoch": 8.909804311115526, + "grad_norm": 0.2314453125, + "learning_rate": 8.721962848912288e-05, + "loss": 0.5112, + "step": 179390 + }, + { + "epoch": 8.910300983411146, + "grad_norm": 0.1943359375, + "learning_rate": 8.717989470547334e-05, + "loss": 0.4858, + "step": 179400 + }, + { + "epoch": 8.910797655706764, + "grad_norm": 0.189453125, + "learning_rate": 8.714016092182378e-05, + "loss": 0.5039, + "step": 179410 + }, + { + "epoch": 8.911294328002384, + "grad_norm": 0.1748046875, + "learning_rate": 8.710042713817424e-05, + "loss": 0.4878, + "step": 179420 + }, + { + "epoch": 8.911791000298003, + "grad_norm": 0.2001953125, + "learning_rate": 8.70606933545247e-05, + "loss": 0.4789, + "step": 179430 + }, + { + "epoch": 8.912287672593623, + "grad_norm": 0.18359375, + "learning_rate": 8.702095957087514e-05, + "loss": 0.4978, + "step": 179440 + }, + { + "epoch": 8.912784344889243, + "grad_norm": 0.1806640625, + "learning_rate": 8.69812257872256e-05, + "loss": 0.4867, + "step": 179450 + }, + { + "epoch": 8.913281017184861, + "grad_norm": 0.1884765625, + "learning_rate": 8.694149200357604e-05, + "loss": 0.4864, + "step": 179460 + }, + { + "epoch": 8.913777689480481, + "grad_norm": 0.1826171875, + "learning_rate": 8.69017582199265e-05, + "loss": 0.489, + "step": 179470 + }, + { + "epoch": 8.9142743617761, + "grad_norm": 0.19921875, + "learning_rate": 8.686202443627695e-05, + "loss": 0.487, + "step": 179480 + }, + { + "epoch": 8.91477103407172, + "grad_norm": 0.2001953125, + "learning_rate": 8.68222906526274e-05, + "loss": 0.4775, + "step": 179490 + }, + { + "epoch": 8.915267706367338, + "grad_norm": 0.197265625, + "learning_rate": 8.678255686897785e-05, + "loss": 0.4653, + "step": 179500 + }, + { + "epoch": 8.915764378662958, + "grad_norm": 0.1767578125, + "learning_rate": 8.67428230853283e-05, + "loss": 0.4398, + "step": 179510 + }, + { + "epoch": 8.916261050958578, + "grad_norm": 0.1845703125, + "learning_rate": 8.670308930167875e-05, + "loss": 0.507, + "step": 179520 + }, + { + "epoch": 8.916757723254197, + "grad_norm": 0.189453125, + "learning_rate": 8.666335551802921e-05, + "loss": 0.4761, + "step": 179530 + }, + { + "epoch": 8.917254395549817, + "grad_norm": 0.19921875, + "learning_rate": 8.662362173437965e-05, + "loss": 0.4468, + "step": 179540 + }, + { + "epoch": 8.917751067845435, + "grad_norm": 0.1787109375, + "learning_rate": 8.658388795073011e-05, + "loss": 0.4619, + "step": 179550 + }, + { + "epoch": 8.918247740141055, + "grad_norm": 0.203125, + "learning_rate": 8.654415416708055e-05, + "loss": 0.4887, + "step": 179560 + }, + { + "epoch": 8.918744412436673, + "grad_norm": 0.1904296875, + "learning_rate": 8.650442038343101e-05, + "loss": 0.4776, + "step": 179570 + }, + { + "epoch": 8.919241084732294, + "grad_norm": 0.2021484375, + "learning_rate": 8.646468659978147e-05, + "loss": 0.5162, + "step": 179580 + }, + { + "epoch": 8.919737757027914, + "grad_norm": 0.201171875, + "learning_rate": 8.642495281613193e-05, + "loss": 0.4993, + "step": 179590 + }, + { + "epoch": 8.920234429323532, + "grad_norm": 0.1953125, + "learning_rate": 8.638521903248237e-05, + "loss": 0.4697, + "step": 179600 + }, + { + "epoch": 8.920731101619152, + "grad_norm": 0.2119140625, + "learning_rate": 8.634548524883283e-05, + "loss": 0.4912, + "step": 179610 + }, + { + "epoch": 8.92122777391477, + "grad_norm": 0.1865234375, + "learning_rate": 8.630575146518327e-05, + "loss": 0.4815, + "step": 179620 + }, + { + "epoch": 8.92172444621039, + "grad_norm": 0.193359375, + "learning_rate": 8.626601768153373e-05, + "loss": 0.4812, + "step": 179630 + }, + { + "epoch": 8.922221118506009, + "grad_norm": 0.1708984375, + "learning_rate": 8.622628389788418e-05, + "loss": 0.4815, + "step": 179640 + }, + { + "epoch": 8.922717790801629, + "grad_norm": 0.1875, + "learning_rate": 8.618655011423463e-05, + "loss": 0.5083, + "step": 179650 + }, + { + "epoch": 8.92321446309725, + "grad_norm": 0.20703125, + "learning_rate": 8.614681633058509e-05, + "loss": 0.4818, + "step": 179660 + }, + { + "epoch": 8.923711135392868, + "grad_norm": 0.1767578125, + "learning_rate": 8.610708254693554e-05, + "loss": 0.4998, + "step": 179670 + }, + { + "epoch": 8.924207807688488, + "grad_norm": 0.177734375, + "learning_rate": 8.606734876328599e-05, + "loss": 0.48, + "step": 179680 + }, + { + "epoch": 8.924704479984106, + "grad_norm": 0.1865234375, + "learning_rate": 8.602761497963644e-05, + "loss": 0.503, + "step": 179690 + }, + { + "epoch": 8.925201152279726, + "grad_norm": 0.1787109375, + "learning_rate": 8.59878811959869e-05, + "loss": 0.4443, + "step": 179700 + }, + { + "epoch": 8.925697824575344, + "grad_norm": 0.177734375, + "learning_rate": 8.594814741233734e-05, + "loss": 0.4841, + "step": 179710 + }, + { + "epoch": 8.926194496870965, + "grad_norm": 0.1826171875, + "learning_rate": 8.59084136286878e-05, + "loss": 0.5048, + "step": 179720 + }, + { + "epoch": 8.926691169166585, + "grad_norm": 0.201171875, + "learning_rate": 8.586867984503824e-05, + "loss": 0.5001, + "step": 179730 + }, + { + "epoch": 8.927187841462203, + "grad_norm": 0.1708984375, + "learning_rate": 8.58289460613887e-05, + "loss": 0.4688, + "step": 179740 + }, + { + "epoch": 8.927684513757823, + "grad_norm": 0.1884765625, + "learning_rate": 8.578921227773916e-05, + "loss": 0.4602, + "step": 179750 + }, + { + "epoch": 8.928181186053441, + "grad_norm": 0.236328125, + "learning_rate": 8.57494784940896e-05, + "loss": 0.505, + "step": 179760 + }, + { + "epoch": 8.928677858349062, + "grad_norm": 0.1708984375, + "learning_rate": 8.570974471044006e-05, + "loss": 0.4495, + "step": 179770 + }, + { + "epoch": 8.92917453064468, + "grad_norm": 0.1962890625, + "learning_rate": 8.567001092679052e-05, + "loss": 0.4841, + "step": 179780 + }, + { + "epoch": 8.9296712029403, + "grad_norm": 0.1806640625, + "learning_rate": 8.563027714314096e-05, + "loss": 0.4862, + "step": 179790 + }, + { + "epoch": 8.93016787523592, + "grad_norm": 0.181640625, + "learning_rate": 8.559054335949142e-05, + "loss": 0.5101, + "step": 179800 + }, + { + "epoch": 8.930664547531538, + "grad_norm": 0.2216796875, + "learning_rate": 8.555080957584186e-05, + "loss": 0.4651, + "step": 179810 + }, + { + "epoch": 8.931161219827159, + "grad_norm": 0.20703125, + "learning_rate": 8.551107579219232e-05, + "loss": 0.4696, + "step": 179820 + }, + { + "epoch": 8.931657892122777, + "grad_norm": 0.1982421875, + "learning_rate": 8.547134200854277e-05, + "loss": 0.5091, + "step": 179830 + }, + { + "epoch": 8.932154564418397, + "grad_norm": 0.19140625, + "learning_rate": 8.543160822489322e-05, + "loss": 0.4927, + "step": 179840 + }, + { + "epoch": 8.932651236714015, + "grad_norm": 0.1806640625, + "learning_rate": 8.539187444124367e-05, + "loss": 0.4911, + "step": 179850 + }, + { + "epoch": 8.933147909009636, + "grad_norm": 0.1943359375, + "learning_rate": 8.535214065759413e-05, + "loss": 0.52, + "step": 179860 + }, + { + "epoch": 8.933644581305256, + "grad_norm": 0.2001953125, + "learning_rate": 8.531240687394457e-05, + "loss": 0.4781, + "step": 179870 + }, + { + "epoch": 8.934141253600874, + "grad_norm": 0.1845703125, + "learning_rate": 8.527267309029503e-05, + "loss": 0.4909, + "step": 179880 + }, + { + "epoch": 8.934637925896494, + "grad_norm": 0.205078125, + "learning_rate": 8.523293930664547e-05, + "loss": 0.4733, + "step": 179890 + }, + { + "epoch": 8.935134598192112, + "grad_norm": 0.203125, + "learning_rate": 8.519320552299593e-05, + "loss": 0.499, + "step": 179900 + }, + { + "epoch": 8.935631270487733, + "grad_norm": 0.1708984375, + "learning_rate": 8.515347173934637e-05, + "loss": 0.4652, + "step": 179910 + }, + { + "epoch": 8.93612794278335, + "grad_norm": 0.1875, + "learning_rate": 8.511373795569683e-05, + "loss": 0.4774, + "step": 179920 + }, + { + "epoch": 8.936624615078971, + "grad_norm": 0.2236328125, + "learning_rate": 8.507400417204729e-05, + "loss": 0.5377, + "step": 179930 + }, + { + "epoch": 8.937121287374591, + "grad_norm": 0.1845703125, + "learning_rate": 8.503427038839775e-05, + "loss": 0.4984, + "step": 179940 + }, + { + "epoch": 8.93761795967021, + "grad_norm": 0.193359375, + "learning_rate": 8.499453660474819e-05, + "loss": 0.4954, + "step": 179950 + }, + { + "epoch": 8.93811463196583, + "grad_norm": 0.1767578125, + "learning_rate": 8.495480282109865e-05, + "loss": 0.4871, + "step": 179960 + }, + { + "epoch": 8.938611304261448, + "grad_norm": 0.185546875, + "learning_rate": 8.49150690374491e-05, + "loss": 0.4923, + "step": 179970 + }, + { + "epoch": 8.939107976557068, + "grad_norm": 0.232421875, + "learning_rate": 8.487533525379955e-05, + "loss": 0.4952, + "step": 179980 + }, + { + "epoch": 8.939604648852686, + "grad_norm": 0.193359375, + "learning_rate": 8.483560147015e-05, + "loss": 0.5052, + "step": 179990 + }, + { + "epoch": 8.940101321148306, + "grad_norm": 0.1865234375, + "learning_rate": 8.479586768650045e-05, + "loss": 0.4791, + "step": 180000 + }, + { + "epoch": 8.940597993443927, + "grad_norm": 0.1875, + "learning_rate": 8.47561339028509e-05, + "loss": 0.4606, + "step": 180010 + }, + { + "epoch": 8.941094665739545, + "grad_norm": 0.1865234375, + "learning_rate": 8.471640011920136e-05, + "loss": 0.4786, + "step": 180020 + }, + { + "epoch": 8.941591338035165, + "grad_norm": 0.2060546875, + "learning_rate": 8.46766663355518e-05, + "loss": 0.468, + "step": 180030 + }, + { + "epoch": 8.942088010330783, + "grad_norm": 0.1796875, + "learning_rate": 8.463693255190226e-05, + "loss": 0.4834, + "step": 180040 + }, + { + "epoch": 8.942584682626403, + "grad_norm": 0.2021484375, + "learning_rate": 8.45971987682527e-05, + "loss": 0.4883, + "step": 180050 + }, + { + "epoch": 8.943081354922022, + "grad_norm": 0.19140625, + "learning_rate": 8.455746498460316e-05, + "loss": 0.4961, + "step": 180060 + }, + { + "epoch": 8.943578027217642, + "grad_norm": 0.1640625, + "learning_rate": 8.451773120095362e-05, + "loss": 0.4694, + "step": 180070 + }, + { + "epoch": 8.944074699513262, + "grad_norm": 0.1865234375, + "learning_rate": 8.447799741730406e-05, + "loss": 0.4684, + "step": 180080 + }, + { + "epoch": 8.94457137180888, + "grad_norm": 0.2158203125, + "learning_rate": 8.443826363365452e-05, + "loss": 0.4963, + "step": 180090 + }, + { + "epoch": 8.9450680441045, + "grad_norm": 0.2158203125, + "learning_rate": 8.439852985000498e-05, + "loss": 0.482, + "step": 180100 + }, + { + "epoch": 8.945564716400119, + "grad_norm": 0.20703125, + "learning_rate": 8.435879606635542e-05, + "loss": 0.5205, + "step": 180110 + }, + { + "epoch": 8.946061388695739, + "grad_norm": 0.1787109375, + "learning_rate": 8.431906228270588e-05, + "loss": 0.4627, + "step": 180120 + }, + { + "epoch": 8.946558060991357, + "grad_norm": 0.197265625, + "learning_rate": 8.427932849905634e-05, + "loss": 0.4897, + "step": 180130 + }, + { + "epoch": 8.947054733286977, + "grad_norm": 0.1826171875, + "learning_rate": 8.423959471540678e-05, + "loss": 0.4652, + "step": 180140 + }, + { + "epoch": 8.947551405582598, + "grad_norm": 0.201171875, + "learning_rate": 8.419986093175724e-05, + "loss": 0.5093, + "step": 180150 + }, + { + "epoch": 8.948048077878216, + "grad_norm": 0.18359375, + "learning_rate": 8.416012714810768e-05, + "loss": 0.4991, + "step": 180160 + }, + { + "epoch": 8.948544750173836, + "grad_norm": 0.1669921875, + "learning_rate": 8.412039336445814e-05, + "loss": 0.4651, + "step": 180170 + }, + { + "epoch": 8.949041422469454, + "grad_norm": 0.1767578125, + "learning_rate": 8.408065958080859e-05, + "loss": 0.4559, + "step": 180180 + }, + { + "epoch": 8.949538094765074, + "grad_norm": 0.193359375, + "learning_rate": 8.404092579715904e-05, + "loss": 0.5146, + "step": 180190 + }, + { + "epoch": 8.950034767060693, + "grad_norm": 0.1953125, + "learning_rate": 8.400119201350949e-05, + "loss": 0.495, + "step": 180200 + }, + { + "epoch": 8.950531439356313, + "grad_norm": 0.19921875, + "learning_rate": 8.396145822985994e-05, + "loss": 0.4924, + "step": 180210 + }, + { + "epoch": 8.951028111651931, + "grad_norm": 0.1826171875, + "learning_rate": 8.392172444621039e-05, + "loss": 0.4889, + "step": 180220 + }, + { + "epoch": 8.951524783947551, + "grad_norm": 0.18359375, + "learning_rate": 8.388199066256085e-05, + "loss": 0.4817, + "step": 180230 + }, + { + "epoch": 8.952021456243171, + "grad_norm": 0.1796875, + "learning_rate": 8.384225687891129e-05, + "loss": 0.4716, + "step": 180240 + }, + { + "epoch": 8.95251812853879, + "grad_norm": 0.2138671875, + "learning_rate": 8.380252309526175e-05, + "loss": 0.501, + "step": 180250 + }, + { + "epoch": 8.95301480083441, + "grad_norm": 0.2021484375, + "learning_rate": 8.37627893116122e-05, + "loss": 0.4933, + "step": 180260 + }, + { + "epoch": 8.953511473130028, + "grad_norm": 0.1845703125, + "learning_rate": 8.372305552796265e-05, + "loss": 0.4752, + "step": 180270 + }, + { + "epoch": 8.954008145425648, + "grad_norm": 0.18359375, + "learning_rate": 8.368332174431311e-05, + "loss": 0.4896, + "step": 180280 + }, + { + "epoch": 8.954504817721267, + "grad_norm": 0.220703125, + "learning_rate": 8.364358796066357e-05, + "loss": 0.4925, + "step": 180290 + }, + { + "epoch": 8.955001490016887, + "grad_norm": 0.185546875, + "learning_rate": 8.360385417701401e-05, + "loss": 0.5158, + "step": 180300 + }, + { + "epoch": 8.955498162312507, + "grad_norm": 0.2138671875, + "learning_rate": 8.356412039336447e-05, + "loss": 0.4804, + "step": 180310 + }, + { + "epoch": 8.955994834608125, + "grad_norm": 0.1943359375, + "learning_rate": 8.352438660971491e-05, + "loss": 0.4729, + "step": 180320 + }, + { + "epoch": 8.956491506903745, + "grad_norm": 0.201171875, + "learning_rate": 8.348465282606537e-05, + "loss": 0.5196, + "step": 180330 + }, + { + "epoch": 8.956988179199364, + "grad_norm": 0.1748046875, + "learning_rate": 8.344491904241582e-05, + "loss": 0.4813, + "step": 180340 + }, + { + "epoch": 8.957484851494984, + "grad_norm": 0.2197265625, + "learning_rate": 8.340518525876627e-05, + "loss": 0.4854, + "step": 180350 + }, + { + "epoch": 8.957981523790602, + "grad_norm": 0.1787109375, + "learning_rate": 8.336545147511672e-05, + "loss": 0.4981, + "step": 180360 + }, + { + "epoch": 8.958478196086222, + "grad_norm": 0.1953125, + "learning_rate": 8.332571769146717e-05, + "loss": 0.5089, + "step": 180370 + }, + { + "epoch": 8.958974868381842, + "grad_norm": 0.2080078125, + "learning_rate": 8.328598390781762e-05, + "loss": 0.4921, + "step": 180380 + }, + { + "epoch": 8.95947154067746, + "grad_norm": 0.1767578125, + "learning_rate": 8.324625012416808e-05, + "loss": 0.534, + "step": 180390 + }, + { + "epoch": 8.95996821297308, + "grad_norm": 0.2001953125, + "learning_rate": 8.320651634051852e-05, + "loss": 0.5196, + "step": 180400 + }, + { + "epoch": 8.9604648852687, + "grad_norm": 0.177734375, + "learning_rate": 8.316678255686898e-05, + "loss": 0.4891, + "step": 180410 + }, + { + "epoch": 8.96096155756432, + "grad_norm": 0.2236328125, + "learning_rate": 8.312704877321942e-05, + "loss": 0.5117, + "step": 180420 + }, + { + "epoch": 8.961458229859938, + "grad_norm": 0.1875, + "learning_rate": 8.308731498956988e-05, + "loss": 0.4737, + "step": 180430 + }, + { + "epoch": 8.961954902155558, + "grad_norm": 0.1845703125, + "learning_rate": 8.304758120592034e-05, + "loss": 0.4806, + "step": 180440 + }, + { + "epoch": 8.962451574451178, + "grad_norm": 0.1845703125, + "learning_rate": 8.30078474222708e-05, + "loss": 0.5053, + "step": 180450 + }, + { + "epoch": 8.962948246746796, + "grad_norm": 0.19140625, + "learning_rate": 8.296811363862124e-05, + "loss": 0.5095, + "step": 180460 + }, + { + "epoch": 8.963444919042416, + "grad_norm": 0.1708984375, + "learning_rate": 8.29283798549717e-05, + "loss": 0.4786, + "step": 180470 + }, + { + "epoch": 8.963941591338035, + "grad_norm": 0.1865234375, + "learning_rate": 8.288864607132214e-05, + "loss": 0.5075, + "step": 180480 + }, + { + "epoch": 8.964438263633655, + "grad_norm": 0.193359375, + "learning_rate": 8.28489122876726e-05, + "loss": 0.4938, + "step": 180490 + }, + { + "epoch": 8.964934935929273, + "grad_norm": 0.1943359375, + "learning_rate": 8.280917850402306e-05, + "loss": 0.5019, + "step": 180500 + }, + { + "epoch": 8.965431608224893, + "grad_norm": 0.1845703125, + "learning_rate": 8.27694447203735e-05, + "loss": 0.475, + "step": 180510 + }, + { + "epoch": 8.965928280520513, + "grad_norm": 0.19921875, + "learning_rate": 8.272971093672396e-05, + "loss": 0.4328, + "step": 180520 + }, + { + "epoch": 8.966424952816132, + "grad_norm": 0.1875, + "learning_rate": 8.26899771530744e-05, + "loss": 0.5268, + "step": 180530 + }, + { + "epoch": 8.966921625111752, + "grad_norm": 0.185546875, + "learning_rate": 8.265024336942486e-05, + "loss": 0.4776, + "step": 180540 + }, + { + "epoch": 8.96741829740737, + "grad_norm": 0.1826171875, + "learning_rate": 8.261050958577531e-05, + "loss": 0.4855, + "step": 180550 + }, + { + "epoch": 8.96791496970299, + "grad_norm": 0.1826171875, + "learning_rate": 8.257077580212576e-05, + "loss": 0.5095, + "step": 180560 + }, + { + "epoch": 8.968411641998609, + "grad_norm": 0.1943359375, + "learning_rate": 8.253104201847621e-05, + "loss": 0.4727, + "step": 180570 + }, + { + "epoch": 8.968908314294229, + "grad_norm": 0.1865234375, + "learning_rate": 8.249130823482667e-05, + "loss": 0.5032, + "step": 180580 + }, + { + "epoch": 8.969404986589847, + "grad_norm": 0.201171875, + "learning_rate": 8.245157445117711e-05, + "loss": 0.4954, + "step": 180590 + }, + { + "epoch": 8.969901658885467, + "grad_norm": 0.1904296875, + "learning_rate": 8.241184066752757e-05, + "loss": 0.4854, + "step": 180600 + }, + { + "epoch": 8.970398331181087, + "grad_norm": 0.1953125, + "learning_rate": 8.237210688387801e-05, + "loss": 0.495, + "step": 180610 + }, + { + "epoch": 8.970895003476706, + "grad_norm": 0.203125, + "learning_rate": 8.233237310022847e-05, + "loss": 0.4903, + "step": 180620 + }, + { + "epoch": 8.971391675772326, + "grad_norm": 0.1923828125, + "learning_rate": 8.229263931657893e-05, + "loss": 0.4724, + "step": 180630 + }, + { + "epoch": 8.971888348067944, + "grad_norm": 0.19140625, + "learning_rate": 8.225290553292939e-05, + "loss": 0.5086, + "step": 180640 + }, + { + "epoch": 8.972385020363564, + "grad_norm": 0.2333984375, + "learning_rate": 8.221317174927983e-05, + "loss": 0.496, + "step": 180650 + }, + { + "epoch": 8.972881692659183, + "grad_norm": 0.1904296875, + "learning_rate": 8.217343796563029e-05, + "loss": 0.4937, + "step": 180660 + }, + { + "epoch": 8.973378364954803, + "grad_norm": 0.232421875, + "learning_rate": 8.213370418198073e-05, + "loss": 0.4648, + "step": 180670 + }, + { + "epoch": 8.973875037250423, + "grad_norm": 0.2001953125, + "learning_rate": 8.209397039833119e-05, + "loss": 0.4768, + "step": 180680 + }, + { + "epoch": 8.974371709546041, + "grad_norm": 0.1884765625, + "learning_rate": 8.205423661468163e-05, + "loss": 0.4783, + "step": 180690 + }, + { + "epoch": 8.974868381841661, + "grad_norm": 0.2236328125, + "learning_rate": 8.201450283103209e-05, + "loss": 0.4717, + "step": 180700 + }, + { + "epoch": 8.97536505413728, + "grad_norm": 0.244140625, + "learning_rate": 8.197476904738254e-05, + "loss": 0.4951, + "step": 180710 + }, + { + "epoch": 8.9758617264329, + "grad_norm": 0.1669921875, + "learning_rate": 8.1935035263733e-05, + "loss": 0.4896, + "step": 180720 + }, + { + "epoch": 8.976358398728518, + "grad_norm": 0.1904296875, + "learning_rate": 8.189530148008344e-05, + "loss": 0.4844, + "step": 180730 + }, + { + "epoch": 8.976855071024138, + "grad_norm": 0.2138671875, + "learning_rate": 8.18555676964339e-05, + "loss": 0.4644, + "step": 180740 + }, + { + "epoch": 8.977351743319758, + "grad_norm": 0.1943359375, + "learning_rate": 8.181583391278434e-05, + "loss": 0.4718, + "step": 180750 + }, + { + "epoch": 8.977848415615377, + "grad_norm": 0.1826171875, + "learning_rate": 8.17761001291348e-05, + "loss": 0.4921, + "step": 180760 + }, + { + "epoch": 8.978345087910997, + "grad_norm": 0.1669921875, + "learning_rate": 8.173636634548524e-05, + "loss": 0.467, + "step": 180770 + }, + { + "epoch": 8.978841760206615, + "grad_norm": 0.181640625, + "learning_rate": 8.16966325618357e-05, + "loss": 0.5066, + "step": 180780 + }, + { + "epoch": 8.979338432502235, + "grad_norm": 0.18359375, + "learning_rate": 8.165689877818616e-05, + "loss": 0.5071, + "step": 180790 + }, + { + "epoch": 8.979835104797854, + "grad_norm": 0.2041015625, + "learning_rate": 8.161716499453662e-05, + "loss": 0.4895, + "step": 180800 + }, + { + "epoch": 8.980331777093474, + "grad_norm": 0.1826171875, + "learning_rate": 8.157743121088706e-05, + "loss": 0.5055, + "step": 180810 + }, + { + "epoch": 8.980828449389094, + "grad_norm": 0.1865234375, + "learning_rate": 8.153769742723752e-05, + "loss": 0.4669, + "step": 180820 + }, + { + "epoch": 8.981325121684712, + "grad_norm": 0.2236328125, + "learning_rate": 8.149796364358796e-05, + "loss": 0.5243, + "step": 180830 + }, + { + "epoch": 8.981821793980332, + "grad_norm": 0.189453125, + "learning_rate": 8.145822985993842e-05, + "loss": 0.4916, + "step": 180840 + }, + { + "epoch": 8.98231846627595, + "grad_norm": 0.1669921875, + "learning_rate": 8.141849607628886e-05, + "loss": 0.4769, + "step": 180850 + }, + { + "epoch": 8.98281513857157, + "grad_norm": 0.1845703125, + "learning_rate": 8.137876229263932e-05, + "loss": 0.466, + "step": 180860 + }, + { + "epoch": 8.983311810867189, + "grad_norm": 0.22265625, + "learning_rate": 8.133902850898977e-05, + "loss": 0.524, + "step": 180870 + }, + { + "epoch": 8.983808483162809, + "grad_norm": 0.193359375, + "learning_rate": 8.129929472534022e-05, + "loss": 0.5201, + "step": 180880 + }, + { + "epoch": 8.98430515545843, + "grad_norm": 0.18359375, + "learning_rate": 8.125956094169067e-05, + "loss": 0.5019, + "step": 180890 + }, + { + "epoch": 8.984801827754048, + "grad_norm": 0.173828125, + "learning_rate": 8.121982715804113e-05, + "loss": 0.4948, + "step": 180900 + }, + { + "epoch": 8.985298500049668, + "grad_norm": 0.19921875, + "learning_rate": 8.118009337439157e-05, + "loss": 0.5, + "step": 180910 + }, + { + "epoch": 8.985795172345286, + "grad_norm": 0.1865234375, + "learning_rate": 8.114035959074203e-05, + "loss": 0.4749, + "step": 180920 + }, + { + "epoch": 8.986291844640906, + "grad_norm": 0.201171875, + "learning_rate": 8.110062580709249e-05, + "loss": 0.47, + "step": 180930 + }, + { + "epoch": 8.986788516936524, + "grad_norm": 0.197265625, + "learning_rate": 8.106089202344293e-05, + "loss": 0.4902, + "step": 180940 + }, + { + "epoch": 8.987285189232145, + "grad_norm": 0.19140625, + "learning_rate": 8.102115823979339e-05, + "loss": 0.5165, + "step": 180950 + }, + { + "epoch": 8.987781861527765, + "grad_norm": 0.1865234375, + "learning_rate": 8.098142445614383e-05, + "loss": 0.4855, + "step": 180960 + }, + { + "epoch": 8.988278533823383, + "grad_norm": 0.2109375, + "learning_rate": 8.094169067249429e-05, + "loss": 0.5052, + "step": 180970 + }, + { + "epoch": 8.988775206119003, + "grad_norm": 0.2021484375, + "learning_rate": 8.090195688884475e-05, + "loss": 0.4728, + "step": 180980 + }, + { + "epoch": 8.989271878414621, + "grad_norm": 0.1904296875, + "learning_rate": 8.08622231051952e-05, + "loss": 0.483, + "step": 180990 + }, + { + "epoch": 8.989768550710242, + "grad_norm": 0.171875, + "learning_rate": 8.082248932154565e-05, + "loss": 0.4724, + "step": 181000 + }, + { + "epoch": 8.99026522300586, + "grad_norm": 0.1806640625, + "learning_rate": 8.078275553789611e-05, + "loss": 0.4849, + "step": 181010 + }, + { + "epoch": 8.99076189530148, + "grad_norm": 0.228515625, + "learning_rate": 8.074302175424655e-05, + "loss": 0.486, + "step": 181020 + }, + { + "epoch": 8.9912585675971, + "grad_norm": 0.171875, + "learning_rate": 8.070328797059701e-05, + "loss": 0.5072, + "step": 181030 + }, + { + "epoch": 8.991755239892719, + "grad_norm": 0.205078125, + "learning_rate": 8.066355418694745e-05, + "loss": 0.525, + "step": 181040 + }, + { + "epoch": 8.992251912188339, + "grad_norm": 0.1728515625, + "learning_rate": 8.062382040329791e-05, + "loss": 0.4892, + "step": 181050 + }, + { + "epoch": 8.992748584483957, + "grad_norm": 0.177734375, + "learning_rate": 8.058408661964836e-05, + "loss": 0.4923, + "step": 181060 + }, + { + "epoch": 8.993245256779577, + "grad_norm": 0.2060546875, + "learning_rate": 8.054435283599881e-05, + "loss": 0.4851, + "step": 181070 + }, + { + "epoch": 8.993741929075195, + "grad_norm": 0.1708984375, + "learning_rate": 8.050461905234926e-05, + "loss": 0.4845, + "step": 181080 + }, + { + "epoch": 8.994238601370816, + "grad_norm": 0.2021484375, + "learning_rate": 8.046488526869972e-05, + "loss": 0.4927, + "step": 181090 + }, + { + "epoch": 8.994735273666436, + "grad_norm": 0.21484375, + "learning_rate": 8.042515148505016e-05, + "loss": 0.5307, + "step": 181100 + }, + { + "epoch": 8.995231945962054, + "grad_norm": 0.18359375, + "learning_rate": 8.038541770140062e-05, + "loss": 0.4963, + "step": 181110 + }, + { + "epoch": 8.995728618257674, + "grad_norm": 0.1845703125, + "learning_rate": 8.034568391775106e-05, + "loss": 0.5034, + "step": 181120 + }, + { + "epoch": 8.996225290553292, + "grad_norm": 0.185546875, + "learning_rate": 8.030595013410152e-05, + "loss": 0.5045, + "step": 181130 + }, + { + "epoch": 8.996721962848913, + "grad_norm": 0.1640625, + "learning_rate": 8.026621635045198e-05, + "loss": 0.4621, + "step": 181140 + }, + { + "epoch": 8.997218635144531, + "grad_norm": 0.185546875, + "learning_rate": 8.022648256680244e-05, + "loss": 0.5041, + "step": 181150 + }, + { + "epoch": 8.997715307440151, + "grad_norm": 0.193359375, + "learning_rate": 8.018674878315288e-05, + "loss": 0.477, + "step": 181160 + }, + { + "epoch": 8.998211979735771, + "grad_norm": 0.2158203125, + "learning_rate": 8.014701499950334e-05, + "loss": 0.5047, + "step": 181170 + }, + { + "epoch": 8.99870865203139, + "grad_norm": 0.181640625, + "learning_rate": 8.010728121585378e-05, + "loss": 0.5087, + "step": 181180 + }, + { + "epoch": 8.99920532432701, + "grad_norm": 0.1875, + "learning_rate": 8.006754743220424e-05, + "loss": 0.495, + "step": 181190 + }, + { + "epoch": 8.999701996622628, + "grad_norm": 0.197265625, + "learning_rate": 8.002781364855468e-05, + "loss": 0.4698, + "step": 181200 + }, + { + "epoch": 9.000198668918248, + "grad_norm": 0.193359375, + "learning_rate": 7.998807986490514e-05, + "loss": 0.4873, + "step": 181210 + }, + { + "epoch": 9.000695341213866, + "grad_norm": 0.19921875, + "learning_rate": 7.994834608125559e-05, + "loss": 0.4846, + "step": 181220 + }, + { + "epoch": 9.001192013509487, + "grad_norm": 0.189453125, + "learning_rate": 7.990861229760604e-05, + "loss": 0.4822, + "step": 181230 + }, + { + "epoch": 9.001688685805107, + "grad_norm": 0.2177734375, + "learning_rate": 7.986887851395649e-05, + "loss": 0.4589, + "step": 181240 + }, + { + "epoch": 9.002185358100725, + "grad_norm": 0.228515625, + "learning_rate": 7.982914473030695e-05, + "loss": 0.5168, + "step": 181250 + }, + { + "epoch": 9.002682030396345, + "grad_norm": 0.1767578125, + "learning_rate": 7.978941094665739e-05, + "loss": 0.5155, + "step": 181260 + }, + { + "epoch": 9.003178702691963, + "grad_norm": 0.1884765625, + "learning_rate": 7.974967716300785e-05, + "loss": 0.4748, + "step": 181270 + }, + { + "epoch": 9.003675374987584, + "grad_norm": 0.189453125, + "learning_rate": 7.970994337935829e-05, + "loss": 0.4775, + "step": 181280 + }, + { + "epoch": 9.004172047283202, + "grad_norm": 0.236328125, + "learning_rate": 7.967020959570875e-05, + "loss": 0.4732, + "step": 181290 + }, + { + "epoch": 9.004668719578822, + "grad_norm": 0.1923828125, + "learning_rate": 7.963047581205921e-05, + "loss": 0.4981, + "step": 181300 + }, + { + "epoch": 9.005165391874442, + "grad_norm": 0.1865234375, + "learning_rate": 7.959074202840965e-05, + "loss": 0.4724, + "step": 181310 + }, + { + "epoch": 9.00566206417006, + "grad_norm": 0.181640625, + "learning_rate": 7.955100824476011e-05, + "loss": 0.5017, + "step": 181320 + }, + { + "epoch": 9.00615873646568, + "grad_norm": 0.193359375, + "learning_rate": 7.951127446111057e-05, + "loss": 0.4853, + "step": 181330 + }, + { + "epoch": 9.006655408761299, + "grad_norm": 0.2109375, + "learning_rate": 7.947154067746103e-05, + "loss": 0.4691, + "step": 181340 + }, + { + "epoch": 9.007152081056919, + "grad_norm": 0.181640625, + "learning_rate": 7.943180689381147e-05, + "loss": 0.5132, + "step": 181350 + }, + { + "epoch": 9.007648753352537, + "grad_norm": 0.1884765625, + "learning_rate": 7.939207311016193e-05, + "loss": 0.4841, + "step": 181360 + }, + { + "epoch": 9.008145425648157, + "grad_norm": 0.1787109375, + "learning_rate": 7.935233932651237e-05, + "loss": 0.4567, + "step": 181370 + }, + { + "epoch": 9.008642097943778, + "grad_norm": 0.1845703125, + "learning_rate": 7.931260554286283e-05, + "loss": 0.4849, + "step": 181380 + }, + { + "epoch": 9.009138770239396, + "grad_norm": 0.2255859375, + "learning_rate": 7.927287175921327e-05, + "loss": 0.4844, + "step": 181390 + }, + { + "epoch": 9.009635442535016, + "grad_norm": 0.1845703125, + "learning_rate": 7.923313797556373e-05, + "loss": 0.4968, + "step": 181400 + }, + { + "epoch": 9.010132114830634, + "grad_norm": 0.1806640625, + "learning_rate": 7.919340419191418e-05, + "loss": 0.4743, + "step": 181410 + }, + { + "epoch": 9.010628787126254, + "grad_norm": 0.1923828125, + "learning_rate": 7.915367040826463e-05, + "loss": 0.512, + "step": 181420 + }, + { + "epoch": 9.011125459421873, + "grad_norm": 0.1943359375, + "learning_rate": 7.911393662461508e-05, + "loss": 0.4897, + "step": 181430 + }, + { + "epoch": 9.011622131717493, + "grad_norm": 0.18359375, + "learning_rate": 7.907420284096554e-05, + "loss": 0.4648, + "step": 181440 + }, + { + "epoch": 9.012118804013111, + "grad_norm": 0.1767578125, + "learning_rate": 7.903446905731598e-05, + "loss": 0.5017, + "step": 181450 + }, + { + "epoch": 9.012615476308731, + "grad_norm": 0.18359375, + "learning_rate": 7.899473527366644e-05, + "loss": 0.4461, + "step": 181460 + }, + { + "epoch": 9.013112148604352, + "grad_norm": 0.2109375, + "learning_rate": 7.895500149001688e-05, + "loss": 0.5197, + "step": 181470 + }, + { + "epoch": 9.01360882089997, + "grad_norm": 0.19921875, + "learning_rate": 7.891526770636734e-05, + "loss": 0.5218, + "step": 181480 + }, + { + "epoch": 9.01410549319559, + "grad_norm": 0.1767578125, + "learning_rate": 7.88755339227178e-05, + "loss": 0.4734, + "step": 181490 + }, + { + "epoch": 9.014602165491208, + "grad_norm": 0.1943359375, + "learning_rate": 7.883580013906826e-05, + "loss": 0.4906, + "step": 181500 + }, + { + "epoch": 9.015098837786828, + "grad_norm": 0.1923828125, + "learning_rate": 7.87960663554187e-05, + "loss": 0.4702, + "step": 181510 + }, + { + "epoch": 9.015595510082447, + "grad_norm": 0.1796875, + "learning_rate": 7.875633257176916e-05, + "loss": 0.4654, + "step": 181520 + }, + { + "epoch": 9.016092182378067, + "grad_norm": 0.1953125, + "learning_rate": 7.87165987881196e-05, + "loss": 0.4803, + "step": 181530 + }, + { + "epoch": 9.016588854673687, + "grad_norm": 0.1923828125, + "learning_rate": 7.867686500447006e-05, + "loss": 0.4961, + "step": 181540 + }, + { + "epoch": 9.017085526969305, + "grad_norm": 0.1904296875, + "learning_rate": 7.86371312208205e-05, + "loss": 0.4313, + "step": 181550 + }, + { + "epoch": 9.017582199264925, + "grad_norm": 0.1845703125, + "learning_rate": 7.859739743717096e-05, + "loss": 0.4801, + "step": 181560 + }, + { + "epoch": 9.018078871560544, + "grad_norm": 0.189453125, + "learning_rate": 7.85576636535214e-05, + "loss": 0.4918, + "step": 181570 + }, + { + "epoch": 9.018575543856164, + "grad_norm": 0.1884765625, + "learning_rate": 7.851792986987186e-05, + "loss": 0.4472, + "step": 181580 + }, + { + "epoch": 9.019072216151782, + "grad_norm": 0.189453125, + "learning_rate": 7.847819608622231e-05, + "loss": 0.455, + "step": 181590 + }, + { + "epoch": 9.019568888447402, + "grad_norm": 0.173828125, + "learning_rate": 7.843846230257277e-05, + "loss": 0.5086, + "step": 181600 + }, + { + "epoch": 9.020065560743022, + "grad_norm": 0.1806640625, + "learning_rate": 7.839872851892321e-05, + "loss": 0.4726, + "step": 181610 + }, + { + "epoch": 9.02056223303864, + "grad_norm": 0.1845703125, + "learning_rate": 7.835899473527367e-05, + "loss": 0.4827, + "step": 181620 + }, + { + "epoch": 9.021058905334261, + "grad_norm": 0.1875, + "learning_rate": 7.831926095162411e-05, + "loss": 0.5131, + "step": 181630 + }, + { + "epoch": 9.02155557762988, + "grad_norm": 0.2216796875, + "learning_rate": 7.827952716797457e-05, + "loss": 0.4827, + "step": 181640 + }, + { + "epoch": 9.0220522499255, + "grad_norm": 0.1767578125, + "learning_rate": 7.823979338432503e-05, + "loss": 0.4884, + "step": 181650 + }, + { + "epoch": 9.022548922221118, + "grad_norm": 0.1728515625, + "learning_rate": 7.820005960067547e-05, + "loss": 0.482, + "step": 181660 + }, + { + "epoch": 9.023045594516738, + "grad_norm": 0.220703125, + "learning_rate": 7.816032581702593e-05, + "loss": 0.5079, + "step": 181670 + }, + { + "epoch": 9.023542266812358, + "grad_norm": 0.19140625, + "learning_rate": 7.812059203337639e-05, + "loss": 0.4843, + "step": 181680 + }, + { + "epoch": 9.024038939107976, + "grad_norm": 0.181640625, + "learning_rate": 7.808085824972683e-05, + "loss": 0.4598, + "step": 181690 + }, + { + "epoch": 9.024535611403596, + "grad_norm": 0.18359375, + "learning_rate": 7.804112446607729e-05, + "loss": 0.4772, + "step": 181700 + }, + { + "epoch": 9.025032283699215, + "grad_norm": 0.201171875, + "learning_rate": 7.800139068242773e-05, + "loss": 0.5073, + "step": 181710 + }, + { + "epoch": 9.025528955994835, + "grad_norm": 0.2001953125, + "learning_rate": 7.796165689877819e-05, + "loss": 0.4913, + "step": 181720 + }, + { + "epoch": 9.026025628290453, + "grad_norm": 0.1962890625, + "learning_rate": 7.792192311512864e-05, + "loss": 0.4884, + "step": 181730 + }, + { + "epoch": 9.026522300586073, + "grad_norm": 0.1953125, + "learning_rate": 7.78821893314791e-05, + "loss": 0.4637, + "step": 181740 + }, + { + "epoch": 9.027018972881693, + "grad_norm": 0.2109375, + "learning_rate": 7.784245554782954e-05, + "loss": 0.4977, + "step": 181750 + }, + { + "epoch": 9.027515645177312, + "grad_norm": 0.1787109375, + "learning_rate": 7.780272176418e-05, + "loss": 0.479, + "step": 181760 + }, + { + "epoch": 9.028012317472932, + "grad_norm": 0.1982421875, + "learning_rate": 7.776298798053045e-05, + "loss": 0.5042, + "step": 181770 + }, + { + "epoch": 9.02850898976855, + "grad_norm": 0.2021484375, + "learning_rate": 7.77232541968809e-05, + "loss": 0.5161, + "step": 181780 + }, + { + "epoch": 9.02900566206417, + "grad_norm": 0.1748046875, + "learning_rate": 7.768352041323136e-05, + "loss": 0.4433, + "step": 181790 + }, + { + "epoch": 9.029502334359789, + "grad_norm": 0.177734375, + "learning_rate": 7.76437866295818e-05, + "loss": 0.4592, + "step": 181800 + }, + { + "epoch": 9.029999006655409, + "grad_norm": 0.224609375, + "learning_rate": 7.760405284593226e-05, + "loss": 0.4839, + "step": 181810 + }, + { + "epoch": 9.030495678951029, + "grad_norm": 0.2099609375, + "learning_rate": 7.75643190622827e-05, + "loss": 0.5169, + "step": 181820 + }, + { + "epoch": 9.030992351246647, + "grad_norm": 0.1982421875, + "learning_rate": 7.752458527863316e-05, + "loss": 0.5034, + "step": 181830 + }, + { + "epoch": 9.031489023542267, + "grad_norm": 0.255859375, + "learning_rate": 7.748485149498362e-05, + "loss": 0.4805, + "step": 181840 + }, + { + "epoch": 9.031985695837886, + "grad_norm": 0.1943359375, + "learning_rate": 7.744511771133408e-05, + "loss": 0.474, + "step": 181850 + }, + { + "epoch": 9.032482368133506, + "grad_norm": 0.1806640625, + "learning_rate": 7.740538392768452e-05, + "loss": 0.4976, + "step": 181860 + }, + { + "epoch": 9.032979040429124, + "grad_norm": 0.1728515625, + "learning_rate": 7.736565014403498e-05, + "loss": 0.4408, + "step": 181870 + }, + { + "epoch": 9.033475712724744, + "grad_norm": 0.1796875, + "learning_rate": 7.732591636038542e-05, + "loss": 0.5109, + "step": 181880 + }, + { + "epoch": 9.033972385020364, + "grad_norm": 0.2275390625, + "learning_rate": 7.728618257673588e-05, + "loss": 0.4535, + "step": 181890 + }, + { + "epoch": 9.034469057315983, + "grad_norm": 0.185546875, + "learning_rate": 7.724644879308632e-05, + "loss": 0.4903, + "step": 181900 + }, + { + "epoch": 9.034965729611603, + "grad_norm": 0.1826171875, + "learning_rate": 7.720671500943678e-05, + "loss": 0.5, + "step": 181910 + }, + { + "epoch": 9.035462401907221, + "grad_norm": 0.2236328125, + "learning_rate": 7.716698122578723e-05, + "loss": 0.5069, + "step": 181920 + }, + { + "epoch": 9.035959074202841, + "grad_norm": 0.1796875, + "learning_rate": 7.712724744213768e-05, + "loss": 0.4912, + "step": 181930 + }, + { + "epoch": 9.03645574649846, + "grad_norm": 0.228515625, + "learning_rate": 7.708751365848813e-05, + "loss": 0.4778, + "step": 181940 + }, + { + "epoch": 9.03695241879408, + "grad_norm": 0.1962890625, + "learning_rate": 7.704777987483859e-05, + "loss": 0.5301, + "step": 181950 + }, + { + "epoch": 9.0374490910897, + "grad_norm": 0.1796875, + "learning_rate": 7.700804609118903e-05, + "loss": 0.4735, + "step": 181960 + }, + { + "epoch": 9.037945763385318, + "grad_norm": 0.2021484375, + "learning_rate": 7.696831230753949e-05, + "loss": 0.4786, + "step": 181970 + }, + { + "epoch": 9.038442435680938, + "grad_norm": 0.201171875, + "learning_rate": 7.692857852388993e-05, + "loss": 0.4826, + "step": 181980 + }, + { + "epoch": 9.038939107976557, + "grad_norm": 0.1953125, + "learning_rate": 7.688884474024039e-05, + "loss": 0.4855, + "step": 181990 + }, + { + "epoch": 9.039435780272177, + "grad_norm": 0.2236328125, + "learning_rate": 7.684911095659085e-05, + "loss": 0.4959, + "step": 182000 + }, + { + "epoch": 9.039932452567795, + "grad_norm": 0.20703125, + "learning_rate": 7.680937717294129e-05, + "loss": 0.4911, + "step": 182010 + }, + { + "epoch": 9.040429124863415, + "grad_norm": 0.2431640625, + "learning_rate": 7.676964338929175e-05, + "loss": 0.4749, + "step": 182020 + }, + { + "epoch": 9.040925797159035, + "grad_norm": 0.177734375, + "learning_rate": 7.672990960564221e-05, + "loss": 0.5086, + "step": 182030 + }, + { + "epoch": 9.041422469454654, + "grad_norm": 0.1904296875, + "learning_rate": 7.669017582199265e-05, + "loss": 0.4991, + "step": 182040 + }, + { + "epoch": 9.041919141750274, + "grad_norm": 0.19140625, + "learning_rate": 7.665044203834311e-05, + "loss": 0.4976, + "step": 182050 + }, + { + "epoch": 9.042415814045892, + "grad_norm": 0.171875, + "learning_rate": 7.661070825469355e-05, + "loss": 0.4819, + "step": 182060 + }, + { + "epoch": 9.042912486341512, + "grad_norm": 0.2021484375, + "learning_rate": 7.657097447104401e-05, + "loss": 0.4784, + "step": 182070 + }, + { + "epoch": 9.04340915863713, + "grad_norm": 0.1748046875, + "learning_rate": 7.653124068739446e-05, + "loss": 0.5272, + "step": 182080 + }, + { + "epoch": 9.04390583093275, + "grad_norm": 0.20703125, + "learning_rate": 7.649150690374491e-05, + "loss": 0.4936, + "step": 182090 + }, + { + "epoch": 9.04440250322837, + "grad_norm": 0.19140625, + "learning_rate": 7.645177312009536e-05, + "loss": 0.479, + "step": 182100 + }, + { + "epoch": 9.04489917552399, + "grad_norm": 0.2138671875, + "learning_rate": 7.641203933644582e-05, + "loss": 0.4642, + "step": 182110 + }, + { + "epoch": 9.04539584781961, + "grad_norm": 0.255859375, + "learning_rate": 7.637230555279626e-05, + "loss": 0.4864, + "step": 182120 + }, + { + "epoch": 9.045892520115228, + "grad_norm": 0.1904296875, + "learning_rate": 7.633257176914672e-05, + "loss": 0.5005, + "step": 182130 + }, + { + "epoch": 9.046389192410848, + "grad_norm": 0.1806640625, + "learning_rate": 7.629283798549716e-05, + "loss": 0.4765, + "step": 182140 + }, + { + "epoch": 9.046885864706466, + "grad_norm": 0.193359375, + "learning_rate": 7.625310420184762e-05, + "loss": 0.4727, + "step": 182150 + }, + { + "epoch": 9.047382537002086, + "grad_norm": 0.1875, + "learning_rate": 7.621337041819806e-05, + "loss": 0.4515, + "step": 182160 + }, + { + "epoch": 9.047879209297705, + "grad_norm": 0.189453125, + "learning_rate": 7.617363663454852e-05, + "loss": 0.505, + "step": 182170 + }, + { + "epoch": 9.048375881593325, + "grad_norm": 0.1943359375, + "learning_rate": 7.613390285089898e-05, + "loss": 0.4773, + "step": 182180 + }, + { + "epoch": 9.048872553888945, + "grad_norm": 0.1904296875, + "learning_rate": 7.609416906724944e-05, + "loss": 0.4876, + "step": 182190 + }, + { + "epoch": 9.049369226184563, + "grad_norm": 0.2109375, + "learning_rate": 7.60544352835999e-05, + "loss": 0.4807, + "step": 182200 + }, + { + "epoch": 9.049865898480183, + "grad_norm": 0.193359375, + "learning_rate": 7.601470149995034e-05, + "loss": 0.5126, + "step": 182210 + }, + { + "epoch": 9.050362570775802, + "grad_norm": 0.1923828125, + "learning_rate": 7.59749677163008e-05, + "loss": 0.5207, + "step": 182220 + }, + { + "epoch": 9.050859243071422, + "grad_norm": 0.1943359375, + "learning_rate": 7.593523393265124e-05, + "loss": 0.4763, + "step": 182230 + }, + { + "epoch": 9.05135591536704, + "grad_norm": 0.2216796875, + "learning_rate": 7.58955001490017e-05, + "loss": 0.483, + "step": 182240 + }, + { + "epoch": 9.05185258766266, + "grad_norm": 0.1865234375, + "learning_rate": 7.585576636535214e-05, + "loss": 0.4933, + "step": 182250 + }, + { + "epoch": 9.05234925995828, + "grad_norm": 0.19140625, + "learning_rate": 7.58160325817026e-05, + "loss": 0.4884, + "step": 182260 + }, + { + "epoch": 9.052845932253899, + "grad_norm": 0.197265625, + "learning_rate": 7.577629879805305e-05, + "loss": 0.4849, + "step": 182270 + }, + { + "epoch": 9.053342604549519, + "grad_norm": 0.1826171875, + "learning_rate": 7.57365650144035e-05, + "loss": 0.5134, + "step": 182280 + }, + { + "epoch": 9.053839276845137, + "grad_norm": 0.2060546875, + "learning_rate": 7.569683123075395e-05, + "loss": 0.4907, + "step": 182290 + }, + { + "epoch": 9.054335949140757, + "grad_norm": 0.19140625, + "learning_rate": 7.56570974471044e-05, + "loss": 0.4962, + "step": 182300 + }, + { + "epoch": 9.054832621436375, + "grad_norm": 0.181640625, + "learning_rate": 7.561736366345485e-05, + "loss": 0.48, + "step": 182310 + }, + { + "epoch": 9.055329293731996, + "grad_norm": 0.1875, + "learning_rate": 7.557762987980531e-05, + "loss": 0.4899, + "step": 182320 + }, + { + "epoch": 9.055825966027616, + "grad_norm": 0.205078125, + "learning_rate": 7.553789609615575e-05, + "loss": 0.4762, + "step": 182330 + }, + { + "epoch": 9.056322638323234, + "grad_norm": 0.220703125, + "learning_rate": 7.549816231250621e-05, + "loss": 0.4973, + "step": 182340 + }, + { + "epoch": 9.056819310618854, + "grad_norm": 0.1826171875, + "learning_rate": 7.545842852885667e-05, + "loss": 0.4652, + "step": 182350 + }, + { + "epoch": 9.057315982914472, + "grad_norm": 0.1728515625, + "learning_rate": 7.541869474520713e-05, + "loss": 0.4807, + "step": 182360 + }, + { + "epoch": 9.057812655210093, + "grad_norm": 0.1787109375, + "learning_rate": 7.537896096155757e-05, + "loss": 0.4666, + "step": 182370 + }, + { + "epoch": 9.058309327505711, + "grad_norm": 0.2119140625, + "learning_rate": 7.533922717790803e-05, + "loss": 0.4926, + "step": 182380 + }, + { + "epoch": 9.058805999801331, + "grad_norm": 0.1875, + "learning_rate": 7.529949339425847e-05, + "loss": 0.4793, + "step": 182390 + }, + { + "epoch": 9.059302672096951, + "grad_norm": 0.2021484375, + "learning_rate": 7.525975961060893e-05, + "loss": 0.4722, + "step": 182400 + }, + { + "epoch": 9.05979934439257, + "grad_norm": 0.189453125, + "learning_rate": 7.522002582695937e-05, + "loss": 0.4739, + "step": 182410 + }, + { + "epoch": 9.06029601668819, + "grad_norm": 0.177734375, + "learning_rate": 7.518029204330983e-05, + "loss": 0.4611, + "step": 182420 + }, + { + "epoch": 9.060792688983808, + "grad_norm": 0.181640625, + "learning_rate": 7.514055825966028e-05, + "loss": 0.4728, + "step": 182430 + }, + { + "epoch": 9.061289361279428, + "grad_norm": 0.1982421875, + "learning_rate": 7.510082447601073e-05, + "loss": 0.5058, + "step": 182440 + }, + { + "epoch": 9.061786033575046, + "grad_norm": 0.20703125, + "learning_rate": 7.506109069236118e-05, + "loss": 0.5037, + "step": 182450 + }, + { + "epoch": 9.062282705870667, + "grad_norm": 0.18359375, + "learning_rate": 7.502135690871164e-05, + "loss": 0.4864, + "step": 182460 + }, + { + "epoch": 9.062779378166287, + "grad_norm": 0.1875, + "learning_rate": 7.498162312506208e-05, + "loss": 0.5376, + "step": 182470 + }, + { + "epoch": 9.063276050461905, + "grad_norm": 0.1865234375, + "learning_rate": 7.494188934141254e-05, + "loss": 0.4829, + "step": 182480 + }, + { + "epoch": 9.063772722757525, + "grad_norm": 0.1962890625, + "learning_rate": 7.490215555776298e-05, + "loss": 0.4903, + "step": 182490 + }, + { + "epoch": 9.064269395053143, + "grad_norm": 0.16796875, + "learning_rate": 7.486242177411344e-05, + "loss": 0.4897, + "step": 182500 + }, + { + "epoch": 9.064766067348764, + "grad_norm": 0.1884765625, + "learning_rate": 7.482268799046388e-05, + "loss": 0.4932, + "step": 182510 + }, + { + "epoch": 9.065262739644382, + "grad_norm": 0.2138671875, + "learning_rate": 7.478295420681434e-05, + "loss": 0.5258, + "step": 182520 + }, + { + "epoch": 9.065759411940002, + "grad_norm": 0.1875, + "learning_rate": 7.47432204231648e-05, + "loss": 0.5126, + "step": 182530 + }, + { + "epoch": 9.066256084235622, + "grad_norm": 0.20703125, + "learning_rate": 7.470348663951526e-05, + "loss": 0.4997, + "step": 182540 + }, + { + "epoch": 9.06675275653124, + "grad_norm": 0.189453125, + "learning_rate": 7.46637528558657e-05, + "loss": 0.4666, + "step": 182550 + }, + { + "epoch": 9.06724942882686, + "grad_norm": 0.19921875, + "learning_rate": 7.462401907221616e-05, + "loss": 0.4605, + "step": 182560 + }, + { + "epoch": 9.067746101122479, + "grad_norm": 0.23046875, + "learning_rate": 7.45842852885666e-05, + "loss": 0.4722, + "step": 182570 + }, + { + "epoch": 9.068242773418099, + "grad_norm": 0.1845703125, + "learning_rate": 7.454455150491706e-05, + "loss": 0.5031, + "step": 182580 + }, + { + "epoch": 9.068739445713717, + "grad_norm": 0.1865234375, + "learning_rate": 7.45048177212675e-05, + "loss": 0.4937, + "step": 182590 + }, + { + "epoch": 9.069236118009337, + "grad_norm": 0.2080078125, + "learning_rate": 7.446508393761796e-05, + "loss": 0.4644, + "step": 182600 + }, + { + "epoch": 9.069732790304958, + "grad_norm": 0.1962890625, + "learning_rate": 7.442535015396842e-05, + "loss": 0.4606, + "step": 182610 + }, + { + "epoch": 9.070229462600576, + "grad_norm": 0.2158203125, + "learning_rate": 7.438561637031887e-05, + "loss": 0.5453, + "step": 182620 + }, + { + "epoch": 9.070726134896196, + "grad_norm": 0.208984375, + "learning_rate": 7.434588258666932e-05, + "loss": 0.4805, + "step": 182630 + }, + { + "epoch": 9.071222807191814, + "grad_norm": 0.189453125, + "learning_rate": 7.430614880301977e-05, + "loss": 0.4781, + "step": 182640 + }, + { + "epoch": 9.071719479487435, + "grad_norm": 0.1865234375, + "learning_rate": 7.426641501937023e-05, + "loss": 0.4755, + "step": 182650 + }, + { + "epoch": 9.072216151783053, + "grad_norm": 0.1962890625, + "learning_rate": 7.422668123572067e-05, + "loss": 0.487, + "step": 182660 + }, + { + "epoch": 9.072712824078673, + "grad_norm": 0.197265625, + "learning_rate": 7.418694745207113e-05, + "loss": 0.5061, + "step": 182670 + }, + { + "epoch": 9.073209496374293, + "grad_norm": 0.2041015625, + "learning_rate": 7.414721366842157e-05, + "loss": 0.4973, + "step": 182680 + }, + { + "epoch": 9.073706168669911, + "grad_norm": 0.1982421875, + "learning_rate": 7.410747988477203e-05, + "loss": 0.4624, + "step": 182690 + }, + { + "epoch": 9.074202840965532, + "grad_norm": 0.1884765625, + "learning_rate": 7.406774610112249e-05, + "loss": 0.4932, + "step": 182700 + }, + { + "epoch": 9.07469951326115, + "grad_norm": 0.1865234375, + "learning_rate": 7.402801231747295e-05, + "loss": 0.5181, + "step": 182710 + }, + { + "epoch": 9.07519618555677, + "grad_norm": 0.2021484375, + "learning_rate": 7.398827853382339e-05, + "loss": 0.494, + "step": 182720 + }, + { + "epoch": 9.075692857852388, + "grad_norm": 0.185546875, + "learning_rate": 7.394854475017385e-05, + "loss": 0.4742, + "step": 182730 + }, + { + "epoch": 9.076189530148008, + "grad_norm": 0.1826171875, + "learning_rate": 7.390881096652429e-05, + "loss": 0.4949, + "step": 182740 + }, + { + "epoch": 9.076686202443629, + "grad_norm": 0.1826171875, + "learning_rate": 7.386907718287475e-05, + "loss": 0.4609, + "step": 182750 + }, + { + "epoch": 9.077182874739247, + "grad_norm": 0.1806640625, + "learning_rate": 7.38293433992252e-05, + "loss": 0.4786, + "step": 182760 + }, + { + "epoch": 9.077679547034867, + "grad_norm": 0.1953125, + "learning_rate": 7.378960961557565e-05, + "loss": 0.5006, + "step": 182770 + }, + { + "epoch": 9.078176219330485, + "grad_norm": 0.1796875, + "learning_rate": 7.37498758319261e-05, + "loss": 0.478, + "step": 182780 + }, + { + "epoch": 9.078672891626105, + "grad_norm": 0.2138671875, + "learning_rate": 7.371014204827655e-05, + "loss": 0.5064, + "step": 182790 + }, + { + "epoch": 9.079169563921724, + "grad_norm": 0.1904296875, + "learning_rate": 7.3670408264627e-05, + "loss": 0.482, + "step": 182800 + }, + { + "epoch": 9.079666236217344, + "grad_norm": 0.2021484375, + "learning_rate": 7.363067448097746e-05, + "loss": 0.4831, + "step": 182810 + }, + { + "epoch": 9.080162908512964, + "grad_norm": 0.2138671875, + "learning_rate": 7.35909406973279e-05, + "loss": 0.4944, + "step": 182820 + }, + { + "epoch": 9.080659580808582, + "grad_norm": 0.1923828125, + "learning_rate": 7.355120691367836e-05, + "loss": 0.4607, + "step": 182830 + }, + { + "epoch": 9.081156253104202, + "grad_norm": 0.203125, + "learning_rate": 7.35114731300288e-05, + "loss": 0.4474, + "step": 182840 + }, + { + "epoch": 9.08165292539982, + "grad_norm": 0.21484375, + "learning_rate": 7.347173934637926e-05, + "loss": 0.4873, + "step": 182850 + }, + { + "epoch": 9.082149597695441, + "grad_norm": 0.208984375, + "learning_rate": 7.34320055627297e-05, + "loss": 0.4632, + "step": 182860 + }, + { + "epoch": 9.08264626999106, + "grad_norm": 0.18359375, + "learning_rate": 7.339227177908016e-05, + "loss": 0.4593, + "step": 182870 + }, + { + "epoch": 9.08314294228668, + "grad_norm": 0.1865234375, + "learning_rate": 7.335253799543062e-05, + "loss": 0.464, + "step": 182880 + }, + { + "epoch": 9.083639614582298, + "grad_norm": 0.24609375, + "learning_rate": 7.331280421178108e-05, + "loss": 0.4516, + "step": 182890 + }, + { + "epoch": 9.084136286877918, + "grad_norm": 0.1865234375, + "learning_rate": 7.327307042813152e-05, + "loss": 0.4705, + "step": 182900 + }, + { + "epoch": 9.084632959173538, + "grad_norm": 0.2138671875, + "learning_rate": 7.323333664448198e-05, + "loss": 0.5121, + "step": 182910 + }, + { + "epoch": 9.085129631469156, + "grad_norm": 0.2314453125, + "learning_rate": 7.319360286083242e-05, + "loss": 0.4887, + "step": 182920 + }, + { + "epoch": 9.085626303764776, + "grad_norm": 0.2099609375, + "learning_rate": 7.315386907718288e-05, + "loss": 0.4915, + "step": 182930 + }, + { + "epoch": 9.086122976060395, + "grad_norm": 0.1982421875, + "learning_rate": 7.311413529353333e-05, + "loss": 0.4873, + "step": 182940 + }, + { + "epoch": 9.086619648356015, + "grad_norm": 0.185546875, + "learning_rate": 7.307440150988378e-05, + "loss": 0.4949, + "step": 182950 + }, + { + "epoch": 9.087116320651633, + "grad_norm": 0.193359375, + "learning_rate": 7.303466772623423e-05, + "loss": 0.4797, + "step": 182960 + }, + { + "epoch": 9.087612992947253, + "grad_norm": 0.203125, + "learning_rate": 7.299493394258469e-05, + "loss": 0.4649, + "step": 182970 + }, + { + "epoch": 9.088109665242873, + "grad_norm": 0.185546875, + "learning_rate": 7.295520015893513e-05, + "loss": 0.4682, + "step": 182980 + }, + { + "epoch": 9.088606337538492, + "grad_norm": 0.197265625, + "learning_rate": 7.291546637528559e-05, + "loss": 0.5045, + "step": 182990 + }, + { + "epoch": 9.089103009834112, + "grad_norm": 0.181640625, + "learning_rate": 7.287573259163603e-05, + "loss": 0.4728, + "step": 183000 + }, + { + "epoch": 9.08959968212973, + "grad_norm": 0.2255859375, + "learning_rate": 7.283599880798649e-05, + "loss": 0.4804, + "step": 183010 + }, + { + "epoch": 9.09009635442535, + "grad_norm": 0.189453125, + "learning_rate": 7.279626502433695e-05, + "loss": 0.4665, + "step": 183020 + }, + { + "epoch": 9.090593026720969, + "grad_norm": 0.19140625, + "learning_rate": 7.275653124068739e-05, + "loss": 0.4453, + "step": 183030 + }, + { + "epoch": 9.091089699016589, + "grad_norm": 0.18359375, + "learning_rate": 7.271679745703785e-05, + "loss": 0.5243, + "step": 183040 + }, + { + "epoch": 9.091586371312209, + "grad_norm": 0.1943359375, + "learning_rate": 7.267706367338831e-05, + "loss": 0.5243, + "step": 183050 + }, + { + "epoch": 9.092083043607827, + "grad_norm": 0.1875, + "learning_rate": 7.263732988973877e-05, + "loss": 0.4825, + "step": 183060 + }, + { + "epoch": 9.092579715903447, + "grad_norm": 0.203125, + "learning_rate": 7.259759610608921e-05, + "loss": 0.4764, + "step": 183070 + }, + { + "epoch": 9.093076388199066, + "grad_norm": 0.201171875, + "learning_rate": 7.255786232243967e-05, + "loss": 0.4724, + "step": 183080 + }, + { + "epoch": 9.093573060494686, + "grad_norm": 0.1875, + "learning_rate": 7.251812853879011e-05, + "loss": 0.4651, + "step": 183090 + }, + { + "epoch": 9.094069732790304, + "grad_norm": 0.1796875, + "learning_rate": 7.247839475514057e-05, + "loss": 0.5167, + "step": 183100 + }, + { + "epoch": 9.094566405085924, + "grad_norm": 0.1904296875, + "learning_rate": 7.243866097149101e-05, + "loss": 0.4953, + "step": 183110 + }, + { + "epoch": 9.095063077381544, + "grad_norm": 0.185546875, + "learning_rate": 7.239892718784147e-05, + "loss": 0.5026, + "step": 183120 + }, + { + "epoch": 9.095559749677163, + "grad_norm": 0.2138671875, + "learning_rate": 7.235919340419192e-05, + "loss": 0.4616, + "step": 183130 + }, + { + "epoch": 9.096056421972783, + "grad_norm": 0.1845703125, + "learning_rate": 7.231945962054237e-05, + "loss": 0.4756, + "step": 183140 + }, + { + "epoch": 9.096553094268401, + "grad_norm": 0.2177734375, + "learning_rate": 7.227972583689282e-05, + "loss": 0.503, + "step": 183150 + }, + { + "epoch": 9.097049766564021, + "grad_norm": 0.193359375, + "learning_rate": 7.223999205324328e-05, + "loss": 0.5279, + "step": 183160 + }, + { + "epoch": 9.09754643885964, + "grad_norm": 0.177734375, + "learning_rate": 7.220025826959372e-05, + "loss": 0.4922, + "step": 183170 + }, + { + "epoch": 9.09804311115526, + "grad_norm": 0.2158203125, + "learning_rate": 7.216052448594418e-05, + "loss": 0.4952, + "step": 183180 + }, + { + "epoch": 9.09853978345088, + "grad_norm": 0.23828125, + "learning_rate": 7.212079070229462e-05, + "loss": 0.5045, + "step": 183190 + }, + { + "epoch": 9.099036455746498, + "grad_norm": 0.25, + "learning_rate": 7.208105691864508e-05, + "loss": 0.5088, + "step": 183200 + }, + { + "epoch": 9.099533128042118, + "grad_norm": 0.2109375, + "learning_rate": 7.204132313499554e-05, + "loss": 0.4668, + "step": 183210 + }, + { + "epoch": 9.100029800337737, + "grad_norm": 0.1767578125, + "learning_rate": 7.200158935134598e-05, + "loss": 0.4921, + "step": 183220 + }, + { + "epoch": 9.100526472633357, + "grad_norm": 0.19140625, + "learning_rate": 7.196185556769644e-05, + "loss": 0.4922, + "step": 183230 + }, + { + "epoch": 9.101023144928975, + "grad_norm": 0.181640625, + "learning_rate": 7.19221217840469e-05, + "loss": 0.4779, + "step": 183240 + }, + { + "epoch": 9.101519817224595, + "grad_norm": 0.2001953125, + "learning_rate": 7.188238800039734e-05, + "loss": 0.477, + "step": 183250 + }, + { + "epoch": 9.102016489520215, + "grad_norm": 0.185546875, + "learning_rate": 7.18426542167478e-05, + "loss": 0.5114, + "step": 183260 + }, + { + "epoch": 9.102513161815834, + "grad_norm": 0.189453125, + "learning_rate": 7.180292043309824e-05, + "loss": 0.4597, + "step": 183270 + }, + { + "epoch": 9.103009834111454, + "grad_norm": 0.2001953125, + "learning_rate": 7.17631866494487e-05, + "loss": 0.5096, + "step": 183280 + }, + { + "epoch": 9.103506506407072, + "grad_norm": 0.1767578125, + "learning_rate": 7.172345286579915e-05, + "loss": 0.5081, + "step": 183290 + }, + { + "epoch": 9.104003178702692, + "grad_norm": 0.181640625, + "learning_rate": 7.16837190821496e-05, + "loss": 0.474, + "step": 183300 + }, + { + "epoch": 9.10449985099831, + "grad_norm": 0.193359375, + "learning_rate": 7.164398529850005e-05, + "loss": 0.467, + "step": 183310 + }, + { + "epoch": 9.10499652329393, + "grad_norm": 0.1845703125, + "learning_rate": 7.16042515148505e-05, + "loss": 0.4756, + "step": 183320 + }, + { + "epoch": 9.10549319558955, + "grad_norm": 0.1787109375, + "learning_rate": 7.156451773120095e-05, + "loss": 0.4683, + "step": 183330 + }, + { + "epoch": 9.10598986788517, + "grad_norm": 0.205078125, + "learning_rate": 7.152478394755141e-05, + "loss": 0.4684, + "step": 183340 + }, + { + "epoch": 9.10648654018079, + "grad_norm": 0.232421875, + "learning_rate": 7.148505016390185e-05, + "loss": 0.5029, + "step": 183350 + }, + { + "epoch": 9.106983212476408, + "grad_norm": 0.193359375, + "learning_rate": 7.144531638025231e-05, + "loss": 0.5172, + "step": 183360 + }, + { + "epoch": 9.107479884772028, + "grad_norm": 0.1806640625, + "learning_rate": 7.140558259660275e-05, + "loss": 0.496, + "step": 183370 + }, + { + "epoch": 9.107976557067646, + "grad_norm": 0.19140625, + "learning_rate": 7.136584881295321e-05, + "loss": 0.4647, + "step": 183380 + }, + { + "epoch": 9.108473229363266, + "grad_norm": 0.212890625, + "learning_rate": 7.132611502930367e-05, + "loss": 0.4909, + "step": 183390 + }, + { + "epoch": 9.108969901658886, + "grad_norm": 0.212890625, + "learning_rate": 7.128638124565413e-05, + "loss": 0.5088, + "step": 183400 + }, + { + "epoch": 9.109466573954505, + "grad_norm": 0.1943359375, + "learning_rate": 7.124664746200457e-05, + "loss": 0.4997, + "step": 183410 + }, + { + "epoch": 9.109963246250125, + "grad_norm": 0.1923828125, + "learning_rate": 7.120691367835503e-05, + "loss": 0.4552, + "step": 183420 + }, + { + "epoch": 9.110459918545743, + "grad_norm": 0.1787109375, + "learning_rate": 7.116717989470547e-05, + "loss": 0.4588, + "step": 183430 + }, + { + "epoch": 9.110956590841363, + "grad_norm": 0.177734375, + "learning_rate": 7.112744611105593e-05, + "loss": 0.4673, + "step": 183440 + }, + { + "epoch": 9.111453263136982, + "grad_norm": 0.244140625, + "learning_rate": 7.108771232740639e-05, + "loss": 0.498, + "step": 183450 + }, + { + "epoch": 9.111949935432602, + "grad_norm": 0.193359375, + "learning_rate": 7.104797854375683e-05, + "loss": 0.4668, + "step": 183460 + }, + { + "epoch": 9.112446607728222, + "grad_norm": 0.21875, + "learning_rate": 7.100824476010729e-05, + "loss": 0.4676, + "step": 183470 + }, + { + "epoch": 9.11294328002384, + "grad_norm": 0.2080078125, + "learning_rate": 7.096851097645774e-05, + "loss": 0.5052, + "step": 183480 + }, + { + "epoch": 9.11343995231946, + "grad_norm": 0.1923828125, + "learning_rate": 7.09287771928082e-05, + "loss": 0.4709, + "step": 183490 + }, + { + "epoch": 9.113936624615079, + "grad_norm": 0.18359375, + "learning_rate": 7.088904340915864e-05, + "loss": 0.4904, + "step": 183500 + }, + { + "epoch": 9.114433296910699, + "grad_norm": 0.19140625, + "learning_rate": 7.08493096255091e-05, + "loss": 0.4782, + "step": 183510 + }, + { + "epoch": 9.114929969206317, + "grad_norm": 0.18359375, + "learning_rate": 7.080957584185954e-05, + "loss": 0.4843, + "step": 183520 + }, + { + "epoch": 9.115426641501937, + "grad_norm": 0.1826171875, + "learning_rate": 7.076984205821e-05, + "loss": 0.4844, + "step": 183530 + }, + { + "epoch": 9.115923313797555, + "grad_norm": 0.18359375, + "learning_rate": 7.073010827456044e-05, + "loss": 0.4814, + "step": 183540 + }, + { + "epoch": 9.116419986093176, + "grad_norm": 0.2353515625, + "learning_rate": 7.06903744909109e-05, + "loss": 0.5254, + "step": 183550 + }, + { + "epoch": 9.116916658388796, + "grad_norm": 0.1923828125, + "learning_rate": 7.065064070726136e-05, + "loss": 0.495, + "step": 183560 + }, + { + "epoch": 9.117413330684414, + "grad_norm": 0.19140625, + "learning_rate": 7.06109069236118e-05, + "loss": 0.4925, + "step": 183570 + }, + { + "epoch": 9.117910002980034, + "grad_norm": 0.1875, + "learning_rate": 7.057117313996226e-05, + "loss": 0.5212, + "step": 183580 + }, + { + "epoch": 9.118406675275653, + "grad_norm": 0.18359375, + "learning_rate": 7.053143935631272e-05, + "loss": 0.4711, + "step": 183590 + }, + { + "epoch": 9.118903347571273, + "grad_norm": 0.201171875, + "learning_rate": 7.049170557266316e-05, + "loss": 0.4479, + "step": 183600 + }, + { + "epoch": 9.119400019866891, + "grad_norm": 0.19140625, + "learning_rate": 7.045197178901362e-05, + "loss": 0.4634, + "step": 183610 + }, + { + "epoch": 9.119896692162511, + "grad_norm": 0.2080078125, + "learning_rate": 7.041223800536406e-05, + "loss": 0.5072, + "step": 183620 + }, + { + "epoch": 9.120393364458131, + "grad_norm": 0.19140625, + "learning_rate": 7.037250422171452e-05, + "loss": 0.4743, + "step": 183630 + }, + { + "epoch": 9.12089003675375, + "grad_norm": 0.20703125, + "learning_rate": 7.033277043806497e-05, + "loss": 0.4725, + "step": 183640 + }, + { + "epoch": 9.12138670904937, + "grad_norm": 0.2001953125, + "learning_rate": 7.029303665441542e-05, + "loss": 0.483, + "step": 183650 + }, + { + "epoch": 9.121883381344988, + "grad_norm": 0.1640625, + "learning_rate": 7.025330287076587e-05, + "loss": 0.4595, + "step": 183660 + }, + { + "epoch": 9.122380053640608, + "grad_norm": 0.205078125, + "learning_rate": 7.021356908711633e-05, + "loss": 0.4898, + "step": 183670 + }, + { + "epoch": 9.122876725936226, + "grad_norm": 0.171875, + "learning_rate": 7.017383530346677e-05, + "loss": 0.4713, + "step": 183680 + }, + { + "epoch": 9.123373398231847, + "grad_norm": 0.18359375, + "learning_rate": 7.013410151981723e-05, + "loss": 0.4793, + "step": 183690 + }, + { + "epoch": 9.123870070527467, + "grad_norm": 0.205078125, + "learning_rate": 7.009436773616767e-05, + "loss": 0.4977, + "step": 183700 + }, + { + "epoch": 9.124366742823085, + "grad_norm": 0.1865234375, + "learning_rate": 7.005463395251813e-05, + "loss": 0.4537, + "step": 183710 + }, + { + "epoch": 9.124863415118705, + "grad_norm": 0.197265625, + "learning_rate": 7.001490016886857e-05, + "loss": 0.5065, + "step": 183720 + }, + { + "epoch": 9.125360087414323, + "grad_norm": 0.1982421875, + "learning_rate": 6.997516638521903e-05, + "loss": 0.5102, + "step": 183730 + }, + { + "epoch": 9.125856759709944, + "grad_norm": 0.2392578125, + "learning_rate": 6.993543260156949e-05, + "loss": 0.5043, + "step": 183740 + }, + { + "epoch": 9.126353432005562, + "grad_norm": 0.2021484375, + "learning_rate": 6.989569881791995e-05, + "loss": 0.5042, + "step": 183750 + }, + { + "epoch": 9.126850104301182, + "grad_norm": 0.1728515625, + "learning_rate": 6.985596503427039e-05, + "loss": 0.5023, + "step": 183760 + }, + { + "epoch": 9.127346776596802, + "grad_norm": 0.193359375, + "learning_rate": 6.981623125062085e-05, + "loss": 0.4962, + "step": 183770 + }, + { + "epoch": 9.12784344889242, + "grad_norm": 0.1953125, + "learning_rate": 6.97764974669713e-05, + "loss": 0.4831, + "step": 183780 + }, + { + "epoch": 9.12834012118804, + "grad_norm": 0.2236328125, + "learning_rate": 6.973676368332175e-05, + "loss": 0.5234, + "step": 183790 + }, + { + "epoch": 9.128836793483659, + "grad_norm": 0.2021484375, + "learning_rate": 6.96970298996722e-05, + "loss": 0.4678, + "step": 183800 + }, + { + "epoch": 9.129333465779279, + "grad_norm": 0.2060546875, + "learning_rate": 6.965729611602265e-05, + "loss": 0.4922, + "step": 183810 + }, + { + "epoch": 9.129830138074897, + "grad_norm": 0.18359375, + "learning_rate": 6.96175623323731e-05, + "loss": 0.4929, + "step": 183820 + }, + { + "epoch": 9.130326810370518, + "grad_norm": 0.189453125, + "learning_rate": 6.957782854872356e-05, + "loss": 0.4406, + "step": 183830 + }, + { + "epoch": 9.130823482666138, + "grad_norm": 0.201171875, + "learning_rate": 6.9538094765074e-05, + "loss": 0.4779, + "step": 183840 + }, + { + "epoch": 9.131320154961756, + "grad_norm": 0.197265625, + "learning_rate": 6.949836098142446e-05, + "loss": 0.4758, + "step": 183850 + }, + { + "epoch": 9.131816827257376, + "grad_norm": 0.2109375, + "learning_rate": 6.94586271977749e-05, + "loss": 0.4949, + "step": 183860 + }, + { + "epoch": 9.132313499552994, + "grad_norm": 0.2177734375, + "learning_rate": 6.941889341412536e-05, + "loss": 0.4358, + "step": 183870 + }, + { + "epoch": 9.132810171848615, + "grad_norm": 0.1845703125, + "learning_rate": 6.937915963047582e-05, + "loss": 0.4765, + "step": 183880 + }, + { + "epoch": 9.133306844144233, + "grad_norm": 0.2109375, + "learning_rate": 6.933942584682626e-05, + "loss": 0.5001, + "step": 183890 + }, + { + "epoch": 9.133803516439853, + "grad_norm": 0.1845703125, + "learning_rate": 6.929969206317672e-05, + "loss": 0.5032, + "step": 183900 + }, + { + "epoch": 9.134300188735473, + "grad_norm": 0.181640625, + "learning_rate": 6.925995827952718e-05, + "loss": 0.4726, + "step": 183910 + }, + { + "epoch": 9.134796861031091, + "grad_norm": 0.2158203125, + "learning_rate": 6.922022449587762e-05, + "loss": 0.4912, + "step": 183920 + }, + { + "epoch": 9.135293533326712, + "grad_norm": 0.220703125, + "learning_rate": 6.918049071222808e-05, + "loss": 0.4891, + "step": 183930 + }, + { + "epoch": 9.13579020562233, + "grad_norm": 0.224609375, + "learning_rate": 6.914075692857854e-05, + "loss": 0.4766, + "step": 183940 + }, + { + "epoch": 9.13628687791795, + "grad_norm": 0.21875, + "learning_rate": 6.910102314492898e-05, + "loss": 0.4845, + "step": 183950 + }, + { + "epoch": 9.136783550213568, + "grad_norm": 0.2275390625, + "learning_rate": 6.906128936127944e-05, + "loss": 0.4646, + "step": 183960 + }, + { + "epoch": 9.137280222509188, + "grad_norm": 0.208984375, + "learning_rate": 6.902155557762988e-05, + "loss": 0.4531, + "step": 183970 + }, + { + "epoch": 9.137776894804809, + "grad_norm": 0.19921875, + "learning_rate": 6.898182179398034e-05, + "loss": 0.4454, + "step": 183980 + }, + { + "epoch": 9.138273567100427, + "grad_norm": 0.2021484375, + "learning_rate": 6.894208801033079e-05, + "loss": 0.5006, + "step": 183990 + }, + { + "epoch": 9.138770239396047, + "grad_norm": 0.236328125, + "learning_rate": 6.890235422668124e-05, + "loss": 0.4959, + "step": 184000 + }, + { + "epoch": 9.139266911691665, + "grad_norm": 0.220703125, + "learning_rate": 6.886262044303169e-05, + "loss": 0.4727, + "step": 184010 + }, + { + "epoch": 9.139763583987286, + "grad_norm": 0.1865234375, + "learning_rate": 6.882288665938215e-05, + "loss": 0.4789, + "step": 184020 + }, + { + "epoch": 9.140260256282904, + "grad_norm": 0.23046875, + "learning_rate": 6.878315287573259e-05, + "loss": 0.5327, + "step": 184030 + }, + { + "epoch": 9.140756928578524, + "grad_norm": 0.1865234375, + "learning_rate": 6.874341909208305e-05, + "loss": 0.5216, + "step": 184040 + }, + { + "epoch": 9.141253600874144, + "grad_norm": 0.189453125, + "learning_rate": 6.870368530843349e-05, + "loss": 0.4922, + "step": 184050 + }, + { + "epoch": 9.141750273169762, + "grad_norm": 0.1943359375, + "learning_rate": 6.866395152478395e-05, + "loss": 0.4965, + "step": 184060 + }, + { + "epoch": 9.142246945465383, + "grad_norm": 0.2158203125, + "learning_rate": 6.86242177411344e-05, + "loss": 0.495, + "step": 184070 + }, + { + "epoch": 9.142743617761, + "grad_norm": 0.19921875, + "learning_rate": 6.858448395748485e-05, + "loss": 0.4731, + "step": 184080 + }, + { + "epoch": 9.143240290056621, + "grad_norm": 0.1845703125, + "learning_rate": 6.854475017383531e-05, + "loss": 0.4728, + "step": 184090 + }, + { + "epoch": 9.14373696235224, + "grad_norm": 0.1845703125, + "learning_rate": 6.850501639018577e-05, + "loss": 0.4996, + "step": 184100 + }, + { + "epoch": 9.14423363464786, + "grad_norm": 0.193359375, + "learning_rate": 6.846528260653621e-05, + "loss": 0.4944, + "step": 184110 + }, + { + "epoch": 9.14473030694348, + "grad_norm": 0.216796875, + "learning_rate": 6.842554882288667e-05, + "loss": 0.5061, + "step": 184120 + }, + { + "epoch": 9.145226979239098, + "grad_norm": 0.2060546875, + "learning_rate": 6.838581503923711e-05, + "loss": 0.482, + "step": 184130 + }, + { + "epoch": 9.145723651534718, + "grad_norm": 0.203125, + "learning_rate": 6.834608125558757e-05, + "loss": 0.4966, + "step": 184140 + }, + { + "epoch": 9.146220323830336, + "grad_norm": 0.2060546875, + "learning_rate": 6.830634747193802e-05, + "loss": 0.4577, + "step": 184150 + }, + { + "epoch": 9.146716996125956, + "grad_norm": 0.193359375, + "learning_rate": 6.826661368828847e-05, + "loss": 0.468, + "step": 184160 + }, + { + "epoch": 9.147213668421575, + "grad_norm": 0.1865234375, + "learning_rate": 6.822687990463892e-05, + "loss": 0.4917, + "step": 184170 + }, + { + "epoch": 9.147710340717195, + "grad_norm": 0.201171875, + "learning_rate": 6.818714612098938e-05, + "loss": 0.446, + "step": 184180 + }, + { + "epoch": 9.148207013012815, + "grad_norm": 0.2001953125, + "learning_rate": 6.814741233733982e-05, + "loss": 0.4935, + "step": 184190 + }, + { + "epoch": 9.148703685308433, + "grad_norm": 0.2177734375, + "learning_rate": 6.810767855369028e-05, + "loss": 0.4845, + "step": 184200 + }, + { + "epoch": 9.149200357604053, + "grad_norm": 0.234375, + "learning_rate": 6.806794477004072e-05, + "loss": 0.512, + "step": 184210 + }, + { + "epoch": 9.149697029899672, + "grad_norm": 0.1796875, + "learning_rate": 6.802821098639118e-05, + "loss": 0.4937, + "step": 184220 + }, + { + "epoch": 9.150193702195292, + "grad_norm": 0.19140625, + "learning_rate": 6.798847720274162e-05, + "loss": 0.4869, + "step": 184230 + }, + { + "epoch": 9.15069037449091, + "grad_norm": 0.1904296875, + "learning_rate": 6.794874341909208e-05, + "loss": 0.5046, + "step": 184240 + }, + { + "epoch": 9.15118704678653, + "grad_norm": 0.1875, + "learning_rate": 6.790900963544254e-05, + "loss": 0.4638, + "step": 184250 + }, + { + "epoch": 9.151683719082149, + "grad_norm": 0.208984375, + "learning_rate": 6.7869275851793e-05, + "loss": 0.4879, + "step": 184260 + }, + { + "epoch": 9.152180391377769, + "grad_norm": 0.1904296875, + "learning_rate": 6.782954206814344e-05, + "loss": 0.4779, + "step": 184270 + }, + { + "epoch": 9.152677063673389, + "grad_norm": 0.1953125, + "learning_rate": 6.77898082844939e-05, + "loss": 0.4877, + "step": 184280 + }, + { + "epoch": 9.153173735969007, + "grad_norm": 0.23828125, + "learning_rate": 6.775007450084436e-05, + "loss": 0.4687, + "step": 184290 + }, + { + "epoch": 9.153670408264627, + "grad_norm": 0.1904296875, + "learning_rate": 6.77103407171948e-05, + "loss": 0.474, + "step": 184300 + }, + { + "epoch": 9.154167080560246, + "grad_norm": 0.1982421875, + "learning_rate": 6.767060693354526e-05, + "loss": 0.4974, + "step": 184310 + }, + { + "epoch": 9.154663752855866, + "grad_norm": 0.2041015625, + "learning_rate": 6.76308731498957e-05, + "loss": 0.4704, + "step": 184320 + }, + { + "epoch": 9.155160425151484, + "grad_norm": 0.1875, + "learning_rate": 6.759113936624616e-05, + "loss": 0.494, + "step": 184330 + }, + { + "epoch": 9.155657097447104, + "grad_norm": 0.2060546875, + "learning_rate": 6.75514055825966e-05, + "loss": 0.5045, + "step": 184340 + }, + { + "epoch": 9.156153769742724, + "grad_norm": 0.181640625, + "learning_rate": 6.751167179894706e-05, + "loss": 0.462, + "step": 184350 + }, + { + "epoch": 9.156650442038343, + "grad_norm": 0.2177734375, + "learning_rate": 6.747193801529751e-05, + "loss": 0.4745, + "step": 184360 + }, + { + "epoch": 9.157147114333963, + "grad_norm": 0.2080078125, + "learning_rate": 6.743220423164797e-05, + "loss": 0.5212, + "step": 184370 + }, + { + "epoch": 9.157643786629581, + "grad_norm": 0.1806640625, + "learning_rate": 6.739247044799841e-05, + "loss": 0.457, + "step": 184380 + }, + { + "epoch": 9.158140458925201, + "grad_norm": 0.20703125, + "learning_rate": 6.735273666434887e-05, + "loss": 0.4669, + "step": 184390 + }, + { + "epoch": 9.15863713122082, + "grad_norm": 0.1826171875, + "learning_rate": 6.731300288069931e-05, + "loss": 0.4686, + "step": 184400 + }, + { + "epoch": 9.15913380351644, + "grad_norm": 0.21484375, + "learning_rate": 6.727326909704977e-05, + "loss": 0.4725, + "step": 184410 + }, + { + "epoch": 9.15963047581206, + "grad_norm": 0.1982421875, + "learning_rate": 6.723353531340021e-05, + "loss": 0.4822, + "step": 184420 + }, + { + "epoch": 9.160127148107678, + "grad_norm": 0.1767578125, + "learning_rate": 6.719380152975067e-05, + "loss": 0.4776, + "step": 184430 + }, + { + "epoch": 9.160623820403298, + "grad_norm": 0.1884765625, + "learning_rate": 6.715406774610113e-05, + "loss": 0.4736, + "step": 184440 + }, + { + "epoch": 9.161120492698917, + "grad_norm": 0.2255859375, + "learning_rate": 6.711433396245159e-05, + "loss": 0.4684, + "step": 184450 + }, + { + "epoch": 9.161617164994537, + "grad_norm": 0.173828125, + "learning_rate": 6.707460017880203e-05, + "loss": 0.4736, + "step": 184460 + }, + { + "epoch": 9.162113837290155, + "grad_norm": 0.224609375, + "learning_rate": 6.703486639515249e-05, + "loss": 0.4846, + "step": 184470 + }, + { + "epoch": 9.162610509585775, + "grad_norm": 0.189453125, + "learning_rate": 6.699513261150293e-05, + "loss": 0.4632, + "step": 184480 + }, + { + "epoch": 9.163107181881395, + "grad_norm": 0.251953125, + "learning_rate": 6.695539882785339e-05, + "loss": 0.4721, + "step": 184490 + }, + { + "epoch": 9.163603854177014, + "grad_norm": 0.1923828125, + "learning_rate": 6.691566504420384e-05, + "loss": 0.5025, + "step": 184500 + }, + { + "epoch": 9.164100526472634, + "grad_norm": 0.181640625, + "learning_rate": 6.687593126055429e-05, + "loss": 0.4663, + "step": 184510 + }, + { + "epoch": 9.164597198768252, + "grad_norm": 0.26171875, + "learning_rate": 6.683619747690474e-05, + "loss": 0.4928, + "step": 184520 + }, + { + "epoch": 9.165093871063872, + "grad_norm": 0.2060546875, + "learning_rate": 6.67964636932552e-05, + "loss": 0.4811, + "step": 184530 + }, + { + "epoch": 9.16559054335949, + "grad_norm": 0.2080078125, + "learning_rate": 6.675672990960564e-05, + "loss": 0.4769, + "step": 184540 + }, + { + "epoch": 9.16608721565511, + "grad_norm": 0.1875, + "learning_rate": 6.67169961259561e-05, + "loss": 0.4968, + "step": 184550 + }, + { + "epoch": 9.16658388795073, + "grad_norm": 0.1884765625, + "learning_rate": 6.667726234230654e-05, + "loss": 0.4564, + "step": 184560 + }, + { + "epoch": 9.16708056024635, + "grad_norm": 0.1923828125, + "learning_rate": 6.6637528558657e-05, + "loss": 0.4507, + "step": 184570 + }, + { + "epoch": 9.16757723254197, + "grad_norm": 0.1943359375, + "learning_rate": 6.659779477500744e-05, + "loss": 0.5085, + "step": 184580 + }, + { + "epoch": 9.168073904837588, + "grad_norm": 0.20703125, + "learning_rate": 6.65580609913579e-05, + "loss": 0.4818, + "step": 184590 + }, + { + "epoch": 9.168570577133208, + "grad_norm": 0.1796875, + "learning_rate": 6.651832720770836e-05, + "loss": 0.4917, + "step": 184600 + }, + { + "epoch": 9.169067249428826, + "grad_norm": 0.1923828125, + "learning_rate": 6.647859342405882e-05, + "loss": 0.4645, + "step": 184610 + }, + { + "epoch": 9.169563921724446, + "grad_norm": 0.2138671875, + "learning_rate": 6.643885964040926e-05, + "loss": 0.4783, + "step": 184620 + }, + { + "epoch": 9.170060594020066, + "grad_norm": 0.203125, + "learning_rate": 6.639912585675972e-05, + "loss": 0.4996, + "step": 184630 + }, + { + "epoch": 9.170557266315685, + "grad_norm": 0.1923828125, + "learning_rate": 6.635939207311016e-05, + "loss": 0.5024, + "step": 184640 + }, + { + "epoch": 9.171053938611305, + "grad_norm": 0.1826171875, + "learning_rate": 6.631965828946062e-05, + "loss": 0.4781, + "step": 184650 + }, + { + "epoch": 9.171550610906923, + "grad_norm": 0.2021484375, + "learning_rate": 6.627992450581107e-05, + "loss": 0.4547, + "step": 184660 + }, + { + "epoch": 9.172047283202543, + "grad_norm": 0.181640625, + "learning_rate": 6.624019072216152e-05, + "loss": 0.4807, + "step": 184670 + }, + { + "epoch": 9.172543955498162, + "grad_norm": 0.2060546875, + "learning_rate": 6.620045693851197e-05, + "loss": 0.504, + "step": 184680 + }, + { + "epoch": 9.173040627793782, + "grad_norm": 0.2265625, + "learning_rate": 6.616072315486243e-05, + "loss": 0.4856, + "step": 184690 + }, + { + "epoch": 9.173537300089402, + "grad_norm": 0.1865234375, + "learning_rate": 6.612098937121287e-05, + "loss": 0.441, + "step": 184700 + }, + { + "epoch": 9.17403397238502, + "grad_norm": 0.2060546875, + "learning_rate": 6.608125558756333e-05, + "loss": 0.4968, + "step": 184710 + }, + { + "epoch": 9.17453064468064, + "grad_norm": 0.2119140625, + "learning_rate": 6.604152180391378e-05, + "loss": 0.5115, + "step": 184720 + }, + { + "epoch": 9.175027316976259, + "grad_norm": 0.1796875, + "learning_rate": 6.600178802026423e-05, + "loss": 0.4634, + "step": 184730 + }, + { + "epoch": 9.175523989271879, + "grad_norm": 0.1904296875, + "learning_rate": 6.596205423661469e-05, + "loss": 0.4893, + "step": 184740 + }, + { + "epoch": 9.176020661567497, + "grad_norm": 0.1845703125, + "learning_rate": 6.592232045296513e-05, + "loss": 0.4928, + "step": 184750 + }, + { + "epoch": 9.176517333863117, + "grad_norm": 0.185546875, + "learning_rate": 6.588258666931559e-05, + "loss": 0.4845, + "step": 184760 + }, + { + "epoch": 9.177014006158737, + "grad_norm": 0.1953125, + "learning_rate": 6.584285288566603e-05, + "loss": 0.4834, + "step": 184770 + }, + { + "epoch": 9.177510678454356, + "grad_norm": 0.1904296875, + "learning_rate": 6.580311910201649e-05, + "loss": 0.4973, + "step": 184780 + }, + { + "epoch": 9.178007350749976, + "grad_norm": 0.2080078125, + "learning_rate": 6.576338531836695e-05, + "loss": 0.4579, + "step": 184790 + }, + { + "epoch": 9.178504023045594, + "grad_norm": 0.2021484375, + "learning_rate": 6.57236515347174e-05, + "loss": 0.5235, + "step": 184800 + }, + { + "epoch": 9.179000695341214, + "grad_norm": 0.2041015625, + "learning_rate": 6.568391775106785e-05, + "loss": 0.4999, + "step": 184810 + }, + { + "epoch": 9.179497367636833, + "grad_norm": 0.203125, + "learning_rate": 6.564418396741831e-05, + "loss": 0.4872, + "step": 184820 + }, + { + "epoch": 9.179994039932453, + "grad_norm": 0.234375, + "learning_rate": 6.560445018376875e-05, + "loss": 0.4969, + "step": 184830 + }, + { + "epoch": 9.180490712228073, + "grad_norm": 0.2001953125, + "learning_rate": 6.556471640011921e-05, + "loss": 0.4938, + "step": 184840 + }, + { + "epoch": 9.180987384523691, + "grad_norm": 0.177734375, + "learning_rate": 6.552498261646966e-05, + "loss": 0.4687, + "step": 184850 + }, + { + "epoch": 9.181484056819311, + "grad_norm": 0.193359375, + "learning_rate": 6.548524883282011e-05, + "loss": 0.5009, + "step": 184860 + }, + { + "epoch": 9.18198072911493, + "grad_norm": 0.1953125, + "learning_rate": 6.544551504917056e-05, + "loss": 0.491, + "step": 184870 + }, + { + "epoch": 9.18247740141055, + "grad_norm": 0.193359375, + "learning_rate": 6.540578126552101e-05, + "loss": 0.4583, + "step": 184880 + }, + { + "epoch": 9.182974073706168, + "grad_norm": 0.205078125, + "learning_rate": 6.536604748187146e-05, + "loss": 0.4648, + "step": 184890 + }, + { + "epoch": 9.183470746001788, + "grad_norm": 0.240234375, + "learning_rate": 6.532631369822192e-05, + "loss": 0.5045, + "step": 184900 + }, + { + "epoch": 9.183967418297406, + "grad_norm": 0.1826171875, + "learning_rate": 6.528657991457236e-05, + "loss": 0.4865, + "step": 184910 + }, + { + "epoch": 9.184464090593027, + "grad_norm": 0.1845703125, + "learning_rate": 6.524684613092282e-05, + "loss": 0.4882, + "step": 184920 + }, + { + "epoch": 9.184960762888647, + "grad_norm": 0.1865234375, + "learning_rate": 6.520711234727326e-05, + "loss": 0.4672, + "step": 184930 + }, + { + "epoch": 9.185457435184265, + "grad_norm": 0.2314453125, + "learning_rate": 6.516737856362372e-05, + "loss": 0.4775, + "step": 184940 + }, + { + "epoch": 9.185954107479885, + "grad_norm": 0.2177734375, + "learning_rate": 6.512764477997418e-05, + "loss": 0.4545, + "step": 184950 + }, + { + "epoch": 9.186450779775504, + "grad_norm": 0.1962890625, + "learning_rate": 6.508791099632464e-05, + "loss": 0.4564, + "step": 184960 + }, + { + "epoch": 9.186947452071124, + "grad_norm": 0.2265625, + "learning_rate": 6.504817721267508e-05, + "loss": 0.4562, + "step": 184970 + }, + { + "epoch": 9.187444124366742, + "grad_norm": 0.197265625, + "learning_rate": 6.500844342902554e-05, + "loss": 0.4695, + "step": 184980 + }, + { + "epoch": 9.187940796662362, + "grad_norm": 0.205078125, + "learning_rate": 6.496870964537598e-05, + "loss": 0.4632, + "step": 184990 + }, + { + "epoch": 9.188437468957982, + "grad_norm": 0.2109375, + "learning_rate": 6.492897586172644e-05, + "loss": 0.4847, + "step": 185000 + }, + { + "epoch": 9.1889341412536, + "grad_norm": 0.2333984375, + "learning_rate": 6.488924207807689e-05, + "loss": 0.5039, + "step": 185010 + }, + { + "epoch": 9.18943081354922, + "grad_norm": 0.1826171875, + "learning_rate": 6.484950829442734e-05, + "loss": 0.4899, + "step": 185020 + }, + { + "epoch": 9.189927485844839, + "grad_norm": 0.1953125, + "learning_rate": 6.480977451077779e-05, + "loss": 0.4832, + "step": 185030 + }, + { + "epoch": 9.190424158140459, + "grad_norm": 0.201171875, + "learning_rate": 6.477004072712824e-05, + "loss": 0.5118, + "step": 185040 + }, + { + "epoch": 9.190920830436077, + "grad_norm": 0.205078125, + "learning_rate": 6.473030694347869e-05, + "loss": 0.4991, + "step": 185050 + }, + { + "epoch": 9.191417502731698, + "grad_norm": 0.2109375, + "learning_rate": 6.469057315982915e-05, + "loss": 0.4967, + "step": 185060 + }, + { + "epoch": 9.191914175027318, + "grad_norm": 0.205078125, + "learning_rate": 6.465083937617959e-05, + "loss": 0.5004, + "step": 185070 + }, + { + "epoch": 9.192410847322936, + "grad_norm": 0.22265625, + "learning_rate": 6.461110559253005e-05, + "loss": 0.4704, + "step": 185080 + }, + { + "epoch": 9.192907519618556, + "grad_norm": 0.2158203125, + "learning_rate": 6.45713718088805e-05, + "loss": 0.4667, + "step": 185090 + }, + { + "epoch": 9.193404191914174, + "grad_norm": 0.1904296875, + "learning_rate": 6.453163802523095e-05, + "loss": 0.4669, + "step": 185100 + }, + { + "epoch": 9.193900864209795, + "grad_norm": 0.2109375, + "learning_rate": 6.449190424158141e-05, + "loss": 0.4886, + "step": 185110 + }, + { + "epoch": 9.194397536505413, + "grad_norm": 0.2255859375, + "learning_rate": 6.445217045793185e-05, + "loss": 0.4788, + "step": 185120 + }, + { + "epoch": 9.194894208801033, + "grad_norm": 0.2265625, + "learning_rate": 6.441243667428231e-05, + "loss": 0.4927, + "step": 185130 + }, + { + "epoch": 9.195390881096653, + "grad_norm": 0.205078125, + "learning_rate": 6.437270289063277e-05, + "loss": 0.4908, + "step": 185140 + }, + { + "epoch": 9.195887553392271, + "grad_norm": 0.189453125, + "learning_rate": 6.433296910698323e-05, + "loss": 0.4802, + "step": 185150 + }, + { + "epoch": 9.196384225687892, + "grad_norm": 0.1923828125, + "learning_rate": 6.429323532333367e-05, + "loss": 0.4572, + "step": 185160 + }, + { + "epoch": 9.19688089798351, + "grad_norm": 0.181640625, + "learning_rate": 6.425350153968413e-05, + "loss": 0.4916, + "step": 185170 + }, + { + "epoch": 9.19737757027913, + "grad_norm": 0.1845703125, + "learning_rate": 6.421376775603457e-05, + "loss": 0.4472, + "step": 185180 + }, + { + "epoch": 9.197874242574748, + "grad_norm": 0.2001953125, + "learning_rate": 6.417403397238503e-05, + "loss": 0.4867, + "step": 185190 + }, + { + "epoch": 9.198370914870369, + "grad_norm": 0.212890625, + "learning_rate": 6.413430018873547e-05, + "loss": 0.4517, + "step": 185200 + }, + { + "epoch": 9.198867587165989, + "grad_norm": 0.189453125, + "learning_rate": 6.409456640508593e-05, + "loss": 0.4819, + "step": 185210 + }, + { + "epoch": 9.199364259461607, + "grad_norm": 0.1953125, + "learning_rate": 6.405483262143638e-05, + "loss": 0.4861, + "step": 185220 + }, + { + "epoch": 9.199860931757227, + "grad_norm": 0.1953125, + "learning_rate": 6.401509883778683e-05, + "loss": 0.4791, + "step": 185230 + }, + { + "epoch": 9.200357604052845, + "grad_norm": 0.1953125, + "learning_rate": 6.397536505413728e-05, + "loss": 0.5115, + "step": 185240 + }, + { + "epoch": 9.200854276348466, + "grad_norm": 0.2177734375, + "learning_rate": 6.393563127048774e-05, + "loss": 0.5009, + "step": 185250 + }, + { + "epoch": 9.201350948644084, + "grad_norm": 0.1875, + "learning_rate": 6.389589748683818e-05, + "loss": 0.4814, + "step": 185260 + }, + { + "epoch": 9.201847620939704, + "grad_norm": 0.1826171875, + "learning_rate": 6.385616370318864e-05, + "loss": 0.4707, + "step": 185270 + }, + { + "epoch": 9.202344293235324, + "grad_norm": 0.19140625, + "learning_rate": 6.381642991953908e-05, + "loss": 0.4635, + "step": 185280 + }, + { + "epoch": 9.202840965530942, + "grad_norm": 0.2060546875, + "learning_rate": 6.377669613588954e-05, + "loss": 0.446, + "step": 185290 + }, + { + "epoch": 9.203337637826563, + "grad_norm": 0.1923828125, + "learning_rate": 6.373696235224e-05, + "loss": 0.5201, + "step": 185300 + }, + { + "epoch": 9.203834310122181, + "grad_norm": 0.2041015625, + "learning_rate": 6.369722856859046e-05, + "loss": 0.5005, + "step": 185310 + }, + { + "epoch": 9.204330982417801, + "grad_norm": 0.2392578125, + "learning_rate": 6.36574947849409e-05, + "loss": 0.475, + "step": 185320 + }, + { + "epoch": 9.20482765471342, + "grad_norm": 0.193359375, + "learning_rate": 6.361776100129136e-05, + "loss": 0.4586, + "step": 185330 + }, + { + "epoch": 9.20532432700904, + "grad_norm": 0.201171875, + "learning_rate": 6.35780272176418e-05, + "loss": 0.4828, + "step": 185340 + }, + { + "epoch": 9.20582099930466, + "grad_norm": 0.1943359375, + "learning_rate": 6.353829343399226e-05, + "loss": 0.4793, + "step": 185350 + }, + { + "epoch": 9.206317671600278, + "grad_norm": 0.173828125, + "learning_rate": 6.34985596503427e-05, + "loss": 0.477, + "step": 185360 + }, + { + "epoch": 9.206814343895898, + "grad_norm": 0.2109375, + "learning_rate": 6.345882586669316e-05, + "loss": 0.4984, + "step": 185370 + }, + { + "epoch": 9.207311016191516, + "grad_norm": 0.2099609375, + "learning_rate": 6.341909208304361e-05, + "loss": 0.4755, + "step": 185380 + }, + { + "epoch": 9.207807688487136, + "grad_norm": 0.208984375, + "learning_rate": 6.337935829939406e-05, + "loss": 0.5122, + "step": 185390 + }, + { + "epoch": 9.208304360782755, + "grad_norm": 0.1875, + "learning_rate": 6.333962451574451e-05, + "loss": 0.4924, + "step": 185400 + }, + { + "epoch": 9.208801033078375, + "grad_norm": 0.205078125, + "learning_rate": 6.329989073209497e-05, + "loss": 0.4954, + "step": 185410 + }, + { + "epoch": 9.209297705373995, + "grad_norm": 0.1953125, + "learning_rate": 6.326015694844541e-05, + "loss": 0.469, + "step": 185420 + }, + { + "epoch": 9.209794377669613, + "grad_norm": 0.29296875, + "learning_rate": 6.322042316479587e-05, + "loss": 0.4755, + "step": 185430 + }, + { + "epoch": 9.210291049965234, + "grad_norm": 0.19921875, + "learning_rate": 6.318068938114631e-05, + "loss": 0.4923, + "step": 185440 + }, + { + "epoch": 9.210787722260852, + "grad_norm": 0.185546875, + "learning_rate": 6.314095559749677e-05, + "loss": 0.5002, + "step": 185450 + }, + { + "epoch": 9.211284394556472, + "grad_norm": 0.189453125, + "learning_rate": 6.310122181384723e-05, + "loss": 0.5146, + "step": 185460 + }, + { + "epoch": 9.21178106685209, + "grad_norm": 0.1865234375, + "learning_rate": 6.306148803019767e-05, + "loss": 0.5041, + "step": 185470 + }, + { + "epoch": 9.21227773914771, + "grad_norm": 0.2138671875, + "learning_rate": 6.302175424654813e-05, + "loss": 0.4766, + "step": 185480 + }, + { + "epoch": 9.21277441144333, + "grad_norm": 0.212890625, + "learning_rate": 6.298202046289859e-05, + "loss": 0.4516, + "step": 185490 + }, + { + "epoch": 9.213271083738949, + "grad_norm": 0.1884765625, + "learning_rate": 6.294228667924903e-05, + "loss": 0.4993, + "step": 185500 + }, + { + "epoch": 9.213767756034569, + "grad_norm": 0.2177734375, + "learning_rate": 6.290255289559949e-05, + "loss": 0.496, + "step": 185510 + }, + { + "epoch": 9.214264428330187, + "grad_norm": 0.203125, + "learning_rate": 6.286281911194993e-05, + "loss": 0.4954, + "step": 185520 + }, + { + "epoch": 9.214761100625807, + "grad_norm": 0.19140625, + "learning_rate": 6.282308532830039e-05, + "loss": 0.4816, + "step": 185530 + }, + { + "epoch": 9.215257772921426, + "grad_norm": 0.21484375, + "learning_rate": 6.278335154465084e-05, + "loss": 0.4819, + "step": 185540 + }, + { + "epoch": 9.215754445217046, + "grad_norm": 0.1806640625, + "learning_rate": 6.27436177610013e-05, + "loss": 0.4628, + "step": 185550 + }, + { + "epoch": 9.216251117512666, + "grad_norm": 0.2080078125, + "learning_rate": 6.270388397735175e-05, + "loss": 0.4623, + "step": 185560 + }, + { + "epoch": 9.216747789808284, + "grad_norm": 0.1796875, + "learning_rate": 6.26641501937022e-05, + "loss": 0.4967, + "step": 185570 + }, + { + "epoch": 9.217244462103904, + "grad_norm": 0.185546875, + "learning_rate": 6.262441641005265e-05, + "loss": 0.4698, + "step": 185580 + }, + { + "epoch": 9.217741134399523, + "grad_norm": 0.2099609375, + "learning_rate": 6.25846826264031e-05, + "loss": 0.4738, + "step": 185590 + }, + { + "epoch": 9.218237806695143, + "grad_norm": 0.2060546875, + "learning_rate": 6.254494884275356e-05, + "loss": 0.4926, + "step": 185600 + }, + { + "epoch": 9.218734478990761, + "grad_norm": 0.1982421875, + "learning_rate": 6.2505215059104e-05, + "loss": 0.496, + "step": 185610 + }, + { + "epoch": 9.219231151286381, + "grad_norm": 0.193359375, + "learning_rate": 6.246548127545446e-05, + "loss": 0.4931, + "step": 185620 + }, + { + "epoch": 9.219727823582001, + "grad_norm": 0.224609375, + "learning_rate": 6.24257474918049e-05, + "loss": 0.5287, + "step": 185630 + }, + { + "epoch": 9.22022449587762, + "grad_norm": 0.1923828125, + "learning_rate": 6.238601370815536e-05, + "loss": 0.5118, + "step": 185640 + }, + { + "epoch": 9.22072116817324, + "grad_norm": 0.1953125, + "learning_rate": 6.234627992450582e-05, + "loss": 0.4663, + "step": 185650 + }, + { + "epoch": 9.221217840468858, + "grad_norm": 0.189453125, + "learning_rate": 6.230654614085628e-05, + "loss": 0.5234, + "step": 185660 + }, + { + "epoch": 9.221714512764478, + "grad_norm": 0.2265625, + "learning_rate": 6.226681235720672e-05, + "loss": 0.4939, + "step": 185670 + }, + { + "epoch": 9.222211185060097, + "grad_norm": 0.2001953125, + "learning_rate": 6.222707857355718e-05, + "loss": 0.499, + "step": 185680 + }, + { + "epoch": 9.222707857355717, + "grad_norm": 0.19921875, + "learning_rate": 6.218734478990762e-05, + "loss": 0.4977, + "step": 185690 + }, + { + "epoch": 9.223204529651335, + "grad_norm": 0.2041015625, + "learning_rate": 6.214761100625808e-05, + "loss": 0.4894, + "step": 185700 + }, + { + "epoch": 9.223701201946955, + "grad_norm": 0.2041015625, + "learning_rate": 6.210787722260852e-05, + "loss": 0.4917, + "step": 185710 + }, + { + "epoch": 9.224197874242575, + "grad_norm": 0.193359375, + "learning_rate": 6.206814343895898e-05, + "loss": 0.5219, + "step": 185720 + }, + { + "epoch": 9.224694546538194, + "grad_norm": 0.189453125, + "learning_rate": 6.202840965530943e-05, + "loss": 0.4857, + "step": 185730 + }, + { + "epoch": 9.225191218833814, + "grad_norm": 0.212890625, + "learning_rate": 6.198867587165988e-05, + "loss": 0.4869, + "step": 185740 + }, + { + "epoch": 9.225687891129432, + "grad_norm": 0.19140625, + "learning_rate": 6.194894208801033e-05, + "loss": 0.4897, + "step": 185750 + }, + { + "epoch": 9.226184563425052, + "grad_norm": 0.205078125, + "learning_rate": 6.190920830436079e-05, + "loss": 0.4721, + "step": 185760 + }, + { + "epoch": 9.22668123572067, + "grad_norm": 0.1923828125, + "learning_rate": 6.186947452071123e-05, + "loss": 0.4598, + "step": 185770 + }, + { + "epoch": 9.22717790801629, + "grad_norm": 0.2080078125, + "learning_rate": 6.182974073706169e-05, + "loss": 0.5066, + "step": 185780 + }, + { + "epoch": 9.227674580311911, + "grad_norm": 0.201171875, + "learning_rate": 6.179000695341213e-05, + "loss": 0.4993, + "step": 185790 + }, + { + "epoch": 9.22817125260753, + "grad_norm": 0.17578125, + "learning_rate": 6.175027316976259e-05, + "loss": 0.4807, + "step": 185800 + }, + { + "epoch": 9.22866792490315, + "grad_norm": 0.201171875, + "learning_rate": 6.171053938611305e-05, + "loss": 0.4626, + "step": 185810 + }, + { + "epoch": 9.229164597198768, + "grad_norm": 0.185546875, + "learning_rate": 6.167080560246349e-05, + "loss": 0.4757, + "step": 185820 + }, + { + "epoch": 9.229661269494388, + "grad_norm": 0.189453125, + "learning_rate": 6.163107181881395e-05, + "loss": 0.4612, + "step": 185830 + }, + { + "epoch": 9.230157941790006, + "grad_norm": 0.2138671875, + "learning_rate": 6.159133803516441e-05, + "loss": 0.4929, + "step": 185840 + }, + { + "epoch": 9.230654614085626, + "grad_norm": 0.1875, + "learning_rate": 6.155160425151485e-05, + "loss": 0.483, + "step": 185850 + }, + { + "epoch": 9.231151286381246, + "grad_norm": 0.19140625, + "learning_rate": 6.151187046786531e-05, + "loss": 0.4785, + "step": 185860 + }, + { + "epoch": 9.231647958676865, + "grad_norm": 0.1953125, + "learning_rate": 6.147213668421575e-05, + "loss": 0.4799, + "step": 185870 + }, + { + "epoch": 9.232144630972485, + "grad_norm": 0.1982421875, + "learning_rate": 6.143240290056621e-05, + "loss": 0.49, + "step": 185880 + }, + { + "epoch": 9.232641303268103, + "grad_norm": 0.197265625, + "learning_rate": 6.139266911691666e-05, + "loss": 0.4554, + "step": 185890 + }, + { + "epoch": 9.233137975563723, + "grad_norm": 0.189453125, + "learning_rate": 6.135293533326711e-05, + "loss": 0.4884, + "step": 185900 + }, + { + "epoch": 9.233634647859342, + "grad_norm": 0.19921875, + "learning_rate": 6.131320154961756e-05, + "loss": 0.4668, + "step": 185910 + }, + { + "epoch": 9.234131320154962, + "grad_norm": 0.1865234375, + "learning_rate": 6.127346776596802e-05, + "loss": 0.4803, + "step": 185920 + }, + { + "epoch": 9.234627992450582, + "grad_norm": 0.1953125, + "learning_rate": 6.123373398231846e-05, + "loss": 0.5186, + "step": 185930 + }, + { + "epoch": 9.2351246647462, + "grad_norm": 0.244140625, + "learning_rate": 6.119400019866892e-05, + "loss": 0.4741, + "step": 185940 + }, + { + "epoch": 9.23562133704182, + "grad_norm": 0.1923828125, + "learning_rate": 6.115426641501936e-05, + "loss": 0.4883, + "step": 185950 + }, + { + "epoch": 9.236118009337439, + "grad_norm": 0.185546875, + "learning_rate": 6.111453263136982e-05, + "loss": 0.4836, + "step": 185960 + }, + { + "epoch": 9.236614681633059, + "grad_norm": 0.1904296875, + "learning_rate": 6.107479884772028e-05, + "loss": 0.4724, + "step": 185970 + }, + { + "epoch": 9.237111353928677, + "grad_norm": 0.216796875, + "learning_rate": 6.103506506407073e-05, + "loss": 0.5036, + "step": 185980 + }, + { + "epoch": 9.237608026224297, + "grad_norm": 0.2099609375, + "learning_rate": 6.099533128042119e-05, + "loss": 0.4576, + "step": 185990 + }, + { + "epoch": 9.238104698519917, + "grad_norm": 0.232421875, + "learning_rate": 6.095559749677163e-05, + "loss": 0.4848, + "step": 186000 + }, + { + "epoch": 9.238601370815536, + "grad_norm": 0.20703125, + "learning_rate": 6.091586371312209e-05, + "loss": 0.4708, + "step": 186010 + }, + { + "epoch": 9.239098043111156, + "grad_norm": 0.193359375, + "learning_rate": 6.0876129929472534e-05, + "loss": 0.4908, + "step": 186020 + }, + { + "epoch": 9.239594715406774, + "grad_norm": 0.203125, + "learning_rate": 6.083639614582299e-05, + "loss": 0.5043, + "step": 186030 + }, + { + "epoch": 9.240091387702394, + "grad_norm": 0.216796875, + "learning_rate": 6.079666236217344e-05, + "loss": 0.4702, + "step": 186040 + }, + { + "epoch": 9.240588059998013, + "grad_norm": 0.1845703125, + "learning_rate": 6.07569285785239e-05, + "loss": 0.4784, + "step": 186050 + }, + { + "epoch": 9.241084732293633, + "grad_norm": 0.1962890625, + "learning_rate": 6.0717194794874345e-05, + "loss": 0.485, + "step": 186060 + }, + { + "epoch": 9.241581404589253, + "grad_norm": 0.25390625, + "learning_rate": 6.06774610112248e-05, + "loss": 0.4819, + "step": 186070 + }, + { + "epoch": 9.242078076884871, + "grad_norm": 0.19140625, + "learning_rate": 6.063772722757525e-05, + "loss": 0.4849, + "step": 186080 + }, + { + "epoch": 9.242574749180491, + "grad_norm": 0.1904296875, + "learning_rate": 6.0597993443925704e-05, + "loss": 0.5043, + "step": 186090 + }, + { + "epoch": 9.24307142147611, + "grad_norm": 0.1904296875, + "learning_rate": 6.055825966027615e-05, + "loss": 0.4819, + "step": 186100 + }, + { + "epoch": 9.24356809377173, + "grad_norm": 0.1943359375, + "learning_rate": 6.0518525876626606e-05, + "loss": 0.4952, + "step": 186110 + }, + { + "epoch": 9.244064766067348, + "grad_norm": 0.1884765625, + "learning_rate": 6.047879209297706e-05, + "loss": 0.4589, + "step": 186120 + }, + { + "epoch": 9.244561438362968, + "grad_norm": 0.23046875, + "learning_rate": 6.0439058309327515e-05, + "loss": 0.5142, + "step": 186130 + }, + { + "epoch": 9.245058110658588, + "grad_norm": 0.2099609375, + "learning_rate": 6.039932452567796e-05, + "loss": 0.4881, + "step": 186140 + }, + { + "epoch": 9.245554782954207, + "grad_norm": 0.1923828125, + "learning_rate": 6.035959074202842e-05, + "loss": 0.4849, + "step": 186150 + }, + { + "epoch": 9.246051455249827, + "grad_norm": 0.2197265625, + "learning_rate": 6.031985695837886e-05, + "loss": 0.4902, + "step": 186160 + }, + { + "epoch": 9.246548127545445, + "grad_norm": 0.177734375, + "learning_rate": 6.028012317472932e-05, + "loss": 0.5227, + "step": 186170 + }, + { + "epoch": 9.247044799841065, + "grad_norm": 0.203125, + "learning_rate": 6.0240389391079764e-05, + "loss": 0.5197, + "step": 186180 + }, + { + "epoch": 9.247541472136684, + "grad_norm": 0.21875, + "learning_rate": 6.020065560743022e-05, + "loss": 0.5058, + "step": 186190 + }, + { + "epoch": 9.248038144432304, + "grad_norm": 0.1845703125, + "learning_rate": 6.016092182378067e-05, + "loss": 0.4697, + "step": 186200 + }, + { + "epoch": 9.248534816727924, + "grad_norm": 0.21484375, + "learning_rate": 6.0121188040131124e-05, + "loss": 0.5106, + "step": 186210 + }, + { + "epoch": 9.249031489023542, + "grad_norm": 0.2197265625, + "learning_rate": 6.0081454256481575e-05, + "loss": 0.4796, + "step": 186220 + }, + { + "epoch": 9.249528161319162, + "grad_norm": 0.2109375, + "learning_rate": 6.004172047283203e-05, + "loss": 0.4907, + "step": 186230 + }, + { + "epoch": 9.25002483361478, + "grad_norm": 0.189453125, + "learning_rate": 6.0001986689182477e-05, + "loss": 0.4646, + "step": 186240 + }, + { + "epoch": 9.2505215059104, + "grad_norm": 0.177734375, + "learning_rate": 5.9962252905532934e-05, + "loss": 0.4674, + "step": 186250 + }, + { + "epoch": 9.251018178206019, + "grad_norm": 0.1865234375, + "learning_rate": 5.992251912188338e-05, + "loss": 0.4699, + "step": 186260 + }, + { + "epoch": 9.25151485050164, + "grad_norm": 0.2021484375, + "learning_rate": 5.9882785338233836e-05, + "loss": 0.4921, + "step": 186270 + }, + { + "epoch": 9.252011522797257, + "grad_norm": 0.20703125, + "learning_rate": 5.984305155458428e-05, + "loss": 0.5019, + "step": 186280 + }, + { + "epoch": 9.252508195092878, + "grad_norm": 0.1845703125, + "learning_rate": 5.980331777093474e-05, + "loss": 0.4835, + "step": 186290 + }, + { + "epoch": 9.253004867388498, + "grad_norm": 0.19921875, + "learning_rate": 5.976358398728519e-05, + "loss": 0.5084, + "step": 186300 + }, + { + "epoch": 9.253501539684116, + "grad_norm": 0.232421875, + "learning_rate": 5.972385020363565e-05, + "loss": 0.5069, + "step": 186310 + }, + { + "epoch": 9.253998211979736, + "grad_norm": 0.2119140625, + "learning_rate": 5.968411641998609e-05, + "loss": 0.4888, + "step": 186320 + }, + { + "epoch": 9.254494884275354, + "grad_norm": 0.1904296875, + "learning_rate": 5.964438263633655e-05, + "loss": 0.4543, + "step": 186330 + }, + { + "epoch": 9.254991556570975, + "grad_norm": 0.1884765625, + "learning_rate": 5.9604648852686994e-05, + "loss": 0.4822, + "step": 186340 + }, + { + "epoch": 9.255488228866593, + "grad_norm": 0.2021484375, + "learning_rate": 5.956491506903745e-05, + "loss": 0.4528, + "step": 186350 + }, + { + "epoch": 9.255984901162213, + "grad_norm": 0.2353515625, + "learning_rate": 5.9525181285387896e-05, + "loss": 0.5025, + "step": 186360 + }, + { + "epoch": 9.256481573457833, + "grad_norm": 0.18359375, + "learning_rate": 5.9485447501738353e-05, + "loss": 0.4753, + "step": 186370 + }, + { + "epoch": 9.256978245753452, + "grad_norm": 0.2099609375, + "learning_rate": 5.9445713718088805e-05, + "loss": 0.4525, + "step": 186380 + }, + { + "epoch": 9.257474918049072, + "grad_norm": 0.212890625, + "learning_rate": 5.940597993443926e-05, + "loss": 0.4734, + "step": 186390 + }, + { + "epoch": 9.25797159034469, + "grad_norm": 0.18359375, + "learning_rate": 5.936624615078972e-05, + "loss": 0.467, + "step": 186400 + }, + { + "epoch": 9.25846826264031, + "grad_norm": 0.23046875, + "learning_rate": 5.9326512367140164e-05, + "loss": 0.5307, + "step": 186410 + }, + { + "epoch": 9.258964934935928, + "grad_norm": 0.2001953125, + "learning_rate": 5.928677858349062e-05, + "loss": 0.492, + "step": 186420 + }, + { + "epoch": 9.259461607231549, + "grad_norm": 0.20703125, + "learning_rate": 5.9247044799841066e-05, + "loss": 0.4876, + "step": 186430 + }, + { + "epoch": 9.259958279527169, + "grad_norm": 0.181640625, + "learning_rate": 5.9207311016191524e-05, + "loss": 0.4827, + "step": 186440 + }, + { + "epoch": 9.260454951822787, + "grad_norm": 0.2001953125, + "learning_rate": 5.916757723254197e-05, + "loss": 0.5002, + "step": 186450 + }, + { + "epoch": 9.260951624118407, + "grad_norm": 0.1953125, + "learning_rate": 5.9127843448892426e-05, + "loss": 0.4513, + "step": 186460 + }, + { + "epoch": 9.261448296414025, + "grad_norm": 0.23046875, + "learning_rate": 5.908810966524288e-05, + "loss": 0.5047, + "step": 186470 + }, + { + "epoch": 9.261944968709646, + "grad_norm": 0.1953125, + "learning_rate": 5.9048375881593335e-05, + "loss": 0.4695, + "step": 186480 + }, + { + "epoch": 9.262441641005264, + "grad_norm": 0.205078125, + "learning_rate": 5.900864209794378e-05, + "loss": 0.4897, + "step": 186490 + }, + { + "epoch": 9.262938313300884, + "grad_norm": 0.1962890625, + "learning_rate": 5.896890831429424e-05, + "loss": 0.4734, + "step": 186500 + }, + { + "epoch": 9.263434985596504, + "grad_norm": 0.1826171875, + "learning_rate": 5.892917453064468e-05, + "loss": 0.4814, + "step": 186510 + }, + { + "epoch": 9.263931657892122, + "grad_norm": 0.2109375, + "learning_rate": 5.888944074699514e-05, + "loss": 0.5046, + "step": 186520 + }, + { + "epoch": 9.264428330187743, + "grad_norm": 0.1865234375, + "learning_rate": 5.8849706963345583e-05, + "loss": 0.4948, + "step": 186530 + }, + { + "epoch": 9.264925002483361, + "grad_norm": 0.2060546875, + "learning_rate": 5.880997317969604e-05, + "loss": 0.5241, + "step": 186540 + }, + { + "epoch": 9.265421674778981, + "grad_norm": 0.2041015625, + "learning_rate": 5.877023939604649e-05, + "loss": 0.5078, + "step": 186550 + }, + { + "epoch": 9.2659183470746, + "grad_norm": 0.212890625, + "learning_rate": 5.873050561239694e-05, + "loss": 0.4903, + "step": 186560 + }, + { + "epoch": 9.26641501937022, + "grad_norm": 0.197265625, + "learning_rate": 5.8690771828747394e-05, + "loss": 0.4595, + "step": 186570 + }, + { + "epoch": 9.26691169166584, + "grad_norm": 0.1884765625, + "learning_rate": 5.865103804509785e-05, + "loss": 0.4921, + "step": 186580 + }, + { + "epoch": 9.267408363961458, + "grad_norm": 0.1982421875, + "learning_rate": 5.8611304261448296e-05, + "loss": 0.4667, + "step": 186590 + }, + { + "epoch": 9.267905036257078, + "grad_norm": 0.21484375, + "learning_rate": 5.8571570477798754e-05, + "loss": 0.4944, + "step": 186600 + }, + { + "epoch": 9.268401708552696, + "grad_norm": 0.2158203125, + "learning_rate": 5.85318366941492e-05, + "loss": 0.4866, + "step": 186610 + }, + { + "epoch": 9.268898380848317, + "grad_norm": 0.23046875, + "learning_rate": 5.8492102910499656e-05, + "loss": 0.454, + "step": 186620 + }, + { + "epoch": 9.269395053143935, + "grad_norm": 0.220703125, + "learning_rate": 5.84523691268501e-05, + "loss": 0.4836, + "step": 186630 + }, + { + "epoch": 9.269891725439555, + "grad_norm": 0.1865234375, + "learning_rate": 5.841263534320056e-05, + "loss": 0.4713, + "step": 186640 + }, + { + "epoch": 9.270388397735175, + "grad_norm": 0.224609375, + "learning_rate": 5.837290155955101e-05, + "loss": 0.5056, + "step": 186650 + }, + { + "epoch": 9.270885070030793, + "grad_norm": 0.193359375, + "learning_rate": 5.833316777590147e-05, + "loss": 0.4788, + "step": 186660 + }, + { + "epoch": 9.271381742326414, + "grad_norm": 0.216796875, + "learning_rate": 5.829343399225191e-05, + "loss": 0.4942, + "step": 186670 + }, + { + "epoch": 9.271878414622032, + "grad_norm": 0.2060546875, + "learning_rate": 5.825370020860237e-05, + "loss": 0.4962, + "step": 186680 + }, + { + "epoch": 9.272375086917652, + "grad_norm": 0.259765625, + "learning_rate": 5.8213966424952813e-05, + "loss": 0.5012, + "step": 186690 + }, + { + "epoch": 9.27287175921327, + "grad_norm": 0.1884765625, + "learning_rate": 5.817423264130327e-05, + "loss": 0.4848, + "step": 186700 + }, + { + "epoch": 9.27336843150889, + "grad_norm": 0.1796875, + "learning_rate": 5.8134498857653716e-05, + "loss": 0.5006, + "step": 186710 + }, + { + "epoch": 9.27386510380451, + "grad_norm": 0.1884765625, + "learning_rate": 5.809476507400417e-05, + "loss": 0.4955, + "step": 186720 + }, + { + "epoch": 9.274361776100129, + "grad_norm": 0.205078125, + "learning_rate": 5.8055031290354624e-05, + "loss": 0.4762, + "step": 186730 + }, + { + "epoch": 9.274858448395749, + "grad_norm": 0.1845703125, + "learning_rate": 5.801529750670508e-05, + "loss": 0.5008, + "step": 186740 + }, + { + "epoch": 9.275355120691367, + "grad_norm": 0.224609375, + "learning_rate": 5.7975563723055526e-05, + "loss": 0.501, + "step": 186750 + }, + { + "epoch": 9.275851792986987, + "grad_norm": 0.1796875, + "learning_rate": 5.7935829939405984e-05, + "loss": 0.4476, + "step": 186760 + }, + { + "epoch": 9.276348465282606, + "grad_norm": 0.2021484375, + "learning_rate": 5.789609615575643e-05, + "loss": 0.4489, + "step": 186770 + }, + { + "epoch": 9.276845137578226, + "grad_norm": 0.2197265625, + "learning_rate": 5.7856362372106886e-05, + "loss": 0.4733, + "step": 186780 + }, + { + "epoch": 9.277341809873846, + "grad_norm": 0.23046875, + "learning_rate": 5.781662858845733e-05, + "loss": 0.5189, + "step": 186790 + }, + { + "epoch": 9.277838482169464, + "grad_norm": 0.2333984375, + "learning_rate": 5.777689480480779e-05, + "loss": 0.5196, + "step": 186800 + }, + { + "epoch": 9.278335154465084, + "grad_norm": 0.2109375, + "learning_rate": 5.7737161021158246e-05, + "loss": 0.4705, + "step": 186810 + }, + { + "epoch": 9.278831826760703, + "grad_norm": 0.2021484375, + "learning_rate": 5.76974272375087e-05, + "loss": 0.4956, + "step": 186820 + }, + { + "epoch": 9.279328499056323, + "grad_norm": 0.1923828125, + "learning_rate": 5.7657693453859155e-05, + "loss": 0.5072, + "step": 186830 + }, + { + "epoch": 9.279825171351941, + "grad_norm": 0.1943359375, + "learning_rate": 5.76179596702096e-05, + "loss": 0.4981, + "step": 186840 + }, + { + "epoch": 9.280321843647561, + "grad_norm": 0.1875, + "learning_rate": 5.757822588656006e-05, + "loss": 0.47, + "step": 186850 + }, + { + "epoch": 9.280818515943182, + "grad_norm": 0.2138671875, + "learning_rate": 5.75384921029105e-05, + "loss": 0.5055, + "step": 186860 + }, + { + "epoch": 9.2813151882388, + "grad_norm": 0.19921875, + "learning_rate": 5.749875831926096e-05, + "loss": 0.4947, + "step": 186870 + }, + { + "epoch": 9.28181186053442, + "grad_norm": 0.2021484375, + "learning_rate": 5.74590245356114e-05, + "loss": 0.5082, + "step": 186880 + }, + { + "epoch": 9.282308532830038, + "grad_norm": 0.2001953125, + "learning_rate": 5.741929075196186e-05, + "loss": 0.4985, + "step": 186890 + }, + { + "epoch": 9.282805205125658, + "grad_norm": 0.197265625, + "learning_rate": 5.737955696831231e-05, + "loss": 0.4571, + "step": 186900 + }, + { + "epoch": 9.283301877421277, + "grad_norm": 0.1796875, + "learning_rate": 5.733982318466276e-05, + "loss": 0.4831, + "step": 186910 + }, + { + "epoch": 9.283798549716897, + "grad_norm": 0.1923828125, + "learning_rate": 5.7300089401013214e-05, + "loss": 0.4659, + "step": 186920 + }, + { + "epoch": 9.284295222012517, + "grad_norm": 0.1875, + "learning_rate": 5.726035561736367e-05, + "loss": 0.4556, + "step": 186930 + }, + { + "epoch": 9.284791894308135, + "grad_norm": 0.236328125, + "learning_rate": 5.7220621833714116e-05, + "loss": 0.4494, + "step": 186940 + }, + { + "epoch": 9.285288566603755, + "grad_norm": 0.1953125, + "learning_rate": 5.7180888050064574e-05, + "loss": 0.4987, + "step": 186950 + }, + { + "epoch": 9.285785238899374, + "grad_norm": 0.19921875, + "learning_rate": 5.714115426641502e-05, + "loss": 0.4772, + "step": 186960 + }, + { + "epoch": 9.286281911194994, + "grad_norm": 0.1962890625, + "learning_rate": 5.7101420482765476e-05, + "loss": 0.4859, + "step": 186970 + }, + { + "epoch": 9.286778583490612, + "grad_norm": 0.20703125, + "learning_rate": 5.706168669911593e-05, + "loss": 0.5106, + "step": 186980 + }, + { + "epoch": 9.287275255786232, + "grad_norm": 0.197265625, + "learning_rate": 5.702195291546638e-05, + "loss": 0.4717, + "step": 186990 + }, + { + "epoch": 9.287771928081852, + "grad_norm": 0.2236328125, + "learning_rate": 5.698221913181683e-05, + "loss": 0.4641, + "step": 187000 + }, + { + "epoch": 9.28826860037747, + "grad_norm": 0.193359375, + "learning_rate": 5.694248534816729e-05, + "loss": 0.5042, + "step": 187010 + }, + { + "epoch": 9.288765272673091, + "grad_norm": 0.1982421875, + "learning_rate": 5.690275156451773e-05, + "loss": 0.4724, + "step": 187020 + }, + { + "epoch": 9.28926194496871, + "grad_norm": 0.2001953125, + "learning_rate": 5.686301778086819e-05, + "loss": 0.4697, + "step": 187030 + }, + { + "epoch": 9.28975861726433, + "grad_norm": 0.2119140625, + "learning_rate": 5.682328399721863e-05, + "loss": 0.4926, + "step": 187040 + }, + { + "epoch": 9.290255289559948, + "grad_norm": 0.2060546875, + "learning_rate": 5.678355021356909e-05, + "loss": 0.4981, + "step": 187050 + }, + { + "epoch": 9.290751961855568, + "grad_norm": 0.216796875, + "learning_rate": 5.6743816429919535e-05, + "loss": 0.4962, + "step": 187060 + }, + { + "epoch": 9.291248634151188, + "grad_norm": 0.2021484375, + "learning_rate": 5.670408264626999e-05, + "loss": 0.4751, + "step": 187070 + }, + { + "epoch": 9.291745306446806, + "grad_norm": 0.2099609375, + "learning_rate": 5.6664348862620444e-05, + "loss": 0.489, + "step": 187080 + }, + { + "epoch": 9.292241978742426, + "grad_norm": 0.1923828125, + "learning_rate": 5.66246150789709e-05, + "loss": 0.4979, + "step": 187090 + }, + { + "epoch": 9.292738651038045, + "grad_norm": 0.19140625, + "learning_rate": 5.6584881295321346e-05, + "loss": 0.4927, + "step": 187100 + }, + { + "epoch": 9.293235323333665, + "grad_norm": 0.1943359375, + "learning_rate": 5.6545147511671804e-05, + "loss": 0.489, + "step": 187110 + }, + { + "epoch": 9.293731995629283, + "grad_norm": 0.2099609375, + "learning_rate": 5.650541372802225e-05, + "loss": 0.4744, + "step": 187120 + }, + { + "epoch": 9.294228667924903, + "grad_norm": 0.2021484375, + "learning_rate": 5.6465679944372706e-05, + "loss": 0.5103, + "step": 187130 + }, + { + "epoch": 9.294725340220522, + "grad_norm": 0.2060546875, + "learning_rate": 5.642594616072315e-05, + "loss": 0.493, + "step": 187140 + }, + { + "epoch": 9.295222012516142, + "grad_norm": 0.1728515625, + "learning_rate": 5.638621237707361e-05, + "loss": 0.4323, + "step": 187150 + }, + { + "epoch": 9.295718684811762, + "grad_norm": 0.2138671875, + "learning_rate": 5.634647859342406e-05, + "loss": 0.4903, + "step": 187160 + }, + { + "epoch": 9.29621535710738, + "grad_norm": 0.2275390625, + "learning_rate": 5.630674480977452e-05, + "loss": 0.4894, + "step": 187170 + }, + { + "epoch": 9.296712029403, + "grad_norm": 0.208984375, + "learning_rate": 5.626701102612496e-05, + "loss": 0.5074, + "step": 187180 + }, + { + "epoch": 9.297208701698619, + "grad_norm": 0.20703125, + "learning_rate": 5.622727724247542e-05, + "loss": 0.5152, + "step": 187190 + }, + { + "epoch": 9.297705373994239, + "grad_norm": 0.205078125, + "learning_rate": 5.618754345882586e-05, + "loss": 0.4853, + "step": 187200 + }, + { + "epoch": 9.298202046289857, + "grad_norm": 0.2060546875, + "learning_rate": 5.614780967517632e-05, + "loss": 0.5112, + "step": 187210 + }, + { + "epoch": 9.298698718585477, + "grad_norm": 0.2060546875, + "learning_rate": 5.6108075891526765e-05, + "loss": 0.5068, + "step": 187220 + }, + { + "epoch": 9.299195390881097, + "grad_norm": 0.1982421875, + "learning_rate": 5.606834210787722e-05, + "loss": 0.4822, + "step": 187230 + }, + { + "epoch": 9.299692063176716, + "grad_norm": 0.1787109375, + "learning_rate": 5.602860832422768e-05, + "loss": 0.4783, + "step": 187240 + }, + { + "epoch": 9.300188735472336, + "grad_norm": 0.18359375, + "learning_rate": 5.598887454057813e-05, + "loss": 0.4732, + "step": 187250 + }, + { + "epoch": 9.300685407767954, + "grad_norm": 0.20703125, + "learning_rate": 5.594914075692858e-05, + "loss": 0.4715, + "step": 187260 + }, + { + "epoch": 9.301182080063574, + "grad_norm": 0.197265625, + "learning_rate": 5.5909406973279034e-05, + "loss": 0.4939, + "step": 187270 + }, + { + "epoch": 9.301678752359193, + "grad_norm": 0.2001953125, + "learning_rate": 5.586967318962949e-05, + "loss": 0.4863, + "step": 187280 + }, + { + "epoch": 9.302175424654813, + "grad_norm": 0.1748046875, + "learning_rate": 5.5829939405979936e-05, + "loss": 0.4837, + "step": 187290 + }, + { + "epoch": 9.302672096950433, + "grad_norm": 0.2001953125, + "learning_rate": 5.5790205622330394e-05, + "loss": 0.4811, + "step": 187300 + }, + { + "epoch": 9.303168769246051, + "grad_norm": 0.185546875, + "learning_rate": 5.575047183868084e-05, + "loss": 0.4762, + "step": 187310 + }, + { + "epoch": 9.303665441541671, + "grad_norm": 0.2314453125, + "learning_rate": 5.5710738055031296e-05, + "loss": 0.511, + "step": 187320 + }, + { + "epoch": 9.30416211383729, + "grad_norm": 0.1923828125, + "learning_rate": 5.567100427138175e-05, + "loss": 0.4655, + "step": 187330 + }, + { + "epoch": 9.30465878613291, + "grad_norm": 0.19921875, + "learning_rate": 5.56312704877322e-05, + "loss": 0.4847, + "step": 187340 + }, + { + "epoch": 9.305155458428528, + "grad_norm": 0.2275390625, + "learning_rate": 5.559153670408265e-05, + "loss": 0.486, + "step": 187350 + }, + { + "epoch": 9.305652130724148, + "grad_norm": 0.2080078125, + "learning_rate": 5.555180292043311e-05, + "loss": 0.4938, + "step": 187360 + }, + { + "epoch": 9.306148803019768, + "grad_norm": 0.1982421875, + "learning_rate": 5.551206913678355e-05, + "loss": 0.4912, + "step": 187370 + }, + { + "epoch": 9.306645475315387, + "grad_norm": 0.2119140625, + "learning_rate": 5.547233535313401e-05, + "loss": 0.5352, + "step": 187380 + }, + { + "epoch": 9.307142147611007, + "grad_norm": 0.203125, + "learning_rate": 5.543260156948445e-05, + "loss": 0.4706, + "step": 187390 + }, + { + "epoch": 9.307638819906625, + "grad_norm": 0.2080078125, + "learning_rate": 5.539286778583491e-05, + "loss": 0.4798, + "step": 187400 + }, + { + "epoch": 9.308135492202245, + "grad_norm": 0.2080078125, + "learning_rate": 5.5353134002185355e-05, + "loss": 0.4745, + "step": 187410 + }, + { + "epoch": 9.308632164497864, + "grad_norm": 0.1982421875, + "learning_rate": 5.531340021853581e-05, + "loss": 0.4589, + "step": 187420 + }, + { + "epoch": 9.309128836793484, + "grad_norm": 0.1962890625, + "learning_rate": 5.5273666434886264e-05, + "loss": 0.4995, + "step": 187430 + }, + { + "epoch": 9.309625509089104, + "grad_norm": 0.2177734375, + "learning_rate": 5.523393265123672e-05, + "loss": 0.5002, + "step": 187440 + }, + { + "epoch": 9.310122181384722, + "grad_norm": 0.2236328125, + "learning_rate": 5.5194198867587166e-05, + "loss": 0.4806, + "step": 187450 + }, + { + "epoch": 9.310618853680342, + "grad_norm": 0.22265625, + "learning_rate": 5.5154465083937624e-05, + "loss": 0.4971, + "step": 187460 + }, + { + "epoch": 9.31111552597596, + "grad_norm": 0.2099609375, + "learning_rate": 5.511473130028807e-05, + "loss": 0.4762, + "step": 187470 + }, + { + "epoch": 9.31161219827158, + "grad_norm": 0.2041015625, + "learning_rate": 5.5074997516638526e-05, + "loss": 0.5103, + "step": 187480 + }, + { + "epoch": 9.312108870567199, + "grad_norm": 0.1884765625, + "learning_rate": 5.503526373298897e-05, + "loss": 0.498, + "step": 187490 + }, + { + "epoch": 9.31260554286282, + "grad_norm": 0.22265625, + "learning_rate": 5.499552994933943e-05, + "loss": 0.4498, + "step": 187500 + }, + { + "epoch": 9.31310221515844, + "grad_norm": 0.2255859375, + "learning_rate": 5.495579616568988e-05, + "loss": 0.5271, + "step": 187510 + }, + { + "epoch": 9.313598887454058, + "grad_norm": 0.22265625, + "learning_rate": 5.491606238204034e-05, + "loss": 0.523, + "step": 187520 + }, + { + "epoch": 9.314095559749678, + "grad_norm": 0.1923828125, + "learning_rate": 5.487632859839078e-05, + "loss": 0.4399, + "step": 187530 + }, + { + "epoch": 9.314592232045296, + "grad_norm": 0.19140625, + "learning_rate": 5.483659481474124e-05, + "loss": 0.4728, + "step": 187540 + }, + { + "epoch": 9.315088904340916, + "grad_norm": 0.232421875, + "learning_rate": 5.479686103109168e-05, + "loss": 0.4888, + "step": 187550 + }, + { + "epoch": 9.315585576636535, + "grad_norm": 0.2412109375, + "learning_rate": 5.475712724744214e-05, + "loss": 0.4777, + "step": 187560 + }, + { + "epoch": 9.316082248932155, + "grad_norm": 0.234375, + "learning_rate": 5.4717393463792585e-05, + "loss": 0.5027, + "step": 187570 + }, + { + "epoch": 9.316578921227775, + "grad_norm": 0.1865234375, + "learning_rate": 5.467765968014304e-05, + "loss": 0.4625, + "step": 187580 + }, + { + "epoch": 9.317075593523393, + "grad_norm": 0.197265625, + "learning_rate": 5.4637925896493494e-05, + "loss": 0.4635, + "step": 187590 + }, + { + "epoch": 9.317572265819013, + "grad_norm": 0.1875, + "learning_rate": 5.459819211284395e-05, + "loss": 0.4702, + "step": 187600 + }, + { + "epoch": 9.318068938114632, + "grad_norm": 0.2177734375, + "learning_rate": 5.4558458329194396e-05, + "loss": 0.4579, + "step": 187610 + }, + { + "epoch": 9.318565610410252, + "grad_norm": 0.2431640625, + "learning_rate": 5.4518724545544854e-05, + "loss": 0.504, + "step": 187620 + }, + { + "epoch": 9.31906228270587, + "grad_norm": 0.240234375, + "learning_rate": 5.44789907618953e-05, + "loss": 0.4948, + "step": 187630 + }, + { + "epoch": 9.31955895500149, + "grad_norm": 0.2099609375, + "learning_rate": 5.4439256978245756e-05, + "loss": 0.4738, + "step": 187640 + }, + { + "epoch": 9.320055627297108, + "grad_norm": 0.220703125, + "learning_rate": 5.43995231945962e-05, + "loss": 0.4768, + "step": 187650 + }, + { + "epoch": 9.320552299592729, + "grad_norm": 0.2080078125, + "learning_rate": 5.435978941094666e-05, + "loss": 0.5013, + "step": 187660 + }, + { + "epoch": 9.321048971888349, + "grad_norm": 0.2138671875, + "learning_rate": 5.4320055627297116e-05, + "loss": 0.4793, + "step": 187670 + }, + { + "epoch": 9.321545644183967, + "grad_norm": 0.185546875, + "learning_rate": 5.428032184364757e-05, + "loss": 0.4856, + "step": 187680 + }, + { + "epoch": 9.322042316479587, + "grad_norm": 0.2041015625, + "learning_rate": 5.424058805999802e-05, + "loss": 0.4884, + "step": 187690 + }, + { + "epoch": 9.322538988775205, + "grad_norm": 0.201171875, + "learning_rate": 5.420085427634847e-05, + "loss": 0.5033, + "step": 187700 + }, + { + "epoch": 9.323035661070826, + "grad_norm": 0.20703125, + "learning_rate": 5.416112049269893e-05, + "loss": 0.4459, + "step": 187710 + }, + { + "epoch": 9.323532333366444, + "grad_norm": 0.2080078125, + "learning_rate": 5.412138670904937e-05, + "loss": 0.4816, + "step": 187720 + }, + { + "epoch": 9.324029005662064, + "grad_norm": 0.2001953125, + "learning_rate": 5.408165292539983e-05, + "loss": 0.4857, + "step": 187730 + }, + { + "epoch": 9.324525677957684, + "grad_norm": 0.1904296875, + "learning_rate": 5.404191914175027e-05, + "loss": 0.4951, + "step": 187740 + }, + { + "epoch": 9.325022350253303, + "grad_norm": 0.189453125, + "learning_rate": 5.400218535810073e-05, + "loss": 0.4738, + "step": 187750 + }, + { + "epoch": 9.325519022548923, + "grad_norm": 0.185546875, + "learning_rate": 5.3962451574451175e-05, + "loss": 0.4828, + "step": 187760 + }, + { + "epoch": 9.326015694844541, + "grad_norm": 0.2412109375, + "learning_rate": 5.392271779080163e-05, + "loss": 0.4806, + "step": 187770 + }, + { + "epoch": 9.326512367140161, + "grad_norm": 0.19140625, + "learning_rate": 5.3882984007152084e-05, + "loss": 0.5165, + "step": 187780 + }, + { + "epoch": 9.32700903943578, + "grad_norm": 0.2373046875, + "learning_rate": 5.384325022350254e-05, + "loss": 0.5057, + "step": 187790 + }, + { + "epoch": 9.3275057117314, + "grad_norm": 0.2021484375, + "learning_rate": 5.3803516439852986e-05, + "loss": 0.4697, + "step": 187800 + }, + { + "epoch": 9.32800238402702, + "grad_norm": 0.201171875, + "learning_rate": 5.3763782656203444e-05, + "loss": 0.4986, + "step": 187810 + }, + { + "epoch": 9.328499056322638, + "grad_norm": 0.1923828125, + "learning_rate": 5.372404887255389e-05, + "loss": 0.485, + "step": 187820 + }, + { + "epoch": 9.328995728618258, + "grad_norm": 0.18359375, + "learning_rate": 5.3684315088904346e-05, + "loss": 0.4799, + "step": 187830 + }, + { + "epoch": 9.329492400913876, + "grad_norm": 0.2294921875, + "learning_rate": 5.364458130525479e-05, + "loss": 0.5037, + "step": 187840 + }, + { + "epoch": 9.329989073209497, + "grad_norm": 0.1943359375, + "learning_rate": 5.360484752160525e-05, + "loss": 0.4841, + "step": 187850 + }, + { + "epoch": 9.330485745505115, + "grad_norm": 0.2099609375, + "learning_rate": 5.35651137379557e-05, + "loss": 0.4872, + "step": 187860 + }, + { + "epoch": 9.330982417800735, + "grad_norm": 0.2021484375, + "learning_rate": 5.3525379954306157e-05, + "loss": 0.4738, + "step": 187870 + }, + { + "epoch": 9.331479090096355, + "grad_norm": 0.2119140625, + "learning_rate": 5.34856461706566e-05, + "loss": 0.5264, + "step": 187880 + }, + { + "epoch": 9.331975762391973, + "grad_norm": 0.1982421875, + "learning_rate": 5.344591238700706e-05, + "loss": 0.4613, + "step": 187890 + }, + { + "epoch": 9.332472434687594, + "grad_norm": 0.1865234375, + "learning_rate": 5.34061786033575e-05, + "loss": 0.4727, + "step": 187900 + }, + { + "epoch": 9.332969106983212, + "grad_norm": 0.2041015625, + "learning_rate": 5.336644481970796e-05, + "loss": 0.5011, + "step": 187910 + }, + { + "epoch": 9.333465779278832, + "grad_norm": 0.1943359375, + "learning_rate": 5.3326711036058405e-05, + "loss": 0.4775, + "step": 187920 + }, + { + "epoch": 9.33396245157445, + "grad_norm": 0.203125, + "learning_rate": 5.328697725240886e-05, + "loss": 0.4799, + "step": 187930 + }, + { + "epoch": 9.33445912387007, + "grad_norm": 0.197265625, + "learning_rate": 5.3247243468759314e-05, + "loss": 0.4657, + "step": 187940 + }, + { + "epoch": 9.33495579616569, + "grad_norm": 0.2109375, + "learning_rate": 5.320750968510977e-05, + "loss": 0.5079, + "step": 187950 + }, + { + "epoch": 9.335452468461309, + "grad_norm": 0.201171875, + "learning_rate": 5.3167775901460216e-05, + "loss": 0.5437, + "step": 187960 + }, + { + "epoch": 9.335949140756929, + "grad_norm": 0.1875, + "learning_rate": 5.3128042117810674e-05, + "loss": 0.4687, + "step": 187970 + }, + { + "epoch": 9.336445813052547, + "grad_norm": 0.212890625, + "learning_rate": 5.308830833416112e-05, + "loss": 0.4709, + "step": 187980 + }, + { + "epoch": 9.336942485348168, + "grad_norm": 0.224609375, + "learning_rate": 5.3048574550511576e-05, + "loss": 0.4948, + "step": 187990 + }, + { + "epoch": 9.337439157643786, + "grad_norm": 0.19921875, + "learning_rate": 5.300884076686202e-05, + "loss": 0.4843, + "step": 188000 + }, + { + "epoch": 9.337935829939406, + "grad_norm": 0.1875, + "learning_rate": 5.296910698321248e-05, + "loss": 0.4385, + "step": 188010 + }, + { + "epoch": 9.338432502235026, + "grad_norm": 0.2158203125, + "learning_rate": 5.292937319956293e-05, + "loss": 0.4862, + "step": 188020 + }, + { + "epoch": 9.338929174530644, + "grad_norm": 0.2109375, + "learning_rate": 5.2889639415913387e-05, + "loss": 0.4811, + "step": 188030 + }, + { + "epoch": 9.339425846826265, + "grad_norm": 0.189453125, + "learning_rate": 5.284990563226383e-05, + "loss": 0.4694, + "step": 188040 + }, + { + "epoch": 9.339922519121883, + "grad_norm": 0.2021484375, + "learning_rate": 5.281017184861429e-05, + "loss": 0.5002, + "step": 188050 + }, + { + "epoch": 9.340419191417503, + "grad_norm": 0.2333984375, + "learning_rate": 5.277043806496473e-05, + "loss": 0.4877, + "step": 188060 + }, + { + "epoch": 9.340915863713121, + "grad_norm": 0.1943359375, + "learning_rate": 5.273070428131519e-05, + "loss": 0.4835, + "step": 188070 + }, + { + "epoch": 9.341412536008741, + "grad_norm": 0.2099609375, + "learning_rate": 5.269097049766565e-05, + "loss": 0.5024, + "step": 188080 + }, + { + "epoch": 9.341909208304362, + "grad_norm": 0.203125, + "learning_rate": 5.265123671401609e-05, + "loss": 0.4623, + "step": 188090 + }, + { + "epoch": 9.34240588059998, + "grad_norm": 0.228515625, + "learning_rate": 5.261150293036655e-05, + "loss": 0.5086, + "step": 188100 + }, + { + "epoch": 9.3429025528956, + "grad_norm": 0.2001953125, + "learning_rate": 5.2571769146716995e-05, + "loss": 0.4834, + "step": 188110 + }, + { + "epoch": 9.343399225191218, + "grad_norm": 0.1923828125, + "learning_rate": 5.253203536306745e-05, + "loss": 0.4931, + "step": 188120 + }, + { + "epoch": 9.343895897486838, + "grad_norm": 0.201171875, + "learning_rate": 5.2492301579417904e-05, + "loss": 0.4896, + "step": 188130 + }, + { + "epoch": 9.344392569782457, + "grad_norm": 0.1953125, + "learning_rate": 5.245256779576836e-05, + "loss": 0.4814, + "step": 188140 + }, + { + "epoch": 9.344889242078077, + "grad_norm": 0.21484375, + "learning_rate": 5.2412834012118806e-05, + "loss": 0.4714, + "step": 188150 + }, + { + "epoch": 9.345385914373697, + "grad_norm": 0.1884765625, + "learning_rate": 5.2373100228469263e-05, + "loss": 0.4631, + "step": 188160 + }, + { + "epoch": 9.345882586669315, + "grad_norm": 0.21484375, + "learning_rate": 5.233336644481971e-05, + "loss": 0.4694, + "step": 188170 + }, + { + "epoch": 9.346379258964935, + "grad_norm": 0.2021484375, + "learning_rate": 5.2293632661170166e-05, + "loss": 0.5235, + "step": 188180 + }, + { + "epoch": 9.346875931260554, + "grad_norm": 0.2216796875, + "learning_rate": 5.225389887752061e-05, + "loss": 0.4933, + "step": 188190 + }, + { + "epoch": 9.347372603556174, + "grad_norm": 0.2109375, + "learning_rate": 5.221416509387107e-05, + "loss": 0.4818, + "step": 188200 + }, + { + "epoch": 9.347869275851792, + "grad_norm": 0.197265625, + "learning_rate": 5.217443131022152e-05, + "loss": 0.4755, + "step": 188210 + }, + { + "epoch": 9.348365948147412, + "grad_norm": 0.1982421875, + "learning_rate": 5.2134697526571976e-05, + "loss": 0.5092, + "step": 188220 + }, + { + "epoch": 9.348862620443033, + "grad_norm": 0.1748046875, + "learning_rate": 5.209496374292242e-05, + "loss": 0.5006, + "step": 188230 + }, + { + "epoch": 9.34935929273865, + "grad_norm": 0.1943359375, + "learning_rate": 5.205522995927288e-05, + "loss": 0.4579, + "step": 188240 + }, + { + "epoch": 9.349855965034271, + "grad_norm": 0.1943359375, + "learning_rate": 5.201549617562332e-05, + "loss": 0.4728, + "step": 188250 + }, + { + "epoch": 9.35035263732989, + "grad_norm": 0.1865234375, + "learning_rate": 5.197576239197378e-05, + "loss": 0.4806, + "step": 188260 + }, + { + "epoch": 9.35084930962551, + "grad_norm": 0.1943359375, + "learning_rate": 5.1936028608324225e-05, + "loss": 0.4834, + "step": 188270 + }, + { + "epoch": 9.351345981921128, + "grad_norm": 0.1904296875, + "learning_rate": 5.189629482467468e-05, + "loss": 0.4668, + "step": 188280 + }, + { + "epoch": 9.351842654216748, + "grad_norm": 0.1953125, + "learning_rate": 5.1856561041025134e-05, + "loss": 0.4714, + "step": 188290 + }, + { + "epoch": 9.352339326512368, + "grad_norm": 0.232421875, + "learning_rate": 5.181682725737559e-05, + "loss": 0.4732, + "step": 188300 + }, + { + "epoch": 9.352835998807986, + "grad_norm": 0.216796875, + "learning_rate": 5.1777093473726036e-05, + "loss": 0.5134, + "step": 188310 + }, + { + "epoch": 9.353332671103606, + "grad_norm": 0.22265625, + "learning_rate": 5.1737359690076493e-05, + "loss": 0.4875, + "step": 188320 + }, + { + "epoch": 9.353829343399225, + "grad_norm": 0.22265625, + "learning_rate": 5.169762590642694e-05, + "loss": 0.513, + "step": 188330 + }, + { + "epoch": 9.354326015694845, + "grad_norm": 0.2314453125, + "learning_rate": 5.1657892122777396e-05, + "loss": 0.4639, + "step": 188340 + }, + { + "epoch": 9.354822687990463, + "grad_norm": 0.193359375, + "learning_rate": 5.161815833912784e-05, + "loss": 0.5193, + "step": 188350 + }, + { + "epoch": 9.355319360286083, + "grad_norm": 0.20703125, + "learning_rate": 5.15784245554783e-05, + "loss": 0.4933, + "step": 188360 + }, + { + "epoch": 9.355816032581703, + "grad_norm": 0.197265625, + "learning_rate": 5.153869077182875e-05, + "loss": 0.4735, + "step": 188370 + }, + { + "epoch": 9.356312704877322, + "grad_norm": 0.212890625, + "learning_rate": 5.1498956988179206e-05, + "loss": 0.4596, + "step": 188380 + }, + { + "epoch": 9.356809377172942, + "grad_norm": 0.1904296875, + "learning_rate": 5.145922320452965e-05, + "loss": 0.4963, + "step": 188390 + }, + { + "epoch": 9.35730604946856, + "grad_norm": 0.189453125, + "learning_rate": 5.141948942088011e-05, + "loss": 0.5084, + "step": 188400 + }, + { + "epoch": 9.35780272176418, + "grad_norm": 0.19921875, + "learning_rate": 5.137975563723055e-05, + "loss": 0.4659, + "step": 188410 + }, + { + "epoch": 9.358299394059799, + "grad_norm": 0.1923828125, + "learning_rate": 5.134002185358101e-05, + "loss": 0.4687, + "step": 188420 + }, + { + "epoch": 9.358796066355419, + "grad_norm": 0.205078125, + "learning_rate": 5.1300288069931455e-05, + "loss": 0.4858, + "step": 188430 + }, + { + "epoch": 9.359292738651039, + "grad_norm": 0.2080078125, + "learning_rate": 5.126055428628191e-05, + "loss": 0.5013, + "step": 188440 + }, + { + "epoch": 9.359789410946657, + "grad_norm": 0.2373046875, + "learning_rate": 5.1220820502632364e-05, + "loss": 0.5071, + "step": 188450 + }, + { + "epoch": 9.360286083242277, + "grad_norm": 0.2109375, + "learning_rate": 5.118108671898282e-05, + "loss": 0.5256, + "step": 188460 + }, + { + "epoch": 9.360782755537896, + "grad_norm": 0.1962890625, + "learning_rate": 5.1141352935333266e-05, + "loss": 0.49, + "step": 188470 + }, + { + "epoch": 9.361279427833516, + "grad_norm": 0.1923828125, + "learning_rate": 5.1101619151683723e-05, + "loss": 0.4787, + "step": 188480 + }, + { + "epoch": 9.361776100129134, + "grad_norm": 0.2314453125, + "learning_rate": 5.106188536803417e-05, + "loss": 0.4634, + "step": 188490 + }, + { + "epoch": 9.362272772424754, + "grad_norm": 0.21484375, + "learning_rate": 5.1022151584384626e-05, + "loss": 0.4745, + "step": 188500 + }, + { + "epoch": 9.362769444720373, + "grad_norm": 0.2041015625, + "learning_rate": 5.098241780073508e-05, + "loss": 0.4861, + "step": 188510 + }, + { + "epoch": 9.363266117015993, + "grad_norm": 0.2021484375, + "learning_rate": 5.094268401708553e-05, + "loss": 0.5009, + "step": 188520 + }, + { + "epoch": 9.363762789311613, + "grad_norm": 0.2392578125, + "learning_rate": 5.0902950233435985e-05, + "loss": 0.5162, + "step": 188530 + }, + { + "epoch": 9.364259461607231, + "grad_norm": 0.212890625, + "learning_rate": 5.086321644978643e-05, + "loss": 0.4731, + "step": 188540 + }, + { + "epoch": 9.364756133902851, + "grad_norm": 0.216796875, + "learning_rate": 5.082348266613689e-05, + "loss": 0.4853, + "step": 188550 + }, + { + "epoch": 9.36525280619847, + "grad_norm": 0.2080078125, + "learning_rate": 5.078374888248734e-05, + "loss": 0.4853, + "step": 188560 + }, + { + "epoch": 9.36574947849409, + "grad_norm": 0.1982421875, + "learning_rate": 5.0744015098837796e-05, + "loss": 0.519, + "step": 188570 + }, + { + "epoch": 9.366246150789708, + "grad_norm": 0.193359375, + "learning_rate": 5.070428131518824e-05, + "loss": 0.4911, + "step": 188580 + }, + { + "epoch": 9.366742823085328, + "grad_norm": 0.234375, + "learning_rate": 5.06645475315387e-05, + "loss": 0.4996, + "step": 188590 + }, + { + "epoch": 9.367239495380948, + "grad_norm": 0.1923828125, + "learning_rate": 5.062481374788914e-05, + "loss": 0.4864, + "step": 188600 + }, + { + "epoch": 9.367736167676567, + "grad_norm": 0.203125, + "learning_rate": 5.05850799642396e-05, + "loss": 0.4807, + "step": 188610 + }, + { + "epoch": 9.368232839972187, + "grad_norm": 0.1845703125, + "learning_rate": 5.0545346180590045e-05, + "loss": 0.4658, + "step": 188620 + }, + { + "epoch": 9.368729512267805, + "grad_norm": 0.19921875, + "learning_rate": 5.05056123969405e-05, + "loss": 0.4984, + "step": 188630 + }, + { + "epoch": 9.369226184563425, + "grad_norm": 0.212890625, + "learning_rate": 5.0465878613290953e-05, + "loss": 0.4678, + "step": 188640 + }, + { + "epoch": 9.369722856859044, + "grad_norm": 0.1943359375, + "learning_rate": 5.042614482964141e-05, + "loss": 0.4779, + "step": 188650 + }, + { + "epoch": 9.370219529154664, + "grad_norm": 0.185546875, + "learning_rate": 5.0386411045991855e-05, + "loss": 0.4985, + "step": 188660 + }, + { + "epoch": 9.370716201450284, + "grad_norm": 0.2109375, + "learning_rate": 5.034667726234231e-05, + "loss": 0.4886, + "step": 188670 + }, + { + "epoch": 9.371212873745902, + "grad_norm": 0.212890625, + "learning_rate": 5.030694347869276e-05, + "loss": 0.5163, + "step": 188680 + }, + { + "epoch": 9.371709546041522, + "grad_norm": 0.197265625, + "learning_rate": 5.0267209695043215e-05, + "loss": 0.4977, + "step": 188690 + }, + { + "epoch": 9.37220621833714, + "grad_norm": 0.2431640625, + "learning_rate": 5.022747591139366e-05, + "loss": 0.5211, + "step": 188700 + }, + { + "epoch": 9.37270289063276, + "grad_norm": 0.1953125, + "learning_rate": 5.018774212774412e-05, + "loss": 0.4903, + "step": 188710 + }, + { + "epoch": 9.373199562928379, + "grad_norm": 0.1875, + "learning_rate": 5.014800834409457e-05, + "loss": 0.4677, + "step": 188720 + }, + { + "epoch": 9.373696235224, + "grad_norm": 0.1923828125, + "learning_rate": 5.0108274560445026e-05, + "loss": 0.4746, + "step": 188730 + }, + { + "epoch": 9.37419290751962, + "grad_norm": 0.193359375, + "learning_rate": 5.006854077679547e-05, + "loss": 0.4586, + "step": 188740 + }, + { + "epoch": 9.374689579815238, + "grad_norm": 0.185546875, + "learning_rate": 5.002880699314593e-05, + "loss": 0.4692, + "step": 188750 + }, + { + "epoch": 9.375186252110858, + "grad_norm": 0.208984375, + "learning_rate": 4.998907320949638e-05, + "loss": 0.4933, + "step": 188760 + }, + { + "epoch": 9.375682924406476, + "grad_norm": 0.25, + "learning_rate": 4.994933942584683e-05, + "loss": 0.493, + "step": 188770 + }, + { + "epoch": 9.376179596702096, + "grad_norm": 0.19921875, + "learning_rate": 4.990960564219728e-05, + "loss": 0.4864, + "step": 188780 + }, + { + "epoch": 9.376676268997715, + "grad_norm": 0.21484375, + "learning_rate": 4.986987185854773e-05, + "loss": 0.4733, + "step": 188790 + }, + { + "epoch": 9.377172941293335, + "grad_norm": 0.2265625, + "learning_rate": 4.9830138074898183e-05, + "loss": 0.5183, + "step": 188800 + }, + { + "epoch": 9.377669613588955, + "grad_norm": 0.19921875, + "learning_rate": 4.979040429124864e-05, + "loss": 0.4848, + "step": 188810 + }, + { + "epoch": 9.378166285884573, + "grad_norm": 0.1904296875, + "learning_rate": 4.975067050759909e-05, + "loss": 0.5142, + "step": 188820 + }, + { + "epoch": 9.378662958180193, + "grad_norm": 0.21875, + "learning_rate": 4.971093672394954e-05, + "loss": 0.4965, + "step": 188830 + }, + { + "epoch": 9.379159630475812, + "grad_norm": 0.2099609375, + "learning_rate": 4.9671202940299994e-05, + "loss": 0.4981, + "step": 188840 + }, + { + "epoch": 9.379656302771432, + "grad_norm": 0.17578125, + "learning_rate": 4.9631469156650445e-05, + "loss": 0.4926, + "step": 188850 + }, + { + "epoch": 9.38015297506705, + "grad_norm": 0.197265625, + "learning_rate": 4.9591735373000896e-05, + "loss": 0.5015, + "step": 188860 + }, + { + "epoch": 9.38064964736267, + "grad_norm": 0.189453125, + "learning_rate": 4.955200158935135e-05, + "loss": 0.5019, + "step": 188870 + }, + { + "epoch": 9.38114631965829, + "grad_norm": 0.201171875, + "learning_rate": 4.95122678057018e-05, + "loss": 0.5043, + "step": 188880 + }, + { + "epoch": 9.381642991953909, + "grad_norm": 0.1982421875, + "learning_rate": 4.947253402205225e-05, + "loss": 0.4871, + "step": 188890 + }, + { + "epoch": 9.382139664249529, + "grad_norm": 0.2001953125, + "learning_rate": 4.943280023840271e-05, + "loss": 0.499, + "step": 188900 + }, + { + "epoch": 9.382636336545147, + "grad_norm": 0.1796875, + "learning_rate": 4.939306645475316e-05, + "loss": 0.4528, + "step": 188910 + }, + { + "epoch": 9.383133008840767, + "grad_norm": 0.1865234375, + "learning_rate": 4.935333267110361e-05, + "loss": 0.483, + "step": 188920 + }, + { + "epoch": 9.383629681136386, + "grad_norm": 0.216796875, + "learning_rate": 4.931359888745406e-05, + "loss": 0.4871, + "step": 188930 + }, + { + "epoch": 9.384126353432006, + "grad_norm": 0.1689453125, + "learning_rate": 4.927386510380451e-05, + "loss": 0.469, + "step": 188940 + }, + { + "epoch": 9.384623025727626, + "grad_norm": 0.2109375, + "learning_rate": 4.923413132015496e-05, + "loss": 0.4758, + "step": 188950 + }, + { + "epoch": 9.385119698023244, + "grad_norm": 0.248046875, + "learning_rate": 4.9194397536505413e-05, + "loss": 0.4691, + "step": 188960 + }, + { + "epoch": 9.385616370318864, + "grad_norm": 0.2373046875, + "learning_rate": 4.9154663752855864e-05, + "loss": 0.4728, + "step": 188970 + }, + { + "epoch": 9.386113042614483, + "grad_norm": 0.2216796875, + "learning_rate": 4.911492996920632e-05, + "loss": 0.4822, + "step": 188980 + }, + { + "epoch": 9.386609714910103, + "grad_norm": 0.193359375, + "learning_rate": 4.907519618555677e-05, + "loss": 0.4673, + "step": 188990 + }, + { + "epoch": 9.387106387205721, + "grad_norm": 0.1806640625, + "learning_rate": 4.9035462401907224e-05, + "loss": 0.4952, + "step": 189000 + }, + { + "epoch": 9.387603059501341, + "grad_norm": 0.19140625, + "learning_rate": 4.8995728618257675e-05, + "loss": 0.4755, + "step": 189010 + }, + { + "epoch": 9.388099731796961, + "grad_norm": 0.2021484375, + "learning_rate": 4.8955994834608126e-05, + "loss": 0.5159, + "step": 189020 + }, + { + "epoch": 9.38859640409258, + "grad_norm": 0.205078125, + "learning_rate": 4.891626105095858e-05, + "loss": 0.4792, + "step": 189030 + }, + { + "epoch": 9.3890930763882, + "grad_norm": 0.201171875, + "learning_rate": 4.887652726730903e-05, + "loss": 0.4912, + "step": 189040 + }, + { + "epoch": 9.389589748683818, + "grad_norm": 0.2216796875, + "learning_rate": 4.883679348365948e-05, + "loss": 0.4851, + "step": 189050 + }, + { + "epoch": 9.390086420979438, + "grad_norm": 0.265625, + "learning_rate": 4.879705970000994e-05, + "loss": 0.5188, + "step": 189060 + }, + { + "epoch": 9.390583093275056, + "grad_norm": 0.1845703125, + "learning_rate": 4.875732591636039e-05, + "loss": 0.4958, + "step": 189070 + }, + { + "epoch": 9.391079765570677, + "grad_norm": 0.2255859375, + "learning_rate": 4.871759213271084e-05, + "loss": 0.4566, + "step": 189080 + }, + { + "epoch": 9.391576437866295, + "grad_norm": 0.2041015625, + "learning_rate": 4.867785834906129e-05, + "loss": 0.485, + "step": 189090 + }, + { + "epoch": 9.392073110161915, + "grad_norm": 0.208984375, + "learning_rate": 4.863812456541174e-05, + "loss": 0.4859, + "step": 189100 + }, + { + "epoch": 9.392569782457535, + "grad_norm": 0.203125, + "learning_rate": 4.859839078176219e-05, + "loss": 0.4685, + "step": 189110 + }, + { + "epoch": 9.393066454753153, + "grad_norm": 0.203125, + "learning_rate": 4.855865699811264e-05, + "loss": 0.4739, + "step": 189120 + }, + { + "epoch": 9.393563127048774, + "grad_norm": 0.1875, + "learning_rate": 4.85189232144631e-05, + "loss": 0.464, + "step": 189130 + }, + { + "epoch": 9.394059799344392, + "grad_norm": 0.220703125, + "learning_rate": 4.847918943081355e-05, + "loss": 0.5047, + "step": 189140 + }, + { + "epoch": 9.394556471640012, + "grad_norm": 0.2421875, + "learning_rate": 4.8439455647164e-05, + "loss": 0.5102, + "step": 189150 + }, + { + "epoch": 9.39505314393563, + "grad_norm": 0.197265625, + "learning_rate": 4.839972186351446e-05, + "loss": 0.4791, + "step": 189160 + }, + { + "epoch": 9.39554981623125, + "grad_norm": 0.2216796875, + "learning_rate": 4.835998807986491e-05, + "loss": 0.4723, + "step": 189170 + }, + { + "epoch": 9.39604648852687, + "grad_norm": 0.1875, + "learning_rate": 4.832025429621536e-05, + "loss": 0.5001, + "step": 189180 + }, + { + "epoch": 9.396543160822489, + "grad_norm": 0.208984375, + "learning_rate": 4.8280520512565814e-05, + "loss": 0.4919, + "step": 189190 + }, + { + "epoch": 9.397039833118109, + "grad_norm": 0.2421875, + "learning_rate": 4.8240786728916265e-05, + "loss": 0.531, + "step": 189200 + }, + { + "epoch": 9.397536505413727, + "grad_norm": 0.1943359375, + "learning_rate": 4.8201052945266716e-05, + "loss": 0.4866, + "step": 189210 + }, + { + "epoch": 9.398033177709348, + "grad_norm": 0.2236328125, + "learning_rate": 4.816131916161717e-05, + "loss": 0.4984, + "step": 189220 + }, + { + "epoch": 9.398529850004966, + "grad_norm": 0.2099609375, + "learning_rate": 4.812158537796762e-05, + "loss": 0.466, + "step": 189230 + }, + { + "epoch": 9.399026522300586, + "grad_norm": 0.2041015625, + "learning_rate": 4.808185159431807e-05, + "loss": 0.5262, + "step": 189240 + }, + { + "epoch": 9.399523194596206, + "grad_norm": 0.185546875, + "learning_rate": 4.804211781066853e-05, + "loss": 0.4608, + "step": 189250 + }, + { + "epoch": 9.400019866891824, + "grad_norm": 0.2109375, + "learning_rate": 4.800238402701898e-05, + "loss": 0.4591, + "step": 189260 + }, + { + "epoch": 9.400516539187445, + "grad_norm": 0.208984375, + "learning_rate": 4.796265024336943e-05, + "loss": 0.4883, + "step": 189270 + }, + { + "epoch": 9.401013211483063, + "grad_norm": 0.2275390625, + "learning_rate": 4.792291645971988e-05, + "loss": 0.4759, + "step": 189280 + }, + { + "epoch": 9.401509883778683, + "grad_norm": 0.201171875, + "learning_rate": 4.788318267607033e-05, + "loss": 0.4968, + "step": 189290 + }, + { + "epoch": 9.402006556074301, + "grad_norm": 0.21875, + "learning_rate": 4.784344889242078e-05, + "loss": 0.4804, + "step": 189300 + }, + { + "epoch": 9.402503228369921, + "grad_norm": 0.2099609375, + "learning_rate": 4.780371510877123e-05, + "loss": 0.4754, + "step": 189310 + }, + { + "epoch": 9.402999900665542, + "grad_norm": 0.212890625, + "learning_rate": 4.7763981325121684e-05, + "loss": 0.4695, + "step": 189320 + }, + { + "epoch": 9.40349657296116, + "grad_norm": 0.205078125, + "learning_rate": 4.772424754147214e-05, + "loss": 0.4796, + "step": 189330 + }, + { + "epoch": 9.40399324525678, + "grad_norm": 0.208984375, + "learning_rate": 4.768451375782259e-05, + "loss": 0.4904, + "step": 189340 + }, + { + "epoch": 9.404489917552398, + "grad_norm": 0.1962890625, + "learning_rate": 4.7644779974173044e-05, + "loss": 0.4853, + "step": 189350 + }, + { + "epoch": 9.404986589848018, + "grad_norm": 0.26171875, + "learning_rate": 4.7605046190523495e-05, + "loss": 0.513, + "step": 189360 + }, + { + "epoch": 9.405483262143637, + "grad_norm": 0.2138671875, + "learning_rate": 4.7565312406873946e-05, + "loss": 0.4981, + "step": 189370 + }, + { + "epoch": 9.405979934439257, + "grad_norm": 0.19921875, + "learning_rate": 4.75255786232244e-05, + "loss": 0.4926, + "step": 189380 + }, + { + "epoch": 9.406476606734877, + "grad_norm": 0.216796875, + "learning_rate": 4.748584483957485e-05, + "loss": 0.4727, + "step": 189390 + }, + { + "epoch": 9.406973279030495, + "grad_norm": 0.193359375, + "learning_rate": 4.74461110559253e-05, + "loss": 0.4861, + "step": 189400 + }, + { + "epoch": 9.407469951326116, + "grad_norm": 0.216796875, + "learning_rate": 4.740637727227576e-05, + "loss": 0.5027, + "step": 189410 + }, + { + "epoch": 9.407966623621734, + "grad_norm": 0.189453125, + "learning_rate": 4.736664348862621e-05, + "loss": 0.5004, + "step": 189420 + }, + { + "epoch": 9.408463295917354, + "grad_norm": 0.1865234375, + "learning_rate": 4.732690970497666e-05, + "loss": 0.5357, + "step": 189430 + }, + { + "epoch": 9.408959968212972, + "grad_norm": 0.19140625, + "learning_rate": 4.728717592132711e-05, + "loss": 0.4972, + "step": 189440 + }, + { + "epoch": 9.409456640508592, + "grad_norm": 0.2275390625, + "learning_rate": 4.724744213767756e-05, + "loss": 0.5064, + "step": 189450 + }, + { + "epoch": 9.409953312804213, + "grad_norm": 0.19921875, + "learning_rate": 4.720770835402801e-05, + "loss": 0.4709, + "step": 189460 + }, + { + "epoch": 9.41044998509983, + "grad_norm": 0.1767578125, + "learning_rate": 4.716797457037846e-05, + "loss": 0.4683, + "step": 189470 + }, + { + "epoch": 9.410946657395451, + "grad_norm": 0.271484375, + "learning_rate": 4.7128240786728914e-05, + "loss": 0.5011, + "step": 189480 + }, + { + "epoch": 9.41144332969107, + "grad_norm": 0.2109375, + "learning_rate": 4.7088507003079365e-05, + "loss": 0.5058, + "step": 189490 + }, + { + "epoch": 9.41194000198669, + "grad_norm": 0.208984375, + "learning_rate": 4.704877321942982e-05, + "loss": 0.5098, + "step": 189500 + }, + { + "epoch": 9.412436674282308, + "grad_norm": 0.22265625, + "learning_rate": 4.7009039435780274e-05, + "loss": 0.4796, + "step": 189510 + }, + { + "epoch": 9.412933346577928, + "grad_norm": 0.224609375, + "learning_rate": 4.6969305652130725e-05, + "loss": 0.5047, + "step": 189520 + }, + { + "epoch": 9.413430018873548, + "grad_norm": 0.1884765625, + "learning_rate": 4.6929571868481176e-05, + "loss": 0.4659, + "step": 189530 + }, + { + "epoch": 9.413926691169166, + "grad_norm": 0.197265625, + "learning_rate": 4.688983808483163e-05, + "loss": 0.471, + "step": 189540 + }, + { + "epoch": 9.414423363464786, + "grad_norm": 0.1875, + "learning_rate": 4.6850104301182085e-05, + "loss": 0.4838, + "step": 189550 + }, + { + "epoch": 9.414920035760405, + "grad_norm": 0.2177734375, + "learning_rate": 4.6810370517532536e-05, + "loss": 0.4829, + "step": 189560 + }, + { + "epoch": 9.415416708056025, + "grad_norm": 0.2080078125, + "learning_rate": 4.677063673388299e-05, + "loss": 0.4788, + "step": 189570 + }, + { + "epoch": 9.415913380351643, + "grad_norm": 0.2080078125, + "learning_rate": 4.673090295023344e-05, + "loss": 0.4913, + "step": 189580 + }, + { + "epoch": 9.416410052647263, + "grad_norm": 0.236328125, + "learning_rate": 4.669116916658389e-05, + "loss": 0.5042, + "step": 189590 + }, + { + "epoch": 9.416906724942883, + "grad_norm": 0.1982421875, + "learning_rate": 4.665143538293435e-05, + "loss": 0.4967, + "step": 189600 + }, + { + "epoch": 9.417403397238502, + "grad_norm": 0.1923828125, + "learning_rate": 4.66117015992848e-05, + "loss": 0.4489, + "step": 189610 + }, + { + "epoch": 9.417900069534122, + "grad_norm": 0.216796875, + "learning_rate": 4.657196781563525e-05, + "loss": 0.5097, + "step": 189620 + }, + { + "epoch": 9.41839674182974, + "grad_norm": 0.22265625, + "learning_rate": 4.65322340319857e-05, + "loss": 0.4542, + "step": 189630 + }, + { + "epoch": 9.41889341412536, + "grad_norm": 0.2119140625, + "learning_rate": 4.649250024833615e-05, + "loss": 0.4861, + "step": 189640 + }, + { + "epoch": 9.419390086420979, + "grad_norm": 0.255859375, + "learning_rate": 4.64527664646866e-05, + "loss": 0.4727, + "step": 189650 + }, + { + "epoch": 9.419886758716599, + "grad_norm": 0.1826171875, + "learning_rate": 4.641303268103705e-05, + "loss": 0.4634, + "step": 189660 + }, + { + "epoch": 9.420383431012219, + "grad_norm": 0.19921875, + "learning_rate": 4.6373298897387504e-05, + "loss": 0.4895, + "step": 189670 + }, + { + "epoch": 9.420880103307837, + "grad_norm": 0.2119140625, + "learning_rate": 4.633356511373796e-05, + "loss": 0.5143, + "step": 189680 + }, + { + "epoch": 9.421376775603457, + "grad_norm": 0.193359375, + "learning_rate": 4.629383133008841e-05, + "loss": 0.4595, + "step": 189690 + }, + { + "epoch": 9.421873447899076, + "grad_norm": 0.193359375, + "learning_rate": 4.6254097546438864e-05, + "loss": 0.501, + "step": 189700 + }, + { + "epoch": 9.422370120194696, + "grad_norm": 0.2158203125, + "learning_rate": 4.6214363762789315e-05, + "loss": 0.4674, + "step": 189710 + }, + { + "epoch": 9.422866792490314, + "grad_norm": 0.2177734375, + "learning_rate": 4.6174629979139766e-05, + "loss": 0.5058, + "step": 189720 + }, + { + "epoch": 9.423363464785934, + "grad_norm": 0.224609375, + "learning_rate": 4.613489619549022e-05, + "loss": 0.4846, + "step": 189730 + }, + { + "epoch": 9.423860137081554, + "grad_norm": 0.1845703125, + "learning_rate": 4.609516241184067e-05, + "loss": 0.4785, + "step": 189740 + }, + { + "epoch": 9.424356809377173, + "grad_norm": 0.19921875, + "learning_rate": 4.605542862819112e-05, + "loss": 0.5165, + "step": 189750 + }, + { + "epoch": 9.424853481672793, + "grad_norm": 0.19921875, + "learning_rate": 4.601569484454158e-05, + "loss": 0.4946, + "step": 189760 + }, + { + "epoch": 9.425350153968411, + "grad_norm": 0.1982421875, + "learning_rate": 4.597596106089203e-05, + "loss": 0.5008, + "step": 189770 + }, + { + "epoch": 9.425846826264031, + "grad_norm": 0.1962890625, + "learning_rate": 4.593622727724248e-05, + "loss": 0.5056, + "step": 189780 + }, + { + "epoch": 9.42634349855965, + "grad_norm": 0.224609375, + "learning_rate": 4.589649349359293e-05, + "loss": 0.4819, + "step": 189790 + }, + { + "epoch": 9.42684017085527, + "grad_norm": 0.185546875, + "learning_rate": 4.585675970994338e-05, + "loss": 0.4559, + "step": 189800 + }, + { + "epoch": 9.42733684315089, + "grad_norm": 0.2109375, + "learning_rate": 4.581702592629383e-05, + "loss": 0.4852, + "step": 189810 + }, + { + "epoch": 9.427833515446508, + "grad_norm": 0.1923828125, + "learning_rate": 4.577729214264428e-05, + "loss": 0.4828, + "step": 189820 + }, + { + "epoch": 9.428330187742128, + "grad_norm": 0.2021484375, + "learning_rate": 4.5737558358994734e-05, + "loss": 0.5205, + "step": 189830 + }, + { + "epoch": 9.428826860037747, + "grad_norm": 0.1982421875, + "learning_rate": 4.5697824575345185e-05, + "loss": 0.4804, + "step": 189840 + }, + { + "epoch": 9.429323532333367, + "grad_norm": 0.201171875, + "learning_rate": 4.565809079169564e-05, + "loss": 0.5254, + "step": 189850 + }, + { + "epoch": 9.429820204628985, + "grad_norm": 0.18359375, + "learning_rate": 4.5618357008046094e-05, + "loss": 0.49, + "step": 189860 + }, + { + "epoch": 9.430316876924605, + "grad_norm": 0.1953125, + "learning_rate": 4.5578623224396545e-05, + "loss": 0.4915, + "step": 189870 + }, + { + "epoch": 9.430813549220225, + "grad_norm": 0.189453125, + "learning_rate": 4.5538889440746996e-05, + "loss": 0.4658, + "step": 189880 + }, + { + "epoch": 9.431310221515844, + "grad_norm": 0.2197265625, + "learning_rate": 4.549915565709745e-05, + "loss": 0.5017, + "step": 189890 + }, + { + "epoch": 9.431806893811464, + "grad_norm": 0.212890625, + "learning_rate": 4.54594218734479e-05, + "loss": 0.5033, + "step": 189900 + }, + { + "epoch": 9.432303566107082, + "grad_norm": 0.224609375, + "learning_rate": 4.541968808979835e-05, + "loss": 0.4766, + "step": 189910 + }, + { + "epoch": 9.432800238402702, + "grad_norm": 0.1826171875, + "learning_rate": 4.53799543061488e-05, + "loss": 0.4802, + "step": 189920 + }, + { + "epoch": 9.43329691069832, + "grad_norm": 0.203125, + "learning_rate": 4.534022052249926e-05, + "loss": 0.4642, + "step": 189930 + }, + { + "epoch": 9.43379358299394, + "grad_norm": 0.2197265625, + "learning_rate": 4.530048673884971e-05, + "loss": 0.4749, + "step": 189940 + }, + { + "epoch": 9.43429025528956, + "grad_norm": 0.2197265625, + "learning_rate": 4.526075295520016e-05, + "loss": 0.4763, + "step": 189950 + }, + { + "epoch": 9.43478692758518, + "grad_norm": 0.1953125, + "learning_rate": 4.522101917155061e-05, + "loss": 0.4624, + "step": 189960 + }, + { + "epoch": 9.4352835998808, + "grad_norm": 0.208984375, + "learning_rate": 4.518128538790107e-05, + "loss": 0.4999, + "step": 189970 + }, + { + "epoch": 9.435780272176418, + "grad_norm": 0.205078125, + "learning_rate": 4.514155160425152e-05, + "loss": 0.5081, + "step": 189980 + }, + { + "epoch": 9.436276944472038, + "grad_norm": 0.2080078125, + "learning_rate": 4.510181782060197e-05, + "loss": 0.4671, + "step": 189990 + }, + { + "epoch": 9.436773616767656, + "grad_norm": 0.267578125, + "learning_rate": 4.506208403695242e-05, + "loss": 0.4814, + "step": 190000 + }, + { + "epoch": 9.437270289063276, + "grad_norm": 0.2099609375, + "learning_rate": 4.502235025330287e-05, + "loss": 0.481, + "step": 190010 + }, + { + "epoch": 9.437766961358895, + "grad_norm": 0.1904296875, + "learning_rate": 4.4982616469653324e-05, + "loss": 0.4711, + "step": 190020 + }, + { + "epoch": 9.438263633654515, + "grad_norm": 0.1904296875, + "learning_rate": 4.494288268600378e-05, + "loss": 0.4816, + "step": 190030 + }, + { + "epoch": 9.438760305950135, + "grad_norm": 0.205078125, + "learning_rate": 4.490314890235423e-05, + "loss": 0.4984, + "step": 190040 + }, + { + "epoch": 9.439256978245753, + "grad_norm": 0.234375, + "learning_rate": 4.4863415118704684e-05, + "loss": 0.4892, + "step": 190050 + }, + { + "epoch": 9.439753650541373, + "grad_norm": 0.2021484375, + "learning_rate": 4.4823681335055135e-05, + "loss": 0.4802, + "step": 190060 + }, + { + "epoch": 9.440250322836992, + "grad_norm": 0.21484375, + "learning_rate": 4.4783947551405586e-05, + "loss": 0.4801, + "step": 190070 + }, + { + "epoch": 9.440746995132612, + "grad_norm": 0.2490234375, + "learning_rate": 4.474421376775604e-05, + "loss": 0.4821, + "step": 190080 + }, + { + "epoch": 9.44124366742823, + "grad_norm": 0.2421875, + "learning_rate": 4.470447998410649e-05, + "loss": 0.4888, + "step": 190090 + }, + { + "epoch": 9.44174033972385, + "grad_norm": 0.1923828125, + "learning_rate": 4.466474620045694e-05, + "loss": 0.4631, + "step": 190100 + }, + { + "epoch": 9.44223701201947, + "grad_norm": 0.2421875, + "learning_rate": 4.46250124168074e-05, + "loss": 0.4987, + "step": 190110 + }, + { + "epoch": 9.442733684315089, + "grad_norm": 0.2060546875, + "learning_rate": 4.458527863315785e-05, + "loss": 0.4833, + "step": 190120 + }, + { + "epoch": 9.443230356610709, + "grad_norm": 0.1953125, + "learning_rate": 4.45455448495083e-05, + "loss": 0.4911, + "step": 190130 + }, + { + "epoch": 9.443727028906327, + "grad_norm": 0.2197265625, + "learning_rate": 4.450581106585875e-05, + "loss": 0.4735, + "step": 190140 + }, + { + "epoch": 9.444223701201947, + "grad_norm": 0.19140625, + "learning_rate": 4.44660772822092e-05, + "loss": 0.4937, + "step": 190150 + }, + { + "epoch": 9.444720373497566, + "grad_norm": 0.189453125, + "learning_rate": 4.442634349855965e-05, + "loss": 0.47, + "step": 190160 + }, + { + "epoch": 9.445217045793186, + "grad_norm": 0.1884765625, + "learning_rate": 4.43866097149101e-05, + "loss": 0.4796, + "step": 190170 + }, + { + "epoch": 9.445713718088806, + "grad_norm": 0.181640625, + "learning_rate": 4.4346875931260554e-05, + "loss": 0.4912, + "step": 190180 + }, + { + "epoch": 9.446210390384424, + "grad_norm": 0.2021484375, + "learning_rate": 4.4307142147611005e-05, + "loss": 0.5073, + "step": 190190 + }, + { + "epoch": 9.446707062680044, + "grad_norm": 0.205078125, + "learning_rate": 4.426740836396146e-05, + "loss": 0.4368, + "step": 190200 + }, + { + "epoch": 9.447203734975663, + "grad_norm": 0.234375, + "learning_rate": 4.4227674580311914e-05, + "loss": 0.5063, + "step": 190210 + }, + { + "epoch": 9.447700407271283, + "grad_norm": 0.1953125, + "learning_rate": 4.4187940796662365e-05, + "loss": 0.4876, + "step": 190220 + }, + { + "epoch": 9.448197079566901, + "grad_norm": 0.1953125, + "learning_rate": 4.4148207013012816e-05, + "loss": 0.4991, + "step": 190230 + }, + { + "epoch": 9.448693751862521, + "grad_norm": 0.1953125, + "learning_rate": 4.410847322936327e-05, + "loss": 0.475, + "step": 190240 + }, + { + "epoch": 9.449190424158141, + "grad_norm": 0.2216796875, + "learning_rate": 4.406873944571372e-05, + "loss": 0.5007, + "step": 190250 + }, + { + "epoch": 9.44968709645376, + "grad_norm": 0.208984375, + "learning_rate": 4.402900566206417e-05, + "loss": 0.4957, + "step": 190260 + }, + { + "epoch": 9.45018376874938, + "grad_norm": 0.205078125, + "learning_rate": 4.398927187841462e-05, + "loss": 0.474, + "step": 190270 + }, + { + "epoch": 9.450680441044998, + "grad_norm": 0.2119140625, + "learning_rate": 4.394953809476508e-05, + "loss": 0.4743, + "step": 190280 + }, + { + "epoch": 9.451177113340618, + "grad_norm": 0.2109375, + "learning_rate": 4.390980431111553e-05, + "loss": 0.4712, + "step": 190290 + }, + { + "epoch": 9.451673785636237, + "grad_norm": 0.19921875, + "learning_rate": 4.387007052746598e-05, + "loss": 0.458, + "step": 190300 + }, + { + "epoch": 9.452170457931857, + "grad_norm": 0.19921875, + "learning_rate": 4.383033674381643e-05, + "loss": 0.4802, + "step": 190310 + }, + { + "epoch": 9.452667130227477, + "grad_norm": 0.2001953125, + "learning_rate": 4.379060296016688e-05, + "loss": 0.4896, + "step": 190320 + }, + { + "epoch": 9.453163802523095, + "grad_norm": 0.2158203125, + "learning_rate": 4.375086917651733e-05, + "loss": 0.4979, + "step": 190330 + }, + { + "epoch": 9.453660474818715, + "grad_norm": 0.2021484375, + "learning_rate": 4.3711135392867784e-05, + "loss": 0.5046, + "step": 190340 + }, + { + "epoch": 9.454157147114334, + "grad_norm": 0.216796875, + "learning_rate": 4.3671401609218235e-05, + "loss": 0.4758, + "step": 190350 + }, + { + "epoch": 9.454653819409954, + "grad_norm": 0.2294921875, + "learning_rate": 4.363166782556869e-05, + "loss": 0.4676, + "step": 190360 + }, + { + "epoch": 9.455150491705572, + "grad_norm": 0.2021484375, + "learning_rate": 4.3591934041919144e-05, + "loss": 0.4821, + "step": 190370 + }, + { + "epoch": 9.455647164001192, + "grad_norm": 0.21875, + "learning_rate": 4.3552200258269595e-05, + "loss": 0.5047, + "step": 190380 + }, + { + "epoch": 9.456143836296812, + "grad_norm": 0.2001953125, + "learning_rate": 4.351246647462005e-05, + "loss": 0.5165, + "step": 190390 + }, + { + "epoch": 9.45664050859243, + "grad_norm": 0.2041015625, + "learning_rate": 4.3472732690970504e-05, + "loss": 0.4672, + "step": 190400 + }, + { + "epoch": 9.45713718088805, + "grad_norm": 0.1953125, + "learning_rate": 4.3432998907320955e-05, + "loss": 0.472, + "step": 190410 + }, + { + "epoch": 9.457633853183669, + "grad_norm": 0.208984375, + "learning_rate": 4.3393265123671406e-05, + "loss": 0.4751, + "step": 190420 + }, + { + "epoch": 9.45813052547929, + "grad_norm": 0.185546875, + "learning_rate": 4.335353134002186e-05, + "loss": 0.4832, + "step": 190430 + }, + { + "epoch": 9.458627197774907, + "grad_norm": 0.228515625, + "learning_rate": 4.331379755637231e-05, + "loss": 0.5197, + "step": 190440 + }, + { + "epoch": 9.459123870070528, + "grad_norm": 0.21484375, + "learning_rate": 4.327406377272276e-05, + "loss": 0.5084, + "step": 190450 + }, + { + "epoch": 9.459620542366146, + "grad_norm": 0.1884765625, + "learning_rate": 4.3234329989073217e-05, + "loss": 0.4717, + "step": 190460 + }, + { + "epoch": 9.460117214661766, + "grad_norm": 0.203125, + "learning_rate": 4.319459620542367e-05, + "loss": 0.5032, + "step": 190470 + }, + { + "epoch": 9.460613886957386, + "grad_norm": 0.2119140625, + "learning_rate": 4.315486242177412e-05, + "loss": 0.4943, + "step": 190480 + }, + { + "epoch": 9.461110559253004, + "grad_norm": 0.240234375, + "learning_rate": 4.311512863812457e-05, + "loss": 0.4644, + "step": 190490 + }, + { + "epoch": 9.461607231548625, + "grad_norm": 0.203125, + "learning_rate": 4.307539485447502e-05, + "loss": 0.4937, + "step": 190500 + }, + { + "epoch": 9.462103903844243, + "grad_norm": 0.232421875, + "learning_rate": 4.303566107082547e-05, + "loss": 0.5034, + "step": 190510 + }, + { + "epoch": 9.462600576139863, + "grad_norm": 0.2041015625, + "learning_rate": 4.299592728717592e-05, + "loss": 0.4807, + "step": 190520 + }, + { + "epoch": 9.463097248435481, + "grad_norm": 0.189453125, + "learning_rate": 4.2956193503526374e-05, + "loss": 0.4897, + "step": 190530 + }, + { + "epoch": 9.463593920731102, + "grad_norm": 0.197265625, + "learning_rate": 4.291645971987683e-05, + "loss": 0.4738, + "step": 190540 + }, + { + "epoch": 9.464090593026722, + "grad_norm": 0.2099609375, + "learning_rate": 4.287672593622728e-05, + "loss": 0.5169, + "step": 190550 + }, + { + "epoch": 9.46458726532234, + "grad_norm": 0.205078125, + "learning_rate": 4.2836992152577734e-05, + "loss": 0.5095, + "step": 190560 + }, + { + "epoch": 9.46508393761796, + "grad_norm": 0.201171875, + "learning_rate": 4.2797258368928185e-05, + "loss": 0.4726, + "step": 190570 + }, + { + "epoch": 9.465580609913578, + "grad_norm": 0.1923828125, + "learning_rate": 4.2757524585278636e-05, + "loss": 0.4548, + "step": 190580 + }, + { + "epoch": 9.466077282209199, + "grad_norm": 0.216796875, + "learning_rate": 4.271779080162909e-05, + "loss": 0.481, + "step": 190590 + }, + { + "epoch": 9.466573954504817, + "grad_norm": 0.2099609375, + "learning_rate": 4.267805701797954e-05, + "loss": 0.5057, + "step": 190600 + }, + { + "epoch": 9.467070626800437, + "grad_norm": 0.19921875, + "learning_rate": 4.263832323432999e-05, + "loss": 0.4813, + "step": 190610 + }, + { + "epoch": 9.467567299096057, + "grad_norm": 0.201171875, + "learning_rate": 4.259858945068044e-05, + "loss": 0.5279, + "step": 190620 + }, + { + "epoch": 9.468063971391675, + "grad_norm": 0.193359375, + "learning_rate": 4.25588556670309e-05, + "loss": 0.4943, + "step": 190630 + }, + { + "epoch": 9.468560643687296, + "grad_norm": 0.19921875, + "learning_rate": 4.251912188338135e-05, + "loss": 0.4927, + "step": 190640 + }, + { + "epoch": 9.469057315982914, + "grad_norm": 0.2138671875, + "learning_rate": 4.24793880997318e-05, + "loss": 0.4796, + "step": 190650 + }, + { + "epoch": 9.469553988278534, + "grad_norm": 0.2099609375, + "learning_rate": 4.243965431608225e-05, + "loss": 0.4778, + "step": 190660 + }, + { + "epoch": 9.470050660574152, + "grad_norm": 0.2158203125, + "learning_rate": 4.23999205324327e-05, + "loss": 0.5134, + "step": 190670 + }, + { + "epoch": 9.470547332869772, + "grad_norm": 0.2255859375, + "learning_rate": 4.236018674878315e-05, + "loss": 0.4747, + "step": 190680 + }, + { + "epoch": 9.471044005165393, + "grad_norm": 0.1865234375, + "learning_rate": 4.2320452965133604e-05, + "loss": 0.5005, + "step": 190690 + }, + { + "epoch": 9.471540677461011, + "grad_norm": 0.2060546875, + "learning_rate": 4.2280719181484055e-05, + "loss": 0.4956, + "step": 190700 + }, + { + "epoch": 9.472037349756631, + "grad_norm": 0.228515625, + "learning_rate": 4.224098539783451e-05, + "loss": 0.5076, + "step": 190710 + }, + { + "epoch": 9.47253402205225, + "grad_norm": 0.189453125, + "learning_rate": 4.2201251614184964e-05, + "loss": 0.4921, + "step": 190720 + }, + { + "epoch": 9.47303069434787, + "grad_norm": 0.208984375, + "learning_rate": 4.2161517830535415e-05, + "loss": 0.4581, + "step": 190730 + }, + { + "epoch": 9.473527366643488, + "grad_norm": 0.201171875, + "learning_rate": 4.2121784046885866e-05, + "loss": 0.4879, + "step": 190740 + }, + { + "epoch": 9.474024038939108, + "grad_norm": 0.2041015625, + "learning_rate": 4.2082050263236317e-05, + "loss": 0.4743, + "step": 190750 + }, + { + "epoch": 9.474520711234728, + "grad_norm": 0.220703125, + "learning_rate": 4.204231647958677e-05, + "loss": 0.4943, + "step": 190760 + }, + { + "epoch": 9.475017383530346, + "grad_norm": 0.1943359375, + "learning_rate": 4.200258269593722e-05, + "loss": 0.4593, + "step": 190770 + }, + { + "epoch": 9.475514055825967, + "grad_norm": 0.189453125, + "learning_rate": 4.196284891228767e-05, + "loss": 0.4725, + "step": 190780 + }, + { + "epoch": 9.476010728121585, + "grad_norm": 0.1982421875, + "learning_rate": 4.192311512863812e-05, + "loss": 0.4773, + "step": 190790 + }, + { + "epoch": 9.476507400417205, + "grad_norm": 0.19921875, + "learning_rate": 4.188338134498858e-05, + "loss": 0.4791, + "step": 190800 + }, + { + "epoch": 9.477004072712823, + "grad_norm": 0.208984375, + "learning_rate": 4.184364756133903e-05, + "loss": 0.503, + "step": 190810 + }, + { + "epoch": 9.477500745008443, + "grad_norm": 0.2001953125, + "learning_rate": 4.180391377768949e-05, + "loss": 0.5181, + "step": 190820 + }, + { + "epoch": 9.477997417304064, + "grad_norm": 0.2119140625, + "learning_rate": 4.176417999403994e-05, + "loss": 0.465, + "step": 190830 + }, + { + "epoch": 9.478494089599682, + "grad_norm": 0.2109375, + "learning_rate": 4.172444621039039e-05, + "loss": 0.511, + "step": 190840 + }, + { + "epoch": 9.478990761895302, + "grad_norm": 0.1826171875, + "learning_rate": 4.168471242674084e-05, + "loss": 0.4744, + "step": 190850 + }, + { + "epoch": 9.47948743419092, + "grad_norm": 0.189453125, + "learning_rate": 4.164497864309129e-05, + "loss": 0.4672, + "step": 190860 + }, + { + "epoch": 9.47998410648654, + "grad_norm": 0.1904296875, + "learning_rate": 4.160524485944174e-05, + "loss": 0.472, + "step": 190870 + }, + { + "epoch": 9.480480778782159, + "grad_norm": 0.19921875, + "learning_rate": 4.1565511075792194e-05, + "loss": 0.4951, + "step": 190880 + }, + { + "epoch": 9.480977451077779, + "grad_norm": 0.23046875, + "learning_rate": 4.152577729214265e-05, + "loss": 0.5035, + "step": 190890 + }, + { + "epoch": 9.481474123373399, + "grad_norm": 0.2080078125, + "learning_rate": 4.14860435084931e-05, + "loss": 0.4604, + "step": 190900 + }, + { + "epoch": 9.481970795669017, + "grad_norm": 0.19140625, + "learning_rate": 4.144630972484355e-05, + "loss": 0.5188, + "step": 190910 + }, + { + "epoch": 9.482467467964637, + "grad_norm": 0.212890625, + "learning_rate": 4.1406575941194004e-05, + "loss": 0.4791, + "step": 190920 + }, + { + "epoch": 9.482964140260256, + "grad_norm": 0.1923828125, + "learning_rate": 4.1366842157544455e-05, + "loss": 0.4911, + "step": 190930 + }, + { + "epoch": 9.483460812555876, + "grad_norm": 0.2021484375, + "learning_rate": 4.1327108373894906e-05, + "loss": 0.4816, + "step": 190940 + }, + { + "epoch": 9.483957484851494, + "grad_norm": 0.2099609375, + "learning_rate": 4.128737459024536e-05, + "loss": 0.4844, + "step": 190950 + }, + { + "epoch": 9.484454157147114, + "grad_norm": 0.21484375, + "learning_rate": 4.124764080659581e-05, + "loss": 0.5126, + "step": 190960 + }, + { + "epoch": 9.484950829442734, + "grad_norm": 0.205078125, + "learning_rate": 4.120790702294626e-05, + "loss": 0.5173, + "step": 190970 + }, + { + "epoch": 9.485447501738353, + "grad_norm": 0.2099609375, + "learning_rate": 4.116817323929672e-05, + "loss": 0.4946, + "step": 190980 + }, + { + "epoch": 9.485944174033973, + "grad_norm": 0.2265625, + "learning_rate": 4.112843945564717e-05, + "loss": 0.5297, + "step": 190990 + }, + { + "epoch": 9.486440846329591, + "grad_norm": 0.201171875, + "learning_rate": 4.108870567199762e-05, + "loss": 0.4899, + "step": 191000 + }, + { + "epoch": 9.486937518625211, + "grad_norm": 0.2216796875, + "learning_rate": 4.104897188834807e-05, + "loss": 0.5052, + "step": 191010 + }, + { + "epoch": 9.48743419092083, + "grad_norm": 0.21484375, + "learning_rate": 4.100923810469852e-05, + "loss": 0.5071, + "step": 191020 + }, + { + "epoch": 9.48793086321645, + "grad_norm": 0.1865234375, + "learning_rate": 4.096950432104897e-05, + "loss": 0.4538, + "step": 191030 + }, + { + "epoch": 9.48842753551207, + "grad_norm": 0.2138671875, + "learning_rate": 4.0929770537399423e-05, + "loss": 0.4967, + "step": 191040 + }, + { + "epoch": 9.488924207807688, + "grad_norm": 0.1875, + "learning_rate": 4.0890036753749875e-05, + "loss": 0.5068, + "step": 191050 + }, + { + "epoch": 9.489420880103308, + "grad_norm": 0.2021484375, + "learning_rate": 4.085030297010033e-05, + "loss": 0.4787, + "step": 191060 + }, + { + "epoch": 9.489917552398927, + "grad_norm": 0.1923828125, + "learning_rate": 4.081056918645078e-05, + "loss": 0.4781, + "step": 191070 + }, + { + "epoch": 9.490414224694547, + "grad_norm": 0.2041015625, + "learning_rate": 4.0770835402801234e-05, + "loss": 0.5005, + "step": 191080 + }, + { + "epoch": 9.490910896990165, + "grad_norm": 0.205078125, + "learning_rate": 4.0731101619151685e-05, + "loss": 0.4583, + "step": 191090 + }, + { + "epoch": 9.491407569285785, + "grad_norm": 0.2060546875, + "learning_rate": 4.0691367835502136e-05, + "loss": 0.4993, + "step": 191100 + }, + { + "epoch": 9.491904241581405, + "grad_norm": 0.2060546875, + "learning_rate": 4.065163405185259e-05, + "loss": 0.4824, + "step": 191110 + }, + { + "epoch": 9.492400913877024, + "grad_norm": 0.19921875, + "learning_rate": 4.061190026820304e-05, + "loss": 0.4716, + "step": 191120 + }, + { + "epoch": 9.492897586172644, + "grad_norm": 0.2041015625, + "learning_rate": 4.057216648455349e-05, + "loss": 0.4972, + "step": 191130 + }, + { + "epoch": 9.493394258468262, + "grad_norm": 0.2021484375, + "learning_rate": 4.053243270090395e-05, + "loss": 0.5021, + "step": 191140 + }, + { + "epoch": 9.493890930763882, + "grad_norm": 0.205078125, + "learning_rate": 4.04926989172544e-05, + "loss": 0.5069, + "step": 191150 + }, + { + "epoch": 9.4943876030595, + "grad_norm": 0.234375, + "learning_rate": 4.045296513360485e-05, + "loss": 0.5028, + "step": 191160 + }, + { + "epoch": 9.49488427535512, + "grad_norm": 0.212890625, + "learning_rate": 4.04132313499553e-05, + "loss": 0.4892, + "step": 191170 + }, + { + "epoch": 9.495380947650741, + "grad_norm": 0.2021484375, + "learning_rate": 4.037349756630575e-05, + "loss": 0.5041, + "step": 191180 + }, + { + "epoch": 9.49587761994636, + "grad_norm": 0.2041015625, + "learning_rate": 4.03337637826562e-05, + "loss": 0.461, + "step": 191190 + }, + { + "epoch": 9.49637429224198, + "grad_norm": 0.1982421875, + "learning_rate": 4.0294029999006653e-05, + "loss": 0.5065, + "step": 191200 + }, + { + "epoch": 9.496870964537598, + "grad_norm": 0.2177734375, + "learning_rate": 4.0254296215357105e-05, + "loss": 0.4549, + "step": 191210 + }, + { + "epoch": 9.497367636833218, + "grad_norm": 0.2294921875, + "learning_rate": 4.0214562431707556e-05, + "loss": 0.5129, + "step": 191220 + }, + { + "epoch": 9.497864309128836, + "grad_norm": 0.2177734375, + "learning_rate": 4.017482864805801e-05, + "loss": 0.4957, + "step": 191230 + }, + { + "epoch": 9.498360981424456, + "grad_norm": 0.18359375, + "learning_rate": 4.013509486440847e-05, + "loss": 0.4678, + "step": 191240 + }, + { + "epoch": 9.498857653720076, + "grad_norm": 0.203125, + "learning_rate": 4.009536108075892e-05, + "loss": 0.4919, + "step": 191250 + }, + { + "epoch": 9.499354326015695, + "grad_norm": 0.1923828125, + "learning_rate": 4.005562729710937e-05, + "loss": 0.473, + "step": 191260 + }, + { + "epoch": 9.499850998311315, + "grad_norm": 0.1943359375, + "learning_rate": 4.0015893513459824e-05, + "loss": 0.4701, + "step": 191270 + }, + { + "epoch": 9.500347670606933, + "grad_norm": 0.2060546875, + "learning_rate": 3.9976159729810275e-05, + "loss": 0.5078, + "step": 191280 + }, + { + "epoch": 9.500844342902553, + "grad_norm": 0.2021484375, + "learning_rate": 3.9936425946160726e-05, + "loss": 0.4614, + "step": 191290 + }, + { + "epoch": 9.501341015198172, + "grad_norm": 0.2158203125, + "learning_rate": 3.989669216251118e-05, + "loss": 0.5163, + "step": 191300 + }, + { + "epoch": 9.501837687493792, + "grad_norm": 0.1982421875, + "learning_rate": 3.985695837886163e-05, + "loss": 0.4596, + "step": 191310 + }, + { + "epoch": 9.502334359789412, + "grad_norm": 0.2109375, + "learning_rate": 3.981722459521208e-05, + "loss": 0.4776, + "step": 191320 + }, + { + "epoch": 9.50283103208503, + "grad_norm": 0.2060546875, + "learning_rate": 3.977749081156254e-05, + "loss": 0.4652, + "step": 191330 + }, + { + "epoch": 9.50332770438065, + "grad_norm": 0.216796875, + "learning_rate": 3.973775702791299e-05, + "loss": 0.4922, + "step": 191340 + }, + { + "epoch": 9.503824376676269, + "grad_norm": 0.203125, + "learning_rate": 3.969802324426344e-05, + "loss": 0.4952, + "step": 191350 + }, + { + "epoch": 9.504321048971889, + "grad_norm": 0.201171875, + "learning_rate": 3.965828946061389e-05, + "loss": 0.4622, + "step": 191360 + }, + { + "epoch": 9.504817721267507, + "grad_norm": 0.2080078125, + "learning_rate": 3.961855567696434e-05, + "loss": 0.4596, + "step": 191370 + }, + { + "epoch": 9.505314393563127, + "grad_norm": 0.2021484375, + "learning_rate": 3.957882189331479e-05, + "loss": 0.4928, + "step": 191380 + }, + { + "epoch": 9.505811065858746, + "grad_norm": 0.2001953125, + "learning_rate": 3.953908810966524e-05, + "loss": 0.4832, + "step": 191390 + }, + { + "epoch": 9.506307738154366, + "grad_norm": 0.197265625, + "learning_rate": 3.9499354326015694e-05, + "loss": 0.4834, + "step": 191400 + }, + { + "epoch": 9.506804410449986, + "grad_norm": 0.197265625, + "learning_rate": 3.945962054236615e-05, + "loss": 0.4693, + "step": 191410 + }, + { + "epoch": 9.507301082745604, + "grad_norm": 0.18359375, + "learning_rate": 3.94198867587166e-05, + "loss": 0.4934, + "step": 191420 + }, + { + "epoch": 9.507797755041224, + "grad_norm": 0.1962890625, + "learning_rate": 3.9380152975067054e-05, + "loss": 0.4893, + "step": 191430 + }, + { + "epoch": 9.508294427336843, + "grad_norm": 0.1962890625, + "learning_rate": 3.9340419191417505e-05, + "loss": 0.4647, + "step": 191440 + }, + { + "epoch": 9.508791099632463, + "grad_norm": 0.2197265625, + "learning_rate": 3.9300685407767956e-05, + "loss": 0.4931, + "step": 191450 + }, + { + "epoch": 9.509287771928081, + "grad_norm": 0.2431640625, + "learning_rate": 3.926095162411841e-05, + "loss": 0.4797, + "step": 191460 + }, + { + "epoch": 9.509784444223701, + "grad_norm": 0.1875, + "learning_rate": 3.922121784046886e-05, + "loss": 0.4645, + "step": 191470 + }, + { + "epoch": 9.510281116519321, + "grad_norm": 0.1953125, + "learning_rate": 3.918148405681931e-05, + "loss": 0.4774, + "step": 191480 + }, + { + "epoch": 9.51077778881494, + "grad_norm": 0.2216796875, + "learning_rate": 3.914175027316977e-05, + "loss": 0.5144, + "step": 191490 + }, + { + "epoch": 9.51127446111056, + "grad_norm": 0.216796875, + "learning_rate": 3.910201648952022e-05, + "loss": 0.5031, + "step": 191500 + }, + { + "epoch": 9.511771133406178, + "grad_norm": 0.23046875, + "learning_rate": 3.906228270587067e-05, + "loss": 0.4978, + "step": 191510 + }, + { + "epoch": 9.512267805701798, + "grad_norm": 0.203125, + "learning_rate": 3.902254892222112e-05, + "loss": 0.4938, + "step": 191520 + }, + { + "epoch": 9.512764477997417, + "grad_norm": 0.197265625, + "learning_rate": 3.898281513857157e-05, + "loss": 0.4878, + "step": 191530 + }, + { + "epoch": 9.513261150293037, + "grad_norm": 0.1962890625, + "learning_rate": 3.894308135492202e-05, + "loss": 0.4879, + "step": 191540 + }, + { + "epoch": 9.513757822588657, + "grad_norm": 0.21484375, + "learning_rate": 3.890334757127247e-05, + "loss": 0.5107, + "step": 191550 + }, + { + "epoch": 9.514254494884275, + "grad_norm": 0.2109375, + "learning_rate": 3.8863613787622924e-05, + "loss": 0.457, + "step": 191560 + }, + { + "epoch": 9.514751167179895, + "grad_norm": 0.2001953125, + "learning_rate": 3.8823880003973375e-05, + "loss": 0.4798, + "step": 191570 + }, + { + "epoch": 9.515247839475514, + "grad_norm": 0.197265625, + "learning_rate": 3.878414622032383e-05, + "loss": 0.4988, + "step": 191580 + }, + { + "epoch": 9.515744511771134, + "grad_norm": 0.2001953125, + "learning_rate": 3.8744412436674284e-05, + "loss": 0.4823, + "step": 191590 + }, + { + "epoch": 9.516241184066752, + "grad_norm": 0.1884765625, + "learning_rate": 3.8704678653024735e-05, + "loss": 0.4892, + "step": 191600 + }, + { + "epoch": 9.516737856362372, + "grad_norm": 0.189453125, + "learning_rate": 3.8664944869375186e-05, + "loss": 0.4959, + "step": 191610 + }, + { + "epoch": 9.517234528657992, + "grad_norm": 0.1923828125, + "learning_rate": 3.862521108572564e-05, + "loss": 0.479, + "step": 191620 + }, + { + "epoch": 9.51773120095361, + "grad_norm": 0.197265625, + "learning_rate": 3.858547730207609e-05, + "loss": 0.4896, + "step": 191630 + }, + { + "epoch": 9.51822787324923, + "grad_norm": 0.2138671875, + "learning_rate": 3.854574351842654e-05, + "loss": 0.4593, + "step": 191640 + }, + { + "epoch": 9.518724545544849, + "grad_norm": 0.2158203125, + "learning_rate": 3.850600973477699e-05, + "loss": 0.5126, + "step": 191650 + }, + { + "epoch": 9.51922121784047, + "grad_norm": 0.2275390625, + "learning_rate": 3.846627595112745e-05, + "loss": 0.513, + "step": 191660 + }, + { + "epoch": 9.519717890136087, + "grad_norm": 0.205078125, + "learning_rate": 3.84265421674779e-05, + "loss": 0.4827, + "step": 191670 + }, + { + "epoch": 9.520214562431708, + "grad_norm": 0.201171875, + "learning_rate": 3.838680838382836e-05, + "loss": 0.467, + "step": 191680 + }, + { + "epoch": 9.520711234727328, + "grad_norm": 0.2001953125, + "learning_rate": 3.834707460017881e-05, + "loss": 0.4703, + "step": 191690 + }, + { + "epoch": 9.521207907022946, + "grad_norm": 0.1923828125, + "learning_rate": 3.830734081652926e-05, + "loss": 0.4589, + "step": 191700 + }, + { + "epoch": 9.521704579318566, + "grad_norm": 0.197265625, + "learning_rate": 3.826760703287971e-05, + "loss": 0.4747, + "step": 191710 + }, + { + "epoch": 9.522201251614185, + "grad_norm": 0.201171875, + "learning_rate": 3.822787324923016e-05, + "loss": 0.4633, + "step": 191720 + }, + { + "epoch": 9.522697923909805, + "grad_norm": 0.1962890625, + "learning_rate": 3.818813946558061e-05, + "loss": 0.49, + "step": 191730 + }, + { + "epoch": 9.523194596205423, + "grad_norm": 0.2255859375, + "learning_rate": 3.814840568193106e-05, + "loss": 0.5207, + "step": 191740 + }, + { + "epoch": 9.523691268501043, + "grad_norm": 0.1953125, + "learning_rate": 3.8108671898281514e-05, + "loss": 0.4922, + "step": 191750 + }, + { + "epoch": 9.524187940796661, + "grad_norm": 0.2001953125, + "learning_rate": 3.806893811463197e-05, + "loss": 0.4768, + "step": 191760 + }, + { + "epoch": 9.524684613092282, + "grad_norm": 0.1962890625, + "learning_rate": 3.802920433098242e-05, + "loss": 0.5086, + "step": 191770 + }, + { + "epoch": 9.525181285387902, + "grad_norm": 0.2041015625, + "learning_rate": 3.7989470547332874e-05, + "loss": 0.4887, + "step": 191780 + }, + { + "epoch": 9.52567795768352, + "grad_norm": 0.201171875, + "learning_rate": 3.7949736763683325e-05, + "loss": 0.4955, + "step": 191790 + }, + { + "epoch": 9.52617462997914, + "grad_norm": 0.2099609375, + "learning_rate": 3.7910002980033776e-05, + "loss": 0.4866, + "step": 191800 + }, + { + "epoch": 9.526671302274758, + "grad_norm": 0.21484375, + "learning_rate": 3.787026919638423e-05, + "loss": 0.4779, + "step": 191810 + }, + { + "epoch": 9.527167974570379, + "grad_norm": 0.232421875, + "learning_rate": 3.783053541273468e-05, + "loss": 0.4713, + "step": 191820 + }, + { + "epoch": 9.527664646865997, + "grad_norm": 0.20703125, + "learning_rate": 3.779080162908513e-05, + "loss": 0.5013, + "step": 191830 + }, + { + "epoch": 9.528161319161617, + "grad_norm": 0.193359375, + "learning_rate": 3.775106784543559e-05, + "loss": 0.5004, + "step": 191840 + }, + { + "epoch": 9.528657991457237, + "grad_norm": 0.236328125, + "learning_rate": 3.771133406178604e-05, + "loss": 0.5178, + "step": 191850 + }, + { + "epoch": 9.529154663752855, + "grad_norm": 0.1953125, + "learning_rate": 3.767160027813649e-05, + "loss": 0.4951, + "step": 191860 + }, + { + "epoch": 9.529651336048476, + "grad_norm": 0.2080078125, + "learning_rate": 3.763186649448694e-05, + "loss": 0.4878, + "step": 191870 + }, + { + "epoch": 9.530148008344094, + "grad_norm": 0.1982421875, + "learning_rate": 3.759213271083739e-05, + "loss": 0.489, + "step": 191880 + }, + { + "epoch": 9.530644680639714, + "grad_norm": 0.2001953125, + "learning_rate": 3.755239892718784e-05, + "loss": 0.4764, + "step": 191890 + }, + { + "epoch": 9.531141352935332, + "grad_norm": 0.2099609375, + "learning_rate": 3.751266514353829e-05, + "loss": 0.4773, + "step": 191900 + }, + { + "epoch": 9.531638025230952, + "grad_norm": 0.2060546875, + "learning_rate": 3.7472931359888744e-05, + "loss": 0.4708, + "step": 191910 + }, + { + "epoch": 9.532134697526573, + "grad_norm": 0.20703125, + "learning_rate": 3.7433197576239195e-05, + "loss": 0.5081, + "step": 191920 + }, + { + "epoch": 9.532631369822191, + "grad_norm": 0.2216796875, + "learning_rate": 3.739346379258965e-05, + "loss": 0.4885, + "step": 191930 + }, + { + "epoch": 9.533128042117811, + "grad_norm": 0.2060546875, + "learning_rate": 3.7353730008940104e-05, + "loss": 0.4689, + "step": 191940 + }, + { + "epoch": 9.53362471441343, + "grad_norm": 0.203125, + "learning_rate": 3.7313996225290555e-05, + "loss": 0.4842, + "step": 191950 + }, + { + "epoch": 9.53412138670905, + "grad_norm": 0.1962890625, + "learning_rate": 3.7274262441641006e-05, + "loss": 0.4605, + "step": 191960 + }, + { + "epoch": 9.534618059004668, + "grad_norm": 0.220703125, + "learning_rate": 3.723452865799146e-05, + "loss": 0.4799, + "step": 191970 + }, + { + "epoch": 9.535114731300288, + "grad_norm": 0.2177734375, + "learning_rate": 3.719479487434191e-05, + "loss": 0.5146, + "step": 191980 + }, + { + "epoch": 9.535611403595908, + "grad_norm": 0.2080078125, + "learning_rate": 3.715506109069236e-05, + "loss": 0.5379, + "step": 191990 + }, + { + "epoch": 9.536108075891526, + "grad_norm": 0.2001953125, + "learning_rate": 3.711532730704281e-05, + "loss": 0.5096, + "step": 192000 + }, + { + "epoch": 9.536604748187147, + "grad_norm": 0.2041015625, + "learning_rate": 3.707559352339327e-05, + "loss": 0.5056, + "step": 192010 + }, + { + "epoch": 9.537101420482765, + "grad_norm": 0.23046875, + "learning_rate": 3.703585973974372e-05, + "loss": 0.5013, + "step": 192020 + }, + { + "epoch": 9.537598092778385, + "grad_norm": 0.1806640625, + "learning_rate": 3.699612595609417e-05, + "loss": 0.4694, + "step": 192030 + }, + { + "epoch": 9.538094765074003, + "grad_norm": 0.203125, + "learning_rate": 3.695639217244462e-05, + "loss": 0.4828, + "step": 192040 + }, + { + "epoch": 9.538591437369623, + "grad_norm": 0.251953125, + "learning_rate": 3.691665838879507e-05, + "loss": 0.5081, + "step": 192050 + }, + { + "epoch": 9.539088109665244, + "grad_norm": 0.2138671875, + "learning_rate": 3.687692460514552e-05, + "loss": 0.5002, + "step": 192060 + }, + { + "epoch": 9.539584781960862, + "grad_norm": 0.208984375, + "learning_rate": 3.6837190821495974e-05, + "loss": 0.4973, + "step": 192070 + }, + { + "epoch": 9.540081454256482, + "grad_norm": 0.220703125, + "learning_rate": 3.679745703784643e-05, + "loss": 0.4881, + "step": 192080 + }, + { + "epoch": 9.5405781265521, + "grad_norm": 0.2158203125, + "learning_rate": 3.675772325419688e-05, + "loss": 0.5215, + "step": 192090 + }, + { + "epoch": 9.54107479884772, + "grad_norm": 0.203125, + "learning_rate": 3.6717989470547334e-05, + "loss": 0.4756, + "step": 192100 + }, + { + "epoch": 9.541571471143339, + "grad_norm": 0.19140625, + "learning_rate": 3.667825568689779e-05, + "loss": 0.4757, + "step": 192110 + }, + { + "epoch": 9.542068143438959, + "grad_norm": 0.2119140625, + "learning_rate": 3.663852190324824e-05, + "loss": 0.4859, + "step": 192120 + }, + { + "epoch": 9.542564815734579, + "grad_norm": 0.1826171875, + "learning_rate": 3.6598788119598694e-05, + "loss": 0.4845, + "step": 192130 + }, + { + "epoch": 9.543061488030197, + "grad_norm": 0.2021484375, + "learning_rate": 3.6559054335949145e-05, + "loss": 0.4964, + "step": 192140 + }, + { + "epoch": 9.543558160325817, + "grad_norm": 0.2041015625, + "learning_rate": 3.6519320552299596e-05, + "loss": 0.4743, + "step": 192150 + }, + { + "epoch": 9.544054832621436, + "grad_norm": 0.2197265625, + "learning_rate": 3.647958676865005e-05, + "loss": 0.4867, + "step": 192160 + }, + { + "epoch": 9.544551504917056, + "grad_norm": 0.201171875, + "learning_rate": 3.64398529850005e-05, + "loss": 0.49, + "step": 192170 + }, + { + "epoch": 9.545048177212674, + "grad_norm": 0.201171875, + "learning_rate": 3.640011920135095e-05, + "loss": 0.5057, + "step": 192180 + }, + { + "epoch": 9.545544849508294, + "grad_norm": 0.220703125, + "learning_rate": 3.636038541770141e-05, + "loss": 0.48, + "step": 192190 + }, + { + "epoch": 9.546041521803915, + "grad_norm": 0.1943359375, + "learning_rate": 3.632065163405186e-05, + "loss": 0.4865, + "step": 192200 + }, + { + "epoch": 9.546538194099533, + "grad_norm": 0.1943359375, + "learning_rate": 3.628091785040231e-05, + "loss": 0.5045, + "step": 192210 + }, + { + "epoch": 9.547034866395153, + "grad_norm": 0.212890625, + "learning_rate": 3.624118406675276e-05, + "loss": 0.5066, + "step": 192220 + }, + { + "epoch": 9.547531538690771, + "grad_norm": 0.2236328125, + "learning_rate": 3.620145028310321e-05, + "loss": 0.5052, + "step": 192230 + }, + { + "epoch": 9.548028210986391, + "grad_norm": 0.2001953125, + "learning_rate": 3.616171649945366e-05, + "loss": 0.5034, + "step": 192240 + }, + { + "epoch": 9.54852488328201, + "grad_norm": 0.2412109375, + "learning_rate": 3.612198271580411e-05, + "loss": 0.5156, + "step": 192250 + }, + { + "epoch": 9.54902155557763, + "grad_norm": 0.251953125, + "learning_rate": 3.6082248932154564e-05, + "loss": 0.4992, + "step": 192260 + }, + { + "epoch": 9.54951822787325, + "grad_norm": 0.21484375, + "learning_rate": 3.6042515148505015e-05, + "loss": 0.5048, + "step": 192270 + }, + { + "epoch": 9.550014900168868, + "grad_norm": 0.2021484375, + "learning_rate": 3.600278136485547e-05, + "loss": 0.4819, + "step": 192280 + }, + { + "epoch": 9.550511572464488, + "grad_norm": 0.2578125, + "learning_rate": 3.5963047581205924e-05, + "loss": 0.4945, + "step": 192290 + }, + { + "epoch": 9.551008244760107, + "grad_norm": 0.2041015625, + "learning_rate": 3.5923313797556375e-05, + "loss": 0.4933, + "step": 192300 + }, + { + "epoch": 9.551504917055727, + "grad_norm": 0.2275390625, + "learning_rate": 3.5883580013906826e-05, + "loss": 0.4834, + "step": 192310 + }, + { + "epoch": 9.552001589351345, + "grad_norm": 0.1943359375, + "learning_rate": 3.584384623025728e-05, + "loss": 0.5017, + "step": 192320 + }, + { + "epoch": 9.552498261646965, + "grad_norm": 0.203125, + "learning_rate": 3.580411244660773e-05, + "loss": 0.4988, + "step": 192330 + }, + { + "epoch": 9.552994933942585, + "grad_norm": 0.2236328125, + "learning_rate": 3.576437866295818e-05, + "loss": 0.4934, + "step": 192340 + }, + { + "epoch": 9.553491606238204, + "grad_norm": 0.1962890625, + "learning_rate": 3.572464487930863e-05, + "loss": 0.5099, + "step": 192350 + }, + { + "epoch": 9.553988278533824, + "grad_norm": 0.2080078125, + "learning_rate": 3.568491109565909e-05, + "loss": 0.5079, + "step": 192360 + }, + { + "epoch": 9.554484950829442, + "grad_norm": 0.1943359375, + "learning_rate": 3.564517731200954e-05, + "loss": 0.4867, + "step": 192370 + }, + { + "epoch": 9.554981623125062, + "grad_norm": 0.20703125, + "learning_rate": 3.560544352835999e-05, + "loss": 0.4577, + "step": 192380 + }, + { + "epoch": 9.55547829542068, + "grad_norm": 0.1904296875, + "learning_rate": 3.556570974471044e-05, + "loss": 0.4575, + "step": 192390 + }, + { + "epoch": 9.5559749677163, + "grad_norm": 0.2060546875, + "learning_rate": 3.552597596106089e-05, + "loss": 0.4862, + "step": 192400 + }, + { + "epoch": 9.556471640011921, + "grad_norm": 0.2431640625, + "learning_rate": 3.548624217741134e-05, + "loss": 0.4689, + "step": 192410 + }, + { + "epoch": 9.55696831230754, + "grad_norm": 0.2109375, + "learning_rate": 3.5446508393761794e-05, + "loss": 0.4932, + "step": 192420 + }, + { + "epoch": 9.55746498460316, + "grad_norm": 0.205078125, + "learning_rate": 3.5406774610112245e-05, + "loss": 0.4783, + "step": 192430 + }, + { + "epoch": 9.557961656898778, + "grad_norm": 0.21875, + "learning_rate": 3.53670408264627e-05, + "loss": 0.4805, + "step": 192440 + }, + { + "epoch": 9.558458329194398, + "grad_norm": 0.2021484375, + "learning_rate": 3.5327307042813154e-05, + "loss": 0.5025, + "step": 192450 + }, + { + "epoch": 9.558955001490016, + "grad_norm": 0.189453125, + "learning_rate": 3.5287573259163605e-05, + "loss": 0.4831, + "step": 192460 + }, + { + "epoch": 9.559451673785636, + "grad_norm": 0.2021484375, + "learning_rate": 3.5247839475514056e-05, + "loss": 0.4607, + "step": 192470 + }, + { + "epoch": 9.559948346081256, + "grad_norm": 0.2021484375, + "learning_rate": 3.520810569186451e-05, + "loss": 0.4851, + "step": 192480 + }, + { + "epoch": 9.560445018376875, + "grad_norm": 0.1875, + "learning_rate": 3.516837190821496e-05, + "loss": 0.4971, + "step": 192490 + }, + { + "epoch": 9.560941690672495, + "grad_norm": 0.2275390625, + "learning_rate": 3.5128638124565416e-05, + "loss": 0.4889, + "step": 192500 + }, + { + "epoch": 9.561438362968113, + "grad_norm": 0.2197265625, + "learning_rate": 3.508890434091587e-05, + "loss": 0.4994, + "step": 192510 + }, + { + "epoch": 9.561935035263733, + "grad_norm": 0.1923828125, + "learning_rate": 3.504917055726632e-05, + "loss": 0.4843, + "step": 192520 + }, + { + "epoch": 9.562431707559352, + "grad_norm": 0.1923828125, + "learning_rate": 3.500943677361677e-05, + "loss": 0.4535, + "step": 192530 + }, + { + "epoch": 9.562928379854972, + "grad_norm": 0.2197265625, + "learning_rate": 3.4969702989967227e-05, + "loss": 0.469, + "step": 192540 + }, + { + "epoch": 9.563425052150592, + "grad_norm": 0.212890625, + "learning_rate": 3.492996920631768e-05, + "loss": 0.4811, + "step": 192550 + }, + { + "epoch": 9.56392172444621, + "grad_norm": 0.2060546875, + "learning_rate": 3.489023542266813e-05, + "loss": 0.4826, + "step": 192560 + }, + { + "epoch": 9.56441839674183, + "grad_norm": 0.2119140625, + "learning_rate": 3.485050163901858e-05, + "loss": 0.4918, + "step": 192570 + }, + { + "epoch": 9.564915069037449, + "grad_norm": 0.212890625, + "learning_rate": 3.481076785536903e-05, + "loss": 0.4913, + "step": 192580 + }, + { + "epoch": 9.565411741333069, + "grad_norm": 0.19921875, + "learning_rate": 3.477103407171948e-05, + "loss": 0.4949, + "step": 192590 + }, + { + "epoch": 9.565908413628687, + "grad_norm": 0.201171875, + "learning_rate": 3.473130028806993e-05, + "loss": 0.5122, + "step": 192600 + }, + { + "epoch": 9.566405085924307, + "grad_norm": 0.2373046875, + "learning_rate": 3.4691566504420384e-05, + "loss": 0.4734, + "step": 192610 + }, + { + "epoch": 9.566901758219927, + "grad_norm": 0.2099609375, + "learning_rate": 3.4651832720770835e-05, + "loss": 0.4896, + "step": 192620 + }, + { + "epoch": 9.567398430515546, + "grad_norm": 0.1953125, + "learning_rate": 3.461209893712129e-05, + "loss": 0.5158, + "step": 192630 + }, + { + "epoch": 9.567895102811166, + "grad_norm": 0.2236328125, + "learning_rate": 3.4572365153471744e-05, + "loss": 0.5142, + "step": 192640 + }, + { + "epoch": 9.568391775106784, + "grad_norm": 0.220703125, + "learning_rate": 3.4532631369822195e-05, + "loss": 0.4838, + "step": 192650 + }, + { + "epoch": 9.568888447402404, + "grad_norm": 0.205078125, + "learning_rate": 3.4492897586172646e-05, + "loss": 0.4967, + "step": 192660 + }, + { + "epoch": 9.569385119698023, + "grad_norm": 0.23046875, + "learning_rate": 3.44531638025231e-05, + "loss": 0.4832, + "step": 192670 + }, + { + "epoch": 9.569881791993643, + "grad_norm": 0.2431640625, + "learning_rate": 3.441343001887355e-05, + "loss": 0.4955, + "step": 192680 + }, + { + "epoch": 9.570378464289263, + "grad_norm": 0.19140625, + "learning_rate": 3.4373696235224e-05, + "loss": 0.4461, + "step": 192690 + }, + { + "epoch": 9.570875136584881, + "grad_norm": 0.234375, + "learning_rate": 3.433396245157445e-05, + "loss": 0.4708, + "step": 192700 + }, + { + "epoch": 9.571371808880501, + "grad_norm": 0.1943359375, + "learning_rate": 3.429422866792491e-05, + "loss": 0.468, + "step": 192710 + }, + { + "epoch": 9.57186848117612, + "grad_norm": 0.201171875, + "learning_rate": 3.425449488427536e-05, + "loss": 0.5008, + "step": 192720 + }, + { + "epoch": 9.57236515347174, + "grad_norm": 0.197265625, + "learning_rate": 3.421476110062581e-05, + "loss": 0.4814, + "step": 192730 + }, + { + "epoch": 9.572861825767358, + "grad_norm": 0.2109375, + "learning_rate": 3.417502731697626e-05, + "loss": 0.4937, + "step": 192740 + }, + { + "epoch": 9.573358498062978, + "grad_norm": 0.2109375, + "learning_rate": 3.413529353332671e-05, + "loss": 0.4934, + "step": 192750 + }, + { + "epoch": 9.573855170358597, + "grad_norm": 0.2041015625, + "learning_rate": 3.409555974967716e-05, + "loss": 0.4747, + "step": 192760 + }, + { + "epoch": 9.574351842654217, + "grad_norm": 0.2578125, + "learning_rate": 3.4055825966027614e-05, + "loss": 0.5158, + "step": 192770 + }, + { + "epoch": 9.574848514949837, + "grad_norm": 0.19140625, + "learning_rate": 3.4016092182378065e-05, + "loss": 0.4607, + "step": 192780 + }, + { + "epoch": 9.575345187245455, + "grad_norm": 0.2158203125, + "learning_rate": 3.397635839872852e-05, + "loss": 0.507, + "step": 192790 + }, + { + "epoch": 9.575841859541075, + "grad_norm": 0.2275390625, + "learning_rate": 3.3936624615078974e-05, + "loss": 0.5017, + "step": 192800 + }, + { + "epoch": 9.576338531836694, + "grad_norm": 0.205078125, + "learning_rate": 3.3896890831429425e-05, + "loss": 0.5241, + "step": 192810 + }, + { + "epoch": 9.576835204132314, + "grad_norm": 0.2119140625, + "learning_rate": 3.3857157047779876e-05, + "loss": 0.4668, + "step": 192820 + }, + { + "epoch": 9.577331876427932, + "grad_norm": 0.1943359375, + "learning_rate": 3.381742326413033e-05, + "loss": 0.5134, + "step": 192830 + }, + { + "epoch": 9.577828548723552, + "grad_norm": 0.2021484375, + "learning_rate": 3.377768948048078e-05, + "loss": 0.4805, + "step": 192840 + }, + { + "epoch": 9.578325221019172, + "grad_norm": 0.216796875, + "learning_rate": 3.373795569683123e-05, + "loss": 0.4943, + "step": 192850 + }, + { + "epoch": 9.57882189331479, + "grad_norm": 0.2041015625, + "learning_rate": 3.369822191318168e-05, + "loss": 0.4839, + "step": 192860 + }, + { + "epoch": 9.57931856561041, + "grad_norm": 0.201171875, + "learning_rate": 3.365848812953213e-05, + "loss": 0.4885, + "step": 192870 + }, + { + "epoch": 9.579815237906029, + "grad_norm": 0.1826171875, + "learning_rate": 3.361875434588259e-05, + "loss": 0.4661, + "step": 192880 + }, + { + "epoch": 9.58031191020165, + "grad_norm": 0.201171875, + "learning_rate": 3.357902056223304e-05, + "loss": 0.4859, + "step": 192890 + }, + { + "epoch": 9.580808582497268, + "grad_norm": 0.2060546875, + "learning_rate": 3.353928677858349e-05, + "loss": 0.4815, + "step": 192900 + }, + { + "epoch": 9.581305254792888, + "grad_norm": 0.1865234375, + "learning_rate": 3.349955299493394e-05, + "loss": 0.5095, + "step": 192910 + }, + { + "epoch": 9.581801927088508, + "grad_norm": 0.1982421875, + "learning_rate": 3.34598192112844e-05, + "loss": 0.4854, + "step": 192920 + }, + { + "epoch": 9.582298599384126, + "grad_norm": 0.236328125, + "learning_rate": 3.342008542763485e-05, + "loss": 0.5088, + "step": 192930 + }, + { + "epoch": 9.582795271679746, + "grad_norm": 0.2021484375, + "learning_rate": 3.33803516439853e-05, + "loss": 0.4978, + "step": 192940 + }, + { + "epoch": 9.583291943975365, + "grad_norm": 0.2021484375, + "learning_rate": 3.334061786033575e-05, + "loss": 0.5023, + "step": 192950 + }, + { + "epoch": 9.583788616270985, + "grad_norm": 0.1904296875, + "learning_rate": 3.3300884076686204e-05, + "loss": 0.4632, + "step": 192960 + }, + { + "epoch": 9.584285288566603, + "grad_norm": 0.212890625, + "learning_rate": 3.326115029303666e-05, + "loss": 0.4744, + "step": 192970 + }, + { + "epoch": 9.584781960862223, + "grad_norm": 0.2001953125, + "learning_rate": 3.322141650938711e-05, + "loss": 0.4646, + "step": 192980 + }, + { + "epoch": 9.585278633157843, + "grad_norm": 0.23046875, + "learning_rate": 3.3181682725737563e-05, + "loss": 0.4881, + "step": 192990 + }, + { + "epoch": 9.585775305453462, + "grad_norm": 0.23828125, + "learning_rate": 3.3141948942088014e-05, + "loss": 0.4947, + "step": 193000 + }, + { + "epoch": 9.586271977749082, + "grad_norm": 0.2138671875, + "learning_rate": 3.3102215158438466e-05, + "loss": 0.4865, + "step": 193010 + }, + { + "epoch": 9.5867686500447, + "grad_norm": 0.1982421875, + "learning_rate": 3.3062481374788917e-05, + "loss": 0.4791, + "step": 193020 + }, + { + "epoch": 9.58726532234032, + "grad_norm": 0.224609375, + "learning_rate": 3.302274759113937e-05, + "loss": 0.4772, + "step": 193030 + }, + { + "epoch": 9.587761994635938, + "grad_norm": 0.26171875, + "learning_rate": 3.298301380748982e-05, + "loss": 0.5063, + "step": 193040 + }, + { + "epoch": 9.588258666931559, + "grad_norm": 0.2216796875, + "learning_rate": 3.294328002384027e-05, + "loss": 0.4819, + "step": 193050 + }, + { + "epoch": 9.588755339227179, + "grad_norm": 0.1953125, + "learning_rate": 3.290354624019073e-05, + "loss": 0.5097, + "step": 193060 + }, + { + "epoch": 9.589252011522797, + "grad_norm": 0.21484375, + "learning_rate": 3.286381245654118e-05, + "loss": 0.4994, + "step": 193070 + }, + { + "epoch": 9.589748683818417, + "grad_norm": 0.251953125, + "learning_rate": 3.282407867289163e-05, + "loss": 0.4718, + "step": 193080 + }, + { + "epoch": 9.590245356114036, + "grad_norm": 0.24609375, + "learning_rate": 3.278434488924208e-05, + "loss": 0.5238, + "step": 193090 + }, + { + "epoch": 9.590742028409656, + "grad_norm": 0.205078125, + "learning_rate": 3.274461110559253e-05, + "loss": 0.4581, + "step": 193100 + }, + { + "epoch": 9.591238700705274, + "grad_norm": 0.2392578125, + "learning_rate": 3.270487732194298e-05, + "loss": 0.454, + "step": 193110 + }, + { + "epoch": 9.591735373000894, + "grad_norm": 0.2041015625, + "learning_rate": 3.2665143538293434e-05, + "loss": 0.4849, + "step": 193120 + }, + { + "epoch": 9.592232045296514, + "grad_norm": 0.197265625, + "learning_rate": 3.2625409754643885e-05, + "loss": 0.5066, + "step": 193130 + }, + { + "epoch": 9.592728717592133, + "grad_norm": 0.1923828125, + "learning_rate": 3.258567597099434e-05, + "loss": 0.4867, + "step": 193140 + }, + { + "epoch": 9.593225389887753, + "grad_norm": 0.1962890625, + "learning_rate": 3.2545942187344793e-05, + "loss": 0.4706, + "step": 193150 + }, + { + "epoch": 9.593722062183371, + "grad_norm": 0.1982421875, + "learning_rate": 3.2506208403695244e-05, + "loss": 0.4917, + "step": 193160 + }, + { + "epoch": 9.594218734478991, + "grad_norm": 0.212890625, + "learning_rate": 3.2466474620045696e-05, + "loss": 0.4563, + "step": 193170 + }, + { + "epoch": 9.59471540677461, + "grad_norm": 0.2119140625, + "learning_rate": 3.2426740836396147e-05, + "loss": 0.4949, + "step": 193180 + }, + { + "epoch": 9.59521207907023, + "grad_norm": 0.2119140625, + "learning_rate": 3.23870070527466e-05, + "loss": 0.5061, + "step": 193190 + }, + { + "epoch": 9.595708751365848, + "grad_norm": 0.205078125, + "learning_rate": 3.234727326909705e-05, + "loss": 0.4821, + "step": 193200 + }, + { + "epoch": 9.596205423661468, + "grad_norm": 0.19140625, + "learning_rate": 3.23075394854475e-05, + "loss": 0.4701, + "step": 193210 + }, + { + "epoch": 9.596702095957088, + "grad_norm": 0.224609375, + "learning_rate": 3.226780570179796e-05, + "loss": 0.5114, + "step": 193220 + }, + { + "epoch": 9.597198768252706, + "grad_norm": 0.205078125, + "learning_rate": 3.222807191814841e-05, + "loss": 0.4598, + "step": 193230 + }, + { + "epoch": 9.597695440548327, + "grad_norm": 0.2373046875, + "learning_rate": 3.218833813449886e-05, + "loss": 0.4873, + "step": 193240 + }, + { + "epoch": 9.598192112843945, + "grad_norm": 0.21484375, + "learning_rate": 3.214860435084931e-05, + "loss": 0.4576, + "step": 193250 + }, + { + "epoch": 9.598688785139565, + "grad_norm": 0.236328125, + "learning_rate": 3.210887056719976e-05, + "loss": 0.5117, + "step": 193260 + }, + { + "epoch": 9.599185457435183, + "grad_norm": 0.20703125, + "learning_rate": 3.206913678355021e-05, + "loss": 0.4732, + "step": 193270 + }, + { + "epoch": 9.599682129730803, + "grad_norm": 0.2255859375, + "learning_rate": 3.2029402999900664e-05, + "loss": 0.4554, + "step": 193280 + }, + { + "epoch": 9.600178802026424, + "grad_norm": 0.2265625, + "learning_rate": 3.1989669216251115e-05, + "loss": 0.4936, + "step": 193290 + }, + { + "epoch": 9.600675474322042, + "grad_norm": 0.197265625, + "learning_rate": 3.1949935432601566e-05, + "loss": 0.4698, + "step": 193300 + }, + { + "epoch": 9.601172146617662, + "grad_norm": 0.2080078125, + "learning_rate": 3.1910201648952023e-05, + "loss": 0.4765, + "step": 193310 + }, + { + "epoch": 9.60166881891328, + "grad_norm": 0.2109375, + "learning_rate": 3.1870467865302474e-05, + "loss": 0.487, + "step": 193320 + }, + { + "epoch": 9.6021654912089, + "grad_norm": 0.2197265625, + "learning_rate": 3.1830734081652925e-05, + "loss": 0.501, + "step": 193330 + }, + { + "epoch": 9.602662163504519, + "grad_norm": 0.2080078125, + "learning_rate": 3.179100029800338e-05, + "loss": 0.4816, + "step": 193340 + }, + { + "epoch": 9.603158835800139, + "grad_norm": 0.23046875, + "learning_rate": 3.1751266514353834e-05, + "loss": 0.4811, + "step": 193350 + }, + { + "epoch": 9.603655508095759, + "grad_norm": 0.19921875, + "learning_rate": 3.1711532730704285e-05, + "loss": 0.4869, + "step": 193360 + }, + { + "epoch": 9.604152180391377, + "grad_norm": 0.2333984375, + "learning_rate": 3.1671798947054736e-05, + "loss": 0.4844, + "step": 193370 + }, + { + "epoch": 9.604648852686998, + "grad_norm": 0.228515625, + "learning_rate": 3.163206516340519e-05, + "loss": 0.5069, + "step": 193380 + }, + { + "epoch": 9.605145524982616, + "grad_norm": 0.2060546875, + "learning_rate": 3.159233137975564e-05, + "loss": 0.5005, + "step": 193390 + }, + { + "epoch": 9.605642197278236, + "grad_norm": 0.2080078125, + "learning_rate": 3.155259759610609e-05, + "loss": 0.5013, + "step": 193400 + }, + { + "epoch": 9.606138869573854, + "grad_norm": 0.2109375, + "learning_rate": 3.151286381245655e-05, + "loss": 0.5013, + "step": 193410 + }, + { + "epoch": 9.606635541869474, + "grad_norm": 0.2412109375, + "learning_rate": 3.1473130028807e-05, + "loss": 0.505, + "step": 193420 + }, + { + "epoch": 9.607132214165095, + "grad_norm": 0.1875, + "learning_rate": 3.143339624515745e-05, + "loss": 0.4849, + "step": 193430 + }, + { + "epoch": 9.607628886460713, + "grad_norm": 0.20703125, + "learning_rate": 3.13936624615079e-05, + "loss": 0.5046, + "step": 193440 + }, + { + "epoch": 9.608125558756333, + "grad_norm": 0.203125, + "learning_rate": 3.135392867785835e-05, + "loss": 0.4999, + "step": 193450 + }, + { + "epoch": 9.608622231051951, + "grad_norm": 0.21875, + "learning_rate": 3.13141948942088e-05, + "loss": 0.4982, + "step": 193460 + }, + { + "epoch": 9.609118903347571, + "grad_norm": 0.1962890625, + "learning_rate": 3.1274461110559253e-05, + "loss": 0.5025, + "step": 193470 + }, + { + "epoch": 9.60961557564319, + "grad_norm": 0.2060546875, + "learning_rate": 3.1234727326909704e-05, + "loss": 0.4829, + "step": 193480 + }, + { + "epoch": 9.61011224793881, + "grad_norm": 0.234375, + "learning_rate": 3.119499354326016e-05, + "loss": 0.4738, + "step": 193490 + }, + { + "epoch": 9.61060892023443, + "grad_norm": 0.197265625, + "learning_rate": 3.115525975961061e-05, + "loss": 0.4664, + "step": 193500 + }, + { + "epoch": 9.611105592530048, + "grad_norm": 0.2080078125, + "learning_rate": 3.1115525975961064e-05, + "loss": 0.4688, + "step": 193510 + }, + { + "epoch": 9.611602264825668, + "grad_norm": 0.2041015625, + "learning_rate": 3.1075792192311515e-05, + "loss": 0.4797, + "step": 193520 + }, + { + "epoch": 9.612098937121287, + "grad_norm": 0.23828125, + "learning_rate": 3.1036058408661966e-05, + "loss": 0.4989, + "step": 193530 + }, + { + "epoch": 9.612595609416907, + "grad_norm": 0.2158203125, + "learning_rate": 3.099632462501242e-05, + "loss": 0.5014, + "step": 193540 + }, + { + "epoch": 9.613092281712525, + "grad_norm": 0.2109375, + "learning_rate": 3.095659084136287e-05, + "loss": 0.4888, + "step": 193550 + }, + { + "epoch": 9.613588954008145, + "grad_norm": 0.2119140625, + "learning_rate": 3.091685705771332e-05, + "loss": 0.5036, + "step": 193560 + }, + { + "epoch": 9.614085626303766, + "grad_norm": 0.201171875, + "learning_rate": 3.087712327406378e-05, + "loss": 0.4386, + "step": 193570 + }, + { + "epoch": 9.614582298599384, + "grad_norm": 0.2197265625, + "learning_rate": 3.083738949041423e-05, + "loss": 0.5126, + "step": 193580 + }, + { + "epoch": 9.615078970895004, + "grad_norm": 0.2373046875, + "learning_rate": 3.079765570676468e-05, + "loss": 0.4903, + "step": 193590 + }, + { + "epoch": 9.615575643190622, + "grad_norm": 0.22265625, + "learning_rate": 3.075792192311513e-05, + "loss": 0.4977, + "step": 193600 + }, + { + "epoch": 9.616072315486242, + "grad_norm": 0.2119140625, + "learning_rate": 3.071818813946558e-05, + "loss": 0.479, + "step": 193610 + }, + { + "epoch": 9.61656898778186, + "grad_norm": 0.212890625, + "learning_rate": 3.067845435581603e-05, + "loss": 0.5186, + "step": 193620 + }, + { + "epoch": 9.61706566007748, + "grad_norm": 0.2041015625, + "learning_rate": 3.0638720572166483e-05, + "loss": 0.52, + "step": 193630 + }, + { + "epoch": 9.617562332373101, + "grad_norm": 0.208984375, + "learning_rate": 3.0598986788516934e-05, + "loss": 0.4909, + "step": 193640 + }, + { + "epoch": 9.61805900466872, + "grad_norm": 0.2119140625, + "learning_rate": 3.0559253004867385e-05, + "loss": 0.4971, + "step": 193650 + }, + { + "epoch": 9.61855567696434, + "grad_norm": 0.224609375, + "learning_rate": 3.051951922121784e-05, + "loss": 0.492, + "step": 193660 + }, + { + "epoch": 9.619052349259958, + "grad_norm": 0.212890625, + "learning_rate": 3.047978543756829e-05, + "loss": 0.4944, + "step": 193670 + }, + { + "epoch": 9.619549021555578, + "grad_norm": 0.275390625, + "learning_rate": 3.0440051653918745e-05, + "loss": 0.5009, + "step": 193680 + }, + { + "epoch": 9.620045693851196, + "grad_norm": 0.2060546875, + "learning_rate": 3.0400317870269196e-05, + "loss": 0.5048, + "step": 193690 + }, + { + "epoch": 9.620542366146816, + "grad_norm": 0.1962890625, + "learning_rate": 3.0360584086619647e-05, + "loss": 0.47, + "step": 193700 + }, + { + "epoch": 9.621039038442436, + "grad_norm": 0.189453125, + "learning_rate": 3.03208503029701e-05, + "loss": 0.5024, + "step": 193710 + }, + { + "epoch": 9.621535710738055, + "grad_norm": 0.1923828125, + "learning_rate": 3.0281116519320553e-05, + "loss": 0.4578, + "step": 193720 + }, + { + "epoch": 9.622032383033675, + "grad_norm": 0.2001953125, + "learning_rate": 3.0241382735671004e-05, + "loss": 0.5051, + "step": 193730 + }, + { + "epoch": 9.622529055329293, + "grad_norm": 0.2109375, + "learning_rate": 3.0201648952021455e-05, + "loss": 0.4967, + "step": 193740 + }, + { + "epoch": 9.623025727624913, + "grad_norm": 0.1806640625, + "learning_rate": 3.0161915168371906e-05, + "loss": 0.4644, + "step": 193750 + }, + { + "epoch": 9.623522399920532, + "grad_norm": 0.2451171875, + "learning_rate": 3.0122181384722364e-05, + "loss": 0.468, + "step": 193760 + }, + { + "epoch": 9.624019072216152, + "grad_norm": 0.2060546875, + "learning_rate": 3.0082447601072815e-05, + "loss": 0.4674, + "step": 193770 + }, + { + "epoch": 9.624515744511772, + "grad_norm": 0.2119140625, + "learning_rate": 3.004271381742327e-05, + "loss": 0.4686, + "step": 193780 + }, + { + "epoch": 9.62501241680739, + "grad_norm": 0.2021484375, + "learning_rate": 3.000298003377372e-05, + "loss": 0.5239, + "step": 193790 + }, + { + "epoch": 9.62550908910301, + "grad_norm": 0.2138671875, + "learning_rate": 2.996324625012417e-05, + "loss": 0.5107, + "step": 193800 + }, + { + "epoch": 9.626005761398629, + "grad_norm": 0.2021484375, + "learning_rate": 2.9923512466474622e-05, + "loss": 0.5074, + "step": 193810 + }, + { + "epoch": 9.626502433694249, + "grad_norm": 0.2119140625, + "learning_rate": 2.9883778682825077e-05, + "loss": 0.4984, + "step": 193820 + }, + { + "epoch": 9.626999105989867, + "grad_norm": 0.2216796875, + "learning_rate": 2.9844044899175528e-05, + "loss": 0.4873, + "step": 193830 + }, + { + "epoch": 9.627495778285487, + "grad_norm": 0.2080078125, + "learning_rate": 2.980431111552598e-05, + "loss": 0.5125, + "step": 193840 + }, + { + "epoch": 9.627992450581107, + "grad_norm": 0.1943359375, + "learning_rate": 2.976457733187643e-05, + "loss": 0.4712, + "step": 193850 + }, + { + "epoch": 9.628489122876726, + "grad_norm": 0.2197265625, + "learning_rate": 2.9724843548226884e-05, + "loss": 0.4807, + "step": 193860 + }, + { + "epoch": 9.628985795172346, + "grad_norm": 0.2275390625, + "learning_rate": 2.9685109764577335e-05, + "loss": 0.4664, + "step": 193870 + }, + { + "epoch": 9.629482467467964, + "grad_norm": 0.251953125, + "learning_rate": 2.9645375980927786e-05, + "loss": 0.515, + "step": 193880 + }, + { + "epoch": 9.629979139763584, + "grad_norm": 0.208984375, + "learning_rate": 2.9605642197278237e-05, + "loss": 0.4993, + "step": 193890 + }, + { + "epoch": 9.630475812059203, + "grad_norm": 0.2294921875, + "learning_rate": 2.956590841362869e-05, + "loss": 0.5217, + "step": 193900 + }, + { + "epoch": 9.630972484354823, + "grad_norm": 0.2294921875, + "learning_rate": 2.9526174629979143e-05, + "loss": 0.4946, + "step": 193910 + }, + { + "epoch": 9.631469156650443, + "grad_norm": 0.2080078125, + "learning_rate": 2.9486440846329594e-05, + "loss": 0.4654, + "step": 193920 + }, + { + "epoch": 9.631965828946061, + "grad_norm": 0.189453125, + "learning_rate": 2.9446707062680045e-05, + "loss": 0.4723, + "step": 193930 + }, + { + "epoch": 9.632462501241681, + "grad_norm": 0.2001953125, + "learning_rate": 2.94069732790305e-05, + "loss": 0.5216, + "step": 193940 + }, + { + "epoch": 9.6329591735373, + "grad_norm": 0.2099609375, + "learning_rate": 2.936723949538095e-05, + "loss": 0.4662, + "step": 193950 + }, + { + "epoch": 9.63345584583292, + "grad_norm": 0.1923828125, + "learning_rate": 2.93275057117314e-05, + "loss": 0.4802, + "step": 193960 + }, + { + "epoch": 9.633952518128538, + "grad_norm": 0.212890625, + "learning_rate": 2.9287771928081852e-05, + "loss": 0.4854, + "step": 193970 + }, + { + "epoch": 9.634449190424158, + "grad_norm": 0.2060546875, + "learning_rate": 2.9248038144432303e-05, + "loss": 0.5109, + "step": 193980 + }, + { + "epoch": 9.634945862719778, + "grad_norm": 0.19921875, + "learning_rate": 2.9208304360782758e-05, + "loss": 0.4891, + "step": 193990 + }, + { + "epoch": 9.635442535015397, + "grad_norm": 0.2216796875, + "learning_rate": 2.916857057713321e-05, + "loss": 0.4895, + "step": 194000 + }, + { + "epoch": 9.635939207311017, + "grad_norm": 0.2021484375, + "learning_rate": 2.912883679348366e-05, + "loss": 0.5063, + "step": 194010 + }, + { + "epoch": 9.636435879606635, + "grad_norm": 0.2041015625, + "learning_rate": 2.908910300983411e-05, + "loss": 0.473, + "step": 194020 + }, + { + "epoch": 9.636932551902255, + "grad_norm": 0.203125, + "learning_rate": 2.9049369226184565e-05, + "loss": 0.5293, + "step": 194030 + }, + { + "epoch": 9.637429224197874, + "grad_norm": 0.19921875, + "learning_rate": 2.9009635442535016e-05, + "loss": 0.478, + "step": 194040 + }, + { + "epoch": 9.637925896493494, + "grad_norm": 0.236328125, + "learning_rate": 2.8969901658885467e-05, + "loss": 0.4794, + "step": 194050 + }, + { + "epoch": 9.638422568789114, + "grad_norm": 0.203125, + "learning_rate": 2.8930167875235918e-05, + "loss": 0.5039, + "step": 194060 + }, + { + "epoch": 9.638919241084732, + "grad_norm": 0.2001953125, + "learning_rate": 2.8890434091586373e-05, + "loss": 0.4825, + "step": 194070 + }, + { + "epoch": 9.639415913380352, + "grad_norm": 0.2373046875, + "learning_rate": 2.8850700307936824e-05, + "loss": 0.4791, + "step": 194080 + }, + { + "epoch": 9.63991258567597, + "grad_norm": 0.1962890625, + "learning_rate": 2.8810966524287275e-05, + "loss": 0.4545, + "step": 194090 + }, + { + "epoch": 9.64040925797159, + "grad_norm": 0.2099609375, + "learning_rate": 2.8771232740637726e-05, + "loss": 0.5037, + "step": 194100 + }, + { + "epoch": 9.640905930267209, + "grad_norm": 0.26953125, + "learning_rate": 2.873149895698818e-05, + "loss": 0.4948, + "step": 194110 + }, + { + "epoch": 9.64140260256283, + "grad_norm": 0.2001953125, + "learning_rate": 2.869176517333863e-05, + "loss": 0.4682, + "step": 194120 + }, + { + "epoch": 9.64189927485845, + "grad_norm": 0.197265625, + "learning_rate": 2.8652031389689082e-05, + "loss": 0.4699, + "step": 194130 + }, + { + "epoch": 9.642395947154068, + "grad_norm": 0.193359375, + "learning_rate": 2.8612297606039533e-05, + "loss": 0.5437, + "step": 194140 + }, + { + "epoch": 9.642892619449688, + "grad_norm": 0.2236328125, + "learning_rate": 2.8572563822389988e-05, + "loss": 0.4852, + "step": 194150 + }, + { + "epoch": 9.643389291745306, + "grad_norm": 0.2138671875, + "learning_rate": 2.853283003874044e-05, + "loss": 0.5102, + "step": 194160 + }, + { + "epoch": 9.643885964040926, + "grad_norm": 0.232421875, + "learning_rate": 2.849309625509089e-05, + "loss": 0.5137, + "step": 194170 + }, + { + "epoch": 9.644382636336545, + "grad_norm": 0.228515625, + "learning_rate": 2.845336247144134e-05, + "loss": 0.4675, + "step": 194180 + }, + { + "epoch": 9.644879308632165, + "grad_norm": 0.1943359375, + "learning_rate": 2.84136286877918e-05, + "loss": 0.4924, + "step": 194190 + }, + { + "epoch": 9.645375980927783, + "grad_norm": 0.212890625, + "learning_rate": 2.837389490414225e-05, + "loss": 0.5043, + "step": 194200 + }, + { + "epoch": 9.645872653223403, + "grad_norm": 0.22265625, + "learning_rate": 2.8334161120492704e-05, + "loss": 0.4471, + "step": 194210 + }, + { + "epoch": 9.646369325519023, + "grad_norm": 0.2314453125, + "learning_rate": 2.8294427336843155e-05, + "loss": 0.4832, + "step": 194220 + }, + { + "epoch": 9.646865997814642, + "grad_norm": 0.21484375, + "learning_rate": 2.8254693553193606e-05, + "loss": 0.487, + "step": 194230 + }, + { + "epoch": 9.647362670110262, + "grad_norm": 0.2119140625, + "learning_rate": 2.8214959769544057e-05, + "loss": 0.4722, + "step": 194240 + }, + { + "epoch": 9.64785934240588, + "grad_norm": 0.1953125, + "learning_rate": 2.817522598589451e-05, + "loss": 0.4637, + "step": 194250 + }, + { + "epoch": 9.6483560147015, + "grad_norm": 0.208984375, + "learning_rate": 2.8135492202244962e-05, + "loss": 0.4972, + "step": 194260 + }, + { + "epoch": 9.648852686997119, + "grad_norm": 0.2392578125, + "learning_rate": 2.8095758418595413e-05, + "loss": 0.4992, + "step": 194270 + }, + { + "epoch": 9.649349359292739, + "grad_norm": 0.1953125, + "learning_rate": 2.8056024634945865e-05, + "loss": 0.4637, + "step": 194280 + }, + { + "epoch": 9.649846031588359, + "grad_norm": 0.22265625, + "learning_rate": 2.801629085129632e-05, + "loss": 0.4976, + "step": 194290 + }, + { + "epoch": 9.650342703883977, + "grad_norm": 0.19921875, + "learning_rate": 2.797655706764677e-05, + "loss": 0.4965, + "step": 194300 + }, + { + "epoch": 9.650839376179597, + "grad_norm": 0.2001953125, + "learning_rate": 2.793682328399722e-05, + "loss": 0.4809, + "step": 194310 + }, + { + "epoch": 9.651336048475216, + "grad_norm": 0.21875, + "learning_rate": 2.7897089500347672e-05, + "loss": 0.5083, + "step": 194320 + }, + { + "epoch": 9.651832720770836, + "grad_norm": 0.1962890625, + "learning_rate": 2.7857355716698123e-05, + "loss": 0.4955, + "step": 194330 + }, + { + "epoch": 9.652329393066454, + "grad_norm": 0.2041015625, + "learning_rate": 2.7817621933048577e-05, + "loss": 0.4847, + "step": 194340 + }, + { + "epoch": 9.652826065362074, + "grad_norm": 0.220703125, + "learning_rate": 2.777788814939903e-05, + "loss": 0.4941, + "step": 194350 + }, + { + "epoch": 9.653322737657694, + "grad_norm": 0.22265625, + "learning_rate": 2.773815436574948e-05, + "loss": 0.5079, + "step": 194360 + }, + { + "epoch": 9.653819409953313, + "grad_norm": 0.23828125, + "learning_rate": 2.769842058209993e-05, + "loss": 0.5025, + "step": 194370 + }, + { + "epoch": 9.654316082248933, + "grad_norm": 0.21875, + "learning_rate": 2.7658686798450385e-05, + "loss": 0.4983, + "step": 194380 + }, + { + "epoch": 9.654812754544551, + "grad_norm": 0.2060546875, + "learning_rate": 2.7618953014800836e-05, + "loss": 0.4958, + "step": 194390 + }, + { + "epoch": 9.655309426840171, + "grad_norm": 0.205078125, + "learning_rate": 2.7579219231151287e-05, + "loss": 0.4882, + "step": 194400 + }, + { + "epoch": 9.65580609913579, + "grad_norm": 0.2294921875, + "learning_rate": 2.7539485447501738e-05, + "loss": 0.4729, + "step": 194410 + }, + { + "epoch": 9.65630277143141, + "grad_norm": 0.2021484375, + "learning_rate": 2.7499751663852192e-05, + "loss": 0.4555, + "step": 194420 + }, + { + "epoch": 9.65679944372703, + "grad_norm": 0.2158203125, + "learning_rate": 2.7460017880202643e-05, + "loss": 0.4827, + "step": 194430 + }, + { + "epoch": 9.657296116022648, + "grad_norm": 0.220703125, + "learning_rate": 2.7420284096553094e-05, + "loss": 0.5014, + "step": 194440 + }, + { + "epoch": 9.657792788318268, + "grad_norm": 0.197265625, + "learning_rate": 2.7380550312903546e-05, + "loss": 0.4615, + "step": 194450 + }, + { + "epoch": 9.658289460613886, + "grad_norm": 0.2314453125, + "learning_rate": 2.7340816529254e-05, + "loss": 0.4873, + "step": 194460 + }, + { + "epoch": 9.658786132909507, + "grad_norm": 0.20703125, + "learning_rate": 2.730108274560445e-05, + "loss": 0.4867, + "step": 194470 + }, + { + "epoch": 9.659282805205125, + "grad_norm": 0.208984375, + "learning_rate": 2.7261348961954902e-05, + "loss": 0.5168, + "step": 194480 + }, + { + "epoch": 9.659779477500745, + "grad_norm": 0.197265625, + "learning_rate": 2.7221615178305353e-05, + "loss": 0.4792, + "step": 194490 + }, + { + "epoch": 9.660276149796365, + "grad_norm": 0.2041015625, + "learning_rate": 2.7181881394655807e-05, + "loss": 0.4787, + "step": 194500 + }, + { + "epoch": 9.660772822091984, + "grad_norm": 0.2138671875, + "learning_rate": 2.714214761100626e-05, + "loss": 0.5178, + "step": 194510 + }, + { + "epoch": 9.661269494387604, + "grad_norm": 0.2333984375, + "learning_rate": 2.710241382735671e-05, + "loss": 0.4603, + "step": 194520 + }, + { + "epoch": 9.661766166683222, + "grad_norm": 0.22265625, + "learning_rate": 2.706268004370716e-05, + "loss": 0.5243, + "step": 194530 + }, + { + "epoch": 9.662262838978842, + "grad_norm": 0.2197265625, + "learning_rate": 2.7022946260057615e-05, + "loss": 0.4726, + "step": 194540 + }, + { + "epoch": 9.66275951127446, + "grad_norm": 0.21484375, + "learning_rate": 2.6983212476408066e-05, + "loss": 0.488, + "step": 194550 + }, + { + "epoch": 9.66325618357008, + "grad_norm": 0.2373046875, + "learning_rate": 2.6943478692758517e-05, + "loss": 0.5003, + "step": 194560 + }, + { + "epoch": 9.663752855865699, + "grad_norm": 0.2177734375, + "learning_rate": 2.6903744909108968e-05, + "loss": 0.4723, + "step": 194570 + }, + { + "epoch": 9.664249528161319, + "grad_norm": 0.1923828125, + "learning_rate": 2.686401112545942e-05, + "loss": 0.4933, + "step": 194580 + }, + { + "epoch": 9.664746200456939, + "grad_norm": 0.1865234375, + "learning_rate": 2.6824277341809873e-05, + "loss": 0.4836, + "step": 194590 + }, + { + "epoch": 9.665242872752557, + "grad_norm": 0.1923828125, + "learning_rate": 2.6784543558160324e-05, + "loss": 0.4715, + "step": 194600 + }, + { + "epoch": 9.665739545048178, + "grad_norm": 0.2138671875, + "learning_rate": 2.6744809774510782e-05, + "loss": 0.4682, + "step": 194610 + }, + { + "epoch": 9.666236217343796, + "grad_norm": 0.2265625, + "learning_rate": 2.6705075990861233e-05, + "loss": 0.5242, + "step": 194620 + }, + { + "epoch": 9.666732889639416, + "grad_norm": 0.2021484375, + "learning_rate": 2.6665342207211684e-05, + "loss": 0.5015, + "step": 194630 + }, + { + "epoch": 9.667229561935034, + "grad_norm": 0.2001953125, + "learning_rate": 2.662560842356214e-05, + "loss": 0.504, + "step": 194640 + }, + { + "epoch": 9.667726234230654, + "grad_norm": 0.2021484375, + "learning_rate": 2.658587463991259e-05, + "loss": 0.467, + "step": 194650 + }, + { + "epoch": 9.668222906526275, + "grad_norm": 0.20703125, + "learning_rate": 2.654614085626304e-05, + "loss": 0.4887, + "step": 194660 + }, + { + "epoch": 9.668719578821893, + "grad_norm": 0.1962890625, + "learning_rate": 2.6506407072613492e-05, + "loss": 0.4907, + "step": 194670 + }, + { + "epoch": 9.669216251117513, + "grad_norm": 0.234375, + "learning_rate": 2.6466673288963946e-05, + "loss": 0.5036, + "step": 194680 + }, + { + "epoch": 9.669712923413131, + "grad_norm": 0.25390625, + "learning_rate": 2.6426939505314397e-05, + "loss": 0.4953, + "step": 194690 + }, + { + "epoch": 9.670209595708751, + "grad_norm": 0.2099609375, + "learning_rate": 2.6387205721664848e-05, + "loss": 0.5137, + "step": 194700 + }, + { + "epoch": 9.67070626800437, + "grad_norm": 0.1865234375, + "learning_rate": 2.63474719380153e-05, + "loss": 0.4861, + "step": 194710 + }, + { + "epoch": 9.67120294029999, + "grad_norm": 0.205078125, + "learning_rate": 2.630773815436575e-05, + "loss": 0.4877, + "step": 194720 + }, + { + "epoch": 9.67169961259561, + "grad_norm": 0.208984375, + "learning_rate": 2.6268004370716205e-05, + "loss": 0.486, + "step": 194730 + }, + { + "epoch": 9.672196284891228, + "grad_norm": 0.1943359375, + "learning_rate": 2.6228270587066656e-05, + "loss": 0.4809, + "step": 194740 + }, + { + "epoch": 9.672692957186849, + "grad_norm": 0.201171875, + "learning_rate": 2.6188536803417107e-05, + "loss": 0.4748, + "step": 194750 + }, + { + "epoch": 9.673189629482467, + "grad_norm": 0.18359375, + "learning_rate": 2.6148803019767558e-05, + "loss": 0.4607, + "step": 194760 + }, + { + "epoch": 9.673686301778087, + "grad_norm": 0.2197265625, + "learning_rate": 2.6109069236118012e-05, + "loss": 0.4913, + "step": 194770 + }, + { + "epoch": 9.674182974073705, + "grad_norm": 0.22265625, + "learning_rate": 2.6069335452468463e-05, + "loss": 0.4711, + "step": 194780 + }, + { + "epoch": 9.674679646369325, + "grad_norm": 0.1962890625, + "learning_rate": 2.6029601668818914e-05, + "loss": 0.4707, + "step": 194790 + }, + { + "epoch": 9.675176318664946, + "grad_norm": 0.25, + "learning_rate": 2.5989867885169365e-05, + "loss": 0.48, + "step": 194800 + }, + { + "epoch": 9.675672990960564, + "grad_norm": 0.22265625, + "learning_rate": 2.595013410151982e-05, + "loss": 0.4536, + "step": 194810 + }, + { + "epoch": 9.676169663256184, + "grad_norm": 0.1982421875, + "learning_rate": 2.591040031787027e-05, + "loss": 0.4831, + "step": 194820 + }, + { + "epoch": 9.676666335551802, + "grad_norm": 0.20703125, + "learning_rate": 2.5870666534220722e-05, + "loss": 0.4773, + "step": 194830 + }, + { + "epoch": 9.677163007847422, + "grad_norm": 0.2080078125, + "learning_rate": 2.5830932750571173e-05, + "loss": 0.502, + "step": 194840 + }, + { + "epoch": 9.67765968014304, + "grad_norm": 0.23046875, + "learning_rate": 2.5791198966921627e-05, + "loss": 0.4914, + "step": 194850 + }, + { + "epoch": 9.678156352438661, + "grad_norm": 0.21484375, + "learning_rate": 2.5751465183272078e-05, + "loss": 0.4692, + "step": 194860 + }, + { + "epoch": 9.678653024734281, + "grad_norm": 0.21484375, + "learning_rate": 2.571173139962253e-05, + "loss": 0.476, + "step": 194870 + }, + { + "epoch": 9.6791496970299, + "grad_norm": 0.208984375, + "learning_rate": 2.567199761597298e-05, + "loss": 0.4852, + "step": 194880 + }, + { + "epoch": 9.67964636932552, + "grad_norm": 0.2119140625, + "learning_rate": 2.5632263832323435e-05, + "loss": 0.5134, + "step": 194890 + }, + { + "epoch": 9.680143041621138, + "grad_norm": 0.1943359375, + "learning_rate": 2.5592530048673886e-05, + "loss": 0.4981, + "step": 194900 + }, + { + "epoch": 9.680639713916758, + "grad_norm": 0.2236328125, + "learning_rate": 2.5552796265024337e-05, + "loss": 0.4656, + "step": 194910 + }, + { + "epoch": 9.681136386212376, + "grad_norm": 0.2421875, + "learning_rate": 2.5513062481374788e-05, + "loss": 0.4665, + "step": 194920 + }, + { + "epoch": 9.681633058507996, + "grad_norm": 0.2138671875, + "learning_rate": 2.5473328697725242e-05, + "loss": 0.5203, + "step": 194930 + }, + { + "epoch": 9.682129730803616, + "grad_norm": 0.205078125, + "learning_rate": 2.5433594914075693e-05, + "loss": 0.526, + "step": 194940 + }, + { + "epoch": 9.682626403099235, + "grad_norm": 0.2119140625, + "learning_rate": 2.5393861130426144e-05, + "loss": 0.5292, + "step": 194950 + }, + { + "epoch": 9.683123075394855, + "grad_norm": 0.23828125, + "learning_rate": 2.5354127346776595e-05, + "loss": 0.5179, + "step": 194960 + }, + { + "epoch": 9.683619747690473, + "grad_norm": 0.205078125, + "learning_rate": 2.5314393563127046e-05, + "loss": 0.4797, + "step": 194970 + }, + { + "epoch": 9.684116419986093, + "grad_norm": 0.21484375, + "learning_rate": 2.52746597794775e-05, + "loss": 0.4866, + "step": 194980 + }, + { + "epoch": 9.684613092281712, + "grad_norm": 0.2197265625, + "learning_rate": 2.5234925995827952e-05, + "loss": 0.5057, + "step": 194990 + }, + { + "epoch": 9.685109764577332, + "grad_norm": 0.2158203125, + "learning_rate": 2.5195192212178403e-05, + "loss": 0.5067, + "step": 195000 + }, + { + "epoch": 9.685606436872952, + "grad_norm": 0.2060546875, + "learning_rate": 2.5155458428528854e-05, + "loss": 0.4849, + "step": 195010 + }, + { + "epoch": 9.68610310916857, + "grad_norm": 0.2314453125, + "learning_rate": 2.5115724644879308e-05, + "loss": 0.4757, + "step": 195020 + }, + { + "epoch": 9.68659978146419, + "grad_norm": 0.203125, + "learning_rate": 2.5075990861229766e-05, + "loss": 0.4971, + "step": 195030 + }, + { + "epoch": 9.687096453759809, + "grad_norm": 0.1982421875, + "learning_rate": 2.5036257077580217e-05, + "loss": 0.5103, + "step": 195040 + }, + { + "epoch": 9.687593126055429, + "grad_norm": 0.216796875, + "learning_rate": 2.4996523293930665e-05, + "loss": 0.4784, + "step": 195050 + }, + { + "epoch": 9.688089798351047, + "grad_norm": 0.197265625, + "learning_rate": 2.4956789510281116e-05, + "loss": 0.501, + "step": 195060 + }, + { + "epoch": 9.688586470646667, + "grad_norm": 0.2041015625, + "learning_rate": 2.491705572663157e-05, + "loss": 0.46, + "step": 195070 + }, + { + "epoch": 9.689083142942287, + "grad_norm": 0.2109375, + "learning_rate": 2.487732194298202e-05, + "loss": 0.5057, + "step": 195080 + }, + { + "epoch": 9.689579815237906, + "grad_norm": 0.2294921875, + "learning_rate": 2.4837588159332472e-05, + "loss": 0.5015, + "step": 195090 + }, + { + "epoch": 9.690076487533526, + "grad_norm": 0.2119140625, + "learning_rate": 2.4797854375682923e-05, + "loss": 0.4884, + "step": 195100 + }, + { + "epoch": 9.690573159829144, + "grad_norm": 0.193359375, + "learning_rate": 2.4758120592033378e-05, + "loss": 0.5, + "step": 195110 + }, + { + "epoch": 9.691069832124764, + "grad_norm": 0.19921875, + "learning_rate": 2.471838680838383e-05, + "loss": 0.496, + "step": 195120 + }, + { + "epoch": 9.691566504420383, + "grad_norm": 0.19921875, + "learning_rate": 2.4678653024734283e-05, + "loss": 0.5178, + "step": 195130 + }, + { + "epoch": 9.692063176716003, + "grad_norm": 0.2578125, + "learning_rate": 2.4638919241084734e-05, + "loss": 0.4895, + "step": 195140 + }, + { + "epoch": 9.692559849011623, + "grad_norm": 0.201171875, + "learning_rate": 2.4599185457435185e-05, + "loss": 0.4871, + "step": 195150 + }, + { + "epoch": 9.693056521307241, + "grad_norm": 0.1982421875, + "learning_rate": 2.455945167378564e-05, + "loss": 0.5003, + "step": 195160 + }, + { + "epoch": 9.693553193602861, + "grad_norm": 0.201171875, + "learning_rate": 2.451971789013609e-05, + "loss": 0.4816, + "step": 195170 + }, + { + "epoch": 9.69404986589848, + "grad_norm": 0.19921875, + "learning_rate": 2.447998410648654e-05, + "loss": 0.4768, + "step": 195180 + }, + { + "epoch": 9.6945465381941, + "grad_norm": 0.177734375, + "learning_rate": 2.4440250322836993e-05, + "loss": 0.4494, + "step": 195190 + }, + { + "epoch": 9.695043210489718, + "grad_norm": 0.1962890625, + "learning_rate": 2.4400516539187447e-05, + "loss": 0.4537, + "step": 195200 + }, + { + "epoch": 9.695539882785338, + "grad_norm": 0.1943359375, + "learning_rate": 2.4360782755537898e-05, + "loss": 0.4809, + "step": 195210 + }, + { + "epoch": 9.696036555080958, + "grad_norm": 0.2041015625, + "learning_rate": 2.432104897188835e-05, + "loss": 0.46, + "step": 195220 + }, + { + "epoch": 9.696533227376577, + "grad_norm": 0.197265625, + "learning_rate": 2.42813151882388e-05, + "loss": 0.4753, + "step": 195230 + }, + { + "epoch": 9.697029899672197, + "grad_norm": 0.2080078125, + "learning_rate": 2.4241581404589255e-05, + "loss": 0.4959, + "step": 195240 + }, + { + "epoch": 9.697526571967815, + "grad_norm": 0.205078125, + "learning_rate": 2.4201847620939706e-05, + "loss": 0.4751, + "step": 195250 + }, + { + "epoch": 9.698023244263435, + "grad_norm": 0.212890625, + "learning_rate": 2.4162113837290157e-05, + "loss": 0.5048, + "step": 195260 + }, + { + "epoch": 9.698519916559054, + "grad_norm": 0.197265625, + "learning_rate": 2.4122380053640608e-05, + "loss": 0.4776, + "step": 195270 + }, + { + "epoch": 9.699016588854674, + "grad_norm": 0.1953125, + "learning_rate": 2.4082646269991062e-05, + "loss": 0.4882, + "step": 195280 + }, + { + "epoch": 9.699513261150294, + "grad_norm": 0.208984375, + "learning_rate": 2.4042912486341513e-05, + "loss": 0.4751, + "step": 195290 + }, + { + "epoch": 9.700009933445912, + "grad_norm": 0.208984375, + "learning_rate": 2.4003178702691964e-05, + "loss": 0.4944, + "step": 195300 + }, + { + "epoch": 9.700506605741532, + "grad_norm": 0.2412109375, + "learning_rate": 2.3963444919042415e-05, + "loss": 0.5295, + "step": 195310 + }, + { + "epoch": 9.70100327803715, + "grad_norm": 0.220703125, + "learning_rate": 2.3923711135392866e-05, + "loss": 0.5129, + "step": 195320 + }, + { + "epoch": 9.70149995033277, + "grad_norm": 0.216796875, + "learning_rate": 2.388397735174332e-05, + "loss": 0.481, + "step": 195330 + }, + { + "epoch": 9.70199662262839, + "grad_norm": 0.1962890625, + "learning_rate": 2.3844243568093775e-05, + "loss": 0.4714, + "step": 195340 + }, + { + "epoch": 9.70249329492401, + "grad_norm": 0.212890625, + "learning_rate": 2.3804509784444226e-05, + "loss": 0.5126, + "step": 195350 + }, + { + "epoch": 9.70298996721963, + "grad_norm": 0.2265625, + "learning_rate": 2.3764776000794677e-05, + "loss": 0.4717, + "step": 195360 + }, + { + "epoch": 9.703486639515248, + "grad_norm": 0.2138671875, + "learning_rate": 2.3725042217145128e-05, + "loss": 0.5067, + "step": 195370 + }, + { + "epoch": 9.703983311810868, + "grad_norm": 0.1962890625, + "learning_rate": 2.3685308433495582e-05, + "loss": 0.4728, + "step": 195380 + }, + { + "epoch": 9.704479984106486, + "grad_norm": 0.2158203125, + "learning_rate": 2.3645574649846034e-05, + "loss": 0.4708, + "step": 195390 + }, + { + "epoch": 9.704976656402106, + "grad_norm": 0.2119140625, + "learning_rate": 2.3605840866196485e-05, + "loss": 0.4978, + "step": 195400 + }, + { + "epoch": 9.705473328697725, + "grad_norm": 0.19921875, + "learning_rate": 2.3566107082546936e-05, + "loss": 0.491, + "step": 195410 + }, + { + "epoch": 9.705970000993345, + "grad_norm": 0.1953125, + "learning_rate": 2.352637329889739e-05, + "loss": 0.5006, + "step": 195420 + }, + { + "epoch": 9.706466673288965, + "grad_norm": 0.197265625, + "learning_rate": 2.348663951524784e-05, + "loss": 0.4791, + "step": 195430 + }, + { + "epoch": 9.706963345584583, + "grad_norm": 0.22265625, + "learning_rate": 2.3446905731598292e-05, + "loss": 0.5047, + "step": 195440 + }, + { + "epoch": 9.707460017880203, + "grad_norm": 0.20703125, + "learning_rate": 2.3407171947948743e-05, + "loss": 0.4874, + "step": 195450 + }, + { + "epoch": 9.707956690175822, + "grad_norm": 0.2119140625, + "learning_rate": 2.3367438164299197e-05, + "loss": 0.4617, + "step": 195460 + }, + { + "epoch": 9.708453362471442, + "grad_norm": 0.20703125, + "learning_rate": 2.332770438064965e-05, + "loss": 0.4805, + "step": 195470 + }, + { + "epoch": 9.70895003476706, + "grad_norm": 0.2578125, + "learning_rate": 2.32879705970001e-05, + "loss": 0.5012, + "step": 195480 + }, + { + "epoch": 9.70944670706268, + "grad_norm": 0.19140625, + "learning_rate": 2.324823681335055e-05, + "loss": 0.4576, + "step": 195490 + }, + { + "epoch": 9.7099433793583, + "grad_norm": 0.2080078125, + "learning_rate": 2.3208503029701005e-05, + "loss": 0.4666, + "step": 195500 + }, + { + "epoch": 9.710440051653919, + "grad_norm": 0.1943359375, + "learning_rate": 2.3168769246051456e-05, + "loss": 0.4796, + "step": 195510 + }, + { + "epoch": 9.710936723949539, + "grad_norm": 0.2021484375, + "learning_rate": 2.3129035462401907e-05, + "loss": 0.5225, + "step": 195520 + }, + { + "epoch": 9.711433396245157, + "grad_norm": 0.2119140625, + "learning_rate": 2.3089301678752358e-05, + "loss": 0.5004, + "step": 195530 + }, + { + "epoch": 9.711930068540777, + "grad_norm": 0.20703125, + "learning_rate": 2.3049567895102812e-05, + "loss": 0.4764, + "step": 195540 + }, + { + "epoch": 9.712426740836396, + "grad_norm": 0.18359375, + "learning_rate": 2.3009834111453267e-05, + "loss": 0.496, + "step": 195550 + }, + { + "epoch": 9.712923413132016, + "grad_norm": 0.232421875, + "learning_rate": 2.2970100327803718e-05, + "loss": 0.4956, + "step": 195560 + }, + { + "epoch": 9.713420085427634, + "grad_norm": 0.2197265625, + "learning_rate": 2.293036654415417e-05, + "loss": 0.4903, + "step": 195570 + }, + { + "epoch": 9.713916757723254, + "grad_norm": 0.2138671875, + "learning_rate": 2.289063276050462e-05, + "loss": 0.4937, + "step": 195580 + }, + { + "epoch": 9.714413430018874, + "grad_norm": 0.259765625, + "learning_rate": 2.2850898976855074e-05, + "loss": 0.5052, + "step": 195590 + }, + { + "epoch": 9.714910102314493, + "grad_norm": 0.2119140625, + "learning_rate": 2.2811165193205525e-05, + "loss": 0.514, + "step": 195600 + }, + { + "epoch": 9.715406774610113, + "grad_norm": 0.2392578125, + "learning_rate": 2.2771431409555976e-05, + "loss": 0.5074, + "step": 195610 + }, + { + "epoch": 9.715903446905731, + "grad_norm": 0.23046875, + "learning_rate": 2.2731697625906427e-05, + "loss": 0.4891, + "step": 195620 + }, + { + "epoch": 9.716400119201351, + "grad_norm": 0.240234375, + "learning_rate": 2.2691963842256882e-05, + "loss": 0.4955, + "step": 195630 + }, + { + "epoch": 9.71689679149697, + "grad_norm": 0.2060546875, + "learning_rate": 2.2652230058607333e-05, + "loss": 0.5105, + "step": 195640 + }, + { + "epoch": 9.71739346379259, + "grad_norm": 0.2236328125, + "learning_rate": 2.2612496274957784e-05, + "loss": 0.4582, + "step": 195650 + }, + { + "epoch": 9.71789013608821, + "grad_norm": 0.19140625, + "learning_rate": 2.2572762491308235e-05, + "loss": 0.4924, + "step": 195660 + }, + { + "epoch": 9.718386808383828, + "grad_norm": 0.22265625, + "learning_rate": 2.2533028707658686e-05, + "loss": 0.4607, + "step": 195670 + }, + { + "epoch": 9.718883480679448, + "grad_norm": 0.19921875, + "learning_rate": 2.249329492400914e-05, + "loss": 0.5146, + "step": 195680 + }, + { + "epoch": 9.719380152975067, + "grad_norm": 0.2080078125, + "learning_rate": 2.245356114035959e-05, + "loss": 0.4757, + "step": 195690 + }, + { + "epoch": 9.719876825270687, + "grad_norm": 0.2041015625, + "learning_rate": 2.2413827356710042e-05, + "loss": 0.469, + "step": 195700 + }, + { + "epoch": 9.720373497566305, + "grad_norm": 0.2099609375, + "learning_rate": 2.2374093573060493e-05, + "loss": 0.4821, + "step": 195710 + }, + { + "epoch": 9.720870169861925, + "grad_norm": 0.2197265625, + "learning_rate": 2.2334359789410948e-05, + "loss": 0.5017, + "step": 195720 + }, + { + "epoch": 9.721366842157545, + "grad_norm": 0.212890625, + "learning_rate": 2.22946260057614e-05, + "loss": 0.5143, + "step": 195730 + }, + { + "epoch": 9.721863514453164, + "grad_norm": 0.19921875, + "learning_rate": 2.225489222211185e-05, + "loss": 0.5029, + "step": 195740 + }, + { + "epoch": 9.722360186748784, + "grad_norm": 0.2109375, + "learning_rate": 2.22151584384623e-05, + "loss": 0.4939, + "step": 195750 + }, + { + "epoch": 9.722856859044402, + "grad_norm": 0.2099609375, + "learning_rate": 2.2175424654812755e-05, + "loss": 0.4829, + "step": 195760 + }, + { + "epoch": 9.723353531340022, + "grad_norm": 0.19921875, + "learning_rate": 2.213569087116321e-05, + "loss": 0.4811, + "step": 195770 + }, + { + "epoch": 9.72385020363564, + "grad_norm": 0.2041015625, + "learning_rate": 2.209595708751366e-05, + "loss": 0.4619, + "step": 195780 + }, + { + "epoch": 9.72434687593126, + "grad_norm": 0.1884765625, + "learning_rate": 2.2056223303864112e-05, + "loss": 0.4848, + "step": 195790 + }, + { + "epoch": 9.72484354822688, + "grad_norm": 0.1953125, + "learning_rate": 2.2016489520214563e-05, + "loss": 0.4892, + "step": 195800 + }, + { + "epoch": 9.725340220522499, + "grad_norm": 0.2109375, + "learning_rate": 2.1976755736565017e-05, + "loss": 0.4849, + "step": 195810 + }, + { + "epoch": 9.72583689281812, + "grad_norm": 0.1923828125, + "learning_rate": 2.193702195291547e-05, + "loss": 0.4736, + "step": 195820 + }, + { + "epoch": 9.726333565113737, + "grad_norm": 0.21875, + "learning_rate": 2.189728816926592e-05, + "loss": 0.5, + "step": 195830 + }, + { + "epoch": 9.726830237409358, + "grad_norm": 0.2265625, + "learning_rate": 2.185755438561637e-05, + "loss": 0.5078, + "step": 195840 + }, + { + "epoch": 9.727326909704976, + "grad_norm": 0.279296875, + "learning_rate": 2.1817820601966825e-05, + "loss": 0.4833, + "step": 195850 + }, + { + "epoch": 9.727823582000596, + "grad_norm": 0.2177734375, + "learning_rate": 2.1778086818317276e-05, + "loss": 0.5043, + "step": 195860 + }, + { + "epoch": 9.728320254296216, + "grad_norm": 0.2421875, + "learning_rate": 2.1738353034667727e-05, + "loss": 0.4861, + "step": 195870 + }, + { + "epoch": 9.728816926591835, + "grad_norm": 0.2216796875, + "learning_rate": 2.1698619251018178e-05, + "loss": 0.4772, + "step": 195880 + }, + { + "epoch": 9.729313598887455, + "grad_norm": 0.212890625, + "learning_rate": 2.1658885467368632e-05, + "loss": 0.5058, + "step": 195890 + }, + { + "epoch": 9.729810271183073, + "grad_norm": 0.193359375, + "learning_rate": 2.1619151683719083e-05, + "loss": 0.507, + "step": 195900 + }, + { + "epoch": 9.730306943478693, + "grad_norm": 0.2294921875, + "learning_rate": 2.1579417900069534e-05, + "loss": 0.4756, + "step": 195910 + }, + { + "epoch": 9.730803615774311, + "grad_norm": 0.2177734375, + "learning_rate": 2.1539684116419985e-05, + "loss": 0.4806, + "step": 195920 + }, + { + "epoch": 9.731300288069932, + "grad_norm": 0.205078125, + "learning_rate": 2.149995033277044e-05, + "loss": 0.4856, + "step": 195930 + }, + { + "epoch": 9.731796960365552, + "grad_norm": 0.18359375, + "learning_rate": 2.146021654912089e-05, + "loss": 0.4852, + "step": 195940 + }, + { + "epoch": 9.73229363266117, + "grad_norm": 0.197265625, + "learning_rate": 2.1420482765471342e-05, + "loss": 0.5089, + "step": 195950 + }, + { + "epoch": 9.73279030495679, + "grad_norm": 0.21484375, + "learning_rate": 2.1380748981821793e-05, + "loss": 0.4961, + "step": 195960 + }, + { + "epoch": 9.733286977252408, + "grad_norm": 0.2099609375, + "learning_rate": 2.1341015198172244e-05, + "loss": 0.4767, + "step": 195970 + }, + { + "epoch": 9.733783649548029, + "grad_norm": 0.1806640625, + "learning_rate": 2.1301281414522702e-05, + "loss": 0.4586, + "step": 195980 + }, + { + "epoch": 9.734280321843647, + "grad_norm": 0.2265625, + "learning_rate": 2.1261547630873153e-05, + "loss": 0.4875, + "step": 195990 + }, + { + "epoch": 9.734776994139267, + "grad_norm": 0.251953125, + "learning_rate": 2.1221813847223604e-05, + "loss": 0.5347, + "step": 196000 + }, + { + "epoch": 9.735273666434885, + "grad_norm": 0.2265625, + "learning_rate": 2.1182080063574055e-05, + "loss": 0.4966, + "step": 196010 + }, + { + "epoch": 9.735770338730505, + "grad_norm": 0.18359375, + "learning_rate": 2.114234627992451e-05, + "loss": 0.4585, + "step": 196020 + }, + { + "epoch": 9.736267011026126, + "grad_norm": 0.25, + "learning_rate": 2.110261249627496e-05, + "loss": 0.4704, + "step": 196030 + }, + { + "epoch": 9.736763683321744, + "grad_norm": 0.2255859375, + "learning_rate": 2.106287871262541e-05, + "loss": 0.4908, + "step": 196040 + }, + { + "epoch": 9.737260355617364, + "grad_norm": 0.2109375, + "learning_rate": 2.1023144928975862e-05, + "loss": 0.4916, + "step": 196050 + }, + { + "epoch": 9.737757027912982, + "grad_norm": 0.2177734375, + "learning_rate": 2.0983411145326313e-05, + "loss": 0.5012, + "step": 196060 + }, + { + "epoch": 9.738253700208602, + "grad_norm": 0.224609375, + "learning_rate": 2.0943677361676768e-05, + "loss": 0.4638, + "step": 196070 + }, + { + "epoch": 9.73875037250422, + "grad_norm": 0.1982421875, + "learning_rate": 2.090394357802722e-05, + "loss": 0.4841, + "step": 196080 + }, + { + "epoch": 9.739247044799841, + "grad_norm": 0.19921875, + "learning_rate": 2.086420979437767e-05, + "loss": 0.4794, + "step": 196090 + }, + { + "epoch": 9.739743717095461, + "grad_norm": 0.26171875, + "learning_rate": 2.082447601072812e-05, + "loss": 0.5137, + "step": 196100 + }, + { + "epoch": 9.74024038939108, + "grad_norm": 0.1943359375, + "learning_rate": 2.0784742227078575e-05, + "loss": 0.4988, + "step": 196110 + }, + { + "epoch": 9.7407370616867, + "grad_norm": 0.2255859375, + "learning_rate": 2.0745008443429026e-05, + "loss": 0.4938, + "step": 196120 + }, + { + "epoch": 9.741233733982318, + "grad_norm": 0.310546875, + "learning_rate": 2.0705274659779477e-05, + "loss": 0.4864, + "step": 196130 + }, + { + "epoch": 9.741730406277938, + "grad_norm": 0.20703125, + "learning_rate": 2.0665540876129928e-05, + "loss": 0.4936, + "step": 196140 + }, + { + "epoch": 9.742227078573556, + "grad_norm": 0.2099609375, + "learning_rate": 2.0625807092480383e-05, + "loss": 0.4587, + "step": 196150 + }, + { + "epoch": 9.742723750869176, + "grad_norm": 0.197265625, + "learning_rate": 2.0586073308830834e-05, + "loss": 0.4832, + "step": 196160 + }, + { + "epoch": 9.743220423164797, + "grad_norm": 0.203125, + "learning_rate": 2.0546339525181285e-05, + "loss": 0.5016, + "step": 196170 + }, + { + "epoch": 9.743717095460415, + "grad_norm": 0.2119140625, + "learning_rate": 2.0506605741531736e-05, + "loss": 0.4711, + "step": 196180 + }, + { + "epoch": 9.744213767756035, + "grad_norm": 0.1943359375, + "learning_rate": 2.046687195788219e-05, + "loss": 0.48, + "step": 196190 + }, + { + "epoch": 9.744710440051653, + "grad_norm": 0.2314453125, + "learning_rate": 2.0427138174232645e-05, + "loss": 0.4777, + "step": 196200 + }, + { + "epoch": 9.745207112347273, + "grad_norm": 0.1982421875, + "learning_rate": 2.0387404390583096e-05, + "loss": 0.4614, + "step": 196210 + }, + { + "epoch": 9.745703784642892, + "grad_norm": 0.216796875, + "learning_rate": 2.0347670606933547e-05, + "loss": 0.5297, + "step": 196220 + }, + { + "epoch": 9.746200456938512, + "grad_norm": 0.21875, + "learning_rate": 2.0307936823283998e-05, + "loss": 0.5064, + "step": 196230 + }, + { + "epoch": 9.746697129234132, + "grad_norm": 0.203125, + "learning_rate": 2.0268203039634452e-05, + "loss": 0.4925, + "step": 196240 + }, + { + "epoch": 9.74719380152975, + "grad_norm": 0.212890625, + "learning_rate": 2.0228469255984903e-05, + "loss": 0.4921, + "step": 196250 + }, + { + "epoch": 9.74769047382537, + "grad_norm": 0.2236328125, + "learning_rate": 2.0188735472335354e-05, + "loss": 0.4602, + "step": 196260 + }, + { + "epoch": 9.748187146120989, + "grad_norm": 0.2177734375, + "learning_rate": 2.0149001688685805e-05, + "loss": 0.4966, + "step": 196270 + }, + { + "epoch": 9.748683818416609, + "grad_norm": 0.2021484375, + "learning_rate": 2.010926790503626e-05, + "loss": 0.4766, + "step": 196280 + }, + { + "epoch": 9.749180490712227, + "grad_norm": 0.2099609375, + "learning_rate": 2.006953412138671e-05, + "loss": 0.4918, + "step": 196290 + }, + { + "epoch": 9.749677163007847, + "grad_norm": 0.1982421875, + "learning_rate": 2.002980033773716e-05, + "loss": 0.5133, + "step": 196300 + }, + { + "epoch": 9.750173835303467, + "grad_norm": 0.2021484375, + "learning_rate": 1.9990066554087613e-05, + "loss": 0.4892, + "step": 196310 + }, + { + "epoch": 9.750670507599086, + "grad_norm": 0.208984375, + "learning_rate": 1.9950332770438067e-05, + "loss": 0.4952, + "step": 196320 + }, + { + "epoch": 9.751167179894706, + "grad_norm": 0.193359375, + "learning_rate": 1.9910598986788518e-05, + "loss": 0.5125, + "step": 196330 + }, + { + "epoch": 9.751663852190324, + "grad_norm": 0.2216796875, + "learning_rate": 1.987086520313897e-05, + "loss": 0.5006, + "step": 196340 + }, + { + "epoch": 9.752160524485944, + "grad_norm": 0.1943359375, + "learning_rate": 1.983113141948942e-05, + "loss": 0.5008, + "step": 196350 + }, + { + "epoch": 9.752657196781563, + "grad_norm": 0.2041015625, + "learning_rate": 1.979139763583987e-05, + "loss": 0.4845, + "step": 196360 + }, + { + "epoch": 9.753153869077183, + "grad_norm": 0.2197265625, + "learning_rate": 1.9751663852190326e-05, + "loss": 0.4881, + "step": 196370 + }, + { + "epoch": 9.753650541372803, + "grad_norm": 0.2197265625, + "learning_rate": 1.9711930068540777e-05, + "loss": 0.5026, + "step": 196380 + }, + { + "epoch": 9.754147213668421, + "grad_norm": 0.1982421875, + "learning_rate": 1.9672196284891228e-05, + "loss": 0.4911, + "step": 196390 + }, + { + "epoch": 9.754643885964041, + "grad_norm": 0.1904296875, + "learning_rate": 1.9632462501241682e-05, + "loss": 0.467, + "step": 196400 + }, + { + "epoch": 9.75514055825966, + "grad_norm": 0.20703125, + "learning_rate": 1.9592728717592133e-05, + "loss": 0.47, + "step": 196410 + }, + { + "epoch": 9.75563723055528, + "grad_norm": 0.2177734375, + "learning_rate": 1.9552994933942588e-05, + "loss": 0.4833, + "step": 196420 + }, + { + "epoch": 9.756133902850898, + "grad_norm": 0.197265625, + "learning_rate": 1.951326115029304e-05, + "loss": 0.4716, + "step": 196430 + }, + { + "epoch": 9.756630575146518, + "grad_norm": 0.234375, + "learning_rate": 1.947352736664349e-05, + "loss": 0.5068, + "step": 196440 + }, + { + "epoch": 9.757127247442138, + "grad_norm": 0.212890625, + "learning_rate": 1.943379358299394e-05, + "loss": 0.4756, + "step": 196450 + }, + { + "epoch": 9.757623919737757, + "grad_norm": 0.236328125, + "learning_rate": 1.9394059799344395e-05, + "loss": 0.4787, + "step": 196460 + }, + { + "epoch": 9.758120592033377, + "grad_norm": 0.19921875, + "learning_rate": 1.9354326015694846e-05, + "loss": 0.5143, + "step": 196470 + }, + { + "epoch": 9.758617264328995, + "grad_norm": 0.203125, + "learning_rate": 1.9314592232045297e-05, + "loss": 0.4796, + "step": 196480 + }, + { + "epoch": 9.759113936624615, + "grad_norm": 0.208984375, + "learning_rate": 1.9274858448395748e-05, + "loss": 0.4623, + "step": 196490 + }, + { + "epoch": 9.759610608920234, + "grad_norm": 0.2490234375, + "learning_rate": 1.9235124664746203e-05, + "loss": 0.5225, + "step": 196500 + }, + { + "epoch": 9.760107281215854, + "grad_norm": 0.2021484375, + "learning_rate": 1.9195390881096654e-05, + "loss": 0.5526, + "step": 196510 + }, + { + "epoch": 9.760603953511474, + "grad_norm": 0.21875, + "learning_rate": 1.9155657097447105e-05, + "loss": 0.4987, + "step": 196520 + }, + { + "epoch": 9.761100625807092, + "grad_norm": 0.2021484375, + "learning_rate": 1.9115923313797556e-05, + "loss": 0.4952, + "step": 196530 + }, + { + "epoch": 9.761597298102712, + "grad_norm": 0.25390625, + "learning_rate": 1.907618953014801e-05, + "loss": 0.5011, + "step": 196540 + }, + { + "epoch": 9.76209397039833, + "grad_norm": 0.236328125, + "learning_rate": 1.903645574649846e-05, + "loss": 0.4748, + "step": 196550 + }, + { + "epoch": 9.76259064269395, + "grad_norm": 0.2451171875, + "learning_rate": 1.8996721962848912e-05, + "loss": 0.5095, + "step": 196560 + }, + { + "epoch": 9.76308731498957, + "grad_norm": 0.205078125, + "learning_rate": 1.8956988179199363e-05, + "loss": 0.5107, + "step": 196570 + }, + { + "epoch": 9.76358398728519, + "grad_norm": 0.2001953125, + "learning_rate": 1.8917254395549818e-05, + "loss": 0.5026, + "step": 196580 + }, + { + "epoch": 9.76408065958081, + "grad_norm": 0.216796875, + "learning_rate": 1.887752061190027e-05, + "loss": 0.512, + "step": 196590 + }, + { + "epoch": 9.764577331876428, + "grad_norm": 0.2431640625, + "learning_rate": 1.883778682825072e-05, + "loss": 0.4859, + "step": 196600 + }, + { + "epoch": 9.765074004172048, + "grad_norm": 0.19921875, + "learning_rate": 1.8798053044601174e-05, + "loss": 0.4648, + "step": 196610 + }, + { + "epoch": 9.765570676467666, + "grad_norm": 0.21875, + "learning_rate": 1.8758319260951625e-05, + "loss": 0.4673, + "step": 196620 + }, + { + "epoch": 9.766067348763286, + "grad_norm": 0.1884765625, + "learning_rate": 1.871858547730208e-05, + "loss": 0.4805, + "step": 196630 + }, + { + "epoch": 9.766564021058905, + "grad_norm": 0.2041015625, + "learning_rate": 1.867885169365253e-05, + "loss": 0.4962, + "step": 196640 + }, + { + "epoch": 9.767060693354525, + "grad_norm": 0.2109375, + "learning_rate": 1.863911791000298e-05, + "loss": 0.4537, + "step": 196650 + }, + { + "epoch": 9.767557365650145, + "grad_norm": 0.203125, + "learning_rate": 1.8599384126353433e-05, + "loss": 0.484, + "step": 196660 + }, + { + "epoch": 9.768054037945763, + "grad_norm": 0.2060546875, + "learning_rate": 1.8559650342703887e-05, + "loss": 0.4855, + "step": 196670 + }, + { + "epoch": 9.768550710241383, + "grad_norm": 0.2216796875, + "learning_rate": 1.8519916559054338e-05, + "loss": 0.4983, + "step": 196680 + }, + { + "epoch": 9.769047382537002, + "grad_norm": 0.216796875, + "learning_rate": 1.848018277540479e-05, + "loss": 0.4894, + "step": 196690 + }, + { + "epoch": 9.769544054832622, + "grad_norm": 0.2109375, + "learning_rate": 1.844044899175524e-05, + "loss": 0.4924, + "step": 196700 + }, + { + "epoch": 9.77004072712824, + "grad_norm": 0.1953125, + "learning_rate": 1.840071520810569e-05, + "loss": 0.496, + "step": 196710 + }, + { + "epoch": 9.77053739942386, + "grad_norm": 0.2041015625, + "learning_rate": 1.8360981424456145e-05, + "loss": 0.4968, + "step": 196720 + }, + { + "epoch": 9.77103407171948, + "grad_norm": 0.1943359375, + "learning_rate": 1.8321247640806596e-05, + "loss": 0.4638, + "step": 196730 + }, + { + "epoch": 9.771530744015099, + "grad_norm": 0.2294921875, + "learning_rate": 1.8281513857157048e-05, + "loss": 0.5139, + "step": 196740 + }, + { + "epoch": 9.772027416310719, + "grad_norm": 0.2109375, + "learning_rate": 1.82417800735075e-05, + "loss": 0.4681, + "step": 196750 + }, + { + "epoch": 9.772524088606337, + "grad_norm": 0.232421875, + "learning_rate": 1.8202046289857953e-05, + "loss": 0.48, + "step": 196760 + }, + { + "epoch": 9.773020760901957, + "grad_norm": 0.279296875, + "learning_rate": 1.8162312506208404e-05, + "loss": 0.4901, + "step": 196770 + }, + { + "epoch": 9.773517433197576, + "grad_norm": 0.2177734375, + "learning_rate": 1.8122578722558855e-05, + "loss": 0.5155, + "step": 196780 + }, + { + "epoch": 9.774014105493196, + "grad_norm": 0.2294921875, + "learning_rate": 1.8082844938909306e-05, + "loss": 0.4819, + "step": 196790 + }, + { + "epoch": 9.774510777788816, + "grad_norm": 0.203125, + "learning_rate": 1.804311115525976e-05, + "loss": 0.4388, + "step": 196800 + }, + { + "epoch": 9.775007450084434, + "grad_norm": 0.197265625, + "learning_rate": 1.800337737161021e-05, + "loss": 0.5005, + "step": 196810 + }, + { + "epoch": 9.775504122380054, + "grad_norm": 0.2421875, + "learning_rate": 1.7963643587960666e-05, + "loss": 0.4617, + "step": 196820 + }, + { + "epoch": 9.776000794675673, + "grad_norm": 0.208984375, + "learning_rate": 1.7923909804311117e-05, + "loss": 0.468, + "step": 196830 + }, + { + "epoch": 9.776497466971293, + "grad_norm": 0.201171875, + "learning_rate": 1.7884176020661568e-05, + "loss": 0.5165, + "step": 196840 + }, + { + "epoch": 9.776994139266911, + "grad_norm": 0.2236328125, + "learning_rate": 1.7844442237012022e-05, + "loss": 0.4809, + "step": 196850 + }, + { + "epoch": 9.777490811562531, + "grad_norm": 0.2099609375, + "learning_rate": 1.7804708453362473e-05, + "loss": 0.4636, + "step": 196860 + }, + { + "epoch": 9.777987483858151, + "grad_norm": 0.212890625, + "learning_rate": 1.7764974669712924e-05, + "loss": 0.482, + "step": 196870 + }, + { + "epoch": 9.77848415615377, + "grad_norm": 0.1943359375, + "learning_rate": 1.7725240886063375e-05, + "loss": 0.4925, + "step": 196880 + }, + { + "epoch": 9.77898082844939, + "grad_norm": 0.2041015625, + "learning_rate": 1.768550710241383e-05, + "loss": 0.4965, + "step": 196890 + }, + { + "epoch": 9.779477500745008, + "grad_norm": 0.193359375, + "learning_rate": 1.764577331876428e-05, + "loss": 0.4706, + "step": 196900 + }, + { + "epoch": 9.779974173040628, + "grad_norm": 0.23828125, + "learning_rate": 1.7606039535114732e-05, + "loss": 0.492, + "step": 196910 + }, + { + "epoch": 9.780470845336247, + "grad_norm": 0.2392578125, + "learning_rate": 1.7566305751465183e-05, + "loss": 0.4736, + "step": 196920 + }, + { + "epoch": 9.780967517631867, + "grad_norm": 0.2041015625, + "learning_rate": 1.7526571967815637e-05, + "loss": 0.5131, + "step": 196930 + }, + { + "epoch": 9.781464189927487, + "grad_norm": 0.2099609375, + "learning_rate": 1.748683818416609e-05, + "loss": 0.464, + "step": 196940 + }, + { + "epoch": 9.781960862223105, + "grad_norm": 0.19921875, + "learning_rate": 1.744710440051654e-05, + "loss": 0.4476, + "step": 196950 + }, + { + "epoch": 9.782457534518725, + "grad_norm": 0.203125, + "learning_rate": 1.740737061686699e-05, + "loss": 0.496, + "step": 196960 + }, + { + "epoch": 9.782954206814344, + "grad_norm": 0.203125, + "learning_rate": 1.7367636833217445e-05, + "loss": 0.472, + "step": 196970 + }, + { + "epoch": 9.783450879109964, + "grad_norm": 0.2490234375, + "learning_rate": 1.7327903049567896e-05, + "loss": 0.4679, + "step": 196980 + }, + { + "epoch": 9.783947551405582, + "grad_norm": 0.2041015625, + "learning_rate": 1.7288169265918347e-05, + "loss": 0.4861, + "step": 196990 + }, + { + "epoch": 9.784444223701202, + "grad_norm": 0.189453125, + "learning_rate": 1.7248435482268798e-05, + "loss": 0.4758, + "step": 197000 + }, + { + "epoch": 9.78494089599682, + "grad_norm": 0.2080078125, + "learning_rate": 1.720870169861925e-05, + "loss": 0.4768, + "step": 197010 + }, + { + "epoch": 9.78543756829244, + "grad_norm": 0.220703125, + "learning_rate": 1.7168967914969703e-05, + "loss": 0.4772, + "step": 197020 + }, + { + "epoch": 9.78593424058806, + "grad_norm": 0.216796875, + "learning_rate": 1.7129234131320158e-05, + "loss": 0.4638, + "step": 197030 + }, + { + "epoch": 9.786430912883679, + "grad_norm": 0.2353515625, + "learning_rate": 1.708950034767061e-05, + "loss": 0.4954, + "step": 197040 + }, + { + "epoch": 9.7869275851793, + "grad_norm": 0.22265625, + "learning_rate": 1.704976656402106e-05, + "loss": 0.4877, + "step": 197050 + }, + { + "epoch": 9.787424257474918, + "grad_norm": 0.212890625, + "learning_rate": 1.7010032780371514e-05, + "loss": 0.4782, + "step": 197060 + }, + { + "epoch": 9.787920929770538, + "grad_norm": 0.2138671875, + "learning_rate": 1.6970298996721965e-05, + "loss": 0.4893, + "step": 197070 + }, + { + "epoch": 9.788417602066156, + "grad_norm": 0.234375, + "learning_rate": 1.6930565213072416e-05, + "loss": 0.4931, + "step": 197080 + }, + { + "epoch": 9.788914274361776, + "grad_norm": 0.20703125, + "learning_rate": 1.6890831429422867e-05, + "loss": 0.4815, + "step": 197090 + }, + { + "epoch": 9.789410946657396, + "grad_norm": 0.224609375, + "learning_rate": 1.685109764577332e-05, + "loss": 0.4825, + "step": 197100 + }, + { + "epoch": 9.789907618953015, + "grad_norm": 0.265625, + "learning_rate": 1.6811363862123773e-05, + "loss": 0.4978, + "step": 197110 + }, + { + "epoch": 9.790404291248635, + "grad_norm": 0.2080078125, + "learning_rate": 1.6771630078474224e-05, + "loss": 0.4736, + "step": 197120 + }, + { + "epoch": 9.790900963544253, + "grad_norm": 0.2451171875, + "learning_rate": 1.6731896294824675e-05, + "loss": 0.4855, + "step": 197130 + }, + { + "epoch": 9.791397635839873, + "grad_norm": 0.2109375, + "learning_rate": 1.6692162511175126e-05, + "loss": 0.4818, + "step": 197140 + }, + { + "epoch": 9.791894308135491, + "grad_norm": 0.244140625, + "learning_rate": 1.665242872752558e-05, + "loss": 0.5244, + "step": 197150 + }, + { + "epoch": 9.792390980431112, + "grad_norm": 0.19140625, + "learning_rate": 1.661269494387603e-05, + "loss": 0.4878, + "step": 197160 + }, + { + "epoch": 9.792887652726732, + "grad_norm": 0.232421875, + "learning_rate": 1.6572961160226482e-05, + "loss": 0.5117, + "step": 197170 + }, + { + "epoch": 9.79338432502235, + "grad_norm": 0.224609375, + "learning_rate": 1.6533227376576933e-05, + "loss": 0.4566, + "step": 197180 + }, + { + "epoch": 9.79388099731797, + "grad_norm": 0.2138671875, + "learning_rate": 1.6493493592927388e-05, + "loss": 0.468, + "step": 197190 + }, + { + "epoch": 9.794377669613588, + "grad_norm": 0.2109375, + "learning_rate": 1.645375980927784e-05, + "loss": 0.4971, + "step": 197200 + }, + { + "epoch": 9.794874341909209, + "grad_norm": 0.1943359375, + "learning_rate": 1.641402602562829e-05, + "loss": 0.4705, + "step": 197210 + }, + { + "epoch": 9.795371014204827, + "grad_norm": 0.2216796875, + "learning_rate": 1.637429224197874e-05, + "loss": 0.466, + "step": 197220 + }, + { + "epoch": 9.795867686500447, + "grad_norm": 0.2021484375, + "learning_rate": 1.6334558458329195e-05, + "loss": 0.5081, + "step": 197230 + }, + { + "epoch": 9.796364358796067, + "grad_norm": 0.2333984375, + "learning_rate": 1.629482467467965e-05, + "loss": 0.5119, + "step": 197240 + }, + { + "epoch": 9.796861031091685, + "grad_norm": 0.2177734375, + "learning_rate": 1.62550908910301e-05, + "loss": 0.5001, + "step": 197250 + }, + { + "epoch": 9.797357703387306, + "grad_norm": 0.21875, + "learning_rate": 1.6215357107380552e-05, + "loss": 0.5009, + "step": 197260 + }, + { + "epoch": 9.797854375682924, + "grad_norm": 0.2080078125, + "learning_rate": 1.6175623323731003e-05, + "loss": 0.4913, + "step": 197270 + }, + { + "epoch": 9.798351047978544, + "grad_norm": 0.244140625, + "learning_rate": 1.6135889540081457e-05, + "loss": 0.5293, + "step": 197280 + }, + { + "epoch": 9.798847720274162, + "grad_norm": 0.2021484375, + "learning_rate": 1.6096155756431908e-05, + "loss": 0.4953, + "step": 197290 + }, + { + "epoch": 9.799344392569783, + "grad_norm": 0.1962890625, + "learning_rate": 1.605642197278236e-05, + "loss": 0.4845, + "step": 197300 + }, + { + "epoch": 9.799841064865403, + "grad_norm": 0.2001953125, + "learning_rate": 1.601668818913281e-05, + "loss": 0.5252, + "step": 197310 + }, + { + "epoch": 9.800337737161021, + "grad_norm": 0.220703125, + "learning_rate": 1.5976954405483265e-05, + "loss": 0.4906, + "step": 197320 + }, + { + "epoch": 9.800834409456641, + "grad_norm": 0.23828125, + "learning_rate": 1.5937220621833716e-05, + "loss": 0.4903, + "step": 197330 + }, + { + "epoch": 9.80133108175226, + "grad_norm": 0.2119140625, + "learning_rate": 1.5897486838184167e-05, + "loss": 0.4786, + "step": 197340 + }, + { + "epoch": 9.80182775404788, + "grad_norm": 0.240234375, + "learning_rate": 1.5857753054534618e-05, + "loss": 0.4979, + "step": 197350 + }, + { + "epoch": 9.802324426343498, + "grad_norm": 0.20703125, + "learning_rate": 1.5818019270885072e-05, + "loss": 0.4802, + "step": 197360 + }, + { + "epoch": 9.802821098639118, + "grad_norm": 0.2060546875, + "learning_rate": 1.5778285487235523e-05, + "loss": 0.4789, + "step": 197370 + }, + { + "epoch": 9.803317770934736, + "grad_norm": 0.2001953125, + "learning_rate": 1.5738551703585974e-05, + "loss": 0.4494, + "step": 197380 + }, + { + "epoch": 9.803814443230356, + "grad_norm": 0.2060546875, + "learning_rate": 1.5698817919936425e-05, + "loss": 0.4801, + "step": 197390 + }, + { + "epoch": 9.804311115525977, + "grad_norm": 0.240234375, + "learning_rate": 1.5659084136286876e-05, + "loss": 0.5141, + "step": 197400 + }, + { + "epoch": 9.804807787821595, + "grad_norm": 0.263671875, + "learning_rate": 1.561935035263733e-05, + "loss": 0.5059, + "step": 197410 + }, + { + "epoch": 9.805304460117215, + "grad_norm": 0.2177734375, + "learning_rate": 1.5579616568987782e-05, + "loss": 0.4943, + "step": 197420 + }, + { + "epoch": 9.805801132412833, + "grad_norm": 0.208984375, + "learning_rate": 1.5539882785338233e-05, + "loss": 0.4801, + "step": 197430 + }, + { + "epoch": 9.806297804708453, + "grad_norm": 0.1943359375, + "learning_rate": 1.5500149001688684e-05, + "loss": 0.4785, + "step": 197440 + }, + { + "epoch": 9.806794477004072, + "grad_norm": 0.2294921875, + "learning_rate": 1.5460415218039138e-05, + "loss": 0.49, + "step": 197450 + }, + { + "epoch": 9.807291149299692, + "grad_norm": 0.1943359375, + "learning_rate": 1.5420681434389593e-05, + "loss": 0.4955, + "step": 197460 + }, + { + "epoch": 9.807787821595312, + "grad_norm": 0.2265625, + "learning_rate": 1.5380947650740044e-05, + "loss": 0.4877, + "step": 197470 + }, + { + "epoch": 9.80828449389093, + "grad_norm": 0.21484375, + "learning_rate": 1.5341213867090495e-05, + "loss": 0.4834, + "step": 197480 + }, + { + "epoch": 9.80878116618655, + "grad_norm": 0.2216796875, + "learning_rate": 1.5301480083440946e-05, + "loss": 0.4689, + "step": 197490 + }, + { + "epoch": 9.809277838482169, + "grad_norm": 0.1982421875, + "learning_rate": 1.52617462997914e-05, + "loss": 0.4608, + "step": 197500 + }, + { + "epoch": 9.809774510777789, + "grad_norm": 0.21875, + "learning_rate": 1.5222012516141851e-05, + "loss": 0.528, + "step": 197510 + }, + { + "epoch": 9.810271183073407, + "grad_norm": 0.2021484375, + "learning_rate": 1.5182278732492302e-05, + "loss": 0.5089, + "step": 197520 + }, + { + "epoch": 9.810767855369027, + "grad_norm": 0.193359375, + "learning_rate": 1.5142544948842755e-05, + "loss": 0.487, + "step": 197530 + }, + { + "epoch": 9.811264527664648, + "grad_norm": 0.208984375, + "learning_rate": 1.5102811165193206e-05, + "loss": 0.4944, + "step": 197540 + }, + { + "epoch": 9.811761199960266, + "grad_norm": 0.1962890625, + "learning_rate": 1.5063077381543659e-05, + "loss": 0.51, + "step": 197550 + }, + { + "epoch": 9.812257872255886, + "grad_norm": 0.2119140625, + "learning_rate": 1.502334359789411e-05, + "loss": 0.486, + "step": 197560 + }, + { + "epoch": 9.812754544551504, + "grad_norm": 0.1953125, + "learning_rate": 1.4983609814244562e-05, + "loss": 0.501, + "step": 197570 + }, + { + "epoch": 9.813251216847124, + "grad_norm": 0.2421875, + "learning_rate": 1.4943876030595013e-05, + "loss": 0.4839, + "step": 197580 + }, + { + "epoch": 9.813747889142743, + "grad_norm": 0.21875, + "learning_rate": 1.4904142246945466e-05, + "loss": 0.4591, + "step": 197590 + }, + { + "epoch": 9.814244561438363, + "grad_norm": 0.2138671875, + "learning_rate": 1.4864408463295917e-05, + "loss": 0.484, + "step": 197600 + }, + { + "epoch": 9.814741233733983, + "grad_norm": 0.22265625, + "learning_rate": 1.482467467964637e-05, + "loss": 0.5014, + "step": 197610 + }, + { + "epoch": 9.815237906029601, + "grad_norm": 0.1953125, + "learning_rate": 1.4784940895996821e-05, + "loss": 0.454, + "step": 197620 + }, + { + "epoch": 9.815734578325221, + "grad_norm": 0.228515625, + "learning_rate": 1.4745207112347274e-05, + "loss": 0.526, + "step": 197630 + }, + { + "epoch": 9.81623125062084, + "grad_norm": 0.2216796875, + "learning_rate": 1.4705473328697725e-05, + "loss": 0.495, + "step": 197640 + }, + { + "epoch": 9.81672792291646, + "grad_norm": 0.2001953125, + "learning_rate": 1.4665739545048177e-05, + "loss": 0.4979, + "step": 197650 + }, + { + "epoch": 9.817224595212078, + "grad_norm": 0.1845703125, + "learning_rate": 1.4626005761398632e-05, + "loss": 0.5173, + "step": 197660 + }, + { + "epoch": 9.817721267507698, + "grad_norm": 0.21875, + "learning_rate": 1.4586271977749083e-05, + "loss": 0.5001, + "step": 197670 + }, + { + "epoch": 9.818217939803318, + "grad_norm": 0.2138671875, + "learning_rate": 1.4546538194099536e-05, + "loss": 0.4927, + "step": 197680 + }, + { + "epoch": 9.818714612098937, + "grad_norm": 0.2158203125, + "learning_rate": 1.4506804410449987e-05, + "loss": 0.4713, + "step": 197690 + }, + { + "epoch": 9.819211284394557, + "grad_norm": 0.1943359375, + "learning_rate": 1.446707062680044e-05, + "loss": 0.4837, + "step": 197700 + }, + { + "epoch": 9.819707956690175, + "grad_norm": 0.2021484375, + "learning_rate": 1.442733684315089e-05, + "loss": 0.4783, + "step": 197710 + }, + { + "epoch": 9.820204628985795, + "grad_norm": 0.275390625, + "learning_rate": 1.4387603059501343e-05, + "loss": 0.5004, + "step": 197720 + }, + { + "epoch": 9.820701301281414, + "grad_norm": 0.224609375, + "learning_rate": 1.4347869275851794e-05, + "loss": 0.516, + "step": 197730 + }, + { + "epoch": 9.821197973577034, + "grad_norm": 0.23046875, + "learning_rate": 1.4308135492202247e-05, + "loss": 0.4959, + "step": 197740 + }, + { + "epoch": 9.821694645872654, + "grad_norm": 0.212890625, + "learning_rate": 1.4268401708552698e-05, + "loss": 0.5084, + "step": 197750 + }, + { + "epoch": 9.822191318168272, + "grad_norm": 0.2138671875, + "learning_rate": 1.4228667924903149e-05, + "loss": 0.4627, + "step": 197760 + }, + { + "epoch": 9.822687990463892, + "grad_norm": 0.2265625, + "learning_rate": 1.4188934141253602e-05, + "loss": 0.4914, + "step": 197770 + }, + { + "epoch": 9.82318466275951, + "grad_norm": 0.1962890625, + "learning_rate": 1.4149200357604053e-05, + "loss": 0.4794, + "step": 197780 + }, + { + "epoch": 9.82368133505513, + "grad_norm": 0.2119140625, + "learning_rate": 1.4109466573954505e-05, + "loss": 0.4824, + "step": 197790 + }, + { + "epoch": 9.82417800735075, + "grad_norm": 0.1923828125, + "learning_rate": 1.4069732790304956e-05, + "loss": 0.4791, + "step": 197800 + }, + { + "epoch": 9.82467467964637, + "grad_norm": 0.212890625, + "learning_rate": 1.4029999006655409e-05, + "loss": 0.4911, + "step": 197810 + }, + { + "epoch": 9.82517135194199, + "grad_norm": 0.197265625, + "learning_rate": 1.399026522300586e-05, + "loss": 0.4759, + "step": 197820 + }, + { + "epoch": 9.825668024237608, + "grad_norm": 0.203125, + "learning_rate": 1.3950531439356313e-05, + "loss": 0.481, + "step": 197830 + }, + { + "epoch": 9.826164696533228, + "grad_norm": 0.203125, + "learning_rate": 1.3910797655706764e-05, + "loss": 0.5043, + "step": 197840 + }, + { + "epoch": 9.826661368828846, + "grad_norm": 0.2041015625, + "learning_rate": 1.3871063872057217e-05, + "loss": 0.4735, + "step": 197850 + }, + { + "epoch": 9.827158041124466, + "grad_norm": 0.208984375, + "learning_rate": 1.3831330088407668e-05, + "loss": 0.5065, + "step": 197860 + }, + { + "epoch": 9.827654713420085, + "grad_norm": 0.20703125, + "learning_rate": 1.3791596304758122e-05, + "loss": 0.5063, + "step": 197870 + }, + { + "epoch": 9.828151385715705, + "grad_norm": 0.189453125, + "learning_rate": 1.3751862521108575e-05, + "loss": 0.4593, + "step": 197880 + }, + { + "epoch": 9.828648058011325, + "grad_norm": 0.2294921875, + "learning_rate": 1.3712128737459026e-05, + "loss": 0.4911, + "step": 197890 + }, + { + "epoch": 9.829144730306943, + "grad_norm": 0.208984375, + "learning_rate": 1.3672394953809478e-05, + "loss": 0.4695, + "step": 197900 + }, + { + "epoch": 9.829641402602563, + "grad_norm": 0.2021484375, + "learning_rate": 1.363266117015993e-05, + "loss": 0.4862, + "step": 197910 + }, + { + "epoch": 9.830138074898182, + "grad_norm": 0.234375, + "learning_rate": 1.3592927386510382e-05, + "loss": 0.464, + "step": 197920 + }, + { + "epoch": 9.830634747193802, + "grad_norm": 0.2236328125, + "learning_rate": 1.3553193602860833e-05, + "loss": 0.4768, + "step": 197930 + }, + { + "epoch": 9.83113141948942, + "grad_norm": 0.220703125, + "learning_rate": 1.3513459819211286e-05, + "loss": 0.4961, + "step": 197940 + }, + { + "epoch": 9.83162809178504, + "grad_norm": 0.251953125, + "learning_rate": 1.3473726035561737e-05, + "loss": 0.4947, + "step": 197950 + }, + { + "epoch": 9.83212476408066, + "grad_norm": 0.205078125, + "learning_rate": 1.343399225191219e-05, + "loss": 0.4792, + "step": 197960 + }, + { + "epoch": 9.832621436376279, + "grad_norm": 0.2041015625, + "learning_rate": 1.339425846826264e-05, + "loss": 0.4931, + "step": 197970 + }, + { + "epoch": 9.833118108671899, + "grad_norm": 0.20703125, + "learning_rate": 1.3354524684613093e-05, + "loss": 0.497, + "step": 197980 + }, + { + "epoch": 9.833614780967517, + "grad_norm": 0.1923828125, + "learning_rate": 1.3314790900963544e-05, + "loss": 0.4727, + "step": 197990 + }, + { + "epoch": 9.834111453263137, + "grad_norm": 0.205078125, + "learning_rate": 1.3275057117313997e-05, + "loss": 0.4828, + "step": 198000 + }, + { + "epoch": 9.834608125558756, + "grad_norm": 0.1875, + "learning_rate": 1.3235323333664448e-05, + "loss": 0.4798, + "step": 198010 + }, + { + "epoch": 9.835104797854376, + "grad_norm": 0.193359375, + "learning_rate": 1.3195589550014901e-05, + "loss": 0.4992, + "step": 198020 + }, + { + "epoch": 9.835601470149996, + "grad_norm": 0.203125, + "learning_rate": 1.3155855766365352e-05, + "loss": 0.4932, + "step": 198030 + }, + { + "epoch": 9.836098142445614, + "grad_norm": 0.2060546875, + "learning_rate": 1.3116121982715805e-05, + "loss": 0.4665, + "step": 198040 + }, + { + "epoch": 9.836594814741234, + "grad_norm": 0.2236328125, + "learning_rate": 1.3076388199066256e-05, + "loss": 0.4977, + "step": 198050 + }, + { + "epoch": 9.837091487036853, + "grad_norm": 0.2177734375, + "learning_rate": 1.3036654415416707e-05, + "loss": 0.4823, + "step": 198060 + }, + { + "epoch": 9.837588159332473, + "grad_norm": 0.203125, + "learning_rate": 1.299692063176716e-05, + "loss": 0.5245, + "step": 198070 + }, + { + "epoch": 9.838084831628091, + "grad_norm": 0.23046875, + "learning_rate": 1.2957186848117614e-05, + "loss": 0.5101, + "step": 198080 + }, + { + "epoch": 9.838581503923711, + "grad_norm": 0.21484375, + "learning_rate": 1.2917453064468067e-05, + "loss": 0.5216, + "step": 198090 + }, + { + "epoch": 9.839078176219331, + "grad_norm": 0.197265625, + "learning_rate": 1.2877719280818518e-05, + "loss": 0.4817, + "step": 198100 + }, + { + "epoch": 9.83957484851495, + "grad_norm": 0.212890625, + "learning_rate": 1.2837985497168969e-05, + "loss": 0.4895, + "step": 198110 + }, + { + "epoch": 9.84007152081057, + "grad_norm": 0.197265625, + "learning_rate": 1.2798251713519421e-05, + "loss": 0.5008, + "step": 198120 + }, + { + "epoch": 9.840568193106188, + "grad_norm": 0.216796875, + "learning_rate": 1.2758517929869872e-05, + "loss": 0.4854, + "step": 198130 + }, + { + "epoch": 9.841064865401808, + "grad_norm": 0.2158203125, + "learning_rate": 1.2718784146220325e-05, + "loss": 0.4617, + "step": 198140 + }, + { + "epoch": 9.841561537697427, + "grad_norm": 0.212890625, + "learning_rate": 1.2679050362570776e-05, + "loss": 0.5255, + "step": 198150 + }, + { + "epoch": 9.842058209993047, + "grad_norm": 0.22265625, + "learning_rate": 1.2639316578921229e-05, + "loss": 0.4817, + "step": 198160 + }, + { + "epoch": 9.842554882288667, + "grad_norm": 0.2021484375, + "learning_rate": 1.259958279527168e-05, + "loss": 0.4856, + "step": 198170 + }, + { + "epoch": 9.843051554584285, + "grad_norm": 0.1923828125, + "learning_rate": 1.2559849011622133e-05, + "loss": 0.4613, + "step": 198180 + }, + { + "epoch": 9.843548226879905, + "grad_norm": 0.2197265625, + "learning_rate": 1.2520115227972584e-05, + "loss": 0.5174, + "step": 198190 + }, + { + "epoch": 9.844044899175524, + "grad_norm": 0.228515625, + "learning_rate": 1.2480381444323036e-05, + "loss": 0.5079, + "step": 198200 + }, + { + "epoch": 9.844541571471144, + "grad_norm": 0.2001953125, + "learning_rate": 1.2440647660673487e-05, + "loss": 0.4683, + "step": 198210 + }, + { + "epoch": 9.845038243766762, + "grad_norm": 0.2421875, + "learning_rate": 1.240091387702394e-05, + "loss": 0.5125, + "step": 198220 + }, + { + "epoch": 9.845534916062382, + "grad_norm": 0.203125, + "learning_rate": 1.2361180093374391e-05, + "loss": 0.4804, + "step": 198230 + }, + { + "epoch": 9.846031588358002, + "grad_norm": 0.21484375, + "learning_rate": 1.2321446309724846e-05, + "loss": 0.477, + "step": 198240 + }, + { + "epoch": 9.84652826065362, + "grad_norm": 0.2060546875, + "learning_rate": 1.2281712526075297e-05, + "loss": 0.4946, + "step": 198250 + }, + { + "epoch": 9.84702493294924, + "grad_norm": 0.2265625, + "learning_rate": 1.224197874242575e-05, + "loss": 0.4718, + "step": 198260 + }, + { + "epoch": 9.847521605244859, + "grad_norm": 0.20703125, + "learning_rate": 1.22022449587762e-05, + "loss": 0.4889, + "step": 198270 + }, + { + "epoch": 9.84801827754048, + "grad_norm": 0.2158203125, + "learning_rate": 1.2162511175126651e-05, + "loss": 0.4428, + "step": 198280 + }, + { + "epoch": 9.848514949836098, + "grad_norm": 0.1982421875, + "learning_rate": 1.2122777391477104e-05, + "loss": 0.4802, + "step": 198290 + }, + { + "epoch": 9.849011622131718, + "grad_norm": 0.20703125, + "learning_rate": 1.2083043607827555e-05, + "loss": 0.478, + "step": 198300 + }, + { + "epoch": 9.849508294427338, + "grad_norm": 0.2119140625, + "learning_rate": 1.2043309824178008e-05, + "loss": 0.4929, + "step": 198310 + }, + { + "epoch": 9.850004966722956, + "grad_norm": 0.205078125, + "learning_rate": 1.2003576040528459e-05, + "loss": 0.4814, + "step": 198320 + }, + { + "epoch": 9.850501639018576, + "grad_norm": 0.21875, + "learning_rate": 1.1963842256878912e-05, + "loss": 0.498, + "step": 198330 + }, + { + "epoch": 9.850998311314195, + "grad_norm": 0.1943359375, + "learning_rate": 1.1924108473229364e-05, + "loss": 0.4992, + "step": 198340 + }, + { + "epoch": 9.851494983609815, + "grad_norm": 0.2109375, + "learning_rate": 1.1884374689579817e-05, + "loss": 0.5012, + "step": 198350 + }, + { + "epoch": 9.851991655905433, + "grad_norm": 0.1982421875, + "learning_rate": 1.1844640905930268e-05, + "loss": 0.4566, + "step": 198360 + }, + { + "epoch": 9.852488328201053, + "grad_norm": 0.20703125, + "learning_rate": 1.180490712228072e-05, + "loss": 0.4975, + "step": 198370 + }, + { + "epoch": 9.852985000496671, + "grad_norm": 0.208984375, + "learning_rate": 1.1765173338631172e-05, + "loss": 0.4822, + "step": 198380 + }, + { + "epoch": 9.853481672792292, + "grad_norm": 0.296875, + "learning_rate": 1.1725439554981624e-05, + "loss": 0.4713, + "step": 198390 + }, + { + "epoch": 9.853978345087912, + "grad_norm": 0.2109375, + "learning_rate": 1.1685705771332076e-05, + "loss": 0.5041, + "step": 198400 + }, + { + "epoch": 9.85447501738353, + "grad_norm": 0.2138671875, + "learning_rate": 1.1645971987682528e-05, + "loss": 0.4846, + "step": 198410 + }, + { + "epoch": 9.85497168967915, + "grad_norm": 0.201171875, + "learning_rate": 1.160623820403298e-05, + "loss": 0.4568, + "step": 198420 + }, + { + "epoch": 9.855468361974768, + "grad_norm": 0.220703125, + "learning_rate": 1.156650442038343e-05, + "loss": 0.5135, + "step": 198430 + }, + { + "epoch": 9.855965034270389, + "grad_norm": 0.255859375, + "learning_rate": 1.1526770636733883e-05, + "loss": 0.5184, + "step": 198440 + }, + { + "epoch": 9.856461706566007, + "grad_norm": 0.224609375, + "learning_rate": 1.1487036853084336e-05, + "loss": 0.4928, + "step": 198450 + }, + { + "epoch": 9.856958378861627, + "grad_norm": 0.2060546875, + "learning_rate": 1.1447303069434788e-05, + "loss": 0.4681, + "step": 198460 + }, + { + "epoch": 9.857455051157247, + "grad_norm": 0.216796875, + "learning_rate": 1.140756928578524e-05, + "loss": 0.469, + "step": 198470 + }, + { + "epoch": 9.857951723452866, + "grad_norm": 0.21875, + "learning_rate": 1.1367835502135692e-05, + "loss": 0.4632, + "step": 198480 + }, + { + "epoch": 9.858448395748486, + "grad_norm": 0.1884765625, + "learning_rate": 1.1328101718486143e-05, + "loss": 0.4931, + "step": 198490 + }, + { + "epoch": 9.858945068044104, + "grad_norm": 0.232421875, + "learning_rate": 1.1288367934836596e-05, + "loss": 0.4856, + "step": 198500 + }, + { + "epoch": 9.859441740339724, + "grad_norm": 0.2578125, + "learning_rate": 1.1248634151187047e-05, + "loss": 0.4895, + "step": 198510 + }, + { + "epoch": 9.859938412635342, + "grad_norm": 0.2041015625, + "learning_rate": 1.12089003675375e-05, + "loss": 0.4701, + "step": 198520 + }, + { + "epoch": 9.860435084930963, + "grad_norm": 0.240234375, + "learning_rate": 1.116916658388795e-05, + "loss": 0.4943, + "step": 198530 + }, + { + "epoch": 9.860931757226583, + "grad_norm": 0.2470703125, + "learning_rate": 1.1129432800238403e-05, + "loss": 0.482, + "step": 198540 + }, + { + "epoch": 9.861428429522201, + "grad_norm": 0.2080078125, + "learning_rate": 1.1089699016588854e-05, + "loss": 0.4901, + "step": 198550 + }, + { + "epoch": 9.861925101817821, + "grad_norm": 0.244140625, + "learning_rate": 1.1049965232939307e-05, + "loss": 0.4929, + "step": 198560 + }, + { + "epoch": 9.86242177411344, + "grad_norm": 0.208984375, + "learning_rate": 1.101023144928976e-05, + "loss": 0.5104, + "step": 198570 + }, + { + "epoch": 9.86291844640906, + "grad_norm": 0.2236328125, + "learning_rate": 1.0970497665640211e-05, + "loss": 0.4708, + "step": 198580 + }, + { + "epoch": 9.863415118704678, + "grad_norm": 0.2138671875, + "learning_rate": 1.0930763881990664e-05, + "loss": 0.4916, + "step": 198590 + }, + { + "epoch": 9.863911791000298, + "grad_norm": 0.2021484375, + "learning_rate": 1.0891030098341115e-05, + "loss": 0.4666, + "step": 198600 + }, + { + "epoch": 9.864408463295918, + "grad_norm": 0.28515625, + "learning_rate": 1.0851296314691567e-05, + "loss": 0.5095, + "step": 198610 + }, + { + "epoch": 9.864905135591536, + "grad_norm": 0.2080078125, + "learning_rate": 1.0811562531042018e-05, + "loss": 0.47, + "step": 198620 + }, + { + "epoch": 9.865401807887157, + "grad_norm": 0.2021484375, + "learning_rate": 1.0771828747392471e-05, + "loss": 0.4777, + "step": 198630 + }, + { + "epoch": 9.865898480182775, + "grad_norm": 0.2138671875, + "learning_rate": 1.0732094963742922e-05, + "loss": 0.4784, + "step": 198640 + }, + { + "epoch": 9.866395152478395, + "grad_norm": 0.197265625, + "learning_rate": 1.0692361180093375e-05, + "loss": 0.4772, + "step": 198650 + }, + { + "epoch": 9.866891824774013, + "grad_norm": 0.212890625, + "learning_rate": 1.0652627396443828e-05, + "loss": 0.4804, + "step": 198660 + }, + { + "epoch": 9.867388497069634, + "grad_norm": 0.1962890625, + "learning_rate": 1.0612893612794279e-05, + "loss": 0.4588, + "step": 198670 + }, + { + "epoch": 9.867885169365254, + "grad_norm": 0.21484375, + "learning_rate": 1.0573159829144731e-05, + "loss": 0.4805, + "step": 198680 + }, + { + "epoch": 9.868381841660872, + "grad_norm": 0.21875, + "learning_rate": 1.0533426045495182e-05, + "loss": 0.4908, + "step": 198690 + }, + { + "epoch": 9.868878513956492, + "grad_norm": 0.236328125, + "learning_rate": 1.0493692261845635e-05, + "loss": 0.4779, + "step": 198700 + }, + { + "epoch": 9.86937518625211, + "grad_norm": 0.2119140625, + "learning_rate": 1.0453958478196086e-05, + "loss": 0.4911, + "step": 198710 + }, + { + "epoch": 9.86987185854773, + "grad_norm": 0.2158203125, + "learning_rate": 1.0414224694546539e-05, + "loss": 0.49, + "step": 198720 + }, + { + "epoch": 9.870368530843349, + "grad_norm": 0.1943359375, + "learning_rate": 1.037449091089699e-05, + "loss": 0.467, + "step": 198730 + }, + { + "epoch": 9.870865203138969, + "grad_norm": 0.244140625, + "learning_rate": 1.0334757127247443e-05, + "loss": 0.5067, + "step": 198740 + }, + { + "epoch": 9.871361875434587, + "grad_norm": 0.1982421875, + "learning_rate": 1.0295023343597894e-05, + "loss": 0.4727, + "step": 198750 + }, + { + "epoch": 9.871858547730207, + "grad_norm": 0.208984375, + "learning_rate": 1.0255289559948346e-05, + "loss": 0.468, + "step": 198760 + }, + { + "epoch": 9.872355220025828, + "grad_norm": 0.2177734375, + "learning_rate": 1.0215555776298799e-05, + "loss": 0.488, + "step": 198770 + }, + { + "epoch": 9.872851892321446, + "grad_norm": 0.21484375, + "learning_rate": 1.017582199264925e-05, + "loss": 0.4788, + "step": 198780 + }, + { + "epoch": 9.873348564617066, + "grad_norm": 0.2236328125, + "learning_rate": 1.0136088208999703e-05, + "loss": 0.5071, + "step": 198790 + }, + { + "epoch": 9.873845236912684, + "grad_norm": 0.2412109375, + "learning_rate": 1.0096354425350154e-05, + "loss": 0.4792, + "step": 198800 + }, + { + "epoch": 9.874341909208304, + "grad_norm": 0.2431640625, + "learning_rate": 1.0056620641700607e-05, + "loss": 0.5005, + "step": 198810 + }, + { + "epoch": 9.874838581503923, + "grad_norm": 0.24609375, + "learning_rate": 1.0016886858051058e-05, + "loss": 0.5227, + "step": 198820 + }, + { + "epoch": 9.875335253799543, + "grad_norm": 0.201171875, + "learning_rate": 9.97715307440151e-06, + "loss": 0.4724, + "step": 198830 + }, + { + "epoch": 9.875831926095163, + "grad_norm": 0.2158203125, + "learning_rate": 9.937419290751961e-06, + "loss": 0.4943, + "step": 198840 + }, + { + "epoch": 9.876328598390781, + "grad_norm": 0.2041015625, + "learning_rate": 9.897685507102414e-06, + "loss": 0.5062, + "step": 198850 + }, + { + "epoch": 9.876825270686401, + "grad_norm": 0.2138671875, + "learning_rate": 9.857951723452865e-06, + "loss": 0.5059, + "step": 198860 + }, + { + "epoch": 9.87732194298202, + "grad_norm": 0.23828125, + "learning_rate": 9.81821793980332e-06, + "loss": 0.5141, + "step": 198870 + }, + { + "epoch": 9.87781861527764, + "grad_norm": 0.1953125, + "learning_rate": 9.77848415615377e-06, + "loss": 0.5031, + "step": 198880 + }, + { + "epoch": 9.878315287573258, + "grad_norm": 0.1904296875, + "learning_rate": 9.738750372504223e-06, + "loss": 0.4937, + "step": 198890 + }, + { + "epoch": 9.878811959868878, + "grad_norm": 0.21484375, + "learning_rate": 9.699016588854674e-06, + "loss": 0.4737, + "step": 198900 + }, + { + "epoch": 9.879308632164499, + "grad_norm": 0.205078125, + "learning_rate": 9.659282805205127e-06, + "loss": 0.475, + "step": 198910 + }, + { + "epoch": 9.879805304460117, + "grad_norm": 0.2001953125, + "learning_rate": 9.619549021555578e-06, + "loss": 0.5084, + "step": 198920 + }, + { + "epoch": 9.880301976755737, + "grad_norm": 0.21484375, + "learning_rate": 9.57981523790603e-06, + "loss": 0.5012, + "step": 198930 + }, + { + "epoch": 9.880798649051355, + "grad_norm": 0.2255859375, + "learning_rate": 9.540081454256482e-06, + "loss": 0.483, + "step": 198940 + }, + { + "epoch": 9.881295321346975, + "grad_norm": 0.236328125, + "learning_rate": 9.500347670606933e-06, + "loss": 0.5229, + "step": 198950 + }, + { + "epoch": 9.881791993642594, + "grad_norm": 0.2421875, + "learning_rate": 9.460613886957386e-06, + "loss": 0.524, + "step": 198960 + }, + { + "epoch": 9.882288665938214, + "grad_norm": 0.208984375, + "learning_rate": 9.420880103307837e-06, + "loss": 0.4776, + "step": 198970 + }, + { + "epoch": 9.882785338233834, + "grad_norm": 0.21484375, + "learning_rate": 9.381146319658291e-06, + "loss": 0.4885, + "step": 198980 + }, + { + "epoch": 9.883282010529452, + "grad_norm": 0.2080078125, + "learning_rate": 9.341412536008742e-06, + "loss": 0.4874, + "step": 198990 + }, + { + "epoch": 9.883778682825072, + "grad_norm": 0.201171875, + "learning_rate": 9.301678752359195e-06, + "loss": 0.5069, + "step": 199000 + }, + { + "epoch": 9.88427535512069, + "grad_norm": 0.193359375, + "learning_rate": 9.261944968709646e-06, + "loss": 0.4734, + "step": 199010 + }, + { + "epoch": 9.884772027416311, + "grad_norm": 0.228515625, + "learning_rate": 9.222211185060098e-06, + "loss": 0.4917, + "step": 199020 + }, + { + "epoch": 9.88526869971193, + "grad_norm": 0.208984375, + "learning_rate": 9.18247740141055e-06, + "loss": 0.4996, + "step": 199030 + }, + { + "epoch": 9.88576537200755, + "grad_norm": 0.2109375, + "learning_rate": 9.142743617761002e-06, + "loss": 0.5089, + "step": 199040 + }, + { + "epoch": 9.88626204430317, + "grad_norm": 0.212890625, + "learning_rate": 9.103009834111453e-06, + "loss": 0.5113, + "step": 199050 + }, + { + "epoch": 9.886758716598788, + "grad_norm": 0.205078125, + "learning_rate": 9.063276050461906e-06, + "loss": 0.483, + "step": 199060 + }, + { + "epoch": 9.887255388894408, + "grad_norm": 0.220703125, + "learning_rate": 9.023542266812357e-06, + "loss": 0.5336, + "step": 199070 + }, + { + "epoch": 9.887752061190026, + "grad_norm": 0.21484375, + "learning_rate": 8.98380848316281e-06, + "loss": 0.4434, + "step": 199080 + }, + { + "epoch": 9.888248733485646, + "grad_norm": 0.19921875, + "learning_rate": 8.944074699513262e-06, + "loss": 0.494, + "step": 199090 + }, + { + "epoch": 9.888745405781265, + "grad_norm": 0.22265625, + "learning_rate": 8.904340915863713e-06, + "loss": 0.5094, + "step": 199100 + }, + { + "epoch": 9.889242078076885, + "grad_norm": 0.2197265625, + "learning_rate": 8.864607132214166e-06, + "loss": 0.4903, + "step": 199110 + }, + { + "epoch": 9.889738750372505, + "grad_norm": 0.2177734375, + "learning_rate": 8.824873348564617e-06, + "loss": 0.4492, + "step": 199120 + }, + { + "epoch": 9.890235422668123, + "grad_norm": 0.228515625, + "learning_rate": 8.78513956491507e-06, + "loss": 0.5074, + "step": 199130 + }, + { + "epoch": 9.890732094963743, + "grad_norm": 0.2353515625, + "learning_rate": 8.745405781265521e-06, + "loss": 0.4771, + "step": 199140 + }, + { + "epoch": 9.891228767259362, + "grad_norm": 0.2080078125, + "learning_rate": 8.705671997615974e-06, + "loss": 0.5127, + "step": 199150 + }, + { + "epoch": 9.891725439554982, + "grad_norm": 0.2158203125, + "learning_rate": 8.665938213966425e-06, + "loss": 0.4915, + "step": 199160 + }, + { + "epoch": 9.8922221118506, + "grad_norm": 0.216796875, + "learning_rate": 8.626204430316877e-06, + "loss": 0.4868, + "step": 199170 + }, + { + "epoch": 9.89271878414622, + "grad_norm": 0.279296875, + "learning_rate": 8.586470646667328e-06, + "loss": 0.5011, + "step": 199180 + }, + { + "epoch": 9.89321545644184, + "grad_norm": 0.220703125, + "learning_rate": 8.546736863017781e-06, + "loss": 0.4708, + "step": 199190 + }, + { + "epoch": 9.893712128737459, + "grad_norm": 0.2080078125, + "learning_rate": 8.507003079368234e-06, + "loss": 0.4672, + "step": 199200 + }, + { + "epoch": 9.894208801033079, + "grad_norm": 0.2197265625, + "learning_rate": 8.467269295718685e-06, + "loss": 0.496, + "step": 199210 + }, + { + "epoch": 9.894705473328697, + "grad_norm": 0.25390625, + "learning_rate": 8.427535512069138e-06, + "loss": 0.4965, + "step": 199220 + }, + { + "epoch": 9.895202145624317, + "grad_norm": 0.22265625, + "learning_rate": 8.387801728419589e-06, + "loss": 0.4911, + "step": 199230 + }, + { + "epoch": 9.895698817919936, + "grad_norm": 0.201171875, + "learning_rate": 8.348067944770041e-06, + "loss": 0.4907, + "step": 199240 + }, + { + "epoch": 9.896195490215556, + "grad_norm": 0.236328125, + "learning_rate": 8.308334161120492e-06, + "loss": 0.5264, + "step": 199250 + }, + { + "epoch": 9.896692162511176, + "grad_norm": 0.220703125, + "learning_rate": 8.268600377470945e-06, + "loss": 0.4876, + "step": 199260 + }, + { + "epoch": 9.897188834806794, + "grad_norm": 0.197265625, + "learning_rate": 8.228866593821396e-06, + "loss": 0.5253, + "step": 199270 + }, + { + "epoch": 9.897685507102414, + "grad_norm": 0.2021484375, + "learning_rate": 8.189132810171849e-06, + "loss": 0.4679, + "step": 199280 + }, + { + "epoch": 9.898182179398033, + "grad_norm": 0.2119140625, + "learning_rate": 8.149399026522302e-06, + "loss": 0.512, + "step": 199290 + }, + { + "epoch": 9.898678851693653, + "grad_norm": 0.2314453125, + "learning_rate": 8.109665242872753e-06, + "loss": 0.4922, + "step": 199300 + }, + { + "epoch": 9.899175523989271, + "grad_norm": 0.216796875, + "learning_rate": 8.069931459223205e-06, + "loss": 0.4891, + "step": 199310 + }, + { + "epoch": 9.899672196284891, + "grad_norm": 0.291015625, + "learning_rate": 8.030197675573656e-06, + "loss": 0.5064, + "step": 199320 + }, + { + "epoch": 9.900168868580511, + "grad_norm": 0.1923828125, + "learning_rate": 7.990463891924109e-06, + "loss": 0.4944, + "step": 199330 + }, + { + "epoch": 9.90066554087613, + "grad_norm": 0.1923828125, + "learning_rate": 7.95073010827456e-06, + "loss": 0.4719, + "step": 199340 + }, + { + "epoch": 9.90116221317175, + "grad_norm": 0.2119140625, + "learning_rate": 7.910996324625013e-06, + "loss": 0.4804, + "step": 199350 + }, + { + "epoch": 9.901658885467368, + "grad_norm": 0.197265625, + "learning_rate": 7.871262540975464e-06, + "loss": 0.5012, + "step": 199360 + }, + { + "epoch": 9.902155557762988, + "grad_norm": 0.2451171875, + "learning_rate": 7.831528757325917e-06, + "loss": 0.5368, + "step": 199370 + }, + { + "epoch": 9.902652230058607, + "grad_norm": 0.212890625, + "learning_rate": 7.791794973676368e-06, + "loss": 0.5028, + "step": 199380 + }, + { + "epoch": 9.903148902354227, + "grad_norm": 0.19921875, + "learning_rate": 7.75206119002682e-06, + "loss": 0.4807, + "step": 199390 + }, + { + "epoch": 9.903645574649847, + "grad_norm": 0.2060546875, + "learning_rate": 7.712327406377273e-06, + "loss": 0.4862, + "step": 199400 + }, + { + "epoch": 9.904142246945465, + "grad_norm": 0.21484375, + "learning_rate": 7.672593622727726e-06, + "loss": 0.5135, + "step": 199410 + }, + { + "epoch": 9.904638919241085, + "grad_norm": 0.2060546875, + "learning_rate": 7.632859839078177e-06, + "loss": 0.4843, + "step": 199420 + }, + { + "epoch": 9.905135591536704, + "grad_norm": 0.1865234375, + "learning_rate": 7.593126055428629e-06, + "loss": 0.5021, + "step": 199430 + }, + { + "epoch": 9.905632263832324, + "grad_norm": 0.216796875, + "learning_rate": 7.5533922717790806e-06, + "loss": 0.4686, + "step": 199440 + }, + { + "epoch": 9.906128936127942, + "grad_norm": 0.2255859375, + "learning_rate": 7.5136584881295324e-06, + "loss": 0.4893, + "step": 199450 + }, + { + "epoch": 9.906625608423562, + "grad_norm": 0.216796875, + "learning_rate": 7.473924704479984e-06, + "loss": 0.4977, + "step": 199460 + }, + { + "epoch": 9.907122280719182, + "grad_norm": 0.23046875, + "learning_rate": 7.434190920830436e-06, + "loss": 0.517, + "step": 199470 + }, + { + "epoch": 9.9076189530148, + "grad_norm": 0.2275390625, + "learning_rate": 7.394457137180888e-06, + "loss": 0.4733, + "step": 199480 + }, + { + "epoch": 9.90811562531042, + "grad_norm": 0.21484375, + "learning_rate": 7.35472335353134e-06, + "loss": 0.4651, + "step": 199490 + }, + { + "epoch": 9.90861229760604, + "grad_norm": 0.2099609375, + "learning_rate": 7.314989569881793e-06, + "loss": 0.4776, + "step": 199500 + }, + { + "epoch": 9.90910896990166, + "grad_norm": 0.2265625, + "learning_rate": 7.2752557862322445e-06, + "loss": 0.4696, + "step": 199510 + }, + { + "epoch": 9.909605642197278, + "grad_norm": 0.2060546875, + "learning_rate": 7.235522002582696e-06, + "loss": 0.4766, + "step": 199520 + }, + { + "epoch": 9.910102314492898, + "grad_norm": 0.20703125, + "learning_rate": 7.195788218933148e-06, + "loss": 0.4903, + "step": 199530 + }, + { + "epoch": 9.910598986788518, + "grad_norm": 0.2001953125, + "learning_rate": 7.1560544352836e-06, + "loss": 0.4591, + "step": 199540 + }, + { + "epoch": 9.911095659084136, + "grad_norm": 0.22265625, + "learning_rate": 7.116320651634052e-06, + "loss": 0.4726, + "step": 199550 + }, + { + "epoch": 9.911592331379756, + "grad_norm": 0.189453125, + "learning_rate": 7.076586867984504e-06, + "loss": 0.5133, + "step": 199560 + }, + { + "epoch": 9.912089003675375, + "grad_norm": 0.2333984375, + "learning_rate": 7.036853084334956e-06, + "loss": 0.4847, + "step": 199570 + }, + { + "epoch": 9.912585675970995, + "grad_norm": 0.216796875, + "learning_rate": 6.997119300685408e-06, + "loss": 0.5002, + "step": 199580 + }, + { + "epoch": 9.913082348266613, + "grad_norm": 0.220703125, + "learning_rate": 6.9573855170358595e-06, + "loss": 0.4941, + "step": 199590 + }, + { + "epoch": 9.913579020562233, + "grad_norm": 0.220703125, + "learning_rate": 6.917651733386311e-06, + "loss": 0.5123, + "step": 199600 + }, + { + "epoch": 9.914075692857853, + "grad_norm": 0.2119140625, + "learning_rate": 6.877917949736765e-06, + "loss": 0.5071, + "step": 199610 + }, + { + "epoch": 9.914572365153472, + "grad_norm": 0.2236328125, + "learning_rate": 6.838184166087217e-06, + "loss": 0.4905, + "step": 199620 + }, + { + "epoch": 9.915069037449092, + "grad_norm": 0.2080078125, + "learning_rate": 6.798450382437669e-06, + "loss": 0.4938, + "step": 199630 + }, + { + "epoch": 9.91556570974471, + "grad_norm": 0.232421875, + "learning_rate": 6.75871659878812e-06, + "loss": 0.4943, + "step": 199640 + }, + { + "epoch": 9.91606238204033, + "grad_norm": 0.2021484375, + "learning_rate": 6.718982815138572e-06, + "loss": 0.4932, + "step": 199650 + }, + { + "epoch": 9.916559054335949, + "grad_norm": 0.29296875, + "learning_rate": 6.6792490314890235e-06, + "loss": 0.4677, + "step": 199660 + }, + { + "epoch": 9.917055726631569, + "grad_norm": 0.21875, + "learning_rate": 6.639515247839475e-06, + "loss": 0.4805, + "step": 199670 + }, + { + "epoch": 9.917552398927189, + "grad_norm": 0.25390625, + "learning_rate": 6.599781464189927e-06, + "loss": 0.5073, + "step": 199680 + }, + { + "epoch": 9.918049071222807, + "grad_norm": 0.1982421875, + "learning_rate": 6.560047680540379e-06, + "loss": 0.4645, + "step": 199690 + }, + { + "epoch": 9.918545743518427, + "grad_norm": 0.203125, + "learning_rate": 6.520313896890831e-06, + "loss": 0.458, + "step": 199700 + }, + { + "epoch": 9.919042415814046, + "grad_norm": 0.20703125, + "learning_rate": 6.4805801132412845e-06, + "loss": 0.4685, + "step": 199710 + }, + { + "epoch": 9.919539088109666, + "grad_norm": 0.2041015625, + "learning_rate": 6.440846329591736e-06, + "loss": 0.4949, + "step": 199720 + }, + { + "epoch": 9.920035760405284, + "grad_norm": 0.203125, + "learning_rate": 6.401112545942188e-06, + "loss": 0.4867, + "step": 199730 + }, + { + "epoch": 9.920532432700904, + "grad_norm": 0.2080078125, + "learning_rate": 6.36137876229264e-06, + "loss": 0.4741, + "step": 199740 + }, + { + "epoch": 9.921029104996522, + "grad_norm": 0.271484375, + "learning_rate": 6.321644978643092e-06, + "loss": 0.4931, + "step": 199750 + }, + { + "epoch": 9.921525777292143, + "grad_norm": 0.2060546875, + "learning_rate": 6.281911194993544e-06, + "loss": 0.4854, + "step": 199760 + }, + { + "epoch": 9.922022449587763, + "grad_norm": 0.23046875, + "learning_rate": 6.242177411343996e-06, + "loss": 0.4866, + "step": 199770 + }, + { + "epoch": 9.922519121883381, + "grad_norm": 0.2119140625, + "learning_rate": 6.202443627694448e-06, + "loss": 0.4936, + "step": 199780 + }, + { + "epoch": 9.923015794179001, + "grad_norm": 0.2138671875, + "learning_rate": 6.1627098440448995e-06, + "loss": 0.4695, + "step": 199790 + }, + { + "epoch": 9.92351246647462, + "grad_norm": 0.205078125, + "learning_rate": 6.122976060395351e-06, + "loss": 0.5042, + "step": 199800 + }, + { + "epoch": 9.92400913877024, + "grad_norm": 0.208984375, + "learning_rate": 6.083242276745803e-06, + "loss": 0.4995, + "step": 199810 + }, + { + "epoch": 9.924505811065858, + "grad_norm": 0.2158203125, + "learning_rate": 6.043508493096255e-06, + "loss": 0.4638, + "step": 199820 + }, + { + "epoch": 9.925002483361478, + "grad_norm": 0.224609375, + "learning_rate": 6.003774709446707e-06, + "loss": 0.4836, + "step": 199830 + }, + { + "epoch": 9.925499155657098, + "grad_norm": 0.1943359375, + "learning_rate": 5.964040925797159e-06, + "loss": 0.4783, + "step": 199840 + }, + { + "epoch": 9.925995827952717, + "grad_norm": 0.185546875, + "learning_rate": 5.924307142147612e-06, + "loss": 0.4765, + "step": 199850 + }, + { + "epoch": 9.926492500248337, + "grad_norm": 0.2109375, + "learning_rate": 5.8845733584980635e-06, + "loss": 0.5236, + "step": 199860 + }, + { + "epoch": 9.926989172543955, + "grad_norm": 0.2431640625, + "learning_rate": 5.844839574848515e-06, + "loss": 0.5028, + "step": 199870 + }, + { + "epoch": 9.927485844839575, + "grad_norm": 0.208984375, + "learning_rate": 5.805105791198967e-06, + "loss": 0.4942, + "step": 199880 + }, + { + "epoch": 9.927982517135193, + "grad_norm": 0.21875, + "learning_rate": 5.765372007549419e-06, + "loss": 0.5034, + "step": 199890 + }, + { + "epoch": 9.928479189430814, + "grad_norm": 0.205078125, + "learning_rate": 5.725638223899871e-06, + "loss": 0.4904, + "step": 199900 + }, + { + "epoch": 9.928975861726434, + "grad_norm": 0.2109375, + "learning_rate": 5.685904440250323e-06, + "loss": 0.5059, + "step": 199910 + }, + { + "epoch": 9.929472534022052, + "grad_norm": 0.2119140625, + "learning_rate": 5.646170656600775e-06, + "loss": 0.4754, + "step": 199920 + }, + { + "epoch": 9.929969206317672, + "grad_norm": 0.212890625, + "learning_rate": 5.606436872951227e-06, + "loss": 0.5001, + "step": 199930 + }, + { + "epoch": 9.93046587861329, + "grad_norm": 0.205078125, + "learning_rate": 5.5667030893016785e-06, + "loss": 0.4922, + "step": 199940 + }, + { + "epoch": 9.93096255090891, + "grad_norm": 0.21875, + "learning_rate": 5.526969305652131e-06, + "loss": 0.4963, + "step": 199950 + }, + { + "epoch": 9.931459223204529, + "grad_norm": 0.2451171875, + "learning_rate": 5.487235522002583e-06, + "loss": 0.5201, + "step": 199960 + }, + { + "epoch": 9.931955895500149, + "grad_norm": 0.2119140625, + "learning_rate": 5.447501738353035e-06, + "loss": 0.4807, + "step": 199970 + }, + { + "epoch": 9.93245256779577, + "grad_norm": 0.208984375, + "learning_rate": 5.407767954703487e-06, + "loss": 0.4466, + "step": 199980 + }, + { + "epoch": 9.932949240091387, + "grad_norm": 0.2333984375, + "learning_rate": 5.368034171053939e-06, + "loss": 0.4951, + "step": 199990 + }, + { + "epoch": 9.933445912387008, + "grad_norm": 0.201171875, + "learning_rate": 5.3283003874043914e-06, + "loss": 0.4759, + "step": 200000 + }, + { + "epoch": 9.933942584682626, + "grad_norm": 0.20703125, + "learning_rate": 5.288566603754843e-06, + "loss": 0.4694, + "step": 200010 + }, + { + "epoch": 9.934439256978246, + "grad_norm": 0.189453125, + "learning_rate": 5.248832820105295e-06, + "loss": 0.4585, + "step": 200020 + }, + { + "epoch": 9.934935929273864, + "grad_norm": 0.216796875, + "learning_rate": 5.209099036455747e-06, + "loss": 0.4969, + "step": 200030 + }, + { + "epoch": 9.935432601569484, + "grad_norm": 0.201171875, + "learning_rate": 5.169365252806199e-06, + "loss": 0.4757, + "step": 200040 + }, + { + "epoch": 9.935929273865105, + "grad_norm": 0.2158203125, + "learning_rate": 5.12963146915665e-06, + "loss": 0.4838, + "step": 200050 + }, + { + "epoch": 9.936425946160723, + "grad_norm": 0.2041015625, + "learning_rate": 5.089897685507103e-06, + "loss": 0.4847, + "step": 200060 + }, + { + "epoch": 9.936922618456343, + "grad_norm": 0.2109375, + "learning_rate": 5.0501639018575545e-06, + "loss": 0.4847, + "step": 200070 + }, + { + "epoch": 9.937419290751961, + "grad_norm": 0.2060546875, + "learning_rate": 5.010430118208006e-06, + "loss": 0.4708, + "step": 200080 + }, + { + "epoch": 9.937915963047582, + "grad_norm": 0.18359375, + "learning_rate": 4.970696334558458e-06, + "loss": 0.4802, + "step": 200090 + }, + { + "epoch": 9.9384126353432, + "grad_norm": 0.197265625, + "learning_rate": 4.93096255090891e-06, + "loss": 0.4859, + "step": 200100 + }, + { + "epoch": 9.93890930763882, + "grad_norm": 0.185546875, + "learning_rate": 4.891228767259363e-06, + "loss": 0.4903, + "step": 200110 + }, + { + "epoch": 9.93940597993444, + "grad_norm": 0.2314453125, + "learning_rate": 4.851494983609815e-06, + "loss": 0.5034, + "step": 200120 + }, + { + "epoch": 9.939902652230058, + "grad_norm": 0.236328125, + "learning_rate": 4.811761199960267e-06, + "loss": 0.4839, + "step": 200130 + }, + { + "epoch": 9.940399324525679, + "grad_norm": 0.201171875, + "learning_rate": 4.7720274163107185e-06, + "loss": 0.4631, + "step": 200140 + }, + { + "epoch": 9.940895996821297, + "grad_norm": 0.205078125, + "learning_rate": 4.73229363266117e-06, + "loss": 0.4924, + "step": 200150 + }, + { + "epoch": 9.941392669116917, + "grad_norm": 0.2001953125, + "learning_rate": 4.692559849011622e-06, + "loss": 0.4681, + "step": 200160 + }, + { + "epoch": 9.941889341412535, + "grad_norm": 0.201171875, + "learning_rate": 4.652826065362074e-06, + "loss": 0.4945, + "step": 200170 + }, + { + "epoch": 9.942386013708155, + "grad_norm": 0.2216796875, + "learning_rate": 4.613092281712526e-06, + "loss": 0.4998, + "step": 200180 + }, + { + "epoch": 9.942882686003774, + "grad_norm": 0.236328125, + "learning_rate": 4.573358498062978e-06, + "loss": 0.5073, + "step": 200190 + }, + { + "epoch": 9.943379358299394, + "grad_norm": 0.20703125, + "learning_rate": 4.53362471441343e-06, + "loss": 0.5035, + "step": 200200 + }, + { + "epoch": 9.943876030595014, + "grad_norm": 0.2373046875, + "learning_rate": 4.4938909307638825e-06, + "loss": 0.5029, + "step": 200210 + }, + { + "epoch": 9.944372702890632, + "grad_norm": 0.21484375, + "learning_rate": 4.454157147114334e-06, + "loss": 0.511, + "step": 200220 + }, + { + "epoch": 9.944869375186252, + "grad_norm": 0.216796875, + "learning_rate": 4.414423363464786e-06, + "loss": 0.5102, + "step": 200230 + }, + { + "epoch": 9.94536604748187, + "grad_norm": 0.2333984375, + "learning_rate": 4.374689579815238e-06, + "loss": 0.5198, + "step": 200240 + }, + { + "epoch": 9.945862719777491, + "grad_norm": 0.21484375, + "learning_rate": 4.33495579616569e-06, + "loss": 0.482, + "step": 200250 + }, + { + "epoch": 9.94635939207311, + "grad_norm": 0.2109375, + "learning_rate": 4.295222012516142e-06, + "loss": 0.5265, + "step": 200260 + }, + { + "epoch": 9.94685606436873, + "grad_norm": 0.21484375, + "learning_rate": 4.2554882288665946e-06, + "loss": 0.5336, + "step": 200270 + }, + { + "epoch": 9.94735273666435, + "grad_norm": 0.25390625, + "learning_rate": 4.2157544452170464e-06, + "loss": 0.5028, + "step": 200280 + }, + { + "epoch": 9.947849408959968, + "grad_norm": 0.228515625, + "learning_rate": 4.176020661567498e-06, + "loss": 0.4794, + "step": 200290 + }, + { + "epoch": 9.948346081255588, + "grad_norm": 0.234375, + "learning_rate": 4.13628687791795e-06, + "loss": 0.5099, + "step": 200300 + }, + { + "epoch": 9.948842753551206, + "grad_norm": 0.1943359375, + "learning_rate": 4.096553094268401e-06, + "loss": 0.4611, + "step": 200310 + }, + { + "epoch": 9.949339425846826, + "grad_norm": 0.197265625, + "learning_rate": 4.056819310618854e-06, + "loss": 0.466, + "step": 200320 + }, + { + "epoch": 9.949836098142445, + "grad_norm": 0.2314453125, + "learning_rate": 4.017085526969306e-06, + "loss": 0.482, + "step": 200330 + }, + { + "epoch": 9.950332770438065, + "grad_norm": 0.2119140625, + "learning_rate": 3.977351743319758e-06, + "loss": 0.4685, + "step": 200340 + }, + { + "epoch": 9.950829442733685, + "grad_norm": 0.2109375, + "learning_rate": 3.9376179596702096e-06, + "loss": 0.4698, + "step": 200350 + }, + { + "epoch": 9.951326115029303, + "grad_norm": 0.1767578125, + "learning_rate": 3.8978841760206614e-06, + "loss": 0.4589, + "step": 200360 + }, + { + "epoch": 9.951822787324923, + "grad_norm": 0.205078125, + "learning_rate": 3.858150392371114e-06, + "loss": 0.4901, + "step": 200370 + }, + { + "epoch": 9.952319459620542, + "grad_norm": 0.212890625, + "learning_rate": 3.818416608721566e-06, + "loss": 0.4826, + "step": 200380 + }, + { + "epoch": 9.952816131916162, + "grad_norm": 0.2177734375, + "learning_rate": 3.778682825072018e-06, + "loss": 0.4923, + "step": 200390 + }, + { + "epoch": 9.95331280421178, + "grad_norm": 0.2001953125, + "learning_rate": 3.7389490414224694e-06, + "loss": 0.4776, + "step": 200400 + }, + { + "epoch": 9.9538094765074, + "grad_norm": 0.236328125, + "learning_rate": 3.6992152577729212e-06, + "loss": 0.4819, + "step": 200410 + }, + { + "epoch": 9.95430614880302, + "grad_norm": 0.201171875, + "learning_rate": 3.659481474123374e-06, + "loss": 0.4963, + "step": 200420 + }, + { + "epoch": 9.954802821098639, + "grad_norm": 0.2197265625, + "learning_rate": 3.619747690473826e-06, + "loss": 0.4885, + "step": 200430 + }, + { + "epoch": 9.955299493394259, + "grad_norm": 0.2060546875, + "learning_rate": 3.5800139068242777e-06, + "loss": 0.466, + "step": 200440 + }, + { + "epoch": 9.955796165689877, + "grad_norm": 0.201171875, + "learning_rate": 3.5402801231747296e-06, + "loss": 0.4784, + "step": 200450 + }, + { + "epoch": 9.956292837985497, + "grad_norm": 0.2216796875, + "learning_rate": 3.5005463395251814e-06, + "loss": 0.5032, + "step": 200460 + }, + { + "epoch": 9.956789510281116, + "grad_norm": 0.2001953125, + "learning_rate": 3.4608125558756333e-06, + "loss": 0.509, + "step": 200470 + }, + { + "epoch": 9.957286182576736, + "grad_norm": 0.19140625, + "learning_rate": 3.4210787722260856e-06, + "loss": 0.4765, + "step": 200480 + }, + { + "epoch": 9.957782854872356, + "grad_norm": 0.21484375, + "learning_rate": 3.3813449885765375e-06, + "loss": 0.492, + "step": 200490 + }, + { + "epoch": 9.958279527167974, + "grad_norm": 0.2060546875, + "learning_rate": 3.3416112049269894e-06, + "loss": 0.5059, + "step": 200500 + }, + { + "epoch": 9.958776199463594, + "grad_norm": 0.2021484375, + "learning_rate": 3.3018774212774412e-06, + "loss": 0.4879, + "step": 200510 + }, + { + "epoch": 9.959272871759213, + "grad_norm": 0.2080078125, + "learning_rate": 3.262143637627893e-06, + "loss": 0.4838, + "step": 200520 + }, + { + "epoch": 9.959769544054833, + "grad_norm": 0.2021484375, + "learning_rate": 3.2224098539783454e-06, + "loss": 0.4625, + "step": 200530 + }, + { + "epoch": 9.960266216350451, + "grad_norm": 0.2275390625, + "learning_rate": 3.1826760703287973e-06, + "loss": 0.5267, + "step": 200540 + }, + { + "epoch": 9.960762888646071, + "grad_norm": 0.1875, + "learning_rate": 3.142942286679249e-06, + "loss": 0.4634, + "step": 200550 + }, + { + "epoch": 9.961259560941691, + "grad_norm": 0.2119140625, + "learning_rate": 3.103208503029701e-06, + "loss": 0.4889, + "step": 200560 + }, + { + "epoch": 9.96175623323731, + "grad_norm": 0.251953125, + "learning_rate": 3.0634747193801533e-06, + "loss": 0.4917, + "step": 200570 + }, + { + "epoch": 9.96225290553293, + "grad_norm": 0.205078125, + "learning_rate": 3.023740935730605e-06, + "loss": 0.496, + "step": 200580 + }, + { + "epoch": 9.962749577828548, + "grad_norm": 0.2080078125, + "learning_rate": 2.984007152081057e-06, + "loss": 0.4951, + "step": 200590 + }, + { + "epoch": 9.963246250124168, + "grad_norm": 0.20703125, + "learning_rate": 2.944273368431509e-06, + "loss": 0.5108, + "step": 200600 + }, + { + "epoch": 9.963742922419787, + "grad_norm": 0.236328125, + "learning_rate": 2.904539584781961e-06, + "loss": 0.4966, + "step": 200610 + }, + { + "epoch": 9.964239594715407, + "grad_norm": 0.232421875, + "learning_rate": 2.864805801132413e-06, + "loss": 0.517, + "step": 200620 + }, + { + "epoch": 9.964736267011027, + "grad_norm": 0.2109375, + "learning_rate": 2.825072017482865e-06, + "loss": 0.4991, + "step": 200630 + }, + { + "epoch": 9.965232939306645, + "grad_norm": 0.2001953125, + "learning_rate": 2.785338233833317e-06, + "loss": 0.4462, + "step": 200640 + }, + { + "epoch": 9.965729611602265, + "grad_norm": 0.2099609375, + "learning_rate": 2.745604450183769e-06, + "loss": 0.5046, + "step": 200650 + }, + { + "epoch": 9.966226283897884, + "grad_norm": 0.208984375, + "learning_rate": 2.7058706665342206e-06, + "loss": 0.4744, + "step": 200660 + }, + { + "epoch": 9.966722956193504, + "grad_norm": 0.2060546875, + "learning_rate": 2.666136882884673e-06, + "loss": 0.498, + "step": 200670 + }, + { + "epoch": 9.967219628489122, + "grad_norm": 0.2177734375, + "learning_rate": 2.6264030992351248e-06, + "loss": 0.5009, + "step": 200680 + }, + { + "epoch": 9.967716300784742, + "grad_norm": 0.2197265625, + "learning_rate": 2.5866693155855767e-06, + "loss": 0.4931, + "step": 200690 + }, + { + "epoch": 9.968212973080362, + "grad_norm": 0.197265625, + "learning_rate": 2.546935531936029e-06, + "loss": 0.4721, + "step": 200700 + }, + { + "epoch": 9.96870964537598, + "grad_norm": 0.193359375, + "learning_rate": 2.507201748286481e-06, + "loss": 0.4798, + "step": 200710 + }, + { + "epoch": 9.9692063176716, + "grad_norm": 0.2109375, + "learning_rate": 2.4674679646369327e-06, + "loss": 0.4699, + "step": 200720 + }, + { + "epoch": 9.96970298996722, + "grad_norm": 0.1982421875, + "learning_rate": 2.4277341809873846e-06, + "loss": 0.5015, + "step": 200730 + }, + { + "epoch": 9.97019966226284, + "grad_norm": 0.2001953125, + "learning_rate": 2.3880003973378365e-06, + "loss": 0.4943, + "step": 200740 + }, + { + "epoch": 9.970696334558458, + "grad_norm": 0.2236328125, + "learning_rate": 2.3482666136882887e-06, + "loss": 0.4559, + "step": 200750 + }, + { + "epoch": 9.971193006854078, + "grad_norm": 0.234375, + "learning_rate": 2.3085328300387406e-06, + "loss": 0.503, + "step": 200760 + }, + { + "epoch": 9.971689679149698, + "grad_norm": 0.2021484375, + "learning_rate": 2.2687990463891925e-06, + "loss": 0.5, + "step": 200770 + }, + { + "epoch": 9.972186351445316, + "grad_norm": 0.2333984375, + "learning_rate": 2.229065262739645e-06, + "loss": 0.4934, + "step": 200780 + }, + { + "epoch": 9.972683023740936, + "grad_norm": 0.2099609375, + "learning_rate": 2.1893314790900962e-06, + "loss": 0.5046, + "step": 200790 + }, + { + "epoch": 9.973179696036555, + "grad_norm": 0.263671875, + "learning_rate": 2.149597695440548e-06, + "loss": 0.5024, + "step": 200800 + }, + { + "epoch": 9.973676368332175, + "grad_norm": 0.220703125, + "learning_rate": 2.1098639117910004e-06, + "loss": 0.4862, + "step": 200810 + }, + { + "epoch": 9.974173040627793, + "grad_norm": 0.1943359375, + "learning_rate": 2.0701301281414523e-06, + "loss": 0.4961, + "step": 200820 + }, + { + "epoch": 9.974669712923413, + "grad_norm": 0.205078125, + "learning_rate": 2.0303963444919046e-06, + "loss": 0.4749, + "step": 200830 + }, + { + "epoch": 9.975166385219033, + "grad_norm": 0.21484375, + "learning_rate": 1.9906625608423565e-06, + "loss": 0.4709, + "step": 200840 + }, + { + "epoch": 9.975663057514652, + "grad_norm": 0.2578125, + "learning_rate": 1.9509287771928083e-06, + "loss": 0.5292, + "step": 200850 + }, + { + "epoch": 9.976159729810272, + "grad_norm": 0.2060546875, + "learning_rate": 1.91119499354326e-06, + "loss": 0.4629, + "step": 200860 + }, + { + "epoch": 9.97665640210589, + "grad_norm": 0.2109375, + "learning_rate": 1.871461209893712e-06, + "loss": 0.4717, + "step": 200870 + }, + { + "epoch": 9.97715307440151, + "grad_norm": 0.23828125, + "learning_rate": 1.831727426244164e-06, + "loss": 0.4822, + "step": 200880 + }, + { + "epoch": 9.977649746697129, + "grad_norm": 0.205078125, + "learning_rate": 1.7919936425946163e-06, + "loss": 0.5102, + "step": 200890 + }, + { + "epoch": 9.978146418992749, + "grad_norm": 0.2099609375, + "learning_rate": 1.7522598589450681e-06, + "loss": 0.4861, + "step": 200900 + }, + { + "epoch": 9.978643091288369, + "grad_norm": 0.2294921875, + "learning_rate": 1.7125260752955202e-06, + "loss": 0.4787, + "step": 200910 + }, + { + "epoch": 9.979139763583987, + "grad_norm": 0.21484375, + "learning_rate": 1.672792291645972e-06, + "loss": 0.4888, + "step": 200920 + }, + { + "epoch": 9.979636435879607, + "grad_norm": 0.21875, + "learning_rate": 1.633058507996424e-06, + "loss": 0.4792, + "step": 200930 + }, + { + "epoch": 9.980133108175226, + "grad_norm": 0.2080078125, + "learning_rate": 1.593324724346876e-06, + "loss": 0.5013, + "step": 200940 + }, + { + "epoch": 9.980629780470846, + "grad_norm": 0.181640625, + "learning_rate": 1.553590940697328e-06, + "loss": 0.4623, + "step": 200950 + }, + { + "epoch": 9.981126452766464, + "grad_norm": 0.2080078125, + "learning_rate": 1.51385715704778e-06, + "loss": 0.4763, + "step": 200960 + }, + { + "epoch": 9.981623125062084, + "grad_norm": 0.2216796875, + "learning_rate": 1.4741233733982319e-06, + "loss": 0.5116, + "step": 200970 + }, + { + "epoch": 9.982119797357704, + "grad_norm": 0.2119140625, + "learning_rate": 1.434389589748684e-06, + "loss": 0.484, + "step": 200980 + }, + { + "epoch": 9.982616469653323, + "grad_norm": 0.193359375, + "learning_rate": 1.3946558060991358e-06, + "loss": 0.4718, + "step": 200990 + }, + { + "epoch": 9.983113141948943, + "grad_norm": 0.2177734375, + "learning_rate": 1.3549220224495877e-06, + "loss": 0.4857, + "step": 201000 + }, + { + "epoch": 9.983609814244561, + "grad_norm": 0.19921875, + "learning_rate": 1.3151882388000398e-06, + "loss": 0.4885, + "step": 201010 + }, + { + "epoch": 9.984106486540181, + "grad_norm": 0.220703125, + "learning_rate": 1.2754544551504919e-06, + "loss": 0.4994, + "step": 201020 + }, + { + "epoch": 9.9846031588358, + "grad_norm": 0.1923828125, + "learning_rate": 1.2357206715009438e-06, + "loss": 0.5013, + "step": 201030 + }, + { + "epoch": 9.98509983113142, + "grad_norm": 0.2099609375, + "learning_rate": 1.1959868878513956e-06, + "loss": 0.4846, + "step": 201040 + }, + { + "epoch": 9.98559650342704, + "grad_norm": 0.197265625, + "learning_rate": 1.1562531042018477e-06, + "loss": 0.4439, + "step": 201050 + }, + { + "epoch": 9.986093175722658, + "grad_norm": 0.22265625, + "learning_rate": 1.1165193205522998e-06, + "loss": 0.4679, + "step": 201060 + }, + { + "epoch": 9.986589848018278, + "grad_norm": 0.2099609375, + "learning_rate": 1.0767855369027515e-06, + "loss": 0.4808, + "step": 201070 + }, + { + "epoch": 9.987086520313897, + "grad_norm": 0.1982421875, + "learning_rate": 1.0370517532532036e-06, + "loss": 0.5098, + "step": 201080 + }, + { + "epoch": 9.987583192609517, + "grad_norm": 0.2041015625, + "learning_rate": 9.973179696036556e-07, + "loss": 0.4858, + "step": 201090 + }, + { + "epoch": 9.988079864905135, + "grad_norm": 0.197265625, + "learning_rate": 9.575841859541075e-07, + "loss": 0.4841, + "step": 201100 + }, + { + "epoch": 9.988576537200755, + "grad_norm": 0.220703125, + "learning_rate": 9.178504023045594e-07, + "loss": 0.4603, + "step": 201110 + }, + { + "epoch": 9.989073209496375, + "grad_norm": 0.2177734375, + "learning_rate": 8.781166186550115e-07, + "loss": 0.4741, + "step": 201120 + }, + { + "epoch": 9.989569881791994, + "grad_norm": 0.1953125, + "learning_rate": 8.383828350054634e-07, + "loss": 0.4818, + "step": 201130 + }, + { + "epoch": 9.990066554087614, + "grad_norm": 0.216796875, + "learning_rate": 7.986490513559154e-07, + "loss": 0.4931, + "step": 201140 + }, + { + "epoch": 9.990563226383232, + "grad_norm": 0.212890625, + "learning_rate": 7.589152677063674e-07, + "loss": 0.51, + "step": 201150 + }, + { + "epoch": 9.991059898678852, + "grad_norm": 0.1865234375, + "learning_rate": 7.191814840568193e-07, + "loss": 0.4486, + "step": 201160 + }, + { + "epoch": 9.99155657097447, + "grad_norm": 0.2216796875, + "learning_rate": 6.794477004072713e-07, + "loss": 0.5048, + "step": 201170 + }, + { + "epoch": 9.99205324327009, + "grad_norm": 0.205078125, + "learning_rate": 6.397139167577233e-07, + "loss": 0.5031, + "step": 201180 + }, + { + "epoch": 9.992549915565709, + "grad_norm": 0.1943359375, + "learning_rate": 5.999801331081752e-07, + "loss": 0.5323, + "step": 201190 + }, + { + "epoch": 9.993046587861329, + "grad_norm": 0.2177734375, + "learning_rate": 5.602463494586273e-07, + "loss": 0.4506, + "step": 201200 + }, + { + "epoch": 9.99354326015695, + "grad_norm": 0.2109375, + "learning_rate": 5.205125658090792e-07, + "loss": 0.4974, + "step": 201210 + }, + { + "epoch": 9.994039932452567, + "grad_norm": 0.25, + "learning_rate": 4.807787821595312e-07, + "loss": 0.5025, + "step": 201220 + }, + { + "epoch": 9.994536604748188, + "grad_norm": 0.2294921875, + "learning_rate": 4.4104499850998314e-07, + "loss": 0.5047, + "step": 201230 + }, + { + "epoch": 9.995033277043806, + "grad_norm": 0.2353515625, + "learning_rate": 4.013112148604351e-07, + "loss": 0.4562, + "step": 201240 + }, + { + "epoch": 9.995529949339426, + "grad_norm": 0.2109375, + "learning_rate": 3.6157743121088705e-07, + "loss": 0.4832, + "step": 201250 + }, + { + "epoch": 9.996026621635044, + "grad_norm": 0.1923828125, + "learning_rate": 3.2184364756133903e-07, + "loss": 0.5173, + "step": 201260 + }, + { + "epoch": 9.996523293930665, + "grad_norm": 0.2197265625, + "learning_rate": 2.82109863911791e-07, + "loss": 0.4914, + "step": 201270 + }, + { + "epoch": 9.997019966226285, + "grad_norm": 0.2138671875, + "learning_rate": 2.42376080262243e-07, + "loss": 0.5044, + "step": 201280 + }, + { + "epoch": 9.997516638521903, + "grad_norm": 0.1904296875, + "learning_rate": 2.0264229661269497e-07, + "loss": 0.4748, + "step": 201290 + }, + { + "epoch": 9.998013310817523, + "grad_norm": 0.2158203125, + "learning_rate": 1.6290851296314692e-07, + "loss": 0.4938, + "step": 201300 + }, + { + "epoch": 9.998509983113141, + "grad_norm": 0.2001953125, + "learning_rate": 1.231747293135989e-07, + "loss": 0.5312, + "step": 201310 + }, + { + "epoch": 9.999006655408762, + "grad_norm": 0.1875, + "learning_rate": 8.344094566405085e-08, + "loss": 0.4645, + "step": 201320 + }, + { + "epoch": 9.99950332770438, + "grad_norm": 0.2421875, + "learning_rate": 4.370716201450283e-08, + "loss": 0.4832, + "step": 201330 + }, + { + "epoch": 10.0, + "grad_norm": 0.203125, + "learning_rate": 3.9733783649548035e-09, + "loss": 0.4821, + "step": 201340 + }, + { + "epoch": 10.0, + "step": 201340, + "total_flos": 1.6346195493598003e+18, + "train_loss": 0.5436877673617467, + "train_runtime": 108051.857, + "train_samples_per_second": 7.453, + "train_steps_per_second": 1.863 + } + ], + "logging_steps": 10, + "max_steps": 201340, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6346195493598003e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}