diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,35034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 29.10360884749709, + "eval_steps": 500, + "global_step": 50000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005820721769499418, + "grad_norm": 4.032624244689941, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.1314, + "step": 10 + }, + { + "epoch": 0.011641443538998836, + "grad_norm": 1.5682274103164673, + "learning_rate": 6.333333333333333e-07, + "loss": 1.0984, + "step": 20 + }, + { + "epoch": 0.017462165308498253, + "grad_norm": 2.392137289047241, + "learning_rate": 9.666666666666668e-07, + "loss": 1.0923, + "step": 30 + }, + { + "epoch": 0.023282887077997673, + "grad_norm": 1.9130572080612183, + "learning_rate": 1.3e-06, + "loss": 1.073, + "step": 40 + }, + { + "epoch": 0.02910360884749709, + "grad_norm": 1.5198686122894287, + "learning_rate": 1.6333333333333333e-06, + "loss": 1.0663, + "step": 50 + }, + { + "epoch": 0.034924330616996506, + "grad_norm": 1.414235234260559, + "learning_rate": 1.9666666666666668e-06, + "loss": 1.0682, + "step": 60 + }, + { + "epoch": 0.04074505238649592, + "grad_norm": 1.5277303457260132, + "learning_rate": 2.3e-06, + "loss": 1.0725, + "step": 70 + }, + { + "epoch": 0.046565774155995346, + "grad_norm": 1.5666190385818481, + "learning_rate": 2.6333333333333337e-06, + "loss": 1.0593, + "step": 80 + }, + { + "epoch": 0.05238649592549476, + "grad_norm": 1.3526203632354736, + "learning_rate": 2.966666666666667e-06, + "loss": 1.0772, + "step": 90 + }, + { + "epoch": 0.05820721769499418, + "grad_norm": 1.5236045122146606, + "learning_rate": 3.3e-06, + "loss": 1.0597, + "step": 100 + }, + { + "epoch": 0.0640279394644936, + "grad_norm": 1.249487042427063, + "learning_rate": 3.633333333333334e-06, + "loss": 1.0467, + "step": 110 + }, + { + "epoch": 0.06984866123399301, + "grad_norm": 1.5206713676452637, + "learning_rate": 3.966666666666667e-06, + "loss": 1.0459, + "step": 120 + }, + { + "epoch": 0.07566938300349244, + "grad_norm": 1.2953330278396606, + "learning_rate": 4.2999999999999995e-06, + "loss": 1.0696, + "step": 130 + }, + { + "epoch": 0.08149010477299184, + "grad_norm": 1.1768537759780884, + "learning_rate": 4.633333333333334e-06, + "loss": 1.0571, + "step": 140 + }, + { + "epoch": 0.08731082654249127, + "grad_norm": 1.8447908163070679, + "learning_rate": 4.966666666666667e-06, + "loss": 1.0598, + "step": 150 + }, + { + "epoch": 0.09313154831199069, + "grad_norm": 2.2257068157196045, + "learning_rate": 5.3e-06, + "loss": 1.0241, + "step": 160 + }, + { + "epoch": 0.0989522700814901, + "grad_norm": 2.487992286682129, + "learning_rate": 5.633333333333333e-06, + "loss": 1.0448, + "step": 170 + }, + { + "epoch": 0.10477299185098952, + "grad_norm": 1.7957653999328613, + "learning_rate": 5.9666666666666666e-06, + "loss": 1.0356, + "step": 180 + }, + { + "epoch": 0.11059371362048893, + "grad_norm": 1.517179250717163, + "learning_rate": 6.300000000000001e-06, + "loss": 1.0147, + "step": 190 + }, + { + "epoch": 0.11641443538998836, + "grad_norm": 2.659872531890869, + "learning_rate": 6.633333333333333e-06, + "loss": 1.047, + "step": 200 + }, + { + "epoch": 0.12223515715948778, + "grad_norm": 2.055488109588623, + "learning_rate": 6.966666666666667e-06, + "loss": 1.0252, + "step": 210 + }, + { + "epoch": 0.1280558789289872, + "grad_norm": 2.121302366256714, + "learning_rate": 7.2999999999999996e-06, + "loss": 1.0391, + "step": 220 + }, + { + "epoch": 0.13387660069848661, + "grad_norm": 2.0612199306488037, + "learning_rate": 7.633333333333334e-06, + "loss": 1.0048, + "step": 230 + }, + { + "epoch": 0.13969732246798602, + "grad_norm": 3.2754385471343994, + "learning_rate": 7.966666666666666e-06, + "loss": 0.997, + "step": 240 + }, + { + "epoch": 0.14551804423748546, + "grad_norm": 3.7705655097961426, + "learning_rate": 8.3e-06, + "loss": 1.0092, + "step": 250 + }, + { + "epoch": 0.15133876600698487, + "grad_norm": 5.143420696258545, + "learning_rate": 8.633333333333334e-06, + "loss": 0.9514, + "step": 260 + }, + { + "epoch": 0.15715948777648428, + "grad_norm": 5.071794033050537, + "learning_rate": 8.966666666666668e-06, + "loss": 0.9048, + "step": 270 + }, + { + "epoch": 0.1629802095459837, + "grad_norm": 6.457543849945068, + "learning_rate": 9.3e-06, + "loss": 0.8713, + "step": 280 + }, + { + "epoch": 0.16880093131548313, + "grad_norm": 10.855563163757324, + "learning_rate": 9.633333333333335e-06, + "loss": 0.8059, + "step": 290 + }, + { + "epoch": 0.17462165308498254, + "grad_norm": 4.343142032623291, + "learning_rate": 9.966666666666667e-06, + "loss": 0.7843, + "step": 300 + }, + { + "epoch": 0.18044237485448195, + "grad_norm": 4.996496677398682, + "learning_rate": 1.03e-05, + "loss": 0.6912, + "step": 310 + }, + { + "epoch": 0.18626309662398138, + "grad_norm": 5.401365280151367, + "learning_rate": 1.0633333333333334e-05, + "loss": 0.6328, + "step": 320 + }, + { + "epoch": 0.1920838183934808, + "grad_norm": 8.77446460723877, + "learning_rate": 1.0966666666666666e-05, + "loss": 0.5901, + "step": 330 + }, + { + "epoch": 0.1979045401629802, + "grad_norm": 10.216883659362793, + "learning_rate": 1.13e-05, + "loss": 0.5226, + "step": 340 + }, + { + "epoch": 0.20372526193247964, + "grad_norm": 6.220579147338867, + "learning_rate": 1.1633333333333334e-05, + "loss": 0.5177, + "step": 350 + }, + { + "epoch": 0.20954598370197905, + "grad_norm": 4.358880519866943, + "learning_rate": 1.1966666666666668e-05, + "loss": 0.4509, + "step": 360 + }, + { + "epoch": 0.21536670547147846, + "grad_norm": 6.4626145362854, + "learning_rate": 1.23e-05, + "loss": 0.3981, + "step": 370 + }, + { + "epoch": 0.22118742724097787, + "grad_norm": 8.341089248657227, + "learning_rate": 1.2633333333333333e-05, + "loss": 0.3885, + "step": 380 + }, + { + "epoch": 0.2270081490104773, + "grad_norm": 6.410506725311279, + "learning_rate": 1.2966666666666669e-05, + "loss": 0.4087, + "step": 390 + }, + { + "epoch": 0.23282887077997672, + "grad_norm": 5.53407621383667, + "learning_rate": 1.3300000000000001e-05, + "loss": 0.3306, + "step": 400 + }, + { + "epoch": 0.23864959254947612, + "grad_norm": 3.7776386737823486, + "learning_rate": 1.3633333333333334e-05, + "loss": 0.3342, + "step": 410 + }, + { + "epoch": 0.24447031431897556, + "grad_norm": 5.946916580200195, + "learning_rate": 1.3966666666666666e-05, + "loss": 0.3279, + "step": 420 + }, + { + "epoch": 0.25029103608847497, + "grad_norm": 3.886380910873413, + "learning_rate": 1.43e-05, + "loss": 0.3176, + "step": 430 + }, + { + "epoch": 0.2561117578579744, + "grad_norm": 5.515733242034912, + "learning_rate": 1.4633333333333334e-05, + "loss": 0.2901, + "step": 440 + }, + { + "epoch": 0.2619324796274738, + "grad_norm": 3.7609305381774902, + "learning_rate": 1.4966666666666668e-05, + "loss": 0.2803, + "step": 450 + }, + { + "epoch": 0.26775320139697323, + "grad_norm": 4.293857097625732, + "learning_rate": 1.53e-05, + "loss": 0.2436, + "step": 460 + }, + { + "epoch": 0.27357392316647267, + "grad_norm": 4.057872295379639, + "learning_rate": 1.563333333333333e-05, + "loss": 0.241, + "step": 470 + }, + { + "epoch": 0.27939464493597205, + "grad_norm": 2.3653814792633057, + "learning_rate": 1.5966666666666667e-05, + "loss": 0.2043, + "step": 480 + }, + { + "epoch": 0.2852153667054715, + "grad_norm": 2.3830487728118896, + "learning_rate": 1.63e-05, + "loss": 0.2193, + "step": 490 + }, + { + "epoch": 0.2910360884749709, + "grad_norm": 2.2712197303771973, + "learning_rate": 1.6633333333333336e-05, + "loss": 0.1657, + "step": 500 + }, + { + "epoch": 0.2968568102444703, + "grad_norm": 3.355534791946411, + "learning_rate": 1.6966666666666668e-05, + "loss": 0.1849, + "step": 510 + }, + { + "epoch": 0.30267753201396974, + "grad_norm": 2.0032505989074707, + "learning_rate": 1.73e-05, + "loss": 0.1484, + "step": 520 + }, + { + "epoch": 0.3084982537834691, + "grad_norm": 3.4510762691497803, + "learning_rate": 1.7633333333333336e-05, + "loss": 0.1708, + "step": 530 + }, + { + "epoch": 0.31431897555296856, + "grad_norm": 2.4246864318847656, + "learning_rate": 1.796666666666667e-05, + "loss": 0.1778, + "step": 540 + }, + { + "epoch": 0.320139697322468, + "grad_norm": 1.810784935951233, + "learning_rate": 1.83e-05, + "loss": 0.1368, + "step": 550 + }, + { + "epoch": 0.3259604190919674, + "grad_norm": 1.8402965068817139, + "learning_rate": 1.8633333333333333e-05, + "loss": 0.1496, + "step": 560 + }, + { + "epoch": 0.3317811408614668, + "grad_norm": 3.0320515632629395, + "learning_rate": 1.896666666666667e-05, + "loss": 0.1609, + "step": 570 + }, + { + "epoch": 0.33760186263096625, + "grad_norm": 2.5287060737609863, + "learning_rate": 1.93e-05, + "loss": 0.1617, + "step": 580 + }, + { + "epoch": 0.34342258440046564, + "grad_norm": 2.77878475189209, + "learning_rate": 1.9633333333333334e-05, + "loss": 0.1278, + "step": 590 + }, + { + "epoch": 0.3492433061699651, + "grad_norm": 3.866158962249756, + "learning_rate": 1.9966666666666666e-05, + "loss": 0.1483, + "step": 600 + }, + { + "epoch": 0.3550640279394645, + "grad_norm": 1.5810424089431763, + "learning_rate": 2.0300000000000002e-05, + "loss": 0.1404, + "step": 610 + }, + { + "epoch": 0.3608847497089639, + "grad_norm": 2.5479066371917725, + "learning_rate": 2.0633333333333335e-05, + "loss": 0.1378, + "step": 620 + }, + { + "epoch": 0.36670547147846333, + "grad_norm": 1.824920415878296, + "learning_rate": 2.0966666666666667e-05, + "loss": 0.1235, + "step": 630 + }, + { + "epoch": 0.37252619324796277, + "grad_norm": 2.357595920562744, + "learning_rate": 2.13e-05, + "loss": 0.1089, + "step": 640 + }, + { + "epoch": 0.37834691501746215, + "grad_norm": 2.2098588943481445, + "learning_rate": 2.1633333333333332e-05, + "loss": 0.1137, + "step": 650 + }, + { + "epoch": 0.3841676367869616, + "grad_norm": 1.9903205633163452, + "learning_rate": 2.1966666666666668e-05, + "loss": 0.1128, + "step": 660 + }, + { + "epoch": 0.389988358556461, + "grad_norm": 1.922202467918396, + "learning_rate": 2.23e-05, + "loss": 0.0895, + "step": 670 + }, + { + "epoch": 0.3958090803259604, + "grad_norm": 2.437887191772461, + "learning_rate": 2.2633333333333336e-05, + "loss": 0.1182, + "step": 680 + }, + { + "epoch": 0.40162980209545984, + "grad_norm": 1.9287029504776, + "learning_rate": 2.2966666666666668e-05, + "loss": 0.0879, + "step": 690 + }, + { + "epoch": 0.4074505238649593, + "grad_norm": 2.0406229496002197, + "learning_rate": 2.3300000000000004e-05, + "loss": 0.1027, + "step": 700 + }, + { + "epoch": 0.41327124563445866, + "grad_norm": 2.2528634071350098, + "learning_rate": 2.3633333333333336e-05, + "loss": 0.1156, + "step": 710 + }, + { + "epoch": 0.4190919674039581, + "grad_norm": 1.4574944972991943, + "learning_rate": 2.396666666666667e-05, + "loss": 0.1043, + "step": 720 + }, + { + "epoch": 0.42491268917345754, + "grad_norm": 2.2627789974212646, + "learning_rate": 2.43e-05, + "loss": 0.1199, + "step": 730 + }, + { + "epoch": 0.4307334109429569, + "grad_norm": 2.3515450954437256, + "learning_rate": 2.4633333333333334e-05, + "loss": 0.1034, + "step": 740 + }, + { + "epoch": 0.43655413271245636, + "grad_norm": 1.3387889862060547, + "learning_rate": 2.496666666666667e-05, + "loss": 0.0986, + "step": 750 + }, + { + "epoch": 0.44237485448195574, + "grad_norm": 1.990921974182129, + "learning_rate": 2.5300000000000002e-05, + "loss": 0.0982, + "step": 760 + }, + { + "epoch": 0.4481955762514552, + "grad_norm": 1.5333963632583618, + "learning_rate": 2.5633333333333338e-05, + "loss": 0.0977, + "step": 770 + }, + { + "epoch": 0.4540162980209546, + "grad_norm": 1.7646198272705078, + "learning_rate": 2.5966666666666667e-05, + "loss": 0.0962, + "step": 780 + }, + { + "epoch": 0.459837019790454, + "grad_norm": 2.2315261363983154, + "learning_rate": 2.6300000000000002e-05, + "loss": 0.0829, + "step": 790 + }, + { + "epoch": 0.46565774155995343, + "grad_norm": 1.5080845355987549, + "learning_rate": 2.663333333333333e-05, + "loss": 0.0987, + "step": 800 + }, + { + "epoch": 0.47147846332945287, + "grad_norm": 1.0622602701187134, + "learning_rate": 2.6966666666666667e-05, + "loss": 0.0774, + "step": 810 + }, + { + "epoch": 0.47729918509895225, + "grad_norm": 1.2934136390686035, + "learning_rate": 2.7300000000000003e-05, + "loss": 0.1106, + "step": 820 + }, + { + "epoch": 0.4831199068684517, + "grad_norm": 1.2431435585021973, + "learning_rate": 2.7633333333333332e-05, + "loss": 0.0921, + "step": 830 + }, + { + "epoch": 0.4889406286379511, + "grad_norm": 2.0530009269714355, + "learning_rate": 2.7966666666666668e-05, + "loss": 0.082, + "step": 840 + }, + { + "epoch": 0.4947613504074505, + "grad_norm": 1.8657811880111694, + "learning_rate": 2.83e-05, + "loss": 0.0937, + "step": 850 + }, + { + "epoch": 0.5005820721769499, + "grad_norm": 1.7814955711364746, + "learning_rate": 2.8633333333333336e-05, + "loss": 0.108, + "step": 860 + }, + { + "epoch": 0.5064027939464494, + "grad_norm": 1.570999026298523, + "learning_rate": 2.8966666666666668e-05, + "loss": 0.0949, + "step": 870 + }, + { + "epoch": 0.5122235157159488, + "grad_norm": 2.0259008407592773, + "learning_rate": 2.93e-05, + "loss": 0.0953, + "step": 880 + }, + { + "epoch": 0.5180442374854481, + "grad_norm": 1.4935063123703003, + "learning_rate": 2.9633333333333336e-05, + "loss": 0.0961, + "step": 890 + }, + { + "epoch": 0.5238649592549476, + "grad_norm": 1.2004295587539673, + "learning_rate": 2.9966666666666672e-05, + "loss": 0.079, + "step": 900 + }, + { + "epoch": 0.529685681024447, + "grad_norm": 1.8651480674743652, + "learning_rate": 3.03e-05, + "loss": 0.0845, + "step": 910 + }, + { + "epoch": 0.5355064027939465, + "grad_norm": 1.207340955734253, + "learning_rate": 3.063333333333334e-05, + "loss": 0.0907, + "step": 920 + }, + { + "epoch": 0.5413271245634459, + "grad_norm": 1.615902304649353, + "learning_rate": 3.096666666666666e-05, + "loss": 0.1058, + "step": 930 + }, + { + "epoch": 0.5471478463329453, + "grad_norm": 1.5028736591339111, + "learning_rate": 3.13e-05, + "loss": 0.0685, + "step": 940 + }, + { + "epoch": 0.5529685681024447, + "grad_norm": 1.422624111175537, + "learning_rate": 3.1633333333333334e-05, + "loss": 0.0773, + "step": 950 + }, + { + "epoch": 0.5587892898719441, + "grad_norm": 2.051846742630005, + "learning_rate": 3.196666666666667e-05, + "loss": 0.109, + "step": 960 + }, + { + "epoch": 0.5646100116414435, + "grad_norm": 1.2181199789047241, + "learning_rate": 3.2300000000000006e-05, + "loss": 0.0766, + "step": 970 + }, + { + "epoch": 0.570430733410943, + "grad_norm": 1.0509109497070312, + "learning_rate": 3.263333333333333e-05, + "loss": 0.079, + "step": 980 + }, + { + "epoch": 0.5762514551804424, + "grad_norm": 1.3375624418258667, + "learning_rate": 3.296666666666667e-05, + "loss": 0.0851, + "step": 990 + }, + { + "epoch": 0.5820721769499418, + "grad_norm": 1.9029721021652222, + "learning_rate": 3.33e-05, + "loss": 0.0741, + "step": 1000 + }, + { + "epoch": 0.5878928987194412, + "grad_norm": 1.3018946647644043, + "learning_rate": 3.3633333333333335e-05, + "loss": 0.1036, + "step": 1010 + }, + { + "epoch": 0.5937136204889406, + "grad_norm": 1.2205839157104492, + "learning_rate": 3.396666666666667e-05, + "loss": 0.0766, + "step": 1020 + }, + { + "epoch": 0.59953434225844, + "grad_norm": 2.033945083618164, + "learning_rate": 3.430000000000001e-05, + "loss": 0.0813, + "step": 1030 + }, + { + "epoch": 0.6053550640279395, + "grad_norm": 0.9236090183258057, + "learning_rate": 3.463333333333333e-05, + "loss": 0.0724, + "step": 1040 + }, + { + "epoch": 0.6111757857974389, + "grad_norm": 0.849105715751648, + "learning_rate": 3.496666666666667e-05, + "loss": 0.0725, + "step": 1050 + }, + { + "epoch": 0.6169965075669382, + "grad_norm": 1.6253907680511475, + "learning_rate": 3.53e-05, + "loss": 0.0866, + "step": 1060 + }, + { + "epoch": 0.6228172293364377, + "grad_norm": 1.0370668172836304, + "learning_rate": 3.563333333333334e-05, + "loss": 0.0635, + "step": 1070 + }, + { + "epoch": 0.6286379511059371, + "grad_norm": 1.305660367012024, + "learning_rate": 3.596666666666667e-05, + "loss": 0.0804, + "step": 1080 + }, + { + "epoch": 0.6344586728754366, + "grad_norm": 1.8295872211456299, + "learning_rate": 3.63e-05, + "loss": 0.0809, + "step": 1090 + }, + { + "epoch": 0.640279394644936, + "grad_norm": 1.544111728668213, + "learning_rate": 3.6633333333333334e-05, + "loss": 0.0812, + "step": 1100 + }, + { + "epoch": 0.6461001164144354, + "grad_norm": 1.3157166242599487, + "learning_rate": 3.6966666666666666e-05, + "loss": 0.0724, + "step": 1110 + }, + { + "epoch": 0.6519208381839348, + "grad_norm": 1.2681264877319336, + "learning_rate": 3.73e-05, + "loss": 0.0627, + "step": 1120 + }, + { + "epoch": 0.6577415599534342, + "grad_norm": 0.8867344856262207, + "learning_rate": 3.763333333333334e-05, + "loss": 0.0724, + "step": 1130 + }, + { + "epoch": 0.6635622817229336, + "grad_norm": 1.297824501991272, + "learning_rate": 3.796666666666667e-05, + "loss": 0.0562, + "step": 1140 + }, + { + "epoch": 0.6693830034924331, + "grad_norm": 1.5536450147628784, + "learning_rate": 3.83e-05, + "loss": 0.0808, + "step": 1150 + }, + { + "epoch": 0.6752037252619325, + "grad_norm": 1.0738470554351807, + "learning_rate": 3.8633333333333335e-05, + "loss": 0.0657, + "step": 1160 + }, + { + "epoch": 0.681024447031432, + "grad_norm": 1.6649178266525269, + "learning_rate": 3.896666666666667e-05, + "loss": 0.0786, + "step": 1170 + }, + { + "epoch": 0.6868451688009313, + "grad_norm": 1.4848556518554688, + "learning_rate": 3.9300000000000007e-05, + "loss": 0.0793, + "step": 1180 + }, + { + "epoch": 0.6926658905704307, + "grad_norm": 0.9653380513191223, + "learning_rate": 3.963333333333333e-05, + "loss": 0.075, + "step": 1190 + }, + { + "epoch": 0.6984866123399301, + "grad_norm": 1.0167896747589111, + "learning_rate": 3.996666666666667e-05, + "loss": 0.0748, + "step": 1200 + }, + { + "epoch": 0.7043073341094296, + "grad_norm": 1.5390400886535645, + "learning_rate": 4.0300000000000004e-05, + "loss": 0.0968, + "step": 1210 + }, + { + "epoch": 0.710128055878929, + "grad_norm": 1.2293447256088257, + "learning_rate": 4.0633333333333336e-05, + "loss": 0.0767, + "step": 1220 + }, + { + "epoch": 0.7159487776484285, + "grad_norm": 1.1937015056610107, + "learning_rate": 4.096666666666667e-05, + "loss": 0.0659, + "step": 1230 + }, + { + "epoch": 0.7217694994179278, + "grad_norm": 1.0698705911636353, + "learning_rate": 4.13e-05, + "loss": 0.0775, + "step": 1240 + }, + { + "epoch": 0.7275902211874272, + "grad_norm": 1.0751047134399414, + "learning_rate": 4.1633333333333333e-05, + "loss": 0.062, + "step": 1250 + }, + { + "epoch": 0.7334109429569267, + "grad_norm": 1.2925069332122803, + "learning_rate": 4.196666666666667e-05, + "loss": 0.0577, + "step": 1260 + }, + { + "epoch": 0.7392316647264261, + "grad_norm": 1.017557978630066, + "learning_rate": 4.23e-05, + "loss": 0.0639, + "step": 1270 + }, + { + "epoch": 0.7450523864959255, + "grad_norm": 1.200992465019226, + "learning_rate": 4.263333333333334e-05, + "loss": 0.0703, + "step": 1280 + }, + { + "epoch": 0.7508731082654249, + "grad_norm": 0.8262253999710083, + "learning_rate": 4.296666666666666e-05, + "loss": 0.0667, + "step": 1290 + }, + { + "epoch": 0.7566938300349243, + "grad_norm": 1.3656136989593506, + "learning_rate": 4.33e-05, + "loss": 0.0703, + "step": 1300 + }, + { + "epoch": 0.7625145518044237, + "grad_norm": 1.2767976522445679, + "learning_rate": 4.3633333333333335e-05, + "loss": 0.0686, + "step": 1310 + }, + { + "epoch": 0.7683352735739232, + "grad_norm": 1.3134602308273315, + "learning_rate": 4.396666666666667e-05, + "loss": 0.0733, + "step": 1320 + }, + { + "epoch": 0.7741559953434226, + "grad_norm": 1.600818395614624, + "learning_rate": 4.43e-05, + "loss": 0.0774, + "step": 1330 + }, + { + "epoch": 0.779976717112922, + "grad_norm": 0.9601349234580994, + "learning_rate": 4.463333333333334e-05, + "loss": 0.0952, + "step": 1340 + }, + { + "epoch": 0.7857974388824214, + "grad_norm": 0.7436151504516602, + "learning_rate": 4.496666666666667e-05, + "loss": 0.0612, + "step": 1350 + }, + { + "epoch": 0.7916181606519208, + "grad_norm": 0.9638441801071167, + "learning_rate": 4.53e-05, + "loss": 0.0751, + "step": 1360 + }, + { + "epoch": 0.7974388824214202, + "grad_norm": 1.2402719259262085, + "learning_rate": 4.5633333333333336e-05, + "loss": 0.0666, + "step": 1370 + }, + { + "epoch": 0.8032596041909197, + "grad_norm": 1.071277379989624, + "learning_rate": 4.596666666666667e-05, + "loss": 0.0694, + "step": 1380 + }, + { + "epoch": 0.8090803259604191, + "grad_norm": 1.243671178817749, + "learning_rate": 4.630000000000001e-05, + "loss": 0.0812, + "step": 1390 + }, + { + "epoch": 0.8149010477299186, + "grad_norm": 1.2321245670318604, + "learning_rate": 4.663333333333333e-05, + "loss": 0.072, + "step": 1400 + }, + { + "epoch": 0.8207217694994179, + "grad_norm": 0.8125421404838562, + "learning_rate": 4.696666666666667e-05, + "loss": 0.0758, + "step": 1410 + }, + { + "epoch": 0.8265424912689173, + "grad_norm": 1.1880472898483276, + "learning_rate": 4.73e-05, + "loss": 0.0668, + "step": 1420 + }, + { + "epoch": 0.8323632130384168, + "grad_norm": 1.2899314165115356, + "learning_rate": 4.763333333333334e-05, + "loss": 0.0701, + "step": 1430 + }, + { + "epoch": 0.8381839348079162, + "grad_norm": 1.034522294998169, + "learning_rate": 4.796666666666667e-05, + "loss": 0.051, + "step": 1440 + }, + { + "epoch": 0.8440046565774156, + "grad_norm": 1.037085771560669, + "learning_rate": 4.83e-05, + "loss": 0.0713, + "step": 1450 + }, + { + "epoch": 0.8498253783469151, + "grad_norm": 0.9453491568565369, + "learning_rate": 4.8633333333333334e-05, + "loss": 0.0599, + "step": 1460 + }, + { + "epoch": 0.8556461001164144, + "grad_norm": 0.7867245078086853, + "learning_rate": 4.8966666666666667e-05, + "loss": 0.0472, + "step": 1470 + }, + { + "epoch": 0.8614668218859138, + "grad_norm": 0.9109075665473938, + "learning_rate": 4.93e-05, + "loss": 0.0582, + "step": 1480 + }, + { + "epoch": 0.8672875436554133, + "grad_norm": 0.9364614486694336, + "learning_rate": 4.963333333333334e-05, + "loss": 0.0622, + "step": 1490 + }, + { + "epoch": 0.8731082654249127, + "grad_norm": 0.9675806760787964, + "learning_rate": 4.996666666666667e-05, + "loss": 0.0721, + "step": 1500 + }, + { + "epoch": 0.8789289871944121, + "grad_norm": 0.9428089261054993, + "learning_rate": 5.03e-05, + "loss": 0.0549, + "step": 1510 + }, + { + "epoch": 0.8847497089639115, + "grad_norm": 0.7811077237129211, + "learning_rate": 5.0633333333333335e-05, + "loss": 0.062, + "step": 1520 + }, + { + "epoch": 0.8905704307334109, + "grad_norm": 0.8632674217224121, + "learning_rate": 5.0966666666666674e-05, + "loss": 0.0571, + "step": 1530 + }, + { + "epoch": 0.8963911525029103, + "grad_norm": 1.0392459630966187, + "learning_rate": 5.130000000000001e-05, + "loss": 0.0612, + "step": 1540 + }, + { + "epoch": 0.9022118742724098, + "grad_norm": 0.6784144043922424, + "learning_rate": 5.163333333333333e-05, + "loss": 0.0753, + "step": 1550 + }, + { + "epoch": 0.9080325960419092, + "grad_norm": 1.0237780809402466, + "learning_rate": 5.196666666666667e-05, + "loss": 0.0516, + "step": 1560 + }, + { + "epoch": 0.9138533178114087, + "grad_norm": 0.8753323554992676, + "learning_rate": 5.2300000000000004e-05, + "loss": 0.0551, + "step": 1570 + }, + { + "epoch": 0.919674039580908, + "grad_norm": 0.6855782866477966, + "learning_rate": 5.2633333333333336e-05, + "loss": 0.0586, + "step": 1580 + }, + { + "epoch": 0.9254947613504074, + "grad_norm": 1.0390018224716187, + "learning_rate": 5.296666666666666e-05, + "loss": 0.0576, + "step": 1590 + }, + { + "epoch": 0.9313154831199069, + "grad_norm": 1.214719533920288, + "learning_rate": 5.330000000000001e-05, + "loss": 0.0662, + "step": 1600 + }, + { + "epoch": 0.9371362048894063, + "grad_norm": 1.473379135131836, + "learning_rate": 5.3633333333333334e-05, + "loss": 0.0741, + "step": 1610 + }, + { + "epoch": 0.9429569266589057, + "grad_norm": 0.9103056192398071, + "learning_rate": 5.3966666666666666e-05, + "loss": 0.066, + "step": 1620 + }, + { + "epoch": 0.9487776484284052, + "grad_norm": 1.1360570192337036, + "learning_rate": 5.4300000000000005e-05, + "loss": 0.0709, + "step": 1630 + }, + { + "epoch": 0.9545983701979045, + "grad_norm": 0.91033536195755, + "learning_rate": 5.463333333333334e-05, + "loss": 0.0602, + "step": 1640 + }, + { + "epoch": 0.9604190919674039, + "grad_norm": 0.9203469753265381, + "learning_rate": 5.496666666666666e-05, + "loss": 0.053, + "step": 1650 + }, + { + "epoch": 0.9662398137369034, + "grad_norm": 0.9352394342422485, + "learning_rate": 5.530000000000001e-05, + "loss": 0.0721, + "step": 1660 + }, + { + "epoch": 0.9720605355064028, + "grad_norm": 0.823686957359314, + "learning_rate": 5.5633333333333335e-05, + "loss": 0.0587, + "step": 1670 + }, + { + "epoch": 0.9778812572759022, + "grad_norm": 0.7705183625221252, + "learning_rate": 5.596666666666667e-05, + "loss": 0.0569, + "step": 1680 + }, + { + "epoch": 0.9837019790454016, + "grad_norm": 1.1214704513549805, + "learning_rate": 5.63e-05, + "loss": 0.0831, + "step": 1690 + }, + { + "epoch": 0.989522700814901, + "grad_norm": 0.7889619469642639, + "learning_rate": 5.663333333333334e-05, + "loss": 0.0613, + "step": 1700 + }, + { + "epoch": 0.9953434225844005, + "grad_norm": 0.9206379652023315, + "learning_rate": 5.696666666666667e-05, + "loss": 0.0573, + "step": 1710 + }, + { + "epoch": 1.0011641443538999, + "grad_norm": 1.31033194065094, + "learning_rate": 5.73e-05, + "loss": 0.07, + "step": 1720 + }, + { + "epoch": 1.0069848661233993, + "grad_norm": 0.7337334752082825, + "learning_rate": 5.7633333333333336e-05, + "loss": 0.0513, + "step": 1730 + }, + { + "epoch": 1.0128055878928988, + "grad_norm": 0.3675587475299835, + "learning_rate": 5.796666666666667e-05, + "loss": 0.0632, + "step": 1740 + }, + { + "epoch": 1.0186263096623982, + "grad_norm": 0.6040637493133545, + "learning_rate": 5.83e-05, + "loss": 0.0518, + "step": 1750 + }, + { + "epoch": 1.0244470314318976, + "grad_norm": 0.7541618347167969, + "learning_rate": 5.863333333333334e-05, + "loss": 0.0524, + "step": 1760 + }, + { + "epoch": 1.030267753201397, + "grad_norm": 0.6439351439476013, + "learning_rate": 5.896666666666667e-05, + "loss": 0.0582, + "step": 1770 + }, + { + "epoch": 1.0360884749708963, + "grad_norm": 0.7351394891738892, + "learning_rate": 5.93e-05, + "loss": 0.0479, + "step": 1780 + }, + { + "epoch": 1.0419091967403957, + "grad_norm": 0.5620404481887817, + "learning_rate": 5.9633333333333344e-05, + "loss": 0.0591, + "step": 1790 + }, + { + "epoch": 1.0477299185098952, + "grad_norm": 0.943565309047699, + "learning_rate": 5.996666666666667e-05, + "loss": 0.0561, + "step": 1800 + }, + { + "epoch": 1.0535506402793946, + "grad_norm": 0.7097998261451721, + "learning_rate": 6.03e-05, + "loss": 0.0519, + "step": 1810 + }, + { + "epoch": 1.059371362048894, + "grad_norm": 0.794703483581543, + "learning_rate": 6.063333333333333e-05, + "loss": 0.0451, + "step": 1820 + }, + { + "epoch": 1.0651920838183935, + "grad_norm": 1.0672003030776978, + "learning_rate": 6.0966666666666674e-05, + "loss": 0.0534, + "step": 1830 + }, + { + "epoch": 1.071012805587893, + "grad_norm": 0.8843370676040649, + "learning_rate": 6.13e-05, + "loss": 0.0588, + "step": 1840 + }, + { + "epoch": 1.0768335273573924, + "grad_norm": 0.6974763870239258, + "learning_rate": 6.163333333333333e-05, + "loss": 0.0551, + "step": 1850 + }, + { + "epoch": 1.0826542491268918, + "grad_norm": 0.6385523080825806, + "learning_rate": 6.196666666666668e-05, + "loss": 0.0578, + "step": 1860 + }, + { + "epoch": 1.0884749708963912, + "grad_norm": 0.46418532729148865, + "learning_rate": 6.23e-05, + "loss": 0.0465, + "step": 1870 + }, + { + "epoch": 1.0942956926658907, + "grad_norm": 0.6161043047904968, + "learning_rate": 6.263333333333333e-05, + "loss": 0.0538, + "step": 1880 + }, + { + "epoch": 1.1001164144353899, + "grad_norm": 0.6387382745742798, + "learning_rate": 6.296666666666667e-05, + "loss": 0.0606, + "step": 1890 + }, + { + "epoch": 1.1059371362048893, + "grad_norm": 0.5421847701072693, + "learning_rate": 6.330000000000001e-05, + "loss": 0.0628, + "step": 1900 + }, + { + "epoch": 1.1117578579743888, + "grad_norm": 0.6349193453788757, + "learning_rate": 6.363333333333334e-05, + "loss": 0.0492, + "step": 1910 + }, + { + "epoch": 1.1175785797438882, + "grad_norm": 0.6271241307258606, + "learning_rate": 6.396666666666667e-05, + "loss": 0.0441, + "step": 1920 + }, + { + "epoch": 1.1233993015133876, + "grad_norm": 0.6843981146812439, + "learning_rate": 6.43e-05, + "loss": 0.0538, + "step": 1930 + }, + { + "epoch": 1.129220023282887, + "grad_norm": 0.7008379101753235, + "learning_rate": 6.463333333333334e-05, + "loss": 0.0477, + "step": 1940 + }, + { + "epoch": 1.1350407450523865, + "grad_norm": 1.0106284618377686, + "learning_rate": 6.496666666666667e-05, + "loss": 0.057, + "step": 1950 + }, + { + "epoch": 1.140861466821886, + "grad_norm": 0.8347068428993225, + "learning_rate": 6.53e-05, + "loss": 0.0527, + "step": 1960 + }, + { + "epoch": 1.1466821885913854, + "grad_norm": 0.44086241722106934, + "learning_rate": 6.563333333333333e-05, + "loss": 0.0601, + "step": 1970 + }, + { + "epoch": 1.1525029103608848, + "grad_norm": 0.6656976938247681, + "learning_rate": 6.596666666666667e-05, + "loss": 0.0628, + "step": 1980 + }, + { + "epoch": 1.1583236321303843, + "grad_norm": 0.700560450553894, + "learning_rate": 6.630000000000001e-05, + "loss": 0.0487, + "step": 1990 + }, + { + "epoch": 1.1641443538998835, + "grad_norm": 0.6834568977355957, + "learning_rate": 6.663333333333333e-05, + "loss": 0.0561, + "step": 2000 + }, + { + "epoch": 1.1699650756693831, + "grad_norm": 0.49468666315078735, + "learning_rate": 6.696666666666666e-05, + "loss": 0.0497, + "step": 2010 + }, + { + "epoch": 1.1757857974388823, + "grad_norm": 0.5188592672348022, + "learning_rate": 6.730000000000001e-05, + "loss": 0.0492, + "step": 2020 + }, + { + "epoch": 1.1816065192083818, + "grad_norm": 0.7852494716644287, + "learning_rate": 6.763333333333334e-05, + "loss": 0.0524, + "step": 2030 + }, + { + "epoch": 1.1874272409778812, + "grad_norm": 0.7215738296508789, + "learning_rate": 6.796666666666666e-05, + "loss": 0.0495, + "step": 2040 + }, + { + "epoch": 1.1932479627473807, + "grad_norm": 0.8026352524757385, + "learning_rate": 6.83e-05, + "loss": 0.0474, + "step": 2050 + }, + { + "epoch": 1.19906868451688, + "grad_norm": 0.643627405166626, + "learning_rate": 6.863333333333334e-05, + "loss": 0.0573, + "step": 2060 + }, + { + "epoch": 1.2048894062863795, + "grad_norm": 0.4118272364139557, + "learning_rate": 6.896666666666667e-05, + "loss": 0.0549, + "step": 2070 + }, + { + "epoch": 1.210710128055879, + "grad_norm": 0.6615014672279358, + "learning_rate": 6.93e-05, + "loss": 0.0532, + "step": 2080 + }, + { + "epoch": 1.2165308498253784, + "grad_norm": 0.7348880767822266, + "learning_rate": 6.963333333333334e-05, + "loss": 0.0455, + "step": 2090 + }, + { + "epoch": 1.2223515715948778, + "grad_norm": 0.8961721062660217, + "learning_rate": 6.996666666666667e-05, + "loss": 0.0474, + "step": 2100 + }, + { + "epoch": 1.2281722933643773, + "grad_norm": 0.6734530925750732, + "learning_rate": 7.03e-05, + "loss": 0.0544, + "step": 2110 + }, + { + "epoch": 1.2339930151338767, + "grad_norm": 0.9427734017372131, + "learning_rate": 7.063333333333333e-05, + "loss": 0.0659, + "step": 2120 + }, + { + "epoch": 1.239813736903376, + "grad_norm": 0.5700082778930664, + "learning_rate": 7.096666666666667e-05, + "loss": 0.0622, + "step": 2130 + }, + { + "epoch": 1.2456344586728754, + "grad_norm": 0.4767299294471741, + "learning_rate": 7.13e-05, + "loss": 0.0473, + "step": 2140 + }, + { + "epoch": 1.2514551804423748, + "grad_norm": 0.6068447828292847, + "learning_rate": 7.163333333333334e-05, + "loss": 0.0464, + "step": 2150 + }, + { + "epoch": 1.2572759022118742, + "grad_norm": 0.661034107208252, + "learning_rate": 7.196666666666668e-05, + "loss": 0.0606, + "step": 2160 + }, + { + "epoch": 1.2630966239813737, + "grad_norm": 0.7720133662223816, + "learning_rate": 7.23e-05, + "loss": 0.0456, + "step": 2170 + }, + { + "epoch": 1.2689173457508731, + "grad_norm": 0.39131397008895874, + "learning_rate": 7.263333333333334e-05, + "loss": 0.0434, + "step": 2180 + }, + { + "epoch": 1.2747380675203726, + "grad_norm": 0.5375649333000183, + "learning_rate": 7.296666666666667e-05, + "loss": 0.0513, + "step": 2190 + }, + { + "epoch": 1.280558789289872, + "grad_norm": 0.576289176940918, + "learning_rate": 7.33e-05, + "loss": 0.0485, + "step": 2200 + }, + { + "epoch": 1.2863795110593714, + "grad_norm": 0.9382694363594055, + "learning_rate": 7.363333333333334e-05, + "loss": 0.0571, + "step": 2210 + }, + { + "epoch": 1.2922002328288706, + "grad_norm": 0.8007000684738159, + "learning_rate": 7.396666666666667e-05, + "loss": 0.0544, + "step": 2220 + }, + { + "epoch": 1.2980209545983703, + "grad_norm": 0.8196623921394348, + "learning_rate": 7.43e-05, + "loss": 0.061, + "step": 2230 + }, + { + "epoch": 1.3038416763678695, + "grad_norm": 0.6569730043411255, + "learning_rate": 7.463333333333334e-05, + "loss": 0.0519, + "step": 2240 + }, + { + "epoch": 1.309662398137369, + "grad_norm": 0.5986012816429138, + "learning_rate": 7.496666666666667e-05, + "loss": 0.0631, + "step": 2250 + }, + { + "epoch": 1.3154831199068684, + "grad_norm": 0.4098905622959137, + "learning_rate": 7.53e-05, + "loss": 0.0703, + "step": 2260 + }, + { + "epoch": 1.3213038416763678, + "grad_norm": 0.5381914973258972, + "learning_rate": 7.563333333333333e-05, + "loss": 0.0587, + "step": 2270 + }, + { + "epoch": 1.3271245634458673, + "grad_norm": 0.703535795211792, + "learning_rate": 7.596666666666668e-05, + "loss": 0.0585, + "step": 2280 + }, + { + "epoch": 1.3329452852153667, + "grad_norm": 0.5545676946640015, + "learning_rate": 7.630000000000001e-05, + "loss": 0.0585, + "step": 2290 + }, + { + "epoch": 1.3387660069848661, + "grad_norm": 0.6178395748138428, + "learning_rate": 7.663333333333333e-05, + "loss": 0.0526, + "step": 2300 + }, + { + "epoch": 1.3445867287543656, + "grad_norm": 0.6851446628570557, + "learning_rate": 7.696666666666668e-05, + "loss": 0.0496, + "step": 2310 + }, + { + "epoch": 1.350407450523865, + "grad_norm": 0.5430983304977417, + "learning_rate": 7.730000000000001e-05, + "loss": 0.058, + "step": 2320 + }, + { + "epoch": 1.3562281722933645, + "grad_norm": 0.6637172698974609, + "learning_rate": 7.763333333333334e-05, + "loss": 0.0533, + "step": 2330 + }, + { + "epoch": 1.362048894062864, + "grad_norm": 0.5572364926338196, + "learning_rate": 7.796666666666666e-05, + "loss": 0.0506, + "step": 2340 + }, + { + "epoch": 1.367869615832363, + "grad_norm": 0.8173773288726807, + "learning_rate": 7.83e-05, + "loss": 0.0527, + "step": 2350 + }, + { + "epoch": 1.3736903376018628, + "grad_norm": 0.8470041155815125, + "learning_rate": 7.863333333333334e-05, + "loss": 0.0488, + "step": 2360 + }, + { + "epoch": 1.379511059371362, + "grad_norm": 0.7618017792701721, + "learning_rate": 7.896666666666667e-05, + "loss": 0.054, + "step": 2370 + }, + { + "epoch": 1.3853317811408614, + "grad_norm": 0.7008715867996216, + "learning_rate": 7.93e-05, + "loss": 0.0483, + "step": 2380 + }, + { + "epoch": 1.3911525029103609, + "grad_norm": 0.6306919455528259, + "learning_rate": 7.963333333333334e-05, + "loss": 0.0631, + "step": 2390 + }, + { + "epoch": 1.3969732246798603, + "grad_norm": 0.6492921710014343, + "learning_rate": 7.996666666666667e-05, + "loss": 0.0512, + "step": 2400 + }, + { + "epoch": 1.4027939464493597, + "grad_norm": 0.6941025257110596, + "learning_rate": 8.030000000000001e-05, + "loss": 0.0554, + "step": 2410 + }, + { + "epoch": 1.4086146682188592, + "grad_norm": 0.4640394151210785, + "learning_rate": 8.063333333333333e-05, + "loss": 0.0538, + "step": 2420 + }, + { + "epoch": 1.4144353899883586, + "grad_norm": 0.6097138524055481, + "learning_rate": 8.096666666666667e-05, + "loss": 0.0478, + "step": 2430 + }, + { + "epoch": 1.420256111757858, + "grad_norm": 0.5762205123901367, + "learning_rate": 8.13e-05, + "loss": 0.0527, + "step": 2440 + }, + { + "epoch": 1.4260768335273575, + "grad_norm": 0.7153871655464172, + "learning_rate": 8.163333333333334e-05, + "loss": 0.0538, + "step": 2450 + }, + { + "epoch": 1.4318975552968567, + "grad_norm": 0.7628239989280701, + "learning_rate": 8.196666666666668e-05, + "loss": 0.0574, + "step": 2460 + }, + { + "epoch": 1.4377182770663564, + "grad_norm": 0.6316484808921814, + "learning_rate": 8.23e-05, + "loss": 0.0445, + "step": 2470 + }, + { + "epoch": 1.4435389988358556, + "grad_norm": 0.7094146609306335, + "learning_rate": 8.263333333333334e-05, + "loss": 0.052, + "step": 2480 + }, + { + "epoch": 1.449359720605355, + "grad_norm": 0.9890119433403015, + "learning_rate": 8.296666666666667e-05, + "loss": 0.0613, + "step": 2490 + }, + { + "epoch": 1.4551804423748544, + "grad_norm": 0.7253393530845642, + "learning_rate": 8.33e-05, + "loss": 0.0545, + "step": 2500 + }, + { + "epoch": 1.4610011641443539, + "grad_norm": 0.7713330388069153, + "learning_rate": 8.363333333333334e-05, + "loss": 0.0543, + "step": 2510 + }, + { + "epoch": 1.4668218859138533, + "grad_norm": 0.5354117155075073, + "learning_rate": 8.396666666666667e-05, + "loss": 0.0437, + "step": 2520 + }, + { + "epoch": 1.4726426076833528, + "grad_norm": 0.4532734453678131, + "learning_rate": 8.43e-05, + "loss": 0.048, + "step": 2530 + }, + { + "epoch": 1.4784633294528522, + "grad_norm": 0.5203303694725037, + "learning_rate": 8.463333333333335e-05, + "loss": 0.0688, + "step": 2540 + }, + { + "epoch": 1.4842840512223516, + "grad_norm": 0.6412729620933533, + "learning_rate": 8.496666666666667e-05, + "loss": 0.0475, + "step": 2550 + }, + { + "epoch": 1.490104772991851, + "grad_norm": 0.6195562481880188, + "learning_rate": 8.53e-05, + "loss": 0.0619, + "step": 2560 + }, + { + "epoch": 1.4959254947613503, + "grad_norm": 0.5293517708778381, + "learning_rate": 8.563333333333333e-05, + "loss": 0.0587, + "step": 2570 + }, + { + "epoch": 1.50174621653085, + "grad_norm": 0.4708130657672882, + "learning_rate": 8.596666666666668e-05, + "loss": 0.0586, + "step": 2580 + }, + { + "epoch": 1.5075669383003492, + "grad_norm": 0.5902058482170105, + "learning_rate": 8.63e-05, + "loss": 0.0532, + "step": 2590 + }, + { + "epoch": 1.5133876600698488, + "grad_norm": 0.5766910910606384, + "learning_rate": 8.663333333333333e-05, + "loss": 0.0641, + "step": 2600 + }, + { + "epoch": 1.519208381839348, + "grad_norm": 0.6033872365951538, + "learning_rate": 8.696666666666668e-05, + "loss": 0.0587, + "step": 2610 + }, + { + "epoch": 1.5250291036088475, + "grad_norm": 0.7018511295318604, + "learning_rate": 8.730000000000001e-05, + "loss": 0.0469, + "step": 2620 + }, + { + "epoch": 1.530849825378347, + "grad_norm": 0.49219250679016113, + "learning_rate": 8.763333333333334e-05, + "loss": 0.0528, + "step": 2630 + }, + { + "epoch": 1.5366705471478463, + "grad_norm": 0.6524537205696106, + "learning_rate": 8.796666666666667e-05, + "loss": 0.0505, + "step": 2640 + }, + { + "epoch": 1.5424912689173458, + "grad_norm": 0.3836422562599182, + "learning_rate": 8.83e-05, + "loss": 0.0443, + "step": 2650 + }, + { + "epoch": 1.5483119906868452, + "grad_norm": 0.6034940481185913, + "learning_rate": 8.863333333333334e-05, + "loss": 0.0445, + "step": 2660 + }, + { + "epoch": 1.5541327124563447, + "grad_norm": 0.6652889847755432, + "learning_rate": 8.896666666666667e-05, + "loss": 0.0529, + "step": 2670 + }, + { + "epoch": 1.5599534342258439, + "grad_norm": 0.6977400183677673, + "learning_rate": 8.93e-05, + "loss": 0.0418, + "step": 2680 + }, + { + "epoch": 1.5657741559953435, + "grad_norm": 0.44700467586517334, + "learning_rate": 8.963333333333333e-05, + "loss": 0.0413, + "step": 2690 + }, + { + "epoch": 1.5715948777648427, + "grad_norm": 0.44100069999694824, + "learning_rate": 8.996666666666667e-05, + "loss": 0.0454, + "step": 2700 + }, + { + "epoch": 1.5774155995343424, + "grad_norm": 0.6039486527442932, + "learning_rate": 9.030000000000001e-05, + "loss": 0.0397, + "step": 2710 + }, + { + "epoch": 1.5832363213038416, + "grad_norm": 0.548845648765564, + "learning_rate": 9.063333333333333e-05, + "loss": 0.0466, + "step": 2720 + }, + { + "epoch": 1.589057043073341, + "grad_norm": 0.726391077041626, + "learning_rate": 9.096666666666666e-05, + "loss": 0.0347, + "step": 2730 + }, + { + "epoch": 1.5948777648428405, + "grad_norm": 0.5363685488700867, + "learning_rate": 9.130000000000001e-05, + "loss": 0.0393, + "step": 2740 + }, + { + "epoch": 1.60069848661234, + "grad_norm": 0.7888290286064148, + "learning_rate": 9.163333333333334e-05, + "loss": 0.0445, + "step": 2750 + }, + { + "epoch": 1.6065192083818394, + "grad_norm": 0.7007513642311096, + "learning_rate": 9.196666666666666e-05, + "loss": 0.0462, + "step": 2760 + }, + { + "epoch": 1.6123399301513388, + "grad_norm": 0.5745195150375366, + "learning_rate": 9.230000000000001e-05, + "loss": 0.0432, + "step": 2770 + }, + { + "epoch": 1.6181606519208382, + "grad_norm": 0.6411996483802795, + "learning_rate": 9.263333333333334e-05, + "loss": 0.0415, + "step": 2780 + }, + { + "epoch": 1.6239813736903375, + "grad_norm": 0.7109386920928955, + "learning_rate": 9.296666666666667e-05, + "loss": 0.0583, + "step": 2790 + }, + { + "epoch": 1.6298020954598371, + "grad_norm": 0.5170132517814636, + "learning_rate": 9.33e-05, + "loss": 0.0503, + "step": 2800 + }, + { + "epoch": 1.6356228172293363, + "grad_norm": 0.40340667963027954, + "learning_rate": 9.363333333333334e-05, + "loss": 0.0448, + "step": 2810 + }, + { + "epoch": 1.641443538998836, + "grad_norm": 0.7362358570098877, + "learning_rate": 9.396666666666667e-05, + "loss": 0.0416, + "step": 2820 + }, + { + "epoch": 1.6472642607683352, + "grad_norm": 0.6744977831840515, + "learning_rate": 9.43e-05, + "loss": 0.0551, + "step": 2830 + }, + { + "epoch": 1.6530849825378346, + "grad_norm": 0.4630856513977051, + "learning_rate": 9.463333333333333e-05, + "loss": 0.0499, + "step": 2840 + }, + { + "epoch": 1.658905704307334, + "grad_norm": 0.29350531101226807, + "learning_rate": 9.496666666666667e-05, + "loss": 0.0428, + "step": 2850 + }, + { + "epoch": 1.6647264260768335, + "grad_norm": 0.6002875566482544, + "learning_rate": 9.53e-05, + "loss": 0.0439, + "step": 2860 + }, + { + "epoch": 1.670547147846333, + "grad_norm": 0.585309624671936, + "learning_rate": 9.563333333333334e-05, + "loss": 0.0483, + "step": 2870 + }, + { + "epoch": 1.6763678696158324, + "grad_norm": 0.6369213461875916, + "learning_rate": 9.596666666666668e-05, + "loss": 0.0475, + "step": 2880 + }, + { + "epoch": 1.6821885913853318, + "grad_norm": 0.49775993824005127, + "learning_rate": 9.63e-05, + "loss": 0.0511, + "step": 2890 + }, + { + "epoch": 1.688009313154831, + "grad_norm": 0.5923731327056885, + "learning_rate": 9.663333333333334e-05, + "loss": 0.0523, + "step": 2900 + }, + { + "epoch": 1.6938300349243307, + "grad_norm": 0.47141608595848083, + "learning_rate": 9.696666666666667e-05, + "loss": 0.0428, + "step": 2910 + }, + { + "epoch": 1.69965075669383, + "grad_norm": 0.42312103509902954, + "learning_rate": 9.730000000000001e-05, + "loss": 0.0435, + "step": 2920 + }, + { + "epoch": 1.7054714784633296, + "grad_norm": 0.44771796464920044, + "learning_rate": 9.763333333333334e-05, + "loss": 0.0445, + "step": 2930 + }, + { + "epoch": 1.7112922002328288, + "grad_norm": 0.35708752274513245, + "learning_rate": 9.796666666666667e-05, + "loss": 0.0514, + "step": 2940 + }, + { + "epoch": 1.7171129220023282, + "grad_norm": 0.5295321345329285, + "learning_rate": 9.83e-05, + "loss": 0.0595, + "step": 2950 + }, + { + "epoch": 1.7229336437718277, + "grad_norm": 0.6439855694770813, + "learning_rate": 9.863333333333334e-05, + "loss": 0.0507, + "step": 2960 + }, + { + "epoch": 1.728754365541327, + "grad_norm": 0.635633111000061, + "learning_rate": 9.896666666666667e-05, + "loss": 0.0468, + "step": 2970 + }, + { + "epoch": 1.7345750873108265, + "grad_norm": 0.5558052062988281, + "learning_rate": 9.93e-05, + "loss": 0.0394, + "step": 2980 + }, + { + "epoch": 1.740395809080326, + "grad_norm": 0.590999960899353, + "learning_rate": 9.963333333333333e-05, + "loss": 0.0493, + "step": 2990 + }, + { + "epoch": 1.7462165308498254, + "grad_norm": 0.5690597891807556, + "learning_rate": 9.996666666666668e-05, + "loss": 0.062, + "step": 3000 + }, + { + "epoch": 1.7520372526193246, + "grad_norm": 0.6431434154510498, + "learning_rate": 9.999999384858465e-05, + "loss": 0.0446, + "step": 3010 + }, + { + "epoch": 1.7578579743888243, + "grad_norm": 0.6208341717720032, + "learning_rate": 9.999997258443473e-05, + "loss": 0.0437, + "step": 3020 + }, + { + "epoch": 1.7636786961583235, + "grad_norm": 0.4363138675689697, + "learning_rate": 9.999993613161331e-05, + "loss": 0.0461, + "step": 3030 + }, + { + "epoch": 1.7694994179278232, + "grad_norm": 0.6112787127494812, + "learning_rate": 9.999988449013146e-05, + "loss": 0.0483, + "step": 3040 + }, + { + "epoch": 1.7753201396973224, + "grad_norm": 0.5666700601577759, + "learning_rate": 9.99998176600049e-05, + "loss": 0.0535, + "step": 3050 + }, + { + "epoch": 1.781140861466822, + "grad_norm": 0.4413955509662628, + "learning_rate": 9.999973564125389e-05, + "loss": 0.0402, + "step": 3060 + }, + { + "epoch": 1.7869615832363213, + "grad_norm": 0.5139228105545044, + "learning_rate": 9.999963843390335e-05, + "loss": 0.0428, + "step": 3070 + }, + { + "epoch": 1.7927823050058207, + "grad_norm": 0.5856751203536987, + "learning_rate": 9.999952603798282e-05, + "loss": 0.0662, + "step": 3080 + }, + { + "epoch": 1.7986030267753201, + "grad_norm": 0.5467512607574463, + "learning_rate": 9.999939845352646e-05, + "loss": 0.0486, + "step": 3090 + }, + { + "epoch": 1.8044237485448196, + "grad_norm": 0.35387200117111206, + "learning_rate": 9.999925568057298e-05, + "loss": 0.0525, + "step": 3100 + }, + { + "epoch": 1.810244470314319, + "grad_norm": 0.3916459381580353, + "learning_rate": 9.999909771916578e-05, + "loss": 0.0528, + "step": 3110 + }, + { + "epoch": 1.8160651920838184, + "grad_norm": 0.6513524651527405, + "learning_rate": 9.999892456935285e-05, + "loss": 0.0503, + "step": 3120 + }, + { + "epoch": 1.8218859138533179, + "grad_norm": 0.6384074687957764, + "learning_rate": 9.999873623118679e-05, + "loss": 0.0478, + "step": 3130 + }, + { + "epoch": 1.827706635622817, + "grad_norm": 0.6200078725814819, + "learning_rate": 9.999853270472479e-05, + "loss": 0.0483, + "step": 3140 + }, + { + "epoch": 1.8335273573923168, + "grad_norm": 0.5584779977798462, + "learning_rate": 9.999831399002871e-05, + "loss": 0.0481, + "step": 3150 + }, + { + "epoch": 1.839348079161816, + "grad_norm": 0.5830670595169067, + "learning_rate": 9.999808008716494e-05, + "loss": 0.0452, + "step": 3160 + }, + { + "epoch": 1.8451688009313156, + "grad_norm": 0.48997604846954346, + "learning_rate": 9.999783099620459e-05, + "loss": 0.0555, + "step": 3170 + }, + { + "epoch": 1.8509895227008148, + "grad_norm": 0.47353824973106384, + "learning_rate": 9.999756671722328e-05, + "loss": 0.0551, + "step": 3180 + }, + { + "epoch": 1.8568102444703143, + "grad_norm": 0.5996155738830566, + "learning_rate": 9.99972872503013e-05, + "loss": 0.0472, + "step": 3190 + }, + { + "epoch": 1.8626309662398137, + "grad_norm": 0.5755963325500488, + "learning_rate": 9.999699259552359e-05, + "loss": 0.0607, + "step": 3200 + }, + { + "epoch": 1.8684516880093132, + "grad_norm": 0.6970465183258057, + "learning_rate": 9.99966827529796e-05, + "loss": 0.0484, + "step": 3210 + }, + { + "epoch": 1.8742724097788126, + "grad_norm": 0.4643675684928894, + "learning_rate": 9.999635772276348e-05, + "loss": 0.0533, + "step": 3220 + }, + { + "epoch": 1.880093131548312, + "grad_norm": 0.6132497787475586, + "learning_rate": 9.999601750497396e-05, + "loss": 0.0499, + "step": 3230 + }, + { + "epoch": 1.8859138533178115, + "grad_norm": 0.453950434923172, + "learning_rate": 9.99956620997144e-05, + "loss": 0.0441, + "step": 3240 + }, + { + "epoch": 1.8917345750873107, + "grad_norm": 0.4298945665359497, + "learning_rate": 9.999529150709275e-05, + "loss": 0.0494, + "step": 3250 + }, + { + "epoch": 1.8975552968568103, + "grad_norm": 0.5931066274642944, + "learning_rate": 9.999490572722158e-05, + "loss": 0.0482, + "step": 3260 + }, + { + "epoch": 1.9033760186263096, + "grad_norm": 0.6276513338088989, + "learning_rate": 9.99945047602181e-05, + "loss": 0.0552, + "step": 3270 + }, + { + "epoch": 1.9091967403958092, + "grad_norm": 0.5670948028564453, + "learning_rate": 9.99940886062041e-05, + "loss": 0.0515, + "step": 3280 + }, + { + "epoch": 1.9150174621653084, + "grad_norm": 0.4934888482093811, + "learning_rate": 9.999365726530599e-05, + "loss": 0.0511, + "step": 3290 + }, + { + "epoch": 1.9208381839348079, + "grad_norm": 0.7306284308433533, + "learning_rate": 9.999321073765481e-05, + "loss": 0.0592, + "step": 3300 + }, + { + "epoch": 1.9266589057043073, + "grad_norm": 0.42045265436172485, + "learning_rate": 9.99927490233862e-05, + "loss": 0.0357, + "step": 3310 + }, + { + "epoch": 1.9324796274738067, + "grad_norm": 0.394064337015152, + "learning_rate": 9.999227212264043e-05, + "loss": 0.0387, + "step": 3320 + }, + { + "epoch": 1.9383003492433062, + "grad_norm": 0.602567195892334, + "learning_rate": 9.999178003556236e-05, + "loss": 0.0502, + "step": 3330 + }, + { + "epoch": 1.9441210710128056, + "grad_norm": 0.39112797379493713, + "learning_rate": 9.999127276230146e-05, + "loss": 0.0388, + "step": 3340 + }, + { + "epoch": 1.949941792782305, + "grad_norm": 0.788226842880249, + "learning_rate": 9.999075030301184e-05, + "loss": 0.0463, + "step": 3350 + }, + { + "epoch": 1.9557625145518043, + "grad_norm": 0.47495588660240173, + "learning_rate": 9.999021265785221e-05, + "loss": 0.0508, + "step": 3360 + }, + { + "epoch": 1.961583236321304, + "grad_norm": 0.534824788570404, + "learning_rate": 9.998965982698589e-05, + "loss": 0.0619, + "step": 3370 + }, + { + "epoch": 1.9674039580908032, + "grad_norm": 0.3852725625038147, + "learning_rate": 9.998909181058082e-05, + "loss": 0.0404, + "step": 3380 + }, + { + "epoch": 1.9732246798603028, + "grad_norm": 0.563822865486145, + "learning_rate": 9.998850860880953e-05, + "loss": 0.0432, + "step": 3390 + }, + { + "epoch": 1.979045401629802, + "grad_norm": 0.5548107028007507, + "learning_rate": 9.998791022184922e-05, + "loss": 0.0409, + "step": 3400 + }, + { + "epoch": 1.9848661233993015, + "grad_norm": 0.36916905641555786, + "learning_rate": 9.99872966498816e-05, + "loss": 0.042, + "step": 3410 + }, + { + "epoch": 1.990686845168801, + "grad_norm": 0.4684253931045532, + "learning_rate": 9.998666789309313e-05, + "loss": 0.0432, + "step": 3420 + }, + { + "epoch": 1.9965075669383003, + "grad_norm": 0.6417539119720459, + "learning_rate": 9.998602395167475e-05, + "loss": 0.0412, + "step": 3430 + }, + { + "epoch": 2.0023282887077998, + "grad_norm": 0.5209254622459412, + "learning_rate": 9.998536482582213e-05, + "loss": 0.0535, + "step": 3440 + }, + { + "epoch": 2.008149010477299, + "grad_norm": 0.4283353090286255, + "learning_rate": 9.998469051573544e-05, + "loss": 0.0468, + "step": 3450 + }, + { + "epoch": 2.0139697322467986, + "grad_norm": 0.30979207158088684, + "learning_rate": 9.998400102161954e-05, + "loss": 0.0427, + "step": 3460 + }, + { + "epoch": 2.019790454016298, + "grad_norm": 0.39632585644721985, + "learning_rate": 9.998329634368388e-05, + "loss": 0.0382, + "step": 3470 + }, + { + "epoch": 2.0256111757857975, + "grad_norm": 0.4269607961177826, + "learning_rate": 9.998257648214253e-05, + "loss": 0.0383, + "step": 3480 + }, + { + "epoch": 2.0314318975552967, + "grad_norm": 0.4003724157810211, + "learning_rate": 9.998184143721417e-05, + "loss": 0.0483, + "step": 3490 + }, + { + "epoch": 2.0372526193247964, + "grad_norm": 0.4324784576892853, + "learning_rate": 9.998109120912206e-05, + "loss": 0.0412, + "step": 3500 + }, + { + "epoch": 2.0430733410942956, + "grad_norm": 0.5491438508033752, + "learning_rate": 9.998032579809411e-05, + "loss": 0.0459, + "step": 3510 + }, + { + "epoch": 2.0488940628637953, + "grad_norm": 0.4472302794456482, + "learning_rate": 9.997954520436286e-05, + "loss": 0.051, + "step": 3520 + }, + { + "epoch": 2.0547147846332945, + "grad_norm": 0.5161761045455933, + "learning_rate": 9.997874942816538e-05, + "loss": 0.0391, + "step": 3530 + }, + { + "epoch": 2.060535506402794, + "grad_norm": 0.3781692087650299, + "learning_rate": 9.997793846974345e-05, + "loss": 0.0502, + "step": 3540 + }, + { + "epoch": 2.0663562281722934, + "grad_norm": 0.45540711283683777, + "learning_rate": 9.997711232934341e-05, + "loss": 0.0419, + "step": 3550 + }, + { + "epoch": 2.0721769499417926, + "grad_norm": 0.43276068568229675, + "learning_rate": 9.99762710072162e-05, + "loss": 0.0433, + "step": 3560 + }, + { + "epoch": 2.0779976717112922, + "grad_norm": 0.4050533175468445, + "learning_rate": 9.997541450361743e-05, + "loss": 0.0517, + "step": 3570 + }, + { + "epoch": 2.0838183934807915, + "grad_norm": 0.4451623558998108, + "learning_rate": 9.997454281880723e-05, + "loss": 0.0509, + "step": 3580 + }, + { + "epoch": 2.089639115250291, + "grad_norm": 0.43508100509643555, + "learning_rate": 9.997365595305044e-05, + "loss": 0.045, + "step": 3590 + }, + { + "epoch": 2.0954598370197903, + "grad_norm": 0.5293198227882385, + "learning_rate": 9.997275390661644e-05, + "loss": 0.0472, + "step": 3600 + }, + { + "epoch": 2.10128055878929, + "grad_norm": 0.44087734818458557, + "learning_rate": 9.997183667977926e-05, + "loss": 0.0463, + "step": 3610 + }, + { + "epoch": 2.107101280558789, + "grad_norm": 0.4538293778896332, + "learning_rate": 9.997090427281752e-05, + "loss": 0.0435, + "step": 3620 + }, + { + "epoch": 2.112922002328289, + "grad_norm": 0.42595174908638, + "learning_rate": 9.996995668601448e-05, + "loss": 0.0391, + "step": 3630 + }, + { + "epoch": 2.118742724097788, + "grad_norm": 0.4689980745315552, + "learning_rate": 9.996899391965798e-05, + "loss": 0.0375, + "step": 3640 + }, + { + "epoch": 2.1245634458672877, + "grad_norm": 0.3825773298740387, + "learning_rate": 9.996801597404048e-05, + "loss": 0.0456, + "step": 3650 + }, + { + "epoch": 2.130384167636787, + "grad_norm": 0.42378920316696167, + "learning_rate": 9.996702284945905e-05, + "loss": 0.0365, + "step": 3660 + }, + { + "epoch": 2.1362048894062866, + "grad_norm": 0.3784042000770569, + "learning_rate": 9.996601454621539e-05, + "loss": 0.0429, + "step": 3670 + }, + { + "epoch": 2.142025611175786, + "grad_norm": 0.5618342161178589, + "learning_rate": 9.996499106461577e-05, + "loss": 0.0393, + "step": 3680 + }, + { + "epoch": 2.147846332945285, + "grad_norm": 0.5433347225189209, + "learning_rate": 9.996395240497112e-05, + "loss": 0.0489, + "step": 3690 + }, + { + "epoch": 2.1536670547147847, + "grad_norm": 0.5670026540756226, + "learning_rate": 9.996289856759696e-05, + "loss": 0.0525, + "step": 3700 + }, + { + "epoch": 2.159487776484284, + "grad_norm": 0.5390502214431763, + "learning_rate": 9.996182955281342e-05, + "loss": 0.0484, + "step": 3710 + }, + { + "epoch": 2.1653084982537836, + "grad_norm": 0.46189066767692566, + "learning_rate": 9.996074536094519e-05, + "loss": 0.0465, + "step": 3720 + }, + { + "epoch": 2.171129220023283, + "grad_norm": 0.4135039448738098, + "learning_rate": 9.995964599232168e-05, + "loss": 0.0407, + "step": 3730 + }, + { + "epoch": 2.1769499417927825, + "grad_norm": 0.4730028212070465, + "learning_rate": 9.995853144727683e-05, + "loss": 0.047, + "step": 3740 + }, + { + "epoch": 2.1827706635622817, + "grad_norm": 0.3549439013004303, + "learning_rate": 9.99574017261492e-05, + "loss": 0.0424, + "step": 3750 + }, + { + "epoch": 2.1885913853317813, + "grad_norm": 0.2521669566631317, + "learning_rate": 9.995625682928198e-05, + "loss": 0.0337, + "step": 3760 + }, + { + "epoch": 2.1944121071012805, + "grad_norm": 0.350687712430954, + "learning_rate": 9.995509675702295e-05, + "loss": 0.0357, + "step": 3770 + }, + { + "epoch": 2.2002328288707798, + "grad_norm": 0.28898248076438904, + "learning_rate": 9.995392150972451e-05, + "loss": 0.0378, + "step": 3780 + }, + { + "epoch": 2.2060535506402794, + "grad_norm": 0.33482974767684937, + "learning_rate": 9.995273108774366e-05, + "loss": 0.0477, + "step": 3790 + }, + { + "epoch": 2.2118742724097786, + "grad_norm": 0.30142876505851746, + "learning_rate": 9.995152549144205e-05, + "loss": 0.0484, + "step": 3800 + }, + { + "epoch": 2.2176949941792783, + "grad_norm": 0.32891640067100525, + "learning_rate": 9.995030472118587e-05, + "loss": 0.0436, + "step": 3810 + }, + { + "epoch": 2.2235157159487775, + "grad_norm": 0.5289058685302734, + "learning_rate": 9.9949068777346e-05, + "loss": 0.0448, + "step": 3820 + }, + { + "epoch": 2.229336437718277, + "grad_norm": 0.373087078332901, + "learning_rate": 9.994781766029786e-05, + "loss": 0.0363, + "step": 3830 + }, + { + "epoch": 2.2351571594877764, + "grad_norm": 0.487063467502594, + "learning_rate": 9.994655137042151e-05, + "loss": 0.0506, + "step": 3840 + }, + { + "epoch": 2.240977881257276, + "grad_norm": 0.35774511098861694, + "learning_rate": 9.99452699081016e-05, + "loss": 0.0406, + "step": 3850 + }, + { + "epoch": 2.2467986030267753, + "grad_norm": 0.5361164212226868, + "learning_rate": 9.994397327372743e-05, + "loss": 0.0481, + "step": 3860 + }, + { + "epoch": 2.252619324796275, + "grad_norm": 0.5463615655899048, + "learning_rate": 9.994266146769286e-05, + "loss": 0.0454, + "step": 3870 + }, + { + "epoch": 2.258440046565774, + "grad_norm": 0.4966629147529602, + "learning_rate": 9.994133449039642e-05, + "loss": 0.042, + "step": 3880 + }, + { + "epoch": 2.264260768335274, + "grad_norm": 0.47062358260154724, + "learning_rate": 9.993999234224118e-05, + "loss": 0.0485, + "step": 3890 + }, + { + "epoch": 2.270081490104773, + "grad_norm": 0.48285776376724243, + "learning_rate": 9.993863502363485e-05, + "loss": 0.0456, + "step": 3900 + }, + { + "epoch": 2.275902211874272, + "grad_norm": 0.30222591757774353, + "learning_rate": 9.993726253498976e-05, + "loss": 0.0348, + "step": 3910 + }, + { + "epoch": 2.281722933643772, + "grad_norm": 0.47466835379600525, + "learning_rate": 9.993587487672282e-05, + "loss": 0.0386, + "step": 3920 + }, + { + "epoch": 2.287543655413271, + "grad_norm": 0.4055164158344269, + "learning_rate": 9.993447204925558e-05, + "loss": 0.0415, + "step": 3930 + }, + { + "epoch": 2.2933643771827708, + "grad_norm": 0.3751679062843323, + "learning_rate": 9.993305405301416e-05, + "loss": 0.0486, + "step": 3940 + }, + { + "epoch": 2.29918509895227, + "grad_norm": 0.34478074312210083, + "learning_rate": 9.993162088842935e-05, + "loss": 0.051, + "step": 3950 + }, + { + "epoch": 2.3050058207217696, + "grad_norm": 0.486297070980072, + "learning_rate": 9.993017255593646e-05, + "loss": 0.061, + "step": 3960 + }, + { + "epoch": 2.310826542491269, + "grad_norm": 0.4932585656642914, + "learning_rate": 9.992870905597548e-05, + "loss": 0.0435, + "step": 3970 + }, + { + "epoch": 2.3166472642607685, + "grad_norm": 0.5525426268577576, + "learning_rate": 9.9927230388991e-05, + "loss": 0.0387, + "step": 3980 + }, + { + "epoch": 2.3224679860302677, + "grad_norm": 0.43700477480888367, + "learning_rate": 9.992573655543215e-05, + "loss": 0.0394, + "step": 3990 + }, + { + "epoch": 2.328288707799767, + "grad_norm": 0.38458186388015747, + "learning_rate": 9.992422755575277e-05, + "loss": 0.0375, + "step": 4000 + }, + { + "epoch": 2.3341094295692666, + "grad_norm": 0.40991055965423584, + "learning_rate": 9.992270339041123e-05, + "loss": 0.037, + "step": 4010 + }, + { + "epoch": 2.3399301513387663, + "grad_norm": 0.46249616146087646, + "learning_rate": 9.992116405987053e-05, + "loss": 0.0346, + "step": 4020 + }, + { + "epoch": 2.3457508731082655, + "grad_norm": 0.4323664605617523, + "learning_rate": 9.991960956459828e-05, + "loss": 0.038, + "step": 4030 + }, + { + "epoch": 2.3515715948777647, + "grad_norm": 0.39810648560523987, + "learning_rate": 9.991803990506669e-05, + "loss": 0.0409, + "step": 4040 + }, + { + "epoch": 2.3573923166472643, + "grad_norm": 0.4808797836303711, + "learning_rate": 9.991645508175258e-05, + "loss": 0.0471, + "step": 4050 + }, + { + "epoch": 2.3632130384167636, + "grad_norm": 0.32473814487457275, + "learning_rate": 9.99148550951374e-05, + "loss": 0.041, + "step": 4060 + }, + { + "epoch": 2.369033760186263, + "grad_norm": 0.4616101384162903, + "learning_rate": 9.991323994570716e-05, + "loss": 0.0495, + "step": 4070 + }, + { + "epoch": 2.3748544819557624, + "grad_norm": 0.40883034467697144, + "learning_rate": 9.99116096339525e-05, + "loss": 0.0415, + "step": 4080 + }, + { + "epoch": 2.380675203725262, + "grad_norm": 0.45047518610954285, + "learning_rate": 9.990996416036869e-05, + "loss": 0.0407, + "step": 4090 + }, + { + "epoch": 2.3864959254947613, + "grad_norm": 0.38528552651405334, + "learning_rate": 9.990830352545555e-05, + "loss": 0.0409, + "step": 4100 + }, + { + "epoch": 2.392316647264261, + "grad_norm": 0.39823997020721436, + "learning_rate": 9.990662772971756e-05, + "loss": 0.034, + "step": 4110 + }, + { + "epoch": 2.39813736903376, + "grad_norm": 0.40887370705604553, + "learning_rate": 9.990493677366376e-05, + "loss": 0.042, + "step": 4120 + }, + { + "epoch": 2.4039580908032594, + "grad_norm": 0.47967609763145447, + "learning_rate": 9.990323065780786e-05, + "loss": 0.0346, + "step": 4130 + }, + { + "epoch": 2.409778812572759, + "grad_norm": 0.41241252422332764, + "learning_rate": 9.990150938266808e-05, + "loss": 0.0402, + "step": 4140 + }, + { + "epoch": 2.4155995343422583, + "grad_norm": 0.49063795804977417, + "learning_rate": 9.989977294876733e-05, + "loss": 0.0392, + "step": 4150 + }, + { + "epoch": 2.421420256111758, + "grad_norm": 0.28759634494781494, + "learning_rate": 9.989802135663308e-05, + "loss": 0.0308, + "step": 4160 + }, + { + "epoch": 2.427240977881257, + "grad_norm": 0.424186646938324, + "learning_rate": 9.989625460679743e-05, + "loss": 0.0364, + "step": 4170 + }, + { + "epoch": 2.433061699650757, + "grad_norm": 0.4975746273994446, + "learning_rate": 9.989447269979706e-05, + "loss": 0.0387, + "step": 4180 + }, + { + "epoch": 2.438882421420256, + "grad_norm": 0.45739078521728516, + "learning_rate": 9.989267563617328e-05, + "loss": 0.0352, + "step": 4190 + }, + { + "epoch": 2.4447031431897557, + "grad_norm": 0.2972700297832489, + "learning_rate": 9.989086341647198e-05, + "loss": 0.0328, + "step": 4200 + }, + { + "epoch": 2.450523864959255, + "grad_norm": 0.24247117340564728, + "learning_rate": 9.988903604124366e-05, + "loss": 0.0362, + "step": 4210 + }, + { + "epoch": 2.4563445867287546, + "grad_norm": 0.3768739700317383, + "learning_rate": 9.988719351104343e-05, + "loss": 0.0341, + "step": 4220 + }, + { + "epoch": 2.4621653084982538, + "grad_norm": 0.5660073161125183, + "learning_rate": 9.9885335826431e-05, + "loss": 0.0344, + "step": 4230 + }, + { + "epoch": 2.4679860302677534, + "grad_norm": 0.4670793414115906, + "learning_rate": 9.988346298797071e-05, + "loss": 0.033, + "step": 4240 + }, + { + "epoch": 2.4738067520372526, + "grad_norm": 0.5636444687843323, + "learning_rate": 9.988157499623146e-05, + "loss": 0.0487, + "step": 4250 + }, + { + "epoch": 2.479627473806752, + "grad_norm": 0.5248582363128662, + "learning_rate": 9.987967185178677e-05, + "loss": 0.0347, + "step": 4260 + }, + { + "epoch": 2.4854481955762515, + "grad_norm": 0.5512403249740601, + "learning_rate": 9.987775355521476e-05, + "loss": 0.0421, + "step": 4270 + }, + { + "epoch": 2.4912689173457507, + "grad_norm": 0.4049229025840759, + "learning_rate": 9.987582010709817e-05, + "loss": 0.0418, + "step": 4280 + }, + { + "epoch": 2.4970896391152504, + "grad_norm": 0.30030813813209534, + "learning_rate": 9.987387150802431e-05, + "loss": 0.0555, + "step": 4290 + }, + { + "epoch": 2.5029103608847496, + "grad_norm": 0.5155074596405029, + "learning_rate": 9.987190775858517e-05, + "loss": 0.0336, + "step": 4300 + }, + { + "epoch": 2.5087310826542493, + "grad_norm": 0.5060864090919495, + "learning_rate": 9.98699288593772e-05, + "loss": 0.0484, + "step": 4310 + }, + { + "epoch": 2.5145518044237485, + "grad_norm": 0.4157051146030426, + "learning_rate": 9.986793481100161e-05, + "loss": 0.0367, + "step": 4320 + }, + { + "epoch": 2.520372526193248, + "grad_norm": 0.3570779263973236, + "learning_rate": 9.986592561406412e-05, + "loss": 0.0395, + "step": 4330 + }, + { + "epoch": 2.5261932479627474, + "grad_norm": 0.3446103632450104, + "learning_rate": 9.986390126917503e-05, + "loss": 0.0262, + "step": 4340 + }, + { + "epoch": 2.5320139697322466, + "grad_norm": 0.3270779848098755, + "learning_rate": 9.986186177694933e-05, + "loss": 0.0345, + "step": 4350 + }, + { + "epoch": 2.5378346915017462, + "grad_norm": 0.4974748492240906, + "learning_rate": 9.985980713800656e-05, + "loss": 0.0493, + "step": 4360 + }, + { + "epoch": 2.543655413271246, + "grad_norm": 0.48646602034568787, + "learning_rate": 9.985773735297084e-05, + "loss": 0.0305, + "step": 4370 + }, + { + "epoch": 2.549476135040745, + "grad_norm": 0.436065137386322, + "learning_rate": 9.985565242247092e-05, + "loss": 0.0416, + "step": 4380 + }, + { + "epoch": 2.5552968568102443, + "grad_norm": 0.4028678834438324, + "learning_rate": 9.985355234714016e-05, + "loss": 0.0471, + "step": 4390 + }, + { + "epoch": 2.561117578579744, + "grad_norm": 0.6195926070213318, + "learning_rate": 9.985143712761652e-05, + "loss": 0.0329, + "step": 4400 + }, + { + "epoch": 2.566938300349243, + "grad_norm": 0.4351823925971985, + "learning_rate": 9.984930676454252e-05, + "loss": 0.0371, + "step": 4410 + }, + { + "epoch": 2.572759022118743, + "grad_norm": 0.37860992550849915, + "learning_rate": 9.984716125856532e-05, + "loss": 0.0367, + "step": 4420 + }, + { + "epoch": 2.578579743888242, + "grad_norm": 0.39674344658851624, + "learning_rate": 9.984500061033667e-05, + "loss": 0.0314, + "step": 4430 + }, + { + "epoch": 2.5844004656577413, + "grad_norm": 0.3547307252883911, + "learning_rate": 9.984282482051293e-05, + "loss": 0.0344, + "step": 4440 + }, + { + "epoch": 2.590221187427241, + "grad_norm": 0.3130470812320709, + "learning_rate": 9.9840633889755e-05, + "loss": 0.0373, + "step": 4450 + }, + { + "epoch": 2.5960419091967406, + "grad_norm": 0.39799734950065613, + "learning_rate": 9.983842781872848e-05, + "loss": 0.042, + "step": 4460 + }, + { + "epoch": 2.60186263096624, + "grad_norm": 0.2947298288345337, + "learning_rate": 9.98362066081035e-05, + "loss": 0.0377, + "step": 4470 + }, + { + "epoch": 2.607683352735739, + "grad_norm": 0.34159260988235474, + "learning_rate": 9.983397025855479e-05, + "loss": 0.0296, + "step": 4480 + }, + { + "epoch": 2.6135040745052387, + "grad_norm": 0.32064080238342285, + "learning_rate": 9.983171877076171e-05, + "loss": 0.0325, + "step": 4490 + }, + { + "epoch": 2.619324796274738, + "grad_norm": 0.40089815855026245, + "learning_rate": 9.98294521454082e-05, + "loss": 0.0451, + "step": 4500 + }, + { + "epoch": 2.6251455180442376, + "grad_norm": 0.3698492646217346, + "learning_rate": 9.98271703831828e-05, + "loss": 0.036, + "step": 4510 + }, + { + "epoch": 2.630966239813737, + "grad_norm": 0.41546982526779175, + "learning_rate": 9.982487348477865e-05, + "loss": 0.0467, + "step": 4520 + }, + { + "epoch": 2.6367869615832364, + "grad_norm": 0.4187735915184021, + "learning_rate": 9.982256145089347e-05, + "loss": 0.0401, + "step": 4530 + }, + { + "epoch": 2.6426076833527357, + "grad_norm": 0.41227519512176514, + "learning_rate": 9.982023428222962e-05, + "loss": 0.0308, + "step": 4540 + }, + { + "epoch": 2.6484284051222353, + "grad_norm": 0.42545995116233826, + "learning_rate": 9.981789197949403e-05, + "loss": 0.0461, + "step": 4550 + }, + { + "epoch": 2.6542491268917345, + "grad_norm": 0.41365307569503784, + "learning_rate": 9.98155345433982e-05, + "loss": 0.0374, + "step": 4560 + }, + { + "epoch": 2.6600698486612337, + "grad_norm": 0.329509437084198, + "learning_rate": 9.981316197465831e-05, + "loss": 0.0381, + "step": 4570 + }, + { + "epoch": 2.6658905704307334, + "grad_norm": 0.44999194145202637, + "learning_rate": 9.981077427399504e-05, + "loss": 0.0356, + "step": 4580 + }, + { + "epoch": 2.671711292200233, + "grad_norm": 0.20365506410598755, + "learning_rate": 9.980837144213371e-05, + "loss": 0.0355, + "step": 4590 + }, + { + "epoch": 2.6775320139697323, + "grad_norm": 0.3129006326198578, + "learning_rate": 9.980595347980426e-05, + "loss": 0.033, + "step": 4600 + }, + { + "epoch": 2.6833527357392315, + "grad_norm": 0.3402425944805145, + "learning_rate": 9.980352038774119e-05, + "loss": 0.0451, + "step": 4610 + }, + { + "epoch": 2.689173457508731, + "grad_norm": 0.30726802349090576, + "learning_rate": 9.98010721666836e-05, + "loss": 0.0421, + "step": 4620 + }, + { + "epoch": 2.6949941792782304, + "grad_norm": 0.35414859652519226, + "learning_rate": 9.979860881737523e-05, + "loss": 0.0308, + "step": 4630 + }, + { + "epoch": 2.70081490104773, + "grad_norm": 0.5069484710693359, + "learning_rate": 9.979613034056434e-05, + "loss": 0.0387, + "step": 4640 + }, + { + "epoch": 2.7066356228172292, + "grad_norm": 0.4498346745967865, + "learning_rate": 9.979363673700386e-05, + "loss": 0.0457, + "step": 4650 + }, + { + "epoch": 2.712456344586729, + "grad_norm": 0.46107155084609985, + "learning_rate": 9.979112800745124e-05, + "loss": 0.0467, + "step": 4660 + }, + { + "epoch": 2.718277066356228, + "grad_norm": 0.32879555225372314, + "learning_rate": 9.978860415266861e-05, + "loss": 0.0415, + "step": 4670 + }, + { + "epoch": 2.724097788125728, + "grad_norm": 0.37707000970840454, + "learning_rate": 9.978606517342262e-05, + "loss": 0.0335, + "step": 4680 + }, + { + "epoch": 2.729918509895227, + "grad_norm": 0.22451092302799225, + "learning_rate": 9.978351107048456e-05, + "loss": 0.0421, + "step": 4690 + }, + { + "epoch": 2.735739231664726, + "grad_norm": 0.4823485016822815, + "learning_rate": 9.978094184463029e-05, + "loss": 0.0424, + "step": 4700 + }, + { + "epoch": 2.741559953434226, + "grad_norm": 0.4330075979232788, + "learning_rate": 9.977835749664029e-05, + "loss": 0.0389, + "step": 4710 + }, + { + "epoch": 2.7473806752037255, + "grad_norm": 0.3777349293231964, + "learning_rate": 9.97757580272996e-05, + "loss": 0.0288, + "step": 4720 + }, + { + "epoch": 2.7532013969732247, + "grad_norm": 0.3479163646697998, + "learning_rate": 9.977314343739786e-05, + "loss": 0.0276, + "step": 4730 + }, + { + "epoch": 2.759022118742724, + "grad_norm": 0.3058561384677887, + "learning_rate": 9.977051372772934e-05, + "loss": 0.044, + "step": 4740 + }, + { + "epoch": 2.7648428405122236, + "grad_norm": 0.30302882194519043, + "learning_rate": 9.976786889909286e-05, + "loss": 0.0301, + "step": 4750 + }, + { + "epoch": 2.770663562281723, + "grad_norm": 0.4651854634284973, + "learning_rate": 9.976520895229185e-05, + "loss": 0.0331, + "step": 4760 + }, + { + "epoch": 2.7764842840512225, + "grad_norm": 0.39766213297843933, + "learning_rate": 9.976253388813433e-05, + "loss": 0.0392, + "step": 4770 + }, + { + "epoch": 2.7823050058207217, + "grad_norm": 0.335710346698761, + "learning_rate": 9.975984370743293e-05, + "loss": 0.0387, + "step": 4780 + }, + { + "epoch": 2.788125727590221, + "grad_norm": 0.3780670166015625, + "learning_rate": 9.975713841100485e-05, + "loss": 0.0448, + "step": 4790 + }, + { + "epoch": 2.7939464493597206, + "grad_norm": 0.26506704092025757, + "learning_rate": 9.975441799967187e-05, + "loss": 0.035, + "step": 4800 + }, + { + "epoch": 2.7997671711292202, + "grad_norm": 0.22306565940380096, + "learning_rate": 9.975168247426039e-05, + "loss": 0.0336, + "step": 4810 + }, + { + "epoch": 2.8055878928987195, + "grad_norm": 0.44731366634368896, + "learning_rate": 9.974893183560139e-05, + "loss": 0.0338, + "step": 4820 + }, + { + "epoch": 2.8114086146682187, + "grad_norm": 0.3656608760356903, + "learning_rate": 9.974616608453045e-05, + "loss": 0.0366, + "step": 4830 + }, + { + "epoch": 2.8172293364377183, + "grad_norm": 0.37372320890426636, + "learning_rate": 9.974338522188772e-05, + "loss": 0.0359, + "step": 4840 + }, + { + "epoch": 2.8230500582072175, + "grad_norm": 0.42058780789375305, + "learning_rate": 9.974058924851797e-05, + "loss": 0.0334, + "step": 4850 + }, + { + "epoch": 2.828870779976717, + "grad_norm": 0.41544508934020996, + "learning_rate": 9.973777816527051e-05, + "loss": 0.0369, + "step": 4860 + }, + { + "epoch": 2.8346915017462164, + "grad_norm": 0.41404104232788086, + "learning_rate": 9.973495197299931e-05, + "loss": 0.0387, + "step": 4870 + }, + { + "epoch": 2.840512223515716, + "grad_norm": 0.4846097230911255, + "learning_rate": 9.973211067256287e-05, + "loss": 0.0373, + "step": 4880 + }, + { + "epoch": 2.8463329452852153, + "grad_norm": 0.5090671181678772, + "learning_rate": 9.97292542648243e-05, + "loss": 0.0444, + "step": 4890 + }, + { + "epoch": 2.852153667054715, + "grad_norm": 0.4326211214065552, + "learning_rate": 9.972638275065131e-05, + "loss": 0.0375, + "step": 4900 + }, + { + "epoch": 2.857974388824214, + "grad_norm": 0.3988668620586395, + "learning_rate": 9.972349613091621e-05, + "loss": 0.0443, + "step": 4910 + }, + { + "epoch": 2.8637951105937134, + "grad_norm": 0.4106663763523102, + "learning_rate": 9.972059440649584e-05, + "loss": 0.047, + "step": 4920 + }, + { + "epoch": 2.869615832363213, + "grad_norm": 0.44620198011398315, + "learning_rate": 9.971767757827168e-05, + "loss": 0.0423, + "step": 4930 + }, + { + "epoch": 2.8754365541327127, + "grad_norm": 0.4232618808746338, + "learning_rate": 9.971474564712982e-05, + "loss": 0.0441, + "step": 4940 + }, + { + "epoch": 2.881257275902212, + "grad_norm": 0.4010876715183258, + "learning_rate": 9.971179861396084e-05, + "loss": 0.0375, + "step": 4950 + }, + { + "epoch": 2.887077997671711, + "grad_norm": 0.41923055052757263, + "learning_rate": 9.970883647966003e-05, + "loss": 0.0382, + "step": 4960 + }, + { + "epoch": 2.892898719441211, + "grad_norm": 0.49063077569007874, + "learning_rate": 9.970585924512717e-05, + "loss": 0.0397, + "step": 4970 + }, + { + "epoch": 2.89871944121071, + "grad_norm": 0.28669509291648865, + "learning_rate": 9.970286691126669e-05, + "loss": 0.0302, + "step": 4980 + }, + { + "epoch": 2.9045401629802097, + "grad_norm": 0.3314480185508728, + "learning_rate": 9.969985947898756e-05, + "loss": 0.0395, + "step": 4990 + }, + { + "epoch": 2.910360884749709, + "grad_norm": 0.3386068642139435, + "learning_rate": 9.969683694920337e-05, + "loss": 0.0491, + "step": 5000 + }, + { + "epoch": 2.9161816065192085, + "grad_norm": 0.36342647671699524, + "learning_rate": 9.969379932283228e-05, + "loss": 0.0384, + "step": 5010 + }, + { + "epoch": 2.9220023282887078, + "grad_norm": 0.5353965163230896, + "learning_rate": 9.969074660079704e-05, + "loss": 0.0415, + "step": 5020 + }, + { + "epoch": 2.9278230500582074, + "grad_norm": 0.489899218082428, + "learning_rate": 9.968767878402501e-05, + "loss": 0.0387, + "step": 5030 + }, + { + "epoch": 2.9336437718277066, + "grad_norm": 0.3332742750644684, + "learning_rate": 9.968459587344808e-05, + "loss": 0.0386, + "step": 5040 + }, + { + "epoch": 2.939464493597206, + "grad_norm": 0.4086686074733734, + "learning_rate": 9.968149787000278e-05, + "loss": 0.0387, + "step": 5050 + }, + { + "epoch": 2.9452852153667055, + "grad_norm": 0.3574705123901367, + "learning_rate": 9.967838477463018e-05, + "loss": 0.0284, + "step": 5060 + }, + { + "epoch": 2.9511059371362047, + "grad_norm": 0.39927080273628235, + "learning_rate": 9.967525658827597e-05, + "loss": 0.0368, + "step": 5070 + }, + { + "epoch": 2.9569266589057044, + "grad_norm": 0.30566880106925964, + "learning_rate": 9.967211331189042e-05, + "loss": 0.0425, + "step": 5080 + }, + { + "epoch": 2.9627473806752036, + "grad_norm": 0.4606829583644867, + "learning_rate": 9.966895494642834e-05, + "loss": 0.0486, + "step": 5090 + }, + { + "epoch": 2.9685681024447033, + "grad_norm": 0.4065619111061096, + "learning_rate": 9.96657814928492e-05, + "loss": 0.0466, + "step": 5100 + }, + { + "epoch": 2.9743888242142025, + "grad_norm": 0.3218784034252167, + "learning_rate": 9.966259295211697e-05, + "loss": 0.0388, + "step": 5110 + }, + { + "epoch": 2.980209545983702, + "grad_norm": 0.25869008898735046, + "learning_rate": 9.965938932520028e-05, + "loss": 0.0381, + "step": 5120 + }, + { + "epoch": 2.9860302677532014, + "grad_norm": 0.24495179951190948, + "learning_rate": 9.965617061307229e-05, + "loss": 0.0331, + "step": 5130 + }, + { + "epoch": 2.9918509895227006, + "grad_norm": 0.3310537338256836, + "learning_rate": 9.965293681671077e-05, + "loss": 0.0386, + "step": 5140 + }, + { + "epoch": 2.9976717112922002, + "grad_norm": 0.35270634293556213, + "learning_rate": 9.964968793709804e-05, + "loss": 0.0399, + "step": 5150 + }, + { + "epoch": 3.0034924330616994, + "grad_norm": 0.43722623586654663, + "learning_rate": 9.964642397522106e-05, + "loss": 0.042, + "step": 5160 + }, + { + "epoch": 3.009313154831199, + "grad_norm": 0.4060680866241455, + "learning_rate": 9.96431449320713e-05, + "loss": 0.0345, + "step": 5170 + }, + { + "epoch": 3.0151338766006983, + "grad_norm": 0.2799883186817169, + "learning_rate": 9.963985080864486e-05, + "loss": 0.0387, + "step": 5180 + }, + { + "epoch": 3.020954598370198, + "grad_norm": 0.37053537368774414, + "learning_rate": 9.96365416059424e-05, + "loss": 0.0314, + "step": 5190 + }, + { + "epoch": 3.026775320139697, + "grad_norm": 0.2757987380027771, + "learning_rate": 9.963321732496919e-05, + "loss": 0.0356, + "step": 5200 + }, + { + "epoch": 3.032596041909197, + "grad_norm": 0.4192562401294708, + "learning_rate": 9.962987796673506e-05, + "loss": 0.038, + "step": 5210 + }, + { + "epoch": 3.038416763678696, + "grad_norm": 0.36045825481414795, + "learning_rate": 9.962652353225438e-05, + "loss": 0.0299, + "step": 5220 + }, + { + "epoch": 3.0442374854481957, + "grad_norm": 0.3032984137535095, + "learning_rate": 9.962315402254619e-05, + "loss": 0.0377, + "step": 5230 + }, + { + "epoch": 3.050058207217695, + "grad_norm": 0.3811168372631073, + "learning_rate": 9.9619769438634e-05, + "loss": 0.0376, + "step": 5240 + }, + { + "epoch": 3.0558789289871946, + "grad_norm": 0.3403345048427582, + "learning_rate": 9.9616369781546e-05, + "loss": 0.0304, + "step": 5250 + }, + { + "epoch": 3.061699650756694, + "grad_norm": 0.29770582914352417, + "learning_rate": 9.961295505231491e-05, + "loss": 0.0371, + "step": 5260 + }, + { + "epoch": 3.067520372526193, + "grad_norm": 0.40513524413108826, + "learning_rate": 9.960952525197804e-05, + "loss": 0.0342, + "step": 5270 + }, + { + "epoch": 3.0733410942956927, + "grad_norm": 0.41450560092926025, + "learning_rate": 9.960608038157724e-05, + "loss": 0.028, + "step": 5280 + }, + { + "epoch": 3.079161816065192, + "grad_norm": 0.4197642505168915, + "learning_rate": 9.960262044215901e-05, + "loss": 0.0405, + "step": 5290 + }, + { + "epoch": 3.0849825378346916, + "grad_norm": 0.3120761215686798, + "learning_rate": 9.959914543477435e-05, + "loss": 0.048, + "step": 5300 + }, + { + "epoch": 3.090803259604191, + "grad_norm": 0.3364720642566681, + "learning_rate": 9.959565536047892e-05, + "loss": 0.0536, + "step": 5310 + }, + { + "epoch": 3.0966239813736904, + "grad_norm": 0.30635660886764526, + "learning_rate": 9.959215022033288e-05, + "loss": 0.0378, + "step": 5320 + }, + { + "epoch": 3.1024447031431897, + "grad_norm": 0.45363980531692505, + "learning_rate": 9.9588630015401e-05, + "loss": 0.0424, + "step": 5330 + }, + { + "epoch": 3.1082654249126893, + "grad_norm": 0.3181758522987366, + "learning_rate": 9.958509474675264e-05, + "loss": 0.0351, + "step": 5340 + }, + { + "epoch": 3.1140861466821885, + "grad_norm": 0.3375316560268402, + "learning_rate": 9.958154441546171e-05, + "loss": 0.0412, + "step": 5350 + }, + { + "epoch": 3.119906868451688, + "grad_norm": 0.22768385708332062, + "learning_rate": 9.957797902260673e-05, + "loss": 0.0389, + "step": 5360 + }, + { + "epoch": 3.1257275902211874, + "grad_norm": 0.3175099194049835, + "learning_rate": 9.957439856927073e-05, + "loss": 0.031, + "step": 5370 + }, + { + "epoch": 3.131548311990687, + "grad_norm": 0.2502155005931854, + "learning_rate": 9.957080305654139e-05, + "loss": 0.0381, + "step": 5380 + }, + { + "epoch": 3.1373690337601863, + "grad_norm": 0.47000524401664734, + "learning_rate": 9.956719248551092e-05, + "loss": 0.0427, + "step": 5390 + }, + { + "epoch": 3.1431897555296855, + "grad_norm": 0.504729688167572, + "learning_rate": 9.956356685727612e-05, + "loss": 0.0384, + "step": 5400 + }, + { + "epoch": 3.149010477299185, + "grad_norm": 0.45616719126701355, + "learning_rate": 9.955992617293836e-05, + "loss": 0.0379, + "step": 5410 + }, + { + "epoch": 3.1548311990686844, + "grad_norm": 0.29938599467277527, + "learning_rate": 9.955627043360358e-05, + "loss": 0.0465, + "step": 5420 + }, + { + "epoch": 3.160651920838184, + "grad_norm": 0.40350016951560974, + "learning_rate": 9.955259964038231e-05, + "loss": 0.0331, + "step": 5430 + }, + { + "epoch": 3.1664726426076832, + "grad_norm": 0.5336388945579529, + "learning_rate": 9.954891379438962e-05, + "loss": 0.0388, + "step": 5440 + }, + { + "epoch": 3.172293364377183, + "grad_norm": 0.4100174903869629, + "learning_rate": 9.954521289674519e-05, + "loss": 0.0438, + "step": 5450 + }, + { + "epoch": 3.178114086146682, + "grad_norm": 0.33198121190071106, + "learning_rate": 9.954149694857325e-05, + "loss": 0.036, + "step": 5460 + }, + { + "epoch": 3.1839348079161818, + "grad_norm": 0.43370336294174194, + "learning_rate": 9.953776595100258e-05, + "loss": 0.0448, + "step": 5470 + }, + { + "epoch": 3.189755529685681, + "grad_norm": 0.45768406987190247, + "learning_rate": 9.95340199051666e-05, + "loss": 0.0309, + "step": 5480 + }, + { + "epoch": 3.1955762514551806, + "grad_norm": 0.35636889934539795, + "learning_rate": 9.953025881220325e-05, + "loss": 0.0326, + "step": 5490 + }, + { + "epoch": 3.20139697322468, + "grad_norm": 0.3238945007324219, + "learning_rate": 9.952648267325504e-05, + "loss": 0.0344, + "step": 5500 + }, + { + "epoch": 3.207217694994179, + "grad_norm": 0.4708104729652405, + "learning_rate": 9.952269148946905e-05, + "loss": 0.0347, + "step": 5510 + }, + { + "epoch": 3.2130384167636787, + "grad_norm": 0.3420480191707611, + "learning_rate": 9.951888526199697e-05, + "loss": 0.0348, + "step": 5520 + }, + { + "epoch": 3.218859138533178, + "grad_norm": 0.28190043568611145, + "learning_rate": 9.951506399199501e-05, + "loss": 0.0339, + "step": 5530 + }, + { + "epoch": 3.2246798603026776, + "grad_norm": 0.3611662983894348, + "learning_rate": 9.951122768062399e-05, + "loss": 0.0305, + "step": 5540 + }, + { + "epoch": 3.230500582072177, + "grad_norm": 0.25071653723716736, + "learning_rate": 9.950737632904927e-05, + "loss": 0.0349, + "step": 5550 + }, + { + "epoch": 3.2363213038416765, + "grad_norm": 0.30609941482543945, + "learning_rate": 9.950350993844077e-05, + "loss": 0.0377, + "step": 5560 + }, + { + "epoch": 3.2421420256111757, + "grad_norm": 0.346263587474823, + "learning_rate": 9.949962850997303e-05, + "loss": 0.0335, + "step": 5570 + }, + { + "epoch": 3.2479627473806754, + "grad_norm": 0.3590392470359802, + "learning_rate": 9.949573204482512e-05, + "loss": 0.0247, + "step": 5580 + }, + { + "epoch": 3.2537834691501746, + "grad_norm": 0.40216776728630066, + "learning_rate": 9.949182054418064e-05, + "loss": 0.0306, + "step": 5590 + }, + { + "epoch": 3.2596041909196742, + "grad_norm": 0.31848517060279846, + "learning_rate": 9.948789400922787e-05, + "loss": 0.0376, + "step": 5600 + }, + { + "epoch": 3.2654249126891735, + "grad_norm": 0.37429991364479065, + "learning_rate": 9.948395244115953e-05, + "loss": 0.0365, + "step": 5610 + }, + { + "epoch": 3.2712456344586727, + "grad_norm": 0.3046098053455353, + "learning_rate": 9.9479995841173e-05, + "loss": 0.0381, + "step": 5620 + }, + { + "epoch": 3.2770663562281723, + "grad_norm": 0.2501715123653412, + "learning_rate": 9.947602421047017e-05, + "loss": 0.0268, + "step": 5630 + }, + { + "epoch": 3.2828870779976715, + "grad_norm": 0.4236372709274292, + "learning_rate": 9.947203755025753e-05, + "loss": 0.0372, + "step": 5640 + }, + { + "epoch": 3.288707799767171, + "grad_norm": 0.3400777578353882, + "learning_rate": 9.946803586174611e-05, + "loss": 0.0311, + "step": 5650 + }, + { + "epoch": 3.2945285215366704, + "grad_norm": 0.33404913544654846, + "learning_rate": 9.946401914615151e-05, + "loss": 0.0334, + "step": 5660 + }, + { + "epoch": 3.30034924330617, + "grad_norm": 0.27669253945350647, + "learning_rate": 9.945998740469394e-05, + "loss": 0.028, + "step": 5670 + }, + { + "epoch": 3.3061699650756693, + "grad_norm": 0.27043387293815613, + "learning_rate": 9.945594063859809e-05, + "loss": 0.0416, + "step": 5680 + }, + { + "epoch": 3.311990686845169, + "grad_norm": 0.40608108043670654, + "learning_rate": 9.94518788490933e-05, + "loss": 0.0269, + "step": 5690 + }, + { + "epoch": 3.317811408614668, + "grad_norm": 0.3451448380947113, + "learning_rate": 9.944780203741341e-05, + "loss": 0.0423, + "step": 5700 + }, + { + "epoch": 3.323632130384168, + "grad_norm": 0.3251936435699463, + "learning_rate": 9.944371020479686e-05, + "loss": 0.0393, + "step": 5710 + }, + { + "epoch": 3.329452852153667, + "grad_norm": 0.4305363893508911, + "learning_rate": 9.943960335248662e-05, + "loss": 0.0346, + "step": 5720 + }, + { + "epoch": 3.3352735739231667, + "grad_norm": 0.4813692271709442, + "learning_rate": 9.943548148173027e-05, + "loss": 0.0386, + "step": 5730 + }, + { + "epoch": 3.341094295692666, + "grad_norm": 0.3841996192932129, + "learning_rate": 9.943134459377992e-05, + "loss": 0.0288, + "step": 5740 + }, + { + "epoch": 3.346915017462165, + "grad_norm": 0.2989925444126129, + "learning_rate": 9.942719268989222e-05, + "loss": 0.0376, + "step": 5750 + }, + { + "epoch": 3.352735739231665, + "grad_norm": 0.2500865161418915, + "learning_rate": 9.942302577132844e-05, + "loss": 0.0276, + "step": 5760 + }, + { + "epoch": 3.358556461001164, + "grad_norm": 0.4245300889015198, + "learning_rate": 9.941884383935438e-05, + "loss": 0.0313, + "step": 5770 + }, + { + "epoch": 3.3643771827706637, + "grad_norm": 0.348772257566452, + "learning_rate": 9.941464689524039e-05, + "loss": 0.0329, + "step": 5780 + }, + { + "epoch": 3.370197904540163, + "grad_norm": 0.37258872389793396, + "learning_rate": 9.941043494026139e-05, + "loss": 0.0343, + "step": 5790 + }, + { + "epoch": 3.3760186263096625, + "grad_norm": 0.28229570388793945, + "learning_rate": 9.940620797569685e-05, + "loss": 0.0336, + "step": 5800 + }, + { + "epoch": 3.3818393480791618, + "grad_norm": 0.38085484504699707, + "learning_rate": 9.940196600283082e-05, + "loss": 0.0485, + "step": 5810 + }, + { + "epoch": 3.3876600698486614, + "grad_norm": 0.268378883600235, + "learning_rate": 9.939770902295192e-05, + "loss": 0.0456, + "step": 5820 + }, + { + "epoch": 3.3934807916181606, + "grad_norm": 0.31732895970344543, + "learning_rate": 9.939343703735329e-05, + "loss": 0.0385, + "step": 5830 + }, + { + "epoch": 3.39930151338766, + "grad_norm": 0.36037662625312805, + "learning_rate": 9.938915004733264e-05, + "loss": 0.0334, + "step": 5840 + }, + { + "epoch": 3.4051222351571595, + "grad_norm": 0.33846715092658997, + "learning_rate": 9.938484805419224e-05, + "loss": 0.0352, + "step": 5850 + }, + { + "epoch": 3.4109429569266587, + "grad_norm": 0.36437246203422546, + "learning_rate": 9.938053105923894e-05, + "loss": 0.0335, + "step": 5860 + }, + { + "epoch": 3.4167636786961584, + "grad_norm": 0.4330507218837738, + "learning_rate": 9.937619906378413e-05, + "loss": 0.0307, + "step": 5870 + }, + { + "epoch": 3.4225844004656576, + "grad_norm": 0.4035661816596985, + "learning_rate": 9.937185206914374e-05, + "loss": 0.0352, + "step": 5880 + }, + { + "epoch": 3.4284051222351573, + "grad_norm": 0.4324305057525635, + "learning_rate": 9.936749007663829e-05, + "loss": 0.0346, + "step": 5890 + }, + { + "epoch": 3.4342258440046565, + "grad_norm": 0.4129992127418518, + "learning_rate": 9.93631130875928e-05, + "loss": 0.0349, + "step": 5900 + }, + { + "epoch": 3.440046565774156, + "grad_norm": 0.40750840306282043, + "learning_rate": 9.935872110333692e-05, + "loss": 0.0435, + "step": 5910 + }, + { + "epoch": 3.4458672875436553, + "grad_norm": 0.43066924810409546, + "learning_rate": 9.935431412520484e-05, + "loss": 0.0395, + "step": 5920 + }, + { + "epoch": 3.451688009313155, + "grad_norm": 0.21329356729984283, + "learning_rate": 9.934989215453523e-05, + "loss": 0.0367, + "step": 5930 + }, + { + "epoch": 3.457508731082654, + "grad_norm": 0.25613340735435486, + "learning_rate": 9.934545519267139e-05, + "loss": 0.036, + "step": 5940 + }, + { + "epoch": 3.463329452852154, + "grad_norm": 0.3532758951187134, + "learning_rate": 9.934100324096117e-05, + "loss": 0.0312, + "step": 5950 + }, + { + "epoch": 3.469150174621653, + "grad_norm": 0.31850937008857727, + "learning_rate": 9.933653630075692e-05, + "loss": 0.0346, + "step": 5960 + }, + { + "epoch": 3.4749708963911523, + "grad_norm": 0.32994377613067627, + "learning_rate": 9.93320543734156e-05, + "loss": 0.0362, + "step": 5970 + }, + { + "epoch": 3.480791618160652, + "grad_norm": 0.309565931558609, + "learning_rate": 9.932755746029871e-05, + "loss": 0.0258, + "step": 5980 + }, + { + "epoch": 3.486612339930151, + "grad_norm": 0.39881595969200134, + "learning_rate": 9.932304556277228e-05, + "loss": 0.0328, + "step": 5990 + }, + { + "epoch": 3.492433061699651, + "grad_norm": 0.43460991978645325, + "learning_rate": 9.93185186822069e-05, + "loss": 0.0396, + "step": 6000 + }, + { + "epoch": 3.49825378346915, + "grad_norm": 0.3899631202220917, + "learning_rate": 9.931397681997773e-05, + "loss": 0.0349, + "step": 6010 + }, + { + "epoch": 3.5040745052386497, + "grad_norm": 0.37574502825737, + "learning_rate": 9.930941997746446e-05, + "loss": 0.0364, + "step": 6020 + }, + { + "epoch": 3.509895227008149, + "grad_norm": 0.3812849223613739, + "learning_rate": 9.930484815605134e-05, + "loss": 0.0374, + "step": 6030 + }, + { + "epoch": 3.5157159487776486, + "grad_norm": 0.46897608041763306, + "learning_rate": 9.930026135712717e-05, + "loss": 0.0411, + "step": 6040 + }, + { + "epoch": 3.521536670547148, + "grad_norm": 0.4864242672920227, + "learning_rate": 9.92956595820853e-05, + "loss": 0.035, + "step": 6050 + }, + { + "epoch": 3.527357392316647, + "grad_norm": 0.4410932660102844, + "learning_rate": 9.929104283232362e-05, + "loss": 0.0305, + "step": 6060 + }, + { + "epoch": 3.5331781140861467, + "grad_norm": 0.3106556236743927, + "learning_rate": 9.92864111092446e-05, + "loss": 0.0356, + "step": 6070 + }, + { + "epoch": 3.5389988358556463, + "grad_norm": 0.38226425647735596, + "learning_rate": 9.92817644142552e-05, + "loss": 0.0322, + "step": 6080 + }, + { + "epoch": 3.5448195576251456, + "grad_norm": 0.28042078018188477, + "learning_rate": 9.927710274876698e-05, + "loss": 0.0345, + "step": 6090 + }, + { + "epoch": 3.5506402793946448, + "grad_norm": 0.5434801578521729, + "learning_rate": 9.927242611419603e-05, + "loss": 0.0346, + "step": 6100 + }, + { + "epoch": 3.5564610011641444, + "grad_norm": 0.3740876615047455, + "learning_rate": 9.926773451196301e-05, + "loss": 0.0404, + "step": 6110 + }, + { + "epoch": 3.5622817229336436, + "grad_norm": 0.29763171076774597, + "learning_rate": 9.926302794349306e-05, + "loss": 0.0358, + "step": 6120 + }, + { + "epoch": 3.5681024447031433, + "grad_norm": 0.3736281096935272, + "learning_rate": 9.925830641021594e-05, + "loss": 0.0433, + "step": 6130 + }, + { + "epoch": 3.5739231664726425, + "grad_norm": 0.31615135073661804, + "learning_rate": 9.925356991356593e-05, + "loss": 0.0314, + "step": 6140 + }, + { + "epoch": 3.579743888242142, + "grad_norm": 0.4853357672691345, + "learning_rate": 9.924881845498184e-05, + "loss": 0.0338, + "step": 6150 + }, + { + "epoch": 3.5855646100116414, + "grad_norm": 0.41613292694091797, + "learning_rate": 9.924405203590705e-05, + "loss": 0.037, + "step": 6160 + }, + { + "epoch": 3.591385331781141, + "grad_norm": 0.33463701605796814, + "learning_rate": 9.923927065778946e-05, + "loss": 0.0335, + "step": 6170 + }, + { + "epoch": 3.5972060535506403, + "grad_norm": 0.2552605867385864, + "learning_rate": 9.923447432208154e-05, + "loss": 0.0366, + "step": 6180 + }, + { + "epoch": 3.6030267753201395, + "grad_norm": 0.3971342444419861, + "learning_rate": 9.922966303024027e-05, + "loss": 0.0335, + "step": 6190 + }, + { + "epoch": 3.608847497089639, + "grad_norm": 0.24646179378032684, + "learning_rate": 9.922483678372721e-05, + "loss": 0.0391, + "step": 6200 + }, + { + "epoch": 3.614668218859139, + "grad_norm": 0.31631094217300415, + "learning_rate": 9.921999558400845e-05, + "loss": 0.035, + "step": 6210 + }, + { + "epoch": 3.620488940628638, + "grad_norm": 0.24963916838169098, + "learning_rate": 9.92151394325546e-05, + "loss": 0.0497, + "step": 6220 + }, + { + "epoch": 3.6263096623981372, + "grad_norm": 0.31648048758506775, + "learning_rate": 9.921026833084084e-05, + "loss": 0.0387, + "step": 6230 + }, + { + "epoch": 3.632130384167637, + "grad_norm": 0.4078068137168884, + "learning_rate": 9.920538228034689e-05, + "loss": 0.0331, + "step": 6240 + }, + { + "epoch": 3.637951105937136, + "grad_norm": 0.3072776198387146, + "learning_rate": 9.920048128255699e-05, + "loss": 0.0366, + "step": 6250 + }, + { + "epoch": 3.6437718277066358, + "grad_norm": 0.4248827397823334, + "learning_rate": 9.919556533895995e-05, + "loss": 0.0411, + "step": 6260 + }, + { + "epoch": 3.649592549476135, + "grad_norm": 0.27224981784820557, + "learning_rate": 9.919063445104907e-05, + "loss": 0.0393, + "step": 6270 + }, + { + "epoch": 3.655413271245634, + "grad_norm": 0.3928719460964203, + "learning_rate": 9.918568862032227e-05, + "loss": 0.0351, + "step": 6280 + }, + { + "epoch": 3.661233993015134, + "grad_norm": 0.44982922077178955, + "learning_rate": 9.918072784828194e-05, + "loss": 0.0295, + "step": 6290 + }, + { + "epoch": 3.6670547147846335, + "grad_norm": 0.32131168246269226, + "learning_rate": 9.917575213643501e-05, + "loss": 0.035, + "step": 6300 + }, + { + "epoch": 3.6728754365541327, + "grad_norm": 0.27630677819252014, + "learning_rate": 9.917076148629302e-05, + "loss": 0.0297, + "step": 6310 + }, + { + "epoch": 3.678696158323632, + "grad_norm": 0.2875361442565918, + "learning_rate": 9.916575589937196e-05, + "loss": 0.0351, + "step": 6320 + }, + { + "epoch": 3.6845168800931316, + "grad_norm": 0.3990020453929901, + "learning_rate": 9.916073537719239e-05, + "loss": 0.0361, + "step": 6330 + }, + { + "epoch": 3.690337601862631, + "grad_norm": 0.33642569184303284, + "learning_rate": 9.915569992127944e-05, + "loss": 0.0331, + "step": 6340 + }, + { + "epoch": 3.6961583236321305, + "grad_norm": 0.35597261786460876, + "learning_rate": 9.915064953316273e-05, + "loss": 0.0307, + "step": 6350 + }, + { + "epoch": 3.7019790454016297, + "grad_norm": 0.33990809321403503, + "learning_rate": 9.914558421437645e-05, + "loss": 0.0311, + "step": 6360 + }, + { + "epoch": 3.7077997671711294, + "grad_norm": 0.34190526604652405, + "learning_rate": 9.914050396645929e-05, + "loss": 0.0312, + "step": 6370 + }, + { + "epoch": 3.7136204889406286, + "grad_norm": 0.3270927965641022, + "learning_rate": 9.913540879095452e-05, + "loss": 0.0361, + "step": 6380 + }, + { + "epoch": 3.7194412107101282, + "grad_norm": 0.34668219089508057, + "learning_rate": 9.913029868940987e-05, + "loss": 0.041, + "step": 6390 + }, + { + "epoch": 3.7252619324796274, + "grad_norm": 0.33635997772216797, + "learning_rate": 9.912517366337772e-05, + "loss": 0.0332, + "step": 6400 + }, + { + "epoch": 3.7310826542491267, + "grad_norm": 0.242023304104805, + "learning_rate": 9.912003371441487e-05, + "loss": 0.0362, + "step": 6410 + }, + { + "epoch": 3.7369033760186263, + "grad_norm": 0.5687283277511597, + "learning_rate": 9.911487884408271e-05, + "loss": 0.0334, + "step": 6420 + }, + { + "epoch": 3.742724097788126, + "grad_norm": 0.1781076043844223, + "learning_rate": 9.910970905394719e-05, + "loss": 0.0332, + "step": 6430 + }, + { + "epoch": 3.748544819557625, + "grad_norm": 0.26690176129341125, + "learning_rate": 9.91045243455787e-05, + "loss": 0.0278, + "step": 6440 + }, + { + "epoch": 3.7543655413271244, + "grad_norm": 0.2202785164117813, + "learning_rate": 9.909932472055225e-05, + "loss": 0.0349, + "step": 6450 + }, + { + "epoch": 3.760186263096624, + "grad_norm": 0.26391735672950745, + "learning_rate": 9.909411018044734e-05, + "loss": 0.025, + "step": 6460 + }, + { + "epoch": 3.7660069848661233, + "grad_norm": 0.39851659536361694, + "learning_rate": 9.908888072684802e-05, + "loss": 0.0318, + "step": 6470 + }, + { + "epoch": 3.771827706635623, + "grad_norm": 0.27641424536705017, + "learning_rate": 9.908363636134285e-05, + "loss": 0.0247, + "step": 6480 + }, + { + "epoch": 3.777648428405122, + "grad_norm": 0.3374848961830139, + "learning_rate": 9.907837708552493e-05, + "loss": 0.0255, + "step": 6490 + }, + { + "epoch": 3.7834691501746214, + "grad_norm": 0.3504069745540619, + "learning_rate": 9.90731029009919e-05, + "loss": 0.0328, + "step": 6500 + }, + { + "epoch": 3.789289871944121, + "grad_norm": 0.5741119980812073, + "learning_rate": 9.906781380934589e-05, + "loss": 0.0454, + "step": 6510 + }, + { + "epoch": 3.7951105937136207, + "grad_norm": 0.315986692905426, + "learning_rate": 9.906250981219362e-05, + "loss": 0.0391, + "step": 6520 + }, + { + "epoch": 3.80093131548312, + "grad_norm": 0.32467150688171387, + "learning_rate": 9.905719091114628e-05, + "loss": 0.0274, + "step": 6530 + }, + { + "epoch": 3.806752037252619, + "grad_norm": 0.35320356488227844, + "learning_rate": 9.905185710781964e-05, + "loss": 0.0418, + "step": 6540 + }, + { + "epoch": 3.812572759022119, + "grad_norm": 0.23943130671977997, + "learning_rate": 9.904650840383392e-05, + "loss": 0.0332, + "step": 6550 + }, + { + "epoch": 3.818393480791618, + "grad_norm": 0.24390850961208344, + "learning_rate": 9.904114480081397e-05, + "loss": 0.0256, + "step": 6560 + }, + { + "epoch": 3.8242142025611177, + "grad_norm": 0.2909289002418518, + "learning_rate": 9.903576630038906e-05, + "loss": 0.0392, + "step": 6570 + }, + { + "epoch": 3.830034924330617, + "grad_norm": 0.22450710833072662, + "learning_rate": 9.903037290419309e-05, + "loss": 0.032, + "step": 6580 + }, + { + "epoch": 3.8358556461001165, + "grad_norm": 0.4196394383907318, + "learning_rate": 9.902496461386439e-05, + "loss": 0.0277, + "step": 6590 + }, + { + "epoch": 3.8416763678696157, + "grad_norm": 0.4791473150253296, + "learning_rate": 9.901954143104588e-05, + "loss": 0.0329, + "step": 6600 + }, + { + "epoch": 3.8474970896391154, + "grad_norm": 0.3372289836406708, + "learning_rate": 9.901410335738496e-05, + "loss": 0.0252, + "step": 6610 + }, + { + "epoch": 3.8533178114086146, + "grad_norm": 0.2633388042449951, + "learning_rate": 9.900865039453358e-05, + "loss": 0.0399, + "step": 6620 + }, + { + "epoch": 3.859138533178114, + "grad_norm": 0.3194822669029236, + "learning_rate": 9.900318254414821e-05, + "loss": 0.0328, + "step": 6630 + }, + { + "epoch": 3.8649592549476135, + "grad_norm": 0.31285813450813293, + "learning_rate": 9.899769980788985e-05, + "loss": 0.0273, + "step": 6640 + }, + { + "epoch": 3.870779976717113, + "grad_norm": 0.3318024277687073, + "learning_rate": 9.899220218742398e-05, + "loss": 0.03, + "step": 6650 + }, + { + "epoch": 3.8766006984866124, + "grad_norm": 0.44767144322395325, + "learning_rate": 9.898668968442066e-05, + "loss": 0.0443, + "step": 6660 + }, + { + "epoch": 3.8824214202561116, + "grad_norm": 0.2584259510040283, + "learning_rate": 9.898116230055443e-05, + "loss": 0.035, + "step": 6670 + }, + { + "epoch": 3.8882421420256112, + "grad_norm": 0.26104825735092163, + "learning_rate": 9.897562003750437e-05, + "loss": 0.0297, + "step": 6680 + }, + { + "epoch": 3.8940628637951105, + "grad_norm": 0.2951517701148987, + "learning_rate": 9.897006289695407e-05, + "loss": 0.036, + "step": 6690 + }, + { + "epoch": 3.89988358556461, + "grad_norm": 0.19088056683540344, + "learning_rate": 9.896449088059164e-05, + "loss": 0.0383, + "step": 6700 + }, + { + "epoch": 3.9057043073341093, + "grad_norm": 0.29061159491539, + "learning_rate": 9.89589039901097e-05, + "loss": 0.0342, + "step": 6710 + }, + { + "epoch": 3.911525029103609, + "grad_norm": 0.45604458451271057, + "learning_rate": 9.895330222720542e-05, + "loss": 0.0324, + "step": 6720 + }, + { + "epoch": 3.917345750873108, + "grad_norm": 0.26996859908103943, + "learning_rate": 9.894768559358047e-05, + "loss": 0.0433, + "step": 6730 + }, + { + "epoch": 3.923166472642608, + "grad_norm": 0.310240775346756, + "learning_rate": 9.894205409094101e-05, + "loss": 0.0416, + "step": 6740 + }, + { + "epoch": 3.928987194412107, + "grad_norm": 0.24375246465206146, + "learning_rate": 9.893640772099777e-05, + "loss": 0.03, + "step": 6750 + }, + { + "epoch": 3.9348079161816063, + "grad_norm": 0.32256248593330383, + "learning_rate": 9.893074648546595e-05, + "loss": 0.0457, + "step": 6760 + }, + { + "epoch": 3.940628637951106, + "grad_norm": 0.4482342302799225, + "learning_rate": 9.892507038606528e-05, + "loss": 0.043, + "step": 6770 + }, + { + "epoch": 3.9464493597206056, + "grad_norm": 0.40255895256996155, + "learning_rate": 9.891937942452003e-05, + "loss": 0.0364, + "step": 6780 + }, + { + "epoch": 3.952270081490105, + "grad_norm": 0.34155017137527466, + "learning_rate": 9.891367360255895e-05, + "loss": 0.0332, + "step": 6790 + }, + { + "epoch": 3.958090803259604, + "grad_norm": 0.5011407732963562, + "learning_rate": 9.890795292191532e-05, + "loss": 0.0485, + "step": 6800 + }, + { + "epoch": 3.9639115250291037, + "grad_norm": 0.35580557584762573, + "learning_rate": 9.890221738432694e-05, + "loss": 0.0391, + "step": 6810 + }, + { + "epoch": 3.969732246798603, + "grad_norm": 0.32879164814949036, + "learning_rate": 9.88964669915361e-05, + "loss": 0.032, + "step": 6820 + }, + { + "epoch": 3.9755529685681026, + "grad_norm": 0.47585371136665344, + "learning_rate": 9.889070174528963e-05, + "loss": 0.0409, + "step": 6830 + }, + { + "epoch": 3.981373690337602, + "grad_norm": 0.2513808310031891, + "learning_rate": 9.888492164733883e-05, + "loss": 0.0282, + "step": 6840 + }, + { + "epoch": 3.987194412107101, + "grad_norm": 0.28978461027145386, + "learning_rate": 9.88791266994396e-05, + "loss": 0.0335, + "step": 6850 + }, + { + "epoch": 3.9930151338766007, + "grad_norm": 0.21137544512748718, + "learning_rate": 9.887331690335223e-05, + "loss": 0.0289, + "step": 6860 + }, + { + "epoch": 3.9988358556461003, + "grad_norm": 0.23081302642822266, + "learning_rate": 9.886749226084163e-05, + "loss": 0.0272, + "step": 6870 + }, + { + "epoch": 4.0046565774155995, + "grad_norm": 0.2810524106025696, + "learning_rate": 9.886165277367714e-05, + "loss": 0.0439, + "step": 6880 + }, + { + "epoch": 4.010477299185099, + "grad_norm": 0.3914248049259186, + "learning_rate": 9.885579844363265e-05, + "loss": 0.0297, + "step": 6890 + }, + { + "epoch": 4.016298020954598, + "grad_norm": 0.4860977828502655, + "learning_rate": 9.884992927248656e-05, + "loss": 0.0331, + "step": 6900 + }, + { + "epoch": 4.022118742724098, + "grad_norm": 0.2981434166431427, + "learning_rate": 9.884404526202178e-05, + "loss": 0.0357, + "step": 6910 + }, + { + "epoch": 4.027939464493597, + "grad_norm": 0.2977084815502167, + "learning_rate": 9.883814641402568e-05, + "loss": 0.038, + "step": 6920 + }, + { + "epoch": 4.0337601862630965, + "grad_norm": 0.36415451765060425, + "learning_rate": 9.88322327302902e-05, + "loss": 0.0331, + "step": 6930 + }, + { + "epoch": 4.039580908032596, + "grad_norm": 0.28962454199790955, + "learning_rate": 9.882630421261176e-05, + "loss": 0.0275, + "step": 6940 + }, + { + "epoch": 4.045401629802096, + "grad_norm": 0.2568827271461487, + "learning_rate": 9.88203608627913e-05, + "loss": 0.0362, + "step": 6950 + }, + { + "epoch": 4.051222351571595, + "grad_norm": 0.41539111733436584, + "learning_rate": 9.881440268263422e-05, + "loss": 0.0353, + "step": 6960 + }, + { + "epoch": 4.057043073341094, + "grad_norm": 0.24688757956027985, + "learning_rate": 9.880842967395048e-05, + "loss": 0.0356, + "step": 6970 + }, + { + "epoch": 4.0628637951105935, + "grad_norm": 0.27847328782081604, + "learning_rate": 9.880244183855452e-05, + "loss": 0.0346, + "step": 6980 + }, + { + "epoch": 4.068684516880094, + "grad_norm": 0.3261456787586212, + "learning_rate": 9.879643917826527e-05, + "loss": 0.0438, + "step": 6990 + }, + { + "epoch": 4.074505238649593, + "grad_norm": 0.3721921443939209, + "learning_rate": 9.87904216949062e-05, + "loss": 0.0336, + "step": 7000 + }, + { + "epoch": 4.080325960419092, + "grad_norm": 0.38280585408210754, + "learning_rate": 9.878438939030526e-05, + "loss": 0.0394, + "step": 7010 + }, + { + "epoch": 4.086146682188591, + "grad_norm": 0.30630847811698914, + "learning_rate": 9.877834226629489e-05, + "loss": 0.0323, + "step": 7020 + }, + { + "epoch": 4.09196740395809, + "grad_norm": 0.37623828649520874, + "learning_rate": 9.877228032471206e-05, + "loss": 0.032, + "step": 7030 + }, + { + "epoch": 4.0977881257275905, + "grad_norm": 0.3254849314689636, + "learning_rate": 9.876620356739823e-05, + "loss": 0.0308, + "step": 7040 + }, + { + "epoch": 4.10360884749709, + "grad_norm": 0.2608814835548401, + "learning_rate": 9.876011199619935e-05, + "loss": 0.0326, + "step": 7050 + }, + { + "epoch": 4.109429569266589, + "grad_norm": 0.26627814769744873, + "learning_rate": 9.875400561296589e-05, + "loss": 0.0333, + "step": 7060 + }, + { + "epoch": 4.115250291036088, + "grad_norm": 0.3432648777961731, + "learning_rate": 9.874788441955278e-05, + "loss": 0.037, + "step": 7070 + }, + { + "epoch": 4.121071012805588, + "grad_norm": 0.41905730962753296, + "learning_rate": 9.874174841781951e-05, + "loss": 0.0376, + "step": 7080 + }, + { + "epoch": 4.1268917345750875, + "grad_norm": 0.3737552762031555, + "learning_rate": 9.873559760963003e-05, + "loss": 0.0363, + "step": 7090 + }, + { + "epoch": 4.132712456344587, + "grad_norm": 0.36257293820381165, + "learning_rate": 9.872943199685278e-05, + "loss": 0.035, + "step": 7100 + }, + { + "epoch": 4.138533178114086, + "grad_norm": 0.3002929091453552, + "learning_rate": 9.872325158136071e-05, + "loss": 0.0362, + "step": 7110 + }, + { + "epoch": 4.144353899883585, + "grad_norm": 0.48544037342071533, + "learning_rate": 9.871705636503128e-05, + "loss": 0.0357, + "step": 7120 + }, + { + "epoch": 4.150174621653085, + "grad_norm": 0.2564215064048767, + "learning_rate": 9.871084634974641e-05, + "loss": 0.0422, + "step": 7130 + }, + { + "epoch": 4.1559953434225845, + "grad_norm": 0.43296030163764954, + "learning_rate": 9.870462153739257e-05, + "loss": 0.045, + "step": 7140 + }, + { + "epoch": 4.161816065192084, + "grad_norm": 0.39923781156539917, + "learning_rate": 9.869838192986067e-05, + "loss": 0.0339, + "step": 7150 + }, + { + "epoch": 4.167636786961583, + "grad_norm": 0.35720279812812805, + "learning_rate": 9.869212752904616e-05, + "loss": 0.0382, + "step": 7160 + }, + { + "epoch": 4.173457508731083, + "grad_norm": 0.2416677325963974, + "learning_rate": 9.868585833684894e-05, + "loss": 0.0337, + "step": 7170 + }, + { + "epoch": 4.179278230500582, + "grad_norm": 0.34798213839530945, + "learning_rate": 9.867957435517342e-05, + "loss": 0.0324, + "step": 7180 + }, + { + "epoch": 4.185098952270081, + "grad_norm": 0.30287230014801025, + "learning_rate": 9.867327558592854e-05, + "loss": 0.0351, + "step": 7190 + }, + { + "epoch": 4.190919674039581, + "grad_norm": 0.2968210279941559, + "learning_rate": 9.866696203102766e-05, + "loss": 0.0431, + "step": 7200 + }, + { + "epoch": 4.19674039580908, + "grad_norm": 0.27825668454170227, + "learning_rate": 9.86606336923887e-05, + "loss": 0.0282, + "step": 7210 + }, + { + "epoch": 4.20256111757858, + "grad_norm": 0.24664129316806793, + "learning_rate": 9.865429057193403e-05, + "loss": 0.0299, + "step": 7220 + }, + { + "epoch": 4.208381839348079, + "grad_norm": 0.2862345278263092, + "learning_rate": 9.864793267159053e-05, + "loss": 0.0343, + "step": 7230 + }, + { + "epoch": 4.214202561117578, + "grad_norm": 0.37116143107414246, + "learning_rate": 9.864155999328957e-05, + "loss": 0.0316, + "step": 7240 + }, + { + "epoch": 4.220023282887078, + "grad_norm": 0.3148447573184967, + "learning_rate": 9.8635172538967e-05, + "loss": 0.0356, + "step": 7250 + }, + { + "epoch": 4.225844004656578, + "grad_norm": 0.32253962755203247, + "learning_rate": 9.862877031056312e-05, + "loss": 0.0311, + "step": 7260 + }, + { + "epoch": 4.231664726426077, + "grad_norm": 0.20730654895305634, + "learning_rate": 9.862235331002279e-05, + "loss": 0.0403, + "step": 7270 + }, + { + "epoch": 4.237485448195576, + "grad_norm": 0.42813384532928467, + "learning_rate": 9.861592153929533e-05, + "loss": 0.0306, + "step": 7280 + }, + { + "epoch": 4.243306169965075, + "grad_norm": 0.24713492393493652, + "learning_rate": 9.860947500033455e-05, + "loss": 0.0297, + "step": 7290 + }, + { + "epoch": 4.2491268917345755, + "grad_norm": 0.259696364402771, + "learning_rate": 9.86030136950987e-05, + "loss": 0.0352, + "step": 7300 + }, + { + "epoch": 4.254947613504075, + "grad_norm": 0.26994651556015015, + "learning_rate": 9.85965376255506e-05, + "loss": 0.0246, + "step": 7310 + }, + { + "epoch": 4.260768335273574, + "grad_norm": 0.24389442801475525, + "learning_rate": 9.859004679365747e-05, + "loss": 0.0279, + "step": 7320 + }, + { + "epoch": 4.266589057043073, + "grad_norm": 0.2777242064476013, + "learning_rate": 9.858354120139108e-05, + "loss": 0.0306, + "step": 7330 + }, + { + "epoch": 4.272409778812573, + "grad_norm": 0.3286935091018677, + "learning_rate": 9.857702085072764e-05, + "loss": 0.0354, + "step": 7340 + }, + { + "epoch": 4.278230500582072, + "grad_norm": 0.3922005295753479, + "learning_rate": 9.857048574364787e-05, + "loss": 0.0326, + "step": 7350 + }, + { + "epoch": 4.284051222351572, + "grad_norm": 0.266549676656723, + "learning_rate": 9.856393588213698e-05, + "loss": 0.0376, + "step": 7360 + }, + { + "epoch": 4.289871944121071, + "grad_norm": 0.4101126790046692, + "learning_rate": 9.855737126818458e-05, + "loss": 0.0368, + "step": 7370 + }, + { + "epoch": 4.29569266589057, + "grad_norm": 0.26293471455574036, + "learning_rate": 9.855079190378491e-05, + "loss": 0.0341, + "step": 7380 + }, + { + "epoch": 4.30151338766007, + "grad_norm": 0.30972710251808167, + "learning_rate": 9.854419779093655e-05, + "loss": 0.0292, + "step": 7390 + }, + { + "epoch": 4.307334109429569, + "grad_norm": 0.39483124017715454, + "learning_rate": 9.853758893164264e-05, + "loss": 0.03, + "step": 7400 + }, + { + "epoch": 4.313154831199069, + "grad_norm": 0.31104040145874023, + "learning_rate": 9.853096532791078e-05, + "loss": 0.0288, + "step": 7410 + }, + { + "epoch": 4.318975552968568, + "grad_norm": 0.27059441804885864, + "learning_rate": 9.852432698175304e-05, + "loss": 0.0393, + "step": 7420 + }, + { + "epoch": 4.324796274738068, + "grad_norm": 0.328520804643631, + "learning_rate": 9.851767389518597e-05, + "loss": 0.0412, + "step": 7430 + }, + { + "epoch": 4.330616996507567, + "grad_norm": 0.28422829508781433, + "learning_rate": 9.85110060702306e-05, + "loss": 0.0334, + "step": 7440 + }, + { + "epoch": 4.336437718277066, + "grad_norm": 0.29639628529548645, + "learning_rate": 9.850432350891245e-05, + "loss": 0.0309, + "step": 7450 + }, + { + "epoch": 4.342258440046566, + "grad_norm": 0.29426971077919006, + "learning_rate": 9.84976262132615e-05, + "loss": 0.0265, + "step": 7460 + }, + { + "epoch": 4.348079161816065, + "grad_norm": 0.3002302944660187, + "learning_rate": 9.849091418531222e-05, + "loss": 0.0391, + "step": 7470 + }, + { + "epoch": 4.353899883585565, + "grad_norm": 0.44496846199035645, + "learning_rate": 9.848418742710353e-05, + "loss": 0.0402, + "step": 7480 + }, + { + "epoch": 4.359720605355064, + "grad_norm": 0.2934856116771698, + "learning_rate": 9.847744594067885e-05, + "loss": 0.0328, + "step": 7490 + }, + { + "epoch": 4.365541327124563, + "grad_norm": 0.42463046312332153, + "learning_rate": 9.847068972808607e-05, + "loss": 0.0369, + "step": 7500 + }, + { + "epoch": 4.3713620488940625, + "grad_norm": 0.3948734402656555, + "learning_rate": 9.846391879137756e-05, + "loss": 0.0262, + "step": 7510 + }, + { + "epoch": 4.377182770663563, + "grad_norm": 0.29425516724586487, + "learning_rate": 9.845713313261012e-05, + "loss": 0.0356, + "step": 7520 + }, + { + "epoch": 4.383003492433062, + "grad_norm": 0.45764410495758057, + "learning_rate": 9.845033275384505e-05, + "loss": 0.0457, + "step": 7530 + }, + { + "epoch": 4.388824214202561, + "grad_norm": 0.2851108908653259, + "learning_rate": 9.844351765714818e-05, + "loss": 0.0362, + "step": 7540 + }, + { + "epoch": 4.39464493597206, + "grad_norm": 0.2464396059513092, + "learning_rate": 9.843668784458971e-05, + "loss": 0.0411, + "step": 7550 + }, + { + "epoch": 4.4004656577415595, + "grad_norm": 0.44425472617149353, + "learning_rate": 9.842984331824437e-05, + "loss": 0.0304, + "step": 7560 + }, + { + "epoch": 4.40628637951106, + "grad_norm": 0.2536200284957886, + "learning_rate": 9.842298408019133e-05, + "loss": 0.0248, + "step": 7570 + }, + { + "epoch": 4.412107101280559, + "grad_norm": 0.26575830578804016, + "learning_rate": 9.841611013251429e-05, + "loss": 0.037, + "step": 7580 + }, + { + "epoch": 4.417927823050058, + "grad_norm": 0.38499611616134644, + "learning_rate": 9.840922147730133e-05, + "loss": 0.036, + "step": 7590 + }, + { + "epoch": 4.423748544819557, + "grad_norm": 0.37018486857414246, + "learning_rate": 9.840231811664506e-05, + "loss": 0.0364, + "step": 7600 + }, + { + "epoch": 4.429569266589057, + "grad_norm": 0.45704662799835205, + "learning_rate": 9.839540005264252e-05, + "loss": 0.0345, + "step": 7610 + }, + { + "epoch": 4.435389988358557, + "grad_norm": 0.23781853914260864, + "learning_rate": 9.838846728739527e-05, + "loss": 0.0251, + "step": 7620 + }, + { + "epoch": 4.441210710128056, + "grad_norm": 0.2855353057384491, + "learning_rate": 9.838151982300927e-05, + "loss": 0.0396, + "step": 7630 + }, + { + "epoch": 4.447031431897555, + "grad_norm": 0.37394949793815613, + "learning_rate": 9.8374557661595e-05, + "loss": 0.0321, + "step": 7640 + }, + { + "epoch": 4.452852153667055, + "grad_norm": 0.2460787296295166, + "learning_rate": 9.836758080526735e-05, + "loss": 0.0343, + "step": 7650 + }, + { + "epoch": 4.458672875436554, + "grad_norm": 0.37795597314834595, + "learning_rate": 9.836058925614575e-05, + "loss": 0.032, + "step": 7660 + }, + { + "epoch": 4.4644935972060535, + "grad_norm": 0.25688549876213074, + "learning_rate": 9.8353583016354e-05, + "loss": 0.0304, + "step": 7670 + }, + { + "epoch": 4.470314318975553, + "grad_norm": 0.2825370728969574, + "learning_rate": 9.834656208802044e-05, + "loss": 0.0341, + "step": 7680 + }, + { + "epoch": 4.476135040745052, + "grad_norm": 0.4267488420009613, + "learning_rate": 9.833952647327784e-05, + "loss": 0.0318, + "step": 7690 + }, + { + "epoch": 4.481955762514552, + "grad_norm": 0.31655097007751465, + "learning_rate": 9.833247617426342e-05, + "loss": 0.0277, + "step": 7700 + }, + { + "epoch": 4.487776484284051, + "grad_norm": 0.41885697841644287, + "learning_rate": 9.832541119311889e-05, + "loss": 0.0294, + "step": 7710 + }, + { + "epoch": 4.4935972060535505, + "grad_norm": 0.30107560753822327, + "learning_rate": 9.83183315319904e-05, + "loss": 0.0417, + "step": 7720 + }, + { + "epoch": 4.49941792782305, + "grad_norm": 0.33157530426979065, + "learning_rate": 9.831123719302855e-05, + "loss": 0.0362, + "step": 7730 + }, + { + "epoch": 4.50523864959255, + "grad_norm": 0.2936960458755493, + "learning_rate": 9.830412817838842e-05, + "loss": 0.0325, + "step": 7740 + }, + { + "epoch": 4.511059371362049, + "grad_norm": 0.18463480472564697, + "learning_rate": 9.829700449022956e-05, + "loss": 0.0352, + "step": 7750 + }, + { + "epoch": 4.516880093131548, + "grad_norm": 0.2543729543685913, + "learning_rate": 9.828986613071593e-05, + "loss": 0.0293, + "step": 7760 + }, + { + "epoch": 4.5227008149010475, + "grad_norm": 0.4292618930339813, + "learning_rate": 9.828271310201601e-05, + "loss": 0.0284, + "step": 7770 + }, + { + "epoch": 4.528521536670548, + "grad_norm": 0.26980265974998474, + "learning_rate": 9.827554540630268e-05, + "loss": 0.0269, + "step": 7780 + }, + { + "epoch": 4.534342258440047, + "grad_norm": 0.303373783826828, + "learning_rate": 9.826836304575329e-05, + "loss": 0.0414, + "step": 7790 + }, + { + "epoch": 4.540162980209546, + "grad_norm": 0.3486476242542267, + "learning_rate": 9.826116602254966e-05, + "loss": 0.0276, + "step": 7800 + }, + { + "epoch": 4.545983701979045, + "grad_norm": 0.2687126100063324, + "learning_rate": 9.825395433887805e-05, + "loss": 0.0323, + "step": 7810 + }, + { + "epoch": 4.551804423748544, + "grad_norm": 0.2447064071893692, + "learning_rate": 9.824672799692917e-05, + "loss": 0.0307, + "step": 7820 + }, + { + "epoch": 4.5576251455180445, + "grad_norm": 0.3821541965007782, + "learning_rate": 9.823948699889823e-05, + "loss": 0.0388, + "step": 7830 + }, + { + "epoch": 4.563445867287544, + "grad_norm": 0.3648925721645355, + "learning_rate": 9.823223134698483e-05, + "loss": 0.0283, + "step": 7840 + }, + { + "epoch": 4.569266589057043, + "grad_norm": 0.23836758732795715, + "learning_rate": 9.822496104339303e-05, + "loss": 0.0309, + "step": 7850 + }, + { + "epoch": 4.575087310826542, + "grad_norm": 0.25998878479003906, + "learning_rate": 9.821767609033138e-05, + "loss": 0.0308, + "step": 7860 + }, + { + "epoch": 4.580908032596042, + "grad_norm": 0.28108271956443787, + "learning_rate": 9.821037649001284e-05, + "loss": 0.0344, + "step": 7870 + }, + { + "epoch": 4.5867287543655415, + "grad_norm": 0.3239397704601288, + "learning_rate": 9.820306224465486e-05, + "loss": 0.0338, + "step": 7880 + }, + { + "epoch": 4.592549476135041, + "grad_norm": 0.2554384469985962, + "learning_rate": 9.819573335647928e-05, + "loss": 0.0344, + "step": 7890 + }, + { + "epoch": 4.59837019790454, + "grad_norm": 0.3141991198062897, + "learning_rate": 9.818838982771246e-05, + "loss": 0.0393, + "step": 7900 + }, + { + "epoch": 4.604190919674039, + "grad_norm": 0.3819063901901245, + "learning_rate": 9.818103166058514e-05, + "loss": 0.0321, + "step": 7910 + }, + { + "epoch": 4.610011641443539, + "grad_norm": 0.29224616289138794, + "learning_rate": 9.817365885733254e-05, + "loss": 0.0287, + "step": 7920 + }, + { + "epoch": 4.6158323632130385, + "grad_norm": 0.288590669631958, + "learning_rate": 9.816627142019434e-05, + "loss": 0.0323, + "step": 7930 + }, + { + "epoch": 4.621653084982538, + "grad_norm": 0.2057899832725525, + "learning_rate": 9.815886935141463e-05, + "loss": 0.0392, + "step": 7940 + }, + { + "epoch": 4.627473806752037, + "grad_norm": 0.29842716455459595, + "learning_rate": 9.8151452653242e-05, + "loss": 0.0323, + "step": 7950 + }, + { + "epoch": 4.633294528521537, + "grad_norm": 0.26944732666015625, + "learning_rate": 9.814402132792939e-05, + "loss": 0.0315, + "step": 7960 + }, + { + "epoch": 4.639115250291036, + "grad_norm": 0.2836814224720001, + "learning_rate": 9.813657537773428e-05, + "loss": 0.0314, + "step": 7970 + }, + { + "epoch": 4.644935972060535, + "grad_norm": 0.3187209665775299, + "learning_rate": 9.812911480491854e-05, + "loss": 0.0303, + "step": 7980 + }, + { + "epoch": 4.650756693830035, + "grad_norm": 0.36353638768196106, + "learning_rate": 9.81216396117485e-05, + "loss": 0.0354, + "step": 7990 + }, + { + "epoch": 4.656577415599534, + "grad_norm": 0.30408570170402527, + "learning_rate": 9.811414980049491e-05, + "loss": 0.0355, + "step": 8000 + }, + { + "epoch": 4.662398137369034, + "grad_norm": 0.2374667525291443, + "learning_rate": 9.810664537343301e-05, + "loss": 0.0289, + "step": 8010 + }, + { + "epoch": 4.668218859138533, + "grad_norm": 0.26145830750465393, + "learning_rate": 9.809912633284243e-05, + "loss": 0.029, + "step": 8020 + }, + { + "epoch": 4.674039580908032, + "grad_norm": 0.3212907910346985, + "learning_rate": 9.809159268100725e-05, + "loss": 0.0312, + "step": 8030 + }, + { + "epoch": 4.6798603026775325, + "grad_norm": 0.22052724659442902, + "learning_rate": 9.808404442021599e-05, + "loss": 0.0302, + "step": 8040 + }, + { + "epoch": 4.685681024447032, + "grad_norm": 0.3555351495742798, + "learning_rate": 9.807648155276163e-05, + "loss": 0.0337, + "step": 8050 + }, + { + "epoch": 4.691501746216531, + "grad_norm": 0.30418241024017334, + "learning_rate": 9.806890408094156e-05, + "loss": 0.0345, + "step": 8060 + }, + { + "epoch": 4.69732246798603, + "grad_norm": 0.27220186591148376, + "learning_rate": 9.806131200705761e-05, + "loss": 0.0261, + "step": 8070 + }, + { + "epoch": 4.703143189755529, + "grad_norm": 0.2119966447353363, + "learning_rate": 9.805370533341605e-05, + "loss": 0.0347, + "step": 8080 + }, + { + "epoch": 4.7089639115250295, + "grad_norm": 0.33282145857810974, + "learning_rate": 9.804608406232762e-05, + "loss": 0.0243, + "step": 8090 + }, + { + "epoch": 4.714784633294529, + "grad_norm": 0.3547261655330658, + "learning_rate": 9.803844819610741e-05, + "loss": 0.0309, + "step": 8100 + }, + { + "epoch": 4.720605355064028, + "grad_norm": 0.22260618209838867, + "learning_rate": 9.803079773707504e-05, + "loss": 0.0358, + "step": 8110 + }, + { + "epoch": 4.726426076833527, + "grad_norm": 0.3806505799293518, + "learning_rate": 9.802313268755447e-05, + "loss": 0.04, + "step": 8120 + }, + { + "epoch": 4.732246798603027, + "grad_norm": 0.3544078469276428, + "learning_rate": 9.801545304987419e-05, + "loss": 0.0333, + "step": 8130 + }, + { + "epoch": 4.738067520372526, + "grad_norm": 0.3856416940689087, + "learning_rate": 9.800775882636704e-05, + "loss": 0.0316, + "step": 8140 + }, + { + "epoch": 4.743888242142026, + "grad_norm": 0.26278337836265564, + "learning_rate": 9.800005001937034e-05, + "loss": 0.0247, + "step": 8150 + }, + { + "epoch": 4.749708963911525, + "grad_norm": 0.4771169424057007, + "learning_rate": 9.79923266312258e-05, + "loss": 0.0307, + "step": 8160 + }, + { + "epoch": 4.755529685681024, + "grad_norm": 0.2936297357082367, + "learning_rate": 9.79845886642796e-05, + "loss": 0.0302, + "step": 8170 + }, + { + "epoch": 4.761350407450524, + "grad_norm": 0.30127644538879395, + "learning_rate": 9.797683612088233e-05, + "loss": 0.0302, + "step": 8180 + }, + { + "epoch": 4.767171129220023, + "grad_norm": 0.2574884593486786, + "learning_rate": 9.796906900338898e-05, + "loss": 0.0336, + "step": 8190 + }, + { + "epoch": 4.772991850989523, + "grad_norm": 0.46757885813713074, + "learning_rate": 9.796128731415903e-05, + "loss": 0.0341, + "step": 8200 + }, + { + "epoch": 4.778812572759022, + "grad_norm": 0.30725181102752686, + "learning_rate": 9.795349105555634e-05, + "loss": 0.0303, + "step": 8210 + }, + { + "epoch": 4.784633294528522, + "grad_norm": 0.33236315846443176, + "learning_rate": 9.794568022994922e-05, + "loss": 0.0316, + "step": 8220 + }, + { + "epoch": 4.790454016298021, + "grad_norm": 0.29929009079933167, + "learning_rate": 9.793785483971034e-05, + "loss": 0.0283, + "step": 8230 + }, + { + "epoch": 4.79627473806752, + "grad_norm": 0.309983491897583, + "learning_rate": 9.793001488721691e-05, + "loss": 0.0293, + "step": 8240 + }, + { + "epoch": 4.80209545983702, + "grad_norm": 0.2916111350059509, + "learning_rate": 9.792216037485047e-05, + "loss": 0.0296, + "step": 8250 + }, + { + "epoch": 4.807916181606519, + "grad_norm": 0.33252206444740295, + "learning_rate": 9.791429130499704e-05, + "loss": 0.0348, + "step": 8260 + }, + { + "epoch": 4.813736903376019, + "grad_norm": 0.27045682072639465, + "learning_rate": 9.790640768004698e-05, + "loss": 0.0309, + "step": 8270 + }, + { + "epoch": 4.819557625145518, + "grad_norm": 0.35520318150520325, + "learning_rate": 9.789850950239518e-05, + "loss": 0.0343, + "step": 8280 + }, + { + "epoch": 4.825378346915017, + "grad_norm": 0.276334673166275, + "learning_rate": 9.789059677444089e-05, + "loss": 0.0313, + "step": 8290 + }, + { + "epoch": 4.8311990686845165, + "grad_norm": 0.26522016525268555, + "learning_rate": 9.788266949858776e-05, + "loss": 0.0268, + "step": 8300 + }, + { + "epoch": 4.837019790454017, + "grad_norm": 0.4540693759918213, + "learning_rate": 9.787472767724392e-05, + "loss": 0.0213, + "step": 8310 + }, + { + "epoch": 4.842840512223516, + "grad_norm": 0.27632442116737366, + "learning_rate": 9.786677131282185e-05, + "loss": 0.0288, + "step": 8320 + }, + { + "epoch": 4.848661233993015, + "grad_norm": 0.2582131028175354, + "learning_rate": 9.785880040773853e-05, + "loss": 0.0277, + "step": 8330 + }, + { + "epoch": 4.854481955762514, + "grad_norm": 0.46244606375694275, + "learning_rate": 9.785081496441527e-05, + "loss": 0.0356, + "step": 8340 + }, + { + "epoch": 4.8603026775320135, + "grad_norm": 0.26774895191192627, + "learning_rate": 9.784281498527785e-05, + "loss": 0.0336, + "step": 8350 + }, + { + "epoch": 4.866123399301514, + "grad_norm": 0.455896258354187, + "learning_rate": 9.783480047275646e-05, + "loss": 0.0422, + "step": 8360 + }, + { + "epoch": 4.871944121071013, + "grad_norm": 0.4861133396625519, + "learning_rate": 9.78267714292857e-05, + "loss": 0.0373, + "step": 8370 + }, + { + "epoch": 4.877764842840512, + "grad_norm": 0.38683825731277466, + "learning_rate": 9.781872785730454e-05, + "loss": 0.0389, + "step": 8380 + }, + { + "epoch": 4.883585564610011, + "grad_norm": 0.3777424097061157, + "learning_rate": 9.781066975925646e-05, + "loss": 0.0324, + "step": 8390 + }, + { + "epoch": 4.889406286379511, + "grad_norm": 0.2835695743560791, + "learning_rate": 9.780259713758928e-05, + "loss": 0.0304, + "step": 8400 + }, + { + "epoch": 4.895227008149011, + "grad_norm": 0.3169991374015808, + "learning_rate": 9.779450999475524e-05, + "loss": 0.0286, + "step": 8410 + }, + { + "epoch": 4.90104772991851, + "grad_norm": 0.33693087100982666, + "learning_rate": 9.7786408333211e-05, + "loss": 0.0261, + "step": 8420 + }, + { + "epoch": 4.906868451688009, + "grad_norm": 0.3619339168071747, + "learning_rate": 9.777829215541764e-05, + "loss": 0.0339, + "step": 8430 + }, + { + "epoch": 4.912689173457509, + "grad_norm": 0.21245186030864716, + "learning_rate": 9.777016146384064e-05, + "loss": 0.0263, + "step": 8440 + }, + { + "epoch": 4.918509895227008, + "grad_norm": 0.2846100628376007, + "learning_rate": 9.776201626094988e-05, + "loss": 0.0278, + "step": 8450 + }, + { + "epoch": 4.9243306169965075, + "grad_norm": 0.30399903655052185, + "learning_rate": 9.775385654921965e-05, + "loss": 0.025, + "step": 8460 + }, + { + "epoch": 4.930151338766007, + "grad_norm": 0.23329970240592957, + "learning_rate": 9.774568233112868e-05, + "loss": 0.0281, + "step": 8470 + }, + { + "epoch": 4.935972060535507, + "grad_norm": 0.34507837891578674, + "learning_rate": 9.773749360916007e-05, + "loss": 0.0349, + "step": 8480 + }, + { + "epoch": 4.941792782305006, + "grad_norm": 0.22298507392406464, + "learning_rate": 9.772929038580134e-05, + "loss": 0.0301, + "step": 8490 + }, + { + "epoch": 4.947613504074505, + "grad_norm": 0.3912322223186493, + "learning_rate": 9.772107266354439e-05, + "loss": 0.034, + "step": 8500 + }, + { + "epoch": 4.9534342258440045, + "grad_norm": 0.286010205745697, + "learning_rate": 9.77128404448856e-05, + "loss": 0.042, + "step": 8510 + }, + { + "epoch": 4.959254947613504, + "grad_norm": 0.3957435190677643, + "learning_rate": 9.770459373232565e-05, + "loss": 0.0271, + "step": 8520 + }, + { + "epoch": 4.965075669383004, + "grad_norm": 0.27569580078125, + "learning_rate": 9.769633252836969e-05, + "loss": 0.0387, + "step": 8530 + }, + { + "epoch": 4.970896391152503, + "grad_norm": 0.23417359590530396, + "learning_rate": 9.768805683552724e-05, + "loss": 0.0269, + "step": 8540 + }, + { + "epoch": 4.976717112922002, + "grad_norm": 0.19766351580619812, + "learning_rate": 9.767976665631228e-05, + "loss": 0.0285, + "step": 8550 + }, + { + "epoch": 4.9825378346915015, + "grad_norm": 0.24424602091312408, + "learning_rate": 9.767146199324311e-05, + "loss": 0.0289, + "step": 8560 + }, + { + "epoch": 4.988358556461002, + "grad_norm": 0.17575430870056152, + "learning_rate": 9.766314284884249e-05, + "loss": 0.022, + "step": 8570 + }, + { + "epoch": 4.994179278230501, + "grad_norm": 0.3118011951446533, + "learning_rate": 9.765480922563752e-05, + "loss": 0.0341, + "step": 8580 + }, + { + "epoch": 5.0, + "grad_norm": 0.3681282103061676, + "learning_rate": 9.764646112615978e-05, + "loss": 0.0293, + "step": 8590 + }, + { + "epoch": 5.005820721769499, + "grad_norm": 0.2745116353034973, + "learning_rate": 9.763809855294517e-05, + "loss": 0.0299, + "step": 8600 + }, + { + "epoch": 5.011641443538998, + "grad_norm": 0.31445538997650146, + "learning_rate": 9.762972150853404e-05, + "loss": 0.0273, + "step": 8610 + }, + { + "epoch": 5.0174621653084985, + "grad_norm": 0.40660813450813293, + "learning_rate": 9.762132999547111e-05, + "loss": 0.0284, + "step": 8620 + }, + { + "epoch": 5.023282887077998, + "grad_norm": 0.21973906457424164, + "learning_rate": 9.761292401630549e-05, + "loss": 0.0321, + "step": 8630 + }, + { + "epoch": 5.029103608847497, + "grad_norm": 0.24583366513252258, + "learning_rate": 9.76045035735907e-05, + "loss": 0.0254, + "step": 8640 + }, + { + "epoch": 5.034924330616996, + "grad_norm": 0.21140247583389282, + "learning_rate": 9.759606866988464e-05, + "loss": 0.0288, + "step": 8650 + }, + { + "epoch": 5.040745052386496, + "grad_norm": 0.3057156503200531, + "learning_rate": 9.758761930774963e-05, + "loss": 0.0238, + "step": 8660 + }, + { + "epoch": 5.0465657741559955, + "grad_norm": 0.35822394490242004, + "learning_rate": 9.757915548975235e-05, + "loss": 0.0286, + "step": 8670 + }, + { + "epoch": 5.052386495925495, + "grad_norm": 0.36092519760131836, + "learning_rate": 9.757067721846389e-05, + "loss": 0.0292, + "step": 8680 + }, + { + "epoch": 5.058207217694994, + "grad_norm": 0.2617834210395813, + "learning_rate": 9.756218449645971e-05, + "loss": 0.0287, + "step": 8690 + }, + { + "epoch": 5.064027939464494, + "grad_norm": 0.25446611642837524, + "learning_rate": 9.75536773263197e-05, + "loss": 0.0273, + "step": 8700 + }, + { + "epoch": 5.069848661233993, + "grad_norm": 0.2622671127319336, + "learning_rate": 9.75451557106281e-05, + "loss": 0.0239, + "step": 8710 + }, + { + "epoch": 5.0756693830034925, + "grad_norm": 0.32500898838043213, + "learning_rate": 9.753661965197354e-05, + "loss": 0.0294, + "step": 8720 + }, + { + "epoch": 5.081490104772992, + "grad_norm": 0.357164591550827, + "learning_rate": 9.752806915294908e-05, + "loss": 0.0324, + "step": 8730 + }, + { + "epoch": 5.087310826542491, + "grad_norm": 0.354848712682724, + "learning_rate": 9.75195042161521e-05, + "loss": 0.0298, + "step": 8740 + }, + { + "epoch": 5.093131548311991, + "grad_norm": 0.2351839244365692, + "learning_rate": 9.751092484418442e-05, + "loss": 0.0254, + "step": 8750 + }, + { + "epoch": 5.09895227008149, + "grad_norm": 0.2823387086391449, + "learning_rate": 9.750233103965224e-05, + "loss": 0.0297, + "step": 8760 + }, + { + "epoch": 5.104772991850989, + "grad_norm": 0.3433556854724884, + "learning_rate": 9.749372280516611e-05, + "loss": 0.0311, + "step": 8770 + }, + { + "epoch": 5.110593713620489, + "grad_norm": 0.3091376721858978, + "learning_rate": 9.748510014334097e-05, + "loss": 0.0338, + "step": 8780 + }, + { + "epoch": 5.116414435389989, + "grad_norm": 0.2891037166118622, + "learning_rate": 9.747646305679621e-05, + "loss": 0.0356, + "step": 8790 + }, + { + "epoch": 5.122235157159488, + "grad_norm": 0.3026866316795349, + "learning_rate": 9.74678115481555e-05, + "loss": 0.0379, + "step": 8800 + }, + { + "epoch": 5.128055878928987, + "grad_norm": 0.269062340259552, + "learning_rate": 9.745914562004696e-05, + "loss": 0.0329, + "step": 8810 + }, + { + "epoch": 5.133876600698486, + "grad_norm": 0.2834736704826355, + "learning_rate": 9.745046527510307e-05, + "loss": 0.0261, + "step": 8820 + }, + { + "epoch": 5.139697322467986, + "grad_norm": 0.39927417039871216, + "learning_rate": 9.744177051596068e-05, + "loss": 0.0338, + "step": 8830 + }, + { + "epoch": 5.145518044237486, + "grad_norm": 0.3089328110218048, + "learning_rate": 9.743306134526105e-05, + "loss": 0.0313, + "step": 8840 + }, + { + "epoch": 5.151338766006985, + "grad_norm": 0.30311131477355957, + "learning_rate": 9.742433776564977e-05, + "loss": 0.0323, + "step": 8850 + }, + { + "epoch": 5.157159487776484, + "grad_norm": 0.32421207427978516, + "learning_rate": 9.741559977977683e-05, + "loss": 0.0316, + "step": 8860 + }, + { + "epoch": 5.162980209545983, + "grad_norm": 0.27980881929397583, + "learning_rate": 9.740684739029661e-05, + "loss": 0.0302, + "step": 8870 + }, + { + "epoch": 5.1688009313154835, + "grad_norm": 0.30028510093688965, + "learning_rate": 9.739808059986789e-05, + "loss": 0.0366, + "step": 8880 + }, + { + "epoch": 5.174621653084983, + "grad_norm": 0.23879553377628326, + "learning_rate": 9.738929941115373e-05, + "loss": 0.0328, + "step": 8890 + }, + { + "epoch": 5.180442374854482, + "grad_norm": 0.26703301072120667, + "learning_rate": 9.738050382682167e-05, + "loss": 0.0226, + "step": 8900 + }, + { + "epoch": 5.186263096623981, + "grad_norm": 0.3741218149662018, + "learning_rate": 9.737169384954355e-05, + "loss": 0.031, + "step": 8910 + }, + { + "epoch": 5.192083818393481, + "grad_norm": 0.22629381716251373, + "learning_rate": 9.736286948199562e-05, + "loss": 0.0294, + "step": 8920 + }, + { + "epoch": 5.19790454016298, + "grad_norm": 0.21070019900798798, + "learning_rate": 9.735403072685848e-05, + "loss": 0.0297, + "step": 8930 + }, + { + "epoch": 5.20372526193248, + "grad_norm": 0.2769431471824646, + "learning_rate": 9.734517758681712e-05, + "loss": 0.0298, + "step": 8940 + }, + { + "epoch": 5.209545983701979, + "grad_norm": 0.35348930954933167, + "learning_rate": 9.733631006456088e-05, + "loss": 0.0284, + "step": 8950 + }, + { + "epoch": 5.215366705471478, + "grad_norm": 0.22606156766414642, + "learning_rate": 9.732742816278348e-05, + "loss": 0.0307, + "step": 8960 + }, + { + "epoch": 5.221187427240978, + "grad_norm": 0.270920991897583, + "learning_rate": 9.731853188418302e-05, + "loss": 0.0257, + "step": 8970 + }, + { + "epoch": 5.227008149010477, + "grad_norm": 0.35033416748046875, + "learning_rate": 9.730962123146194e-05, + "loss": 0.0355, + "step": 8980 + }, + { + "epoch": 5.232828870779977, + "grad_norm": 0.3175217807292938, + "learning_rate": 9.730069620732709e-05, + "loss": 0.0312, + "step": 8990 + }, + { + "epoch": 5.238649592549476, + "grad_norm": 0.33981460332870483, + "learning_rate": 9.72917568144896e-05, + "loss": 0.0292, + "step": 9000 + }, + { + "epoch": 5.244470314318976, + "grad_norm": 0.29995182156562805, + "learning_rate": 9.728280305566509e-05, + "loss": 0.0318, + "step": 9010 + }, + { + "epoch": 5.250291036088475, + "grad_norm": 0.3434208333492279, + "learning_rate": 9.727383493357343e-05, + "loss": 0.0317, + "step": 9020 + }, + { + "epoch": 5.256111757857974, + "grad_norm": 0.3383476734161377, + "learning_rate": 9.726485245093891e-05, + "loss": 0.0253, + "step": 9030 + }, + { + "epoch": 5.261932479627474, + "grad_norm": 0.2973507046699524, + "learning_rate": 9.725585561049018e-05, + "loss": 0.0261, + "step": 9040 + }, + { + "epoch": 5.267753201396973, + "grad_norm": 0.3432774841785431, + "learning_rate": 9.724684441496022e-05, + "loss": 0.0319, + "step": 9050 + }, + { + "epoch": 5.273573923166473, + "grad_norm": 0.3119683861732483, + "learning_rate": 9.72378188670864e-05, + "loss": 0.0265, + "step": 9060 + }, + { + "epoch": 5.279394644935972, + "grad_norm": 0.2844869792461395, + "learning_rate": 9.722877896961047e-05, + "loss": 0.026, + "step": 9070 + }, + { + "epoch": 5.285215366705471, + "grad_norm": 0.23981453478336334, + "learning_rate": 9.721972472527848e-05, + "loss": 0.0278, + "step": 9080 + }, + { + "epoch": 5.2910360884749705, + "grad_norm": 0.299370676279068, + "learning_rate": 9.721065613684089e-05, + "loss": 0.0232, + "step": 9090 + }, + { + "epoch": 5.296856810244471, + "grad_norm": 0.1979278177022934, + "learning_rate": 9.72015732070525e-05, + "loss": 0.027, + "step": 9100 + }, + { + "epoch": 5.30267753201397, + "grad_norm": 0.32906824350357056, + "learning_rate": 9.719247593867244e-05, + "loss": 0.0253, + "step": 9110 + }, + { + "epoch": 5.308498253783469, + "grad_norm": 0.3466174304485321, + "learning_rate": 9.718336433446423e-05, + "loss": 0.0261, + "step": 9120 + }, + { + "epoch": 5.314318975552968, + "grad_norm": 0.2825937569141388, + "learning_rate": 9.717423839719574e-05, + "loss": 0.0293, + "step": 9130 + }, + { + "epoch": 5.320139697322468, + "grad_norm": 0.33590978384017944, + "learning_rate": 9.71650981296392e-05, + "loss": 0.0291, + "step": 9140 + }, + { + "epoch": 5.325960419091968, + "grad_norm": 0.3719336986541748, + "learning_rate": 9.715594353457118e-05, + "loss": 0.027, + "step": 9150 + }, + { + "epoch": 5.331781140861467, + "grad_norm": 0.3093244135379791, + "learning_rate": 9.714677461477257e-05, + "loss": 0.0306, + "step": 9160 + }, + { + "epoch": 5.337601862630966, + "grad_norm": 0.2994060814380646, + "learning_rate": 9.713759137302869e-05, + "loss": 0.029, + "step": 9170 + }, + { + "epoch": 5.343422584400465, + "grad_norm": 0.3336246609687805, + "learning_rate": 9.712839381212914e-05, + "loss": 0.0232, + "step": 9180 + }, + { + "epoch": 5.349243306169965, + "grad_norm": 0.3859998285770416, + "learning_rate": 9.71191819348679e-05, + "loss": 0.0267, + "step": 9190 + }, + { + "epoch": 5.355064027939465, + "grad_norm": 0.3540157675743103, + "learning_rate": 9.710995574404331e-05, + "loss": 0.0297, + "step": 9200 + }, + { + "epoch": 5.360884749708964, + "grad_norm": 0.3089393675327301, + "learning_rate": 9.710071524245802e-05, + "loss": 0.0301, + "step": 9210 + }, + { + "epoch": 5.366705471478463, + "grad_norm": 0.320477694272995, + "learning_rate": 9.709146043291906e-05, + "loss": 0.0215, + "step": 9220 + }, + { + "epoch": 5.372526193247963, + "grad_norm": 0.28084465861320496, + "learning_rate": 9.70821913182378e-05, + "loss": 0.0291, + "step": 9230 + }, + { + "epoch": 5.378346915017462, + "grad_norm": 0.22664350271224976, + "learning_rate": 9.707290790122995e-05, + "loss": 0.0297, + "step": 9240 + }, + { + "epoch": 5.3841676367869615, + "grad_norm": 0.2186378836631775, + "learning_rate": 9.706361018471557e-05, + "loss": 0.0283, + "step": 9250 + }, + { + "epoch": 5.389988358556461, + "grad_norm": 0.32134509086608887, + "learning_rate": 9.705429817151906e-05, + "loss": 0.0307, + "step": 9260 + }, + { + "epoch": 5.395809080325961, + "grad_norm": 0.34575656056404114, + "learning_rate": 9.704497186446917e-05, + "loss": 0.0261, + "step": 9270 + }, + { + "epoch": 5.40162980209546, + "grad_norm": 0.25503382086753845, + "learning_rate": 9.703563126639896e-05, + "loss": 0.0337, + "step": 9280 + }, + { + "epoch": 5.407450523864959, + "grad_norm": 0.3006303608417511, + "learning_rate": 9.70262763801459e-05, + "loss": 0.0285, + "step": 9290 + }, + { + "epoch": 5.4132712456344585, + "grad_norm": 0.3044355809688568, + "learning_rate": 9.701690720855171e-05, + "loss": 0.032, + "step": 9300 + }, + { + "epoch": 5.419091967403958, + "grad_norm": 0.32066115736961365, + "learning_rate": 9.700752375446253e-05, + "loss": 0.0304, + "step": 9310 + }, + { + "epoch": 5.424912689173458, + "grad_norm": 0.2644328474998474, + "learning_rate": 9.69981260207288e-05, + "loss": 0.0403, + "step": 9320 + }, + { + "epoch": 5.430733410942957, + "grad_norm": 0.2899671792984009, + "learning_rate": 9.698871401020529e-05, + "loss": 0.0298, + "step": 9330 + }, + { + "epoch": 5.436554132712456, + "grad_norm": 0.2862459421157837, + "learning_rate": 9.697928772575112e-05, + "loss": 0.0277, + "step": 9340 + }, + { + "epoch": 5.4423748544819555, + "grad_norm": 0.2784060835838318, + "learning_rate": 9.696984717022976e-05, + "loss": 0.025, + "step": 9350 + }, + { + "epoch": 5.448195576251456, + "grad_norm": 0.27367472648620605, + "learning_rate": 9.6960392346509e-05, + "loss": 0.0225, + "step": 9360 + }, + { + "epoch": 5.454016298020955, + "grad_norm": 0.1996067315340042, + "learning_rate": 9.695092325746097e-05, + "loss": 0.0376, + "step": 9370 + }, + { + "epoch": 5.459837019790454, + "grad_norm": 0.3091476261615753, + "learning_rate": 9.694143990596211e-05, + "loss": 0.0241, + "step": 9380 + }, + { + "epoch": 5.465657741559953, + "grad_norm": 0.3844224810600281, + "learning_rate": 9.693194229489325e-05, + "loss": 0.0384, + "step": 9390 + }, + { + "epoch": 5.471478463329452, + "grad_norm": 0.3749852776527405, + "learning_rate": 9.692243042713944e-05, + "loss": 0.0265, + "step": 9400 + }, + { + "epoch": 5.4772991850989525, + "grad_norm": 0.25917714834213257, + "learning_rate": 9.691290430559022e-05, + "loss": 0.029, + "step": 9410 + }, + { + "epoch": 5.483119906868452, + "grad_norm": 0.2624014616012573, + "learning_rate": 9.690336393313932e-05, + "loss": 0.0277, + "step": 9420 + }, + { + "epoch": 5.488940628637951, + "grad_norm": 0.35411402583122253, + "learning_rate": 9.689380931268487e-05, + "loss": 0.0319, + "step": 9430 + }, + { + "epoch": 5.49476135040745, + "grad_norm": 0.39634039998054504, + "learning_rate": 9.688424044712932e-05, + "loss": 0.0321, + "step": 9440 + }, + { + "epoch": 5.50058207217695, + "grad_norm": 0.26770657300949097, + "learning_rate": 9.687465733937942e-05, + "loss": 0.036, + "step": 9450 + }, + { + "epoch": 5.5064027939464495, + "grad_norm": 0.34847885370254517, + "learning_rate": 9.686505999234627e-05, + "loss": 0.0284, + "step": 9460 + }, + { + "epoch": 5.512223515715949, + "grad_norm": 0.16722723841667175, + "learning_rate": 9.685544840894529e-05, + "loss": 0.0299, + "step": 9470 + }, + { + "epoch": 5.518044237485448, + "grad_norm": 0.39993759989738464, + "learning_rate": 9.684582259209624e-05, + "loss": 0.027, + "step": 9480 + }, + { + "epoch": 5.523864959254947, + "grad_norm": 0.23280899226665497, + "learning_rate": 9.683618254472317e-05, + "loss": 0.0283, + "step": 9490 + }, + { + "epoch": 5.529685681024447, + "grad_norm": 0.28795477747917175, + "learning_rate": 9.682652826975449e-05, + "loss": 0.0258, + "step": 9500 + }, + { + "epoch": 5.5355064027939465, + "grad_norm": 0.22744226455688477, + "learning_rate": 9.681685977012291e-05, + "loss": 0.0272, + "step": 9510 + }, + { + "epoch": 5.541327124563446, + "grad_norm": 0.21913710236549377, + "learning_rate": 9.680717704876546e-05, + "loss": 0.0272, + "step": 9520 + }, + { + "epoch": 5.547147846332946, + "grad_norm": 0.33276960253715515, + "learning_rate": 9.679748010862349e-05, + "loss": 0.0294, + "step": 9530 + }, + { + "epoch": 5.552968568102445, + "grad_norm": 0.2518536448478699, + "learning_rate": 9.678776895264267e-05, + "loss": 0.03, + "step": 9540 + }, + { + "epoch": 5.558789289871944, + "grad_norm": 0.3743119239807129, + "learning_rate": 9.6778043583773e-05, + "loss": 0.0356, + "step": 9550 + }, + { + "epoch": 5.564610011641443, + "grad_norm": 0.2599638104438782, + "learning_rate": 9.67683040049688e-05, + "loss": 0.0256, + "step": 9560 + }, + { + "epoch": 5.570430733410943, + "grad_norm": 0.315051406621933, + "learning_rate": 9.675855021918869e-05, + "loss": 0.0325, + "step": 9570 + }, + { + "epoch": 5.576251455180443, + "grad_norm": 0.2475096434354782, + "learning_rate": 9.674878222939561e-05, + "loss": 0.0225, + "step": 9580 + }, + { + "epoch": 5.582072176949942, + "grad_norm": 0.24457235634326935, + "learning_rate": 9.673900003855681e-05, + "loss": 0.0217, + "step": 9590 + }, + { + "epoch": 5.587892898719441, + "grad_norm": 0.2921212315559387, + "learning_rate": 9.672920364964389e-05, + "loss": 0.0343, + "step": 9600 + }, + { + "epoch": 5.59371362048894, + "grad_norm": 0.33207812905311584, + "learning_rate": 9.671939306563269e-05, + "loss": 0.0221, + "step": 9610 + }, + { + "epoch": 5.5995343422584405, + "grad_norm": 0.2327045053243637, + "learning_rate": 9.670956828950345e-05, + "loss": 0.0298, + "step": 9620 + }, + { + "epoch": 5.60535506402794, + "grad_norm": 0.259838342666626, + "learning_rate": 9.669972932424065e-05, + "loss": 0.025, + "step": 9630 + }, + { + "epoch": 5.611175785797439, + "grad_norm": 0.21612700819969177, + "learning_rate": 9.668987617283312e-05, + "loss": 0.0226, + "step": 9640 + }, + { + "epoch": 5.616996507566938, + "grad_norm": 0.3452034592628479, + "learning_rate": 9.668000883827397e-05, + "loss": 0.033, + "step": 9650 + }, + { + "epoch": 5.622817229336437, + "grad_norm": 0.3942038118839264, + "learning_rate": 9.667012732356067e-05, + "loss": 0.0317, + "step": 9660 + }, + { + "epoch": 5.6286379511059375, + "grad_norm": 0.2348872721195221, + "learning_rate": 9.666023163169493e-05, + "loss": 0.0423, + "step": 9670 + }, + { + "epoch": 5.634458672875437, + "grad_norm": 0.26723581552505493, + "learning_rate": 9.665032176568281e-05, + "loss": 0.0289, + "step": 9680 + }, + { + "epoch": 5.640279394644936, + "grad_norm": 0.2873128056526184, + "learning_rate": 9.664039772853469e-05, + "loss": 0.0297, + "step": 9690 + }, + { + "epoch": 5.646100116414435, + "grad_norm": 0.2736475169658661, + "learning_rate": 9.663045952326518e-05, + "loss": 0.0274, + "step": 9700 + }, + { + "epoch": 5.651920838183935, + "grad_norm": 0.2881721556186676, + "learning_rate": 9.662050715289328e-05, + "loss": 0.0211, + "step": 9710 + }, + { + "epoch": 5.657741559953434, + "grad_norm": 0.19016215205192566, + "learning_rate": 9.661054062044226e-05, + "loss": 0.0321, + "step": 9720 + }, + { + "epoch": 5.663562281722934, + "grad_norm": 0.28978970646858215, + "learning_rate": 9.660055992893968e-05, + "loss": 0.0218, + "step": 9730 + }, + { + "epoch": 5.669383003492433, + "grad_norm": 0.30262482166290283, + "learning_rate": 9.659056508141739e-05, + "loss": 0.028, + "step": 9740 + }, + { + "epoch": 5.675203725261932, + "grad_norm": 0.38813042640686035, + "learning_rate": 9.658055608091161e-05, + "loss": 0.0318, + "step": 9750 + }, + { + "epoch": 5.681024447031432, + "grad_norm": 0.3654748201370239, + "learning_rate": 9.657053293046276e-05, + "loss": 0.0317, + "step": 9760 + }, + { + "epoch": 5.686845168800931, + "grad_norm": 0.3171159029006958, + "learning_rate": 9.656049563311564e-05, + "loss": 0.0259, + "step": 9770 + }, + { + "epoch": 5.692665890570431, + "grad_norm": 0.2903099060058594, + "learning_rate": 9.655044419191929e-05, + "loss": 0.0321, + "step": 9780 + }, + { + "epoch": 5.69848661233993, + "grad_norm": 0.22357884049415588, + "learning_rate": 9.654037860992711e-05, + "loss": 0.0253, + "step": 9790 + }, + { + "epoch": 5.70430733410943, + "grad_norm": 0.195988729596138, + "learning_rate": 9.653029889019672e-05, + "loss": 0.0285, + "step": 9800 + }, + { + "epoch": 5.710128055878929, + "grad_norm": 0.2945406436920166, + "learning_rate": 9.65202050357901e-05, + "loss": 0.0249, + "step": 9810 + }, + { + "epoch": 5.715948777648428, + "grad_norm": 0.23970866203308105, + "learning_rate": 9.651009704977347e-05, + "loss": 0.0268, + "step": 9820 + }, + { + "epoch": 5.721769499417928, + "grad_norm": 0.35993441939353943, + "learning_rate": 9.649997493521738e-05, + "loss": 0.0295, + "step": 9830 + }, + { + "epoch": 5.727590221187427, + "grad_norm": 0.40474560856819153, + "learning_rate": 9.64898386951967e-05, + "loss": 0.0267, + "step": 9840 + }, + { + "epoch": 5.733410942956927, + "grad_norm": 0.3716301918029785, + "learning_rate": 9.647968833279049e-05, + "loss": 0.0299, + "step": 9850 + }, + { + "epoch": 5.739231664726426, + "grad_norm": 0.30530211329460144, + "learning_rate": 9.646952385108218e-05, + "loss": 0.0329, + "step": 9860 + }, + { + "epoch": 5.745052386495925, + "grad_norm": 0.3212750256061554, + "learning_rate": 9.645934525315951e-05, + "loss": 0.0238, + "step": 9870 + }, + { + "epoch": 5.7508731082654245, + "grad_norm": 0.3503616154193878, + "learning_rate": 9.644915254211442e-05, + "loss": 0.0255, + "step": 9880 + }, + { + "epoch": 5.756693830034925, + "grad_norm": 0.3831939101219177, + "learning_rate": 9.643894572104321e-05, + "loss": 0.0286, + "step": 9890 + }, + { + "epoch": 5.762514551804424, + "grad_norm": 0.3143749237060547, + "learning_rate": 9.642872479304644e-05, + "loss": 0.0295, + "step": 9900 + }, + { + "epoch": 5.768335273573923, + "grad_norm": 0.31607869267463684, + "learning_rate": 9.641848976122895e-05, + "loss": 0.0246, + "step": 9910 + }, + { + "epoch": 5.774155995343422, + "grad_norm": 0.3041110336780548, + "learning_rate": 9.64082406286999e-05, + "loss": 0.0223, + "step": 9920 + }, + { + "epoch": 5.779976717112922, + "grad_norm": 0.44187498092651367, + "learning_rate": 9.639797739857269e-05, + "loss": 0.0368, + "step": 9930 + }, + { + "epoch": 5.785797438882422, + "grad_norm": 0.31432193517684937, + "learning_rate": 9.638770007396498e-05, + "loss": 0.0291, + "step": 9940 + }, + { + "epoch": 5.791618160651921, + "grad_norm": 0.2838660180568695, + "learning_rate": 9.63774086579988e-05, + "loss": 0.0367, + "step": 9950 + }, + { + "epoch": 5.79743888242142, + "grad_norm": 0.26412907242774963, + "learning_rate": 9.63671031538004e-05, + "loss": 0.0259, + "step": 9960 + }, + { + "epoch": 5.80325960419092, + "grad_norm": 0.3235618770122528, + "learning_rate": 9.635678356450031e-05, + "loss": 0.0259, + "step": 9970 + }, + { + "epoch": 5.809080325960419, + "grad_norm": 0.30197545886039734, + "learning_rate": 9.634644989323336e-05, + "loss": 0.0298, + "step": 9980 + }, + { + "epoch": 5.814901047729919, + "grad_norm": 0.41135868430137634, + "learning_rate": 9.633610214313861e-05, + "loss": 0.0275, + "step": 9990 + }, + { + "epoch": 5.820721769499418, + "grad_norm": 0.31232592463493347, + "learning_rate": 9.632574031735951e-05, + "loss": 0.0253, + "step": 10000 + }, + { + "epoch": 5.826542491268917, + "grad_norm": 0.2941375970840454, + "learning_rate": 9.631536441904364e-05, + "loss": 0.0257, + "step": 10010 + }, + { + "epoch": 5.832363213038417, + "grad_norm": 0.31117257475852966, + "learning_rate": 9.630497445134293e-05, + "loss": 0.03, + "step": 10020 + }, + { + "epoch": 5.838183934807916, + "grad_norm": 0.18792855739593506, + "learning_rate": 9.62945704174136e-05, + "loss": 0.0243, + "step": 10030 + }, + { + "epoch": 5.8440046565774155, + "grad_norm": 0.3407415449619293, + "learning_rate": 9.628415232041612e-05, + "loss": 0.0334, + "step": 10040 + }, + { + "epoch": 5.849825378346915, + "grad_norm": 0.38098669052124023, + "learning_rate": 9.627372016351524e-05, + "loss": 0.03, + "step": 10050 + }, + { + "epoch": 5.855646100116415, + "grad_norm": 0.3249993622303009, + "learning_rate": 9.626327394987995e-05, + "loss": 0.0255, + "step": 10060 + }, + { + "epoch": 5.861466821885914, + "grad_norm": 0.2961386740207672, + "learning_rate": 9.625281368268355e-05, + "loss": 0.0265, + "step": 10070 + }, + { + "epoch": 5.867287543655413, + "grad_norm": 0.3370297849178314, + "learning_rate": 9.624233936510357e-05, + "loss": 0.0316, + "step": 10080 + }, + { + "epoch": 5.8731082654249125, + "grad_norm": 0.3450591266155243, + "learning_rate": 9.623185100032187e-05, + "loss": 0.0317, + "step": 10090 + }, + { + "epoch": 5.878928987194412, + "grad_norm": 0.24700306355953217, + "learning_rate": 9.62213485915245e-05, + "loss": 0.0267, + "step": 10100 + }, + { + "epoch": 5.884749708963912, + "grad_norm": 0.2914005517959595, + "learning_rate": 9.621083214190186e-05, + "loss": 0.0274, + "step": 10110 + }, + { + "epoch": 5.890570430733411, + "grad_norm": 0.26179754734039307, + "learning_rate": 9.62003016546485e-05, + "loss": 0.0292, + "step": 10120 + }, + { + "epoch": 5.89639115250291, + "grad_norm": 0.3360522389411926, + "learning_rate": 9.618975713296339e-05, + "loss": 0.0236, + "step": 10130 + }, + { + "epoch": 5.9022118742724095, + "grad_norm": 0.23601940274238586, + "learning_rate": 9.61791985800496e-05, + "loss": 0.0213, + "step": 10140 + }, + { + "epoch": 5.90803259604191, + "grad_norm": 0.34737423062324524, + "learning_rate": 9.616862599911458e-05, + "loss": 0.0284, + "step": 10150 + }, + { + "epoch": 5.913853317811409, + "grad_norm": 0.23361477255821228, + "learning_rate": 9.615803939337e-05, + "loss": 0.0269, + "step": 10160 + }, + { + "epoch": 5.919674039580908, + "grad_norm": 0.3128803074359894, + "learning_rate": 9.614743876603178e-05, + "loss": 0.0271, + "step": 10170 + }, + { + "epoch": 5.925494761350407, + "grad_norm": 0.1693059802055359, + "learning_rate": 9.613682412032013e-05, + "loss": 0.0288, + "step": 10180 + }, + { + "epoch": 5.931315483119906, + "grad_norm": 0.3016536831855774, + "learning_rate": 9.612619545945947e-05, + "loss": 0.035, + "step": 10190 + }, + { + "epoch": 5.9371362048894065, + "grad_norm": 0.3183857798576355, + "learning_rate": 9.611555278667852e-05, + "loss": 0.0263, + "step": 10200 + }, + { + "epoch": 5.942956926658906, + "grad_norm": 0.18038499355316162, + "learning_rate": 9.610489610521024e-05, + "loss": 0.0223, + "step": 10210 + }, + { + "epoch": 5.948777648428405, + "grad_norm": 0.19322970509529114, + "learning_rate": 9.609422541829187e-05, + "loss": 0.0266, + "step": 10220 + }, + { + "epoch": 5.954598370197904, + "grad_norm": 0.20358052849769592, + "learning_rate": 9.608354072916486e-05, + "loss": 0.0358, + "step": 10230 + }, + { + "epoch": 5.960419091967404, + "grad_norm": 0.28910472989082336, + "learning_rate": 9.607284204107493e-05, + "loss": 0.0336, + "step": 10240 + }, + { + "epoch": 5.9662398137369035, + "grad_norm": 0.3803527057170868, + "learning_rate": 9.606212935727208e-05, + "loss": 0.0231, + "step": 10250 + }, + { + "epoch": 5.972060535506403, + "grad_norm": 0.2496783286333084, + "learning_rate": 9.605140268101052e-05, + "loss": 0.0265, + "step": 10260 + }, + { + "epoch": 5.977881257275902, + "grad_norm": 0.3664393723011017, + "learning_rate": 9.604066201554875e-05, + "loss": 0.023, + "step": 10270 + }, + { + "epoch": 5.983701979045401, + "grad_norm": 0.2764727473258972, + "learning_rate": 9.60299073641495e-05, + "loss": 0.028, + "step": 10280 + }, + { + "epoch": 5.989522700814901, + "grad_norm": 0.26600730419158936, + "learning_rate": 9.601913873007974e-05, + "loss": 0.0209, + "step": 10290 + }, + { + "epoch": 5.9953434225844005, + "grad_norm": 0.2535768151283264, + "learning_rate": 9.60083561166107e-05, + "loss": 0.026, + "step": 10300 + }, + { + "epoch": 6.0011641443539, + "grad_norm": 0.3044598698616028, + "learning_rate": 9.599755952701783e-05, + "loss": 0.0298, + "step": 10310 + }, + { + "epoch": 6.006984866123399, + "grad_norm": 0.3348448872566223, + "learning_rate": 9.598674896458089e-05, + "loss": 0.0284, + "step": 10320 + }, + { + "epoch": 6.012805587892899, + "grad_norm": 0.2525580823421478, + "learning_rate": 9.597592443258383e-05, + "loss": 0.0195, + "step": 10330 + }, + { + "epoch": 6.018626309662398, + "grad_norm": 0.2645496129989624, + "learning_rate": 9.596508593431483e-05, + "loss": 0.0223, + "step": 10340 + }, + { + "epoch": 6.024447031431897, + "grad_norm": 0.1832256019115448, + "learning_rate": 9.59542334730664e-05, + "loss": 0.0246, + "step": 10350 + }, + { + "epoch": 6.030267753201397, + "grad_norm": 0.2854319214820862, + "learning_rate": 9.594336705213516e-05, + "loss": 0.0279, + "step": 10360 + }, + { + "epoch": 6.036088474970897, + "grad_norm": 0.19047343730926514, + "learning_rate": 9.593248667482208e-05, + "loss": 0.0278, + "step": 10370 + }, + { + "epoch": 6.041909196740396, + "grad_norm": 0.2932100296020508, + "learning_rate": 9.592159234443233e-05, + "loss": 0.0312, + "step": 10380 + }, + { + "epoch": 6.047729918509895, + "grad_norm": 0.2996804118156433, + "learning_rate": 9.59106840642753e-05, + "loss": 0.035, + "step": 10390 + }, + { + "epoch": 6.053550640279394, + "grad_norm": 0.27939924597740173, + "learning_rate": 9.589976183766467e-05, + "loss": 0.0266, + "step": 10400 + }, + { + "epoch": 6.0593713620488945, + "grad_norm": 0.33766815066337585, + "learning_rate": 9.58888256679183e-05, + "loss": 0.0296, + "step": 10410 + }, + { + "epoch": 6.065192083818394, + "grad_norm": 0.29058006405830383, + "learning_rate": 9.587787555835832e-05, + "loss": 0.0254, + "step": 10420 + }, + { + "epoch": 6.071012805587893, + "grad_norm": 0.3202958405017853, + "learning_rate": 9.586691151231107e-05, + "loss": 0.0254, + "step": 10430 + }, + { + "epoch": 6.076833527357392, + "grad_norm": 0.2877427637577057, + "learning_rate": 9.585593353310715e-05, + "loss": 0.0236, + "step": 10440 + }, + { + "epoch": 6.082654249126891, + "grad_norm": 0.27137327194213867, + "learning_rate": 9.58449416240814e-05, + "loss": 0.0313, + "step": 10450 + }, + { + "epoch": 6.0884749708963914, + "grad_norm": 0.26526597142219543, + "learning_rate": 9.583393578857283e-05, + "loss": 0.0269, + "step": 10460 + }, + { + "epoch": 6.094295692665891, + "grad_norm": 0.2630753219127655, + "learning_rate": 9.582291602992474e-05, + "loss": 0.024, + "step": 10470 + }, + { + "epoch": 6.10011641443539, + "grad_norm": 0.3060391843318939, + "learning_rate": 9.581188235148466e-05, + "loss": 0.0247, + "step": 10480 + }, + { + "epoch": 6.105937136204889, + "grad_norm": 0.24119700491428375, + "learning_rate": 9.58008347566043e-05, + "loss": 0.021, + "step": 10490 + }, + { + "epoch": 6.111757857974389, + "grad_norm": 0.2543535828590393, + "learning_rate": 9.578977324863965e-05, + "loss": 0.0234, + "step": 10500 + }, + { + "epoch": 6.117578579743888, + "grad_norm": 0.2902235984802246, + "learning_rate": 9.577869783095089e-05, + "loss": 0.0296, + "step": 10510 + }, + { + "epoch": 6.123399301513388, + "grad_norm": 0.22509783506393433, + "learning_rate": 9.576760850690245e-05, + "loss": 0.0274, + "step": 10520 + }, + { + "epoch": 6.129220023282887, + "grad_norm": 0.24324366450309753, + "learning_rate": 9.575650527986298e-05, + "loss": 0.0348, + "step": 10530 + }, + { + "epoch": 6.135040745052386, + "grad_norm": 0.3018445670604706, + "learning_rate": 9.574538815320531e-05, + "loss": 0.0241, + "step": 10540 + }, + { + "epoch": 6.140861466821886, + "grad_norm": 0.4279215931892395, + "learning_rate": 9.573425713030656e-05, + "loss": 0.033, + "step": 10550 + }, + { + "epoch": 6.146682188591385, + "grad_norm": 0.2954906225204468, + "learning_rate": 9.572311221454806e-05, + "loss": 0.0272, + "step": 10560 + }, + { + "epoch": 6.152502910360885, + "grad_norm": 0.3261367976665497, + "learning_rate": 9.57119534093153e-05, + "loss": 0.0329, + "step": 10570 + }, + { + "epoch": 6.158323632130384, + "grad_norm": 0.25966331362724304, + "learning_rate": 9.570078071799806e-05, + "loss": 0.0244, + "step": 10580 + }, + { + "epoch": 6.164144353899884, + "grad_norm": 0.3128201961517334, + "learning_rate": 9.568959414399028e-05, + "loss": 0.0243, + "step": 10590 + }, + { + "epoch": 6.169965075669383, + "grad_norm": 0.2482699304819107, + "learning_rate": 9.567839369069018e-05, + "loss": 0.0243, + "step": 10600 + }, + { + "epoch": 6.175785797438882, + "grad_norm": 0.28515002131462097, + "learning_rate": 9.566717936150013e-05, + "loss": 0.0289, + "step": 10610 + }, + { + "epoch": 6.181606519208382, + "grad_norm": 0.28069421648979187, + "learning_rate": 9.565595115982678e-05, + "loss": 0.0252, + "step": 10620 + }, + { + "epoch": 6.187427240977882, + "grad_norm": 0.4143858253955841, + "learning_rate": 9.564470908908094e-05, + "loss": 0.0282, + "step": 10630 + }, + { + "epoch": 6.193247962747381, + "grad_norm": 0.2552490234375, + "learning_rate": 9.563345315267764e-05, + "loss": 0.0299, + "step": 10640 + }, + { + "epoch": 6.19906868451688, + "grad_norm": 0.42476680874824524, + "learning_rate": 9.562218335403616e-05, + "loss": 0.0279, + "step": 10650 + }, + { + "epoch": 6.204889406286379, + "grad_norm": 0.13937810063362122, + "learning_rate": 9.561089969657999e-05, + "loss": 0.0248, + "step": 10660 + }, + { + "epoch": 6.2107101280558785, + "grad_norm": 0.28503134846687317, + "learning_rate": 9.559960218373673e-05, + "loss": 0.0276, + "step": 10670 + }, + { + "epoch": 6.216530849825379, + "grad_norm": 0.2948184311389923, + "learning_rate": 9.558829081893836e-05, + "loss": 0.0274, + "step": 10680 + }, + { + "epoch": 6.222351571594878, + "grad_norm": 0.24125419557094574, + "learning_rate": 9.55769656056209e-05, + "loss": 0.0224, + "step": 10690 + }, + { + "epoch": 6.228172293364377, + "grad_norm": 0.283447802066803, + "learning_rate": 9.556562654722469e-05, + "loss": 0.0273, + "step": 10700 + }, + { + "epoch": 6.233993015133876, + "grad_norm": 0.1874697059392929, + "learning_rate": 9.555427364719422e-05, + "loss": 0.032, + "step": 10710 + }, + { + "epoch": 6.239813736903376, + "grad_norm": 0.27953535318374634, + "learning_rate": 9.55429069089782e-05, + "loss": 0.0232, + "step": 10720 + }, + { + "epoch": 6.245634458672876, + "grad_norm": 0.3227843642234802, + "learning_rate": 9.553152633602956e-05, + "loss": 0.0272, + "step": 10730 + }, + { + "epoch": 6.251455180442375, + "grad_norm": 0.3341033458709717, + "learning_rate": 9.552013193180543e-05, + "loss": 0.0299, + "step": 10740 + }, + { + "epoch": 6.257275902211874, + "grad_norm": 0.3807360827922821, + "learning_rate": 9.550872369976707e-05, + "loss": 0.0257, + "step": 10750 + }, + { + "epoch": 6.263096623981374, + "grad_norm": 0.4502421021461487, + "learning_rate": 9.549730164338007e-05, + "loss": 0.0271, + "step": 10760 + }, + { + "epoch": 6.268917345750873, + "grad_norm": 0.3101041615009308, + "learning_rate": 9.548586576611408e-05, + "loss": 0.0261, + "step": 10770 + }, + { + "epoch": 6.2747380675203726, + "grad_norm": 0.2762364149093628, + "learning_rate": 9.54744160714431e-05, + "loss": 0.0252, + "step": 10780 + }, + { + "epoch": 6.280558789289872, + "grad_norm": 0.29810017347335815, + "learning_rate": 9.546295256284516e-05, + "loss": 0.0242, + "step": 10790 + }, + { + "epoch": 6.286379511059371, + "grad_norm": 0.19658209383487701, + "learning_rate": 9.545147524380265e-05, + "loss": 0.0261, + "step": 10800 + }, + { + "epoch": 6.292200232828871, + "grad_norm": 0.20363454520702362, + "learning_rate": 9.543998411780201e-05, + "loss": 0.0258, + "step": 10810 + }, + { + "epoch": 6.29802095459837, + "grad_norm": 0.24392762780189514, + "learning_rate": 9.542847918833397e-05, + "loss": 0.0248, + "step": 10820 + }, + { + "epoch": 6.3038416763678695, + "grad_norm": 0.34497615694999695, + "learning_rate": 9.541696045889343e-05, + "loss": 0.0258, + "step": 10830 + }, + { + "epoch": 6.309662398137369, + "grad_norm": 0.15507875382900238, + "learning_rate": 9.540542793297947e-05, + "loss": 0.022, + "step": 10840 + }, + { + "epoch": 6.315483119906869, + "grad_norm": 0.2252160906791687, + "learning_rate": 9.539388161409537e-05, + "loss": 0.0231, + "step": 10850 + }, + { + "epoch": 6.321303841676368, + "grad_norm": 0.2921842634677887, + "learning_rate": 9.538232150574857e-05, + "loss": 0.0258, + "step": 10860 + }, + { + "epoch": 6.327124563445867, + "grad_norm": 0.17416934669017792, + "learning_rate": 9.537074761145076e-05, + "loss": 0.0224, + "step": 10870 + }, + { + "epoch": 6.3329452852153665, + "grad_norm": 0.40828248858451843, + "learning_rate": 9.535915993471778e-05, + "loss": 0.0312, + "step": 10880 + }, + { + "epoch": 6.338766006984866, + "grad_norm": 0.20112977921962738, + "learning_rate": 9.534755847906964e-05, + "loss": 0.0342, + "step": 10890 + }, + { + "epoch": 6.344586728754366, + "grad_norm": 0.36793601512908936, + "learning_rate": 9.533594324803057e-05, + "loss": 0.0278, + "step": 10900 + }, + { + "epoch": 6.350407450523865, + "grad_norm": 0.27709075808525085, + "learning_rate": 9.532431424512895e-05, + "loss": 0.0213, + "step": 10910 + }, + { + "epoch": 6.356228172293364, + "grad_norm": 0.18620198965072632, + "learning_rate": 9.531267147389741e-05, + "loss": 0.0284, + "step": 10920 + }, + { + "epoch": 6.3620488940628634, + "grad_norm": 0.33648332953453064, + "learning_rate": 9.530101493787266e-05, + "loss": 0.0283, + "step": 10930 + }, + { + "epoch": 6.3678696158323636, + "grad_norm": 0.25762873888015747, + "learning_rate": 9.528934464059571e-05, + "loss": 0.0219, + "step": 10940 + }, + { + "epoch": 6.373690337601863, + "grad_norm": 0.3006114661693573, + "learning_rate": 9.527766058561163e-05, + "loss": 0.0227, + "step": 10950 + }, + { + "epoch": 6.379511059371362, + "grad_norm": 0.30722740292549133, + "learning_rate": 9.526596277646976e-05, + "loss": 0.028, + "step": 10960 + }, + { + "epoch": 6.385331781140861, + "grad_norm": 0.27116382122039795, + "learning_rate": 9.525425121672358e-05, + "loss": 0.0256, + "step": 10970 + }, + { + "epoch": 6.391152502910361, + "grad_norm": 0.30885592103004456, + "learning_rate": 9.524252590993074e-05, + "loss": 0.0217, + "step": 10980 + }, + { + "epoch": 6.3969732246798605, + "grad_norm": 0.14093942940235138, + "learning_rate": 9.523078685965309e-05, + "loss": 0.0243, + "step": 10990 + }, + { + "epoch": 6.40279394644936, + "grad_norm": 0.3448682427406311, + "learning_rate": 9.521903406945664e-05, + "loss": 0.022, + "step": 11000 + }, + { + "epoch": 6.408614668218859, + "grad_norm": 0.20031985640525818, + "learning_rate": 9.520726754291158e-05, + "loss": 0.0239, + "step": 11010 + }, + { + "epoch": 6.414435389988358, + "grad_norm": 0.23860538005828857, + "learning_rate": 9.519548728359227e-05, + "loss": 0.027, + "step": 11020 + }, + { + "epoch": 6.420256111757858, + "grad_norm": 0.29936251044273376, + "learning_rate": 9.518369329507726e-05, + "loss": 0.0207, + "step": 11030 + }, + { + "epoch": 6.4260768335273575, + "grad_norm": 0.2851185202598572, + "learning_rate": 9.51718855809492e-05, + "loss": 0.0211, + "step": 11040 + }, + { + "epoch": 6.431897555296857, + "grad_norm": 0.30796223878860474, + "learning_rate": 9.516006414479502e-05, + "loss": 0.0296, + "step": 11050 + }, + { + "epoch": 6.437718277066356, + "grad_norm": 0.3346829116344452, + "learning_rate": 9.514822899020572e-05, + "loss": 0.0388, + "step": 11060 + }, + { + "epoch": 6.443538998835856, + "grad_norm": 0.21104884147644043, + "learning_rate": 9.513638012077654e-05, + "loss": 0.0304, + "step": 11070 + }, + { + "epoch": 6.449359720605355, + "grad_norm": 0.22497624158859253, + "learning_rate": 9.512451754010683e-05, + "loss": 0.0237, + "step": 11080 + }, + { + "epoch": 6.455180442374854, + "grad_norm": 0.26851096749305725, + "learning_rate": 9.511264125180013e-05, + "loss": 0.0202, + "step": 11090 + }, + { + "epoch": 6.461001164144354, + "grad_norm": 0.2667107582092285, + "learning_rate": 9.510075125946414e-05, + "loss": 0.0253, + "step": 11100 + }, + { + "epoch": 6.466821885913854, + "grad_norm": 0.25902363657951355, + "learning_rate": 9.508884756671075e-05, + "loss": 0.0268, + "step": 11110 + }, + { + "epoch": 6.472642607683353, + "grad_norm": 0.25643274188041687, + "learning_rate": 9.507693017715596e-05, + "loss": 0.0244, + "step": 11120 + }, + { + "epoch": 6.478463329452852, + "grad_norm": 0.2328399121761322, + "learning_rate": 9.506499909441997e-05, + "loss": 0.0242, + "step": 11130 + }, + { + "epoch": 6.484284051222351, + "grad_norm": 0.301040917634964, + "learning_rate": 9.505305432212713e-05, + "loss": 0.0242, + "step": 11140 + }, + { + "epoch": 6.490104772991851, + "grad_norm": 0.30066782236099243, + "learning_rate": 9.504109586390595e-05, + "loss": 0.0239, + "step": 11150 + }, + { + "epoch": 6.495925494761351, + "grad_norm": 0.23411887884140015, + "learning_rate": 9.502912372338908e-05, + "loss": 0.0278, + "step": 11160 + }, + { + "epoch": 6.50174621653085, + "grad_norm": 0.2123546600341797, + "learning_rate": 9.501713790421335e-05, + "loss": 0.024, + "step": 11170 + }, + { + "epoch": 6.507566938300349, + "grad_norm": 0.22573134303092957, + "learning_rate": 9.500513841001974e-05, + "loss": 0.0259, + "step": 11180 + }, + { + "epoch": 6.513387660069848, + "grad_norm": 0.24069786071777344, + "learning_rate": 9.499312524445336e-05, + "loss": 0.0298, + "step": 11190 + }, + { + "epoch": 6.5192083818393485, + "grad_norm": 0.31413379311561584, + "learning_rate": 9.498109841116351e-05, + "loss": 0.0255, + "step": 11200 + }, + { + "epoch": 6.525029103608848, + "grad_norm": 0.26661282777786255, + "learning_rate": 9.496905791380363e-05, + "loss": 0.034, + "step": 11210 + }, + { + "epoch": 6.530849825378347, + "grad_norm": 0.32802045345306396, + "learning_rate": 9.495700375603129e-05, + "loss": 0.0214, + "step": 11220 + }, + { + "epoch": 6.536670547147846, + "grad_norm": 0.31903737783432007, + "learning_rate": 9.494493594150822e-05, + "loss": 0.0293, + "step": 11230 + }, + { + "epoch": 6.542491268917345, + "grad_norm": 0.25846055150032043, + "learning_rate": 9.493285447390032e-05, + "loss": 0.0263, + "step": 11240 + }, + { + "epoch": 6.548311990686845, + "grad_norm": 0.2982645034790039, + "learning_rate": 9.492075935687761e-05, + "loss": 0.0237, + "step": 11250 + }, + { + "epoch": 6.554132712456345, + "grad_norm": 0.2745411694049835, + "learning_rate": 9.490865059411427e-05, + "loss": 0.026, + "step": 11260 + }, + { + "epoch": 6.559953434225844, + "grad_norm": 0.1886666715145111, + "learning_rate": 9.489652818928863e-05, + "loss": 0.0273, + "step": 11270 + }, + { + "epoch": 6.565774155995343, + "grad_norm": 0.2956908941268921, + "learning_rate": 9.488439214608315e-05, + "loss": 0.0385, + "step": 11280 + }, + { + "epoch": 6.571594877764843, + "grad_norm": 0.2793770730495453, + "learning_rate": 9.487224246818444e-05, + "loss": 0.0222, + "step": 11290 + }, + { + "epoch": 6.577415599534342, + "grad_norm": 0.2408176213502884, + "learning_rate": 9.486007915928325e-05, + "loss": 0.0204, + "step": 11300 + }, + { + "epoch": 6.583236321303842, + "grad_norm": 0.23099303245544434, + "learning_rate": 9.484790222307448e-05, + "loss": 0.0214, + "step": 11310 + }, + { + "epoch": 6.589057043073341, + "grad_norm": 0.1925797015428543, + "learning_rate": 9.483571166325716e-05, + "loss": 0.0297, + "step": 11320 + }, + { + "epoch": 6.59487776484284, + "grad_norm": 0.3428218960762024, + "learning_rate": 9.482350748353444e-05, + "loss": 0.0266, + "step": 11330 + }, + { + "epoch": 6.60069848661234, + "grad_norm": 0.33038535714149475, + "learning_rate": 9.481128968761363e-05, + "loss": 0.0251, + "step": 11340 + }, + { + "epoch": 6.606519208381839, + "grad_norm": 0.21122638881206512, + "learning_rate": 9.479905827920621e-05, + "loss": 0.0218, + "step": 11350 + }, + { + "epoch": 6.612339930151339, + "grad_norm": 0.3633985221385956, + "learning_rate": 9.478681326202773e-05, + "loss": 0.0278, + "step": 11360 + }, + { + "epoch": 6.618160651920838, + "grad_norm": 0.23546962440013885, + "learning_rate": 9.477455463979791e-05, + "loss": 0.0238, + "step": 11370 + }, + { + "epoch": 6.623981373690338, + "grad_norm": 0.22096098959445953, + "learning_rate": 9.476228241624059e-05, + "loss": 0.036, + "step": 11380 + }, + { + "epoch": 6.629802095459837, + "grad_norm": 0.3064873218536377, + "learning_rate": 9.474999659508374e-05, + "loss": 0.0238, + "step": 11390 + }, + { + "epoch": 6.635622817229336, + "grad_norm": 0.18581673502922058, + "learning_rate": 9.47376971800595e-05, + "loss": 0.0199, + "step": 11400 + }, + { + "epoch": 6.6414435389988355, + "grad_norm": 0.2144235074520111, + "learning_rate": 9.472538417490409e-05, + "loss": 0.0292, + "step": 11410 + }, + { + "epoch": 6.647264260768336, + "grad_norm": 0.24478232860565186, + "learning_rate": 9.471305758335784e-05, + "loss": 0.0287, + "step": 11420 + }, + { + "epoch": 6.653084982537835, + "grad_norm": 0.2165740430355072, + "learning_rate": 9.47007174091653e-05, + "loss": 0.0332, + "step": 11430 + }, + { + "epoch": 6.658905704307334, + "grad_norm": 0.33104971051216125, + "learning_rate": 9.468836365607507e-05, + "loss": 0.0308, + "step": 11440 + }, + { + "epoch": 6.664726426076833, + "grad_norm": 0.32205769419670105, + "learning_rate": 9.467599632783988e-05, + "loss": 0.0307, + "step": 11450 + }, + { + "epoch": 6.670547147846333, + "grad_norm": 0.3280721604824066, + "learning_rate": 9.466361542821662e-05, + "loss": 0.0296, + "step": 11460 + }, + { + "epoch": 6.676367869615833, + "grad_norm": 0.29177436232566833, + "learning_rate": 9.465122096096625e-05, + "loss": 0.0216, + "step": 11470 + }, + { + "epoch": 6.682188591385332, + "grad_norm": 0.2612217664718628, + "learning_rate": 9.463881292985391e-05, + "loss": 0.0272, + "step": 11480 + }, + { + "epoch": 6.688009313154831, + "grad_norm": 0.4537910223007202, + "learning_rate": 9.462639133864881e-05, + "loss": 0.0241, + "step": 11490 + }, + { + "epoch": 6.69383003492433, + "grad_norm": 0.27586713433265686, + "learning_rate": 9.461395619112432e-05, + "loss": 0.0254, + "step": 11500 + }, + { + "epoch": 6.69965075669383, + "grad_norm": 0.25583598017692566, + "learning_rate": 9.460150749105791e-05, + "loss": 0.0224, + "step": 11510 + }, + { + "epoch": 6.70547147846333, + "grad_norm": 0.3602408766746521, + "learning_rate": 9.458904524223116e-05, + "loss": 0.0263, + "step": 11520 + }, + { + "epoch": 6.711292200232829, + "grad_norm": 0.3116949200630188, + "learning_rate": 9.457656944842976e-05, + "loss": 0.0283, + "step": 11530 + }, + { + "epoch": 6.717112922002328, + "grad_norm": 0.20310455560684204, + "learning_rate": 9.456408011344353e-05, + "loss": 0.0226, + "step": 11540 + }, + { + "epoch": 6.722933643771828, + "grad_norm": 0.25106844305992126, + "learning_rate": 9.455157724106643e-05, + "loss": 0.0247, + "step": 11550 + }, + { + "epoch": 6.728754365541327, + "grad_norm": 0.14478684961795807, + "learning_rate": 9.453906083509647e-05, + "loss": 0.026, + "step": 11560 + }, + { + "epoch": 6.7345750873108265, + "grad_norm": 0.41387972235679626, + "learning_rate": 9.45265308993358e-05, + "loss": 0.021, + "step": 11570 + }, + { + "epoch": 6.740395809080326, + "grad_norm": 0.29446372389793396, + "learning_rate": 9.451398743759071e-05, + "loss": 0.0324, + "step": 11580 + }, + { + "epoch": 6.746216530849825, + "grad_norm": 0.2263680249452591, + "learning_rate": 9.450143045367156e-05, + "loss": 0.0208, + "step": 11590 + }, + { + "epoch": 6.752037252619325, + "grad_norm": 0.2598641514778137, + "learning_rate": 9.448885995139283e-05, + "loss": 0.0211, + "step": 11600 + }, + { + "epoch": 6.757857974388824, + "grad_norm": 0.27063506841659546, + "learning_rate": 9.44762759345731e-05, + "loss": 0.0215, + "step": 11610 + }, + { + "epoch": 6.7636786961583235, + "grad_norm": 0.356902539730072, + "learning_rate": 9.446367840703509e-05, + "loss": 0.0264, + "step": 11620 + }, + { + "epoch": 6.769499417927823, + "grad_norm": 0.26654040813446045, + "learning_rate": 9.445106737260556e-05, + "loss": 0.0209, + "step": 11630 + }, + { + "epoch": 6.775320139697323, + "grad_norm": 0.2726166546344757, + "learning_rate": 9.443844283511543e-05, + "loss": 0.0249, + "step": 11640 + }, + { + "epoch": 6.781140861466822, + "grad_norm": 0.31384870409965515, + "learning_rate": 9.442580479839968e-05, + "loss": 0.0277, + "step": 11650 + }, + { + "epoch": 6.786961583236321, + "grad_norm": 0.27978378534317017, + "learning_rate": 9.441315326629745e-05, + "loss": 0.0269, + "step": 11660 + }, + { + "epoch": 6.7927823050058205, + "grad_norm": 0.22928740084171295, + "learning_rate": 9.44004882426519e-05, + "loss": 0.0254, + "step": 11670 + }, + { + "epoch": 6.79860302677532, + "grad_norm": 0.2474094033241272, + "learning_rate": 9.438780973131037e-05, + "loss": 0.021, + "step": 11680 + }, + { + "epoch": 6.80442374854482, + "grad_norm": 0.25184667110443115, + "learning_rate": 9.437511773612423e-05, + "loss": 0.0309, + "step": 11690 + }, + { + "epoch": 6.810244470314319, + "grad_norm": 0.24806170165538788, + "learning_rate": 9.436241226094896e-05, + "loss": 0.0346, + "step": 11700 + }, + { + "epoch": 6.816065192083818, + "grad_norm": 0.15510109066963196, + "learning_rate": 9.434969330964418e-05, + "loss": 0.0231, + "step": 11710 + }, + { + "epoch": 6.821885913853317, + "grad_norm": 0.21203738451004028, + "learning_rate": 9.433696088607356e-05, + "loss": 0.0214, + "step": 11720 + }, + { + "epoch": 6.8277066356228175, + "grad_norm": 0.2472565472126007, + "learning_rate": 9.432421499410486e-05, + "loss": 0.0236, + "step": 11730 + }, + { + "epoch": 6.833527357392317, + "grad_norm": 0.2719338536262512, + "learning_rate": 9.431145563760998e-05, + "loss": 0.021, + "step": 11740 + }, + { + "epoch": 6.839348079161816, + "grad_norm": 0.35437387228012085, + "learning_rate": 9.429868282046484e-05, + "loss": 0.0312, + "step": 11750 + }, + { + "epoch": 6.845168800931315, + "grad_norm": 0.2552362084388733, + "learning_rate": 9.428589654654951e-05, + "loss": 0.0343, + "step": 11760 + }, + { + "epoch": 6.850989522700814, + "grad_norm": 0.2309507578611374, + "learning_rate": 9.42730968197481e-05, + "loss": 0.0253, + "step": 11770 + }, + { + "epoch": 6.8568102444703145, + "grad_norm": 0.31407445669174194, + "learning_rate": 9.426028364394883e-05, + "loss": 0.022, + "step": 11780 + }, + { + "epoch": 6.862630966239814, + "grad_norm": 0.32331952452659607, + "learning_rate": 9.424745702304402e-05, + "loss": 0.0251, + "step": 11790 + }, + { + "epoch": 6.868451688009313, + "grad_norm": 0.21886920928955078, + "learning_rate": 9.423461696093006e-05, + "loss": 0.032, + "step": 11800 + }, + { + "epoch": 6.874272409778813, + "grad_norm": 0.26566243171691895, + "learning_rate": 9.422176346150741e-05, + "loss": 0.0252, + "step": 11810 + }, + { + "epoch": 6.880093131548312, + "grad_norm": 0.3076276183128357, + "learning_rate": 9.420889652868063e-05, + "loss": 0.0295, + "step": 11820 + }, + { + "epoch": 6.8859138533178115, + "grad_norm": 0.16434115171432495, + "learning_rate": 9.419601616635836e-05, + "loss": 0.0262, + "step": 11830 + }, + { + "epoch": 6.891734575087311, + "grad_norm": 0.29159700870513916, + "learning_rate": 9.418312237845331e-05, + "loss": 0.0265, + "step": 11840 + }, + { + "epoch": 6.89755529685681, + "grad_norm": 0.41260161995887756, + "learning_rate": 9.417021516888225e-05, + "loss": 0.0263, + "step": 11850 + }, + { + "epoch": 6.90337601862631, + "grad_norm": 0.3377751410007477, + "learning_rate": 9.415729454156608e-05, + "loss": 0.024, + "step": 11860 + }, + { + "epoch": 6.909196740395809, + "grad_norm": 0.274707555770874, + "learning_rate": 9.414436050042973e-05, + "loss": 0.0274, + "step": 11870 + }, + { + "epoch": 6.915017462165308, + "grad_norm": 0.33443349599838257, + "learning_rate": 9.413141304940223e-05, + "loss": 0.0234, + "step": 11880 + }, + { + "epoch": 6.920838183934808, + "grad_norm": 0.3014127016067505, + "learning_rate": 9.411845219241666e-05, + "loss": 0.0222, + "step": 11890 + }, + { + "epoch": 6.926658905704308, + "grad_norm": 0.2917824685573578, + "learning_rate": 9.410547793341021e-05, + "loss": 0.027, + "step": 11900 + }, + { + "epoch": 6.932479627473807, + "grad_norm": 0.27402758598327637, + "learning_rate": 9.409249027632408e-05, + "loss": 0.0221, + "step": 11910 + }, + { + "epoch": 6.938300349243306, + "grad_norm": 0.26657167077064514, + "learning_rate": 9.407948922510362e-05, + "loss": 0.0239, + "step": 11920 + }, + { + "epoch": 6.944121071012805, + "grad_norm": 0.18335026502609253, + "learning_rate": 9.406647478369817e-05, + "loss": 0.0181, + "step": 11930 + }, + { + "epoch": 6.949941792782305, + "grad_norm": 0.24308784306049347, + "learning_rate": 9.405344695606118e-05, + "loss": 0.0237, + "step": 11940 + }, + { + "epoch": 6.955762514551805, + "grad_norm": 0.3533664345741272, + "learning_rate": 9.404040574615018e-05, + "loss": 0.0236, + "step": 11950 + }, + { + "epoch": 6.961583236321304, + "grad_norm": 0.24157826602458954, + "learning_rate": 9.402735115792674e-05, + "loss": 0.0256, + "step": 11960 + }, + { + "epoch": 6.967403958090803, + "grad_norm": 0.23687800765037537, + "learning_rate": 9.401428319535649e-05, + "loss": 0.0194, + "step": 11970 + }, + { + "epoch": 6.973224679860302, + "grad_norm": 0.2778899669647217, + "learning_rate": 9.400120186240912e-05, + "loss": 0.0254, + "step": 11980 + }, + { + "epoch": 6.9790454016298025, + "grad_norm": 0.31760475039482117, + "learning_rate": 9.398810716305844e-05, + "loss": 0.023, + "step": 11990 + }, + { + "epoch": 6.984866123399302, + "grad_norm": 0.21583762764930725, + "learning_rate": 9.397499910128222e-05, + "loss": 0.0255, + "step": 12000 + }, + { + "epoch": 6.990686845168801, + "grad_norm": 0.29714465141296387, + "learning_rate": 9.396187768106237e-05, + "loss": 0.0171, + "step": 12010 + }, + { + "epoch": 6.9965075669383, + "grad_norm": 0.244157612323761, + "learning_rate": 9.394874290638482e-05, + "loss": 0.0217, + "step": 12020 + }, + { + "epoch": 7.002328288707799, + "grad_norm": 0.3701115846633911, + "learning_rate": 9.393559478123959e-05, + "loss": 0.024, + "step": 12030 + }, + { + "epoch": 7.008149010477299, + "grad_norm": 0.26404130458831787, + "learning_rate": 9.39224333096207e-05, + "loss": 0.0238, + "step": 12040 + }, + { + "epoch": 7.013969732246799, + "grad_norm": 0.21147103607654572, + "learning_rate": 9.390925849552629e-05, + "loss": 0.0212, + "step": 12050 + }, + { + "epoch": 7.019790454016298, + "grad_norm": 0.20156490802764893, + "learning_rate": 9.389607034295849e-05, + "loss": 0.0188, + "step": 12060 + }, + { + "epoch": 7.025611175785797, + "grad_norm": 0.179017573595047, + "learning_rate": 9.388286885592355e-05, + "loss": 0.0301, + "step": 12070 + }, + { + "epoch": 7.031431897555297, + "grad_norm": 0.21227312088012695, + "learning_rate": 9.386965403843168e-05, + "loss": 0.0244, + "step": 12080 + }, + { + "epoch": 7.037252619324796, + "grad_norm": 0.2133495956659317, + "learning_rate": 9.385642589449726e-05, + "loss": 0.0256, + "step": 12090 + }, + { + "epoch": 7.043073341094296, + "grad_norm": 0.3033390939235687, + "learning_rate": 9.38431844281386e-05, + "loss": 0.0191, + "step": 12100 + }, + { + "epoch": 7.048894062863795, + "grad_norm": 0.28381866216659546, + "learning_rate": 9.38299296433781e-05, + "loss": 0.0246, + "step": 12110 + }, + { + "epoch": 7.054714784633295, + "grad_norm": 0.22959521412849426, + "learning_rate": 9.381666154424226e-05, + "loss": 0.0244, + "step": 12120 + }, + { + "epoch": 7.060535506402794, + "grad_norm": 0.27311161160469055, + "learning_rate": 9.380338013476157e-05, + "loss": 0.0224, + "step": 12130 + }, + { + "epoch": 7.066356228172293, + "grad_norm": 0.3107220232486725, + "learning_rate": 9.379008541897054e-05, + "loss": 0.0243, + "step": 12140 + }, + { + "epoch": 7.072176949941793, + "grad_norm": 0.2614183723926544, + "learning_rate": 9.377677740090777e-05, + "loss": 0.0285, + "step": 12150 + }, + { + "epoch": 7.077997671711292, + "grad_norm": 0.25340506434440613, + "learning_rate": 9.376345608461588e-05, + "loss": 0.0252, + "step": 12160 + }, + { + "epoch": 7.083818393480792, + "grad_norm": 0.20823338627815247, + "learning_rate": 9.375012147414155e-05, + "loss": 0.0234, + "step": 12170 + }, + { + "epoch": 7.089639115250291, + "grad_norm": 0.16885796189308167, + "learning_rate": 9.373677357353545e-05, + "loss": 0.0275, + "step": 12180 + }, + { + "epoch": 7.09545983701979, + "grad_norm": 0.23156988620758057, + "learning_rate": 9.372341238685237e-05, + "loss": 0.0237, + "step": 12190 + }, + { + "epoch": 7.1012805587892895, + "grad_norm": 0.33914387226104736, + "learning_rate": 9.371003791815102e-05, + "loss": 0.0226, + "step": 12200 + }, + { + "epoch": 7.10710128055879, + "grad_norm": 0.2942030429840088, + "learning_rate": 9.369665017149429e-05, + "loss": 0.0242, + "step": 12210 + }, + { + "epoch": 7.112922002328289, + "grad_norm": 0.13276977837085724, + "learning_rate": 9.368324915094895e-05, + "loss": 0.0198, + "step": 12220 + }, + { + "epoch": 7.118742724097788, + "grad_norm": 0.25390082597732544, + "learning_rate": 9.366983486058591e-05, + "loss": 0.026, + "step": 12230 + }, + { + "epoch": 7.124563445867287, + "grad_norm": 0.30192020535469055, + "learning_rate": 9.365640730448009e-05, + "loss": 0.0232, + "step": 12240 + }, + { + "epoch": 7.130384167636787, + "grad_norm": 0.25896891951560974, + "learning_rate": 9.36429664867104e-05, + "loss": 0.026, + "step": 12250 + }, + { + "epoch": 7.136204889406287, + "grad_norm": 0.287841796875, + "learning_rate": 9.362951241135982e-05, + "loss": 0.0262, + "step": 12260 + }, + { + "epoch": 7.142025611175786, + "grad_norm": 0.3116583228111267, + "learning_rate": 9.361604508251534e-05, + "loss": 0.0211, + "step": 12270 + }, + { + "epoch": 7.147846332945285, + "grad_norm": 0.24082635343074799, + "learning_rate": 9.360256450426799e-05, + "loss": 0.0249, + "step": 12280 + }, + { + "epoch": 7.153667054714784, + "grad_norm": 0.3035711646080017, + "learning_rate": 9.358907068071279e-05, + "loss": 0.0245, + "step": 12290 + }, + { + "epoch": 7.159487776484284, + "grad_norm": 0.20286443829536438, + "learning_rate": 9.357556361594882e-05, + "loss": 0.019, + "step": 12300 + }, + { + "epoch": 7.165308498253784, + "grad_norm": 0.32624852657318115, + "learning_rate": 9.356204331407917e-05, + "loss": 0.023, + "step": 12310 + }, + { + "epoch": 7.171129220023283, + "grad_norm": 0.25154784321784973, + "learning_rate": 9.354850977921094e-05, + "loss": 0.029, + "step": 12320 + }, + { + "epoch": 7.176949941792782, + "grad_norm": 0.32378092408180237, + "learning_rate": 9.353496301545529e-05, + "loss": 0.0254, + "step": 12330 + }, + { + "epoch": 7.182770663562282, + "grad_norm": 0.3103863596916199, + "learning_rate": 9.352140302692733e-05, + "loss": 0.0252, + "step": 12340 + }, + { + "epoch": 7.188591385331781, + "grad_norm": 0.2916843295097351, + "learning_rate": 9.350782981774627e-05, + "loss": 0.0291, + "step": 12350 + }, + { + "epoch": 7.1944121071012805, + "grad_norm": 0.20890837907791138, + "learning_rate": 9.349424339203526e-05, + "loss": 0.0252, + "step": 12360 + }, + { + "epoch": 7.20023282887078, + "grad_norm": 0.25206896662712097, + "learning_rate": 9.34806437539215e-05, + "loss": 0.0262, + "step": 12370 + }, + { + "epoch": 7.206053550640279, + "grad_norm": 0.16359388828277588, + "learning_rate": 9.346703090753622e-05, + "loss": 0.0301, + "step": 12380 + }, + { + "epoch": 7.211874272409779, + "grad_norm": 0.32168444991111755, + "learning_rate": 9.345340485701461e-05, + "loss": 0.0257, + "step": 12390 + }, + { + "epoch": 7.217694994179278, + "grad_norm": 0.1494053602218628, + "learning_rate": 9.343976560649595e-05, + "loss": 0.0221, + "step": 12400 + }, + { + "epoch": 7.2235157159487775, + "grad_norm": 0.2857935428619385, + "learning_rate": 9.342611316012344e-05, + "loss": 0.0212, + "step": 12410 + }, + { + "epoch": 7.229336437718277, + "grad_norm": 0.28637444972991943, + "learning_rate": 9.341244752204437e-05, + "loss": 0.0264, + "step": 12420 + }, + { + "epoch": 7.235157159487777, + "grad_norm": 0.23058266937732697, + "learning_rate": 9.339876869640995e-05, + "loss": 0.0206, + "step": 12430 + }, + { + "epoch": 7.240977881257276, + "grad_norm": 0.20427487790584564, + "learning_rate": 9.33850766873755e-05, + "loss": 0.0259, + "step": 12440 + }, + { + "epoch": 7.246798603026775, + "grad_norm": 0.40773093700408936, + "learning_rate": 9.337137149910028e-05, + "loss": 0.0239, + "step": 12450 + }, + { + "epoch": 7.2526193247962745, + "grad_norm": 0.2652283310890198, + "learning_rate": 9.335765313574753e-05, + "loss": 0.0196, + "step": 12460 + }, + { + "epoch": 7.258440046565774, + "grad_norm": 0.2563445568084717, + "learning_rate": 9.334392160148457e-05, + "loss": 0.0201, + "step": 12470 + }, + { + "epoch": 7.264260768335274, + "grad_norm": 0.26800528168678284, + "learning_rate": 9.333017690048264e-05, + "loss": 0.0247, + "step": 12480 + }, + { + "epoch": 7.270081490104773, + "grad_norm": 0.30935388803482056, + "learning_rate": 9.331641903691706e-05, + "loss": 0.024, + "step": 12490 + }, + { + "epoch": 7.275902211874272, + "grad_norm": 0.37470588088035583, + "learning_rate": 9.330264801496707e-05, + "loss": 0.0294, + "step": 12500 + }, + { + "epoch": 7.281722933643771, + "grad_norm": 0.2578244209289551, + "learning_rate": 9.328886383881594e-05, + "loss": 0.0241, + "step": 12510 + }, + { + "epoch": 7.2875436554132715, + "grad_norm": 0.1812494993209839, + "learning_rate": 9.327506651265095e-05, + "loss": 0.0262, + "step": 12520 + }, + { + "epoch": 7.293364377182771, + "grad_norm": 0.36474597454071045, + "learning_rate": 9.326125604066338e-05, + "loss": 0.0241, + "step": 12530 + }, + { + "epoch": 7.29918509895227, + "grad_norm": 0.16501429677009583, + "learning_rate": 9.324743242704847e-05, + "loss": 0.0217, + "step": 12540 + }, + { + "epoch": 7.305005820721769, + "grad_norm": 0.2531312108039856, + "learning_rate": 9.323359567600546e-05, + "loss": 0.025, + "step": 12550 + }, + { + "epoch": 7.310826542491269, + "grad_norm": 0.329582542181015, + "learning_rate": 9.321974579173761e-05, + "loss": 0.0225, + "step": 12560 + }, + { + "epoch": 7.3166472642607685, + "grad_norm": 0.2736565172672272, + "learning_rate": 9.320588277845213e-05, + "loss": 0.0299, + "step": 12570 + }, + { + "epoch": 7.322467986030268, + "grad_norm": 0.21355405449867249, + "learning_rate": 9.319200664036026e-05, + "loss": 0.0237, + "step": 12580 + }, + { + "epoch": 7.328288707799767, + "grad_norm": 0.28346315026283264, + "learning_rate": 9.31781173816772e-05, + "loss": 0.0201, + "step": 12590 + }, + { + "epoch": 7.334109429569267, + "grad_norm": 0.24780240654945374, + "learning_rate": 9.316421500662212e-05, + "loss": 0.0228, + "step": 12600 + }, + { + "epoch": 7.339930151338766, + "grad_norm": 0.25121599435806274, + "learning_rate": 9.31502995194182e-05, + "loss": 0.0213, + "step": 12610 + }, + { + "epoch": 7.3457508731082655, + "grad_norm": 0.230169877409935, + "learning_rate": 9.31363709242926e-05, + "loss": 0.0227, + "step": 12620 + }, + { + "epoch": 7.351571594877765, + "grad_norm": 0.2679348587989807, + "learning_rate": 9.312242922547647e-05, + "loss": 0.0162, + "step": 12630 + }, + { + "epoch": 7.357392316647264, + "grad_norm": 0.23028600215911865, + "learning_rate": 9.310847442720492e-05, + "loss": 0.0264, + "step": 12640 + }, + { + "epoch": 7.363213038416764, + "grad_norm": 0.1450856477022171, + "learning_rate": 9.309450653371706e-05, + "loss": 0.0201, + "step": 12650 + }, + { + "epoch": 7.369033760186263, + "grad_norm": 0.3687927722930908, + "learning_rate": 9.308052554925595e-05, + "loss": 0.0194, + "step": 12660 + }, + { + "epoch": 7.374854481955762, + "grad_norm": 0.1791687309741974, + "learning_rate": 9.306653147806867e-05, + "loss": 0.021, + "step": 12670 + }, + { + "epoch": 7.380675203725262, + "grad_norm": 0.3472365736961365, + "learning_rate": 9.305252432440622e-05, + "loss": 0.0244, + "step": 12680 + }, + { + "epoch": 7.386495925494762, + "grad_norm": 0.20824599266052246, + "learning_rate": 9.303850409252361e-05, + "loss": 0.0231, + "step": 12690 + }, + { + "epoch": 7.392316647264261, + "grad_norm": 0.274304062128067, + "learning_rate": 9.302447078667985e-05, + "loss": 0.0213, + "step": 12700 + }, + { + "epoch": 7.39813736903376, + "grad_norm": 0.4146553575992584, + "learning_rate": 9.301042441113783e-05, + "loss": 0.0233, + "step": 12710 + }, + { + "epoch": 7.403958090803259, + "grad_norm": 0.3186495900154114, + "learning_rate": 9.299636497016451e-05, + "loss": 0.0285, + "step": 12720 + }, + { + "epoch": 7.409778812572759, + "grad_norm": 0.29138556122779846, + "learning_rate": 9.298229246803076e-05, + "loss": 0.0276, + "step": 12730 + }, + { + "epoch": 7.415599534342259, + "grad_norm": 0.3049946129322052, + "learning_rate": 9.296820690901144e-05, + "loss": 0.0275, + "step": 12740 + }, + { + "epoch": 7.421420256111758, + "grad_norm": 0.20746977627277374, + "learning_rate": 9.295410829738539e-05, + "loss": 0.0239, + "step": 12750 + }, + { + "epoch": 7.427240977881257, + "grad_norm": 0.3811992108821869, + "learning_rate": 9.293999663743535e-05, + "loss": 0.0256, + "step": 12760 + }, + { + "epoch": 7.433061699650756, + "grad_norm": 0.2928377091884613, + "learning_rate": 9.292587193344813e-05, + "loss": 0.0289, + "step": 12770 + }, + { + "epoch": 7.4388824214202565, + "grad_norm": 0.35431766510009766, + "learning_rate": 9.291173418971437e-05, + "loss": 0.0247, + "step": 12780 + }, + { + "epoch": 7.444703143189756, + "grad_norm": 0.31566891074180603, + "learning_rate": 9.28975834105288e-05, + "loss": 0.0265, + "step": 12790 + }, + { + "epoch": 7.450523864959255, + "grad_norm": 0.29647257924079895, + "learning_rate": 9.288341960019004e-05, + "loss": 0.0235, + "step": 12800 + }, + { + "epoch": 7.456344586728754, + "grad_norm": 0.3299614191055298, + "learning_rate": 9.286924276300067e-05, + "loss": 0.0242, + "step": 12810 + }, + { + "epoch": 7.462165308498253, + "grad_norm": 0.27652695775032043, + "learning_rate": 9.285505290326726e-05, + "loss": 0.0233, + "step": 12820 + }, + { + "epoch": 7.467986030267753, + "grad_norm": 0.35573410987854004, + "learning_rate": 9.284085002530027e-05, + "loss": 0.0221, + "step": 12830 + }, + { + "epoch": 7.473806752037253, + "grad_norm": 0.30121248960494995, + "learning_rate": 9.282663413341422e-05, + "loss": 0.027, + "step": 12840 + }, + { + "epoch": 7.479627473806752, + "grad_norm": 0.27811360359191895, + "learning_rate": 9.281240523192747e-05, + "loss": 0.0225, + "step": 12850 + }, + { + "epoch": 7.485448195576251, + "grad_norm": 0.2144586443901062, + "learning_rate": 9.279816332516242e-05, + "loss": 0.02, + "step": 12860 + }, + { + "epoch": 7.491268917345751, + "grad_norm": 0.23108258843421936, + "learning_rate": 9.278390841744536e-05, + "loss": 0.0228, + "step": 12870 + }, + { + "epoch": 7.49708963911525, + "grad_norm": 0.25010591745376587, + "learning_rate": 9.276964051310658e-05, + "loss": 0.0221, + "step": 12880 + }, + { + "epoch": 7.50291036088475, + "grad_norm": 0.30408260226249695, + "learning_rate": 9.275535961648027e-05, + "loss": 0.0195, + "step": 12890 + }, + { + "epoch": 7.508731082654249, + "grad_norm": 0.24536876380443573, + "learning_rate": 9.274106573190459e-05, + "loss": 0.0216, + "step": 12900 + }, + { + "epoch": 7.514551804423749, + "grad_norm": 0.15839490294456482, + "learning_rate": 9.272675886372168e-05, + "loss": 0.0179, + "step": 12910 + }, + { + "epoch": 7.520372526193248, + "grad_norm": 0.20973947644233704, + "learning_rate": 9.271243901627754e-05, + "loss": 0.0239, + "step": 12920 + }, + { + "epoch": 7.526193247962747, + "grad_norm": 0.34919169545173645, + "learning_rate": 9.269810619392219e-05, + "loss": 0.0211, + "step": 12930 + }, + { + "epoch": 7.532013969732247, + "grad_norm": 0.321679025888443, + "learning_rate": 9.268376040100955e-05, + "loss": 0.0284, + "step": 12940 + }, + { + "epoch": 7.537834691501747, + "grad_norm": 0.3473064601421356, + "learning_rate": 9.266940164189752e-05, + "loss": 0.0232, + "step": 12950 + }, + { + "epoch": 7.543655413271246, + "grad_norm": 0.3174935579299927, + "learning_rate": 9.265502992094787e-05, + "loss": 0.0261, + "step": 12960 + }, + { + "epoch": 7.549476135040745, + "grad_norm": 0.21024583280086517, + "learning_rate": 9.264064524252638e-05, + "loss": 0.0199, + "step": 12970 + }, + { + "epoch": 7.555296856810244, + "grad_norm": 0.20113329589366913, + "learning_rate": 9.262624761100271e-05, + "loss": 0.0275, + "step": 12980 + }, + { + "epoch": 7.5611175785797435, + "grad_norm": 0.2653343081474304, + "learning_rate": 9.261183703075051e-05, + "loss": 0.025, + "step": 12990 + }, + { + "epoch": 7.566938300349244, + "grad_norm": 0.43966543674468994, + "learning_rate": 9.259741350614733e-05, + "loss": 0.0251, + "step": 13000 + }, + { + "epoch": 7.572759022118743, + "grad_norm": 0.24774815142154694, + "learning_rate": 9.258297704157464e-05, + "loss": 0.025, + "step": 13010 + }, + { + "epoch": 7.578579743888242, + "grad_norm": 0.2624939978122711, + "learning_rate": 9.256852764141786e-05, + "loss": 0.0248, + "step": 13020 + }, + { + "epoch": 7.584400465657741, + "grad_norm": 0.23326309025287628, + "learning_rate": 9.255406531006634e-05, + "loss": 0.0214, + "step": 13030 + }, + { + "epoch": 7.590221187427241, + "grad_norm": 0.35588496923446655, + "learning_rate": 9.253959005191335e-05, + "loss": 0.0307, + "step": 13040 + }, + { + "epoch": 7.596041909196741, + "grad_norm": 0.33853450417518616, + "learning_rate": 9.25251018713561e-05, + "loss": 0.0294, + "step": 13050 + }, + { + "epoch": 7.60186263096624, + "grad_norm": 0.19832202792167664, + "learning_rate": 9.251060077279571e-05, + "loss": 0.0216, + "step": 13060 + }, + { + "epoch": 7.607683352735739, + "grad_norm": 0.23937523365020752, + "learning_rate": 9.249608676063724e-05, + "loss": 0.0227, + "step": 13070 + }, + { + "epoch": 7.613504074505238, + "grad_norm": 0.3857629597187042, + "learning_rate": 9.248155983928964e-05, + "loss": 0.0242, + "step": 13080 + }, + { + "epoch": 7.619324796274738, + "grad_norm": 0.2988866865634918, + "learning_rate": 9.246702001316583e-05, + "loss": 0.0213, + "step": 13090 + }, + { + "epoch": 7.625145518044238, + "grad_norm": 0.19465553760528564, + "learning_rate": 9.245246728668262e-05, + "loss": 0.0273, + "step": 13100 + }, + { + "epoch": 7.630966239813737, + "grad_norm": 0.31351330876350403, + "learning_rate": 9.243790166426073e-05, + "loss": 0.0273, + "step": 13110 + }, + { + "epoch": 7.636786961583236, + "grad_norm": 0.3298444449901581, + "learning_rate": 9.242332315032484e-05, + "loss": 0.0239, + "step": 13120 + }, + { + "epoch": 7.642607683352736, + "grad_norm": 0.3190672993659973, + "learning_rate": 9.240873174930349e-05, + "loss": 0.0279, + "step": 13130 + }, + { + "epoch": 7.648428405122235, + "grad_norm": 0.26409608125686646, + "learning_rate": 9.239412746562917e-05, + "loss": 0.0284, + "step": 13140 + }, + { + "epoch": 7.6542491268917345, + "grad_norm": 0.21352924406528473, + "learning_rate": 9.237951030373828e-05, + "loss": 0.0321, + "step": 13150 + }, + { + "epoch": 7.660069848661234, + "grad_norm": 0.24274320900440216, + "learning_rate": 9.236488026807113e-05, + "loss": 0.03, + "step": 13160 + }, + { + "epoch": 7.665890570430733, + "grad_norm": 0.21439392864704132, + "learning_rate": 9.235023736307193e-05, + "loss": 0.0211, + "step": 13170 + }, + { + "epoch": 7.671711292200233, + "grad_norm": 0.27502337098121643, + "learning_rate": 9.233558159318881e-05, + "loss": 0.0234, + "step": 13180 + }, + { + "epoch": 7.677532013969732, + "grad_norm": 0.23655374348163605, + "learning_rate": 9.232091296287382e-05, + "loss": 0.0218, + "step": 13190 + }, + { + "epoch": 7.6833527357392315, + "grad_norm": 0.24719800055027008, + "learning_rate": 9.230623147658288e-05, + "loss": 0.0261, + "step": 13200 + }, + { + "epoch": 7.689173457508731, + "grad_norm": 0.2442905455827713, + "learning_rate": 9.229153713877586e-05, + "loss": 0.0214, + "step": 13210 + }, + { + "epoch": 7.694994179278231, + "grad_norm": 0.221347376704216, + "learning_rate": 9.227682995391649e-05, + "loss": 0.0275, + "step": 13220 + }, + { + "epoch": 7.70081490104773, + "grad_norm": 0.3515969216823578, + "learning_rate": 9.226210992647243e-05, + "loss": 0.0224, + "step": 13230 + }, + { + "epoch": 7.706635622817229, + "grad_norm": 0.27471962571144104, + "learning_rate": 9.224737706091525e-05, + "loss": 0.0192, + "step": 13240 + }, + { + "epoch": 7.7124563445867285, + "grad_norm": 0.29495474696159363, + "learning_rate": 9.223263136172039e-05, + "loss": 0.023, + "step": 13250 + }, + { + "epoch": 7.718277066356228, + "grad_norm": 0.2476419061422348, + "learning_rate": 9.22178728333672e-05, + "loss": 0.0286, + "step": 13260 + }, + { + "epoch": 7.724097788125728, + "grad_norm": 0.28568097949028015, + "learning_rate": 9.220310148033897e-05, + "loss": 0.0241, + "step": 13270 + }, + { + "epoch": 7.729918509895227, + "grad_norm": 0.3004738688468933, + "learning_rate": 9.21883173071228e-05, + "loss": 0.0218, + "step": 13280 + }, + { + "epoch": 7.735739231664726, + "grad_norm": 0.23009593784809113, + "learning_rate": 9.217352031820976e-05, + "loss": 0.0186, + "step": 13290 + }, + { + "epoch": 7.741559953434226, + "grad_norm": 0.2672450840473175, + "learning_rate": 9.215871051809477e-05, + "loss": 0.0229, + "step": 13300 + }, + { + "epoch": 7.7473806752037255, + "grad_norm": 0.24631567299365997, + "learning_rate": 9.214388791127666e-05, + "loss": 0.0203, + "step": 13310 + }, + { + "epoch": 7.753201396973225, + "grad_norm": 0.2504797875881195, + "learning_rate": 9.212905250225814e-05, + "loss": 0.019, + "step": 13320 + }, + { + "epoch": 7.759022118742724, + "grad_norm": 0.31730204820632935, + "learning_rate": 9.211420429554583e-05, + "loss": 0.0228, + "step": 13330 + }, + { + "epoch": 7.764842840512223, + "grad_norm": 0.18668071925640106, + "learning_rate": 9.209934329565022e-05, + "loss": 0.021, + "step": 13340 + }, + { + "epoch": 7.770663562281723, + "grad_norm": 0.23576927185058594, + "learning_rate": 9.208446950708568e-05, + "loss": 0.0249, + "step": 13350 + }, + { + "epoch": 7.7764842840512225, + "grad_norm": 0.25479912757873535, + "learning_rate": 9.20695829343705e-05, + "loss": 0.0199, + "step": 13360 + }, + { + "epoch": 7.782305005820722, + "grad_norm": 0.3787166476249695, + "learning_rate": 9.205468358202678e-05, + "loss": 0.0202, + "step": 13370 + }, + { + "epoch": 7.788125727590221, + "grad_norm": 0.24215295910835266, + "learning_rate": 9.203977145458059e-05, + "loss": 0.0237, + "step": 13380 + }, + { + "epoch": 7.793946449359721, + "grad_norm": 0.2919982969760895, + "learning_rate": 9.202484655656182e-05, + "loss": 0.0222, + "step": 13390 + }, + { + "epoch": 7.79976717112922, + "grad_norm": 0.33773073554039, + "learning_rate": 9.200990889250427e-05, + "loss": 0.0227, + "step": 13400 + }, + { + "epoch": 7.8055878928987195, + "grad_norm": 0.21378573775291443, + "learning_rate": 9.19949584669456e-05, + "loss": 0.0208, + "step": 13410 + }, + { + "epoch": 7.811408614668219, + "grad_norm": 0.34147047996520996, + "learning_rate": 9.197999528442738e-05, + "loss": 0.024, + "step": 13420 + }, + { + "epoch": 7.817229336437718, + "grad_norm": 0.16621002554893494, + "learning_rate": 9.196501934949499e-05, + "loss": 0.0216, + "step": 13430 + }, + { + "epoch": 7.823050058207218, + "grad_norm": 0.2648584544658661, + "learning_rate": 9.195003066669776e-05, + "loss": 0.0216, + "step": 13440 + }, + { + "epoch": 7.828870779976717, + "grad_norm": 0.2536657154560089, + "learning_rate": 9.193502924058884e-05, + "loss": 0.0213, + "step": 13450 + }, + { + "epoch": 7.834691501746216, + "grad_norm": 0.25927069783210754, + "learning_rate": 9.192001507572526e-05, + "loss": 0.024, + "step": 13460 + }, + { + "epoch": 7.840512223515716, + "grad_norm": 0.27341052889823914, + "learning_rate": 9.190498817666793e-05, + "loss": 0.0289, + "step": 13470 + }, + { + "epoch": 7.846332945285216, + "grad_norm": 0.22176331281661987, + "learning_rate": 9.188994854798163e-05, + "loss": 0.0199, + "step": 13480 + }, + { + "epoch": 7.852153667054715, + "grad_norm": 0.2061086893081665, + "learning_rate": 9.187489619423499e-05, + "loss": 0.0219, + "step": 13490 + }, + { + "epoch": 7.857974388824214, + "grad_norm": 0.3271821141242981, + "learning_rate": 9.185983112000056e-05, + "loss": 0.0209, + "step": 13500 + }, + { + "epoch": 7.863795110593713, + "grad_norm": 0.2290130853652954, + "learning_rate": 9.184475332985464e-05, + "loss": 0.0196, + "step": 13510 + }, + { + "epoch": 7.869615832363213, + "grad_norm": 0.3378830552101135, + "learning_rate": 9.182966282837754e-05, + "loss": 0.0229, + "step": 13520 + }, + { + "epoch": 7.875436554132713, + "grad_norm": 0.34781986474990845, + "learning_rate": 9.18145596201533e-05, + "loss": 0.0202, + "step": 13530 + }, + { + "epoch": 7.881257275902212, + "grad_norm": 0.31604060530662537, + "learning_rate": 9.179944370976991e-05, + "loss": 0.0234, + "step": 13540 + }, + { + "epoch": 7.887077997671711, + "grad_norm": 0.22007212042808533, + "learning_rate": 9.178431510181918e-05, + "loss": 0.0196, + "step": 13550 + }, + { + "epoch": 7.89289871944121, + "grad_norm": 0.24924646317958832, + "learning_rate": 9.176917380089675e-05, + "loss": 0.0225, + "step": 13560 + }, + { + "epoch": 7.8987194412107105, + "grad_norm": 0.3291139602661133, + "learning_rate": 9.175401981160219e-05, + "loss": 0.0234, + "step": 13570 + }, + { + "epoch": 7.90454016298021, + "grad_norm": 0.35372069478034973, + "learning_rate": 9.173885313853885e-05, + "loss": 0.0211, + "step": 13580 + }, + { + "epoch": 7.910360884749709, + "grad_norm": 0.35640430450439453, + "learning_rate": 9.172367378631398e-05, + "loss": 0.0262, + "step": 13590 + }, + { + "epoch": 7.916181606519208, + "grad_norm": 0.27234023809432983, + "learning_rate": 9.170848175953866e-05, + "loss": 0.0275, + "step": 13600 + }, + { + "epoch": 7.922002328288707, + "grad_norm": 0.25959840416908264, + "learning_rate": 9.169327706282784e-05, + "loss": 0.0195, + "step": 13610 + }, + { + "epoch": 7.927823050058207, + "grad_norm": 0.19904853403568268, + "learning_rate": 9.167805970080029e-05, + "loss": 0.0184, + "step": 13620 + }, + { + "epoch": 7.933643771827707, + "grad_norm": 0.2955288589000702, + "learning_rate": 9.166282967807864e-05, + "loss": 0.0217, + "step": 13630 + }, + { + "epoch": 7.939464493597206, + "grad_norm": 0.24368837475776672, + "learning_rate": 9.16475869992894e-05, + "loss": 0.019, + "step": 13640 + }, + { + "epoch": 7.945285215366706, + "grad_norm": 0.244931161403656, + "learning_rate": 9.163233166906284e-05, + "loss": 0.0152, + "step": 13650 + }, + { + "epoch": 7.951105937136205, + "grad_norm": 0.2457057237625122, + "learning_rate": 9.161706369203317e-05, + "loss": 0.022, + "step": 13660 + }, + { + "epoch": 7.956926658905704, + "grad_norm": 0.2450215220451355, + "learning_rate": 9.16017830728384e-05, + "loss": 0.0161, + "step": 13670 + }, + { + "epoch": 7.962747380675204, + "grad_norm": 0.22628091275691986, + "learning_rate": 9.158648981612035e-05, + "loss": 0.0239, + "step": 13680 + }, + { + "epoch": 7.968568102444703, + "grad_norm": 0.18320415914058685, + "learning_rate": 9.157118392652472e-05, + "loss": 0.0204, + "step": 13690 + }, + { + "epoch": 7.974388824214203, + "grad_norm": 0.226125568151474, + "learning_rate": 9.155586540870104e-05, + "loss": 0.0189, + "step": 13700 + }, + { + "epoch": 7.980209545983702, + "grad_norm": 0.24334928393363953, + "learning_rate": 9.154053426730267e-05, + "loss": 0.0232, + "step": 13710 + }, + { + "epoch": 7.986030267753201, + "grad_norm": 0.2639826834201813, + "learning_rate": 9.15251905069868e-05, + "loss": 0.0272, + "step": 13720 + }, + { + "epoch": 7.991850989522701, + "grad_norm": 0.21234755218029022, + "learning_rate": 9.150983413241446e-05, + "loss": 0.0207, + "step": 13730 + }, + { + "epoch": 7.997671711292201, + "grad_norm": 0.25053393840789795, + "learning_rate": 9.149446514825051e-05, + "loss": 0.0267, + "step": 13740 + }, + { + "epoch": 8.0034924330617, + "grad_norm": 0.27223366498947144, + "learning_rate": 9.147908355916365e-05, + "loss": 0.0264, + "step": 13750 + }, + { + "epoch": 8.009313154831199, + "grad_norm": 0.19691060483455658, + "learning_rate": 9.146368936982642e-05, + "loss": 0.0275, + "step": 13760 + }, + { + "epoch": 8.015133876600698, + "grad_norm": 0.2137145698070526, + "learning_rate": 9.144828258491511e-05, + "loss": 0.0239, + "step": 13770 + }, + { + "epoch": 8.020954598370198, + "grad_norm": 0.13384635746479034, + "learning_rate": 9.143286320910996e-05, + "loss": 0.0197, + "step": 13780 + }, + { + "epoch": 8.026775320139697, + "grad_norm": 0.25321438908576965, + "learning_rate": 9.141743124709491e-05, + "loss": 0.0222, + "step": 13790 + }, + { + "epoch": 8.032596041909196, + "grad_norm": 0.21835900843143463, + "learning_rate": 9.140198670355784e-05, + "loss": 0.025, + "step": 13800 + }, + { + "epoch": 8.038416763678697, + "grad_norm": 0.23152929544448853, + "learning_rate": 9.138652958319034e-05, + "loss": 0.0266, + "step": 13810 + }, + { + "epoch": 8.044237485448196, + "grad_norm": 0.2525329291820526, + "learning_rate": 9.137105989068791e-05, + "loss": 0.0203, + "step": 13820 + }, + { + "epoch": 8.050058207217695, + "grad_norm": 0.27707648277282715, + "learning_rate": 9.135557763074983e-05, + "loss": 0.0312, + "step": 13830 + }, + { + "epoch": 8.055878928987195, + "grad_norm": 0.3007937967777252, + "learning_rate": 9.13400828080792e-05, + "loss": 0.028, + "step": 13840 + }, + { + "epoch": 8.061699650756694, + "grad_norm": 0.2595648765563965, + "learning_rate": 9.132457542738292e-05, + "loss": 0.0235, + "step": 13850 + }, + { + "epoch": 8.067520372526193, + "grad_norm": 0.17737482488155365, + "learning_rate": 9.130905549337174e-05, + "loss": 0.0189, + "step": 13860 + }, + { + "epoch": 8.073341094295692, + "grad_norm": 0.28783243894577026, + "learning_rate": 9.129352301076021e-05, + "loss": 0.0197, + "step": 13870 + }, + { + "epoch": 8.079161816065191, + "grad_norm": 0.1922224760055542, + "learning_rate": 9.127797798426668e-05, + "loss": 0.0222, + "step": 13880 + }, + { + "epoch": 8.08498253783469, + "grad_norm": 0.19056957960128784, + "learning_rate": 9.126242041861333e-05, + "loss": 0.0298, + "step": 13890 + }, + { + "epoch": 8.090803259604192, + "grad_norm": 0.23369261622428894, + "learning_rate": 9.124685031852611e-05, + "loss": 0.0286, + "step": 13900 + }, + { + "epoch": 8.09662398137369, + "grad_norm": 0.32423678040504456, + "learning_rate": 9.123126768873482e-05, + "loss": 0.0247, + "step": 13910 + }, + { + "epoch": 8.10244470314319, + "grad_norm": 0.2787069082260132, + "learning_rate": 9.121567253397308e-05, + "loss": 0.0223, + "step": 13920 + }, + { + "epoch": 8.10826542491269, + "grad_norm": 0.2249143421649933, + "learning_rate": 9.120006485897824e-05, + "loss": 0.0189, + "step": 13930 + }, + { + "epoch": 8.114086146682189, + "grad_norm": 0.22632119059562683, + "learning_rate": 9.118444466849152e-05, + "loss": 0.0244, + "step": 13940 + }, + { + "epoch": 8.119906868451688, + "grad_norm": 0.2620958089828491, + "learning_rate": 9.116881196725793e-05, + "loss": 0.0232, + "step": 13950 + }, + { + "epoch": 8.125727590221187, + "grad_norm": 0.29165858030319214, + "learning_rate": 9.115316676002627e-05, + "loss": 0.0256, + "step": 13960 + }, + { + "epoch": 8.131548311990686, + "grad_norm": 0.2873265743255615, + "learning_rate": 9.113750905154911e-05, + "loss": 0.0253, + "step": 13970 + }, + { + "epoch": 8.137369033760187, + "grad_norm": 0.16223548352718353, + "learning_rate": 9.112183884658289e-05, + "loss": 0.0268, + "step": 13980 + }, + { + "epoch": 8.143189755529686, + "grad_norm": 0.3325653374195099, + "learning_rate": 9.11061561498878e-05, + "loss": 0.0178, + "step": 13990 + }, + { + "epoch": 8.149010477299186, + "grad_norm": 0.3625556230545044, + "learning_rate": 9.109046096622779e-05, + "loss": 0.0255, + "step": 14000 + }, + { + "epoch": 8.154831199068685, + "grad_norm": 0.33800601959228516, + "learning_rate": 9.107475330037069e-05, + "loss": 0.0244, + "step": 14010 + }, + { + "epoch": 8.160651920838184, + "grad_norm": 0.3315700590610504, + "learning_rate": 9.105903315708806e-05, + "loss": 0.0194, + "step": 14020 + }, + { + "epoch": 8.166472642607683, + "grad_norm": 0.25055238604545593, + "learning_rate": 9.104330054115524e-05, + "loss": 0.019, + "step": 14030 + }, + { + "epoch": 8.172293364377182, + "grad_norm": 0.20662745833396912, + "learning_rate": 9.102755545735141e-05, + "loss": 0.0187, + "step": 14040 + }, + { + "epoch": 8.178114086146682, + "grad_norm": 0.22963719069957733, + "learning_rate": 9.10117979104595e-05, + "loss": 0.0213, + "step": 14050 + }, + { + "epoch": 8.18393480791618, + "grad_norm": 0.249906986951828, + "learning_rate": 9.099602790526624e-05, + "loss": 0.0244, + "step": 14060 + }, + { + "epoch": 8.189755529685682, + "grad_norm": 0.2959202527999878, + "learning_rate": 9.098024544656212e-05, + "loss": 0.0215, + "step": 14070 + }, + { + "epoch": 8.195576251455181, + "grad_norm": 0.27140650153160095, + "learning_rate": 9.096445053914148e-05, + "loss": 0.0276, + "step": 14080 + }, + { + "epoch": 8.20139697322468, + "grad_norm": 0.1712128072977066, + "learning_rate": 9.094864318780236e-05, + "loss": 0.0241, + "step": 14090 + }, + { + "epoch": 8.20721769499418, + "grad_norm": 0.401364266872406, + "learning_rate": 9.093282339734663e-05, + "loss": 0.0355, + "step": 14100 + }, + { + "epoch": 8.213038416763679, + "grad_norm": 0.3150787055492401, + "learning_rate": 9.091699117257992e-05, + "loss": 0.0211, + "step": 14110 + }, + { + "epoch": 8.218859138533178, + "grad_norm": 0.21389251947402954, + "learning_rate": 9.090114651831163e-05, + "loss": 0.0201, + "step": 14120 + }, + { + "epoch": 8.224679860302677, + "grad_norm": 0.2635035514831543, + "learning_rate": 9.088528943935497e-05, + "loss": 0.0269, + "step": 14130 + }, + { + "epoch": 8.230500582072176, + "grad_norm": 0.15357570350170135, + "learning_rate": 9.086941994052689e-05, + "loss": 0.0194, + "step": 14140 + }, + { + "epoch": 8.236321303841676, + "grad_norm": 0.2604835331439972, + "learning_rate": 9.085353802664813e-05, + "loss": 0.0201, + "step": 14150 + }, + { + "epoch": 8.242142025611177, + "grad_norm": 0.27398908138275146, + "learning_rate": 9.08376437025432e-05, + "loss": 0.0211, + "step": 14160 + }, + { + "epoch": 8.247962747380676, + "grad_norm": 0.3056034445762634, + "learning_rate": 9.082173697304035e-05, + "loss": 0.0226, + "step": 14170 + }, + { + "epoch": 8.253783469150175, + "grad_norm": 0.1958601176738739, + "learning_rate": 9.080581784297166e-05, + "loss": 0.0152, + "step": 14180 + }, + { + "epoch": 8.259604190919674, + "grad_norm": 0.23957321047782898, + "learning_rate": 9.078988631717291e-05, + "loss": 0.0205, + "step": 14190 + }, + { + "epoch": 8.265424912689173, + "grad_norm": 0.22455406188964844, + "learning_rate": 9.077394240048369e-05, + "loss": 0.0203, + "step": 14200 + }, + { + "epoch": 8.271245634458673, + "grad_norm": 0.1841709464788437, + "learning_rate": 9.075798609774736e-05, + "loss": 0.0235, + "step": 14210 + }, + { + "epoch": 8.277066356228172, + "grad_norm": 0.24242432415485382, + "learning_rate": 9.0742017413811e-05, + "loss": 0.0197, + "step": 14220 + }, + { + "epoch": 8.282887077997671, + "grad_norm": 0.2507302463054657, + "learning_rate": 9.072603635352548e-05, + "loss": 0.0209, + "step": 14230 + }, + { + "epoch": 8.28870779976717, + "grad_norm": 0.2077583521604538, + "learning_rate": 9.071004292174541e-05, + "loss": 0.0217, + "step": 14240 + }, + { + "epoch": 8.294528521536671, + "grad_norm": 0.20055702328681946, + "learning_rate": 9.06940371233292e-05, + "loss": 0.0168, + "step": 14250 + }, + { + "epoch": 8.30034924330617, + "grad_norm": 0.18970724940299988, + "learning_rate": 9.067801896313898e-05, + "loss": 0.0214, + "step": 14260 + }, + { + "epoch": 8.30616996507567, + "grad_norm": 0.2697969377040863, + "learning_rate": 9.066198844604064e-05, + "loss": 0.0217, + "step": 14270 + }, + { + "epoch": 8.311990686845169, + "grad_norm": 0.2723968029022217, + "learning_rate": 9.06459455769038e-05, + "loss": 0.0244, + "step": 14280 + }, + { + "epoch": 8.317811408614668, + "grad_norm": 0.3367982506752014, + "learning_rate": 9.062989036060193e-05, + "loss": 0.0311, + "step": 14290 + }, + { + "epoch": 8.323632130384167, + "grad_norm": 0.16677655279636383, + "learning_rate": 9.061382280201212e-05, + "loss": 0.0194, + "step": 14300 + }, + { + "epoch": 8.329452852153667, + "grad_norm": 0.249659463763237, + "learning_rate": 9.059774290601528e-05, + "loss": 0.031, + "step": 14310 + }, + { + "epoch": 8.335273573923166, + "grad_norm": 0.2050747573375702, + "learning_rate": 9.058165067749606e-05, + "loss": 0.0223, + "step": 14320 + }, + { + "epoch": 8.341094295692667, + "grad_norm": 0.2179233431816101, + "learning_rate": 9.056554612134288e-05, + "loss": 0.0244, + "step": 14330 + }, + { + "epoch": 8.346915017462166, + "grad_norm": 0.2939705550670624, + "learning_rate": 9.054942924244785e-05, + "loss": 0.0212, + "step": 14340 + }, + { + "epoch": 8.352735739231665, + "grad_norm": 0.22675907611846924, + "learning_rate": 9.053330004570686e-05, + "loss": 0.0212, + "step": 14350 + }, + { + "epoch": 8.358556461001164, + "grad_norm": 0.23904305696487427, + "learning_rate": 9.051715853601955e-05, + "loss": 0.0215, + "step": 14360 + }, + { + "epoch": 8.364377182770664, + "grad_norm": 0.25281187891960144, + "learning_rate": 9.050100471828926e-05, + "loss": 0.0257, + "step": 14370 + }, + { + "epoch": 8.370197904540163, + "grad_norm": 0.32703647017478943, + "learning_rate": 9.048483859742311e-05, + "loss": 0.0259, + "step": 14380 + }, + { + "epoch": 8.376018626309662, + "grad_norm": 0.2663521468639374, + "learning_rate": 9.046866017833193e-05, + "loss": 0.0181, + "step": 14390 + }, + { + "epoch": 8.381839348079161, + "grad_norm": 0.2230817824602127, + "learning_rate": 9.045246946593029e-05, + "loss": 0.0224, + "step": 14400 + }, + { + "epoch": 8.38766006984866, + "grad_norm": 0.23420964181423187, + "learning_rate": 9.043626646513652e-05, + "loss": 0.0193, + "step": 14410 + }, + { + "epoch": 8.39348079161816, + "grad_norm": 0.17331963777542114, + "learning_rate": 9.042005118087267e-05, + "loss": 0.0182, + "step": 14420 + }, + { + "epoch": 8.39930151338766, + "grad_norm": 0.23710450530052185, + "learning_rate": 9.040382361806448e-05, + "loss": 0.0263, + "step": 14430 + }, + { + "epoch": 8.40512223515716, + "grad_norm": 0.19672775268554688, + "learning_rate": 9.038758378164148e-05, + "loss": 0.0281, + "step": 14440 + }, + { + "epoch": 8.41094295692666, + "grad_norm": 0.19275687634944916, + "learning_rate": 9.037133167653691e-05, + "loss": 0.0201, + "step": 14450 + }, + { + "epoch": 8.416763678696158, + "grad_norm": 0.24649089574813843, + "learning_rate": 9.035506730768771e-05, + "loss": 0.0222, + "step": 14460 + }, + { + "epoch": 8.422584400465658, + "grad_norm": 0.24669769406318665, + "learning_rate": 9.033879068003458e-05, + "loss": 0.0189, + "step": 14470 + }, + { + "epoch": 8.428405122235157, + "grad_norm": 0.25613951683044434, + "learning_rate": 9.032250179852193e-05, + "loss": 0.0223, + "step": 14480 + }, + { + "epoch": 8.434225844004656, + "grad_norm": 0.23116756975650787, + "learning_rate": 9.030620066809787e-05, + "loss": 0.0204, + "step": 14490 + }, + { + "epoch": 8.440046565774155, + "grad_norm": 0.2117726057767868, + "learning_rate": 9.028988729371428e-05, + "loss": 0.0189, + "step": 14500 + }, + { + "epoch": 8.445867287543656, + "grad_norm": 0.2669698894023895, + "learning_rate": 9.027356168032673e-05, + "loss": 0.023, + "step": 14510 + }, + { + "epoch": 8.451688009313155, + "grad_norm": 0.3277817368507385, + "learning_rate": 9.02572238328945e-05, + "loss": 0.0221, + "step": 14520 + }, + { + "epoch": 8.457508731082655, + "grad_norm": 0.21833890676498413, + "learning_rate": 9.02408737563806e-05, + "loss": 0.0211, + "step": 14530 + }, + { + "epoch": 8.463329452852154, + "grad_norm": 0.21126294136047363, + "learning_rate": 9.022451145575174e-05, + "loss": 0.0213, + "step": 14540 + }, + { + "epoch": 8.469150174621653, + "grad_norm": 0.27639397978782654, + "learning_rate": 9.02081369359784e-05, + "loss": 0.0198, + "step": 14550 + }, + { + "epoch": 8.474970896391152, + "grad_norm": 0.26042860746383667, + "learning_rate": 9.019175020203465e-05, + "loss": 0.0187, + "step": 14560 + }, + { + "epoch": 8.480791618160652, + "grad_norm": 0.33776620030403137, + "learning_rate": 9.017535125889842e-05, + "loss": 0.0187, + "step": 14570 + }, + { + "epoch": 8.48661233993015, + "grad_norm": 0.22651219367980957, + "learning_rate": 9.015894011155124e-05, + "loss": 0.0223, + "step": 14580 + }, + { + "epoch": 8.49243306169965, + "grad_norm": 0.3160834312438965, + "learning_rate": 9.014251676497838e-05, + "loss": 0.0285, + "step": 14590 + }, + { + "epoch": 8.498253783469151, + "grad_norm": 0.17219653725624084, + "learning_rate": 9.012608122416884e-05, + "loss": 0.0227, + "step": 14600 + }, + { + "epoch": 8.50407450523865, + "grad_norm": 0.2375258505344391, + "learning_rate": 9.010963349411529e-05, + "loss": 0.0254, + "step": 14610 + }, + { + "epoch": 8.50989522700815, + "grad_norm": 0.21157078444957733, + "learning_rate": 9.00931735798141e-05, + "loss": 0.0239, + "step": 14620 + }, + { + "epoch": 8.515715948777649, + "grad_norm": 0.24351295828819275, + "learning_rate": 9.00767014862654e-05, + "loss": 0.0194, + "step": 14630 + }, + { + "epoch": 8.521536670547148, + "grad_norm": 0.264281302690506, + "learning_rate": 9.006021721847295e-05, + "loss": 0.0198, + "step": 14640 + }, + { + "epoch": 8.527357392316647, + "grad_norm": 0.2928541898727417, + "learning_rate": 9.004372078144423e-05, + "loss": 0.0199, + "step": 14650 + }, + { + "epoch": 8.533178114086146, + "grad_norm": 0.1825636774301529, + "learning_rate": 9.002721218019043e-05, + "loss": 0.022, + "step": 14660 + }, + { + "epoch": 8.538998835855645, + "grad_norm": 0.24701450765132904, + "learning_rate": 9.001069141972642e-05, + "loss": 0.0201, + "step": 14670 + }, + { + "epoch": 8.544819557625146, + "grad_norm": 0.19037684798240662, + "learning_rate": 8.99941585050708e-05, + "loss": 0.0207, + "step": 14680 + }, + { + "epoch": 8.550640279394646, + "grad_norm": 0.3048546016216278, + "learning_rate": 8.997761344124578e-05, + "loss": 0.0211, + "step": 14690 + }, + { + "epoch": 8.556461001164145, + "grad_norm": 0.24802452325820923, + "learning_rate": 8.996105623327737e-05, + "loss": 0.0213, + "step": 14700 + }, + { + "epoch": 8.562281722933644, + "grad_norm": 0.2780308425426483, + "learning_rate": 8.994448688619517e-05, + "loss": 0.0199, + "step": 14710 + }, + { + "epoch": 8.568102444703143, + "grad_norm": 0.20404139161109924, + "learning_rate": 8.992790540503253e-05, + "loss": 0.0207, + "step": 14720 + }, + { + "epoch": 8.573923166472643, + "grad_norm": 0.2326977699995041, + "learning_rate": 8.991131179482648e-05, + "loss": 0.0183, + "step": 14730 + }, + { + "epoch": 8.579743888242142, + "grad_norm": 0.22341932356357574, + "learning_rate": 8.989470606061768e-05, + "loss": 0.0214, + "step": 14740 + }, + { + "epoch": 8.585564610011641, + "grad_norm": 0.24191319942474365, + "learning_rate": 8.987808820745056e-05, + "loss": 0.0223, + "step": 14750 + }, + { + "epoch": 8.59138533178114, + "grad_norm": 0.2633685767650604, + "learning_rate": 8.986145824037315e-05, + "loss": 0.0183, + "step": 14760 + }, + { + "epoch": 8.59720605355064, + "grad_norm": 0.1912059187889099, + "learning_rate": 8.984481616443721e-05, + "loss": 0.0267, + "step": 14770 + }, + { + "epoch": 8.60302677532014, + "grad_norm": 0.14146623015403748, + "learning_rate": 8.982816198469815e-05, + "loss": 0.0196, + "step": 14780 + }, + { + "epoch": 8.60884749708964, + "grad_norm": 0.20309817790985107, + "learning_rate": 8.98114957062151e-05, + "loss": 0.022, + "step": 14790 + }, + { + "epoch": 8.614668218859139, + "grad_norm": 0.20561538636684418, + "learning_rate": 8.97948173340508e-05, + "loss": 0.0189, + "step": 14800 + }, + { + "epoch": 8.620488940628638, + "grad_norm": 0.3238257169723511, + "learning_rate": 8.977812687327172e-05, + "loss": 0.0153, + "step": 14810 + }, + { + "epoch": 8.626309662398137, + "grad_norm": 0.2511746883392334, + "learning_rate": 8.976142432894798e-05, + "loss": 0.021, + "step": 14820 + }, + { + "epoch": 8.632130384167636, + "grad_norm": 0.17523956298828125, + "learning_rate": 8.974470970615336e-05, + "loss": 0.0218, + "step": 14830 + }, + { + "epoch": 8.637951105937136, + "grad_norm": 0.24116192758083344, + "learning_rate": 8.972798300996534e-05, + "loss": 0.0227, + "step": 14840 + }, + { + "epoch": 8.643771827706635, + "grad_norm": 0.21298716962337494, + "learning_rate": 8.971124424546504e-05, + "loss": 0.0183, + "step": 14850 + }, + { + "epoch": 8.649592549476136, + "grad_norm": 0.23332016170024872, + "learning_rate": 8.969449341773724e-05, + "loss": 0.0218, + "step": 14860 + }, + { + "epoch": 8.655413271245635, + "grad_norm": 0.28009381890296936, + "learning_rate": 8.967773053187042e-05, + "loss": 0.0246, + "step": 14870 + }, + { + "epoch": 8.661233993015134, + "grad_norm": 0.25546181201934814, + "learning_rate": 8.966095559295668e-05, + "loss": 0.0191, + "step": 14880 + }, + { + "epoch": 8.667054714784634, + "grad_norm": 0.31030163168907166, + "learning_rate": 8.964416860609184e-05, + "loss": 0.0201, + "step": 14890 + }, + { + "epoch": 8.672875436554133, + "grad_norm": 0.2188451886177063, + "learning_rate": 8.962736957637532e-05, + "loss": 0.0172, + "step": 14900 + }, + { + "epoch": 8.678696158323632, + "grad_norm": 0.25942128896713257, + "learning_rate": 8.96105585089102e-05, + "loss": 0.0165, + "step": 14910 + }, + { + "epoch": 8.684516880093131, + "grad_norm": 0.2785361707210541, + "learning_rate": 8.959373540880329e-05, + "loss": 0.0316, + "step": 14920 + }, + { + "epoch": 8.69033760186263, + "grad_norm": 0.2492259293794632, + "learning_rate": 8.957690028116495e-05, + "loss": 0.0225, + "step": 14930 + }, + { + "epoch": 8.69615832363213, + "grad_norm": 0.19436213374137878, + "learning_rate": 8.956005313110928e-05, + "loss": 0.0205, + "step": 14940 + }, + { + "epoch": 8.70197904540163, + "grad_norm": 0.25135931372642517, + "learning_rate": 8.9543193963754e-05, + "loss": 0.0242, + "step": 14950 + }, + { + "epoch": 8.70779976717113, + "grad_norm": 0.29891514778137207, + "learning_rate": 8.952632278422048e-05, + "loss": 0.0192, + "step": 14960 + }, + { + "epoch": 8.713620488940629, + "grad_norm": 0.21782360970973969, + "learning_rate": 8.95094395976337e-05, + "loss": 0.0239, + "step": 14970 + }, + { + "epoch": 8.719441210710128, + "grad_norm": 0.2099573165178299, + "learning_rate": 8.949254440912239e-05, + "loss": 0.022, + "step": 14980 + }, + { + "epoch": 8.725261932479627, + "grad_norm": 0.2256445437669754, + "learning_rate": 8.94756372238188e-05, + "loss": 0.0195, + "step": 14990 + }, + { + "epoch": 8.731082654249127, + "grad_norm": 0.3616402745246887, + "learning_rate": 8.945871804685892e-05, + "loss": 0.0202, + "step": 15000 + }, + { + "epoch": 8.736903376018626, + "grad_norm": 0.18924319744110107, + "learning_rate": 8.944178688338236e-05, + "loss": 0.0165, + "step": 15010 + }, + { + "epoch": 8.742724097788125, + "grad_norm": 0.3563123941421509, + "learning_rate": 8.942484373853233e-05, + "loss": 0.0165, + "step": 15020 + }, + { + "epoch": 8.748544819557626, + "grad_norm": 0.3443748652935028, + "learning_rate": 8.940788861745572e-05, + "loss": 0.0238, + "step": 15030 + }, + { + "epoch": 8.754365541327125, + "grad_norm": 0.2844447195529938, + "learning_rate": 8.939092152530308e-05, + "loss": 0.0218, + "step": 15040 + }, + { + "epoch": 8.760186263096625, + "grad_norm": 0.38421785831451416, + "learning_rate": 8.937394246722853e-05, + "loss": 0.0209, + "step": 15050 + }, + { + "epoch": 8.766006984866124, + "grad_norm": 0.18420103192329407, + "learning_rate": 8.935695144838984e-05, + "loss": 0.0258, + "step": 15060 + }, + { + "epoch": 8.771827706635623, + "grad_norm": 0.3234125077724457, + "learning_rate": 8.933994847394849e-05, + "loss": 0.0206, + "step": 15070 + }, + { + "epoch": 8.777648428405122, + "grad_norm": 0.24373885989189148, + "learning_rate": 8.932293354906949e-05, + "loss": 0.0166, + "step": 15080 + }, + { + "epoch": 8.783469150174621, + "grad_norm": 0.1901940554380417, + "learning_rate": 8.930590667892153e-05, + "loss": 0.0211, + "step": 15090 + }, + { + "epoch": 8.78928987194412, + "grad_norm": 0.23081248998641968, + "learning_rate": 8.928886786867696e-05, + "loss": 0.0185, + "step": 15100 + }, + { + "epoch": 8.79511059371362, + "grad_norm": 0.19061952829360962, + "learning_rate": 8.927181712351168e-05, + "loss": 0.0155, + "step": 15110 + }, + { + "epoch": 8.800931315483119, + "grad_norm": 0.26007023453712463, + "learning_rate": 8.925475444860527e-05, + "loss": 0.02, + "step": 15120 + }, + { + "epoch": 8.80675203725262, + "grad_norm": 0.20844946801662445, + "learning_rate": 8.923767984914092e-05, + "loss": 0.016, + "step": 15130 + }, + { + "epoch": 8.81257275902212, + "grad_norm": 0.22643472254276276, + "learning_rate": 8.922059333030545e-05, + "loss": 0.0248, + "step": 15140 + }, + { + "epoch": 8.818393480791618, + "grad_norm": 0.28464746475219727, + "learning_rate": 8.920349489728928e-05, + "loss": 0.0194, + "step": 15150 + }, + { + "epoch": 8.824214202561118, + "grad_norm": 0.26708585023880005, + "learning_rate": 8.918638455528646e-05, + "loss": 0.0263, + "step": 15160 + }, + { + "epoch": 8.830034924330617, + "grad_norm": 0.30318304896354675, + "learning_rate": 8.916926230949468e-05, + "loss": 0.0209, + "step": 15170 + }, + { + "epoch": 8.835855646100116, + "grad_norm": 0.2854878306388855, + "learning_rate": 8.915212816511522e-05, + "loss": 0.0189, + "step": 15180 + }, + { + "epoch": 8.841676367869615, + "grad_norm": 0.1845693141222, + "learning_rate": 8.913498212735296e-05, + "loss": 0.0226, + "step": 15190 + }, + { + "epoch": 8.847497089639115, + "grad_norm": 0.21604900062084198, + "learning_rate": 8.911782420141643e-05, + "loss": 0.0203, + "step": 15200 + }, + { + "epoch": 8.853317811408616, + "grad_norm": 0.21460247039794922, + "learning_rate": 8.910065439251775e-05, + "loss": 0.0165, + "step": 15210 + }, + { + "epoch": 8.859138533178115, + "grad_norm": 0.1519969254732132, + "learning_rate": 8.908347270587268e-05, + "loss": 0.0211, + "step": 15220 + }, + { + "epoch": 8.864959254947614, + "grad_norm": 0.2497864067554474, + "learning_rate": 8.906627914670054e-05, + "loss": 0.0206, + "step": 15230 + }, + { + "epoch": 8.870779976717113, + "grad_norm": 0.1607341766357422, + "learning_rate": 8.904907372022427e-05, + "loss": 0.0189, + "step": 15240 + }, + { + "epoch": 8.876600698486612, + "grad_norm": 0.28268200159072876, + "learning_rate": 8.903185643167042e-05, + "loss": 0.0214, + "step": 15250 + }, + { + "epoch": 8.882421420256112, + "grad_norm": 0.17582298815250397, + "learning_rate": 8.901462728626919e-05, + "loss": 0.0195, + "step": 15260 + }, + { + "epoch": 8.88824214202561, + "grad_norm": 0.2752760052680969, + "learning_rate": 8.899738628925429e-05, + "loss": 0.0228, + "step": 15270 + }, + { + "epoch": 8.89406286379511, + "grad_norm": 0.3029894530773163, + "learning_rate": 8.898013344586312e-05, + "loss": 0.018, + "step": 15280 + }, + { + "epoch": 8.89988358556461, + "grad_norm": 0.2733170986175537, + "learning_rate": 8.896286876133661e-05, + "loss": 0.0262, + "step": 15290 + }, + { + "epoch": 8.90570430733411, + "grad_norm": 0.27550560235977173, + "learning_rate": 8.894559224091933e-05, + "loss": 0.0197, + "step": 15300 + }, + { + "epoch": 8.91152502910361, + "grad_norm": 0.2392350137233734, + "learning_rate": 8.892830388985942e-05, + "loss": 0.0213, + "step": 15310 + }, + { + "epoch": 8.917345750873109, + "grad_norm": 0.2253786325454712, + "learning_rate": 8.891100371340864e-05, + "loss": 0.0192, + "step": 15320 + }, + { + "epoch": 8.923166472642608, + "grad_norm": 0.20418082177639008, + "learning_rate": 8.889369171682231e-05, + "loss": 0.022, + "step": 15330 + }, + { + "epoch": 8.928987194412107, + "grad_norm": 0.25059688091278076, + "learning_rate": 8.887636790535936e-05, + "loss": 0.0167, + "step": 15340 + }, + { + "epoch": 8.934807916181606, + "grad_norm": 0.20962026715278625, + "learning_rate": 8.885903228428231e-05, + "loss": 0.021, + "step": 15350 + }, + { + "epoch": 8.940628637951106, + "grad_norm": 0.27027055621147156, + "learning_rate": 8.884168485885727e-05, + "loss": 0.019, + "step": 15360 + }, + { + "epoch": 8.946449359720605, + "grad_norm": 0.20033429563045502, + "learning_rate": 8.882432563435393e-05, + "loss": 0.0201, + "step": 15370 + }, + { + "epoch": 8.952270081490104, + "grad_norm": 0.23400548100471497, + "learning_rate": 8.880695461604556e-05, + "loss": 0.0179, + "step": 15380 + }, + { + "epoch": 8.958090803259605, + "grad_norm": 0.21670618653297424, + "learning_rate": 8.878957180920901e-05, + "loss": 0.0183, + "step": 15390 + }, + { + "epoch": 8.963911525029104, + "grad_norm": 0.38295549154281616, + "learning_rate": 8.877217721912473e-05, + "loss": 0.0209, + "step": 15400 + }, + { + "epoch": 8.969732246798603, + "grad_norm": 0.24192175269126892, + "learning_rate": 8.875477085107673e-05, + "loss": 0.0209, + "step": 15410 + }, + { + "epoch": 8.975552968568103, + "grad_norm": 0.2784593999385834, + "learning_rate": 8.87373527103526e-05, + "loss": 0.0238, + "step": 15420 + }, + { + "epoch": 8.981373690337602, + "grad_norm": 0.2830581068992615, + "learning_rate": 8.871992280224353e-05, + "loss": 0.018, + "step": 15430 + }, + { + "epoch": 8.987194412107101, + "grad_norm": 0.25682422518730164, + "learning_rate": 8.870248113204422e-05, + "loss": 0.0204, + "step": 15440 + }, + { + "epoch": 8.9930151338766, + "grad_norm": 0.21552495658397675, + "learning_rate": 8.868502770505306e-05, + "loss": 0.0224, + "step": 15450 + }, + { + "epoch": 8.9988358556461, + "grad_norm": 0.19104106724262238, + "learning_rate": 8.86675625265719e-05, + "loss": 0.0243, + "step": 15460 + }, + { + "epoch": 9.004656577415599, + "grad_norm": 0.22790558636188507, + "learning_rate": 8.865008560190618e-05, + "loss": 0.0165, + "step": 15470 + }, + { + "epoch": 9.0104772991851, + "grad_norm": 0.2122763842344284, + "learning_rate": 8.863259693636496e-05, + "loss": 0.022, + "step": 15480 + }, + { + "epoch": 9.016298020954599, + "grad_norm": 0.2486647516489029, + "learning_rate": 8.861509653526083e-05, + "loss": 0.0192, + "step": 15490 + }, + { + "epoch": 9.022118742724098, + "grad_norm": 0.25469738245010376, + "learning_rate": 8.859758440390993e-05, + "loss": 0.0206, + "step": 15500 + }, + { + "epoch": 9.027939464493597, + "grad_norm": 0.1995510756969452, + "learning_rate": 8.858006054763202e-05, + "loss": 0.0193, + "step": 15510 + }, + { + "epoch": 9.033760186263097, + "grad_norm": 0.1838023066520691, + "learning_rate": 8.856252497175035e-05, + "loss": 0.0252, + "step": 15520 + }, + { + "epoch": 9.039580908032596, + "grad_norm": 0.11841829866170883, + "learning_rate": 8.854497768159178e-05, + "loss": 0.0142, + "step": 15530 + }, + { + "epoch": 9.045401629802095, + "grad_norm": 0.31696465611457825, + "learning_rate": 8.852741868248671e-05, + "loss": 0.0199, + "step": 15540 + }, + { + "epoch": 9.051222351571594, + "grad_norm": 0.3716218173503876, + "learning_rate": 8.85098479797691e-05, + "loss": 0.0246, + "step": 15550 + }, + { + "epoch": 9.057043073341095, + "grad_norm": 0.25987035036087036, + "learning_rate": 8.849226557877646e-05, + "loss": 0.0219, + "step": 15560 + }, + { + "epoch": 9.062863795110594, + "grad_norm": 0.25150054693222046, + "learning_rate": 8.84746714848499e-05, + "loss": 0.0216, + "step": 15570 + }, + { + "epoch": 9.068684516880094, + "grad_norm": 0.22338144481182098, + "learning_rate": 8.845706570333397e-05, + "loss": 0.0163, + "step": 15580 + }, + { + "epoch": 9.074505238649593, + "grad_norm": 0.22948530316352844, + "learning_rate": 8.84394482395769e-05, + "loss": 0.0204, + "step": 15590 + }, + { + "epoch": 9.080325960419092, + "grad_norm": 0.2115151286125183, + "learning_rate": 8.842181909893038e-05, + "loss": 0.0169, + "step": 15600 + }, + { + "epoch": 9.086146682188591, + "grad_norm": 0.18000079691410065, + "learning_rate": 8.840417828674969e-05, + "loss": 0.0209, + "step": 15610 + }, + { + "epoch": 9.09196740395809, + "grad_norm": 0.2117864340543747, + "learning_rate": 8.838652580839364e-05, + "loss": 0.0229, + "step": 15620 + }, + { + "epoch": 9.09778812572759, + "grad_norm": 0.2803856134414673, + "learning_rate": 8.836886166922458e-05, + "loss": 0.0309, + "step": 15630 + }, + { + "epoch": 9.103608847497089, + "grad_norm": 0.21040956676006317, + "learning_rate": 8.835118587460844e-05, + "loss": 0.0182, + "step": 15640 + }, + { + "epoch": 9.10942956926659, + "grad_norm": 0.2940482795238495, + "learning_rate": 8.83334984299146e-05, + "loss": 0.0248, + "step": 15650 + }, + { + "epoch": 9.115250291036089, + "grad_norm": 0.21012605726718903, + "learning_rate": 8.83157993405161e-05, + "loss": 0.0318, + "step": 15660 + }, + { + "epoch": 9.121071012805588, + "grad_norm": 0.25006750226020813, + "learning_rate": 8.829808861178943e-05, + "loss": 0.0209, + "step": 15670 + }, + { + "epoch": 9.126891734575088, + "grad_norm": 0.21175114810466766, + "learning_rate": 8.828036624911464e-05, + "loss": 0.025, + "step": 15680 + }, + { + "epoch": 9.132712456344587, + "grad_norm": 0.19374974071979523, + "learning_rate": 8.826263225787532e-05, + "loss": 0.02, + "step": 15690 + }, + { + "epoch": 9.138533178114086, + "grad_norm": 0.2710120975971222, + "learning_rate": 8.824488664345858e-05, + "loss": 0.0197, + "step": 15700 + }, + { + "epoch": 9.144353899883585, + "grad_norm": 0.24014879763126373, + "learning_rate": 8.822712941125508e-05, + "loss": 0.0156, + "step": 15710 + }, + { + "epoch": 9.150174621653084, + "grad_norm": 0.37185588479042053, + "learning_rate": 8.820936056665898e-05, + "loss": 0.0216, + "step": 15720 + }, + { + "epoch": 9.155995343422584, + "grad_norm": 0.24801112711429596, + "learning_rate": 8.819158011506801e-05, + "loss": 0.0201, + "step": 15730 + }, + { + "epoch": 9.161816065192085, + "grad_norm": 0.2487548589706421, + "learning_rate": 8.81737880618834e-05, + "loss": 0.0185, + "step": 15740 + }, + { + "epoch": 9.167636786961584, + "grad_norm": 0.2097630798816681, + "learning_rate": 8.815598441250987e-05, + "loss": 0.022, + "step": 15750 + }, + { + "epoch": 9.173457508731083, + "grad_norm": 0.2306274175643921, + "learning_rate": 8.813816917235576e-05, + "loss": 0.0198, + "step": 15760 + }, + { + "epoch": 9.179278230500582, + "grad_norm": 0.3132690191268921, + "learning_rate": 8.812034234683282e-05, + "loss": 0.0216, + "step": 15770 + }, + { + "epoch": 9.185098952270081, + "grad_norm": 0.21121828258037567, + "learning_rate": 8.810250394135637e-05, + "loss": 0.0206, + "step": 15780 + }, + { + "epoch": 9.19091967403958, + "grad_norm": 0.2720232903957367, + "learning_rate": 8.808465396134529e-05, + "loss": 0.0228, + "step": 15790 + }, + { + "epoch": 9.19674039580908, + "grad_norm": 0.243368461728096, + "learning_rate": 8.806679241222189e-05, + "loss": 0.0199, + "step": 15800 + }, + { + "epoch": 9.202561117578579, + "grad_norm": 0.2557641565799713, + "learning_rate": 8.804891929941203e-05, + "loss": 0.0234, + "step": 15810 + }, + { + "epoch": 9.208381839348078, + "grad_norm": 0.2167285978794098, + "learning_rate": 8.803103462834514e-05, + "loss": 0.0218, + "step": 15820 + }, + { + "epoch": 9.21420256111758, + "grad_norm": 0.2270018309354782, + "learning_rate": 8.801313840445408e-05, + "loss": 0.0203, + "step": 15830 + }, + { + "epoch": 9.220023282887079, + "grad_norm": 0.3168254792690277, + "learning_rate": 8.799523063317524e-05, + "loss": 0.0176, + "step": 15840 + }, + { + "epoch": 9.225844004656578, + "grad_norm": 0.15654808282852173, + "learning_rate": 8.797731131994854e-05, + "loss": 0.0192, + "step": 15850 + }, + { + "epoch": 9.231664726426077, + "grad_norm": 0.31418663263320923, + "learning_rate": 8.795938047021739e-05, + "loss": 0.0239, + "step": 15860 + }, + { + "epoch": 9.237485448195576, + "grad_norm": 0.17867417633533478, + "learning_rate": 8.794143808942872e-05, + "loss": 0.0188, + "step": 15870 + }, + { + "epoch": 9.243306169965075, + "grad_norm": 0.2379506379365921, + "learning_rate": 8.792348418303296e-05, + "loss": 0.0182, + "step": 15880 + }, + { + "epoch": 9.249126891734575, + "grad_norm": 0.20081186294555664, + "learning_rate": 8.790551875648398e-05, + "loss": 0.0238, + "step": 15890 + }, + { + "epoch": 9.254947613504074, + "grad_norm": 0.23522378504276276, + "learning_rate": 8.788754181523926e-05, + "loss": 0.0253, + "step": 15900 + }, + { + "epoch": 9.260768335273575, + "grad_norm": 0.19660773873329163, + "learning_rate": 8.78695533647597e-05, + "loss": 0.0252, + "step": 15910 + }, + { + "epoch": 9.266589057043074, + "grad_norm": 0.2194165587425232, + "learning_rate": 8.785155341050972e-05, + "loss": 0.0277, + "step": 15920 + }, + { + "epoch": 9.272409778812573, + "grad_norm": 0.20139029622077942, + "learning_rate": 8.783354195795721e-05, + "loss": 0.0223, + "step": 15930 + }, + { + "epoch": 9.278230500582072, + "grad_norm": 0.20834027230739594, + "learning_rate": 8.78155190125736e-05, + "loss": 0.0206, + "step": 15940 + }, + { + "epoch": 9.284051222351572, + "grad_norm": 0.30279043316841125, + "learning_rate": 8.779748457983378e-05, + "loss": 0.0256, + "step": 15950 + }, + { + "epoch": 9.28987194412107, + "grad_norm": 0.28141501545906067, + "learning_rate": 8.777943866521612e-05, + "loss": 0.0269, + "step": 15960 + }, + { + "epoch": 9.29569266589057, + "grad_norm": 0.2131347507238388, + "learning_rate": 8.77613812742025e-05, + "loss": 0.0188, + "step": 15970 + }, + { + "epoch": 9.30151338766007, + "grad_norm": 0.25764939188957214, + "learning_rate": 8.774331241227829e-05, + "loss": 0.0258, + "step": 15980 + }, + { + "epoch": 9.307334109429569, + "grad_norm": 0.2511085271835327, + "learning_rate": 8.772523208493232e-05, + "loss": 0.0173, + "step": 15990 + }, + { + "epoch": 9.31315483119907, + "grad_norm": 0.17234288156032562, + "learning_rate": 8.770714029765692e-05, + "loss": 0.0209, + "step": 16000 + }, + { + "epoch": 9.318975552968569, + "grad_norm": 0.3311176598072052, + "learning_rate": 8.768903705594789e-05, + "loss": 0.0218, + "step": 16010 + }, + { + "epoch": 9.324796274738068, + "grad_norm": 0.3135879933834076, + "learning_rate": 8.767092236530453e-05, + "loss": 0.0208, + "step": 16020 + }, + { + "epoch": 9.330616996507567, + "grad_norm": 0.24514348804950714, + "learning_rate": 8.76527962312296e-05, + "loss": 0.0236, + "step": 16030 + }, + { + "epoch": 9.336437718277066, + "grad_norm": 0.1922386735677719, + "learning_rate": 8.763465865922934e-05, + "loss": 0.0167, + "step": 16040 + }, + { + "epoch": 9.342258440046566, + "grad_norm": 0.17470096051692963, + "learning_rate": 8.761650965481347e-05, + "loss": 0.0219, + "step": 16050 + }, + { + "epoch": 9.348079161816065, + "grad_norm": 0.18018603324890137, + "learning_rate": 8.759834922349516e-05, + "loss": 0.0196, + "step": 16060 + }, + { + "epoch": 9.353899883585564, + "grad_norm": 0.2460308074951172, + "learning_rate": 8.758017737079108e-05, + "loss": 0.0206, + "step": 16070 + }, + { + "epoch": 9.359720605355063, + "grad_norm": 0.17295986413955688, + "learning_rate": 8.756199410222137e-05, + "loss": 0.0148, + "step": 16080 + }, + { + "epoch": 9.365541327124564, + "grad_norm": 0.18195827305316925, + "learning_rate": 8.754379942330963e-05, + "loss": 0.018, + "step": 16090 + }, + { + "epoch": 9.371362048894063, + "grad_norm": 0.17688904702663422, + "learning_rate": 8.75255933395829e-05, + "loss": 0.0167, + "step": 16100 + }, + { + "epoch": 9.377182770663563, + "grad_norm": 0.24497097730636597, + "learning_rate": 8.750737585657171e-05, + "loss": 0.0195, + "step": 16110 + }, + { + "epoch": 9.383003492433062, + "grad_norm": 0.23330028355121613, + "learning_rate": 8.748914697981008e-05, + "loss": 0.018, + "step": 16120 + }, + { + "epoch": 9.388824214202561, + "grad_norm": 0.18658243119716644, + "learning_rate": 8.747090671483542e-05, + "loss": 0.023, + "step": 16130 + }, + { + "epoch": 9.39464493597206, + "grad_norm": 0.1507173627614975, + "learning_rate": 8.745265506718869e-05, + "loss": 0.0142, + "step": 16140 + }, + { + "epoch": 9.40046565774156, + "grad_norm": 0.1942693442106247, + "learning_rate": 8.74343920424142e-05, + "loss": 0.018, + "step": 16150 + }, + { + "epoch": 9.406286379511059, + "grad_norm": 0.2796126902103424, + "learning_rate": 8.741611764605982e-05, + "loss": 0.0212, + "step": 16160 + }, + { + "epoch": 9.412107101280558, + "grad_norm": 0.14071056246757507, + "learning_rate": 8.739783188367682e-05, + "loss": 0.0186, + "step": 16170 + }, + { + "epoch": 9.417927823050059, + "grad_norm": 0.3310302495956421, + "learning_rate": 8.737953476081991e-05, + "loss": 0.0171, + "step": 16180 + }, + { + "epoch": 9.423748544819558, + "grad_norm": 0.2709824740886688, + "learning_rate": 8.73612262830473e-05, + "loss": 0.019, + "step": 16190 + }, + { + "epoch": 9.429569266589057, + "grad_norm": 0.21613867580890656, + "learning_rate": 8.734290645592061e-05, + "loss": 0.0161, + "step": 16200 + }, + { + "epoch": 9.435389988358557, + "grad_norm": 0.19773273169994354, + "learning_rate": 8.732457528500493e-05, + "loss": 0.0184, + "step": 16210 + }, + { + "epoch": 9.441210710128056, + "grad_norm": 0.264499306678772, + "learning_rate": 8.730623277586875e-05, + "loss": 0.0205, + "step": 16220 + }, + { + "epoch": 9.447031431897555, + "grad_norm": 0.21206548810005188, + "learning_rate": 8.72878789340841e-05, + "loss": 0.0195, + "step": 16230 + }, + { + "epoch": 9.452852153667054, + "grad_norm": 0.1923251897096634, + "learning_rate": 8.726951376522635e-05, + "loss": 0.0229, + "step": 16240 + }, + { + "epoch": 9.458672875436553, + "grad_norm": 0.2915855348110199, + "learning_rate": 8.725113727487435e-05, + "loss": 0.0185, + "step": 16250 + }, + { + "epoch": 9.464493597206054, + "grad_norm": 0.11557412892580032, + "learning_rate": 8.723274946861042e-05, + "loss": 0.0155, + "step": 16260 + }, + { + "epoch": 9.470314318975554, + "grad_norm": 0.2466229945421219, + "learning_rate": 8.721435035202026e-05, + "loss": 0.0175, + "step": 16270 + }, + { + "epoch": 9.476135040745053, + "grad_norm": 0.24574153125286102, + "learning_rate": 8.719593993069306e-05, + "loss": 0.0296, + "step": 16280 + }, + { + "epoch": 9.481955762514552, + "grad_norm": 0.2988555133342743, + "learning_rate": 8.717751821022139e-05, + "loss": 0.0216, + "step": 16290 + }, + { + "epoch": 9.487776484284051, + "grad_norm": 0.2302100658416748, + "learning_rate": 8.715908519620134e-05, + "loss": 0.0204, + "step": 16300 + }, + { + "epoch": 9.49359720605355, + "grad_norm": 0.2789667844772339, + "learning_rate": 8.71406408942323e-05, + "loss": 0.0215, + "step": 16310 + }, + { + "epoch": 9.49941792782305, + "grad_norm": 0.17890651524066925, + "learning_rate": 8.712218530991723e-05, + "loss": 0.0178, + "step": 16320 + }, + { + "epoch": 9.505238649592549, + "grad_norm": 0.29136940836906433, + "learning_rate": 8.710371844886241e-05, + "loss": 0.0186, + "step": 16330 + }, + { + "epoch": 9.511059371362048, + "grad_norm": 0.27167943120002747, + "learning_rate": 8.708524031667758e-05, + "loss": 0.0223, + "step": 16340 + }, + { + "epoch": 9.516880093131547, + "grad_norm": 0.26290684938430786, + "learning_rate": 8.706675091897592e-05, + "loss": 0.0231, + "step": 16350 + }, + { + "epoch": 9.522700814901048, + "grad_norm": 0.23498797416687012, + "learning_rate": 8.704825026137404e-05, + "loss": 0.0217, + "step": 16360 + }, + { + "epoch": 9.528521536670548, + "grad_norm": 0.18929556012153625, + "learning_rate": 8.702973834949192e-05, + "loss": 0.0191, + "step": 16370 + }, + { + "epoch": 9.534342258440047, + "grad_norm": 0.20051266252994537, + "learning_rate": 8.701121518895301e-05, + "loss": 0.0177, + "step": 16380 + }, + { + "epoch": 9.540162980209546, + "grad_norm": 0.22873622179031372, + "learning_rate": 8.699268078538414e-05, + "loss": 0.0205, + "step": 16390 + }, + { + "epoch": 9.545983701979045, + "grad_norm": 0.14724905788898468, + "learning_rate": 8.69741351444156e-05, + "loss": 0.015, + "step": 16400 + }, + { + "epoch": 9.551804423748544, + "grad_norm": 0.2675820291042328, + "learning_rate": 8.695557827168101e-05, + "loss": 0.0179, + "step": 16410 + }, + { + "epoch": 9.557625145518044, + "grad_norm": 0.18153317272663116, + "learning_rate": 8.693701017281753e-05, + "loss": 0.0222, + "step": 16420 + }, + { + "epoch": 9.563445867287543, + "grad_norm": 0.22183392941951752, + "learning_rate": 8.691843085346563e-05, + "loss": 0.0261, + "step": 16430 + }, + { + "epoch": 9.569266589057044, + "grad_norm": 0.2300952970981598, + "learning_rate": 8.689984031926919e-05, + "loss": 0.0169, + "step": 16440 + }, + { + "epoch": 9.575087310826543, + "grad_norm": 0.2382417619228363, + "learning_rate": 8.688123857587555e-05, + "loss": 0.0196, + "step": 16450 + }, + { + "epoch": 9.580908032596042, + "grad_norm": 0.35198545455932617, + "learning_rate": 8.686262562893544e-05, + "loss": 0.0218, + "step": 16460 + }, + { + "epoch": 9.586728754365542, + "grad_norm": 0.29608777165412903, + "learning_rate": 8.684400148410294e-05, + "loss": 0.0159, + "step": 16470 + }, + { + "epoch": 9.59254947613504, + "grad_norm": 0.16189293563365936, + "learning_rate": 8.682536614703562e-05, + "loss": 0.0187, + "step": 16480 + }, + { + "epoch": 9.59837019790454, + "grad_norm": 0.28894147276878357, + "learning_rate": 8.680671962339437e-05, + "loss": 0.0183, + "step": 16490 + }, + { + "epoch": 9.60419091967404, + "grad_norm": 0.2645092010498047, + "learning_rate": 8.678806191884352e-05, + "loss": 0.0186, + "step": 16500 + }, + { + "epoch": 9.610011641443538, + "grad_norm": 0.1710399091243744, + "learning_rate": 8.67693930390508e-05, + "loss": 0.0181, + "step": 16510 + }, + { + "epoch": 9.615832363213038, + "grad_norm": 0.2490326464176178, + "learning_rate": 8.67507129896873e-05, + "loss": 0.0193, + "step": 16520 + }, + { + "epoch": 9.621653084982539, + "grad_norm": 0.19656889140605927, + "learning_rate": 8.673202177642757e-05, + "loss": 0.0241, + "step": 16530 + }, + { + "epoch": 9.627473806752038, + "grad_norm": 0.17119090259075165, + "learning_rate": 8.671331940494945e-05, + "loss": 0.0264, + "step": 16540 + }, + { + "epoch": 9.633294528521537, + "grad_norm": 0.23274104297161102, + "learning_rate": 8.669460588093427e-05, + "loss": 0.0209, + "step": 16550 + }, + { + "epoch": 9.639115250291036, + "grad_norm": 0.2457660585641861, + "learning_rate": 8.667588121006667e-05, + "loss": 0.018, + "step": 16560 + }, + { + "epoch": 9.644935972060535, + "grad_norm": 0.2995380163192749, + "learning_rate": 8.665714539803475e-05, + "loss": 0.0245, + "step": 16570 + }, + { + "epoch": 9.650756693830035, + "grad_norm": 0.25346270203590393, + "learning_rate": 8.663839845052993e-05, + "loss": 0.0173, + "step": 16580 + }, + { + "epoch": 9.656577415599534, + "grad_norm": 0.3381359875202179, + "learning_rate": 8.661964037324703e-05, + "loss": 0.0205, + "step": 16590 + }, + { + "epoch": 9.662398137369033, + "grad_norm": 0.15756109356880188, + "learning_rate": 8.660087117188427e-05, + "loss": 0.0145, + "step": 16600 + }, + { + "epoch": 9.668218859138534, + "grad_norm": 0.24850009381771088, + "learning_rate": 8.658209085214325e-05, + "loss": 0.0222, + "step": 16610 + }, + { + "epoch": 9.674039580908033, + "grad_norm": 0.20767264068126678, + "learning_rate": 8.656329941972891e-05, + "loss": 0.0275, + "step": 16620 + }, + { + "epoch": 9.679860302677533, + "grad_norm": 0.14949962496757507, + "learning_rate": 8.654449688034963e-05, + "loss": 0.0221, + "step": 16630 + }, + { + "epoch": 9.685681024447032, + "grad_norm": 0.2545766234397888, + "learning_rate": 8.652568323971706e-05, + "loss": 0.0218, + "step": 16640 + }, + { + "epoch": 9.691501746216531, + "grad_norm": 0.29437732696533203, + "learning_rate": 8.650685850354636e-05, + "loss": 0.0195, + "step": 16650 + }, + { + "epoch": 9.69732246798603, + "grad_norm": 0.28543850779533386, + "learning_rate": 8.648802267755593e-05, + "loss": 0.0211, + "step": 16660 + }, + { + "epoch": 9.70314318975553, + "grad_norm": 0.1842445731163025, + "learning_rate": 8.646917576746764e-05, + "loss": 0.0205, + "step": 16670 + }, + { + "epoch": 9.708963911525029, + "grad_norm": 0.1872662901878357, + "learning_rate": 8.645031777900666e-05, + "loss": 0.0156, + "step": 16680 + }, + { + "epoch": 9.714784633294528, + "grad_norm": 0.3088105320930481, + "learning_rate": 8.643144871790154e-05, + "loss": 0.024, + "step": 16690 + }, + { + "epoch": 9.720605355064027, + "grad_norm": 0.18444286286830902, + "learning_rate": 8.641256858988424e-05, + "loss": 0.0195, + "step": 16700 + }, + { + "epoch": 9.726426076833528, + "grad_norm": 0.2304573804140091, + "learning_rate": 8.639367740069e-05, + "loss": 0.0161, + "step": 16710 + }, + { + "epoch": 9.732246798603027, + "grad_norm": 0.2607024908065796, + "learning_rate": 8.63747751560575e-05, + "loss": 0.0173, + "step": 16720 + }, + { + "epoch": 9.738067520372526, + "grad_norm": 0.2075866162776947, + "learning_rate": 8.635586186172871e-05, + "loss": 0.0161, + "step": 16730 + }, + { + "epoch": 9.743888242142026, + "grad_norm": 0.19168955087661743, + "learning_rate": 8.633693752344902e-05, + "loss": 0.0147, + "step": 16740 + }, + { + "epoch": 9.749708963911525, + "grad_norm": 0.21785899996757507, + "learning_rate": 8.631800214696713e-05, + "loss": 0.0185, + "step": 16750 + }, + { + "epoch": 9.755529685681024, + "grad_norm": 0.19336248934268951, + "learning_rate": 8.629905573803511e-05, + "loss": 0.0179, + "step": 16760 + }, + { + "epoch": 9.761350407450523, + "grad_norm": 0.14329251646995544, + "learning_rate": 8.628009830240839e-05, + "loss": 0.0207, + "step": 16770 + }, + { + "epoch": 9.767171129220023, + "grad_norm": 0.15545016527175903, + "learning_rate": 8.626112984584571e-05, + "loss": 0.0189, + "step": 16780 + }, + { + "epoch": 9.772991850989523, + "grad_norm": 0.20466673374176025, + "learning_rate": 8.62421503741092e-05, + "loss": 0.017, + "step": 16790 + }, + { + "epoch": 9.778812572759023, + "grad_norm": 0.12211467325687408, + "learning_rate": 8.622315989296432e-05, + "loss": 0.013, + "step": 16800 + }, + { + "epoch": 9.784633294528522, + "grad_norm": 0.23634886741638184, + "learning_rate": 8.62041584081799e-05, + "loss": 0.0174, + "step": 16810 + }, + { + "epoch": 9.790454016298021, + "grad_norm": 0.20163452625274658, + "learning_rate": 8.618514592552807e-05, + "loss": 0.0188, + "step": 16820 + }, + { + "epoch": 9.79627473806752, + "grad_norm": 0.2761308550834656, + "learning_rate": 8.616612245078431e-05, + "loss": 0.0173, + "step": 16830 + }, + { + "epoch": 9.80209545983702, + "grad_norm": 0.2383989542722702, + "learning_rate": 8.614708798972746e-05, + "loss": 0.0179, + "step": 16840 + }, + { + "epoch": 9.807916181606519, + "grad_norm": 0.21791067719459534, + "learning_rate": 8.61280425481397e-05, + "loss": 0.017, + "step": 16850 + }, + { + "epoch": 9.813736903376018, + "grad_norm": 0.3677397072315216, + "learning_rate": 8.61089861318065e-05, + "loss": 0.0204, + "step": 16860 + }, + { + "epoch": 9.819557625145517, + "grad_norm": 0.19449234008789062, + "learning_rate": 8.608991874651673e-05, + "loss": 0.0138, + "step": 16870 + }, + { + "epoch": 9.825378346915018, + "grad_norm": 0.1778031885623932, + "learning_rate": 8.607084039806255e-05, + "loss": 0.0169, + "step": 16880 + }, + { + "epoch": 9.831199068684517, + "grad_norm": 0.29096901416778564, + "learning_rate": 8.605175109223944e-05, + "loss": 0.0169, + "step": 16890 + }, + { + "epoch": 9.837019790454017, + "grad_norm": 0.21455195546150208, + "learning_rate": 8.603265083484624e-05, + "loss": 0.0175, + "step": 16900 + }, + { + "epoch": 9.842840512223516, + "grad_norm": 0.1598731279373169, + "learning_rate": 8.60135396316851e-05, + "loss": 0.0155, + "step": 16910 + }, + { + "epoch": 9.848661233993015, + "grad_norm": 0.2155141830444336, + "learning_rate": 8.599441748856152e-05, + "loss": 0.0152, + "step": 16920 + }, + { + "epoch": 9.854481955762514, + "grad_norm": 0.2196480631828308, + "learning_rate": 8.597528441128427e-05, + "loss": 0.0188, + "step": 16930 + }, + { + "epoch": 9.860302677532014, + "grad_norm": 0.23197786509990692, + "learning_rate": 8.595614040566549e-05, + "loss": 0.0177, + "step": 16940 + }, + { + "epoch": 9.866123399301513, + "grad_norm": 0.20048731565475464, + "learning_rate": 8.593698547752063e-05, + "loss": 0.0188, + "step": 16950 + }, + { + "epoch": 9.871944121071014, + "grad_norm": 0.23264729976654053, + "learning_rate": 8.591781963266843e-05, + "loss": 0.032, + "step": 16960 + }, + { + "epoch": 9.877764842840513, + "grad_norm": 0.27122974395751953, + "learning_rate": 8.5898642876931e-05, + "loss": 0.0185, + "step": 16970 + }, + { + "epoch": 9.883585564610012, + "grad_norm": 0.2067384272813797, + "learning_rate": 8.587945521613369e-05, + "loss": 0.0265, + "step": 16980 + }, + { + "epoch": 9.889406286379511, + "grad_norm": 0.2321958690881729, + "learning_rate": 8.586025665610524e-05, + "loss": 0.0153, + "step": 16990 + }, + { + "epoch": 9.89522700814901, + "grad_norm": 0.20197981595993042, + "learning_rate": 8.584104720267765e-05, + "loss": 0.0193, + "step": 17000 + }, + { + "epoch": 9.90104772991851, + "grad_norm": 0.19833116233348846, + "learning_rate": 8.582182686168625e-05, + "loss": 0.0179, + "step": 17010 + }, + { + "epoch": 9.906868451688009, + "grad_norm": 0.24964554607868195, + "learning_rate": 8.580259563896967e-05, + "loss": 0.0216, + "step": 17020 + }, + { + "epoch": 9.912689173457508, + "grad_norm": 0.3434305787086487, + "learning_rate": 8.578335354036983e-05, + "loss": 0.0205, + "step": 17030 + }, + { + "epoch": 9.918509895227007, + "grad_norm": 0.19014036655426025, + "learning_rate": 8.576410057173201e-05, + "loss": 0.0211, + "step": 17040 + }, + { + "epoch": 9.924330616996507, + "grad_norm": 0.238619863986969, + "learning_rate": 8.574483673890474e-05, + "loss": 0.018, + "step": 17050 + }, + { + "epoch": 9.930151338766008, + "grad_norm": 0.2385728657245636, + "learning_rate": 8.572556204773983e-05, + "loss": 0.0161, + "step": 17060 + }, + { + "epoch": 9.935972060535507, + "grad_norm": 0.25055864453315735, + "learning_rate": 8.570627650409246e-05, + "loss": 0.0169, + "step": 17070 + }, + { + "epoch": 9.941792782305006, + "grad_norm": 0.16649354994297028, + "learning_rate": 8.568698011382107e-05, + "loss": 0.0159, + "step": 17080 + }, + { + "epoch": 9.947613504074505, + "grad_norm": 0.33506345748901367, + "learning_rate": 8.566767288278738e-05, + "loss": 0.0235, + "step": 17090 + }, + { + "epoch": 9.953434225844005, + "grad_norm": 0.24568426609039307, + "learning_rate": 8.56483548168564e-05, + "loss": 0.0187, + "step": 17100 + }, + { + "epoch": 9.959254947613504, + "grad_norm": 0.2570675313472748, + "learning_rate": 8.562902592189648e-05, + "loss": 0.0173, + "step": 17110 + }, + { + "epoch": 9.965075669383003, + "grad_norm": 0.16782048344612122, + "learning_rate": 8.560968620377921e-05, + "loss": 0.0155, + "step": 17120 + }, + { + "epoch": 9.970896391152502, + "grad_norm": 0.2028018981218338, + "learning_rate": 8.559033566837951e-05, + "loss": 0.0202, + "step": 17130 + }, + { + "epoch": 9.976717112922003, + "grad_norm": 0.2383461892604828, + "learning_rate": 8.557097432157551e-05, + "loss": 0.0212, + "step": 17140 + }, + { + "epoch": 9.982537834691502, + "grad_norm": 0.2289532572031021, + "learning_rate": 8.555160216924872e-05, + "loss": 0.0165, + "step": 17150 + }, + { + "epoch": 9.988358556461002, + "grad_norm": 0.3005697429180145, + "learning_rate": 8.55322192172839e-05, + "loss": 0.0178, + "step": 17160 + }, + { + "epoch": 9.9941792782305, + "grad_norm": 0.15643465518951416, + "learning_rate": 8.551282547156902e-05, + "loss": 0.0148, + "step": 17170 + }, + { + "epoch": 10.0, + "grad_norm": 0.22274887561798096, + "learning_rate": 8.549342093799544e-05, + "loss": 0.0212, + "step": 17180 + }, + { + "epoch": 10.0058207217695, + "grad_norm": 0.209442600607872, + "learning_rate": 8.547400562245773e-05, + "loss": 0.0218, + "step": 17190 + }, + { + "epoch": 10.011641443538998, + "grad_norm": 0.20647242665290833, + "learning_rate": 8.545457953085374e-05, + "loss": 0.0203, + "step": 17200 + }, + { + "epoch": 10.017462165308498, + "grad_norm": 0.1307518184185028, + "learning_rate": 8.543514266908463e-05, + "loss": 0.0169, + "step": 17210 + }, + { + "epoch": 10.023282887077997, + "grad_norm": 0.18747274577617645, + "learning_rate": 8.541569504305478e-05, + "loss": 0.0209, + "step": 17220 + }, + { + "epoch": 10.029103608847498, + "grad_norm": 0.12739287316799164, + "learning_rate": 8.539623665867187e-05, + "loss": 0.0185, + "step": 17230 + }, + { + "epoch": 10.034924330616997, + "grad_norm": 0.16846221685409546, + "learning_rate": 8.537676752184685e-05, + "loss": 0.0226, + "step": 17240 + }, + { + "epoch": 10.040745052386496, + "grad_norm": 0.2803844213485718, + "learning_rate": 8.53572876384939e-05, + "loss": 0.0184, + "step": 17250 + }, + { + "epoch": 10.046565774155995, + "grad_norm": 0.28924745321273804, + "learning_rate": 8.533779701453056e-05, + "loss": 0.0188, + "step": 17260 + }, + { + "epoch": 10.052386495925495, + "grad_norm": 0.16548849642276764, + "learning_rate": 8.53182956558775e-05, + "loss": 0.0184, + "step": 17270 + }, + { + "epoch": 10.058207217694994, + "grad_norm": 0.1450534164905548, + "learning_rate": 8.529878356845877e-05, + "loss": 0.0146, + "step": 17280 + }, + { + "epoch": 10.064027939464493, + "grad_norm": 0.18913759291172028, + "learning_rate": 8.527926075820158e-05, + "loss": 0.02, + "step": 17290 + }, + { + "epoch": 10.069848661233992, + "grad_norm": 0.19680841267108917, + "learning_rate": 8.525972723103648e-05, + "loss": 0.018, + "step": 17300 + }, + { + "epoch": 10.075669383003492, + "grad_norm": 0.15823276340961456, + "learning_rate": 8.524018299289722e-05, + "loss": 0.0169, + "step": 17310 + }, + { + "epoch": 10.081490104772993, + "grad_norm": 0.3015984296798706, + "learning_rate": 8.522062804972083e-05, + "loss": 0.0189, + "step": 17320 + }, + { + "epoch": 10.087310826542492, + "grad_norm": 0.2064056098461151, + "learning_rate": 8.520106240744759e-05, + "loss": 0.018, + "step": 17330 + }, + { + "epoch": 10.093131548311991, + "grad_norm": 0.2674863338470459, + "learning_rate": 8.518148607202102e-05, + "loss": 0.0201, + "step": 17340 + }, + { + "epoch": 10.09895227008149, + "grad_norm": 0.2067824751138687, + "learning_rate": 8.51618990493879e-05, + "loss": 0.0205, + "step": 17350 + }, + { + "epoch": 10.10477299185099, + "grad_norm": 0.22374391555786133, + "learning_rate": 8.514230134549823e-05, + "loss": 0.0244, + "step": 17360 + }, + { + "epoch": 10.110593713620489, + "grad_norm": 0.18834033608436584, + "learning_rate": 8.51226929663053e-05, + "loss": 0.0182, + "step": 17370 + }, + { + "epoch": 10.116414435389988, + "grad_norm": 0.23068824410438538, + "learning_rate": 8.51030739177656e-05, + "loss": 0.0192, + "step": 17380 + }, + { + "epoch": 10.122235157159487, + "grad_norm": 0.1513330191373825, + "learning_rate": 8.508344420583889e-05, + "loss": 0.0221, + "step": 17390 + }, + { + "epoch": 10.128055878928988, + "grad_norm": 0.26884838938713074, + "learning_rate": 8.506380383648816e-05, + "loss": 0.0176, + "step": 17400 + }, + { + "epoch": 10.133876600698487, + "grad_norm": 0.20528478920459747, + "learning_rate": 8.504415281567963e-05, + "loss": 0.0176, + "step": 17410 + }, + { + "epoch": 10.139697322467986, + "grad_norm": 0.2330584079027176, + "learning_rate": 8.502449114938275e-05, + "loss": 0.0156, + "step": 17420 + }, + { + "epoch": 10.145518044237486, + "grad_norm": 0.250207781791687, + "learning_rate": 8.500481884357025e-05, + "loss": 0.0204, + "step": 17430 + }, + { + "epoch": 10.151338766006985, + "grad_norm": 0.24147149920463562, + "learning_rate": 8.498513590421801e-05, + "loss": 0.0173, + "step": 17440 + }, + { + "epoch": 10.157159487776484, + "grad_norm": 0.21444883942604065, + "learning_rate": 8.496544233730522e-05, + "loss": 0.021, + "step": 17450 + }, + { + "epoch": 10.162980209545983, + "grad_norm": 0.19544382393360138, + "learning_rate": 8.494573814881426e-05, + "loss": 0.0153, + "step": 17460 + }, + { + "epoch": 10.168800931315483, + "grad_norm": 0.22210484743118286, + "learning_rate": 8.492602334473074e-05, + "loss": 0.0157, + "step": 17470 + }, + { + "epoch": 10.174621653084982, + "grad_norm": 0.31665611267089844, + "learning_rate": 8.49062979310435e-05, + "loss": 0.0234, + "step": 17480 + }, + { + "epoch": 10.180442374854483, + "grad_norm": 0.1568090319633484, + "learning_rate": 8.488656191374458e-05, + "loss": 0.0217, + "step": 17490 + }, + { + "epoch": 10.186263096623982, + "grad_norm": 0.2092939019203186, + "learning_rate": 8.48668152988293e-05, + "loss": 0.0175, + "step": 17500 + }, + { + "epoch": 10.192083818393481, + "grad_norm": 0.2214668244123459, + "learning_rate": 8.484705809229612e-05, + "loss": 0.0169, + "step": 17510 + }, + { + "epoch": 10.19790454016298, + "grad_norm": 0.22446595132350922, + "learning_rate": 8.482729030014677e-05, + "loss": 0.0235, + "step": 17520 + }, + { + "epoch": 10.20372526193248, + "grad_norm": 0.2727365493774414, + "learning_rate": 8.48075119283862e-05, + "loss": 0.0179, + "step": 17530 + }, + { + "epoch": 10.209545983701979, + "grad_norm": 0.1682157963514328, + "learning_rate": 8.478772298302254e-05, + "loss": 0.0186, + "step": 17540 + }, + { + "epoch": 10.215366705471478, + "grad_norm": 0.2959231436252594, + "learning_rate": 8.476792347006716e-05, + "loss": 0.024, + "step": 17550 + }, + { + "epoch": 10.221187427240977, + "grad_norm": 0.3278026878833771, + "learning_rate": 8.474811339553462e-05, + "loss": 0.018, + "step": 17560 + }, + { + "epoch": 10.227008149010477, + "grad_norm": 0.29943618178367615, + "learning_rate": 8.47282927654427e-05, + "loss": 0.0295, + "step": 17570 + }, + { + "epoch": 10.232828870779977, + "grad_norm": 0.17124754190444946, + "learning_rate": 8.470846158581238e-05, + "loss": 0.0165, + "step": 17580 + }, + { + "epoch": 10.238649592549477, + "grad_norm": 0.19771285355091095, + "learning_rate": 8.468861986266787e-05, + "loss": 0.018, + "step": 17590 + }, + { + "epoch": 10.244470314318976, + "grad_norm": 0.2657853960990906, + "learning_rate": 8.466876760203654e-05, + "loss": 0.0181, + "step": 17600 + }, + { + "epoch": 10.250291036088475, + "grad_norm": 0.1781921237707138, + "learning_rate": 8.464890480994898e-05, + "loss": 0.0239, + "step": 17610 + }, + { + "epoch": 10.256111757857974, + "grad_norm": 0.12220761179924011, + "learning_rate": 8.462903149243899e-05, + "loss": 0.0165, + "step": 17620 + }, + { + "epoch": 10.261932479627474, + "grad_norm": 0.20806044340133667, + "learning_rate": 8.460914765554357e-05, + "loss": 0.0136, + "step": 17630 + }, + { + "epoch": 10.267753201396973, + "grad_norm": 0.18667694926261902, + "learning_rate": 8.458925330530288e-05, + "loss": 0.0241, + "step": 17640 + }, + { + "epoch": 10.273573923166472, + "grad_norm": 0.2806110680103302, + "learning_rate": 8.456934844776032e-05, + "loss": 0.0182, + "step": 17650 + }, + { + "epoch": 10.279394644935971, + "grad_norm": 0.2749525308609009, + "learning_rate": 8.454943308896246e-05, + "loss": 0.0201, + "step": 17660 + }, + { + "epoch": 10.285215366705472, + "grad_norm": 0.20053569972515106, + "learning_rate": 8.452950723495905e-05, + "loss": 0.0156, + "step": 17670 + }, + { + "epoch": 10.291036088474971, + "grad_norm": 0.15299849212169647, + "learning_rate": 8.450957089180303e-05, + "loss": 0.0163, + "step": 17680 + }, + { + "epoch": 10.29685681024447, + "grad_norm": 0.32454028725624084, + "learning_rate": 8.448962406555055e-05, + "loss": 0.0193, + "step": 17690 + }, + { + "epoch": 10.30267753201397, + "grad_norm": 0.2889762222766876, + "learning_rate": 8.446966676226093e-05, + "loss": 0.0248, + "step": 17700 + }, + { + "epoch": 10.308498253783469, + "grad_norm": 0.326643168926239, + "learning_rate": 8.444969898799667e-05, + "loss": 0.019, + "step": 17710 + }, + { + "epoch": 10.314318975552968, + "grad_norm": 0.36977580189704895, + "learning_rate": 8.442972074882343e-05, + "loss": 0.0194, + "step": 17720 + }, + { + "epoch": 10.320139697322467, + "grad_norm": 0.2837190628051758, + "learning_rate": 8.44097320508101e-05, + "loss": 0.0185, + "step": 17730 + }, + { + "epoch": 10.325960419091967, + "grad_norm": 0.2912046015262604, + "learning_rate": 8.43897329000287e-05, + "loss": 0.0151, + "step": 17740 + }, + { + "epoch": 10.331781140861466, + "grad_norm": 0.25294017791748047, + "learning_rate": 8.436972330255448e-05, + "loss": 0.0188, + "step": 17750 + }, + { + "epoch": 10.337601862630967, + "grad_norm": 0.15879201889038086, + "learning_rate": 8.434970326446579e-05, + "loss": 0.0195, + "step": 17760 + }, + { + "epoch": 10.343422584400466, + "grad_norm": 0.16516436636447906, + "learning_rate": 8.432967279184418e-05, + "loss": 0.0151, + "step": 17770 + }, + { + "epoch": 10.349243306169965, + "grad_norm": 0.21189634501934052, + "learning_rate": 8.430963189077441e-05, + "loss": 0.0155, + "step": 17780 + }, + { + "epoch": 10.355064027939465, + "grad_norm": 0.206316277384758, + "learning_rate": 8.428958056734437e-05, + "loss": 0.0153, + "step": 17790 + }, + { + "epoch": 10.360884749708964, + "grad_norm": 0.22734135389328003, + "learning_rate": 8.426951882764513e-05, + "loss": 0.026, + "step": 17800 + }, + { + "epoch": 10.366705471478463, + "grad_norm": 0.14772124588489532, + "learning_rate": 8.424944667777089e-05, + "loss": 0.014, + "step": 17810 + }, + { + "epoch": 10.372526193247962, + "grad_norm": 0.3098524808883667, + "learning_rate": 8.422936412381905e-05, + "loss": 0.0189, + "step": 17820 + }, + { + "epoch": 10.378346915017461, + "grad_norm": 0.23244920372962952, + "learning_rate": 8.420927117189017e-05, + "loss": 0.0167, + "step": 17830 + }, + { + "epoch": 10.384167636786962, + "grad_norm": 0.21086595952510834, + "learning_rate": 8.418916782808795e-05, + "loss": 0.019, + "step": 17840 + }, + { + "epoch": 10.389988358556462, + "grad_norm": 0.2047625482082367, + "learning_rate": 8.416905409851926e-05, + "loss": 0.018, + "step": 17850 + }, + { + "epoch": 10.39580908032596, + "grad_norm": 0.17399294674396515, + "learning_rate": 8.41489299892941e-05, + "loss": 0.0156, + "step": 17860 + }, + { + "epoch": 10.40162980209546, + "grad_norm": 0.1998453438282013, + "learning_rate": 8.412879550652566e-05, + "loss": 0.0164, + "step": 17870 + }, + { + "epoch": 10.40745052386496, + "grad_norm": 0.30424532294273376, + "learning_rate": 8.410865065633029e-05, + "loss": 0.019, + "step": 17880 + }, + { + "epoch": 10.413271245634458, + "grad_norm": 0.26151326298713684, + "learning_rate": 8.408849544482742e-05, + "loss": 0.021, + "step": 17890 + }, + { + "epoch": 10.419091967403958, + "grad_norm": 0.35675275325775146, + "learning_rate": 8.406832987813968e-05, + "loss": 0.0197, + "step": 17900 + }, + { + "epoch": 10.424912689173457, + "grad_norm": 0.22692671418190002, + "learning_rate": 8.404815396239286e-05, + "loss": 0.0194, + "step": 17910 + }, + { + "epoch": 10.430733410942956, + "grad_norm": 0.2514532208442688, + "learning_rate": 8.402796770371587e-05, + "loss": 0.0217, + "step": 17920 + }, + { + "epoch": 10.436554132712457, + "grad_norm": 0.22259771823883057, + "learning_rate": 8.400777110824071e-05, + "loss": 0.0156, + "step": 17930 + }, + { + "epoch": 10.442374854481956, + "grad_norm": 0.2915078103542328, + "learning_rate": 8.398756418210263e-05, + "loss": 0.0179, + "step": 17940 + }, + { + "epoch": 10.448195576251456, + "grad_norm": 0.1570025086402893, + "learning_rate": 8.396734693143993e-05, + "loss": 0.0159, + "step": 17950 + }, + { + "epoch": 10.454016298020955, + "grad_norm": 0.20970425009727478, + "learning_rate": 8.39471193623941e-05, + "loss": 0.0249, + "step": 17960 + }, + { + "epoch": 10.459837019790454, + "grad_norm": 0.2556745707988739, + "learning_rate": 8.392688148110974e-05, + "loss": 0.0189, + "step": 17970 + }, + { + "epoch": 10.465657741559953, + "grad_norm": 0.19324441254138947, + "learning_rate": 8.390663329373456e-05, + "loss": 0.0177, + "step": 17980 + }, + { + "epoch": 10.471478463329452, + "grad_norm": 0.22139586508274078, + "learning_rate": 8.388637480641944e-05, + "loss": 0.0212, + "step": 17990 + }, + { + "epoch": 10.477299185098952, + "grad_norm": 0.2954111695289612, + "learning_rate": 8.386610602531837e-05, + "loss": 0.0215, + "step": 18000 + }, + { + "epoch": 10.48311990686845, + "grad_norm": 0.19873371720314026, + "learning_rate": 8.384582695658847e-05, + "loss": 0.0172, + "step": 18010 + }, + { + "epoch": 10.488940628637952, + "grad_norm": 0.18362240493297577, + "learning_rate": 8.382553760638999e-05, + "loss": 0.0153, + "step": 18020 + }, + { + "epoch": 10.494761350407451, + "grad_norm": 0.3560687005519867, + "learning_rate": 8.380523798088631e-05, + "loss": 0.0155, + "step": 18030 + }, + { + "epoch": 10.50058207217695, + "grad_norm": 0.21443207561969757, + "learning_rate": 8.378492808624389e-05, + "loss": 0.0179, + "step": 18040 + }, + { + "epoch": 10.50640279394645, + "grad_norm": 0.2615169286727905, + "learning_rate": 8.376460792863237e-05, + "loss": 0.0217, + "step": 18050 + }, + { + "epoch": 10.512223515715949, + "grad_norm": 0.1528695970773697, + "learning_rate": 8.374427751422444e-05, + "loss": 0.0137, + "step": 18060 + }, + { + "epoch": 10.518044237485448, + "grad_norm": 0.19896160066127777, + "learning_rate": 8.3723936849196e-05, + "loss": 0.0149, + "step": 18070 + }, + { + "epoch": 10.523864959254947, + "grad_norm": 0.14641165733337402, + "learning_rate": 8.370358593972595e-05, + "loss": 0.0175, + "step": 18080 + }, + { + "epoch": 10.529685681024446, + "grad_norm": 0.25063616037368774, + "learning_rate": 8.36832247919964e-05, + "loss": 0.0152, + "step": 18090 + }, + { + "epoch": 10.535506402793946, + "grad_norm": 0.17447710037231445, + "learning_rate": 8.36628534121925e-05, + "loss": 0.0182, + "step": 18100 + }, + { + "epoch": 10.541327124563447, + "grad_norm": 0.16283778846263885, + "learning_rate": 8.364247180650254e-05, + "loss": 0.0199, + "step": 18110 + }, + { + "epoch": 10.547147846332946, + "grad_norm": 0.1387554258108139, + "learning_rate": 8.362207998111794e-05, + "loss": 0.0127, + "step": 18120 + }, + { + "epoch": 10.552968568102445, + "grad_norm": 0.14129047095775604, + "learning_rate": 8.360167794223318e-05, + "loss": 0.0167, + "step": 18130 + }, + { + "epoch": 10.558789289871944, + "grad_norm": 0.28077778220176697, + "learning_rate": 8.358126569604586e-05, + "loss": 0.0191, + "step": 18140 + }, + { + "epoch": 10.564610011641443, + "grad_norm": 0.16980934143066406, + "learning_rate": 8.356084324875668e-05, + "loss": 0.0205, + "step": 18150 + }, + { + "epoch": 10.570430733410943, + "grad_norm": 0.33876392245292664, + "learning_rate": 8.354041060656945e-05, + "loss": 0.0155, + "step": 18160 + }, + { + "epoch": 10.576251455180442, + "grad_norm": 0.16544397175312042, + "learning_rate": 8.351996777569106e-05, + "loss": 0.0147, + "step": 18170 + }, + { + "epoch": 10.582072176949941, + "grad_norm": 0.17015352845191956, + "learning_rate": 8.349951476233148e-05, + "loss": 0.0157, + "step": 18180 + }, + { + "epoch": 10.587892898719442, + "grad_norm": 0.20199590921401978, + "learning_rate": 8.347905157270386e-05, + "loss": 0.0154, + "step": 18190 + }, + { + "epoch": 10.593713620488941, + "grad_norm": 0.21366867423057556, + "learning_rate": 8.345857821302432e-05, + "loss": 0.0196, + "step": 18200 + }, + { + "epoch": 10.59953434225844, + "grad_norm": 0.2520920932292938, + "learning_rate": 8.343809468951213e-05, + "loss": 0.0174, + "step": 18210 + }, + { + "epoch": 10.60535506402794, + "grad_norm": 0.139428973197937, + "learning_rate": 8.341760100838965e-05, + "loss": 0.0211, + "step": 18220 + }, + { + "epoch": 10.611175785797439, + "grad_norm": 0.20134760439395905, + "learning_rate": 8.339709717588233e-05, + "loss": 0.0207, + "step": 18230 + }, + { + "epoch": 10.616996507566938, + "grad_norm": 0.3287809491157532, + "learning_rate": 8.33765831982187e-05, + "loss": 0.0178, + "step": 18240 + }, + { + "epoch": 10.622817229336437, + "grad_norm": 0.16149769723415375, + "learning_rate": 8.335605908163035e-05, + "loss": 0.0129, + "step": 18250 + }, + { + "epoch": 10.628637951105937, + "grad_norm": 0.24089346826076508, + "learning_rate": 8.333552483235196e-05, + "loss": 0.013, + "step": 18260 + }, + { + "epoch": 10.634458672875436, + "grad_norm": 0.17712941765785217, + "learning_rate": 8.33149804566213e-05, + "loss": 0.0173, + "step": 18270 + }, + { + "epoch": 10.640279394644937, + "grad_norm": 0.2865535616874695, + "learning_rate": 8.329442596067921e-05, + "loss": 0.0194, + "step": 18280 + }, + { + "epoch": 10.646100116414436, + "grad_norm": 0.23576733469963074, + "learning_rate": 8.32738613507696e-05, + "loss": 0.024, + "step": 18290 + }, + { + "epoch": 10.651920838183935, + "grad_norm": 0.24898062646389008, + "learning_rate": 8.325328663313946e-05, + "loss": 0.0202, + "step": 18300 + }, + { + "epoch": 10.657741559953434, + "grad_norm": 0.27558383345603943, + "learning_rate": 8.323270181403884e-05, + "loss": 0.0178, + "step": 18310 + }, + { + "epoch": 10.663562281722934, + "grad_norm": 0.1602187156677246, + "learning_rate": 8.321210689972086e-05, + "loss": 0.0175, + "step": 18320 + }, + { + "epoch": 10.669383003492433, + "grad_norm": 0.2397642582654953, + "learning_rate": 8.319150189644174e-05, + "loss": 0.0179, + "step": 18330 + }, + { + "epoch": 10.675203725261932, + "grad_norm": 0.16514863073825836, + "learning_rate": 8.31708868104607e-05, + "loss": 0.0255, + "step": 18340 + }, + { + "epoch": 10.681024447031431, + "grad_norm": 0.2335004359483719, + "learning_rate": 8.315026164804007e-05, + "loss": 0.0195, + "step": 18350 + }, + { + "epoch": 10.68684516880093, + "grad_norm": 0.2996123135089874, + "learning_rate": 8.312962641544524e-05, + "loss": 0.0203, + "step": 18360 + }, + { + "epoch": 10.692665890570431, + "grad_norm": 0.27255356311798096, + "learning_rate": 8.310898111894465e-05, + "loss": 0.0152, + "step": 18370 + }, + { + "epoch": 10.69848661233993, + "grad_norm": 0.22568996250629425, + "learning_rate": 8.308832576480977e-05, + "loss": 0.0152, + "step": 18380 + }, + { + "epoch": 10.70430733410943, + "grad_norm": 0.2669939398765564, + "learning_rate": 8.306766035931519e-05, + "loss": 0.0229, + "step": 18390 + }, + { + "epoch": 10.71012805587893, + "grad_norm": 0.21763084828853607, + "learning_rate": 8.304698490873847e-05, + "loss": 0.0159, + "step": 18400 + }, + { + "epoch": 10.715948777648428, + "grad_norm": 0.3696274161338806, + "learning_rate": 8.30262994193603e-05, + "loss": 0.0199, + "step": 18410 + }, + { + "epoch": 10.721769499417928, + "grad_norm": 0.2806268036365509, + "learning_rate": 8.300560389746438e-05, + "loss": 0.0156, + "step": 18420 + }, + { + "epoch": 10.727590221187427, + "grad_norm": 0.35184019804000854, + "learning_rate": 8.298489834933745e-05, + "loss": 0.0211, + "step": 18430 + }, + { + "epoch": 10.733410942956926, + "grad_norm": 0.19172371923923492, + "learning_rate": 8.296418278126934e-05, + "loss": 0.0216, + "step": 18440 + }, + { + "epoch": 10.739231664726425, + "grad_norm": 0.16485480964183807, + "learning_rate": 8.294345719955284e-05, + "loss": 0.0189, + "step": 18450 + }, + { + "epoch": 10.745052386495926, + "grad_norm": 0.34978750348091125, + "learning_rate": 8.29227216104839e-05, + "loss": 0.0179, + "step": 18460 + }, + { + "epoch": 10.750873108265425, + "grad_norm": 0.18039515614509583, + "learning_rate": 8.290197602036137e-05, + "loss": 0.0195, + "step": 18470 + }, + { + "epoch": 10.756693830034925, + "grad_norm": 0.2223493456840515, + "learning_rate": 8.288122043548725e-05, + "loss": 0.0133, + "step": 18480 + }, + { + "epoch": 10.762514551804424, + "grad_norm": 0.15908989310264587, + "learning_rate": 8.286045486216657e-05, + "loss": 0.0222, + "step": 18490 + }, + { + "epoch": 10.768335273573923, + "grad_norm": 0.23303554952144623, + "learning_rate": 8.283967930670733e-05, + "loss": 0.0167, + "step": 18500 + }, + { + "epoch": 10.774155995343422, + "grad_norm": 0.2671649754047394, + "learning_rate": 8.281889377542058e-05, + "loss": 0.0189, + "step": 18510 + }, + { + "epoch": 10.779976717112921, + "grad_norm": 0.2550906240940094, + "learning_rate": 8.279809827462045e-05, + "loss": 0.0169, + "step": 18520 + }, + { + "epoch": 10.78579743888242, + "grad_norm": 0.2985118329524994, + "learning_rate": 8.277729281062402e-05, + "loss": 0.0183, + "step": 18530 + }, + { + "epoch": 10.791618160651922, + "grad_norm": 0.2006126195192337, + "learning_rate": 8.27564773897515e-05, + "loss": 0.0154, + "step": 18540 + }, + { + "epoch": 10.797438882421421, + "grad_norm": 0.13686466217041016, + "learning_rate": 8.273565201832602e-05, + "loss": 0.0118, + "step": 18550 + }, + { + "epoch": 10.80325960419092, + "grad_norm": 0.25159212946891785, + "learning_rate": 8.27148167026738e-05, + "loss": 0.0133, + "step": 18560 + }, + { + "epoch": 10.80908032596042, + "grad_norm": 0.20270690321922302, + "learning_rate": 8.269397144912405e-05, + "loss": 0.0176, + "step": 18570 + }, + { + "epoch": 10.814901047729919, + "grad_norm": 0.16580365598201752, + "learning_rate": 8.267311626400899e-05, + "loss": 0.0137, + "step": 18580 + }, + { + "epoch": 10.820721769499418, + "grad_norm": 0.24698670208454132, + "learning_rate": 8.26522511536639e-05, + "loss": 0.0199, + "step": 18590 + }, + { + "epoch": 10.826542491268917, + "grad_norm": 0.22424699366092682, + "learning_rate": 8.263137612442706e-05, + "loss": 0.0157, + "step": 18600 + }, + { + "epoch": 10.832363213038416, + "grad_norm": 0.21760915219783783, + "learning_rate": 8.261049118263971e-05, + "loss": 0.0148, + "step": 18610 + }, + { + "epoch": 10.838183934807915, + "grad_norm": 0.2550506889820099, + "learning_rate": 8.258959633464619e-05, + "loss": 0.019, + "step": 18620 + }, + { + "epoch": 10.844004656577416, + "grad_norm": 0.3006627559661865, + "learning_rate": 8.256869158679377e-05, + "loss": 0.0163, + "step": 18630 + }, + { + "epoch": 10.849825378346916, + "grad_norm": 0.14096316695213318, + "learning_rate": 8.254777694543278e-05, + "loss": 0.017, + "step": 18640 + }, + { + "epoch": 10.855646100116415, + "grad_norm": 0.16482512652873993, + "learning_rate": 8.252685241691651e-05, + "loss": 0.0163, + "step": 18650 + }, + { + "epoch": 10.861466821885914, + "grad_norm": 0.21144314110279083, + "learning_rate": 8.250591800760133e-05, + "loss": 0.0267, + "step": 18660 + }, + { + "epoch": 10.867287543655413, + "grad_norm": 0.3092564344406128, + "learning_rate": 8.248497372384649e-05, + "loss": 0.0208, + "step": 18670 + }, + { + "epoch": 10.873108265424912, + "grad_norm": 0.2367086261510849, + "learning_rate": 8.246401957201437e-05, + "loss": 0.0172, + "step": 18680 + }, + { + "epoch": 10.878928987194412, + "grad_norm": 0.2891194820404053, + "learning_rate": 8.244305555847027e-05, + "loss": 0.0187, + "step": 18690 + }, + { + "epoch": 10.884749708963911, + "grad_norm": 0.2969621419906616, + "learning_rate": 8.24220816895825e-05, + "loss": 0.015, + "step": 18700 + }, + { + "epoch": 10.89057043073341, + "grad_norm": 0.22237373888492584, + "learning_rate": 8.240109797172237e-05, + "loss": 0.0206, + "step": 18710 + }, + { + "epoch": 10.896391152502911, + "grad_norm": 0.2382725477218628, + "learning_rate": 8.238010441126416e-05, + "loss": 0.0198, + "step": 18720 + }, + { + "epoch": 10.90221187427241, + "grad_norm": 0.32382524013519287, + "learning_rate": 8.23591010145852e-05, + "loss": 0.0227, + "step": 18730 + }, + { + "epoch": 10.90803259604191, + "grad_norm": 0.2673744857311249, + "learning_rate": 8.233808778806571e-05, + "loss": 0.0207, + "step": 18740 + }, + { + "epoch": 10.913853317811409, + "grad_norm": 0.31019604206085205, + "learning_rate": 8.231706473808903e-05, + "loss": 0.0188, + "step": 18750 + }, + { + "epoch": 10.919674039580908, + "grad_norm": 0.23554939031600952, + "learning_rate": 8.229603187104133e-05, + "loss": 0.0212, + "step": 18760 + }, + { + "epoch": 10.925494761350407, + "grad_norm": 0.22837324440479279, + "learning_rate": 8.22749891933119e-05, + "loss": 0.0186, + "step": 18770 + }, + { + "epoch": 10.931315483119906, + "grad_norm": 0.26378047466278076, + "learning_rate": 8.225393671129291e-05, + "loss": 0.0216, + "step": 18780 + }, + { + "epoch": 10.937136204889406, + "grad_norm": 0.230399951338768, + "learning_rate": 8.223287443137957e-05, + "loss": 0.0157, + "step": 18790 + }, + { + "epoch": 10.942956926658905, + "grad_norm": 0.2377876192331314, + "learning_rate": 8.221180235997004e-05, + "loss": 0.0152, + "step": 18800 + }, + { + "epoch": 10.948777648428406, + "grad_norm": 0.13325798511505127, + "learning_rate": 8.219072050346544e-05, + "loss": 0.0185, + "step": 18810 + }, + { + "epoch": 10.954598370197905, + "grad_norm": 0.1964443027973175, + "learning_rate": 8.216962886826992e-05, + "loss": 0.0184, + "step": 18820 + }, + { + "epoch": 10.960419091967404, + "grad_norm": 0.23692990839481354, + "learning_rate": 8.214852746079054e-05, + "loss": 0.0203, + "step": 18830 + }, + { + "epoch": 10.966239813736903, + "grad_norm": 0.20464853942394257, + "learning_rate": 8.212741628743732e-05, + "loss": 0.0165, + "step": 18840 + }, + { + "epoch": 10.972060535506403, + "grad_norm": 0.17301097512245178, + "learning_rate": 8.210629535462333e-05, + "loss": 0.0168, + "step": 18850 + }, + { + "epoch": 10.977881257275902, + "grad_norm": 0.2287324219942093, + "learning_rate": 8.208516466876453e-05, + "loss": 0.0174, + "step": 18860 + }, + { + "epoch": 10.983701979045401, + "grad_norm": 0.20610955357551575, + "learning_rate": 8.206402423627986e-05, + "loss": 0.0135, + "step": 18870 + }, + { + "epoch": 10.9895227008149, + "grad_norm": 0.2822803854942322, + "learning_rate": 8.204287406359124e-05, + "loss": 0.0257, + "step": 18880 + }, + { + "epoch": 10.995343422584401, + "grad_norm": 0.20535634458065033, + "learning_rate": 8.20217141571235e-05, + "loss": 0.0188, + "step": 18890 + }, + { + "epoch": 11.0011641443539, + "grad_norm": 0.25398656725883484, + "learning_rate": 8.200054452330449e-05, + "loss": 0.0148, + "step": 18900 + }, + { + "epoch": 11.0069848661234, + "grad_norm": 0.18343505263328552, + "learning_rate": 8.197936516856499e-05, + "loss": 0.0196, + "step": 18910 + }, + { + "epoch": 11.012805587892899, + "grad_norm": 0.3369884788990021, + "learning_rate": 8.195817609933871e-05, + "loss": 0.0186, + "step": 18920 + }, + { + "epoch": 11.018626309662398, + "grad_norm": 0.20851044356822968, + "learning_rate": 8.193697732206233e-05, + "loss": 0.0194, + "step": 18930 + }, + { + "epoch": 11.024447031431897, + "grad_norm": 0.2348732352256775, + "learning_rate": 8.19157688431755e-05, + "loss": 0.0204, + "step": 18940 + }, + { + "epoch": 11.030267753201397, + "grad_norm": 0.2185640186071396, + "learning_rate": 8.189455066912077e-05, + "loss": 0.0195, + "step": 18950 + }, + { + "epoch": 11.036088474970896, + "grad_norm": 0.29503610730171204, + "learning_rate": 8.187332280634369e-05, + "loss": 0.0196, + "step": 18960 + }, + { + "epoch": 11.041909196740395, + "grad_norm": 0.25185060501098633, + "learning_rate": 8.18520852612927e-05, + "loss": 0.0186, + "step": 18970 + }, + { + "epoch": 11.047729918509896, + "grad_norm": 0.24821482598781586, + "learning_rate": 8.183083804041921e-05, + "loss": 0.0237, + "step": 18980 + }, + { + "epoch": 11.053550640279395, + "grad_norm": 0.2073218822479248, + "learning_rate": 8.180958115017757e-05, + "loss": 0.0155, + "step": 18990 + }, + { + "epoch": 11.059371362048894, + "grad_norm": 0.31765761971473694, + "learning_rate": 8.178831459702505e-05, + "loss": 0.025, + "step": 19000 + }, + { + "epoch": 11.065192083818394, + "grad_norm": 0.22924944758415222, + "learning_rate": 8.17670383874219e-05, + "loss": 0.0194, + "step": 19010 + }, + { + "epoch": 11.071012805587893, + "grad_norm": 0.21676258742809296, + "learning_rate": 8.174575252783124e-05, + "loss": 0.0153, + "step": 19020 + }, + { + "epoch": 11.076833527357392, + "grad_norm": 0.24681539833545685, + "learning_rate": 8.172445702471914e-05, + "loss": 0.0232, + "step": 19030 + }, + { + "epoch": 11.082654249126891, + "grad_norm": 0.22984717786312103, + "learning_rate": 8.170315188455466e-05, + "loss": 0.0186, + "step": 19040 + }, + { + "epoch": 11.08847497089639, + "grad_norm": 0.3171824514865875, + "learning_rate": 8.168183711380969e-05, + "loss": 0.0187, + "step": 19050 + }, + { + "epoch": 11.09429569266589, + "grad_norm": 0.25586220622062683, + "learning_rate": 8.166051271895913e-05, + "loss": 0.0189, + "step": 19060 + }, + { + "epoch": 11.10011641443539, + "grad_norm": 0.2146492749452591, + "learning_rate": 8.163917870648075e-05, + "loss": 0.0227, + "step": 19070 + }, + { + "epoch": 11.10593713620489, + "grad_norm": 0.25322866439819336, + "learning_rate": 8.161783508285526e-05, + "loss": 0.0157, + "step": 19080 + }, + { + "epoch": 11.11175785797439, + "grad_norm": 0.20702509582042694, + "learning_rate": 8.159648185456628e-05, + "loss": 0.0187, + "step": 19090 + }, + { + "epoch": 11.117578579743888, + "grad_norm": 0.16563735902309418, + "learning_rate": 8.157511902810038e-05, + "loss": 0.0208, + "step": 19100 + }, + { + "epoch": 11.123399301513388, + "grad_norm": 0.2681899070739746, + "learning_rate": 8.155374660994701e-05, + "loss": 0.0188, + "step": 19110 + }, + { + "epoch": 11.129220023282887, + "grad_norm": 0.22535236179828644, + "learning_rate": 8.153236460659857e-05, + "loss": 0.0214, + "step": 19120 + }, + { + "epoch": 11.135040745052386, + "grad_norm": 0.27642568945884705, + "learning_rate": 8.151097302455031e-05, + "loss": 0.0156, + "step": 19130 + }, + { + "epoch": 11.140861466821885, + "grad_norm": 0.18890736997127533, + "learning_rate": 8.148957187030044e-05, + "loss": 0.023, + "step": 19140 + }, + { + "epoch": 11.146682188591384, + "grad_norm": 0.20868104696273804, + "learning_rate": 8.146816115035006e-05, + "loss": 0.0199, + "step": 19150 + }, + { + "epoch": 11.152502910360885, + "grad_norm": 0.29160934686660767, + "learning_rate": 8.14467408712032e-05, + "loss": 0.0259, + "step": 19160 + }, + { + "epoch": 11.158323632130385, + "grad_norm": 0.2402823269367218, + "learning_rate": 8.142531103936678e-05, + "loss": 0.019, + "step": 19170 + }, + { + "epoch": 11.164144353899884, + "grad_norm": 0.1808064877986908, + "learning_rate": 8.14038716613506e-05, + "loss": 0.0134, + "step": 19180 + }, + { + "epoch": 11.169965075669383, + "grad_norm": 0.1732216328382492, + "learning_rate": 8.138242274366736e-05, + "loss": 0.0193, + "step": 19190 + }, + { + "epoch": 11.175785797438882, + "grad_norm": 0.19221064448356628, + "learning_rate": 8.136096429283271e-05, + "loss": 0.0186, + "step": 19200 + }, + { + "epoch": 11.181606519208382, + "grad_norm": 0.30801984667778015, + "learning_rate": 8.133949631536515e-05, + "loss": 0.0209, + "step": 19210 + }, + { + "epoch": 11.18742724097788, + "grad_norm": 0.2688869833946228, + "learning_rate": 8.131801881778607e-05, + "loss": 0.0242, + "step": 19220 + }, + { + "epoch": 11.19324796274738, + "grad_norm": 0.21007929742336273, + "learning_rate": 8.129653180661978e-05, + "loss": 0.0209, + "step": 19230 + }, + { + "epoch": 11.199068684516881, + "grad_norm": 0.2627229690551758, + "learning_rate": 8.127503528839346e-05, + "loss": 0.0206, + "step": 19240 + }, + { + "epoch": 11.20488940628638, + "grad_norm": 0.2521759569644928, + "learning_rate": 8.125352926963721e-05, + "loss": 0.0257, + "step": 19250 + }, + { + "epoch": 11.21071012805588, + "grad_norm": 0.25335681438446045, + "learning_rate": 8.123201375688395e-05, + "loss": 0.0167, + "step": 19260 + }, + { + "epoch": 11.216530849825379, + "grad_norm": 0.205439954996109, + "learning_rate": 8.121048875666954e-05, + "loss": 0.0174, + "step": 19270 + }, + { + "epoch": 11.222351571594878, + "grad_norm": 0.15452200174331665, + "learning_rate": 8.118895427553274e-05, + "loss": 0.0135, + "step": 19280 + }, + { + "epoch": 11.228172293364377, + "grad_norm": 0.2922830879688263, + "learning_rate": 8.116741032001511e-05, + "loss": 0.0212, + "step": 19290 + }, + { + "epoch": 11.233993015133876, + "grad_norm": 0.2672381103038788, + "learning_rate": 8.114585689666114e-05, + "loss": 0.0187, + "step": 19300 + }, + { + "epoch": 11.239813736903375, + "grad_norm": 0.17447736859321594, + "learning_rate": 8.112429401201821e-05, + "loss": 0.0147, + "step": 19310 + }, + { + "epoch": 11.245634458672875, + "grad_norm": 0.1783382147550583, + "learning_rate": 8.110272167263656e-05, + "loss": 0.0194, + "step": 19320 + }, + { + "epoch": 11.251455180442376, + "grad_norm": 0.17631365358829498, + "learning_rate": 8.108113988506929e-05, + "loss": 0.018, + "step": 19330 + }, + { + "epoch": 11.257275902211875, + "grad_norm": 0.14696697890758514, + "learning_rate": 8.105954865587235e-05, + "loss": 0.0171, + "step": 19340 + }, + { + "epoch": 11.263096623981374, + "grad_norm": 0.20815768837928772, + "learning_rate": 8.103794799160463e-05, + "loss": 0.0158, + "step": 19350 + }, + { + "epoch": 11.268917345750873, + "grad_norm": 0.25097692012786865, + "learning_rate": 8.101633789882781e-05, + "loss": 0.0217, + "step": 19360 + }, + { + "epoch": 11.274738067520373, + "grad_norm": 0.1942133754491806, + "learning_rate": 8.099471838410648e-05, + "loss": 0.0153, + "step": 19370 + }, + { + "epoch": 11.280558789289872, + "grad_norm": 0.19854331016540527, + "learning_rate": 8.097308945400806e-05, + "loss": 0.0172, + "step": 19380 + }, + { + "epoch": 11.286379511059371, + "grad_norm": 0.21126487851142883, + "learning_rate": 8.095145111510288e-05, + "loss": 0.0156, + "step": 19390 + }, + { + "epoch": 11.29220023282887, + "grad_norm": 0.24046725034713745, + "learning_rate": 8.092980337396406e-05, + "loss": 0.0137, + "step": 19400 + }, + { + "epoch": 11.29802095459837, + "grad_norm": 0.26102691888809204, + "learning_rate": 8.090814623716763e-05, + "loss": 0.0186, + "step": 19410 + }, + { + "epoch": 11.30384167636787, + "grad_norm": 0.21979552507400513, + "learning_rate": 8.088647971129246e-05, + "loss": 0.0176, + "step": 19420 + }, + { + "epoch": 11.30966239813737, + "grad_norm": 0.16729988157749176, + "learning_rate": 8.086480380292026e-05, + "loss": 0.0181, + "step": 19430 + }, + { + "epoch": 11.315483119906869, + "grad_norm": 0.18871034681797028, + "learning_rate": 8.084311851863562e-05, + "loss": 0.015, + "step": 19440 + }, + { + "epoch": 11.321303841676368, + "grad_norm": 0.31152650713920593, + "learning_rate": 8.082142386502591e-05, + "loss": 0.0168, + "step": 19450 + }, + { + "epoch": 11.327124563445867, + "grad_norm": 0.2415240854024887, + "learning_rate": 8.079971984868145e-05, + "loss": 0.0222, + "step": 19460 + }, + { + "epoch": 11.332945285215366, + "grad_norm": 0.3086613416671753, + "learning_rate": 8.077800647619532e-05, + "loss": 0.0192, + "step": 19470 + }, + { + "epoch": 11.338766006984866, + "grad_norm": 0.1784486621618271, + "learning_rate": 8.075628375416345e-05, + "loss": 0.0147, + "step": 19480 + }, + { + "epoch": 11.344586728754365, + "grad_norm": 0.27019423246383667, + "learning_rate": 8.073455168918464e-05, + "loss": 0.0188, + "step": 19490 + }, + { + "epoch": 11.350407450523864, + "grad_norm": 0.22066685557365417, + "learning_rate": 8.071281028786055e-05, + "loss": 0.0159, + "step": 19500 + }, + { + "epoch": 11.356228172293365, + "grad_norm": 0.2047853171825409, + "learning_rate": 8.069105955679562e-05, + "loss": 0.0166, + "step": 19510 + }, + { + "epoch": 11.362048894062864, + "grad_norm": 0.20914235711097717, + "learning_rate": 8.066929950259713e-05, + "loss": 0.019, + "step": 19520 + }, + { + "epoch": 11.367869615832364, + "grad_norm": 0.19665449857711792, + "learning_rate": 8.064753013187522e-05, + "loss": 0.0153, + "step": 19530 + }, + { + "epoch": 11.373690337601863, + "grad_norm": 0.21649004518985748, + "learning_rate": 8.062575145124289e-05, + "loss": 0.0159, + "step": 19540 + }, + { + "epoch": 11.379511059371362, + "grad_norm": 0.23354151844978333, + "learning_rate": 8.060396346731587e-05, + "loss": 0.0172, + "step": 19550 + }, + { + "epoch": 11.385331781140861, + "grad_norm": 0.1811951994895935, + "learning_rate": 8.058216618671281e-05, + "loss": 0.0142, + "step": 19560 + }, + { + "epoch": 11.39115250291036, + "grad_norm": 0.2506617307662964, + "learning_rate": 8.056035961605514e-05, + "loss": 0.0147, + "step": 19570 + }, + { + "epoch": 11.39697322467986, + "grad_norm": 0.18809330463409424, + "learning_rate": 8.05385437619671e-05, + "loss": 0.0222, + "step": 19580 + }, + { + "epoch": 11.40279394644936, + "grad_norm": 0.16152116656303406, + "learning_rate": 8.05167186310758e-05, + "loss": 0.0133, + "step": 19590 + }, + { + "epoch": 11.40861466821886, + "grad_norm": 0.19095440208911896, + "learning_rate": 8.049488423001113e-05, + "loss": 0.0156, + "step": 19600 + }, + { + "epoch": 11.414435389988359, + "grad_norm": 0.27824804186820984, + "learning_rate": 8.047304056540581e-05, + "loss": 0.0189, + "step": 19610 + }, + { + "epoch": 11.420256111757858, + "grad_norm": 0.2817663550376892, + "learning_rate": 8.045118764389534e-05, + "loss": 0.0196, + "step": 19620 + }, + { + "epoch": 11.426076833527357, + "grad_norm": 0.21375985443592072, + "learning_rate": 8.042932547211809e-05, + "loss": 0.0134, + "step": 19630 + }, + { + "epoch": 11.431897555296857, + "grad_norm": 0.22229167819023132, + "learning_rate": 8.04074540567152e-05, + "loss": 0.0186, + "step": 19640 + }, + { + "epoch": 11.437718277066356, + "grad_norm": 0.2735293209552765, + "learning_rate": 8.038557340433063e-05, + "loss": 0.0263, + "step": 19650 + }, + { + "epoch": 11.443538998835855, + "grad_norm": 0.24309708178043365, + "learning_rate": 8.036368352161115e-05, + "loss": 0.017, + "step": 19660 + }, + { + "epoch": 11.449359720605354, + "grad_norm": 0.20526811480522156, + "learning_rate": 8.034178441520633e-05, + "loss": 0.0174, + "step": 19670 + }, + { + "epoch": 11.455180442374855, + "grad_norm": 0.25283145904541016, + "learning_rate": 8.031987609176852e-05, + "loss": 0.0204, + "step": 19680 + }, + { + "epoch": 11.461001164144355, + "grad_norm": 0.16945886611938477, + "learning_rate": 8.02979585579529e-05, + "loss": 0.0166, + "step": 19690 + }, + { + "epoch": 11.466821885913854, + "grad_norm": 0.2441062480211258, + "learning_rate": 8.027603182041745e-05, + "loss": 0.0241, + "step": 19700 + }, + { + "epoch": 11.472642607683353, + "grad_norm": 0.171798437833786, + "learning_rate": 8.025409588582292e-05, + "loss": 0.0222, + "step": 19710 + }, + { + "epoch": 11.478463329452852, + "grad_norm": 0.3086794316768646, + "learning_rate": 8.023215076083288e-05, + "loss": 0.0208, + "step": 19720 + }, + { + "epoch": 11.484284051222351, + "grad_norm": 0.20706917345523834, + "learning_rate": 8.021019645211367e-05, + "loss": 0.0162, + "step": 19730 + }, + { + "epoch": 11.49010477299185, + "grad_norm": 0.24100975692272186, + "learning_rate": 8.018823296633441e-05, + "loss": 0.0171, + "step": 19740 + }, + { + "epoch": 11.49592549476135, + "grad_norm": 0.21652235090732574, + "learning_rate": 8.016626031016708e-05, + "loss": 0.0221, + "step": 19750 + }, + { + "epoch": 11.501746216530849, + "grad_norm": 0.2320483773946762, + "learning_rate": 8.014427849028636e-05, + "loss": 0.0151, + "step": 19760 + }, + { + "epoch": 11.50756693830035, + "grad_norm": 0.31503355503082275, + "learning_rate": 8.012228751336974e-05, + "loss": 0.0186, + "step": 19770 + }, + { + "epoch": 11.51338766006985, + "grad_norm": 0.3266433775424957, + "learning_rate": 8.01002873860975e-05, + "loss": 0.0247, + "step": 19780 + }, + { + "epoch": 11.519208381839348, + "grad_norm": 0.2287236899137497, + "learning_rate": 8.00782781151527e-05, + "loss": 0.0174, + "step": 19790 + }, + { + "epoch": 11.525029103608848, + "grad_norm": 0.17216549813747406, + "learning_rate": 8.005625970722119e-05, + "loss": 0.0193, + "step": 19800 + }, + { + "epoch": 11.530849825378347, + "grad_norm": 0.24620038270950317, + "learning_rate": 8.003423216899158e-05, + "loss": 0.0143, + "step": 19810 + }, + { + "epoch": 11.536670547147846, + "grad_norm": 0.18587343394756317, + "learning_rate": 8.001219550715522e-05, + "loss": 0.0209, + "step": 19820 + }, + { + "epoch": 11.542491268917345, + "grad_norm": 0.16814132034778595, + "learning_rate": 7.999014972840632e-05, + "loss": 0.0252, + "step": 19830 + }, + { + "epoch": 11.548311990686845, + "grad_norm": 0.22083908319473267, + "learning_rate": 7.996809483944174e-05, + "loss": 0.0158, + "step": 19840 + }, + { + "epoch": 11.554132712456344, + "grad_norm": 0.19533054530620575, + "learning_rate": 7.994603084696124e-05, + "loss": 0.0137, + "step": 19850 + }, + { + "epoch": 11.559953434225845, + "grad_norm": 0.19559234380722046, + "learning_rate": 7.992395775766724e-05, + "loss": 0.0148, + "step": 19860 + }, + { + "epoch": 11.565774155995344, + "grad_norm": 0.17013007402420044, + "learning_rate": 7.990187557826497e-05, + "loss": 0.0222, + "step": 19870 + }, + { + "epoch": 11.571594877764843, + "grad_norm": 0.16111983358860016, + "learning_rate": 7.987978431546242e-05, + "loss": 0.0176, + "step": 19880 + }, + { + "epoch": 11.577415599534342, + "grad_norm": 0.22499506175518036, + "learning_rate": 7.985768397597031e-05, + "loss": 0.0183, + "step": 19890 + }, + { + "epoch": 11.583236321303842, + "grad_norm": 0.2535330355167389, + "learning_rate": 7.983557456650216e-05, + "loss": 0.0191, + "step": 19900 + }, + { + "epoch": 11.58905704307334, + "grad_norm": 0.3245653808116913, + "learning_rate": 7.981345609377422e-05, + "loss": 0.0185, + "step": 19910 + }, + { + "epoch": 11.59487776484284, + "grad_norm": 0.21642908453941345, + "learning_rate": 7.97913285645055e-05, + "loss": 0.0163, + "step": 19920 + }, + { + "epoch": 11.60069848661234, + "grad_norm": 0.260347843170166, + "learning_rate": 7.976919198541776e-05, + "loss": 0.0152, + "step": 19930 + }, + { + "epoch": 11.60651920838184, + "grad_norm": 0.27145302295684814, + "learning_rate": 7.974704636323548e-05, + "loss": 0.0188, + "step": 19940 + }, + { + "epoch": 11.61233993015134, + "grad_norm": 0.15297627449035645, + "learning_rate": 7.972489170468597e-05, + "loss": 0.0129, + "step": 19950 + }, + { + "epoch": 11.618160651920839, + "grad_norm": 0.24000860750675201, + "learning_rate": 7.970272801649918e-05, + "loss": 0.0177, + "step": 19960 + }, + { + "epoch": 11.623981373690338, + "grad_norm": 0.19117452204227448, + "learning_rate": 7.96805553054079e-05, + "loss": 0.0191, + "step": 19970 + }, + { + "epoch": 11.629802095459837, + "grad_norm": 0.21297620236873627, + "learning_rate": 7.965837357814756e-05, + "loss": 0.014, + "step": 19980 + }, + { + "epoch": 11.635622817229336, + "grad_norm": 0.21617533266544342, + "learning_rate": 7.963618284145643e-05, + "loss": 0.0184, + "step": 19990 + }, + { + "epoch": 11.641443538998836, + "grad_norm": 0.2627561688423157, + "learning_rate": 7.961398310207544e-05, + "loss": 0.0201, + "step": 20000 + }, + { + "epoch": 11.647264260768335, + "grad_norm": 0.17411494255065918, + "learning_rate": 7.95917743667483e-05, + "loss": 0.0182, + "step": 20010 + }, + { + "epoch": 11.653084982537834, + "grad_norm": 0.1702374368906021, + "learning_rate": 7.956955664222144e-05, + "loss": 0.0182, + "step": 20020 + }, + { + "epoch": 11.658905704307333, + "grad_norm": 0.21468934416770935, + "learning_rate": 7.954732993524399e-05, + "loss": 0.0167, + "step": 20030 + }, + { + "epoch": 11.664726426076834, + "grad_norm": 0.20127710700035095, + "learning_rate": 7.952509425256786e-05, + "loss": 0.0166, + "step": 20040 + }, + { + "epoch": 11.670547147846333, + "grad_norm": 0.2058652937412262, + "learning_rate": 7.950284960094767e-05, + "loss": 0.0167, + "step": 20050 + }, + { + "epoch": 11.676367869615833, + "grad_norm": 0.2613903880119324, + "learning_rate": 7.948059598714076e-05, + "loss": 0.0162, + "step": 20060 + }, + { + "epoch": 11.682188591385332, + "grad_norm": 0.3248015344142914, + "learning_rate": 7.945833341790717e-05, + "loss": 0.0203, + "step": 20070 + }, + { + "epoch": 11.688009313154831, + "grad_norm": 0.20409666001796722, + "learning_rate": 7.94360619000097e-05, + "loss": 0.0209, + "step": 20080 + }, + { + "epoch": 11.69383003492433, + "grad_norm": 0.1596234142780304, + "learning_rate": 7.941378144021381e-05, + "loss": 0.0159, + "step": 20090 + }, + { + "epoch": 11.69965075669383, + "grad_norm": 0.19008305668830872, + "learning_rate": 7.939149204528777e-05, + "loss": 0.0155, + "step": 20100 + }, + { + "epoch": 11.705471478463329, + "grad_norm": 0.254766583442688, + "learning_rate": 7.936919372200246e-05, + "loss": 0.0187, + "step": 20110 + }, + { + "epoch": 11.71129220023283, + "grad_norm": 0.16489793360233307, + "learning_rate": 7.934688647713158e-05, + "loss": 0.0157, + "step": 20120 + }, + { + "epoch": 11.717112922002329, + "grad_norm": 0.21005848050117493, + "learning_rate": 7.932457031745143e-05, + "loss": 0.017, + "step": 20130 + }, + { + "epoch": 11.722933643771828, + "grad_norm": 0.20026734471321106, + "learning_rate": 7.930224524974108e-05, + "loss": 0.0185, + "step": 20140 + }, + { + "epoch": 11.728754365541327, + "grad_norm": 0.1739179790019989, + "learning_rate": 7.927991128078232e-05, + "loss": 0.0143, + "step": 20150 + }, + { + "epoch": 11.734575087310827, + "grad_norm": 0.1499994993209839, + "learning_rate": 7.925756841735958e-05, + "loss": 0.0154, + "step": 20160 + }, + { + "epoch": 11.740395809080326, + "grad_norm": 0.2603696882724762, + "learning_rate": 7.923521666626008e-05, + "loss": 0.0139, + "step": 20170 + }, + { + "epoch": 11.746216530849825, + "grad_norm": 0.19742295145988464, + "learning_rate": 7.921285603427366e-05, + "loss": 0.0136, + "step": 20180 + }, + { + "epoch": 11.752037252619324, + "grad_norm": 0.188218891620636, + "learning_rate": 7.91904865281929e-05, + "loss": 0.0158, + "step": 20190 + }, + { + "epoch": 11.757857974388823, + "grad_norm": 0.3114897608757019, + "learning_rate": 7.916810815481307e-05, + "loss": 0.0154, + "step": 20200 + }, + { + "epoch": 11.763678696158324, + "grad_norm": 0.22178000211715698, + "learning_rate": 7.914572092093211e-05, + "loss": 0.0161, + "step": 20210 + }, + { + "epoch": 11.769499417927824, + "grad_norm": 0.1730736494064331, + "learning_rate": 7.912332483335068e-05, + "loss": 0.0161, + "step": 20220 + }, + { + "epoch": 11.775320139697323, + "grad_norm": 0.2279079109430313, + "learning_rate": 7.910091989887213e-05, + "loss": 0.0147, + "step": 20230 + }, + { + "epoch": 11.781140861466822, + "grad_norm": 0.180404394865036, + "learning_rate": 7.907850612430248e-05, + "loss": 0.0177, + "step": 20240 + }, + { + "epoch": 11.786961583236321, + "grad_norm": 0.16237911581993103, + "learning_rate": 7.905608351645044e-05, + "loss": 0.0133, + "step": 20250 + }, + { + "epoch": 11.79278230500582, + "grad_norm": 0.1835232526063919, + "learning_rate": 7.90336520821274e-05, + "loss": 0.0161, + "step": 20260 + }, + { + "epoch": 11.79860302677532, + "grad_norm": 0.16870100796222687, + "learning_rate": 7.901121182814746e-05, + "loss": 0.0137, + "step": 20270 + }, + { + "epoch": 11.804423748544819, + "grad_norm": 0.23639777302742004, + "learning_rate": 7.898876276132736e-05, + "loss": 0.0155, + "step": 20280 + }, + { + "epoch": 11.81024447031432, + "grad_norm": 0.21817871928215027, + "learning_rate": 7.896630488848654e-05, + "loss": 0.0173, + "step": 20290 + }, + { + "epoch": 11.81606519208382, + "grad_norm": 0.2574286460876465, + "learning_rate": 7.89438382164471e-05, + "loss": 0.0169, + "step": 20300 + }, + { + "epoch": 11.821885913853318, + "grad_norm": 0.28169262409210205, + "learning_rate": 7.892136275203383e-05, + "loss": 0.0164, + "step": 20310 + }, + { + "epoch": 11.827706635622818, + "grad_norm": 0.21902769804000854, + "learning_rate": 7.889887850207418e-05, + "loss": 0.0157, + "step": 20320 + }, + { + "epoch": 11.833527357392317, + "grad_norm": 0.2378809005022049, + "learning_rate": 7.887638547339827e-05, + "loss": 0.016, + "step": 20330 + }, + { + "epoch": 11.839348079161816, + "grad_norm": 0.20796839892864227, + "learning_rate": 7.885388367283891e-05, + "loss": 0.0155, + "step": 20340 + }, + { + "epoch": 11.845168800931315, + "grad_norm": 0.26265019178390503, + "learning_rate": 7.88313731072315e-05, + "loss": 0.0159, + "step": 20350 + }, + { + "epoch": 11.850989522700814, + "grad_norm": 0.3141365349292755, + "learning_rate": 7.88088537834142e-05, + "loss": 0.0165, + "step": 20360 + }, + { + "epoch": 11.856810244470314, + "grad_norm": 0.29016250371932983, + "learning_rate": 7.878632570822778e-05, + "loss": 0.0159, + "step": 20370 + }, + { + "epoch": 11.862630966239813, + "grad_norm": 0.27584153413772583, + "learning_rate": 7.876378888851567e-05, + "loss": 0.0198, + "step": 20380 + }, + { + "epoch": 11.868451688009314, + "grad_norm": 0.25728175044059753, + "learning_rate": 7.874124333112396e-05, + "loss": 0.021, + "step": 20390 + }, + { + "epoch": 11.874272409778813, + "grad_norm": 0.19174371659755707, + "learning_rate": 7.871868904290138e-05, + "loss": 0.0162, + "step": 20400 + }, + { + "epoch": 11.880093131548312, + "grad_norm": 0.22612838447093964, + "learning_rate": 7.869612603069935e-05, + "loss": 0.0206, + "step": 20410 + }, + { + "epoch": 11.885913853317811, + "grad_norm": 0.2280396968126297, + "learning_rate": 7.867355430137192e-05, + "loss": 0.0147, + "step": 20420 + }, + { + "epoch": 11.89173457508731, + "grad_norm": 0.23797328770160675, + "learning_rate": 7.865097386177577e-05, + "loss": 0.0128, + "step": 20430 + }, + { + "epoch": 11.89755529685681, + "grad_norm": 0.25045567750930786, + "learning_rate": 7.862838471877023e-05, + "loss": 0.019, + "step": 20440 + }, + { + "epoch": 11.90337601862631, + "grad_norm": 0.22676895558834076, + "learning_rate": 7.860578687921731e-05, + "loss": 0.0174, + "step": 20450 + }, + { + "epoch": 11.909196740395808, + "grad_norm": 0.2341916412115097, + "learning_rate": 7.858318034998164e-05, + "loss": 0.0171, + "step": 20460 + }, + { + "epoch": 11.91501746216531, + "grad_norm": 0.18340864777565002, + "learning_rate": 7.856056513793046e-05, + "loss": 0.0135, + "step": 20470 + }, + { + "epoch": 11.920838183934809, + "grad_norm": 0.24467894434928894, + "learning_rate": 7.85379412499337e-05, + "loss": 0.0159, + "step": 20480 + }, + { + "epoch": 11.926658905704308, + "grad_norm": 0.13259612023830414, + "learning_rate": 7.851530869286389e-05, + "loss": 0.0158, + "step": 20490 + }, + { + "epoch": 11.932479627473807, + "grad_norm": 0.18937689065933228, + "learning_rate": 7.849266747359619e-05, + "loss": 0.0167, + "step": 20500 + }, + { + "epoch": 11.938300349243306, + "grad_norm": 0.24204155802726746, + "learning_rate": 7.847001759900843e-05, + "loss": 0.0129, + "step": 20510 + }, + { + "epoch": 11.944121071012805, + "grad_norm": 0.24233794212341309, + "learning_rate": 7.844735907598102e-05, + "loss": 0.0222, + "step": 20520 + }, + { + "epoch": 11.949941792782305, + "grad_norm": 0.1752784550189972, + "learning_rate": 7.842469191139703e-05, + "loss": 0.0171, + "step": 20530 + }, + { + "epoch": 11.955762514551804, + "grad_norm": 0.15093624591827393, + "learning_rate": 7.840201611214215e-05, + "loss": 0.0172, + "step": 20540 + }, + { + "epoch": 11.961583236321303, + "grad_norm": 0.2421426922082901, + "learning_rate": 7.837933168510469e-05, + "loss": 0.0245, + "step": 20550 + }, + { + "epoch": 11.967403958090804, + "grad_norm": 0.2619166672229767, + "learning_rate": 7.835663863717559e-05, + "loss": 0.0201, + "step": 20560 + }, + { + "epoch": 11.973224679860303, + "grad_norm": 0.18784230947494507, + "learning_rate": 7.833393697524838e-05, + "loss": 0.0217, + "step": 20570 + }, + { + "epoch": 11.979045401629802, + "grad_norm": 0.2925585210323334, + "learning_rate": 7.831122670621922e-05, + "loss": 0.0183, + "step": 20580 + }, + { + "epoch": 11.984866123399302, + "grad_norm": 0.19713769853115082, + "learning_rate": 7.82885078369869e-05, + "loss": 0.0162, + "step": 20590 + }, + { + "epoch": 11.990686845168801, + "grad_norm": 0.1721346378326416, + "learning_rate": 7.826578037445283e-05, + "loss": 0.0139, + "step": 20600 + }, + { + "epoch": 11.9965075669383, + "grad_norm": 0.17323729395866394, + "learning_rate": 7.824304432552097e-05, + "loss": 0.0135, + "step": 20610 + }, + { + "epoch": 12.0023282887078, + "grad_norm": 0.19746924936771393, + "learning_rate": 7.822029969709798e-05, + "loss": 0.0185, + "step": 20620 + }, + { + "epoch": 12.008149010477299, + "grad_norm": 0.1425185352563858, + "learning_rate": 7.819754649609306e-05, + "loss": 0.022, + "step": 20630 + }, + { + "epoch": 12.013969732246798, + "grad_norm": 0.10329946875572205, + "learning_rate": 7.817478472941802e-05, + "loss": 0.0193, + "step": 20640 + }, + { + "epoch": 12.019790454016299, + "grad_norm": 0.13250726461410522, + "learning_rate": 7.815201440398727e-05, + "loss": 0.0114, + "step": 20650 + }, + { + "epoch": 12.025611175785798, + "grad_norm": 0.22921691834926605, + "learning_rate": 7.812923552671789e-05, + "loss": 0.0187, + "step": 20660 + }, + { + "epoch": 12.031431897555297, + "grad_norm": 0.1604098081588745, + "learning_rate": 7.810644810452945e-05, + "loss": 0.0137, + "step": 20670 + }, + { + "epoch": 12.037252619324796, + "grad_norm": 0.25434768199920654, + "learning_rate": 7.808365214434417e-05, + "loss": 0.0177, + "step": 20680 + }, + { + "epoch": 12.043073341094296, + "grad_norm": 0.14137497544288635, + "learning_rate": 7.80608476530869e-05, + "loss": 0.017, + "step": 20690 + }, + { + "epoch": 12.048894062863795, + "grad_norm": 0.165045365691185, + "learning_rate": 7.8038034637685e-05, + "loss": 0.0139, + "step": 20700 + }, + { + "epoch": 12.054714784633294, + "grad_norm": 0.17686539888381958, + "learning_rate": 7.801521310506848e-05, + "loss": 0.0147, + "step": 20710 + }, + { + "epoch": 12.060535506402793, + "grad_norm": 0.2457243949174881, + "learning_rate": 7.799238306216994e-05, + "loss": 0.012, + "step": 20720 + }, + { + "epoch": 12.066356228172292, + "grad_norm": 0.26829853653907776, + "learning_rate": 7.796954451592448e-05, + "loss": 0.0184, + "step": 20730 + }, + { + "epoch": 12.072176949941793, + "grad_norm": 0.253937691450119, + "learning_rate": 7.794669747326992e-05, + "loss": 0.0161, + "step": 20740 + }, + { + "epoch": 12.077997671711293, + "grad_norm": 0.30259475111961365, + "learning_rate": 7.792384194114654e-05, + "loss": 0.0143, + "step": 20750 + }, + { + "epoch": 12.083818393480792, + "grad_norm": 0.24508382380008698, + "learning_rate": 7.790097792649729e-05, + "loss": 0.0187, + "step": 20760 + }, + { + "epoch": 12.089639115250291, + "grad_norm": 0.24212339520454407, + "learning_rate": 7.787810543626762e-05, + "loss": 0.0169, + "step": 20770 + }, + { + "epoch": 12.09545983701979, + "grad_norm": 0.28533434867858887, + "learning_rate": 7.785522447740558e-05, + "loss": 0.0159, + "step": 20780 + }, + { + "epoch": 12.10128055878929, + "grad_norm": 0.18227174878120422, + "learning_rate": 7.783233505686182e-05, + "loss": 0.0237, + "step": 20790 + }, + { + "epoch": 12.107101280558789, + "grad_norm": 0.16641297936439514, + "learning_rate": 7.780943718158955e-05, + "loss": 0.0132, + "step": 20800 + }, + { + "epoch": 12.112922002328288, + "grad_norm": 0.23089095950126648, + "learning_rate": 7.778653085854453e-05, + "loss": 0.0176, + "step": 20810 + }, + { + "epoch": 12.118742724097789, + "grad_norm": 0.1894545704126358, + "learning_rate": 7.77636160946851e-05, + "loss": 0.0145, + "step": 20820 + }, + { + "epoch": 12.124563445867288, + "grad_norm": 0.20238475501537323, + "learning_rate": 7.774069289697215e-05, + "loss": 0.0168, + "step": 20830 + }, + { + "epoch": 12.130384167636787, + "grad_norm": 0.16456246376037598, + "learning_rate": 7.771776127236913e-05, + "loss": 0.0141, + "step": 20840 + }, + { + "epoch": 12.136204889406287, + "grad_norm": 0.12468015402555466, + "learning_rate": 7.769482122784212e-05, + "loss": 0.0138, + "step": 20850 + }, + { + "epoch": 12.142025611175786, + "grad_norm": 0.24469716846942902, + "learning_rate": 7.767187277035963e-05, + "loss": 0.0147, + "step": 20860 + }, + { + "epoch": 12.147846332945285, + "grad_norm": 0.2076352834701538, + "learning_rate": 7.764891590689285e-05, + "loss": 0.0171, + "step": 20870 + }, + { + "epoch": 12.153667054714784, + "grad_norm": 0.19082427024841309, + "learning_rate": 7.762595064441542e-05, + "loss": 0.0172, + "step": 20880 + }, + { + "epoch": 12.159487776484283, + "grad_norm": 0.26109203696250916, + "learning_rate": 7.760297698990362e-05, + "loss": 0.014, + "step": 20890 + }, + { + "epoch": 12.165308498253783, + "grad_norm": 0.19281694293022156, + "learning_rate": 7.757999495033623e-05, + "loss": 0.0167, + "step": 20900 + }, + { + "epoch": 12.171129220023284, + "grad_norm": 0.17814026772975922, + "learning_rate": 7.755700453269456e-05, + "loss": 0.0103, + "step": 20910 + }, + { + "epoch": 12.176949941792783, + "grad_norm": 0.21510113775730133, + "learning_rate": 7.753400574396254e-05, + "loss": 0.0167, + "step": 20920 + }, + { + "epoch": 12.182770663562282, + "grad_norm": 0.28264766931533813, + "learning_rate": 7.751099859112655e-05, + "loss": 0.0186, + "step": 20930 + }, + { + "epoch": 12.188591385331781, + "grad_norm": 0.19979433715343475, + "learning_rate": 7.748798308117557e-05, + "loss": 0.0139, + "step": 20940 + }, + { + "epoch": 12.19441210710128, + "grad_norm": 0.1721186488866806, + "learning_rate": 7.746495922110112e-05, + "loss": 0.0154, + "step": 20950 + }, + { + "epoch": 12.20023282887078, + "grad_norm": 0.21921850740909576, + "learning_rate": 7.744192701789723e-05, + "loss": 0.0151, + "step": 20960 + }, + { + "epoch": 12.206053550640279, + "grad_norm": 0.19674888253211975, + "learning_rate": 7.741888647856046e-05, + "loss": 0.0147, + "step": 20970 + }, + { + "epoch": 12.211874272409778, + "grad_norm": 0.19540442526340485, + "learning_rate": 7.739583761008994e-05, + "loss": 0.0167, + "step": 20980 + }, + { + "epoch": 12.217694994179277, + "grad_norm": 0.2261795699596405, + "learning_rate": 7.73727804194873e-05, + "loss": 0.0151, + "step": 20990 + }, + { + "epoch": 12.223515715948778, + "grad_norm": 0.26893723011016846, + "learning_rate": 7.734971491375671e-05, + "loss": 0.0145, + "step": 21000 + }, + { + "epoch": 12.229336437718278, + "grad_norm": 0.3287508189678192, + "learning_rate": 7.732664109990485e-05, + "loss": 0.016, + "step": 21010 + }, + { + "epoch": 12.235157159487777, + "grad_norm": 0.2458789199590683, + "learning_rate": 7.730355898494095e-05, + "loss": 0.0239, + "step": 21020 + }, + { + "epoch": 12.240977881257276, + "grad_norm": 0.20077748596668243, + "learning_rate": 7.728046857587673e-05, + "loss": 0.0199, + "step": 21030 + }, + { + "epoch": 12.246798603026775, + "grad_norm": 0.24284479022026062, + "learning_rate": 7.725736987972647e-05, + "loss": 0.0158, + "step": 21040 + }, + { + "epoch": 12.252619324796274, + "grad_norm": 0.2172631472349167, + "learning_rate": 7.723426290350691e-05, + "loss": 0.0177, + "step": 21050 + }, + { + "epoch": 12.258440046565774, + "grad_norm": 0.2208704650402069, + "learning_rate": 7.721114765423736e-05, + "loss": 0.0198, + "step": 21060 + }, + { + "epoch": 12.264260768335273, + "grad_norm": 0.19737696647644043, + "learning_rate": 7.718802413893963e-05, + "loss": 0.0144, + "step": 21070 + }, + { + "epoch": 12.270081490104772, + "grad_norm": 0.16809995472431183, + "learning_rate": 7.716489236463802e-05, + "loss": 0.0161, + "step": 21080 + }, + { + "epoch": 12.275902211874273, + "grad_norm": 0.1880379319190979, + "learning_rate": 7.714175233835936e-05, + "loss": 0.0129, + "step": 21090 + }, + { + "epoch": 12.281722933643772, + "grad_norm": 0.2483110874891281, + "learning_rate": 7.711860406713299e-05, + "loss": 0.0144, + "step": 21100 + }, + { + "epoch": 12.287543655413272, + "grad_norm": 0.141978457570076, + "learning_rate": 7.70954475579907e-05, + "loss": 0.0136, + "step": 21110 + }, + { + "epoch": 12.29336437718277, + "grad_norm": 0.20658130943775177, + "learning_rate": 7.707228281796688e-05, + "loss": 0.0155, + "step": 21120 + }, + { + "epoch": 12.29918509895227, + "grad_norm": 0.30627480149269104, + "learning_rate": 7.704910985409833e-05, + "loss": 0.0162, + "step": 21130 + }, + { + "epoch": 12.30500582072177, + "grad_norm": 0.10079514980316162, + "learning_rate": 7.702592867342439e-05, + "loss": 0.0167, + "step": 21140 + }, + { + "epoch": 12.310826542491268, + "grad_norm": 0.24622513353824615, + "learning_rate": 7.700273928298691e-05, + "loss": 0.0147, + "step": 21150 + }, + { + "epoch": 12.316647264260768, + "grad_norm": 0.26937249302864075, + "learning_rate": 7.697954168983021e-05, + "loss": 0.0161, + "step": 21160 + }, + { + "epoch": 12.322467986030269, + "grad_norm": 0.16641119122505188, + "learning_rate": 7.695633590100109e-05, + "loss": 0.0178, + "step": 21170 + }, + { + "epoch": 12.328288707799768, + "grad_norm": 0.18821941316127777, + "learning_rate": 7.693312192354886e-05, + "loss": 0.0168, + "step": 21180 + }, + { + "epoch": 12.334109429569267, + "grad_norm": 0.15231360495090485, + "learning_rate": 7.690989976452532e-05, + "loss": 0.0149, + "step": 21190 + }, + { + "epoch": 12.339930151338766, + "grad_norm": 0.20581334829330444, + "learning_rate": 7.688666943098475e-05, + "loss": 0.0158, + "step": 21200 + }, + { + "epoch": 12.345750873108265, + "grad_norm": 0.16680045425891876, + "learning_rate": 7.686343092998389e-05, + "loss": 0.0152, + "step": 21210 + }, + { + "epoch": 12.351571594877765, + "grad_norm": 0.22436363995075226, + "learning_rate": 7.684018426858202e-05, + "loss": 0.016, + "step": 21220 + }, + { + "epoch": 12.357392316647264, + "grad_norm": 0.1857082098722458, + "learning_rate": 7.681692945384084e-05, + "loss": 0.0159, + "step": 21230 + }, + { + "epoch": 12.363213038416763, + "grad_norm": 0.1579626202583313, + "learning_rate": 7.679366649282456e-05, + "loss": 0.0118, + "step": 21240 + }, + { + "epoch": 12.369033760186262, + "grad_norm": 0.2649032175540924, + "learning_rate": 7.677039539259983e-05, + "loss": 0.016, + "step": 21250 + }, + { + "epoch": 12.374854481955763, + "grad_norm": 0.15498220920562744, + "learning_rate": 7.674711616023581e-05, + "loss": 0.0154, + "step": 21260 + }, + { + "epoch": 12.380675203725263, + "grad_norm": 0.1908063292503357, + "learning_rate": 7.672382880280413e-05, + "loss": 0.0153, + "step": 21270 + }, + { + "epoch": 12.386495925494762, + "grad_norm": 0.25102922320365906, + "learning_rate": 7.670053332737885e-05, + "loss": 0.015, + "step": 21280 + }, + { + "epoch": 12.392316647264261, + "grad_norm": 0.17627394199371338, + "learning_rate": 7.667722974103654e-05, + "loss": 0.0131, + "step": 21290 + }, + { + "epoch": 12.39813736903376, + "grad_norm": 0.3068438172340393, + "learning_rate": 7.66539180508562e-05, + "loss": 0.019, + "step": 21300 + }, + { + "epoch": 12.40395809080326, + "grad_norm": 0.21766528487205505, + "learning_rate": 7.663059826391932e-05, + "loss": 0.0189, + "step": 21310 + }, + { + "epoch": 12.409778812572759, + "grad_norm": 0.16225820779800415, + "learning_rate": 7.660727038730981e-05, + "loss": 0.0149, + "step": 21320 + }, + { + "epoch": 12.415599534342258, + "grad_norm": 0.17872437834739685, + "learning_rate": 7.65839344281141e-05, + "loss": 0.0164, + "step": 21330 + }, + { + "epoch": 12.421420256111757, + "grad_norm": 0.16732190549373627, + "learning_rate": 7.656059039342101e-05, + "loss": 0.013, + "step": 21340 + }, + { + "epoch": 12.427240977881258, + "grad_norm": 0.19207806885242462, + "learning_rate": 7.653723829032187e-05, + "loss": 0.0168, + "step": 21350 + }, + { + "epoch": 12.433061699650757, + "grad_norm": 0.25531241297721863, + "learning_rate": 7.65138781259104e-05, + "loss": 0.0214, + "step": 21360 + }, + { + "epoch": 12.438882421420256, + "grad_norm": 0.2925136983394623, + "learning_rate": 7.649050990728279e-05, + "loss": 0.0133, + "step": 21370 + }, + { + "epoch": 12.444703143189756, + "grad_norm": 0.2626064419746399, + "learning_rate": 7.646713364153774e-05, + "loss": 0.0184, + "step": 21380 + }, + { + "epoch": 12.450523864959255, + "grad_norm": 0.25359976291656494, + "learning_rate": 7.64437493357763e-05, + "loss": 0.016, + "step": 21390 + }, + { + "epoch": 12.456344586728754, + "grad_norm": 0.14786125719547272, + "learning_rate": 7.642035699710202e-05, + "loss": 0.016, + "step": 21400 + }, + { + "epoch": 12.462165308498253, + "grad_norm": 0.199729785323143, + "learning_rate": 7.639695663262089e-05, + "loss": 0.0135, + "step": 21410 + }, + { + "epoch": 12.467986030267753, + "grad_norm": 0.21709366142749786, + "learning_rate": 7.637354824944128e-05, + "loss": 0.0259, + "step": 21420 + }, + { + "epoch": 12.473806752037252, + "grad_norm": 0.206264927983284, + "learning_rate": 7.635013185467408e-05, + "loss": 0.0146, + "step": 21430 + }, + { + "epoch": 12.479627473806753, + "grad_norm": 0.21952103078365326, + "learning_rate": 7.632670745543256e-05, + "loss": 0.0147, + "step": 21440 + }, + { + "epoch": 12.485448195576252, + "grad_norm": 0.20218923687934875, + "learning_rate": 7.630327505883242e-05, + "loss": 0.0162, + "step": 21450 + }, + { + "epoch": 12.491268917345751, + "grad_norm": 0.1833810955286026, + "learning_rate": 7.627983467199182e-05, + "loss": 0.0138, + "step": 21460 + }, + { + "epoch": 12.49708963911525, + "grad_norm": 0.16628196835517883, + "learning_rate": 7.625638630203132e-05, + "loss": 0.0184, + "step": 21470 + }, + { + "epoch": 12.50291036088475, + "grad_norm": 0.21002434194087982, + "learning_rate": 7.623292995607394e-05, + "loss": 0.0154, + "step": 21480 + }, + { + "epoch": 12.508731082654249, + "grad_norm": 0.3238052725791931, + "learning_rate": 7.620946564124507e-05, + "loss": 0.0176, + "step": 21490 + }, + { + "epoch": 12.514551804423748, + "grad_norm": 0.1995585858821869, + "learning_rate": 7.618599336467256e-05, + "loss": 0.0151, + "step": 21500 + }, + { + "epoch": 12.520372526193247, + "grad_norm": 0.20417387783527374, + "learning_rate": 7.616251313348666e-05, + "loss": 0.0152, + "step": 21510 + }, + { + "epoch": 12.526193247962748, + "grad_norm": 0.2690769135951996, + "learning_rate": 7.613902495482005e-05, + "loss": 0.0189, + "step": 21520 + }, + { + "epoch": 12.532013969732247, + "grad_norm": 0.19695527851581573, + "learning_rate": 7.611552883580784e-05, + "loss": 0.0147, + "step": 21530 + }, + { + "epoch": 12.537834691501747, + "grad_norm": 0.18423610925674438, + "learning_rate": 7.609202478358748e-05, + "loss": 0.017, + "step": 21540 + }, + { + "epoch": 12.543655413271246, + "grad_norm": 0.1967875063419342, + "learning_rate": 7.606851280529895e-05, + "loss": 0.016, + "step": 21550 + }, + { + "epoch": 12.549476135040745, + "grad_norm": 0.14806769788265228, + "learning_rate": 7.604499290808449e-05, + "loss": 0.0156, + "step": 21560 + }, + { + "epoch": 12.555296856810244, + "grad_norm": 0.15965336561203003, + "learning_rate": 7.602146509908888e-05, + "loss": 0.0147, + "step": 21570 + }, + { + "epoch": 12.561117578579744, + "grad_norm": 0.21828746795654297, + "learning_rate": 7.599792938545921e-05, + "loss": 0.0213, + "step": 21580 + }, + { + "epoch": 12.566938300349243, + "grad_norm": 0.17718054354190826, + "learning_rate": 7.597438577434506e-05, + "loss": 0.0164, + "step": 21590 + }, + { + "epoch": 12.572759022118742, + "grad_norm": 0.19092483818531036, + "learning_rate": 7.595083427289831e-05, + "loss": 0.0134, + "step": 21600 + }, + { + "epoch": 12.578579743888243, + "grad_norm": 0.19901928305625916, + "learning_rate": 7.59272748882733e-05, + "loss": 0.0127, + "step": 21610 + }, + { + "epoch": 12.584400465657742, + "grad_norm": 0.17498882114887238, + "learning_rate": 7.590370762762675e-05, + "loss": 0.015, + "step": 21620 + }, + { + "epoch": 12.590221187427241, + "grad_norm": 0.2826768457889557, + "learning_rate": 7.588013249811777e-05, + "loss": 0.0204, + "step": 21630 + }, + { + "epoch": 12.59604190919674, + "grad_norm": 0.10271266102790833, + "learning_rate": 7.585654950690786e-05, + "loss": 0.0143, + "step": 21640 + }, + { + "epoch": 12.60186263096624, + "grad_norm": 0.2332507222890854, + "learning_rate": 7.583295866116091e-05, + "loss": 0.0173, + "step": 21650 + }, + { + "epoch": 12.607683352735739, + "grad_norm": 0.19887591898441315, + "learning_rate": 7.580935996804321e-05, + "loss": 0.0184, + "step": 21660 + }, + { + "epoch": 12.613504074505238, + "grad_norm": 0.2634870409965515, + "learning_rate": 7.57857534347234e-05, + "loss": 0.0177, + "step": 21670 + }, + { + "epoch": 12.619324796274737, + "grad_norm": 0.18823125958442688, + "learning_rate": 7.576213906837254e-05, + "loss": 0.0153, + "step": 21680 + }, + { + "epoch": 12.625145518044237, + "grad_norm": 0.17495368421077728, + "learning_rate": 7.573851687616403e-05, + "loss": 0.0174, + "step": 21690 + }, + { + "epoch": 12.630966239813738, + "grad_norm": 0.24874170124530792, + "learning_rate": 7.571488686527368e-05, + "loss": 0.0138, + "step": 21700 + }, + { + "epoch": 12.636786961583237, + "grad_norm": 0.17474348843097687, + "learning_rate": 7.569124904287968e-05, + "loss": 0.0128, + "step": 21710 + }, + { + "epoch": 12.642607683352736, + "grad_norm": 0.16583983600139618, + "learning_rate": 7.566760341616254e-05, + "loss": 0.0166, + "step": 21720 + }, + { + "epoch": 12.648428405122235, + "grad_norm": 0.3067742884159088, + "learning_rate": 7.564394999230519e-05, + "loss": 0.0166, + "step": 21730 + }, + { + "epoch": 12.654249126891735, + "grad_norm": 0.18090088665485382, + "learning_rate": 7.562028877849294e-05, + "loss": 0.012, + "step": 21740 + }, + { + "epoch": 12.660069848661234, + "grad_norm": 0.2981374263763428, + "learning_rate": 7.559661978191341e-05, + "loss": 0.0158, + "step": 21750 + }, + { + "epoch": 12.665890570430733, + "grad_norm": 0.21285536885261536, + "learning_rate": 7.557294300975664e-05, + "loss": 0.0132, + "step": 21760 + }, + { + "epoch": 12.671711292200232, + "grad_norm": 0.15219783782958984, + "learning_rate": 7.554925846921499e-05, + "loss": 0.0171, + "step": 21770 + }, + { + "epoch": 12.677532013969731, + "grad_norm": 0.24557222425937653, + "learning_rate": 7.552556616748321e-05, + "loss": 0.0137, + "step": 21780 + }, + { + "epoch": 12.683352735739232, + "grad_norm": 0.23756413161754608, + "learning_rate": 7.550186611175838e-05, + "loss": 0.0209, + "step": 21790 + }, + { + "epoch": 12.689173457508732, + "grad_norm": 0.19859588146209717, + "learning_rate": 7.547815830923998e-05, + "loss": 0.0119, + "step": 21800 + }, + { + "epoch": 12.69499417927823, + "grad_norm": 0.2925613224506378, + "learning_rate": 7.54544427671298e-05, + "loss": 0.0165, + "step": 21810 + }, + { + "epoch": 12.70081490104773, + "grad_norm": 0.18354704976081848, + "learning_rate": 7.543071949263198e-05, + "loss": 0.0142, + "step": 21820 + }, + { + "epoch": 12.70663562281723, + "grad_norm": 0.1711689531803131, + "learning_rate": 7.540698849295305e-05, + "loss": 0.0113, + "step": 21830 + }, + { + "epoch": 12.712456344586728, + "grad_norm": 0.20088250935077667, + "learning_rate": 7.538324977530183e-05, + "loss": 0.0137, + "step": 21840 + }, + { + "epoch": 12.718277066356228, + "grad_norm": 0.22169822454452515, + "learning_rate": 7.535950334688955e-05, + "loss": 0.019, + "step": 21850 + }, + { + "epoch": 12.724097788125727, + "grad_norm": 0.2210608869791031, + "learning_rate": 7.533574921492972e-05, + "loss": 0.0146, + "step": 21860 + }, + { + "epoch": 12.729918509895228, + "grad_norm": 0.18488039076328278, + "learning_rate": 7.531198738663824e-05, + "loss": 0.012, + "step": 21870 + }, + { + "epoch": 12.735739231664727, + "grad_norm": 0.202838733792305, + "learning_rate": 7.528821786923333e-05, + "loss": 0.0162, + "step": 21880 + }, + { + "epoch": 12.741559953434226, + "grad_norm": 0.27206557989120483, + "learning_rate": 7.52644406699355e-05, + "loss": 0.0191, + "step": 21890 + }, + { + "epoch": 12.747380675203726, + "grad_norm": 0.17029070854187012, + "learning_rate": 7.524065579596766e-05, + "loss": 0.0123, + "step": 21900 + }, + { + "epoch": 12.753201396973225, + "grad_norm": 0.18637794256210327, + "learning_rate": 7.521686325455506e-05, + "loss": 0.0115, + "step": 21910 + }, + { + "epoch": 12.759022118742724, + "grad_norm": 0.21629586815834045, + "learning_rate": 7.51930630529252e-05, + "loss": 0.016, + "step": 21920 + }, + { + "epoch": 12.764842840512223, + "grad_norm": 0.22350120544433594, + "learning_rate": 7.516925519830797e-05, + "loss": 0.0157, + "step": 21930 + }, + { + "epoch": 12.770663562281722, + "grad_norm": 0.19447210431098938, + "learning_rate": 7.514543969793557e-05, + "loss": 0.0165, + "step": 21940 + }, + { + "epoch": 12.776484284051222, + "grad_norm": 0.24149774014949799, + "learning_rate": 7.512161655904251e-05, + "loss": 0.0174, + "step": 21950 + }, + { + "epoch": 12.782305005820723, + "grad_norm": 0.2917745113372803, + "learning_rate": 7.509778578886563e-05, + "loss": 0.016, + "step": 21960 + }, + { + "epoch": 12.788125727590222, + "grad_norm": 0.18218223750591278, + "learning_rate": 7.507394739464412e-05, + "loss": 0.0151, + "step": 21970 + }, + { + "epoch": 12.793946449359721, + "grad_norm": 0.19260862469673157, + "learning_rate": 7.50501013836194e-05, + "loss": 0.0194, + "step": 21980 + }, + { + "epoch": 12.79976717112922, + "grad_norm": 0.17869427800178528, + "learning_rate": 7.50262477630353e-05, + "loss": 0.0179, + "step": 21990 + }, + { + "epoch": 12.80558789289872, + "grad_norm": 0.1955806165933609, + "learning_rate": 7.500238654013794e-05, + "loss": 0.0195, + "step": 22000 + }, + { + "epoch": 12.811408614668219, + "grad_norm": 0.2427777200937271, + "learning_rate": 7.497851772217566e-05, + "loss": 0.0143, + "step": 22010 + }, + { + "epoch": 12.817229336437718, + "grad_norm": 0.21133267879486084, + "learning_rate": 7.495464131639924e-05, + "loss": 0.0125, + "step": 22020 + }, + { + "epoch": 12.823050058207217, + "grad_norm": 0.22028416395187378, + "learning_rate": 7.493075733006166e-05, + "loss": 0.0144, + "step": 22030 + }, + { + "epoch": 12.828870779976716, + "grad_norm": 0.2000439316034317, + "learning_rate": 7.490686577041828e-05, + "loss": 0.0169, + "step": 22040 + }, + { + "epoch": 12.834691501746217, + "grad_norm": 0.1979343742132187, + "learning_rate": 7.488296664472668e-05, + "loss": 0.019, + "step": 22050 + }, + { + "epoch": 12.840512223515717, + "grad_norm": 0.15577925741672516, + "learning_rate": 7.485905996024682e-05, + "loss": 0.0146, + "step": 22060 + }, + { + "epoch": 12.846332945285216, + "grad_norm": 0.29534536600112915, + "learning_rate": 7.483514572424093e-05, + "loss": 0.0123, + "step": 22070 + }, + { + "epoch": 12.852153667054715, + "grad_norm": 0.1376355141401291, + "learning_rate": 7.481122394397349e-05, + "loss": 0.013, + "step": 22080 + }, + { + "epoch": 12.857974388824214, + "grad_norm": 0.17599479854106903, + "learning_rate": 7.478729462671131e-05, + "loss": 0.0121, + "step": 22090 + }, + { + "epoch": 12.863795110593713, + "grad_norm": 0.4282821714878082, + "learning_rate": 7.47633577797235e-05, + "loss": 0.0203, + "step": 22100 + }, + { + "epoch": 12.869615832363213, + "grad_norm": 0.22186964750289917, + "learning_rate": 7.473941341028144e-05, + "loss": 0.0131, + "step": 22110 + }, + { + "epoch": 12.875436554132712, + "grad_norm": 0.26041486859321594, + "learning_rate": 7.471546152565879e-05, + "loss": 0.0127, + "step": 22120 + }, + { + "epoch": 12.881257275902211, + "grad_norm": 0.14944392442703247, + "learning_rate": 7.46915021331315e-05, + "loss": 0.0203, + "step": 22130 + }, + { + "epoch": 12.887077997671712, + "grad_norm": 0.17182980477809906, + "learning_rate": 7.466753523997778e-05, + "loss": 0.016, + "step": 22140 + }, + { + "epoch": 12.892898719441211, + "grad_norm": 0.25048938393592834, + "learning_rate": 7.464356085347819e-05, + "loss": 0.0173, + "step": 22150 + }, + { + "epoch": 12.89871944121071, + "grad_norm": 0.1884486973285675, + "learning_rate": 7.461957898091548e-05, + "loss": 0.0168, + "step": 22160 + }, + { + "epoch": 12.90454016298021, + "grad_norm": 0.16889619827270508, + "learning_rate": 7.459558962957473e-05, + "loss": 0.0152, + "step": 22170 + }, + { + "epoch": 12.910360884749709, + "grad_norm": 0.23374371230602264, + "learning_rate": 7.457159280674326e-05, + "loss": 0.015, + "step": 22180 + }, + { + "epoch": 12.916181606519208, + "grad_norm": 0.20487189292907715, + "learning_rate": 7.454758851971066e-05, + "loss": 0.0137, + "step": 22190 + }, + { + "epoch": 12.922002328288707, + "grad_norm": 0.23057334125041962, + "learning_rate": 7.45235767757688e-05, + "loss": 0.0167, + "step": 22200 + }, + { + "epoch": 12.927823050058207, + "grad_norm": 0.16939254105091095, + "learning_rate": 7.449955758221183e-05, + "loss": 0.0234, + "step": 22210 + }, + { + "epoch": 12.933643771827708, + "grad_norm": 0.12457867711782455, + "learning_rate": 7.447553094633615e-05, + "loss": 0.0156, + "step": 22220 + }, + { + "epoch": 12.939464493597207, + "grad_norm": 0.21835145354270935, + "learning_rate": 7.445149687544039e-05, + "loss": 0.0154, + "step": 22230 + }, + { + "epoch": 12.945285215366706, + "grad_norm": 0.33623284101486206, + "learning_rate": 7.44274553768255e-05, + "loss": 0.016, + "step": 22240 + }, + { + "epoch": 12.951105937136205, + "grad_norm": 0.16882464289665222, + "learning_rate": 7.440340645779464e-05, + "loss": 0.0142, + "step": 22250 + }, + { + "epoch": 12.956926658905704, + "grad_norm": 0.12000371515750885, + "learning_rate": 7.437935012565322e-05, + "loss": 0.0162, + "step": 22260 + }, + { + "epoch": 12.962747380675204, + "grad_norm": 0.2209947258234024, + "learning_rate": 7.435528638770893e-05, + "loss": 0.0187, + "step": 22270 + }, + { + "epoch": 12.968568102444703, + "grad_norm": 0.18295638263225555, + "learning_rate": 7.433121525127171e-05, + "loss": 0.0195, + "step": 22280 + }, + { + "epoch": 12.974388824214202, + "grad_norm": 0.2151643931865692, + "learning_rate": 7.430713672365371e-05, + "loss": 0.0296, + "step": 22290 + }, + { + "epoch": 12.980209545983701, + "grad_norm": 0.18202900886535645, + "learning_rate": 7.428305081216938e-05, + "loss": 0.0155, + "step": 22300 + }, + { + "epoch": 12.9860302677532, + "grad_norm": 0.22288943827152252, + "learning_rate": 7.425895752413536e-05, + "loss": 0.0153, + "step": 22310 + }, + { + "epoch": 12.991850989522701, + "grad_norm": 0.1590266227722168, + "learning_rate": 7.423485686687057e-05, + "loss": 0.0191, + "step": 22320 + }, + { + "epoch": 12.9976717112922, + "grad_norm": 0.18743152916431427, + "learning_rate": 7.421074884769616e-05, + "loss": 0.0142, + "step": 22330 + }, + { + "epoch": 13.0034924330617, + "grad_norm": 0.1148550733923912, + "learning_rate": 7.418663347393548e-05, + "loss": 0.018, + "step": 22340 + }, + { + "epoch": 13.009313154831199, + "grad_norm": 0.2585376501083374, + "learning_rate": 7.416251075291418e-05, + "loss": 0.0159, + "step": 22350 + }, + { + "epoch": 13.015133876600698, + "grad_norm": 0.16569727659225464, + "learning_rate": 7.413838069196007e-05, + "loss": 0.0117, + "step": 22360 + }, + { + "epoch": 13.020954598370198, + "grad_norm": 0.22135592997074127, + "learning_rate": 7.411424329840324e-05, + "loss": 0.0166, + "step": 22370 + }, + { + "epoch": 13.026775320139697, + "grad_norm": 0.23292632400989532, + "learning_rate": 7.409009857957601e-05, + "loss": 0.0232, + "step": 22380 + }, + { + "epoch": 13.032596041909196, + "grad_norm": 0.21639177203178406, + "learning_rate": 7.40659465428129e-05, + "loss": 0.019, + "step": 22390 + }, + { + "epoch": 13.038416763678697, + "grad_norm": 0.2085385024547577, + "learning_rate": 7.404178719545063e-05, + "loss": 0.0188, + "step": 22400 + }, + { + "epoch": 13.044237485448196, + "grad_norm": 0.28173694014549255, + "learning_rate": 7.401762054482822e-05, + "loss": 0.0234, + "step": 22410 + }, + { + "epoch": 13.050058207217695, + "grad_norm": 0.1746482402086258, + "learning_rate": 7.39934465982868e-05, + "loss": 0.0175, + "step": 22420 + }, + { + "epoch": 13.055878928987195, + "grad_norm": 0.3178461194038391, + "learning_rate": 7.396926536316984e-05, + "loss": 0.0145, + "step": 22430 + }, + { + "epoch": 13.061699650756694, + "grad_norm": 0.22266443073749542, + "learning_rate": 7.394507684682293e-05, + "loss": 0.0195, + "step": 22440 + }, + { + "epoch": 13.067520372526193, + "grad_norm": 0.21203652024269104, + "learning_rate": 7.392088105659393e-05, + "loss": 0.016, + "step": 22450 + }, + { + "epoch": 13.073341094295692, + "grad_norm": 0.2577235698699951, + "learning_rate": 7.389667799983284e-05, + "loss": 0.0193, + "step": 22460 + }, + { + "epoch": 13.079161816065191, + "grad_norm": 0.22858773171901703, + "learning_rate": 7.387246768389193e-05, + "loss": 0.0162, + "step": 22470 + }, + { + "epoch": 13.08498253783469, + "grad_norm": 0.2844577133655548, + "learning_rate": 7.384825011612563e-05, + "loss": 0.0165, + "step": 22480 + }, + { + "epoch": 13.090803259604192, + "grad_norm": 0.2528366446495056, + "learning_rate": 7.382402530389066e-05, + "loss": 0.0238, + "step": 22490 + }, + { + "epoch": 13.09662398137369, + "grad_norm": 0.17641180753707886, + "learning_rate": 7.379979325454582e-05, + "loss": 0.018, + "step": 22500 + }, + { + "epoch": 13.10244470314319, + "grad_norm": 0.23830117285251617, + "learning_rate": 7.37755539754522e-05, + "loss": 0.0155, + "step": 22510 + }, + { + "epoch": 13.10826542491269, + "grad_norm": 0.1284002959728241, + "learning_rate": 7.375130747397302e-05, + "loss": 0.0143, + "step": 22520 + }, + { + "epoch": 13.114086146682189, + "grad_norm": 0.15706345438957214, + "learning_rate": 7.372705375747377e-05, + "loss": 0.0152, + "step": 22530 + }, + { + "epoch": 13.119906868451688, + "grad_norm": 0.2154955267906189, + "learning_rate": 7.370279283332205e-05, + "loss": 0.0153, + "step": 22540 + }, + { + "epoch": 13.125727590221187, + "grad_norm": 0.18787477910518646, + "learning_rate": 7.36785247088877e-05, + "loss": 0.0155, + "step": 22550 + }, + { + "epoch": 13.131548311990686, + "grad_norm": 0.1785164177417755, + "learning_rate": 7.365424939154275e-05, + "loss": 0.0163, + "step": 22560 + }, + { + "epoch": 13.137369033760187, + "grad_norm": 0.14930205047130585, + "learning_rate": 7.362996688866138e-05, + "loss": 0.0145, + "step": 22570 + }, + { + "epoch": 13.143189755529686, + "grad_norm": 0.23196038603782654, + "learning_rate": 7.360567720761999e-05, + "loss": 0.0165, + "step": 22580 + }, + { + "epoch": 13.149010477299186, + "grad_norm": 0.28238844871520996, + "learning_rate": 7.358138035579711e-05, + "loss": 0.0178, + "step": 22590 + }, + { + "epoch": 13.154831199068685, + "grad_norm": 0.17969916760921478, + "learning_rate": 7.355707634057354e-05, + "loss": 0.0185, + "step": 22600 + }, + { + "epoch": 13.160651920838184, + "grad_norm": 0.17774102091789246, + "learning_rate": 7.353276516933215e-05, + "loss": 0.0117, + "step": 22610 + }, + { + "epoch": 13.166472642607683, + "grad_norm": 0.15383021533489227, + "learning_rate": 7.350844684945806e-05, + "loss": 0.0187, + "step": 22620 + }, + { + "epoch": 13.172293364377182, + "grad_norm": 0.1783594787120819, + "learning_rate": 7.348412138833851e-05, + "loss": 0.0153, + "step": 22630 + }, + { + "epoch": 13.178114086146682, + "grad_norm": 0.17529648542404175, + "learning_rate": 7.345978879336295e-05, + "loss": 0.0146, + "step": 22640 + }, + { + "epoch": 13.18393480791618, + "grad_norm": 0.1430836319923401, + "learning_rate": 7.343544907192296e-05, + "loss": 0.0189, + "step": 22650 + }, + { + "epoch": 13.189755529685682, + "grad_norm": 0.1528661549091339, + "learning_rate": 7.341110223141235e-05, + "loss": 0.0175, + "step": 22660 + }, + { + "epoch": 13.195576251455181, + "grad_norm": 0.290580153465271, + "learning_rate": 7.3386748279227e-05, + "loss": 0.0167, + "step": 22670 + }, + { + "epoch": 13.20139697322468, + "grad_norm": 0.2357521802186966, + "learning_rate": 7.336238722276501e-05, + "loss": 0.0206, + "step": 22680 + }, + { + "epoch": 13.20721769499418, + "grad_norm": 0.24362829327583313, + "learning_rate": 7.333801906942663e-05, + "loss": 0.0131, + "step": 22690 + }, + { + "epoch": 13.213038416763679, + "grad_norm": 0.14461453258991241, + "learning_rate": 7.331364382661428e-05, + "loss": 0.0173, + "step": 22700 + }, + { + "epoch": 13.218859138533178, + "grad_norm": 0.17056497931480408, + "learning_rate": 7.328926150173248e-05, + "loss": 0.0138, + "step": 22710 + }, + { + "epoch": 13.224679860302677, + "grad_norm": 0.25442373752593994, + "learning_rate": 7.326487210218795e-05, + "loss": 0.013, + "step": 22720 + }, + { + "epoch": 13.230500582072176, + "grad_norm": 0.2352045625448227, + "learning_rate": 7.324047563538955e-05, + "loss": 0.0235, + "step": 22730 + }, + { + "epoch": 13.236321303841676, + "grad_norm": 0.14290817081928253, + "learning_rate": 7.321607210874828e-05, + "loss": 0.0164, + "step": 22740 + }, + { + "epoch": 13.242142025611177, + "grad_norm": 0.23170119524002075, + "learning_rate": 7.31916615296773e-05, + "loss": 0.0139, + "step": 22750 + }, + { + "epoch": 13.247962747380676, + "grad_norm": 0.20255859196186066, + "learning_rate": 7.316724390559188e-05, + "loss": 0.014, + "step": 22760 + }, + { + "epoch": 13.253783469150175, + "grad_norm": 0.18499068915843964, + "learning_rate": 7.314281924390946e-05, + "loss": 0.0092, + "step": 22770 + }, + { + "epoch": 13.259604190919674, + "grad_norm": 0.17771655321121216, + "learning_rate": 7.311838755204959e-05, + "loss": 0.0159, + "step": 22780 + }, + { + "epoch": 13.265424912689173, + "grad_norm": 0.17983508110046387, + "learning_rate": 7.3093948837434e-05, + "loss": 0.0155, + "step": 22790 + }, + { + "epoch": 13.271245634458673, + "grad_norm": 0.2206009030342102, + "learning_rate": 7.306950310748651e-05, + "loss": 0.0113, + "step": 22800 + }, + { + "epoch": 13.277066356228172, + "grad_norm": 0.1836846023797989, + "learning_rate": 7.304505036963311e-05, + "loss": 0.018, + "step": 22810 + }, + { + "epoch": 13.282887077997671, + "grad_norm": 0.24322178959846497, + "learning_rate": 7.302059063130186e-05, + "loss": 0.0147, + "step": 22820 + }, + { + "epoch": 13.28870779976717, + "grad_norm": 0.1823319047689438, + "learning_rate": 7.2996123899923e-05, + "loss": 0.0124, + "step": 22830 + }, + { + "epoch": 13.294528521536671, + "grad_norm": 0.2618110179901123, + "learning_rate": 7.297165018292886e-05, + "loss": 0.0182, + "step": 22840 + }, + { + "epoch": 13.30034924330617, + "grad_norm": 0.1242389976978302, + "learning_rate": 7.294716948775396e-05, + "loss": 0.0135, + "step": 22850 + }, + { + "epoch": 13.30616996507567, + "grad_norm": 0.20854033529758453, + "learning_rate": 7.292268182183484e-05, + "loss": 0.0153, + "step": 22860 + }, + { + "epoch": 13.311990686845169, + "grad_norm": 0.2499571293592453, + "learning_rate": 7.28981871926102e-05, + "loss": 0.0112, + "step": 22870 + }, + { + "epoch": 13.317811408614668, + "grad_norm": 0.2093207985162735, + "learning_rate": 7.28736856075209e-05, + "loss": 0.0171, + "step": 22880 + }, + { + "epoch": 13.323632130384167, + "grad_norm": 0.2100481539964676, + "learning_rate": 7.284917707400985e-05, + "loss": 0.0115, + "step": 22890 + }, + { + "epoch": 13.329452852153667, + "grad_norm": 0.1627095490694046, + "learning_rate": 7.282466159952212e-05, + "loss": 0.0179, + "step": 22900 + }, + { + "epoch": 13.335273573923166, + "grad_norm": 0.140729621052742, + "learning_rate": 7.280013919150483e-05, + "loss": 0.0167, + "step": 22910 + }, + { + "epoch": 13.341094295692667, + "grad_norm": 0.1892407238483429, + "learning_rate": 7.277560985740728e-05, + "loss": 0.0251, + "step": 22920 + }, + { + "epoch": 13.346915017462166, + "grad_norm": 0.2081761658191681, + "learning_rate": 7.275107360468079e-05, + "loss": 0.0169, + "step": 22930 + }, + { + "epoch": 13.352735739231665, + "grad_norm": 0.16938753426074982, + "learning_rate": 7.272653044077885e-05, + "loss": 0.0149, + "step": 22940 + }, + { + "epoch": 13.358556461001164, + "grad_norm": 0.24273957312107086, + "learning_rate": 7.270198037315703e-05, + "loss": 0.0189, + "step": 22950 + }, + { + "epoch": 13.364377182770664, + "grad_norm": 0.18352364003658295, + "learning_rate": 7.267742340927297e-05, + "loss": 0.0129, + "step": 22960 + }, + { + "epoch": 13.370197904540163, + "grad_norm": 0.22175651788711548, + "learning_rate": 7.265285955658645e-05, + "loss": 0.0103, + "step": 22970 + }, + { + "epoch": 13.376018626309662, + "grad_norm": 0.13032619655132294, + "learning_rate": 7.26282888225593e-05, + "loss": 0.0138, + "step": 22980 + }, + { + "epoch": 13.381839348079161, + "grad_norm": 0.18065781891345978, + "learning_rate": 7.260371121465548e-05, + "loss": 0.0131, + "step": 22990 + }, + { + "epoch": 13.38766006984866, + "grad_norm": 0.28536346554756165, + "learning_rate": 7.2579126740341e-05, + "loss": 0.0197, + "step": 23000 + }, + { + "epoch": 13.39348079161816, + "grad_norm": 0.15923555195331573, + "learning_rate": 7.2554535407084e-05, + "loss": 0.0136, + "step": 23010 + }, + { + "epoch": 13.39930151338766, + "grad_norm": 0.1797960251569748, + "learning_rate": 7.252993722235464e-05, + "loss": 0.016, + "step": 23020 + }, + { + "epoch": 13.40512223515716, + "grad_norm": 0.22504176199436188, + "learning_rate": 7.250533219362523e-05, + "loss": 0.0123, + "step": 23030 + }, + { + "epoch": 13.41094295692666, + "grad_norm": 0.25316673517227173, + "learning_rate": 7.248072032837012e-05, + "loss": 0.0204, + "step": 23040 + }, + { + "epoch": 13.416763678696158, + "grad_norm": 0.16120973229408264, + "learning_rate": 7.245610163406575e-05, + "loss": 0.0168, + "step": 23050 + }, + { + "epoch": 13.422584400465658, + "grad_norm": 0.15970492362976074, + "learning_rate": 7.243147611819061e-05, + "loss": 0.0161, + "step": 23060 + }, + { + "epoch": 13.428405122235157, + "grad_norm": 0.21032443642616272, + "learning_rate": 7.240684378822531e-05, + "loss": 0.0147, + "step": 23070 + }, + { + "epoch": 13.434225844004656, + "grad_norm": 0.30467984080314636, + "learning_rate": 7.238220465165248e-05, + "loss": 0.014, + "step": 23080 + }, + { + "epoch": 13.440046565774155, + "grad_norm": 0.26586246490478516, + "learning_rate": 7.235755871595684e-05, + "loss": 0.0188, + "step": 23090 + }, + { + "epoch": 13.445867287543656, + "grad_norm": 0.19304537773132324, + "learning_rate": 7.233290598862517e-05, + "loss": 0.0167, + "step": 23100 + }, + { + "epoch": 13.451688009313155, + "grad_norm": 0.20898987352848053, + "learning_rate": 7.230824647714635e-05, + "loss": 0.0153, + "step": 23110 + }, + { + "epoch": 13.457508731082655, + "grad_norm": 0.21035461127758026, + "learning_rate": 7.228358018901124e-05, + "loss": 0.014, + "step": 23120 + }, + { + "epoch": 13.463329452852154, + "grad_norm": 0.3405665159225464, + "learning_rate": 7.225890713171286e-05, + "loss": 0.0172, + "step": 23130 + }, + { + "epoch": 13.469150174621653, + "grad_norm": 0.14437147974967957, + "learning_rate": 7.223422731274618e-05, + "loss": 0.0119, + "step": 23140 + }, + { + "epoch": 13.474970896391152, + "grad_norm": 0.2153075635433197, + "learning_rate": 7.220954073960832e-05, + "loss": 0.014, + "step": 23150 + }, + { + "epoch": 13.480791618160652, + "grad_norm": 0.25884929299354553, + "learning_rate": 7.218484741979838e-05, + "loss": 0.0146, + "step": 23160 + }, + { + "epoch": 13.48661233993015, + "grad_norm": 0.22311559319496155, + "learning_rate": 7.216014736081756e-05, + "loss": 0.0192, + "step": 23170 + }, + { + "epoch": 13.49243306169965, + "grad_norm": 0.1693362295627594, + "learning_rate": 7.213544057016906e-05, + "loss": 0.0135, + "step": 23180 + }, + { + "epoch": 13.498253783469151, + "grad_norm": 0.21571168303489685, + "learning_rate": 7.211072705535819e-05, + "loss": 0.0138, + "step": 23190 + }, + { + "epoch": 13.50407450523865, + "grad_norm": 0.19887971878051758, + "learning_rate": 7.208600682389224e-05, + "loss": 0.0132, + "step": 23200 + }, + { + "epoch": 13.50989522700815, + "grad_norm": 0.19537147879600525, + "learning_rate": 7.206127988328055e-05, + "loss": 0.0152, + "step": 23210 + }, + { + "epoch": 13.515715948777649, + "grad_norm": 0.204345703125, + "learning_rate": 7.203654624103453e-05, + "loss": 0.0122, + "step": 23220 + }, + { + "epoch": 13.521536670547148, + "grad_norm": 0.16384857892990112, + "learning_rate": 7.201180590466761e-05, + "loss": 0.0138, + "step": 23230 + }, + { + "epoch": 13.527357392316647, + "grad_norm": 0.17245934903621674, + "learning_rate": 7.198705888169523e-05, + "loss": 0.0142, + "step": 23240 + }, + { + "epoch": 13.533178114086146, + "grad_norm": 0.14471761882305145, + "learning_rate": 7.196230517963491e-05, + "loss": 0.0116, + "step": 23250 + }, + { + "epoch": 13.538998835855645, + "grad_norm": 0.1159023642539978, + "learning_rate": 7.193754480600615e-05, + "loss": 0.0119, + "step": 23260 + }, + { + "epoch": 13.544819557625146, + "grad_norm": 0.16465289890766144, + "learning_rate": 7.19127777683305e-05, + "loss": 0.0216, + "step": 23270 + }, + { + "epoch": 13.550640279394646, + "grad_norm": 0.11482014507055283, + "learning_rate": 7.188800407413156e-05, + "loss": 0.0092, + "step": 23280 + }, + { + "epoch": 13.556461001164145, + "grad_norm": 0.15525783598423004, + "learning_rate": 7.186322373093489e-05, + "loss": 0.0178, + "step": 23290 + }, + { + "epoch": 13.562281722933644, + "grad_norm": 0.19909757375717163, + "learning_rate": 7.18384367462681e-05, + "loss": 0.0119, + "step": 23300 + }, + { + "epoch": 13.568102444703143, + "grad_norm": 0.20281395316123962, + "learning_rate": 7.181364312766085e-05, + "loss": 0.0151, + "step": 23310 + }, + { + "epoch": 13.573923166472643, + "grad_norm": 0.2317584753036499, + "learning_rate": 7.178884288264477e-05, + "loss": 0.0103, + "step": 23320 + }, + { + "epoch": 13.579743888242142, + "grad_norm": 0.22861093282699585, + "learning_rate": 7.176403601875353e-05, + "loss": 0.0176, + "step": 23330 + }, + { + "epoch": 13.585564610011641, + "grad_norm": 0.18239377439022064, + "learning_rate": 7.173922254352279e-05, + "loss": 0.0148, + "step": 23340 + }, + { + "epoch": 13.59138533178114, + "grad_norm": 0.15021909773349762, + "learning_rate": 7.171440246449024e-05, + "loss": 0.0127, + "step": 23350 + }, + { + "epoch": 13.59720605355064, + "grad_norm": 0.15634369850158691, + "learning_rate": 7.168957578919555e-05, + "loss": 0.0148, + "step": 23360 + }, + { + "epoch": 13.60302677532014, + "grad_norm": 0.13219110667705536, + "learning_rate": 7.16647425251804e-05, + "loss": 0.0136, + "step": 23370 + }, + { + "epoch": 13.60884749708964, + "grad_norm": 0.13903973996639252, + "learning_rate": 7.163990267998852e-05, + "loss": 0.0144, + "step": 23380 + }, + { + "epoch": 13.614668218859139, + "grad_norm": 0.1653580218553543, + "learning_rate": 7.161505626116556e-05, + "loss": 0.0161, + "step": 23390 + }, + { + "epoch": 13.620488940628638, + "grad_norm": 0.20811183750629425, + "learning_rate": 7.159020327625923e-05, + "loss": 0.0132, + "step": 23400 + }, + { + "epoch": 13.626309662398137, + "grad_norm": 0.21914011240005493, + "learning_rate": 7.15653437328192e-05, + "loss": 0.0225, + "step": 23410 + }, + { + "epoch": 13.632130384167636, + "grad_norm": 0.1804657280445099, + "learning_rate": 7.154047763839713e-05, + "loss": 0.0135, + "step": 23420 + }, + { + "epoch": 13.637951105937136, + "grad_norm": 0.2554371953010559, + "learning_rate": 7.15156050005467e-05, + "loss": 0.014, + "step": 23430 + }, + { + "epoch": 13.643771827706635, + "grad_norm": 0.23306523263454437, + "learning_rate": 7.149072582682357e-05, + "loss": 0.0139, + "step": 23440 + }, + { + "epoch": 13.649592549476136, + "grad_norm": 0.2305123656988144, + "learning_rate": 7.146584012478535e-05, + "loss": 0.0122, + "step": 23450 + }, + { + "epoch": 13.655413271245635, + "grad_norm": 0.2390785813331604, + "learning_rate": 7.144094790199169e-05, + "loss": 0.0157, + "step": 23460 + }, + { + "epoch": 13.661233993015134, + "grad_norm": 0.23295018076896667, + "learning_rate": 7.141604916600415e-05, + "loss": 0.0146, + "step": 23470 + }, + { + "epoch": 13.667054714784634, + "grad_norm": 0.12449701130390167, + "learning_rate": 7.139114392438635e-05, + "loss": 0.0117, + "step": 23480 + }, + { + "epoch": 13.672875436554133, + "grad_norm": 0.10030125826597214, + "learning_rate": 7.136623218470382e-05, + "loss": 0.0135, + "step": 23490 + }, + { + "epoch": 13.678696158323632, + "grad_norm": 0.20952144265174866, + "learning_rate": 7.13413139545241e-05, + "loss": 0.0143, + "step": 23500 + }, + { + "epoch": 13.684516880093131, + "grad_norm": 0.16773578524589539, + "learning_rate": 7.131638924141668e-05, + "loss": 0.0149, + "step": 23510 + }, + { + "epoch": 13.69033760186263, + "grad_norm": 0.18625982105731964, + "learning_rate": 7.129145805295304e-05, + "loss": 0.0154, + "step": 23520 + }, + { + "epoch": 13.69615832363213, + "grad_norm": 0.16663098335266113, + "learning_rate": 7.126652039670661e-05, + "loss": 0.0098, + "step": 23530 + }, + { + "epoch": 13.70197904540163, + "grad_norm": 0.2846135199069977, + "learning_rate": 7.124157628025278e-05, + "loss": 0.013, + "step": 23540 + }, + { + "epoch": 13.70779976717113, + "grad_norm": 0.23023471236228943, + "learning_rate": 7.121662571116894e-05, + "loss": 0.0123, + "step": 23550 + }, + { + "epoch": 13.713620488940629, + "grad_norm": 0.18924590945243835, + "learning_rate": 7.119166869703441e-05, + "loss": 0.0108, + "step": 23560 + }, + { + "epoch": 13.719441210710128, + "grad_norm": 0.1678827852010727, + "learning_rate": 7.116670524543044e-05, + "loss": 0.0177, + "step": 23570 + }, + { + "epoch": 13.725261932479627, + "grad_norm": 0.19821271300315857, + "learning_rate": 7.114173536394032e-05, + "loss": 0.0125, + "step": 23580 + }, + { + "epoch": 13.731082654249127, + "grad_norm": 0.18671877682209015, + "learning_rate": 7.111675906014917e-05, + "loss": 0.0172, + "step": 23590 + }, + { + "epoch": 13.736903376018626, + "grad_norm": 0.16690348088741302, + "learning_rate": 7.109177634164421e-05, + "loss": 0.0111, + "step": 23600 + }, + { + "epoch": 13.742724097788125, + "grad_norm": 0.20922337472438812, + "learning_rate": 7.106678721601449e-05, + "loss": 0.0106, + "step": 23610 + }, + { + "epoch": 13.748544819557626, + "grad_norm": 0.14817316830158234, + "learning_rate": 7.104179169085103e-05, + "loss": 0.0133, + "step": 23620 + }, + { + "epoch": 13.754365541327125, + "grad_norm": 0.26883894205093384, + "learning_rate": 7.101678977374683e-05, + "loss": 0.0182, + "step": 23630 + }, + { + "epoch": 13.760186263096625, + "grad_norm": 0.16002535820007324, + "learning_rate": 7.099178147229685e-05, + "loss": 0.0138, + "step": 23640 + }, + { + "epoch": 13.766006984866124, + "grad_norm": 0.18179692327976227, + "learning_rate": 7.096676679409789e-05, + "loss": 0.0156, + "step": 23650 + }, + { + "epoch": 13.771827706635623, + "grad_norm": 0.21251296997070312, + "learning_rate": 7.094174574674877e-05, + "loss": 0.0135, + "step": 23660 + }, + { + "epoch": 13.777648428405122, + "grad_norm": 0.16152207553386688, + "learning_rate": 7.091671833785025e-05, + "loss": 0.0106, + "step": 23670 + }, + { + "epoch": 13.783469150174621, + "grad_norm": 0.22787030041217804, + "learning_rate": 7.089168457500493e-05, + "loss": 0.0144, + "step": 23680 + }, + { + "epoch": 13.78928987194412, + "grad_norm": 0.15555255115032196, + "learning_rate": 7.086664446581747e-05, + "loss": 0.0184, + "step": 23690 + }, + { + "epoch": 13.79511059371362, + "grad_norm": 0.1839178502559662, + "learning_rate": 7.084159801789438e-05, + "loss": 0.0136, + "step": 23700 + }, + { + "epoch": 13.800931315483119, + "grad_norm": 0.2050034999847412, + "learning_rate": 7.081654523884411e-05, + "loss": 0.0153, + "step": 23710 + }, + { + "epoch": 13.80675203725262, + "grad_norm": 0.2105141282081604, + "learning_rate": 7.0791486136277e-05, + "loss": 0.018, + "step": 23720 + }, + { + "epoch": 13.81257275902212, + "grad_norm": 0.19686727225780487, + "learning_rate": 7.07664207178054e-05, + "loss": 0.0189, + "step": 23730 + }, + { + "epoch": 13.818393480791618, + "grad_norm": 0.2840605080127716, + "learning_rate": 7.074134899104345e-05, + "loss": 0.0168, + "step": 23740 + }, + { + "epoch": 13.824214202561118, + "grad_norm": 0.19715330004692078, + "learning_rate": 7.071627096360735e-05, + "loss": 0.0129, + "step": 23750 + }, + { + "epoch": 13.830034924330617, + "grad_norm": 0.25669029355049133, + "learning_rate": 7.069118664311511e-05, + "loss": 0.0123, + "step": 23760 + }, + { + "epoch": 13.835855646100116, + "grad_norm": 0.1537303775548935, + "learning_rate": 7.06660960371867e-05, + "loss": 0.0107, + "step": 23770 + }, + { + "epoch": 13.841676367869615, + "grad_norm": 0.30991002917289734, + "learning_rate": 7.064099915344396e-05, + "loss": 0.0208, + "step": 23780 + }, + { + "epoch": 13.847497089639115, + "grad_norm": 0.26907503604888916, + "learning_rate": 7.061589599951066e-05, + "loss": 0.0161, + "step": 23790 + }, + { + "epoch": 13.853317811408616, + "grad_norm": 0.15052780508995056, + "learning_rate": 7.05907865830125e-05, + "loss": 0.0165, + "step": 23800 + }, + { + "epoch": 13.859138533178115, + "grad_norm": 0.1705310195684433, + "learning_rate": 7.056567091157703e-05, + "loss": 0.0154, + "step": 23810 + }, + { + "epoch": 13.864959254947614, + "grad_norm": 0.18790645897388458, + "learning_rate": 7.054054899283375e-05, + "loss": 0.0154, + "step": 23820 + }, + { + "epoch": 13.870779976717113, + "grad_norm": 0.21665486693382263, + "learning_rate": 7.051542083441403e-05, + "loss": 0.0149, + "step": 23830 + }, + { + "epoch": 13.876600698486612, + "grad_norm": 0.21012261509895325, + "learning_rate": 7.049028644395113e-05, + "loss": 0.0142, + "step": 23840 + }, + { + "epoch": 13.882421420256112, + "grad_norm": 0.1719137579202652, + "learning_rate": 7.046514582908024e-05, + "loss": 0.012, + "step": 23850 + }, + { + "epoch": 13.88824214202561, + "grad_norm": 0.2427322119474411, + "learning_rate": 7.043999899743838e-05, + "loss": 0.0143, + "step": 23860 + }, + { + "epoch": 13.89406286379511, + "grad_norm": 0.25594082474708557, + "learning_rate": 7.041484595666451e-05, + "loss": 0.018, + "step": 23870 + }, + { + "epoch": 13.89988358556461, + "grad_norm": 0.17163865268230438, + "learning_rate": 7.038968671439948e-05, + "loss": 0.0144, + "step": 23880 + }, + { + "epoch": 13.90570430733411, + "grad_norm": 0.20816820859909058, + "learning_rate": 7.036452127828596e-05, + "loss": 0.02, + "step": 23890 + }, + { + "epoch": 13.91152502910361, + "grad_norm": 0.2173209935426712, + "learning_rate": 7.033934965596859e-05, + "loss": 0.0111, + "step": 23900 + }, + { + "epoch": 13.917345750873109, + "grad_norm": 0.21912527084350586, + "learning_rate": 7.031417185509381e-05, + "loss": 0.0144, + "step": 23910 + }, + { + "epoch": 13.923166472642608, + "grad_norm": 0.18840588629245758, + "learning_rate": 7.028898788331e-05, + "loss": 0.0148, + "step": 23920 + }, + { + "epoch": 13.928987194412107, + "grad_norm": 0.13310721516609192, + "learning_rate": 7.026379774826736e-05, + "loss": 0.0216, + "step": 23930 + }, + { + "epoch": 13.934807916181606, + "grad_norm": 0.19194206595420837, + "learning_rate": 7.0238601457618e-05, + "loss": 0.0115, + "step": 23940 + }, + { + "epoch": 13.940628637951106, + "grad_norm": 0.11834936589002609, + "learning_rate": 7.02133990190159e-05, + "loss": 0.0133, + "step": 23950 + }, + { + "epoch": 13.946449359720605, + "grad_norm": 0.1964089572429657, + "learning_rate": 7.018819044011687e-05, + "loss": 0.014, + "step": 23960 + }, + { + "epoch": 13.952270081490104, + "grad_norm": 0.1793932318687439, + "learning_rate": 7.016297572857863e-05, + "loss": 0.0119, + "step": 23970 + }, + { + "epoch": 13.958090803259605, + "grad_norm": 0.1463908553123474, + "learning_rate": 7.013775489206072e-05, + "loss": 0.0132, + "step": 23980 + }, + { + "epoch": 13.963911525029104, + "grad_norm": 0.15929311513900757, + "learning_rate": 7.01125279382246e-05, + "loss": 0.0101, + "step": 23990 + }, + { + "epoch": 13.969732246798603, + "grad_norm": 0.19319561123847961, + "learning_rate": 7.008729487473351e-05, + "loss": 0.013, + "step": 24000 + }, + { + "epoch": 13.975552968568103, + "grad_norm": 0.357269287109375, + "learning_rate": 7.006205570925263e-05, + "loss": 0.0176, + "step": 24010 + }, + { + "epoch": 13.981373690337602, + "grad_norm": 0.22437237203121185, + "learning_rate": 7.003681044944892e-05, + "loss": 0.0162, + "step": 24020 + }, + { + "epoch": 13.987194412107101, + "grad_norm": 0.21716268360614777, + "learning_rate": 7.001155910299126e-05, + "loss": 0.0134, + "step": 24030 + }, + { + "epoch": 13.9930151338766, + "grad_norm": 0.2256990522146225, + "learning_rate": 6.99863016775503e-05, + "loss": 0.0187, + "step": 24040 + }, + { + "epoch": 13.9988358556461, + "grad_norm": 0.24405546486377716, + "learning_rate": 6.996103818079859e-05, + "loss": 0.0151, + "step": 24050 + }, + { + "epoch": 14.004656577415599, + "grad_norm": 0.28702035546302795, + "learning_rate": 6.993576862041054e-05, + "loss": 0.0161, + "step": 24060 + }, + { + "epoch": 14.0104772991851, + "grad_norm": 0.20380428433418274, + "learning_rate": 6.991049300406235e-05, + "loss": 0.0118, + "step": 24070 + }, + { + "epoch": 14.016298020954599, + "grad_norm": 0.17949816584587097, + "learning_rate": 6.988521133943209e-05, + "loss": 0.0132, + "step": 24080 + }, + { + "epoch": 14.022118742724098, + "grad_norm": 0.20369888842105865, + "learning_rate": 6.985992363419966e-05, + "loss": 0.0149, + "step": 24090 + }, + { + "epoch": 14.027939464493597, + "grad_norm": 0.28803664445877075, + "learning_rate": 6.983462989604682e-05, + "loss": 0.0158, + "step": 24100 + }, + { + "epoch": 14.033760186263097, + "grad_norm": 0.15637195110321045, + "learning_rate": 6.980933013265709e-05, + "loss": 0.0121, + "step": 24110 + }, + { + "epoch": 14.039580908032596, + "grad_norm": 0.167800635099411, + "learning_rate": 6.978402435171592e-05, + "loss": 0.0225, + "step": 24120 + }, + { + "epoch": 14.045401629802095, + "grad_norm": 0.23208411037921906, + "learning_rate": 6.975871256091052e-05, + "loss": 0.0156, + "step": 24130 + }, + { + "epoch": 14.051222351571594, + "grad_norm": 0.1949760764837265, + "learning_rate": 6.973339476792995e-05, + "loss": 0.0128, + "step": 24140 + }, + { + "epoch": 14.057043073341095, + "grad_norm": 0.18270331621170044, + "learning_rate": 6.970807098046505e-05, + "loss": 0.0125, + "step": 24150 + }, + { + "epoch": 14.062863795110594, + "grad_norm": 0.20098543167114258, + "learning_rate": 6.968274120620858e-05, + "loss": 0.0142, + "step": 24160 + }, + { + "epoch": 14.068684516880094, + "grad_norm": 0.18368713557720184, + "learning_rate": 6.965740545285499e-05, + "loss": 0.0098, + "step": 24170 + }, + { + "epoch": 14.074505238649593, + "grad_norm": 0.20599010586738586, + "learning_rate": 6.963206372810068e-05, + "loss": 0.0118, + "step": 24180 + }, + { + "epoch": 14.080325960419092, + "grad_norm": 0.2524663209915161, + "learning_rate": 6.960671603964375e-05, + "loss": 0.0199, + "step": 24190 + }, + { + "epoch": 14.086146682188591, + "grad_norm": 0.12636473774909973, + "learning_rate": 6.958136239518418e-05, + "loss": 0.0147, + "step": 24200 + }, + { + "epoch": 14.09196740395809, + "grad_norm": 0.17211678624153137, + "learning_rate": 6.955600280242371e-05, + "loss": 0.0136, + "step": 24210 + }, + { + "epoch": 14.09778812572759, + "grad_norm": 0.1815568506717682, + "learning_rate": 6.953063726906596e-05, + "loss": 0.0156, + "step": 24220 + }, + { + "epoch": 14.103608847497089, + "grad_norm": 0.2385401576757431, + "learning_rate": 6.950526580281626e-05, + "loss": 0.0148, + "step": 24230 + }, + { + "epoch": 14.10942956926659, + "grad_norm": 0.18535886704921722, + "learning_rate": 6.947988841138184e-05, + "loss": 0.0168, + "step": 24240 + }, + { + "epoch": 14.115250291036089, + "grad_norm": 0.1844160556793213, + "learning_rate": 6.945450510247165e-05, + "loss": 0.014, + "step": 24250 + }, + { + "epoch": 14.121071012805588, + "grad_norm": 0.12657812237739563, + "learning_rate": 6.942911588379647e-05, + "loss": 0.0143, + "step": 24260 + }, + { + "epoch": 14.126891734575088, + "grad_norm": 0.1203121468424797, + "learning_rate": 6.940372076306888e-05, + "loss": 0.0141, + "step": 24270 + }, + { + "epoch": 14.132712456344587, + "grad_norm": 0.14101149141788483, + "learning_rate": 6.937831974800326e-05, + "loss": 0.0141, + "step": 24280 + }, + { + "epoch": 14.138533178114086, + "grad_norm": 0.2540149986743927, + "learning_rate": 6.935291284631574e-05, + "loss": 0.0181, + "step": 24290 + }, + { + "epoch": 14.144353899883585, + "grad_norm": 0.20235799252986908, + "learning_rate": 6.932750006572428e-05, + "loss": 0.0114, + "step": 24300 + }, + { + "epoch": 14.150174621653084, + "grad_norm": 0.19734512269496918, + "learning_rate": 6.930208141394863e-05, + "loss": 0.0296, + "step": 24310 + }, + { + "epoch": 14.155995343422584, + "grad_norm": 0.1260039061307907, + "learning_rate": 6.927665689871026e-05, + "loss": 0.0127, + "step": 24320 + }, + { + "epoch": 14.161816065192085, + "grad_norm": 0.2541213631629944, + "learning_rate": 6.925122652773253e-05, + "loss": 0.0123, + "step": 24330 + }, + { + "epoch": 14.167636786961584, + "grad_norm": 0.14251276850700378, + "learning_rate": 6.922579030874046e-05, + "loss": 0.0118, + "step": 24340 + }, + { + "epoch": 14.173457508731083, + "grad_norm": 0.24980030953884125, + "learning_rate": 6.920034824946093e-05, + "loss": 0.0174, + "step": 24350 + }, + { + "epoch": 14.179278230500582, + "grad_norm": 0.10479041188955307, + "learning_rate": 6.917490035762255e-05, + "loss": 0.0154, + "step": 24360 + }, + { + "epoch": 14.185098952270081, + "grad_norm": 0.2216416597366333, + "learning_rate": 6.914944664095573e-05, + "loss": 0.0175, + "step": 24370 + }, + { + "epoch": 14.19091967403958, + "grad_norm": 0.23090343177318573, + "learning_rate": 6.912398710719264e-05, + "loss": 0.0146, + "step": 24380 + }, + { + "epoch": 14.19674039580908, + "grad_norm": 0.17897067964076996, + "learning_rate": 6.90985217640672e-05, + "loss": 0.0102, + "step": 24390 + }, + { + "epoch": 14.202561117578579, + "grad_norm": 0.2531725764274597, + "learning_rate": 6.90730506193151e-05, + "loss": 0.0132, + "step": 24400 + }, + { + "epoch": 14.208381839348078, + "grad_norm": 0.21103635430335999, + "learning_rate": 6.904757368067384e-05, + "loss": 0.0157, + "step": 24410 + }, + { + "epoch": 14.21420256111758, + "grad_norm": 0.18357965350151062, + "learning_rate": 6.90220909558826e-05, + "loss": 0.0206, + "step": 24420 + }, + { + "epoch": 14.220023282887079, + "grad_norm": 0.16601617634296417, + "learning_rate": 6.899660245268237e-05, + "loss": 0.0141, + "step": 24430 + }, + { + "epoch": 14.225844004656578, + "grad_norm": 0.1620277315378189, + "learning_rate": 6.897110817881592e-05, + "loss": 0.0164, + "step": 24440 + }, + { + "epoch": 14.231664726426077, + "grad_norm": 0.23933638632297516, + "learning_rate": 6.894560814202769e-05, + "loss": 0.017, + "step": 24450 + }, + { + "epoch": 14.237485448195576, + "grad_norm": 0.17678925395011902, + "learning_rate": 6.892010235006394e-05, + "loss": 0.0198, + "step": 24460 + }, + { + "epoch": 14.243306169965075, + "grad_norm": 0.22951652109622955, + "learning_rate": 6.889459081067264e-05, + "loss": 0.0111, + "step": 24470 + }, + { + "epoch": 14.249126891734575, + "grad_norm": 0.1456948071718216, + "learning_rate": 6.886907353160356e-05, + "loss": 0.0147, + "step": 24480 + }, + { + "epoch": 14.254947613504074, + "grad_norm": 0.22055423259735107, + "learning_rate": 6.884355052060814e-05, + "loss": 0.0153, + "step": 24490 + }, + { + "epoch": 14.260768335273575, + "grad_norm": 0.23159410059452057, + "learning_rate": 6.88180217854396e-05, + "loss": 0.0145, + "step": 24500 + }, + { + "epoch": 14.266589057043074, + "grad_norm": 0.2158675640821457, + "learning_rate": 6.87924873338529e-05, + "loss": 0.0178, + "step": 24510 + }, + { + "epoch": 14.272409778812573, + "grad_norm": 0.32293882966041565, + "learning_rate": 6.876694717360475e-05, + "loss": 0.0147, + "step": 24520 + }, + { + "epoch": 14.278230500582072, + "grad_norm": 0.19318826496601105, + "learning_rate": 6.874140131245355e-05, + "loss": 0.0155, + "step": 24530 + }, + { + "epoch": 14.284051222351572, + "grad_norm": 0.25592586398124695, + "learning_rate": 6.871584975815948e-05, + "loss": 0.0186, + "step": 24540 + }, + { + "epoch": 14.28987194412107, + "grad_norm": 0.192097008228302, + "learning_rate": 6.86902925184844e-05, + "loss": 0.0126, + "step": 24550 + }, + { + "epoch": 14.29569266589057, + "grad_norm": 0.16800256073474884, + "learning_rate": 6.866472960119195e-05, + "loss": 0.0169, + "step": 24560 + }, + { + "epoch": 14.30151338766007, + "grad_norm": 0.19023509323596954, + "learning_rate": 6.863916101404748e-05, + "loss": 0.0142, + "step": 24570 + }, + { + "epoch": 14.307334109429569, + "grad_norm": 0.1332455575466156, + "learning_rate": 6.8613586764818e-05, + "loss": 0.0125, + "step": 24580 + }, + { + "epoch": 14.31315483119907, + "grad_norm": 0.14466401934623718, + "learning_rate": 6.858800686127233e-05, + "loss": 0.0118, + "step": 24590 + }, + { + "epoch": 14.318975552968569, + "grad_norm": 0.13272367417812347, + "learning_rate": 6.856242131118097e-05, + "loss": 0.0119, + "step": 24600 + }, + { + "epoch": 14.324796274738068, + "grad_norm": 0.21290816366672516, + "learning_rate": 6.853683012231614e-05, + "loss": 0.0133, + "step": 24610 + }, + { + "epoch": 14.330616996507567, + "grad_norm": 0.4175182282924652, + "learning_rate": 6.851123330245173e-05, + "loss": 0.0134, + "step": 24620 + }, + { + "epoch": 14.336437718277066, + "grad_norm": 0.28452736139297485, + "learning_rate": 6.848563085936343e-05, + "loss": 0.0135, + "step": 24630 + }, + { + "epoch": 14.342258440046566, + "grad_norm": 0.1705893576145172, + "learning_rate": 6.846002280082853e-05, + "loss": 0.0136, + "step": 24640 + }, + { + "epoch": 14.348079161816065, + "grad_norm": 0.15046043694019318, + "learning_rate": 6.843440913462614e-05, + "loss": 0.0132, + "step": 24650 + }, + { + "epoch": 14.353899883585564, + "grad_norm": 0.2339194416999817, + "learning_rate": 6.840878986853698e-05, + "loss": 0.0149, + "step": 24660 + }, + { + "epoch": 14.359720605355063, + "grad_norm": 0.20603011548519135, + "learning_rate": 6.838316501034352e-05, + "loss": 0.0162, + "step": 24670 + }, + { + "epoch": 14.365541327124564, + "grad_norm": 0.10393299162387848, + "learning_rate": 6.83575345678299e-05, + "loss": 0.0104, + "step": 24680 + }, + { + "epoch": 14.371362048894063, + "grad_norm": 0.24818800389766693, + "learning_rate": 6.833189854878196e-05, + "loss": 0.0139, + "step": 24690 + }, + { + "epoch": 14.377182770663563, + "grad_norm": 0.20190605521202087, + "learning_rate": 6.83062569609873e-05, + "loss": 0.0151, + "step": 24700 + }, + { + "epoch": 14.383003492433062, + "grad_norm": 0.2408572882413864, + "learning_rate": 6.828060981223512e-05, + "loss": 0.0113, + "step": 24710 + }, + { + "epoch": 14.388824214202561, + "grad_norm": 0.16882649064064026, + "learning_rate": 6.825495711031634e-05, + "loss": 0.0139, + "step": 24720 + }, + { + "epoch": 14.39464493597206, + "grad_norm": 0.18472284078598022, + "learning_rate": 6.822929886302359e-05, + "loss": 0.0116, + "step": 24730 + }, + { + "epoch": 14.40046565774156, + "grad_norm": 0.1450139880180359, + "learning_rate": 6.820363507815116e-05, + "loss": 0.0096, + "step": 24740 + }, + { + "epoch": 14.406286379511059, + "grad_norm": 0.23131100833415985, + "learning_rate": 6.817796576349501e-05, + "loss": 0.017, + "step": 24750 + }, + { + "epoch": 14.412107101280558, + "grad_norm": 0.17442047595977783, + "learning_rate": 6.815229092685285e-05, + "loss": 0.0176, + "step": 24760 + }, + { + "epoch": 14.417927823050059, + "grad_norm": 0.23623603582382202, + "learning_rate": 6.812661057602399e-05, + "loss": 0.0121, + "step": 24770 + }, + { + "epoch": 14.423748544819558, + "grad_norm": 0.14861910045146942, + "learning_rate": 6.810092471880943e-05, + "loss": 0.0146, + "step": 24780 + }, + { + "epoch": 14.429569266589057, + "grad_norm": 0.2628679871559143, + "learning_rate": 6.807523336301187e-05, + "loss": 0.0152, + "step": 24790 + }, + { + "epoch": 14.435389988358557, + "grad_norm": 0.18435366451740265, + "learning_rate": 6.804953651643566e-05, + "loss": 0.0163, + "step": 24800 + }, + { + "epoch": 14.441210710128056, + "grad_norm": 0.19229818880558014, + "learning_rate": 6.802383418688685e-05, + "loss": 0.0154, + "step": 24810 + }, + { + "epoch": 14.447031431897555, + "grad_norm": 0.2672334313392639, + "learning_rate": 6.799812638217309e-05, + "loss": 0.0177, + "step": 24820 + }, + { + "epoch": 14.452852153667054, + "grad_norm": 0.25363436341285706, + "learning_rate": 6.797241311010373e-05, + "loss": 0.0153, + "step": 24830 + }, + { + "epoch": 14.458672875436553, + "grad_norm": 0.16506312787532806, + "learning_rate": 6.794669437848982e-05, + "loss": 0.0171, + "step": 24840 + }, + { + "epoch": 14.464493597206054, + "grad_norm": 0.1491541862487793, + "learning_rate": 6.792097019514402e-05, + "loss": 0.0147, + "step": 24850 + }, + { + "epoch": 14.470314318975554, + "grad_norm": 0.1791267842054367, + "learning_rate": 6.789524056788064e-05, + "loss": 0.0169, + "step": 24860 + }, + { + "epoch": 14.476135040745053, + "grad_norm": 0.23220151662826538, + "learning_rate": 6.786950550451567e-05, + "loss": 0.0135, + "step": 24870 + }, + { + "epoch": 14.481955762514552, + "grad_norm": 0.23185741901397705, + "learning_rate": 6.784376501286676e-05, + "loss": 0.015, + "step": 24880 + }, + { + "epoch": 14.487776484284051, + "grad_norm": 0.18555086851119995, + "learning_rate": 6.781801910075316e-05, + "loss": 0.0124, + "step": 24890 + }, + { + "epoch": 14.49359720605355, + "grad_norm": 0.19672635197639465, + "learning_rate": 6.779226777599581e-05, + "loss": 0.0134, + "step": 24900 + }, + { + "epoch": 14.49941792782305, + "grad_norm": 0.1777205467224121, + "learning_rate": 6.776651104641729e-05, + "loss": 0.0156, + "step": 24910 + }, + { + "epoch": 14.505238649592549, + "grad_norm": 0.29720011353492737, + "learning_rate": 6.774074891984183e-05, + "loss": 0.0162, + "step": 24920 + }, + { + "epoch": 14.511059371362048, + "grad_norm": 0.1570349782705307, + "learning_rate": 6.771498140409526e-05, + "loss": 0.0157, + "step": 24930 + }, + { + "epoch": 14.516880093131547, + "grad_norm": 0.18296067416667938, + "learning_rate": 6.768920850700506e-05, + "loss": 0.0116, + "step": 24940 + }, + { + "epoch": 14.522700814901048, + "grad_norm": 0.1484026163816452, + "learning_rate": 6.766343023640039e-05, + "loss": 0.0192, + "step": 24950 + }, + { + "epoch": 14.528521536670548, + "grad_norm": 0.1450258493423462, + "learning_rate": 6.763764660011198e-05, + "loss": 0.0106, + "step": 24960 + }, + { + "epoch": 14.534342258440047, + "grad_norm": 0.21367193758487701, + "learning_rate": 6.761185760597223e-05, + "loss": 0.0129, + "step": 24970 + }, + { + "epoch": 14.540162980209546, + "grad_norm": 0.2494056075811386, + "learning_rate": 6.758606326181515e-05, + "loss": 0.0138, + "step": 24980 + }, + { + "epoch": 14.545983701979045, + "grad_norm": 0.2342963069677353, + "learning_rate": 6.75602635754764e-05, + "loss": 0.011, + "step": 24990 + }, + { + "epoch": 14.551804423748544, + "grad_norm": 0.2302059531211853, + "learning_rate": 6.75344585547932e-05, + "loss": 0.0135, + "step": 25000 + }, + { + "epoch": 14.557625145518044, + "grad_norm": 0.19847702980041504, + "learning_rate": 6.750864820760449e-05, + "loss": 0.0117, + "step": 25010 + }, + { + "epoch": 14.563445867287543, + "grad_norm": 0.2508542835712433, + "learning_rate": 6.748283254175072e-05, + "loss": 0.014, + "step": 25020 + }, + { + "epoch": 14.569266589057044, + "grad_norm": 0.19862422347068787, + "learning_rate": 6.745701156507404e-05, + "loss": 0.0132, + "step": 25030 + }, + { + "epoch": 14.575087310826543, + "grad_norm": 0.11691225320100784, + "learning_rate": 6.743118528541818e-05, + "loss": 0.0106, + "step": 25040 + }, + { + "epoch": 14.580908032596042, + "grad_norm": 0.1745418906211853, + "learning_rate": 6.740535371062846e-05, + "loss": 0.0136, + "step": 25050 + }, + { + "epoch": 14.586728754365542, + "grad_norm": 0.1072767823934555, + "learning_rate": 6.737951684855185e-05, + "loss": 0.0106, + "step": 25060 + }, + { + "epoch": 14.59254947613504, + "grad_norm": 0.2251940667629242, + "learning_rate": 6.735367470703691e-05, + "loss": 0.0126, + "step": 25070 + }, + { + "epoch": 14.59837019790454, + "grad_norm": 0.28207623958587646, + "learning_rate": 6.732782729393379e-05, + "loss": 0.0129, + "step": 25080 + }, + { + "epoch": 14.60419091967404, + "grad_norm": 0.20610541105270386, + "learning_rate": 6.730197461709425e-05, + "loss": 0.0122, + "step": 25090 + }, + { + "epoch": 14.610011641443538, + "grad_norm": 0.17751923203468323, + "learning_rate": 6.727611668437164e-05, + "loss": 0.0163, + "step": 25100 + }, + { + "epoch": 14.615832363213038, + "grad_norm": 0.1513509750366211, + "learning_rate": 6.725025350362094e-05, + "loss": 0.0121, + "step": 25110 + }, + { + "epoch": 14.621653084982539, + "grad_norm": 0.21129904687404633, + "learning_rate": 6.72243850826987e-05, + "loss": 0.0105, + "step": 25120 + }, + { + "epoch": 14.627473806752038, + "grad_norm": 0.21819303929805756, + "learning_rate": 6.719851142946305e-05, + "loss": 0.0118, + "step": 25130 + }, + { + "epoch": 14.633294528521537, + "grad_norm": 0.31778040528297424, + "learning_rate": 6.717263255177372e-05, + "loss": 0.0188, + "step": 25140 + }, + { + "epoch": 14.639115250291036, + "grad_norm": 0.2425520271062851, + "learning_rate": 6.714674845749205e-05, + "loss": 0.0125, + "step": 25150 + }, + { + "epoch": 14.644935972060535, + "grad_norm": 0.18232205510139465, + "learning_rate": 6.712085915448092e-05, + "loss": 0.0148, + "step": 25160 + }, + { + "epoch": 14.650756693830035, + "grad_norm": 0.2300879955291748, + "learning_rate": 6.709496465060486e-05, + "loss": 0.0153, + "step": 25170 + }, + { + "epoch": 14.656577415599534, + "grad_norm": 0.18037347495555878, + "learning_rate": 6.706906495372987e-05, + "loss": 0.0117, + "step": 25180 + }, + { + "epoch": 14.662398137369033, + "grad_norm": 0.2354128062725067, + "learning_rate": 6.704316007172365e-05, + "loss": 0.0129, + "step": 25190 + }, + { + "epoch": 14.668218859138534, + "grad_norm": 0.22229431569576263, + "learning_rate": 6.701725001245539e-05, + "loss": 0.0159, + "step": 25200 + }, + { + "epoch": 14.674039580908033, + "grad_norm": 0.2473176121711731, + "learning_rate": 6.699133478379588e-05, + "loss": 0.0142, + "step": 25210 + }, + { + "epoch": 14.679860302677533, + "grad_norm": 0.16972248256206512, + "learning_rate": 6.69654143936175e-05, + "loss": 0.0143, + "step": 25220 + }, + { + "epoch": 14.685681024447032, + "grad_norm": 0.29028791189193726, + "learning_rate": 6.693948884979419e-05, + "loss": 0.017, + "step": 25230 + }, + { + "epoch": 14.691501746216531, + "grad_norm": 0.1630529910326004, + "learning_rate": 6.691355816020142e-05, + "loss": 0.0118, + "step": 25240 + }, + { + "epoch": 14.69732246798603, + "grad_norm": 0.1749436855316162, + "learning_rate": 6.688762233271624e-05, + "loss": 0.0129, + "step": 25250 + }, + { + "epoch": 14.70314318975553, + "grad_norm": 0.4127803146839142, + "learning_rate": 6.68616813752173e-05, + "loss": 0.017, + "step": 25260 + }, + { + "epoch": 14.708963911525029, + "grad_norm": 0.21900734305381775, + "learning_rate": 6.683573529558477e-05, + "loss": 0.0143, + "step": 25270 + }, + { + "epoch": 14.714784633294528, + "grad_norm": 0.2117701768875122, + "learning_rate": 6.680978410170037e-05, + "loss": 0.0147, + "step": 25280 + }, + { + "epoch": 14.720605355064027, + "grad_norm": 0.21537351608276367, + "learning_rate": 6.678382780144741e-05, + "loss": 0.0157, + "step": 25290 + }, + { + "epoch": 14.726426076833528, + "grad_norm": 0.16932621598243713, + "learning_rate": 6.675786640271071e-05, + "loss": 0.0119, + "step": 25300 + }, + { + "epoch": 14.732246798603027, + "grad_norm": 0.2304866760969162, + "learning_rate": 6.673189991337665e-05, + "loss": 0.0122, + "step": 25310 + }, + { + "epoch": 14.738067520372526, + "grad_norm": 0.17607827484607697, + "learning_rate": 6.670592834133317e-05, + "loss": 0.0158, + "step": 25320 + }, + { + "epoch": 14.743888242142026, + "grad_norm": 0.2388494908809662, + "learning_rate": 6.667995169446979e-05, + "loss": 0.0121, + "step": 25330 + }, + { + "epoch": 14.749708963911525, + "grad_norm": 0.18941913545131683, + "learning_rate": 6.665396998067747e-05, + "loss": 0.018, + "step": 25340 + }, + { + "epoch": 14.755529685681024, + "grad_norm": 0.2053556889295578, + "learning_rate": 6.66279832078488e-05, + "loss": 0.0107, + "step": 25350 + }, + { + "epoch": 14.761350407450523, + "grad_norm": 0.20384690165519714, + "learning_rate": 6.660199138387786e-05, + "loss": 0.011, + "step": 25360 + }, + { + "epoch": 14.767171129220023, + "grad_norm": 0.2154342234134674, + "learning_rate": 6.65759945166603e-05, + "loss": 0.0172, + "step": 25370 + }, + { + "epoch": 14.772991850989523, + "grad_norm": 0.21940499544143677, + "learning_rate": 6.654999261409326e-05, + "loss": 0.0178, + "step": 25380 + }, + { + "epoch": 14.778812572759023, + "grad_norm": 0.1765540987253189, + "learning_rate": 6.652398568407544e-05, + "loss": 0.0133, + "step": 25390 + }, + { + "epoch": 14.784633294528522, + "grad_norm": 0.14365650713443756, + "learning_rate": 6.649797373450707e-05, + "loss": 0.017, + "step": 25400 + }, + { + "epoch": 14.790454016298021, + "grad_norm": 0.12805937230587006, + "learning_rate": 6.647195677328988e-05, + "loss": 0.0104, + "step": 25410 + }, + { + "epoch": 14.79627473806752, + "grad_norm": 0.13377325236797333, + "learning_rate": 6.644593480832712e-05, + "loss": 0.0114, + "step": 25420 + }, + { + "epoch": 14.80209545983702, + "grad_norm": 0.24116843938827515, + "learning_rate": 6.641990784752363e-05, + "loss": 0.0141, + "step": 25430 + }, + { + "epoch": 14.807916181606519, + "grad_norm": 0.2865963280200958, + "learning_rate": 6.639387589878566e-05, + "loss": 0.0139, + "step": 25440 + }, + { + "epoch": 14.813736903376018, + "grad_norm": 0.23932583630084991, + "learning_rate": 6.636783897002103e-05, + "loss": 0.0115, + "step": 25450 + }, + { + "epoch": 14.819557625145517, + "grad_norm": 0.24844658374786377, + "learning_rate": 6.63417970691391e-05, + "loss": 0.0143, + "step": 25460 + }, + { + "epoch": 14.825378346915018, + "grad_norm": 0.2520641088485718, + "learning_rate": 6.63157502040507e-05, + "loss": 0.0112, + "step": 25470 + }, + { + "epoch": 14.831199068684517, + "grad_norm": 0.19985240697860718, + "learning_rate": 6.628969838266819e-05, + "loss": 0.0153, + "step": 25480 + }, + { + "epoch": 14.837019790454017, + "grad_norm": 0.227081298828125, + "learning_rate": 6.626364161290541e-05, + "loss": 0.0115, + "step": 25490 + }, + { + "epoch": 14.842840512223516, + "grad_norm": 0.22402918338775635, + "learning_rate": 6.623757990267774e-05, + "loss": 0.0131, + "step": 25500 + }, + { + "epoch": 14.848661233993015, + "grad_norm": 0.2229609340429306, + "learning_rate": 6.621151325990201e-05, + "loss": 0.0122, + "step": 25510 + }, + { + "epoch": 14.854481955762514, + "grad_norm": 0.20243531465530396, + "learning_rate": 6.618544169249657e-05, + "loss": 0.0132, + "step": 25520 + }, + { + "epoch": 14.860302677532014, + "grad_norm": 0.16061855852603912, + "learning_rate": 6.615936520838133e-05, + "loss": 0.0145, + "step": 25530 + }, + { + "epoch": 14.866123399301513, + "grad_norm": 0.10910730063915253, + "learning_rate": 6.613328381547759e-05, + "loss": 0.0088, + "step": 25540 + }, + { + "epoch": 14.871944121071014, + "grad_norm": 0.24693597853183746, + "learning_rate": 6.610719752170821e-05, + "loss": 0.0222, + "step": 25550 + }, + { + "epoch": 14.877764842840513, + "grad_norm": 0.19863669574260712, + "learning_rate": 6.60811063349975e-05, + "loss": 0.0147, + "step": 25560 + }, + { + "epoch": 14.883585564610012, + "grad_norm": 0.33154016733169556, + "learning_rate": 6.605501026327127e-05, + "loss": 0.0146, + "step": 25570 + }, + { + "epoch": 14.889406286379511, + "grad_norm": 0.15500755608081818, + "learning_rate": 6.602890931445685e-05, + "loss": 0.02, + "step": 25580 + }, + { + "epoch": 14.89522700814901, + "grad_norm": 0.24584659934043884, + "learning_rate": 6.6002803496483e-05, + "loss": 0.0175, + "step": 25590 + }, + { + "epoch": 14.90104772991851, + "grad_norm": 0.2414553314447403, + "learning_rate": 6.597669281727997e-05, + "loss": 0.0136, + "step": 25600 + }, + { + "epoch": 14.906868451688009, + "grad_norm": 0.21764640510082245, + "learning_rate": 6.595057728477949e-05, + "loss": 0.0176, + "step": 25610 + }, + { + "epoch": 14.912689173457508, + "grad_norm": 0.2662670910358429, + "learning_rate": 6.59244569069148e-05, + "loss": 0.0151, + "step": 25620 + }, + { + "epoch": 14.918509895227007, + "grad_norm": 0.23340216279029846, + "learning_rate": 6.589833169162054e-05, + "loss": 0.0132, + "step": 25630 + }, + { + "epoch": 14.924330616996507, + "grad_norm": 0.2706906795501709, + "learning_rate": 6.587220164683291e-05, + "loss": 0.0142, + "step": 25640 + }, + { + "epoch": 14.930151338766008, + "grad_norm": 0.18651694059371948, + "learning_rate": 6.58460667804895e-05, + "loss": 0.0127, + "step": 25650 + }, + { + "epoch": 14.935972060535507, + "grad_norm": 0.24755655229091644, + "learning_rate": 6.581992710052938e-05, + "loss": 0.0151, + "step": 25660 + }, + { + "epoch": 14.941792782305006, + "grad_norm": 0.1838797777891159, + "learning_rate": 6.579378261489311e-05, + "loss": 0.0142, + "step": 25670 + }, + { + "epoch": 14.947613504074505, + "grad_norm": 0.15728697180747986, + "learning_rate": 6.576763333152268e-05, + "loss": 0.0122, + "step": 25680 + }, + { + "epoch": 14.953434225844005, + "grad_norm": 0.1335478574037552, + "learning_rate": 6.574147925836159e-05, + "loss": 0.0105, + "step": 25690 + }, + { + "epoch": 14.959254947613504, + "grad_norm": 0.17509090900421143, + "learning_rate": 6.571532040335472e-05, + "loss": 0.0135, + "step": 25700 + }, + { + "epoch": 14.965075669383003, + "grad_norm": 0.14099764823913574, + "learning_rate": 6.568915677444845e-05, + "loss": 0.0123, + "step": 25710 + }, + { + "epoch": 14.970896391152502, + "grad_norm": 0.1909402757883072, + "learning_rate": 6.56629883795906e-05, + "loss": 0.0159, + "step": 25720 + }, + { + "epoch": 14.976717112922003, + "grad_norm": 0.181904137134552, + "learning_rate": 6.563681522673043e-05, + "loss": 0.013, + "step": 25730 + }, + { + "epoch": 14.982537834691502, + "grad_norm": 0.2921738922595978, + "learning_rate": 6.561063732381867e-05, + "loss": 0.0144, + "step": 25740 + }, + { + "epoch": 14.988358556461002, + "grad_norm": 0.18641705811023712, + "learning_rate": 6.558445467880745e-05, + "loss": 0.0125, + "step": 25750 + }, + { + "epoch": 14.9941792782305, + "grad_norm": 0.2848536968231201, + "learning_rate": 6.55582672996504e-05, + "loss": 0.016, + "step": 25760 + }, + { + "epoch": 15.0, + "grad_norm": 0.21063914895057678, + "learning_rate": 6.553207519430253e-05, + "loss": 0.011, + "step": 25770 + }, + { + "epoch": 15.0058207217695, + "grad_norm": 0.14188097417354584, + "learning_rate": 6.550587837072032e-05, + "loss": 0.0151, + "step": 25780 + }, + { + "epoch": 15.011641443538998, + "grad_norm": 0.14565196633338928, + "learning_rate": 6.547967683686166e-05, + "loss": 0.0143, + "step": 25790 + }, + { + "epoch": 15.017462165308498, + "grad_norm": 0.15533095598220825, + "learning_rate": 6.545347060068591e-05, + "loss": 0.0105, + "step": 25800 + }, + { + "epoch": 15.023282887077997, + "grad_norm": 0.23189647495746613, + "learning_rate": 6.542725967015382e-05, + "loss": 0.0144, + "step": 25810 + }, + { + "epoch": 15.029103608847498, + "grad_norm": 0.22099503874778748, + "learning_rate": 6.540104405322757e-05, + "loss": 0.012, + "step": 25820 + }, + { + "epoch": 15.034924330616997, + "grad_norm": 0.1972464621067047, + "learning_rate": 6.537482375787077e-05, + "loss": 0.0175, + "step": 25830 + }, + { + "epoch": 15.040745052386496, + "grad_norm": 0.25513172149658203, + "learning_rate": 6.534859879204845e-05, + "loss": 0.0152, + "step": 25840 + }, + { + "epoch": 15.046565774155995, + "grad_norm": 0.1636989414691925, + "learning_rate": 6.532236916372709e-05, + "loss": 0.0125, + "step": 25850 + }, + { + "epoch": 15.052386495925495, + "grad_norm": 0.27392610907554626, + "learning_rate": 6.529613488087454e-05, + "loss": 0.0149, + "step": 25860 + }, + { + "epoch": 15.058207217694994, + "grad_norm": 0.2285705804824829, + "learning_rate": 6.526989595146009e-05, + "loss": 0.0103, + "step": 25870 + }, + { + "epoch": 15.064027939464493, + "grad_norm": 0.12425664067268372, + "learning_rate": 6.524365238345441e-05, + "loss": 0.0113, + "step": 25880 + }, + { + "epoch": 15.069848661233992, + "grad_norm": 0.13942058384418488, + "learning_rate": 6.521740418482964e-05, + "loss": 0.0126, + "step": 25890 + }, + { + "epoch": 15.075669383003492, + "grad_norm": 0.16412800550460815, + "learning_rate": 6.519115136355925e-05, + "loss": 0.012, + "step": 25900 + }, + { + "epoch": 15.081490104772993, + "grad_norm": 0.22845710813999176, + "learning_rate": 6.51648939276182e-05, + "loss": 0.0142, + "step": 25910 + }, + { + "epoch": 15.087310826542492, + "grad_norm": 0.16960352659225464, + "learning_rate": 6.513863188498277e-05, + "loss": 0.0132, + "step": 25920 + }, + { + "epoch": 15.093131548311991, + "grad_norm": 0.1805526465177536, + "learning_rate": 6.511236524363068e-05, + "loss": 0.0123, + "step": 25930 + }, + { + "epoch": 15.09895227008149, + "grad_norm": 0.15365763008594513, + "learning_rate": 6.508609401154104e-05, + "loss": 0.0146, + "step": 25940 + }, + { + "epoch": 15.10477299185099, + "grad_norm": 0.24821354448795319, + "learning_rate": 6.505981819669439e-05, + "loss": 0.0135, + "step": 25950 + }, + { + "epoch": 15.110593713620489, + "grad_norm": 0.17398592829704285, + "learning_rate": 6.503353780707258e-05, + "loss": 0.0161, + "step": 25960 + }, + { + "epoch": 15.116414435389988, + "grad_norm": 0.18388713896274567, + "learning_rate": 6.500725285065895e-05, + "loss": 0.0141, + "step": 25970 + }, + { + "epoch": 15.122235157159487, + "grad_norm": 0.23989178240299225, + "learning_rate": 6.498096333543813e-05, + "loss": 0.016, + "step": 25980 + }, + { + "epoch": 15.128055878928988, + "grad_norm": 0.19256435334682465, + "learning_rate": 6.49546692693962e-05, + "loss": 0.0112, + "step": 25990 + }, + { + "epoch": 15.133876600698487, + "grad_norm": 0.25310906767845154, + "learning_rate": 6.492837066052059e-05, + "loss": 0.0181, + "step": 26000 + }, + { + "epoch": 15.139697322467986, + "grad_norm": 0.18151313066482544, + "learning_rate": 6.490206751680014e-05, + "loss": 0.0165, + "step": 26010 + }, + { + "epoch": 15.145518044237486, + "grad_norm": 0.19009387493133545, + "learning_rate": 6.487575984622505e-05, + "loss": 0.0155, + "step": 26020 + }, + { + "epoch": 15.151338766006985, + "grad_norm": 0.09663327038288116, + "learning_rate": 6.484944765678689e-05, + "loss": 0.0169, + "step": 26030 + }, + { + "epoch": 15.157159487776484, + "grad_norm": 0.22989587485790253, + "learning_rate": 6.482313095647861e-05, + "loss": 0.0119, + "step": 26040 + }, + { + "epoch": 15.162980209545983, + "grad_norm": 0.17849265038967133, + "learning_rate": 6.479680975329451e-05, + "loss": 0.0159, + "step": 26050 + }, + { + "epoch": 15.168800931315483, + "grad_norm": 0.26442229747772217, + "learning_rate": 6.477048405523031e-05, + "loss": 0.0119, + "step": 26060 + }, + { + "epoch": 15.174621653084982, + "grad_norm": 0.2303890585899353, + "learning_rate": 6.474415387028304e-05, + "loss": 0.0138, + "step": 26070 + }, + { + "epoch": 15.180442374854483, + "grad_norm": 0.15240547060966492, + "learning_rate": 6.471781920645114e-05, + "loss": 0.0104, + "step": 26080 + }, + { + "epoch": 15.186263096623982, + "grad_norm": 0.2164161652326584, + "learning_rate": 6.469148007173434e-05, + "loss": 0.012, + "step": 26090 + }, + { + "epoch": 15.192083818393481, + "grad_norm": 0.1838248372077942, + "learning_rate": 6.466513647413381e-05, + "loss": 0.0138, + "step": 26100 + }, + { + "epoch": 15.19790454016298, + "grad_norm": 0.20881645381450653, + "learning_rate": 6.463878842165203e-05, + "loss": 0.0135, + "step": 26110 + }, + { + "epoch": 15.20372526193248, + "grad_norm": 0.1675874888896942, + "learning_rate": 6.461243592229286e-05, + "loss": 0.016, + "step": 26120 + }, + { + "epoch": 15.209545983701979, + "grad_norm": 0.14522086083889008, + "learning_rate": 6.458607898406146e-05, + "loss": 0.009, + "step": 26130 + }, + { + "epoch": 15.215366705471478, + "grad_norm": 0.2403062880039215, + "learning_rate": 6.455971761496439e-05, + "loss": 0.0103, + "step": 26140 + }, + { + "epoch": 15.221187427240977, + "grad_norm": 0.2410437911748886, + "learning_rate": 6.453335182300953e-05, + "loss": 0.0112, + "step": 26150 + }, + { + "epoch": 15.227008149010477, + "grad_norm": 0.17282222211360931, + "learning_rate": 6.450698161620612e-05, + "loss": 0.0113, + "step": 26160 + }, + { + "epoch": 15.232828870779977, + "grad_norm": 0.22776617109775543, + "learning_rate": 6.448060700256473e-05, + "loss": 0.0152, + "step": 26170 + }, + { + "epoch": 15.238649592549477, + "grad_norm": 0.1403237283229828, + "learning_rate": 6.445422799009726e-05, + "loss": 0.0101, + "step": 26180 + }, + { + "epoch": 15.244470314318976, + "grad_norm": 0.29721829295158386, + "learning_rate": 6.442784458681699e-05, + "loss": 0.0144, + "step": 26190 + }, + { + "epoch": 15.250291036088475, + "grad_norm": 0.24984709918498993, + "learning_rate": 6.440145680073847e-05, + "loss": 0.0113, + "step": 26200 + }, + { + "epoch": 15.256111757857974, + "grad_norm": 0.11429306864738464, + "learning_rate": 6.437506463987762e-05, + "loss": 0.014, + "step": 26210 + }, + { + "epoch": 15.261932479627474, + "grad_norm": 0.11932381987571716, + "learning_rate": 6.434866811225168e-05, + "loss": 0.0116, + "step": 26220 + }, + { + "epoch": 15.267753201396973, + "grad_norm": 0.1742265671491623, + "learning_rate": 6.432226722587923e-05, + "loss": 0.0165, + "step": 26230 + }, + { + "epoch": 15.273573923166472, + "grad_norm": 0.26595622301101685, + "learning_rate": 6.429586198878015e-05, + "loss": 0.0147, + "step": 26240 + }, + { + "epoch": 15.279394644935971, + "grad_norm": 0.20247623324394226, + "learning_rate": 6.426945240897566e-05, + "loss": 0.0203, + "step": 26250 + }, + { + "epoch": 15.285215366705472, + "grad_norm": 0.1731765866279602, + "learning_rate": 6.424303849448829e-05, + "loss": 0.0123, + "step": 26260 + }, + { + "epoch": 15.291036088474971, + "grad_norm": 0.14904780685901642, + "learning_rate": 6.42166202533419e-05, + "loss": 0.0111, + "step": 26270 + }, + { + "epoch": 15.29685681024447, + "grad_norm": 0.20747411251068115, + "learning_rate": 6.419019769356164e-05, + "loss": 0.0126, + "step": 26280 + }, + { + "epoch": 15.30267753201397, + "grad_norm": 0.2413093000650406, + "learning_rate": 6.416377082317398e-05, + "loss": 0.0135, + "step": 26290 + }, + { + "epoch": 15.308498253783469, + "grad_norm": 0.1879521757364273, + "learning_rate": 6.413733965020674e-05, + "loss": 0.0112, + "step": 26300 + }, + { + "epoch": 15.314318975552968, + "grad_norm": 0.18780063092708588, + "learning_rate": 6.411090418268896e-05, + "loss": 0.0098, + "step": 26310 + }, + { + "epoch": 15.320139697322467, + "grad_norm": 0.19004814326763153, + "learning_rate": 6.408446442865109e-05, + "loss": 0.0101, + "step": 26320 + }, + { + "epoch": 15.325960419091967, + "grad_norm": 0.16217443346977234, + "learning_rate": 6.405802039612479e-05, + "loss": 0.0122, + "step": 26330 + }, + { + "epoch": 15.331781140861466, + "grad_norm": 0.18260174989700317, + "learning_rate": 6.403157209314308e-05, + "loss": 0.0141, + "step": 26340 + }, + { + "epoch": 15.337601862630967, + "grad_norm": 0.24114271998405457, + "learning_rate": 6.400511952774024e-05, + "loss": 0.0138, + "step": 26350 + }, + { + "epoch": 15.343422584400466, + "grad_norm": 0.16162066161632538, + "learning_rate": 6.397866270795187e-05, + "loss": 0.01, + "step": 26360 + }, + { + "epoch": 15.349243306169965, + "grad_norm": 0.15161417424678802, + "learning_rate": 6.395220164181489e-05, + "loss": 0.0108, + "step": 26370 + }, + { + "epoch": 15.355064027939465, + "grad_norm": 0.21503421664237976, + "learning_rate": 6.39257363373674e-05, + "loss": 0.012, + "step": 26380 + }, + { + "epoch": 15.360884749708964, + "grad_norm": 0.20670874416828156, + "learning_rate": 6.389926680264892e-05, + "loss": 0.0148, + "step": 26390 + }, + { + "epoch": 15.366705471478463, + "grad_norm": 0.2207045704126358, + "learning_rate": 6.387279304570017e-05, + "loss": 0.0176, + "step": 26400 + }, + { + "epoch": 15.372526193247962, + "grad_norm": 0.16574320197105408, + "learning_rate": 6.384631507456319e-05, + "loss": 0.0119, + "step": 26410 + }, + { + "epoch": 15.378346915017461, + "grad_norm": 0.1814911961555481, + "learning_rate": 6.381983289728126e-05, + "loss": 0.0129, + "step": 26420 + }, + { + "epoch": 15.384167636786962, + "grad_norm": 0.1380680352449417, + "learning_rate": 6.3793346521899e-05, + "loss": 0.0125, + "step": 26430 + }, + { + "epoch": 15.389988358556462, + "grad_norm": 0.185481995344162, + "learning_rate": 6.376685595646226e-05, + "loss": 0.0155, + "step": 26440 + }, + { + "epoch": 15.39580908032596, + "grad_norm": 0.19727006554603577, + "learning_rate": 6.374036120901816e-05, + "loss": 0.0164, + "step": 26450 + }, + { + "epoch": 15.40162980209546, + "grad_norm": 0.18127687275409698, + "learning_rate": 6.371386228761514e-05, + "loss": 0.0125, + "step": 26460 + }, + { + "epoch": 15.40745052386496, + "grad_norm": 0.2815975248813629, + "learning_rate": 6.368735920030283e-05, + "loss": 0.0192, + "step": 26470 + }, + { + "epoch": 15.413271245634458, + "grad_norm": 0.19860614836215973, + "learning_rate": 6.366085195513218e-05, + "loss": 0.0152, + "step": 26480 + }, + { + "epoch": 15.419091967403958, + "grad_norm": 0.15694773197174072, + "learning_rate": 6.363434056015543e-05, + "loss": 0.0202, + "step": 26490 + }, + { + "epoch": 15.424912689173457, + "grad_norm": 0.1593560427427292, + "learning_rate": 6.360782502342599e-05, + "loss": 0.0126, + "step": 26500 + }, + { + "epoch": 15.430733410942956, + "grad_norm": 0.18545660376548767, + "learning_rate": 6.358130535299862e-05, + "loss": 0.0165, + "step": 26510 + }, + { + "epoch": 15.436554132712457, + "grad_norm": 0.19959890842437744, + "learning_rate": 6.355478155692926e-05, + "loss": 0.0146, + "step": 26520 + }, + { + "epoch": 15.442374854481956, + "grad_norm": 0.2618829309940338, + "learning_rate": 6.352825364327517e-05, + "loss": 0.0139, + "step": 26530 + }, + { + "epoch": 15.448195576251456, + "grad_norm": 0.18362219631671906, + "learning_rate": 6.350172162009482e-05, + "loss": 0.0218, + "step": 26540 + }, + { + "epoch": 15.454016298020955, + "grad_norm": 0.2677624821662903, + "learning_rate": 6.347518549544793e-05, + "loss": 0.0149, + "step": 26550 + }, + { + "epoch": 15.459837019790454, + "grad_norm": 0.17144623398780823, + "learning_rate": 6.344864527739547e-05, + "loss": 0.0114, + "step": 26560 + }, + { + "epoch": 15.465657741559953, + "grad_norm": 0.1667124480009079, + "learning_rate": 6.342210097399966e-05, + "loss": 0.014, + "step": 26570 + }, + { + "epoch": 15.471478463329452, + "grad_norm": 0.1170695573091507, + "learning_rate": 6.339555259332398e-05, + "loss": 0.0175, + "step": 26580 + }, + { + "epoch": 15.477299185098952, + "grad_norm": 0.22148439288139343, + "learning_rate": 6.33690001434331e-05, + "loss": 0.0202, + "step": 26590 + }, + { + "epoch": 15.48311990686845, + "grad_norm": 0.18872317671775818, + "learning_rate": 6.334244363239296e-05, + "loss": 0.0154, + "step": 26600 + }, + { + "epoch": 15.488940628637952, + "grad_norm": 0.14118574559688568, + "learning_rate": 6.331588306827073e-05, + "loss": 0.0117, + "step": 26610 + }, + { + "epoch": 15.494761350407451, + "grad_norm": 0.22304312884807587, + "learning_rate": 6.328931845913483e-05, + "loss": 0.0121, + "step": 26620 + }, + { + "epoch": 15.50058207217695, + "grad_norm": 0.15458542108535767, + "learning_rate": 6.326274981305484e-05, + "loss": 0.0121, + "step": 26630 + }, + { + "epoch": 15.50640279394645, + "grad_norm": 0.17114372551441193, + "learning_rate": 6.323617713810166e-05, + "loss": 0.0151, + "step": 26640 + }, + { + "epoch": 15.512223515715949, + "grad_norm": 0.15751226246356964, + "learning_rate": 6.320960044234734e-05, + "loss": 0.0117, + "step": 26650 + }, + { + "epoch": 15.518044237485448, + "grad_norm": 0.23703163862228394, + "learning_rate": 6.318301973386518e-05, + "loss": 0.016, + "step": 26660 + }, + { + "epoch": 15.523864959254947, + "grad_norm": 0.3149299919605255, + "learning_rate": 6.315643502072971e-05, + "loss": 0.013, + "step": 26670 + }, + { + "epoch": 15.529685681024446, + "grad_norm": 0.18256264925003052, + "learning_rate": 6.312984631101667e-05, + "loss": 0.0105, + "step": 26680 + }, + { + "epoch": 15.535506402793946, + "grad_norm": 0.1697777807712555, + "learning_rate": 6.310325361280297e-05, + "loss": 0.0116, + "step": 26690 + }, + { + "epoch": 15.541327124563447, + "grad_norm": 0.1877039521932602, + "learning_rate": 6.30766569341668e-05, + "loss": 0.0108, + "step": 26700 + }, + { + "epoch": 15.547147846332946, + "grad_norm": 0.15870524942874908, + "learning_rate": 6.305005628318753e-05, + "loss": 0.0144, + "step": 26710 + }, + { + "epoch": 15.552968568102445, + "grad_norm": 0.1644054502248764, + "learning_rate": 6.302345166794572e-05, + "loss": 0.0114, + "step": 26720 + }, + { + "epoch": 15.558789289871944, + "grad_norm": 0.19081497192382812, + "learning_rate": 6.299684309652316e-05, + "loss": 0.0127, + "step": 26730 + }, + { + "epoch": 15.564610011641443, + "grad_norm": 0.19660882651805878, + "learning_rate": 6.297023057700283e-05, + "loss": 0.009, + "step": 26740 + }, + { + "epoch": 15.570430733410943, + "grad_norm": 0.15077082812786102, + "learning_rate": 6.294361411746891e-05, + "loss": 0.0117, + "step": 26750 + }, + { + "epoch": 15.576251455180442, + "grad_norm": 0.19206108152866364, + "learning_rate": 6.291699372600677e-05, + "loss": 0.0146, + "step": 26760 + }, + { + "epoch": 15.582072176949941, + "grad_norm": 0.2189801186323166, + "learning_rate": 6.2890369410703e-05, + "loss": 0.0158, + "step": 26770 + }, + { + "epoch": 15.587892898719442, + "grad_norm": 0.22234474122524261, + "learning_rate": 6.286374117964534e-05, + "loss": 0.0106, + "step": 26780 + }, + { + "epoch": 15.593713620488941, + "grad_norm": 0.14458967745304108, + "learning_rate": 6.283710904092277e-05, + "loss": 0.0107, + "step": 26790 + }, + { + "epoch": 15.59953434225844, + "grad_norm": 0.16530460119247437, + "learning_rate": 6.281047300262542e-05, + "loss": 0.0117, + "step": 26800 + }, + { + "epoch": 15.60535506402794, + "grad_norm": 0.1806812435388565, + "learning_rate": 6.278383307284461e-05, + "loss": 0.0126, + "step": 26810 + }, + { + "epoch": 15.611175785797439, + "grad_norm": 0.1566435843706131, + "learning_rate": 6.275718925967284e-05, + "loss": 0.0136, + "step": 26820 + }, + { + "epoch": 15.616996507566938, + "grad_norm": 0.23477263748645782, + "learning_rate": 6.273054157120382e-05, + "loss": 0.0133, + "step": 26830 + }, + { + "epoch": 15.622817229336437, + "grad_norm": 0.17257942259311676, + "learning_rate": 6.270389001553238e-05, + "loss": 0.015, + "step": 26840 + }, + { + "epoch": 15.628637951105937, + "grad_norm": 0.19866232573986053, + "learning_rate": 6.26772346007546e-05, + "loss": 0.0164, + "step": 26850 + }, + { + "epoch": 15.634458672875436, + "grad_norm": 0.2253584861755371, + "learning_rate": 6.265057533496767e-05, + "loss": 0.0142, + "step": 26860 + }, + { + "epoch": 15.640279394644937, + "grad_norm": 0.2644549608230591, + "learning_rate": 6.262391222626997e-05, + "loss": 0.0165, + "step": 26870 + }, + { + "epoch": 15.646100116414436, + "grad_norm": 0.20646893978118896, + "learning_rate": 6.259724528276106e-05, + "loss": 0.0154, + "step": 26880 + }, + { + "epoch": 15.651920838183935, + "grad_norm": 0.2705398201942444, + "learning_rate": 6.257057451254162e-05, + "loss": 0.0132, + "step": 26890 + }, + { + "epoch": 15.657741559953434, + "grad_norm": 0.1339806765317917, + "learning_rate": 6.254389992371357e-05, + "loss": 0.0134, + "step": 26900 + }, + { + "epoch": 15.663562281722934, + "grad_norm": 0.1778518408536911, + "learning_rate": 6.25172215243799e-05, + "loss": 0.0136, + "step": 26910 + }, + { + "epoch": 15.669383003492433, + "grad_norm": 0.2057759016752243, + "learning_rate": 6.249053932264486e-05, + "loss": 0.0135, + "step": 26920 + }, + { + "epoch": 15.675203725261932, + "grad_norm": 0.15637066960334778, + "learning_rate": 6.246385332661376e-05, + "loss": 0.0127, + "step": 26930 + }, + { + "epoch": 15.681024447031431, + "grad_norm": 0.18498176336288452, + "learning_rate": 6.24371635443931e-05, + "loss": 0.0155, + "step": 26940 + }, + { + "epoch": 15.68684516880093, + "grad_norm": 0.1235509067773819, + "learning_rate": 6.241046998409054e-05, + "loss": 0.0087, + "step": 26950 + }, + { + "epoch": 15.692665890570431, + "grad_norm": 0.23640267550945282, + "learning_rate": 6.238377265381489e-05, + "loss": 0.0126, + "step": 26960 + }, + { + "epoch": 15.69848661233993, + "grad_norm": 0.1552855223417282, + "learning_rate": 6.235707156167607e-05, + "loss": 0.0176, + "step": 26970 + }, + { + "epoch": 15.70430733410943, + "grad_norm": 0.12525205314159393, + "learning_rate": 6.233036671578519e-05, + "loss": 0.022, + "step": 26980 + }, + { + "epoch": 15.71012805587893, + "grad_norm": 0.16311098635196686, + "learning_rate": 6.230365812425445e-05, + "loss": 0.0134, + "step": 26990 + }, + { + "epoch": 15.715948777648428, + "grad_norm": 0.19320736825466156, + "learning_rate": 6.227694579519724e-05, + "loss": 0.0163, + "step": 27000 + }, + { + "epoch": 15.721769499417928, + "grad_norm": 0.13548050820827484, + "learning_rate": 6.225022973672805e-05, + "loss": 0.0128, + "step": 27010 + }, + { + "epoch": 15.727590221187427, + "grad_norm": 0.2310212105512619, + "learning_rate": 6.222350995696253e-05, + "loss": 0.0187, + "step": 27020 + }, + { + "epoch": 15.733410942956926, + "grad_norm": 0.2519291341304779, + "learning_rate": 6.21967864640174e-05, + "loss": 0.0105, + "step": 27030 + }, + { + "epoch": 15.739231664726425, + "grad_norm": 0.29892295598983765, + "learning_rate": 6.217005926601059e-05, + "loss": 0.01, + "step": 27040 + }, + { + "epoch": 15.745052386495926, + "grad_norm": 0.19289420545101166, + "learning_rate": 6.214332837106111e-05, + "loss": 0.0101, + "step": 27050 + }, + { + "epoch": 15.750873108265425, + "grad_norm": 0.18365874886512756, + "learning_rate": 6.21165937872891e-05, + "loss": 0.0118, + "step": 27060 + }, + { + "epoch": 15.756693830034925, + "grad_norm": 0.22272560000419617, + "learning_rate": 6.208985552281582e-05, + "loss": 0.0142, + "step": 27070 + }, + { + "epoch": 15.762514551804424, + "grad_norm": 0.15311653912067413, + "learning_rate": 6.206311358576364e-05, + "loss": 0.0219, + "step": 27080 + }, + { + "epoch": 15.768335273573923, + "grad_norm": 0.15660889446735382, + "learning_rate": 6.203636798425608e-05, + "loss": 0.0123, + "step": 27090 + }, + { + "epoch": 15.774155995343422, + "grad_norm": 0.23685763776302338, + "learning_rate": 6.20096187264177e-05, + "loss": 0.0151, + "step": 27100 + }, + { + "epoch": 15.779976717112921, + "grad_norm": 0.19458739459514618, + "learning_rate": 6.198286582037425e-05, + "loss": 0.0117, + "step": 27110 + }, + { + "epoch": 15.78579743888242, + "grad_norm": 0.20439797639846802, + "learning_rate": 6.195610927425256e-05, + "loss": 0.0158, + "step": 27120 + }, + { + "epoch": 15.791618160651922, + "grad_norm": 0.20343467593193054, + "learning_rate": 6.192934909618056e-05, + "loss": 0.0088, + "step": 27130 + }, + { + "epoch": 15.797438882421421, + "grad_norm": 0.11111843585968018, + "learning_rate": 6.190258529428728e-05, + "loss": 0.0108, + "step": 27140 + }, + { + "epoch": 15.80325960419092, + "grad_norm": 0.1429162621498108, + "learning_rate": 6.187581787670285e-05, + "loss": 0.0195, + "step": 27150 + }, + { + "epoch": 15.80908032596042, + "grad_norm": 0.138364776968956, + "learning_rate": 6.184904685155852e-05, + "loss": 0.0147, + "step": 27160 + }, + { + "epoch": 15.814901047729919, + "grad_norm": 0.2254614531993866, + "learning_rate": 6.18222722269866e-05, + "loss": 0.017, + "step": 27170 + }, + { + "epoch": 15.820721769499418, + "grad_norm": 0.19852212071418762, + "learning_rate": 6.179549401112053e-05, + "loss": 0.0102, + "step": 27180 + }, + { + "epoch": 15.826542491268917, + "grad_norm": 0.1367267221212387, + "learning_rate": 6.176871221209482e-05, + "loss": 0.0134, + "step": 27190 + }, + { + "epoch": 15.832363213038416, + "grad_norm": 0.08862041682004929, + "learning_rate": 6.174192683804508e-05, + "loss": 0.0124, + "step": 27200 + }, + { + "epoch": 15.838183934807915, + "grad_norm": 0.1679544597864151, + "learning_rate": 6.1715137897108e-05, + "loss": 0.0121, + "step": 27210 + }, + { + "epoch": 15.844004656577416, + "grad_norm": 0.09269139915704727, + "learning_rate": 6.168834539742134e-05, + "loss": 0.017, + "step": 27220 + }, + { + "epoch": 15.849825378346916, + "grad_norm": 0.2355707585811615, + "learning_rate": 6.166154934712397e-05, + "loss": 0.0146, + "step": 27230 + }, + { + "epoch": 15.855646100116415, + "grad_norm": 0.21078960597515106, + "learning_rate": 6.163474975435581e-05, + "loss": 0.0102, + "step": 27240 + }, + { + "epoch": 15.861466821885914, + "grad_norm": 0.2857899069786072, + "learning_rate": 6.160794662725787e-05, + "loss": 0.0177, + "step": 27250 + }, + { + "epoch": 15.867287543655413, + "grad_norm": 0.26249268651008606, + "learning_rate": 6.158113997397222e-05, + "loss": 0.0144, + "step": 27260 + }, + { + "epoch": 15.873108265424912, + "grad_norm": 0.16070839762687683, + "learning_rate": 6.155432980264205e-05, + "loss": 0.012, + "step": 27270 + }, + { + "epoch": 15.878928987194412, + "grad_norm": 0.15286310017108917, + "learning_rate": 6.152751612141156e-05, + "loss": 0.017, + "step": 27280 + }, + { + "epoch": 15.884749708963911, + "grad_norm": 0.13463649153709412, + "learning_rate": 6.150069893842602e-05, + "loss": 0.0128, + "step": 27290 + }, + { + "epoch": 15.89057043073341, + "grad_norm": 0.15364782512187958, + "learning_rate": 6.147387826183182e-05, + "loss": 0.0111, + "step": 27300 + }, + { + "epoch": 15.896391152502911, + "grad_norm": 0.1906721144914627, + "learning_rate": 6.144705409977635e-05, + "loss": 0.0198, + "step": 27310 + }, + { + "epoch": 15.90221187427241, + "grad_norm": 0.13895097374916077, + "learning_rate": 6.142022646040808e-05, + "loss": 0.0115, + "step": 27320 + }, + { + "epoch": 15.90803259604191, + "grad_norm": 0.20839831233024597, + "learning_rate": 6.139339535187653e-05, + "loss": 0.0127, + "step": 27330 + }, + { + "epoch": 15.913853317811409, + "grad_norm": 0.1599379926919937, + "learning_rate": 6.136656078233232e-05, + "loss": 0.0095, + "step": 27340 + }, + { + "epoch": 15.919674039580908, + "grad_norm": 0.22166462242603302, + "learning_rate": 6.133972275992707e-05, + "loss": 0.0161, + "step": 27350 + }, + { + "epoch": 15.925494761350407, + "grad_norm": 0.18671056628227234, + "learning_rate": 6.131288129281342e-05, + "loss": 0.015, + "step": 27360 + }, + { + "epoch": 15.931315483119906, + "grad_norm": 0.1798444241285324, + "learning_rate": 6.128603638914516e-05, + "loss": 0.0125, + "step": 27370 + }, + { + "epoch": 15.937136204889406, + "grad_norm": 0.20247799158096313, + "learning_rate": 6.125918805707704e-05, + "loss": 0.0121, + "step": 27380 + }, + { + "epoch": 15.942956926658905, + "grad_norm": 0.14845643937587738, + "learning_rate": 6.123233630476485e-05, + "loss": 0.0128, + "step": 27390 + }, + { + "epoch": 15.948777648428406, + "grad_norm": 0.17592109739780426, + "learning_rate": 6.120548114036547e-05, + "loss": 0.0125, + "step": 27400 + }, + { + "epoch": 15.954598370197905, + "grad_norm": 0.2057344764471054, + "learning_rate": 6.117862257203679e-05, + "loss": 0.0133, + "step": 27410 + }, + { + "epoch": 15.960419091967404, + "grad_norm": 0.2247558832168579, + "learning_rate": 6.115176060793771e-05, + "loss": 0.0174, + "step": 27420 + }, + { + "epoch": 15.966239813736903, + "grad_norm": 0.23716293275356293, + "learning_rate": 6.112489525622822e-05, + "loss": 0.0098, + "step": 27430 + }, + { + "epoch": 15.972060535506403, + "grad_norm": 0.2707483172416687, + "learning_rate": 6.109802652506928e-05, + "loss": 0.0139, + "step": 27440 + }, + { + "epoch": 15.977881257275902, + "grad_norm": 0.24354888498783112, + "learning_rate": 6.107115442262291e-05, + "loss": 0.0116, + "step": 27450 + }, + { + "epoch": 15.983701979045401, + "grad_norm": 0.16638824343681335, + "learning_rate": 6.104427895705214e-05, + "loss": 0.0152, + "step": 27460 + }, + { + "epoch": 15.9895227008149, + "grad_norm": 0.14276820421218872, + "learning_rate": 6.101740013652103e-05, + "loss": 0.0077, + "step": 27470 + }, + { + "epoch": 15.995343422584401, + "grad_norm": 0.14945171773433685, + "learning_rate": 6.099051796919465e-05, + "loss": 0.0148, + "step": 27480 + }, + { + "epoch": 16.0011641443539, + "grad_norm": 0.1556803435087204, + "learning_rate": 6.096363246323911e-05, + "loss": 0.0156, + "step": 27490 + }, + { + "epoch": 16.0069848661234, + "grad_norm": 0.2542346119880676, + "learning_rate": 6.0936743626821504e-05, + "loss": 0.0147, + "step": 27500 + }, + { + "epoch": 16.0128055878929, + "grad_norm": 0.2602655589580536, + "learning_rate": 6.090985146810996e-05, + "loss": 0.0113, + "step": 27510 + }, + { + "epoch": 16.018626309662398, + "grad_norm": 0.1444215625524521, + "learning_rate": 6.088295599527357e-05, + "loss": 0.0139, + "step": 27520 + }, + { + "epoch": 16.024447031431897, + "grad_norm": 0.26362261176109314, + "learning_rate": 6.085605721648252e-05, + "loss": 0.0115, + "step": 27530 + }, + { + "epoch": 16.030267753201397, + "grad_norm": 0.3212401568889618, + "learning_rate": 6.082915513990792e-05, + "loss": 0.0132, + "step": 27540 + }, + { + "epoch": 16.036088474970896, + "grad_norm": 0.10069014132022858, + "learning_rate": 6.080224977372192e-05, + "loss": 0.0123, + "step": 27550 + }, + { + "epoch": 16.041909196740395, + "grad_norm": 0.19599507749080658, + "learning_rate": 6.0775341126097666e-05, + "loss": 0.0128, + "step": 27560 + }, + { + "epoch": 16.047729918509894, + "grad_norm": 0.166745126247406, + "learning_rate": 6.074842920520926e-05, + "loss": 0.0139, + "step": 27570 + }, + { + "epoch": 16.053550640279393, + "grad_norm": 0.2196914404630661, + "learning_rate": 6.072151401923186e-05, + "loss": 0.0132, + "step": 27580 + }, + { + "epoch": 16.059371362048893, + "grad_norm": 0.18322543799877167, + "learning_rate": 6.069459557634159e-05, + "loss": 0.0139, + "step": 27590 + }, + { + "epoch": 16.065192083818392, + "grad_norm": 0.2248426377773285, + "learning_rate": 6.066767388471557e-05, + "loss": 0.0148, + "step": 27600 + }, + { + "epoch": 16.07101280558789, + "grad_norm": 0.29641294479370117, + "learning_rate": 6.064074895253188e-05, + "loss": 0.0144, + "step": 27610 + }, + { + "epoch": 16.076833527357394, + "grad_norm": 0.11679032444953918, + "learning_rate": 6.061382078796961e-05, + "loss": 0.0161, + "step": 27620 + }, + { + "epoch": 16.082654249126893, + "grad_norm": 0.14626136422157288, + "learning_rate": 6.0586889399208814e-05, + "loss": 0.0117, + "step": 27630 + }, + { + "epoch": 16.088474970896392, + "grad_norm": 0.15815076231956482, + "learning_rate": 6.0559954794430565e-05, + "loss": 0.0159, + "step": 27640 + }, + { + "epoch": 16.09429569266589, + "grad_norm": 0.16721311211585999, + "learning_rate": 6.053301698181687e-05, + "loss": 0.0155, + "step": 27650 + }, + { + "epoch": 16.10011641443539, + "grad_norm": 0.16518628597259521, + "learning_rate": 6.0506075969550725e-05, + "loss": 0.0112, + "step": 27660 + }, + { + "epoch": 16.10593713620489, + "grad_norm": 0.15866920351982117, + "learning_rate": 6.047913176581609e-05, + "loss": 0.0122, + "step": 27670 + }, + { + "epoch": 16.11175785797439, + "grad_norm": 0.16200029850006104, + "learning_rate": 6.0452184378797904e-05, + "loss": 0.0118, + "step": 27680 + }, + { + "epoch": 16.11757857974389, + "grad_norm": 0.18503105640411377, + "learning_rate": 6.042523381668209e-05, + "loss": 0.0139, + "step": 27690 + }, + { + "epoch": 16.123399301513388, + "grad_norm": 0.18560200929641724, + "learning_rate": 6.03982800876555e-05, + "loss": 0.0104, + "step": 27700 + }, + { + "epoch": 16.129220023282887, + "grad_norm": 0.1732499897480011, + "learning_rate": 6.0371323199905975e-05, + "loss": 0.0187, + "step": 27710 + }, + { + "epoch": 16.135040745052386, + "grad_norm": 0.18607337772846222, + "learning_rate": 6.03443631616223e-05, + "loss": 0.0139, + "step": 27720 + }, + { + "epoch": 16.140861466821885, + "grad_norm": 0.13487425446510315, + "learning_rate": 6.031739998099421e-05, + "loss": 0.0097, + "step": 27730 + }, + { + "epoch": 16.146682188591384, + "grad_norm": 0.15664677321910858, + "learning_rate": 6.029043366621243e-05, + "loss": 0.0104, + "step": 27740 + }, + { + "epoch": 16.152502910360884, + "grad_norm": 0.2161255031824112, + "learning_rate": 6.0263464225468615e-05, + "loss": 0.0145, + "step": 27750 + }, + { + "epoch": 16.158323632130383, + "grad_norm": 0.14324794709682465, + "learning_rate": 6.023649166695534e-05, + "loss": 0.0088, + "step": 27760 + }, + { + "epoch": 16.164144353899882, + "grad_norm": 0.17263147234916687, + "learning_rate": 6.0209515998866186e-05, + "loss": 0.0135, + "step": 27770 + }, + { + "epoch": 16.16996507566938, + "grad_norm": 0.17771191895008087, + "learning_rate": 6.018253722939563e-05, + "loss": 0.0131, + "step": 27780 + }, + { + "epoch": 16.175785797438884, + "grad_norm": 0.21196290850639343, + "learning_rate": 6.015555536673914e-05, + "loss": 0.0143, + "step": 27790 + }, + { + "epoch": 16.181606519208383, + "grad_norm": 0.18733979761600494, + "learning_rate": 6.0128570419093054e-05, + "loss": 0.0157, + "step": 27800 + }, + { + "epoch": 16.187427240977883, + "grad_norm": 0.17809350788593292, + "learning_rate": 6.010158239465471e-05, + "loss": 0.0099, + "step": 27810 + }, + { + "epoch": 16.19324796274738, + "grad_norm": 0.15098436176776886, + "learning_rate": 6.007459130162235e-05, + "loss": 0.014, + "step": 27820 + }, + { + "epoch": 16.19906868451688, + "grad_norm": 0.15078429877758026, + "learning_rate": 6.004759714819516e-05, + "loss": 0.0132, + "step": 27830 + }, + { + "epoch": 16.20488940628638, + "grad_norm": 0.21530458331108093, + "learning_rate": 6.002059994257323e-05, + "loss": 0.0155, + "step": 27840 + }, + { + "epoch": 16.21071012805588, + "grad_norm": 0.18323905766010284, + "learning_rate": 5.999359969295764e-05, + "loss": 0.013, + "step": 27850 + }, + { + "epoch": 16.21653084982538, + "grad_norm": 0.11928869038820267, + "learning_rate": 5.9966596407550314e-05, + "loss": 0.0092, + "step": 27860 + }, + { + "epoch": 16.222351571594878, + "grad_norm": 0.2505195438861847, + "learning_rate": 5.993959009455416e-05, + "loss": 0.0164, + "step": 27870 + }, + { + "epoch": 16.228172293364377, + "grad_norm": 0.27890443801879883, + "learning_rate": 5.991258076217298e-05, + "loss": 0.0107, + "step": 27880 + }, + { + "epoch": 16.233993015133876, + "grad_norm": 0.1833486109972, + "learning_rate": 5.988556841861147e-05, + "loss": 0.015, + "step": 27890 + }, + { + "epoch": 16.239813736903375, + "grad_norm": 0.2725547254085541, + "learning_rate": 5.985855307207531e-05, + "loss": 0.0163, + "step": 27900 + }, + { + "epoch": 16.245634458672875, + "grad_norm": 0.20399704575538635, + "learning_rate": 5.9831534730771e-05, + "loss": 0.0112, + "step": 27910 + }, + { + "epoch": 16.251455180442374, + "grad_norm": 0.14375315606594086, + "learning_rate": 5.980451340290605e-05, + "loss": 0.0124, + "step": 27920 + }, + { + "epoch": 16.257275902211873, + "grad_norm": 0.16215011477470398, + "learning_rate": 5.97774890966888e-05, + "loss": 0.0087, + "step": 27930 + }, + { + "epoch": 16.263096623981372, + "grad_norm": 0.21699760854244232, + "learning_rate": 5.975046182032851e-05, + "loss": 0.012, + "step": 27940 + }, + { + "epoch": 16.26891734575087, + "grad_norm": 0.2453547567129135, + "learning_rate": 5.972343158203537e-05, + "loss": 0.0122, + "step": 27950 + }, + { + "epoch": 16.274738067520374, + "grad_norm": 0.16436949372291565, + "learning_rate": 5.969639839002045e-05, + "loss": 0.0113, + "step": 27960 + }, + { + "epoch": 16.280558789289874, + "grad_norm": 0.12957365810871124, + "learning_rate": 5.966936225249572e-05, + "loss": 0.0139, + "step": 27970 + }, + { + "epoch": 16.286379511059373, + "grad_norm": 0.26771044731140137, + "learning_rate": 5.9642323177674044e-05, + "loss": 0.0096, + "step": 27980 + }, + { + "epoch": 16.292200232828872, + "grad_norm": 0.22723989188671112, + "learning_rate": 5.9615281173769154e-05, + "loss": 0.0108, + "step": 27990 + }, + { + "epoch": 16.29802095459837, + "grad_norm": 0.10659599304199219, + "learning_rate": 5.958823624899574e-05, + "loss": 0.0102, + "step": 28000 + }, + { + "epoch": 16.30384167636787, + "grad_norm": 0.2000678926706314, + "learning_rate": 5.956118841156933e-05, + "loss": 0.0115, + "step": 28010 + }, + { + "epoch": 16.30966239813737, + "grad_norm": 0.1099746823310852, + "learning_rate": 5.953413766970631e-05, + "loss": 0.0121, + "step": 28020 + }, + { + "epoch": 16.31548311990687, + "grad_norm": 0.2053193598985672, + "learning_rate": 5.9507084031624e-05, + "loss": 0.0134, + "step": 28030 + }, + { + "epoch": 16.321303841676368, + "grad_norm": 0.2474842667579651, + "learning_rate": 5.948002750554058e-05, + "loss": 0.0149, + "step": 28040 + }, + { + "epoch": 16.327124563445867, + "grad_norm": 0.19379575550556183, + "learning_rate": 5.9452968099675124e-05, + "loss": 0.0122, + "step": 28050 + }, + { + "epoch": 16.332945285215366, + "grad_norm": 0.16897854208946228, + "learning_rate": 5.9425905822247527e-05, + "loss": 0.014, + "step": 28060 + }, + { + "epoch": 16.338766006984866, + "grad_norm": 0.24512413144111633, + "learning_rate": 5.939884068147864e-05, + "loss": 0.0136, + "step": 28070 + }, + { + "epoch": 16.344586728754365, + "grad_norm": 0.17068564891815186, + "learning_rate": 5.937177268559011e-05, + "loss": 0.0111, + "step": 28080 + }, + { + "epoch": 16.350407450523864, + "grad_norm": 0.18789656460285187, + "learning_rate": 5.934470184280448e-05, + "loss": 0.0104, + "step": 28090 + }, + { + "epoch": 16.356228172293363, + "grad_norm": 0.19035153090953827, + "learning_rate": 5.931762816134516e-05, + "loss": 0.0119, + "step": 28100 + }, + { + "epoch": 16.362048894062863, + "grad_norm": 0.26387926936149597, + "learning_rate": 5.9290551649436434e-05, + "loss": 0.0116, + "step": 28110 + }, + { + "epoch": 16.36786961583236, + "grad_norm": 0.296365350484848, + "learning_rate": 5.9263472315303416e-05, + "loss": 0.0125, + "step": 28120 + }, + { + "epoch": 16.37369033760186, + "grad_norm": 0.20899993181228638, + "learning_rate": 5.9236390167172096e-05, + "loss": 0.0126, + "step": 28130 + }, + { + "epoch": 16.379511059371364, + "grad_norm": 0.19141970574855804, + "learning_rate": 5.920930521326932e-05, + "loss": 0.013, + "step": 28140 + }, + { + "epoch": 16.385331781140863, + "grad_norm": 0.1597336083650589, + "learning_rate": 5.918221746182276e-05, + "loss": 0.0139, + "step": 28150 + }, + { + "epoch": 16.391152502910362, + "grad_norm": 0.10455428063869476, + "learning_rate": 5.9155126921061e-05, + "loss": 0.0113, + "step": 28160 + }, + { + "epoch": 16.39697322467986, + "grad_norm": 0.22749091684818268, + "learning_rate": 5.91280335992134e-05, + "loss": 0.0121, + "step": 28170 + }, + { + "epoch": 16.40279394644936, + "grad_norm": 0.14428508281707764, + "learning_rate": 5.91009375045102e-05, + "loss": 0.0134, + "step": 28180 + }, + { + "epoch": 16.40861466821886, + "grad_norm": 0.11925283819437027, + "learning_rate": 5.9073838645182476e-05, + "loss": 0.0087, + "step": 28190 + }, + { + "epoch": 16.41443538998836, + "grad_norm": 0.15036126971244812, + "learning_rate": 5.904673702946217e-05, + "loss": 0.0141, + "step": 28200 + }, + { + "epoch": 16.42025611175786, + "grad_norm": 0.18890973925590515, + "learning_rate": 5.9019632665582004e-05, + "loss": 0.0109, + "step": 28210 + }, + { + "epoch": 16.426076833527357, + "grad_norm": 0.19297602772712708, + "learning_rate": 5.899252556177559e-05, + "loss": 0.0116, + "step": 28220 + }, + { + "epoch": 16.431897555296857, + "grad_norm": 0.13055598735809326, + "learning_rate": 5.896541572627735e-05, + "loss": 0.0121, + "step": 28230 + }, + { + "epoch": 16.437718277066356, + "grad_norm": 0.10670328885316849, + "learning_rate": 5.893830316732253e-05, + "loss": 0.0119, + "step": 28240 + }, + { + "epoch": 16.443538998835855, + "grad_norm": 0.17770415544509888, + "learning_rate": 5.8911187893147214e-05, + "loss": 0.0142, + "step": 28250 + }, + { + "epoch": 16.449359720605354, + "grad_norm": 0.22425983846187592, + "learning_rate": 5.888406991198828e-05, + "loss": 0.0143, + "step": 28260 + }, + { + "epoch": 16.455180442374854, + "grad_norm": 0.20574131608009338, + "learning_rate": 5.885694923208349e-05, + "loss": 0.0172, + "step": 28270 + }, + { + "epoch": 16.461001164144353, + "grad_norm": 0.2632496654987335, + "learning_rate": 5.882982586167138e-05, + "loss": 0.0128, + "step": 28280 + }, + { + "epoch": 16.466821885913852, + "grad_norm": 0.11188705265522003, + "learning_rate": 5.880269980899131e-05, + "loss": 0.0101, + "step": 28290 + }, + { + "epoch": 16.47264260768335, + "grad_norm": 0.14051666855812073, + "learning_rate": 5.8775571082283465e-05, + "loss": 0.0112, + "step": 28300 + }, + { + "epoch": 16.47846332945285, + "grad_norm": 0.14825290441513062, + "learning_rate": 5.8748439689788824e-05, + "loss": 0.0135, + "step": 28310 + }, + { + "epoch": 16.484284051222353, + "grad_norm": 0.12235958129167557, + "learning_rate": 5.87213056397492e-05, + "loss": 0.0117, + "step": 28320 + }, + { + "epoch": 16.490104772991852, + "grad_norm": 0.15842890739440918, + "learning_rate": 5.869416894040719e-05, + "loss": 0.015, + "step": 28330 + }, + { + "epoch": 16.49592549476135, + "grad_norm": 0.15233272314071655, + "learning_rate": 5.866702960000621e-05, + "loss": 0.0111, + "step": 28340 + }, + { + "epoch": 16.50174621653085, + "grad_norm": 0.2393541932106018, + "learning_rate": 5.863988762679048e-05, + "loss": 0.0131, + "step": 28350 + }, + { + "epoch": 16.50756693830035, + "grad_norm": 0.2097398042678833, + "learning_rate": 5.8612743029005e-05, + "loss": 0.0114, + "step": 28360 + }, + { + "epoch": 16.51338766006985, + "grad_norm": 0.18686561286449432, + "learning_rate": 5.858559581489561e-05, + "loss": 0.011, + "step": 28370 + }, + { + "epoch": 16.51920838183935, + "grad_norm": 0.09630461037158966, + "learning_rate": 5.85584459927089e-05, + "loss": 0.0121, + "step": 28380 + }, + { + "epoch": 16.525029103608848, + "grad_norm": 0.1486002802848816, + "learning_rate": 5.853129357069227e-05, + "loss": 0.0098, + "step": 28390 + }, + { + "epoch": 16.530849825378347, + "grad_norm": 0.17339718341827393, + "learning_rate": 5.8504138557093913e-05, + "loss": 0.0112, + "step": 28400 + }, + { + "epoch": 16.536670547147846, + "grad_norm": 0.1997438371181488, + "learning_rate": 5.8476980960162784e-05, + "loss": 0.0118, + "step": 28410 + }, + { + "epoch": 16.542491268917345, + "grad_norm": 0.2821563482284546, + "learning_rate": 5.844982078814868e-05, + "loss": 0.0134, + "step": 28420 + }, + { + "epoch": 16.548311990686845, + "grad_norm": 0.25448307394981384, + "learning_rate": 5.842265804930211e-05, + "loss": 0.0158, + "step": 28430 + }, + { + "epoch": 16.554132712456344, + "grad_norm": 0.20174309611320496, + "learning_rate": 5.839549275187444e-05, + "loss": 0.0116, + "step": 28440 + }, + { + "epoch": 16.559953434225843, + "grad_norm": 0.21930061280727386, + "learning_rate": 5.836832490411771e-05, + "loss": 0.0132, + "step": 28450 + }, + { + "epoch": 16.565774155995342, + "grad_norm": 0.21313253045082092, + "learning_rate": 5.834115451428485e-05, + "loss": 0.0149, + "step": 28460 + }, + { + "epoch": 16.57159487776484, + "grad_norm": 0.12323768436908722, + "learning_rate": 5.831398159062946e-05, + "loss": 0.0109, + "step": 28470 + }, + { + "epoch": 16.57741559953434, + "grad_norm": 0.2661484479904175, + "learning_rate": 5.828680614140599e-05, + "loss": 0.0111, + "step": 28480 + }, + { + "epoch": 16.583236321303843, + "grad_norm": 0.18922285735607147, + "learning_rate": 5.825962817486962e-05, + "loss": 0.0127, + "step": 28490 + }, + { + "epoch": 16.589057043073343, + "grad_norm": 0.1608707159757614, + "learning_rate": 5.823244769927629e-05, + "loss": 0.0108, + "step": 28500 + }, + { + "epoch": 16.594877764842842, + "grad_norm": 0.2097862958908081, + "learning_rate": 5.8205264722882716e-05, + "loss": 0.0166, + "step": 28510 + }, + { + "epoch": 16.60069848661234, + "grad_norm": 0.23737791180610657, + "learning_rate": 5.817807925394636e-05, + "loss": 0.0093, + "step": 28520 + }, + { + "epoch": 16.60651920838184, + "grad_norm": 0.13149234652519226, + "learning_rate": 5.815089130072546e-05, + "loss": 0.0112, + "step": 28530 + }, + { + "epoch": 16.61233993015134, + "grad_norm": 0.1824592649936676, + "learning_rate": 5.8123700871479e-05, + "loss": 0.0099, + "step": 28540 + }, + { + "epoch": 16.61816065192084, + "grad_norm": 0.15946917235851288, + "learning_rate": 5.809650797446671e-05, + "loss": 0.0111, + "step": 28550 + }, + { + "epoch": 16.623981373690338, + "grad_norm": 0.2099727988243103, + "learning_rate": 5.806931261794907e-05, + "loss": 0.0134, + "step": 28560 + }, + { + "epoch": 16.629802095459837, + "grad_norm": 0.17011769115924835, + "learning_rate": 5.804211481018731e-05, + "loss": 0.0077, + "step": 28570 + }, + { + "epoch": 16.635622817229336, + "grad_norm": 0.09437558054924011, + "learning_rate": 5.801491455944341e-05, + "loss": 0.0107, + "step": 28580 + }, + { + "epoch": 16.641443538998836, + "grad_norm": 0.10555020719766617, + "learning_rate": 5.79877118739801e-05, + "loss": 0.0091, + "step": 28590 + }, + { + "epoch": 16.647264260768335, + "grad_norm": 0.14191825687885284, + "learning_rate": 5.7960506762060816e-05, + "loss": 0.0132, + "step": 28600 + }, + { + "epoch": 16.653084982537834, + "grad_norm": 0.18364013731479645, + "learning_rate": 5.793329923194977e-05, + "loss": 0.009, + "step": 28610 + }, + { + "epoch": 16.658905704307333, + "grad_norm": 0.20591315627098083, + "learning_rate": 5.790608929191187e-05, + "loss": 0.0101, + "step": 28620 + }, + { + "epoch": 16.664726426076832, + "grad_norm": 0.13651396334171295, + "learning_rate": 5.78788769502128e-05, + "loss": 0.0157, + "step": 28630 + }, + { + "epoch": 16.67054714784633, + "grad_norm": 0.14682775735855103, + "learning_rate": 5.785166221511894e-05, + "loss": 0.0128, + "step": 28640 + }, + { + "epoch": 16.67636786961583, + "grad_norm": 0.14044611155986786, + "learning_rate": 5.7824445094897415e-05, + "loss": 0.0161, + "step": 28650 + }, + { + "epoch": 16.682188591385334, + "grad_norm": 0.1667587161064148, + "learning_rate": 5.7797225597816065e-05, + "loss": 0.0124, + "step": 28660 + }, + { + "epoch": 16.688009313154833, + "grad_norm": 0.2863614857196808, + "learning_rate": 5.777000373214345e-05, + "loss": 0.0105, + "step": 28670 + }, + { + "epoch": 16.693830034924332, + "grad_norm": 0.19845297932624817, + "learning_rate": 5.774277950614885e-05, + "loss": 0.0183, + "step": 28680 + }, + { + "epoch": 16.69965075669383, + "grad_norm": 0.19679377973079681, + "learning_rate": 5.771555292810227e-05, + "loss": 0.0121, + "step": 28690 + }, + { + "epoch": 16.70547147846333, + "grad_norm": 0.19950808584690094, + "learning_rate": 5.768832400627444e-05, + "loss": 0.0117, + "step": 28700 + }, + { + "epoch": 16.71129220023283, + "grad_norm": 0.2230244129896164, + "learning_rate": 5.7661092748936775e-05, + "loss": 0.0133, + "step": 28710 + }, + { + "epoch": 16.71711292200233, + "grad_norm": 0.12678126990795135, + "learning_rate": 5.76338591643614e-05, + "loss": 0.0157, + "step": 28720 + }, + { + "epoch": 16.722933643771828, + "grad_norm": 0.16344167292118073, + "learning_rate": 5.760662326082118e-05, + "loss": 0.0148, + "step": 28730 + }, + { + "epoch": 16.728754365541327, + "grad_norm": 0.14660614728927612, + "learning_rate": 5.757938504658965e-05, + "loss": 0.0098, + "step": 28740 + }, + { + "epoch": 16.734575087310827, + "grad_norm": 0.16023465991020203, + "learning_rate": 5.755214452994107e-05, + "loss": 0.0112, + "step": 28750 + }, + { + "epoch": 16.740395809080326, + "grad_norm": 0.11055311560630798, + "learning_rate": 5.752490171915039e-05, + "loss": 0.0174, + "step": 28760 + }, + { + "epoch": 16.746216530849825, + "grad_norm": 0.18671970069408417, + "learning_rate": 5.749765662249324e-05, + "loss": 0.0142, + "step": 28770 + }, + { + "epoch": 16.752037252619324, + "grad_norm": 0.19945505261421204, + "learning_rate": 5.747040924824596e-05, + "loss": 0.0076, + "step": 28780 + }, + { + "epoch": 16.757857974388823, + "grad_norm": 0.21327541768550873, + "learning_rate": 5.7443159604685613e-05, + "loss": 0.0113, + "step": 28790 + }, + { + "epoch": 16.763678696158323, + "grad_norm": 0.11252987384796143, + "learning_rate": 5.74159077000899e-05, + "loss": 0.014, + "step": 28800 + }, + { + "epoch": 16.769499417927822, + "grad_norm": 0.22377096116542816, + "learning_rate": 5.7388653542737235e-05, + "loss": 0.0126, + "step": 28810 + }, + { + "epoch": 16.77532013969732, + "grad_norm": 0.11118891835212708, + "learning_rate": 5.736139714090672e-05, + "loss": 0.0139, + "step": 28820 + }, + { + "epoch": 16.78114086146682, + "grad_norm": 0.19069206714630127, + "learning_rate": 5.73341385028781e-05, + "loss": 0.0121, + "step": 28830 + }, + { + "epoch": 16.78696158323632, + "grad_norm": 0.1522088646888733, + "learning_rate": 5.7306877636931855e-05, + "loss": 0.0106, + "step": 28840 + }, + { + "epoch": 16.792782305005822, + "grad_norm": 0.156394824385643, + "learning_rate": 5.7279614551349125e-05, + "loss": 0.0106, + "step": 28850 + }, + { + "epoch": 16.79860302677532, + "grad_norm": 0.22606554627418518, + "learning_rate": 5.725234925441169e-05, + "loss": 0.008, + "step": 28860 + }, + { + "epoch": 16.80442374854482, + "grad_norm": 0.2010137438774109, + "learning_rate": 5.7225081754402044e-05, + "loss": 0.0107, + "step": 28870 + }, + { + "epoch": 16.81024447031432, + "grad_norm": 0.1751130223274231, + "learning_rate": 5.7197812059603326e-05, + "loss": 0.0112, + "step": 28880 + }, + { + "epoch": 16.81606519208382, + "grad_norm": 0.18474030494689941, + "learning_rate": 5.717054017829934e-05, + "loss": 0.0144, + "step": 28890 + }, + { + "epoch": 16.82188591385332, + "grad_norm": 0.20282459259033203, + "learning_rate": 5.7143266118774584e-05, + "loss": 0.0199, + "step": 28900 + }, + { + "epoch": 16.827706635622818, + "grad_norm": 0.1645049899816513, + "learning_rate": 5.711598988931418e-05, + "loss": 0.0111, + "step": 28910 + }, + { + "epoch": 16.833527357392317, + "grad_norm": 0.18717698752880096, + "learning_rate": 5.7088711498203954e-05, + "loss": 0.0106, + "step": 28920 + }, + { + "epoch": 16.839348079161816, + "grad_norm": 0.17064838111400604, + "learning_rate": 5.706143095373033e-05, + "loss": 0.0158, + "step": 28930 + }, + { + "epoch": 16.845168800931315, + "grad_norm": 0.15215930342674255, + "learning_rate": 5.703414826418042e-05, + "loss": 0.0112, + "step": 28940 + }, + { + "epoch": 16.850989522700814, + "grad_norm": 0.18134084343910217, + "learning_rate": 5.7006863437842007e-05, + "loss": 0.0128, + "step": 28950 + }, + { + "epoch": 16.856810244470314, + "grad_norm": 0.14375823736190796, + "learning_rate": 5.697957648300348e-05, + "loss": 0.0177, + "step": 28960 + }, + { + "epoch": 16.862630966239813, + "grad_norm": 0.1756073385477066, + "learning_rate": 5.695228740795391e-05, + "loss": 0.0129, + "step": 28970 + }, + { + "epoch": 16.868451688009312, + "grad_norm": 0.19213049113750458, + "learning_rate": 5.6924996220982985e-05, + "loss": 0.0138, + "step": 28980 + }, + { + "epoch": 16.87427240977881, + "grad_norm": 0.22376161813735962, + "learning_rate": 5.6897702930381045e-05, + "loss": 0.011, + "step": 28990 + }, + { + "epoch": 16.88009313154831, + "grad_norm": 0.1525183916091919, + "learning_rate": 5.687040754443908e-05, + "loss": 0.009, + "step": 29000 + }, + { + "epoch": 16.88591385331781, + "grad_norm": 0.1429033726453781, + "learning_rate": 5.6843110071448725e-05, + "loss": 0.013, + "step": 29010 + }, + { + "epoch": 16.891734575087312, + "grad_norm": 0.24297630786895752, + "learning_rate": 5.6815810519702194e-05, + "loss": 0.0114, + "step": 29020 + }, + { + "epoch": 16.89755529685681, + "grad_norm": 0.16276469826698303, + "learning_rate": 5.6788508897492396e-05, + "loss": 0.0095, + "step": 29030 + }, + { + "epoch": 16.90337601862631, + "grad_norm": 0.2629111409187317, + "learning_rate": 5.676120521311282e-05, + "loss": 0.0137, + "step": 29040 + }, + { + "epoch": 16.90919674039581, + "grad_norm": 0.11115598678588867, + "learning_rate": 5.6733899474857634e-05, + "loss": 0.014, + "step": 29050 + }, + { + "epoch": 16.91501746216531, + "grad_norm": 0.23399008810520172, + "learning_rate": 5.670659169102157e-05, + "loss": 0.0127, + "step": 29060 + }, + { + "epoch": 16.92083818393481, + "grad_norm": 0.2940141260623932, + "learning_rate": 5.6679281869900044e-05, + "loss": 0.0169, + "step": 29070 + }, + { + "epoch": 16.926658905704308, + "grad_norm": 0.22601063549518585, + "learning_rate": 5.6651970019789045e-05, + "loss": 0.0139, + "step": 29080 + }, + { + "epoch": 16.932479627473807, + "grad_norm": 0.2175331711769104, + "learning_rate": 5.662465614898519e-05, + "loss": 0.0111, + "step": 29090 + }, + { + "epoch": 16.938300349243306, + "grad_norm": 0.1547728031873703, + "learning_rate": 5.6597340265785695e-05, + "loss": 0.0191, + "step": 29100 + }, + { + "epoch": 16.944121071012805, + "grad_norm": 0.14428399503231049, + "learning_rate": 5.657002237848843e-05, + "loss": 0.0108, + "step": 29110 + }, + { + "epoch": 16.949941792782305, + "grad_norm": 0.16302236914634705, + "learning_rate": 5.654270249539183e-05, + "loss": 0.01, + "step": 29120 + }, + { + "epoch": 16.955762514551804, + "grad_norm": 0.16808094084262848, + "learning_rate": 5.651538062479498e-05, + "loss": 0.0165, + "step": 29130 + }, + { + "epoch": 16.961583236321303, + "grad_norm": 0.1918502300977707, + "learning_rate": 5.648805677499751e-05, + "loss": 0.0119, + "step": 29140 + }, + { + "epoch": 16.967403958090802, + "grad_norm": 0.1972024291753769, + "learning_rate": 5.646073095429969e-05, + "loss": 0.0141, + "step": 29150 + }, + { + "epoch": 16.9732246798603, + "grad_norm": 0.13421893119812012, + "learning_rate": 5.643340317100241e-05, + "loss": 0.0118, + "step": 29160 + }, + { + "epoch": 16.9790454016298, + "grad_norm": 0.21008077263832092, + "learning_rate": 5.64060734334071e-05, + "loss": 0.0142, + "step": 29170 + }, + { + "epoch": 16.9848661233993, + "grad_norm": 0.10190806537866592, + "learning_rate": 5.637874174981583e-05, + "loss": 0.0106, + "step": 29180 + }, + { + "epoch": 16.990686845168803, + "grad_norm": 0.14332538843154907, + "learning_rate": 5.635140812853124e-05, + "loss": 0.012, + "step": 29190 + }, + { + "epoch": 16.996507566938302, + "grad_norm": 0.10756193101406097, + "learning_rate": 5.6324072577856544e-05, + "loss": 0.0114, + "step": 29200 + }, + { + "epoch": 17.0023282887078, + "grad_norm": 0.18801487982273102, + "learning_rate": 5.629673510609559e-05, + "loss": 0.0117, + "step": 29210 + }, + { + "epoch": 17.0081490104773, + "grad_norm": 0.10480181127786636, + "learning_rate": 5.626939572155276e-05, + "loss": 0.0142, + "step": 29220 + }, + { + "epoch": 17.0139697322468, + "grad_norm": 0.18320821225643158, + "learning_rate": 5.6242054432533054e-05, + "loss": 0.0105, + "step": 29230 + }, + { + "epoch": 17.0197904540163, + "grad_norm": 0.2429748922586441, + "learning_rate": 5.621471124734201e-05, + "loss": 0.0136, + "step": 29240 + }, + { + "epoch": 17.025611175785798, + "grad_norm": 0.260751873254776, + "learning_rate": 5.6187366174285794e-05, + "loss": 0.0219, + "step": 29250 + }, + { + "epoch": 17.031431897555297, + "grad_norm": 0.21046672761440277, + "learning_rate": 5.616001922167109e-05, + "loss": 0.0132, + "step": 29260 + }, + { + "epoch": 17.037252619324796, + "grad_norm": 0.16817006468772888, + "learning_rate": 5.61326703978052e-05, + "loss": 0.0121, + "step": 29270 + }, + { + "epoch": 17.043073341094296, + "grad_norm": 0.3315967619419098, + "learning_rate": 5.6105319710995964e-05, + "loss": 0.012, + "step": 29280 + }, + { + "epoch": 17.048894062863795, + "grad_norm": 0.13805076479911804, + "learning_rate": 5.60779671695518e-05, + "loss": 0.0121, + "step": 29290 + }, + { + "epoch": 17.054714784633294, + "grad_norm": 0.19249184429645538, + "learning_rate": 5.6050612781781684e-05, + "loss": 0.0117, + "step": 29300 + }, + { + "epoch": 17.060535506402793, + "grad_norm": 0.17761266231536865, + "learning_rate": 5.602325655599516e-05, + "loss": 0.0101, + "step": 29310 + }, + { + "epoch": 17.066356228172292, + "grad_norm": 0.1483728289604187, + "learning_rate": 5.599589850050234e-05, + "loss": 0.0104, + "step": 29320 + }, + { + "epoch": 17.07217694994179, + "grad_norm": 0.13170474767684937, + "learning_rate": 5.5968538623613874e-05, + "loss": 0.0097, + "step": 29330 + }, + { + "epoch": 17.07799767171129, + "grad_norm": 0.19764818251132965, + "learning_rate": 5.594117693364095e-05, + "loss": 0.01, + "step": 29340 + }, + { + "epoch": 17.08381839348079, + "grad_norm": 0.11838460713624954, + "learning_rate": 5.591381343889535e-05, + "loss": 0.0124, + "step": 29350 + }, + { + "epoch": 17.08963911525029, + "grad_norm": 0.07306783646345139, + "learning_rate": 5.5886448147689355e-05, + "loss": 0.0128, + "step": 29360 + }, + { + "epoch": 17.095459837019792, + "grad_norm": 0.13330036401748657, + "learning_rate": 5.585908106833585e-05, + "loss": 0.0093, + "step": 29370 + }, + { + "epoch": 17.10128055878929, + "grad_norm": 0.10433923453092575, + "learning_rate": 5.5831712209148226e-05, + "loss": 0.011, + "step": 29380 + }, + { + "epoch": 17.10710128055879, + "grad_norm": 0.16782422363758087, + "learning_rate": 5.58043415784404e-05, + "loss": 0.0136, + "step": 29390 + }, + { + "epoch": 17.11292200232829, + "grad_norm": 0.1346663534641266, + "learning_rate": 5.577696918452686e-05, + "loss": 0.0098, + "step": 29400 + }, + { + "epoch": 17.11874272409779, + "grad_norm": 0.10792607814073563, + "learning_rate": 5.5749595035722604e-05, + "loss": 0.0271, + "step": 29410 + }, + { + "epoch": 17.124563445867288, + "grad_norm": 0.15132607519626617, + "learning_rate": 5.5722219140343193e-05, + "loss": 0.0158, + "step": 29420 + }, + { + "epoch": 17.130384167636787, + "grad_norm": 0.2087009847164154, + "learning_rate": 5.56948415067047e-05, + "loss": 0.0119, + "step": 29430 + }, + { + "epoch": 17.136204889406287, + "grad_norm": 0.2209211140871048, + "learning_rate": 5.5667462143123704e-05, + "loss": 0.0169, + "step": 29440 + }, + { + "epoch": 17.142025611175786, + "grad_norm": 0.18040645122528076, + "learning_rate": 5.564008105791737e-05, + "loss": 0.0103, + "step": 29450 + }, + { + "epoch": 17.147846332945285, + "grad_norm": 0.2234218269586563, + "learning_rate": 5.5612698259403316e-05, + "loss": 0.0148, + "step": 29460 + }, + { + "epoch": 17.153667054714784, + "grad_norm": 0.14395081996917725, + "learning_rate": 5.5585313755899724e-05, + "loss": 0.0157, + "step": 29470 + }, + { + "epoch": 17.159487776484283, + "grad_norm": 0.196127787232399, + "learning_rate": 5.5557927555725285e-05, + "loss": 0.0164, + "step": 29480 + }, + { + "epoch": 17.165308498253783, + "grad_norm": 0.13211864233016968, + "learning_rate": 5.55305396671992e-05, + "loss": 0.0104, + "step": 29490 + }, + { + "epoch": 17.171129220023282, + "grad_norm": 0.19817641377449036, + "learning_rate": 5.55031500986412e-05, + "loss": 0.0158, + "step": 29500 + }, + { + "epoch": 17.17694994179278, + "grad_norm": 0.2844807207584381, + "learning_rate": 5.547575885837149e-05, + "loss": 0.0137, + "step": 29510 + }, + { + "epoch": 17.18277066356228, + "grad_norm": 0.1254895180463791, + "learning_rate": 5.5448365954710825e-05, + "loss": 0.0125, + "step": 29520 + }, + { + "epoch": 17.18859138533178, + "grad_norm": 0.11175572872161865, + "learning_rate": 5.5420971395980446e-05, + "loss": 0.0179, + "step": 29530 + }, + { + "epoch": 17.194412107101282, + "grad_norm": 0.2629016637802124, + "learning_rate": 5.539357519050209e-05, + "loss": 0.016, + "step": 29540 + }, + { + "epoch": 17.20023282887078, + "grad_norm": 0.08201931416988373, + "learning_rate": 5.536617734659799e-05, + "loss": 0.0157, + "step": 29550 + }, + { + "epoch": 17.20605355064028, + "grad_norm": 0.18974795937538147, + "learning_rate": 5.533877787259091e-05, + "loss": 0.0109, + "step": 29560 + }, + { + "epoch": 17.21187427240978, + "grad_norm": 0.14809216558933258, + "learning_rate": 5.5311376776804044e-05, + "loss": 0.0127, + "step": 29570 + }, + { + "epoch": 17.21769499417928, + "grad_norm": 0.1785852015018463, + "learning_rate": 5.528397406756118e-05, + "loss": 0.0134, + "step": 29580 + }, + { + "epoch": 17.22351571594878, + "grad_norm": 0.1613791137933731, + "learning_rate": 5.525656975318652e-05, + "loss": 0.0094, + "step": 29590 + }, + { + "epoch": 17.229336437718278, + "grad_norm": 0.07367366552352905, + "learning_rate": 5.522916384200474e-05, + "loss": 0.0073, + "step": 29600 + }, + { + "epoch": 17.235157159487777, + "grad_norm": 0.14832718670368195, + "learning_rate": 5.520175634234106e-05, + "loss": 0.0104, + "step": 29610 + }, + { + "epoch": 17.240977881257276, + "grad_norm": 0.21730118989944458, + "learning_rate": 5.517434726252113e-05, + "loss": 0.0101, + "step": 29620 + }, + { + "epoch": 17.246798603026775, + "grad_norm": 0.14734041690826416, + "learning_rate": 5.514693661087113e-05, + "loss": 0.0094, + "step": 29630 + }, + { + "epoch": 17.252619324796274, + "grad_norm": 0.20336425304412842, + "learning_rate": 5.511952439571769e-05, + "loss": 0.0112, + "step": 29640 + }, + { + "epoch": 17.258440046565774, + "grad_norm": 0.13347883522510529, + "learning_rate": 5.509211062538791e-05, + "loss": 0.009, + "step": 29650 + }, + { + "epoch": 17.264260768335273, + "grad_norm": 0.18431472778320312, + "learning_rate": 5.506469530820939e-05, + "loss": 0.0145, + "step": 29660 + }, + { + "epoch": 17.270081490104772, + "grad_norm": 0.12578357756137848, + "learning_rate": 5.503727845251014e-05, + "loss": 0.0112, + "step": 29670 + }, + { + "epoch": 17.27590221187427, + "grad_norm": 0.23325368762016296, + "learning_rate": 5.50098600666187e-05, + "loss": 0.0115, + "step": 29680 + }, + { + "epoch": 17.28172293364377, + "grad_norm": 0.1973099410533905, + "learning_rate": 5.498244015886406e-05, + "loss": 0.0117, + "step": 29690 + }, + { + "epoch": 17.28754365541327, + "grad_norm": 0.15166586637496948, + "learning_rate": 5.495501873757565e-05, + "loss": 0.0088, + "step": 29700 + }, + { + "epoch": 17.29336437718277, + "grad_norm": 0.14425857365131378, + "learning_rate": 5.492759581108336e-05, + "loss": 0.0107, + "step": 29710 + }, + { + "epoch": 17.29918509895227, + "grad_norm": 0.12829993665218353, + "learning_rate": 5.490017138771759e-05, + "loss": 0.0113, + "step": 29720 + }, + { + "epoch": 17.30500582072177, + "grad_norm": 0.14062272012233734, + "learning_rate": 5.487274547580912e-05, + "loss": 0.0111, + "step": 29730 + }, + { + "epoch": 17.31082654249127, + "grad_norm": 0.22430822253227234, + "learning_rate": 5.484531808368923e-05, + "loss": 0.0103, + "step": 29740 + }, + { + "epoch": 17.31664726426077, + "grad_norm": 0.1439807265996933, + "learning_rate": 5.4817889219689656e-05, + "loss": 0.0113, + "step": 29750 + }, + { + "epoch": 17.32246798603027, + "grad_norm": 0.20517469942569733, + "learning_rate": 5.4790458892142536e-05, + "loss": 0.0099, + "step": 29760 + }, + { + "epoch": 17.328288707799768, + "grad_norm": 0.2056550681591034, + "learning_rate": 5.476302710938048e-05, + "loss": 0.0097, + "step": 29770 + }, + { + "epoch": 17.334109429569267, + "grad_norm": 0.17343996465206146, + "learning_rate": 5.473559387973657e-05, + "loss": 0.0122, + "step": 29780 + }, + { + "epoch": 17.339930151338766, + "grad_norm": 0.16190625727176666, + "learning_rate": 5.470815921154425e-05, + "loss": 0.0133, + "step": 29790 + }, + { + "epoch": 17.345750873108265, + "grad_norm": 0.1333235800266266, + "learning_rate": 5.468072311313749e-05, + "loss": 0.008, + "step": 29800 + }, + { + "epoch": 17.351571594877765, + "grad_norm": 0.1614634245634079, + "learning_rate": 5.465328559285063e-05, + "loss": 0.0113, + "step": 29810 + }, + { + "epoch": 17.357392316647264, + "grad_norm": 0.18693643808364868, + "learning_rate": 5.462584665901849e-05, + "loss": 0.0108, + "step": 29820 + }, + { + "epoch": 17.363213038416763, + "grad_norm": 0.2814193069934845, + "learning_rate": 5.4598406319976235e-05, + "loss": 0.0146, + "step": 29830 + }, + { + "epoch": 17.369033760186262, + "grad_norm": 0.2381868064403534, + "learning_rate": 5.457096458405958e-05, + "loss": 0.0201, + "step": 29840 + }, + { + "epoch": 17.37485448195576, + "grad_norm": 0.1422087550163269, + "learning_rate": 5.454352145960457e-05, + "loss": 0.0113, + "step": 29850 + }, + { + "epoch": 17.38067520372526, + "grad_norm": 0.12633578479290009, + "learning_rate": 5.4516076954947715e-05, + "loss": 0.0095, + "step": 29860 + }, + { + "epoch": 17.38649592549476, + "grad_norm": 0.3976871967315674, + "learning_rate": 5.448863107842591e-05, + "loss": 0.0151, + "step": 29870 + }, + { + "epoch": 17.39231664726426, + "grad_norm": 0.19317062199115753, + "learning_rate": 5.446118383837651e-05, + "loss": 0.0128, + "step": 29880 + }, + { + "epoch": 17.398137369033762, + "grad_norm": 0.22793139517307281, + "learning_rate": 5.443373524313722e-05, + "loss": 0.015, + "step": 29890 + }, + { + "epoch": 17.40395809080326, + "grad_norm": 0.2032855749130249, + "learning_rate": 5.440628530104626e-05, + "loss": 0.0103, + "step": 29900 + }, + { + "epoch": 17.40977881257276, + "grad_norm": 0.1738421767950058, + "learning_rate": 5.4378834020442146e-05, + "loss": 0.0101, + "step": 29910 + }, + { + "epoch": 17.41559953434226, + "grad_norm": 0.19793549180030823, + "learning_rate": 5.4351381409663884e-05, + "loss": 0.0102, + "step": 29920 + }, + { + "epoch": 17.42142025611176, + "grad_norm": 0.18041767179965973, + "learning_rate": 5.432392747705084e-05, + "loss": 0.01, + "step": 29930 + }, + { + "epoch": 17.427240977881258, + "grad_norm": 0.15279892086982727, + "learning_rate": 5.429647223094278e-05, + "loss": 0.0105, + "step": 29940 + }, + { + "epoch": 17.433061699650757, + "grad_norm": 0.20437565445899963, + "learning_rate": 5.4269015679679924e-05, + "loss": 0.0129, + "step": 29950 + }, + { + "epoch": 17.438882421420256, + "grad_norm": 0.15029245615005493, + "learning_rate": 5.424155783160281e-05, + "loss": 0.0092, + "step": 29960 + }, + { + "epoch": 17.444703143189756, + "grad_norm": 0.19022338092327118, + "learning_rate": 5.4214098695052415e-05, + "loss": 0.0142, + "step": 29970 + }, + { + "epoch": 17.450523864959255, + "grad_norm": 0.25134527683258057, + "learning_rate": 5.418663827837012e-05, + "loss": 0.0145, + "step": 29980 + }, + { + "epoch": 17.456344586728754, + "grad_norm": 0.15959815680980682, + "learning_rate": 5.415917658989763e-05, + "loss": 0.0089, + "step": 29990 + }, + { + "epoch": 17.462165308498253, + "grad_norm": 0.24881796538829803, + "learning_rate": 5.413171363797713e-05, + "loss": 0.0123, + "step": 30000 + }, + { + "epoch": 17.467986030267753, + "grad_norm": 0.23090921342372894, + "learning_rate": 5.4104249430951116e-05, + "loss": 0.0121, + "step": 30010 + }, + { + "epoch": 17.47380675203725, + "grad_norm": 0.1790175586938858, + "learning_rate": 5.4076783977162494e-05, + "loss": 0.0113, + "step": 30020 + }, + { + "epoch": 17.47962747380675, + "grad_norm": 0.2390870749950409, + "learning_rate": 5.4049317284954525e-05, + "loss": 0.0106, + "step": 30030 + }, + { + "epoch": 17.48544819557625, + "grad_norm": 0.1554756760597229, + "learning_rate": 5.4021849362670884e-05, + "loss": 0.0102, + "step": 30040 + }, + { + "epoch": 17.49126891734575, + "grad_norm": 0.2868206799030304, + "learning_rate": 5.3994380218655604e-05, + "loss": 0.0168, + "step": 30050 + }, + { + "epoch": 17.49708963911525, + "grad_norm": 0.21125468611717224, + "learning_rate": 5.396690986125309e-05, + "loss": 0.0113, + "step": 30060 + }, + { + "epoch": 17.50291036088475, + "grad_norm": 0.15991848707199097, + "learning_rate": 5.3939438298808075e-05, + "loss": 0.0114, + "step": 30070 + }, + { + "epoch": 17.50873108265425, + "grad_norm": 0.2018059343099594, + "learning_rate": 5.3911965539665744e-05, + "loss": 0.0151, + "step": 30080 + }, + { + "epoch": 17.51455180442375, + "grad_norm": 0.09626469761133194, + "learning_rate": 5.388449159217156e-05, + "loss": 0.0112, + "step": 30090 + }, + { + "epoch": 17.52037252619325, + "grad_norm": 0.14000125229358673, + "learning_rate": 5.3857016464671385e-05, + "loss": 0.0108, + "step": 30100 + }, + { + "epoch": 17.52619324796275, + "grad_norm": 0.11721465736627579, + "learning_rate": 5.382954016551146e-05, + "loss": 0.0085, + "step": 30110 + }, + { + "epoch": 17.532013969732247, + "grad_norm": 0.20091485977172852, + "learning_rate": 5.380206270303835e-05, + "loss": 0.0117, + "step": 30120 + }, + { + "epoch": 17.537834691501747, + "grad_norm": 0.1458333283662796, + "learning_rate": 5.377458408559897e-05, + "loss": 0.0119, + "step": 30130 + }, + { + "epoch": 17.543655413271246, + "grad_norm": 0.13621221482753754, + "learning_rate": 5.374710432154061e-05, + "loss": 0.0103, + "step": 30140 + }, + { + "epoch": 17.549476135040745, + "grad_norm": 0.1264793574810028, + "learning_rate": 5.3719623419210886e-05, + "loss": 0.0098, + "step": 30150 + }, + { + "epoch": 17.555296856810244, + "grad_norm": 0.25793951749801636, + "learning_rate": 5.3692141386957786e-05, + "loss": 0.0095, + "step": 30160 + }, + { + "epoch": 17.561117578579744, + "grad_norm": 0.17849349975585938, + "learning_rate": 5.3664658233129616e-05, + "loss": 0.0132, + "step": 30170 + }, + { + "epoch": 17.566938300349243, + "grad_norm": 0.15862330794334412, + "learning_rate": 5.363717396607504e-05, + "loss": 0.0099, + "step": 30180 + }, + { + "epoch": 17.572759022118742, + "grad_norm": 0.2746502757072449, + "learning_rate": 5.360968859414305e-05, + "loss": 0.0111, + "step": 30190 + }, + { + "epoch": 17.57857974388824, + "grad_norm": 0.20163477957248688, + "learning_rate": 5.358220212568295e-05, + "loss": 0.01, + "step": 30200 + }, + { + "epoch": 17.58440046565774, + "grad_norm": 0.2993837594985962, + "learning_rate": 5.355471456904444e-05, + "loss": 0.011, + "step": 30210 + }, + { + "epoch": 17.59022118742724, + "grad_norm": 0.3107038736343384, + "learning_rate": 5.3527225932577495e-05, + "loss": 0.0196, + "step": 30220 + }, + { + "epoch": 17.59604190919674, + "grad_norm": 0.11407559365034103, + "learning_rate": 5.349973622463246e-05, + "loss": 0.015, + "step": 30230 + }, + { + "epoch": 17.601862630966238, + "grad_norm": 0.23253338038921356, + "learning_rate": 5.3472245453559956e-05, + "loss": 0.0093, + "step": 30240 + }, + { + "epoch": 17.60768335273574, + "grad_norm": 0.19857285916805267, + "learning_rate": 5.3444753627710955e-05, + "loss": 0.0135, + "step": 30250 + }, + { + "epoch": 17.61350407450524, + "grad_norm": 0.22209499776363373, + "learning_rate": 5.341726075543676e-05, + "loss": 0.0112, + "step": 30260 + }, + { + "epoch": 17.61932479627474, + "grad_norm": 0.18204902112483978, + "learning_rate": 5.338976684508898e-05, + "loss": 0.01, + "step": 30270 + }, + { + "epoch": 17.62514551804424, + "grad_norm": 0.13866829872131348, + "learning_rate": 5.336227190501953e-05, + "loss": 0.008, + "step": 30280 + }, + { + "epoch": 17.630966239813738, + "grad_norm": 0.1789502501487732, + "learning_rate": 5.3334775943580664e-05, + "loss": 0.0119, + "step": 30290 + }, + { + "epoch": 17.636786961583237, + "grad_norm": 0.1810736209154129, + "learning_rate": 5.330727896912491e-05, + "loss": 0.0093, + "step": 30300 + }, + { + "epoch": 17.642607683352736, + "grad_norm": 0.13658517599105835, + "learning_rate": 5.327978099000511e-05, + "loss": 0.011, + "step": 30310 + }, + { + "epoch": 17.648428405122235, + "grad_norm": 0.14384722709655762, + "learning_rate": 5.3252282014574465e-05, + "loss": 0.0112, + "step": 30320 + }, + { + "epoch": 17.654249126891735, + "grad_norm": 0.10103614628314972, + "learning_rate": 5.322478205118641e-05, + "loss": 0.0081, + "step": 30330 + }, + { + "epoch": 17.660069848661234, + "grad_norm": 0.18013525009155273, + "learning_rate": 5.3197281108194704e-05, + "loss": 0.0134, + "step": 30340 + }, + { + "epoch": 17.665890570430733, + "grad_norm": 0.2202008217573166, + "learning_rate": 5.316977919395342e-05, + "loss": 0.0089, + "step": 30350 + }, + { + "epoch": 17.671711292200232, + "grad_norm": 0.18227386474609375, + "learning_rate": 5.314227631681691e-05, + "loss": 0.0112, + "step": 30360 + }, + { + "epoch": 17.67753201396973, + "grad_norm": 0.16504494845867157, + "learning_rate": 5.311477248513982e-05, + "loss": 0.0105, + "step": 30370 + }, + { + "epoch": 17.68335273573923, + "grad_norm": 0.1698600798845291, + "learning_rate": 5.30872677072771e-05, + "loss": 0.014, + "step": 30380 + }, + { + "epoch": 17.68917345750873, + "grad_norm": 0.16857491433620453, + "learning_rate": 5.3059761991583954e-05, + "loss": 0.0132, + "step": 30390 + }, + { + "epoch": 17.69499417927823, + "grad_norm": 0.11375243961811066, + "learning_rate": 5.303225534641592e-05, + "loss": 0.0123, + "step": 30400 + }, + { + "epoch": 17.70081490104773, + "grad_norm": 0.09805640578269958, + "learning_rate": 5.300474778012875e-05, + "loss": 0.009, + "step": 30410 + }, + { + "epoch": 17.70663562281723, + "grad_norm": 0.11842584609985352, + "learning_rate": 5.297723930107855e-05, + "loss": 0.0091, + "step": 30420 + }, + { + "epoch": 17.71245634458673, + "grad_norm": 0.15558944642543793, + "learning_rate": 5.294972991762167e-05, + "loss": 0.0123, + "step": 30430 + }, + { + "epoch": 17.71827706635623, + "grad_norm": 0.1686694473028183, + "learning_rate": 5.292221963811472e-05, + "loss": 0.0104, + "step": 30440 + }, + { + "epoch": 17.72409778812573, + "grad_norm": 0.14436747133731842, + "learning_rate": 5.28947084709146e-05, + "loss": 0.0104, + "step": 30450 + }, + { + "epoch": 17.729918509895228, + "grad_norm": 0.18092583119869232, + "learning_rate": 5.2867196424378465e-05, + "loss": 0.0097, + "step": 30460 + }, + { + "epoch": 17.735739231664727, + "grad_norm": 0.2223665416240692, + "learning_rate": 5.2839683506863765e-05, + "loss": 0.0126, + "step": 30470 + }, + { + "epoch": 17.741559953434226, + "grad_norm": 0.16142228245735168, + "learning_rate": 5.281216972672821e-05, + "loss": 0.0107, + "step": 30480 + }, + { + "epoch": 17.747380675203726, + "grad_norm": 0.16786114871501923, + "learning_rate": 5.278465509232973e-05, + "loss": 0.0148, + "step": 30490 + }, + { + "epoch": 17.753201396973225, + "grad_norm": 0.17571189999580383, + "learning_rate": 5.275713961202655e-05, + "loss": 0.0106, + "step": 30500 + }, + { + "epoch": 17.759022118742724, + "grad_norm": 0.27766963839530945, + "learning_rate": 5.2729623294177165e-05, + "loss": 0.0094, + "step": 30510 + }, + { + "epoch": 17.764842840512223, + "grad_norm": 0.19222119450569153, + "learning_rate": 5.270210614714028e-05, + "loss": 0.0119, + "step": 30520 + }, + { + "epoch": 17.770663562281722, + "grad_norm": 0.2371245175600052, + "learning_rate": 5.267458817927491e-05, + "loss": 0.0142, + "step": 30530 + }, + { + "epoch": 17.77648428405122, + "grad_norm": 0.2286752164363861, + "learning_rate": 5.264706939894026e-05, + "loss": 0.0108, + "step": 30540 + }, + { + "epoch": 17.78230500582072, + "grad_norm": 0.09116510301828384, + "learning_rate": 5.261954981449584e-05, + "loss": 0.0112, + "step": 30550 + }, + { + "epoch": 17.78812572759022, + "grad_norm": 0.14310723543167114, + "learning_rate": 5.2592029434301324e-05, + "loss": 0.0106, + "step": 30560 + }, + { + "epoch": 17.79394644935972, + "grad_norm": 0.1962873488664627, + "learning_rate": 5.256450826671672e-05, + "loss": 0.0155, + "step": 30570 + }, + { + "epoch": 17.79976717112922, + "grad_norm": 0.14583076536655426, + "learning_rate": 5.253698632010221e-05, + "loss": 0.011, + "step": 30580 + }, + { + "epoch": 17.80558789289872, + "grad_norm": 0.13308732211589813, + "learning_rate": 5.2509463602818246e-05, + "loss": 0.012, + "step": 30590 + }, + { + "epoch": 17.81140861466822, + "grad_norm": 0.12134396284818649, + "learning_rate": 5.248194012322549e-05, + "loss": 0.0104, + "step": 30600 + }, + { + "epoch": 17.81722933643772, + "grad_norm": 0.14904314279556274, + "learning_rate": 5.245441588968486e-05, + "loss": 0.0116, + "step": 30610 + }, + { + "epoch": 17.82305005820722, + "grad_norm": 0.11280670762062073, + "learning_rate": 5.242689091055748e-05, + "loss": 0.015, + "step": 30620 + }, + { + "epoch": 17.828870779976718, + "grad_norm": 0.1667453646659851, + "learning_rate": 5.239936519420473e-05, + "loss": 0.0159, + "step": 30630 + }, + { + "epoch": 17.834691501746217, + "grad_norm": 0.19383616745471954, + "learning_rate": 5.2371838748988175e-05, + "loss": 0.0147, + "step": 30640 + }, + { + "epoch": 17.840512223515717, + "grad_norm": 0.24454571306705475, + "learning_rate": 5.234431158326965e-05, + "loss": 0.0149, + "step": 30650 + }, + { + "epoch": 17.846332945285216, + "grad_norm": 0.2642950415611267, + "learning_rate": 5.231678370541115e-05, + "loss": 0.013, + "step": 30660 + }, + { + "epoch": 17.852153667054715, + "grad_norm": 0.18516063690185547, + "learning_rate": 5.228925512377495e-05, + "loss": 0.0124, + "step": 30670 + }, + { + "epoch": 17.857974388824214, + "grad_norm": 0.16542479395866394, + "learning_rate": 5.2261725846723465e-05, + "loss": 0.0124, + "step": 30680 + }, + { + "epoch": 17.863795110593713, + "grad_norm": 0.17010398209095, + "learning_rate": 5.22341958826194e-05, + "loss": 0.0116, + "step": 30690 + }, + { + "epoch": 17.869615832363213, + "grad_norm": 0.09952867031097412, + "learning_rate": 5.22066652398256e-05, + "loss": 0.0104, + "step": 30700 + }, + { + "epoch": 17.875436554132712, + "grad_norm": 0.2229309380054474, + "learning_rate": 5.2179133926705185e-05, + "loss": 0.0108, + "step": 30710 + }, + { + "epoch": 17.88125727590221, + "grad_norm": 0.13461250066757202, + "learning_rate": 5.215160195162141e-05, + "loss": 0.0102, + "step": 30720 + }, + { + "epoch": 17.88707799767171, + "grad_norm": 0.10754374414682388, + "learning_rate": 5.212406932293776e-05, + "loss": 0.0091, + "step": 30730 + }, + { + "epoch": 17.89289871944121, + "grad_norm": 0.178869366645813, + "learning_rate": 5.209653604901795e-05, + "loss": 0.0106, + "step": 30740 + }, + { + "epoch": 17.89871944121071, + "grad_norm": 0.16962048411369324, + "learning_rate": 5.206900213822584e-05, + "loss": 0.0103, + "step": 30750 + }, + { + "epoch": 17.904540162980208, + "grad_norm": 0.14315982162952423, + "learning_rate": 5.204146759892551e-05, + "loss": 0.0121, + "step": 30760 + }, + { + "epoch": 17.91036088474971, + "grad_norm": 0.19668281078338623, + "learning_rate": 5.2013932439481216e-05, + "loss": 0.0105, + "step": 30770 + }, + { + "epoch": 17.91618160651921, + "grad_norm": 0.21263448894023895, + "learning_rate": 5.198639666825743e-05, + "loss": 0.0199, + "step": 30780 + }, + { + "epoch": 17.92200232828871, + "grad_norm": 0.178619846701622, + "learning_rate": 5.195886029361877e-05, + "loss": 0.0108, + "step": 30790 + }, + { + "epoch": 17.92782305005821, + "grad_norm": 0.17958448827266693, + "learning_rate": 5.193132332393009e-05, + "loss": 0.0157, + "step": 30800 + }, + { + "epoch": 17.933643771827708, + "grad_norm": 0.11867396533489227, + "learning_rate": 5.1903785767556376e-05, + "loss": 0.011, + "step": 30810 + }, + { + "epoch": 17.939464493597207, + "grad_norm": 0.11276105046272278, + "learning_rate": 5.187624763286282e-05, + "loss": 0.0089, + "step": 30820 + }, + { + "epoch": 17.945285215366706, + "grad_norm": 0.1852840930223465, + "learning_rate": 5.184870892821475e-05, + "loss": 0.016, + "step": 30830 + }, + { + "epoch": 17.951105937136205, + "grad_norm": 0.15836185216903687, + "learning_rate": 5.182116966197773e-05, + "loss": 0.0097, + "step": 30840 + }, + { + "epoch": 17.956926658905704, + "grad_norm": 0.1470034420490265, + "learning_rate": 5.1793629842517466e-05, + "loss": 0.0101, + "step": 30850 + }, + { + "epoch": 17.962747380675204, + "grad_norm": 0.19044145941734314, + "learning_rate": 5.17660894781998e-05, + "loss": 0.0103, + "step": 30860 + }, + { + "epoch": 17.968568102444703, + "grad_norm": 0.1875290870666504, + "learning_rate": 5.173854857739079e-05, + "loss": 0.012, + "step": 30870 + }, + { + "epoch": 17.974388824214202, + "grad_norm": 0.2069864124059677, + "learning_rate": 5.171100714845661e-05, + "loss": 0.0101, + "step": 30880 + }, + { + "epoch": 17.9802095459837, + "grad_norm": 0.15657231211662292, + "learning_rate": 5.1683465199763646e-05, + "loss": 0.0162, + "step": 30890 + }, + { + "epoch": 17.9860302677532, + "grad_norm": 0.21041306853294373, + "learning_rate": 5.16559227396784e-05, + "loss": 0.0083, + "step": 30900 + }, + { + "epoch": 17.9918509895227, + "grad_norm": 0.17179709672927856, + "learning_rate": 5.1628379776567556e-05, + "loss": 0.0117, + "step": 30910 + }, + { + "epoch": 17.9976717112922, + "grad_norm": 0.1261214315891266, + "learning_rate": 5.160083631879792e-05, + "loss": 0.0108, + "step": 30920 + }, + { + "epoch": 18.003492433061698, + "grad_norm": 0.15356571972370148, + "learning_rate": 5.1573292374736484e-05, + "loss": 0.0108, + "step": 30930 + }, + { + "epoch": 18.009313154831197, + "grad_norm": 0.15706394612789154, + "learning_rate": 5.1545747952750356e-05, + "loss": 0.0107, + "step": 30940 + }, + { + "epoch": 18.0151338766007, + "grad_norm": 0.1534712016582489, + "learning_rate": 5.151820306120682e-05, + "loss": 0.0111, + "step": 30950 + }, + { + "epoch": 18.0209545983702, + "grad_norm": 0.18270574510097504, + "learning_rate": 5.149065770847328e-05, + "loss": 0.0085, + "step": 30960 + }, + { + "epoch": 18.0267753201397, + "grad_norm": 0.2733309864997864, + "learning_rate": 5.1463111902917297e-05, + "loss": 0.0133, + "step": 30970 + }, + { + "epoch": 18.032596041909198, + "grad_norm": 0.22155369818210602, + "learning_rate": 5.143556565290654e-05, + "loss": 0.0147, + "step": 30980 + }, + { + "epoch": 18.038416763678697, + "grad_norm": 0.18632085621356964, + "learning_rate": 5.140801896680882e-05, + "loss": 0.0102, + "step": 30990 + }, + { + "epoch": 18.044237485448196, + "grad_norm": 0.19283942878246307, + "learning_rate": 5.1380471852992144e-05, + "loss": 0.0104, + "step": 31000 + }, + { + "epoch": 18.050058207217695, + "grad_norm": 0.12529854476451874, + "learning_rate": 5.135292431982457e-05, + "loss": 0.0111, + "step": 31010 + }, + { + "epoch": 18.055878928987195, + "grad_norm": 0.20542626082897186, + "learning_rate": 5.1325376375674294e-05, + "loss": 0.0098, + "step": 31020 + }, + { + "epoch": 18.061699650756694, + "grad_norm": 0.11533860117197037, + "learning_rate": 5.129782802890968e-05, + "loss": 0.0109, + "step": 31030 + }, + { + "epoch": 18.067520372526193, + "grad_norm": 0.10096288472414017, + "learning_rate": 5.127027928789916e-05, + "loss": 0.0107, + "step": 31040 + }, + { + "epoch": 18.073341094295692, + "grad_norm": 0.1343536674976349, + "learning_rate": 5.124273016101135e-05, + "loss": 0.021, + "step": 31050 + }, + { + "epoch": 18.07916181606519, + "grad_norm": 0.17156878113746643, + "learning_rate": 5.121518065661492e-05, + "loss": 0.0123, + "step": 31060 + }, + { + "epoch": 18.08498253783469, + "grad_norm": 0.1389233022928238, + "learning_rate": 5.11876307830787e-05, + "loss": 0.0121, + "step": 31070 + }, + { + "epoch": 18.09080325960419, + "grad_norm": 0.15159133076667786, + "learning_rate": 5.1160080548771596e-05, + "loss": 0.0151, + "step": 31080 + }, + { + "epoch": 18.09662398137369, + "grad_norm": 0.15885929763317108, + "learning_rate": 5.1132529962062656e-05, + "loss": 0.0112, + "step": 31090 + }, + { + "epoch": 18.10244470314319, + "grad_norm": 0.24702458083629608, + "learning_rate": 5.110497903132101e-05, + "loss": 0.0118, + "step": 31100 + }, + { + "epoch": 18.108265424912688, + "grad_norm": 0.1995919942855835, + "learning_rate": 5.107742776491592e-05, + "loss": 0.0101, + "step": 31110 + }, + { + "epoch": 18.11408614668219, + "grad_norm": 0.16643698513507843, + "learning_rate": 5.104987617121673e-05, + "loss": 0.0107, + "step": 31120 + }, + { + "epoch": 18.11990686845169, + "grad_norm": 0.13517138361930847, + "learning_rate": 5.102232425859287e-05, + "loss": 0.0104, + "step": 31130 + }, + { + "epoch": 18.12572759022119, + "grad_norm": 0.160825714468956, + "learning_rate": 5.09947720354139e-05, + "loss": 0.0094, + "step": 31140 + }, + { + "epoch": 18.131548311990688, + "grad_norm": 0.14869117736816406, + "learning_rate": 5.096721951004942e-05, + "loss": 0.0094, + "step": 31150 + }, + { + "epoch": 18.137369033760187, + "grad_norm": 0.1662643998861313, + "learning_rate": 5.0939666690869227e-05, + "loss": 0.01, + "step": 31160 + }, + { + "epoch": 18.143189755529686, + "grad_norm": 0.24289990961551666, + "learning_rate": 5.0912113586243096e-05, + "loss": 0.0115, + "step": 31170 + }, + { + "epoch": 18.149010477299186, + "grad_norm": 0.17828668653964996, + "learning_rate": 5.0884560204540935e-05, + "loss": 0.0116, + "step": 31180 + }, + { + "epoch": 18.154831199068685, + "grad_norm": 0.15414541959762573, + "learning_rate": 5.0857006554132736e-05, + "loss": 0.011, + "step": 31190 + }, + { + "epoch": 18.160651920838184, + "grad_norm": 0.1385876089334488, + "learning_rate": 5.0829452643388575e-05, + "loss": 0.0073, + "step": 31200 + }, + { + "epoch": 18.166472642607683, + "grad_norm": 0.1368221789598465, + "learning_rate": 5.08018984806786e-05, + "loss": 0.0078, + "step": 31210 + }, + { + "epoch": 18.172293364377182, + "grad_norm": 0.16156955063343048, + "learning_rate": 5.0774344074373036e-05, + "loss": 0.0128, + "step": 31220 + }, + { + "epoch": 18.17811408614668, + "grad_norm": 0.21248294413089752, + "learning_rate": 5.07467894328422e-05, + "loss": 0.0108, + "step": 31230 + }, + { + "epoch": 18.18393480791618, + "grad_norm": 0.30540212988853455, + "learning_rate": 5.0719234564456454e-05, + "loss": 0.0127, + "step": 31240 + }, + { + "epoch": 18.18975552968568, + "grad_norm": 0.18898603320121765, + "learning_rate": 5.0691679477586216e-05, + "loss": 0.0074, + "step": 31250 + }, + { + "epoch": 18.19557625145518, + "grad_norm": 0.13119399547576904, + "learning_rate": 5.0664124180602035e-05, + "loss": 0.0098, + "step": 31260 + }, + { + "epoch": 18.20139697322468, + "grad_norm": 0.142775297164917, + "learning_rate": 5.063656868187447e-05, + "loss": 0.0091, + "step": 31270 + }, + { + "epoch": 18.207217694994178, + "grad_norm": 0.14088164269924164, + "learning_rate": 5.060901298977413e-05, + "loss": 0.0098, + "step": 31280 + }, + { + "epoch": 18.213038416763677, + "grad_norm": 0.3239020109176636, + "learning_rate": 5.0581457112671725e-05, + "loss": 0.0183, + "step": 31290 + }, + { + "epoch": 18.21885913853318, + "grad_norm": 0.13011382520198822, + "learning_rate": 5.0553901058938016e-05, + "loss": 0.0113, + "step": 31300 + }, + { + "epoch": 18.22467986030268, + "grad_norm": 0.1851261705160141, + "learning_rate": 5.052634483694377e-05, + "loss": 0.0091, + "step": 31310 + }, + { + "epoch": 18.230500582072178, + "grad_norm": 0.2037660926580429, + "learning_rate": 5.049878845505988e-05, + "loss": 0.014, + "step": 31320 + }, + { + "epoch": 18.236321303841677, + "grad_norm": 0.11260638386011124, + "learning_rate": 5.047123192165721e-05, + "loss": 0.0103, + "step": 31330 + }, + { + "epoch": 18.242142025611177, + "grad_norm": 0.1937672644853592, + "learning_rate": 5.0443675245106735e-05, + "loss": 0.0108, + "step": 31340 + }, + { + "epoch": 18.247962747380676, + "grad_norm": 0.2859586179256439, + "learning_rate": 5.0416118433779426e-05, + "loss": 0.0088, + "step": 31350 + }, + { + "epoch": 18.253783469150175, + "grad_norm": 0.15000028908252716, + "learning_rate": 5.038856149604633e-05, + "loss": 0.0108, + "step": 31360 + }, + { + "epoch": 18.259604190919674, + "grad_norm": 0.1334361433982849, + "learning_rate": 5.03610044402785e-05, + "loss": 0.0111, + "step": 31370 + }, + { + "epoch": 18.265424912689173, + "grad_norm": 0.23092299699783325, + "learning_rate": 5.033344727484707e-05, + "loss": 0.0136, + "step": 31380 + }, + { + "epoch": 18.271245634458673, + "grad_norm": 0.13449738919734955, + "learning_rate": 5.030589000812315e-05, + "loss": 0.0099, + "step": 31390 + }, + { + "epoch": 18.277066356228172, + "grad_norm": 0.10345552116632462, + "learning_rate": 5.027833264847793e-05, + "loss": 0.0093, + "step": 31400 + }, + { + "epoch": 18.28288707799767, + "grad_norm": 0.18455766141414642, + "learning_rate": 5.025077520428258e-05, + "loss": 0.0142, + "step": 31410 + }, + { + "epoch": 18.28870779976717, + "grad_norm": 0.17971499264240265, + "learning_rate": 5.022321768390837e-05, + "loss": 0.008, + "step": 31420 + }, + { + "epoch": 18.29452852153667, + "grad_norm": 0.24390555918216705, + "learning_rate": 5.0195660095726516e-05, + "loss": 0.0128, + "step": 31430 + }, + { + "epoch": 18.30034924330617, + "grad_norm": 0.14955414831638336, + "learning_rate": 5.016810244810829e-05, + "loss": 0.0102, + "step": 31440 + }, + { + "epoch": 18.306169965075668, + "grad_norm": 0.2358100712299347, + "learning_rate": 5.0140544749424976e-05, + "loss": 0.0095, + "step": 31450 + }, + { + "epoch": 18.311990686845167, + "grad_norm": 0.17474299669265747, + "learning_rate": 5.0112987008047874e-05, + "loss": 0.0097, + "step": 31460 + }, + { + "epoch": 18.31781140861467, + "grad_norm": 0.14040246605873108, + "learning_rate": 5.008542923234831e-05, + "loss": 0.0109, + "step": 31470 + }, + { + "epoch": 18.32363213038417, + "grad_norm": 0.18174821138381958, + "learning_rate": 5.00578714306976e-05, + "loss": 0.0117, + "step": 31480 + }, + { + "epoch": 18.32945285215367, + "grad_norm": 0.1734914630651474, + "learning_rate": 5.0030313611467084e-05, + "loss": 0.0107, + "step": 31490 + }, + { + "epoch": 18.335273573923168, + "grad_norm": 0.28041937947273254, + "learning_rate": 5.0002755783028074e-05, + "loss": 0.0108, + "step": 31500 + }, + { + "epoch": 18.341094295692667, + "grad_norm": 0.26315435767173767, + "learning_rate": 4.997519795375194e-05, + "loss": 0.0116, + "step": 31510 + }, + { + "epoch": 18.346915017462166, + "grad_norm": 0.143113374710083, + "learning_rate": 4.9947640132010016e-05, + "loss": 0.0113, + "step": 31520 + }, + { + "epoch": 18.352735739231665, + "grad_norm": 0.18227288126945496, + "learning_rate": 4.9920082326173625e-05, + "loss": 0.0107, + "step": 31530 + }, + { + "epoch": 18.358556461001164, + "grad_norm": 0.35032784938812256, + "learning_rate": 4.9892524544614114e-05, + "loss": 0.0124, + "step": 31540 + }, + { + "epoch": 18.364377182770664, + "grad_norm": 0.16665521264076233, + "learning_rate": 4.986496679570283e-05, + "loss": 0.0102, + "step": 31550 + }, + { + "epoch": 18.370197904540163, + "grad_norm": 0.17587998509407043, + "learning_rate": 4.983740908781105e-05, + "loss": 0.0126, + "step": 31560 + }, + { + "epoch": 18.376018626309662, + "grad_norm": 0.12690630555152893, + "learning_rate": 4.9809851429310116e-05, + "loss": 0.0084, + "step": 31570 + }, + { + "epoch": 18.38183934807916, + "grad_norm": 0.1422034353017807, + "learning_rate": 4.9782293828571275e-05, + "loss": 0.0158, + "step": 31580 + }, + { + "epoch": 18.38766006984866, + "grad_norm": 0.24079613387584686, + "learning_rate": 4.9754736293965846e-05, + "loss": 0.0109, + "step": 31590 + }, + { + "epoch": 18.39348079161816, + "grad_norm": 0.15354539453983307, + "learning_rate": 4.972717883386502e-05, + "loss": 0.0149, + "step": 31600 + }, + { + "epoch": 18.39930151338766, + "grad_norm": 0.15572310984134674, + "learning_rate": 4.9699621456640075e-05, + "loss": 0.0119, + "step": 31610 + }, + { + "epoch": 18.405122235157158, + "grad_norm": 0.2051883488893509, + "learning_rate": 4.9672064170662214e-05, + "loss": 0.01, + "step": 31620 + }, + { + "epoch": 18.410942956926657, + "grad_norm": 0.18806588649749756, + "learning_rate": 4.9644506984302583e-05, + "loss": 0.0158, + "step": 31630 + }, + { + "epoch": 18.416763678696157, + "grad_norm": 0.21128451824188232, + "learning_rate": 4.9616949905932356e-05, + "loss": 0.0112, + "step": 31640 + }, + { + "epoch": 18.42258440046566, + "grad_norm": 0.18005387485027313, + "learning_rate": 4.9589392943922615e-05, + "loss": 0.0118, + "step": 31650 + }, + { + "epoch": 18.42840512223516, + "grad_norm": 0.20483216643333435, + "learning_rate": 4.956183610664447e-05, + "loss": 0.012, + "step": 31660 + }, + { + "epoch": 18.434225844004658, + "grad_norm": 0.18490596115589142, + "learning_rate": 4.9534279402468945e-05, + "loss": 0.0091, + "step": 31670 + }, + { + "epoch": 18.440046565774157, + "grad_norm": 0.20257823169231415, + "learning_rate": 4.9506722839767036e-05, + "loss": 0.0113, + "step": 31680 + }, + { + "epoch": 18.445867287543656, + "grad_norm": 0.2018694430589676, + "learning_rate": 4.947916642690972e-05, + "loss": 0.0199, + "step": 31690 + }, + { + "epoch": 18.451688009313155, + "grad_norm": 0.15336383879184723, + "learning_rate": 4.9451610172267874e-05, + "loss": 0.0079, + "step": 31700 + }, + { + "epoch": 18.457508731082655, + "grad_norm": 0.24206717312335968, + "learning_rate": 4.9424054084212376e-05, + "loss": 0.0097, + "step": 31710 + }, + { + "epoch": 18.463329452852154, + "grad_norm": 0.1271076500415802, + "learning_rate": 4.939649817111407e-05, + "loss": 0.0079, + "step": 31720 + }, + { + "epoch": 18.469150174621653, + "grad_norm": 0.12239319086074829, + "learning_rate": 4.936894244134365e-05, + "loss": 0.0106, + "step": 31730 + }, + { + "epoch": 18.474970896391152, + "grad_norm": 0.12864230573177338, + "learning_rate": 4.9341386903271886e-05, + "loss": 0.0093, + "step": 31740 + }, + { + "epoch": 18.48079161816065, + "grad_norm": 0.1814577877521515, + "learning_rate": 4.931383156526936e-05, + "loss": 0.0157, + "step": 31750 + }, + { + "epoch": 18.48661233993015, + "grad_norm": 0.17527472972869873, + "learning_rate": 4.92862764357067e-05, + "loss": 0.0118, + "step": 31760 + }, + { + "epoch": 18.49243306169965, + "grad_norm": 0.1160091757774353, + "learning_rate": 4.925872152295443e-05, + "loss": 0.0079, + "step": 31770 + }, + { + "epoch": 18.49825378346915, + "grad_norm": 0.1578139066696167, + "learning_rate": 4.923116683538296e-05, + "loss": 0.0113, + "step": 31780 + }, + { + "epoch": 18.50407450523865, + "grad_norm": 0.13626566529273987, + "learning_rate": 4.920361238136273e-05, + "loss": 0.0126, + "step": 31790 + }, + { + "epoch": 18.509895227008148, + "grad_norm": 0.16629891097545624, + "learning_rate": 4.9176058169264014e-05, + "loss": 0.0122, + "step": 31800 + }, + { + "epoch": 18.515715948777647, + "grad_norm": 0.10468962788581848, + "learning_rate": 4.9148504207457074e-05, + "loss": 0.0137, + "step": 31810 + }, + { + "epoch": 18.52153667054715, + "grad_norm": 0.1657097339630127, + "learning_rate": 4.912095050431208e-05, + "loss": 0.0135, + "step": 31820 + }, + { + "epoch": 18.52735739231665, + "grad_norm": 0.08657045662403107, + "learning_rate": 4.909339706819911e-05, + "loss": 0.0119, + "step": 31830 + }, + { + "epoch": 18.533178114086148, + "grad_norm": 0.15795797109603882, + "learning_rate": 4.906584390748819e-05, + "loss": 0.0106, + "step": 31840 + }, + { + "epoch": 18.538998835855647, + "grad_norm": 0.1869620829820633, + "learning_rate": 4.9038291030549195e-05, + "loss": 0.0238, + "step": 31850 + }, + { + "epoch": 18.544819557625146, + "grad_norm": 0.15684431791305542, + "learning_rate": 4.9010738445751995e-05, + "loss": 0.0108, + "step": 31860 + }, + { + "epoch": 18.550640279394646, + "grad_norm": 0.1269041746854782, + "learning_rate": 4.8983186161466364e-05, + "loss": 0.0072, + "step": 31870 + }, + { + "epoch": 18.556461001164145, + "grad_norm": 0.17687121033668518, + "learning_rate": 4.89556341860619e-05, + "loss": 0.0134, + "step": 31880 + }, + { + "epoch": 18.562281722933644, + "grad_norm": 0.1540309488773346, + "learning_rate": 4.892808252790822e-05, + "loss": 0.0112, + "step": 31890 + }, + { + "epoch": 18.568102444703143, + "grad_norm": 0.1909775584936142, + "learning_rate": 4.890053119537475e-05, + "loss": 0.0103, + "step": 31900 + }, + { + "epoch": 18.573923166472643, + "grad_norm": 0.14956122636795044, + "learning_rate": 4.887298019683087e-05, + "loss": 0.0127, + "step": 31910 + }, + { + "epoch": 18.57974388824214, + "grad_norm": 0.245534285902977, + "learning_rate": 4.884542954064587e-05, + "loss": 0.0125, + "step": 31920 + }, + { + "epoch": 18.58556461001164, + "grad_norm": 0.23347507417201996, + "learning_rate": 4.881787923518887e-05, + "loss": 0.0131, + "step": 31930 + }, + { + "epoch": 18.59138533178114, + "grad_norm": 0.17930033802986145, + "learning_rate": 4.879032928882896e-05, + "loss": 0.0088, + "step": 31940 + }, + { + "epoch": 18.59720605355064, + "grad_norm": 0.14763198792934418, + "learning_rate": 4.876277970993505e-05, + "loss": 0.0084, + "step": 31950 + }, + { + "epoch": 18.60302677532014, + "grad_norm": 0.1590244472026825, + "learning_rate": 4.873523050687602e-05, + "loss": 0.0136, + "step": 31960 + }, + { + "epoch": 18.608847497089638, + "grad_norm": 0.11607623845338821, + "learning_rate": 4.870768168802056e-05, + "loss": 0.0093, + "step": 31970 + }, + { + "epoch": 18.614668218859137, + "grad_norm": 0.17354841530323029, + "learning_rate": 4.868013326173728e-05, + "loss": 0.0106, + "step": 31980 + }, + { + "epoch": 18.620488940628636, + "grad_norm": 0.12925374507904053, + "learning_rate": 4.865258523639468e-05, + "loss": 0.0104, + "step": 31990 + }, + { + "epoch": 18.62630966239814, + "grad_norm": 0.15742024779319763, + "learning_rate": 4.862503762036109e-05, + "loss": 0.0092, + "step": 32000 + }, + { + "epoch": 18.63213038416764, + "grad_norm": 0.17201419174671173, + "learning_rate": 4.859749042200478e-05, + "loss": 0.0087, + "step": 32010 + }, + { + "epoch": 18.637951105937137, + "grad_norm": 0.1828613430261612, + "learning_rate": 4.856994364969384e-05, + "loss": 0.0097, + "step": 32020 + }, + { + "epoch": 18.643771827706637, + "grad_norm": 0.11775583028793335, + "learning_rate": 4.854239731179625e-05, + "loss": 0.0073, + "step": 32030 + }, + { + "epoch": 18.649592549476136, + "grad_norm": 0.14123596251010895, + "learning_rate": 4.85148514166799e-05, + "loss": 0.0093, + "step": 32040 + }, + { + "epoch": 18.655413271245635, + "grad_norm": 0.1515197902917862, + "learning_rate": 4.8487305972712456e-05, + "loss": 0.0103, + "step": 32050 + }, + { + "epoch": 18.661233993015134, + "grad_norm": 0.15569783747196198, + "learning_rate": 4.8459760988261526e-05, + "loss": 0.0074, + "step": 32060 + }, + { + "epoch": 18.667054714784634, + "grad_norm": 0.21004754304885864, + "learning_rate": 4.843221647169453e-05, + "loss": 0.0114, + "step": 32070 + }, + { + "epoch": 18.672875436554133, + "grad_norm": 0.22802932560443878, + "learning_rate": 4.840467243137878e-05, + "loss": 0.0123, + "step": 32080 + }, + { + "epoch": 18.678696158323632, + "grad_norm": 0.17620550096035004, + "learning_rate": 4.837712887568143e-05, + "loss": 0.0121, + "step": 32090 + }, + { + "epoch": 18.68451688009313, + "grad_norm": 0.13585105538368225, + "learning_rate": 4.8349585812969464e-05, + "loss": 0.0111, + "step": 32100 + }, + { + "epoch": 18.69033760186263, + "grad_norm": 0.16629639267921448, + "learning_rate": 4.8322043251609775e-05, + "loss": 0.0103, + "step": 32110 + }, + { + "epoch": 18.69615832363213, + "grad_norm": 0.1634054034948349, + "learning_rate": 4.8294501199969015e-05, + "loss": 0.0086, + "step": 32120 + }, + { + "epoch": 18.70197904540163, + "grad_norm": 0.15307752788066864, + "learning_rate": 4.826695966641376e-05, + "loss": 0.0085, + "step": 32130 + }, + { + "epoch": 18.707799767171128, + "grad_norm": 0.20127005875110626, + "learning_rate": 4.823941865931043e-05, + "loss": 0.0125, + "step": 32140 + }, + { + "epoch": 18.713620488940627, + "grad_norm": 0.17928525805473328, + "learning_rate": 4.82118781870252e-05, + "loss": 0.0137, + "step": 32150 + }, + { + "epoch": 18.719441210710126, + "grad_norm": 0.21272848546504974, + "learning_rate": 4.8184338257924185e-05, + "loss": 0.0118, + "step": 32160 + }, + { + "epoch": 18.725261932479626, + "grad_norm": 0.17793558537960052, + "learning_rate": 4.815679888037324e-05, + "loss": 0.0116, + "step": 32170 + }, + { + "epoch": 18.73108265424913, + "grad_norm": 0.23091010749340057, + "learning_rate": 4.8129260062738135e-05, + "loss": 0.0084, + "step": 32180 + }, + { + "epoch": 18.736903376018628, + "grad_norm": 0.25744348764419556, + "learning_rate": 4.810172181338445e-05, + "loss": 0.0135, + "step": 32190 + }, + { + "epoch": 18.742724097788127, + "grad_norm": 0.14830511808395386, + "learning_rate": 4.807418414067753e-05, + "loss": 0.0089, + "step": 32200 + }, + { + "epoch": 18.748544819557626, + "grad_norm": 0.10791568458080292, + "learning_rate": 4.804664705298264e-05, + "loss": 0.0119, + "step": 32210 + }, + { + "epoch": 18.754365541327125, + "grad_norm": 0.1637260615825653, + "learning_rate": 4.80191105586648e-05, + "loss": 0.0095, + "step": 32220 + }, + { + "epoch": 18.760186263096625, + "grad_norm": 0.1984974443912506, + "learning_rate": 4.799157466608886e-05, + "loss": 0.0095, + "step": 32230 + }, + { + "epoch": 18.766006984866124, + "grad_norm": 0.1106342077255249, + "learning_rate": 4.796403938361951e-05, + "loss": 0.0121, + "step": 32240 + }, + { + "epoch": 18.771827706635623, + "grad_norm": 0.17441992461681366, + "learning_rate": 4.793650471962123e-05, + "loss": 0.0091, + "step": 32250 + }, + { + "epoch": 18.777648428405122, + "grad_norm": 0.14178943634033203, + "learning_rate": 4.790897068245835e-05, + "loss": 0.0089, + "step": 32260 + }, + { + "epoch": 18.78346915017462, + "grad_norm": 0.1634606122970581, + "learning_rate": 4.7881437280494954e-05, + "loss": 0.0123, + "step": 32270 + }, + { + "epoch": 18.78928987194412, + "grad_norm": 0.16303586959838867, + "learning_rate": 4.7853904522094965e-05, + "loss": 0.0112, + "step": 32280 + }, + { + "epoch": 18.79511059371362, + "grad_norm": 0.12587758898735046, + "learning_rate": 4.782637241562215e-05, + "loss": 0.0112, + "step": 32290 + }, + { + "epoch": 18.80093131548312, + "grad_norm": 0.1550004780292511, + "learning_rate": 4.779884096943997e-05, + "loss": 0.0091, + "step": 32300 + }, + { + "epoch": 18.80675203725262, + "grad_norm": 0.13920457661151886, + "learning_rate": 4.777131019191182e-05, + "loss": 0.0158, + "step": 32310 + }, + { + "epoch": 18.812572759022117, + "grad_norm": 0.22345751523971558, + "learning_rate": 4.774378009140076e-05, + "loss": 0.0118, + "step": 32320 + }, + { + "epoch": 18.818393480791617, + "grad_norm": 0.1427847445011139, + "learning_rate": 4.7716250676269735e-05, + "loss": 0.0097, + "step": 32330 + }, + { + "epoch": 18.824214202561116, + "grad_norm": 0.1793399155139923, + "learning_rate": 4.7688721954881485e-05, + "loss": 0.0091, + "step": 32340 + }, + { + "epoch": 18.83003492433062, + "grad_norm": 0.17001166939735413, + "learning_rate": 4.7661193935598446e-05, + "loss": 0.0145, + "step": 32350 + }, + { + "epoch": 18.835855646100118, + "grad_norm": 0.17452724277973175, + "learning_rate": 4.763366662678296e-05, + "loss": 0.0099, + "step": 32360 + }, + { + "epoch": 18.841676367869617, + "grad_norm": 0.2479344606399536, + "learning_rate": 4.7606140036797064e-05, + "loss": 0.0132, + "step": 32370 + }, + { + "epoch": 18.847497089639116, + "grad_norm": 0.17758987843990326, + "learning_rate": 4.7578614174002614e-05, + "loss": 0.0085, + "step": 32380 + }, + { + "epoch": 18.853317811408616, + "grad_norm": 0.17170663177967072, + "learning_rate": 4.755108904676125e-05, + "loss": 0.009, + "step": 32390 + }, + { + "epoch": 18.859138533178115, + "grad_norm": 0.13546504080295563, + "learning_rate": 4.752356466343436e-05, + "loss": 0.0164, + "step": 32400 + }, + { + "epoch": 18.864959254947614, + "grad_norm": 0.1304585486650467, + "learning_rate": 4.7496041032383174e-05, + "loss": 0.0086, + "step": 32410 + }, + { + "epoch": 18.870779976717113, + "grad_norm": 0.13031086325645447, + "learning_rate": 4.746851816196858e-05, + "loss": 0.0092, + "step": 32420 + }, + { + "epoch": 18.876600698486612, + "grad_norm": 0.1651422679424286, + "learning_rate": 4.744099606055135e-05, + "loss": 0.0091, + "step": 32430 + }, + { + "epoch": 18.88242142025611, + "grad_norm": 0.18478773534297943, + "learning_rate": 4.741347473649193e-05, + "loss": 0.0127, + "step": 32440 + }, + { + "epoch": 18.88824214202561, + "grad_norm": 0.17248055338859558, + "learning_rate": 4.738595419815058e-05, + "loss": 0.0078, + "step": 32450 + }, + { + "epoch": 18.89406286379511, + "grad_norm": 0.18754589557647705, + "learning_rate": 4.7358434453887365e-05, + "loss": 0.0152, + "step": 32460 + }, + { + "epoch": 18.89988358556461, + "grad_norm": 0.2441083937883377, + "learning_rate": 4.7330915512061976e-05, + "loss": 0.0102, + "step": 32470 + }, + { + "epoch": 18.90570430733411, + "grad_norm": 0.10172121226787567, + "learning_rate": 4.730339738103402e-05, + "loss": 0.012, + "step": 32480 + }, + { + "epoch": 18.911525029103608, + "grad_norm": 0.17433521151542664, + "learning_rate": 4.727588006916271e-05, + "loss": 0.0131, + "step": 32490 + }, + { + "epoch": 18.917345750873107, + "grad_norm": 0.17995241284370422, + "learning_rate": 4.724836358480711e-05, + "loss": 0.0094, + "step": 32500 + }, + { + "epoch": 18.923166472642606, + "grad_norm": 0.25944197177886963, + "learning_rate": 4.722084793632601e-05, + "loss": 0.0112, + "step": 32510 + }, + { + "epoch": 18.92898719441211, + "grad_norm": 0.14954419434070587, + "learning_rate": 4.719333313207792e-05, + "loss": 0.0121, + "step": 32520 + }, + { + "epoch": 18.934807916181608, + "grad_norm": 0.17942701280117035, + "learning_rate": 4.716581918042114e-05, + "loss": 0.012, + "step": 32530 + }, + { + "epoch": 18.940628637951107, + "grad_norm": 0.11990433186292648, + "learning_rate": 4.7138306089713636e-05, + "loss": 0.0089, + "step": 32540 + }, + { + "epoch": 18.946449359720607, + "grad_norm": 0.20932015776634216, + "learning_rate": 4.7110793868313183e-05, + "loss": 0.0075, + "step": 32550 + }, + { + "epoch": 18.952270081490106, + "grad_norm": 0.15747448801994324, + "learning_rate": 4.708328252457729e-05, + "loss": 0.0119, + "step": 32560 + }, + { + "epoch": 18.958090803259605, + "grad_norm": 0.1539856493473053, + "learning_rate": 4.7055772066863135e-05, + "loss": 0.0177, + "step": 32570 + }, + { + "epoch": 18.963911525029104, + "grad_norm": 0.12104585766792297, + "learning_rate": 4.702826250352771e-05, + "loss": 0.008, + "step": 32580 + }, + { + "epoch": 18.969732246798603, + "grad_norm": 0.13181650638580322, + "learning_rate": 4.7000753842927653e-05, + "loss": 0.011, + "step": 32590 + }, + { + "epoch": 18.975552968568103, + "grad_norm": 0.13828185200691223, + "learning_rate": 4.6973246093419384e-05, + "loss": 0.0115, + "step": 32600 + }, + { + "epoch": 18.981373690337602, + "grad_norm": 0.13224823772907257, + "learning_rate": 4.694573926335906e-05, + "loss": 0.0128, + "step": 32610 + }, + { + "epoch": 18.9871944121071, + "grad_norm": 0.16549836099147797, + "learning_rate": 4.6918233361102476e-05, + "loss": 0.0101, + "step": 32620 + }, + { + "epoch": 18.9930151338766, + "grad_norm": 0.1764732152223587, + "learning_rate": 4.689072839500525e-05, + "loss": 0.0115, + "step": 32630 + }, + { + "epoch": 18.9988358556461, + "grad_norm": 0.19228427112102509, + "learning_rate": 4.6863224373422635e-05, + "loss": 0.0074, + "step": 32640 + }, + { + "epoch": 19.0046565774156, + "grad_norm": 0.10699804872274399, + "learning_rate": 4.683572130470962e-05, + "loss": 0.0089, + "step": 32650 + }, + { + "epoch": 19.010477299185098, + "grad_norm": 0.2446102797985077, + "learning_rate": 4.680821919722094e-05, + "loss": 0.015, + "step": 32660 + }, + { + "epoch": 19.016298020954597, + "grad_norm": 0.10852531343698502, + "learning_rate": 4.6780718059310975e-05, + "loss": 0.0123, + "step": 32670 + }, + { + "epoch": 19.022118742724096, + "grad_norm": 0.18426613509655, + "learning_rate": 4.675321789933389e-05, + "loss": 0.007, + "step": 32680 + }, + { + "epoch": 19.027939464493596, + "grad_norm": 0.15789814293384552, + "learning_rate": 4.6725718725643464e-05, + "loss": 0.0073, + "step": 32690 + }, + { + "epoch": 19.0337601862631, + "grad_norm": 0.2587735950946808, + "learning_rate": 4.669822054659323e-05, + "loss": 0.0139, + "step": 32700 + }, + { + "epoch": 19.039580908032598, + "grad_norm": 0.16966092586517334, + "learning_rate": 4.667072337053644e-05, + "loss": 0.0116, + "step": 32710 + }, + { + "epoch": 19.045401629802097, + "grad_norm": 0.156858429312706, + "learning_rate": 4.6643227205825965e-05, + "loss": 0.0124, + "step": 32720 + }, + { + "epoch": 19.051222351571596, + "grad_norm": 0.2526727616786957, + "learning_rate": 4.6615732060814454e-05, + "loss": 0.0102, + "step": 32730 + }, + { + "epoch": 19.057043073341095, + "grad_norm": 0.14414964616298676, + "learning_rate": 4.658823794385417e-05, + "loss": 0.0091, + "step": 32740 + }, + { + "epoch": 19.062863795110594, + "grad_norm": 0.20707358419895172, + "learning_rate": 4.6560744863297115e-05, + "loss": 0.0118, + "step": 32750 + }, + { + "epoch": 19.068684516880094, + "grad_norm": 0.14443376660346985, + "learning_rate": 4.653325282749498e-05, + "loss": 0.0108, + "step": 32760 + }, + { + "epoch": 19.074505238649593, + "grad_norm": 0.22100596129894257, + "learning_rate": 4.6505761844799075e-05, + "loss": 0.0097, + "step": 32770 + }, + { + "epoch": 19.080325960419092, + "grad_norm": 0.2297191619873047, + "learning_rate": 4.647827192356048e-05, + "loss": 0.0109, + "step": 32780 + }, + { + "epoch": 19.08614668218859, + "grad_norm": 0.14509700238704681, + "learning_rate": 4.645078307212989e-05, + "loss": 0.012, + "step": 32790 + }, + { + "epoch": 19.09196740395809, + "grad_norm": 0.16516786813735962, + "learning_rate": 4.642329529885768e-05, + "loss": 0.0086, + "step": 32800 + }, + { + "epoch": 19.09778812572759, + "grad_norm": 0.2575877904891968, + "learning_rate": 4.639580861209393e-05, + "loss": 0.0122, + "step": 32810 + }, + { + "epoch": 19.10360884749709, + "grad_norm": 0.1266922801733017, + "learning_rate": 4.636832302018835e-05, + "loss": 0.0142, + "step": 32820 + }, + { + "epoch": 19.109429569266588, + "grad_norm": 0.0853516012430191, + "learning_rate": 4.6340838531490365e-05, + "loss": 0.0079, + "step": 32830 + }, + { + "epoch": 19.115250291036087, + "grad_norm": 0.21068698167800903, + "learning_rate": 4.6313355154349e-05, + "loss": 0.0098, + "step": 32840 + }, + { + "epoch": 19.121071012805587, + "grad_norm": 0.13992612063884735, + "learning_rate": 4.6285872897113025e-05, + "loss": 0.0086, + "step": 32850 + }, + { + "epoch": 19.126891734575086, + "grad_norm": 0.08942194283008575, + "learning_rate": 4.625839176813077e-05, + "loss": 0.0079, + "step": 32860 + }, + { + "epoch": 19.132712456344585, + "grad_norm": 0.18390321731567383, + "learning_rate": 4.623091177575031e-05, + "loss": 0.0091, + "step": 32870 + }, + { + "epoch": 19.138533178114088, + "grad_norm": 0.1550687700510025, + "learning_rate": 4.620343292831936e-05, + "loss": 0.011, + "step": 32880 + }, + { + "epoch": 19.144353899883587, + "grad_norm": 0.1611989289522171, + "learning_rate": 4.6175955234185206e-05, + "loss": 0.0117, + "step": 32890 + }, + { + "epoch": 19.150174621653086, + "grad_norm": 0.22197428345680237, + "learning_rate": 4.614847870169492e-05, + "loss": 0.0081, + "step": 32900 + }, + { + "epoch": 19.155995343422585, + "grad_norm": 0.17058037221431732, + "learning_rate": 4.612100333919509e-05, + "loss": 0.0089, + "step": 32910 + }, + { + "epoch": 19.161816065192085, + "grad_norm": 0.25735047459602356, + "learning_rate": 4.609352915503202e-05, + "loss": 0.0121, + "step": 32920 + }, + { + "epoch": 19.167636786961584, + "grad_norm": 0.2227022796869278, + "learning_rate": 4.606605615755166e-05, + "loss": 0.0088, + "step": 32930 + }, + { + "epoch": 19.173457508731083, + "grad_norm": 0.16778413951396942, + "learning_rate": 4.6038584355099576e-05, + "loss": 0.0121, + "step": 32940 + }, + { + "epoch": 19.179278230500582, + "grad_norm": 0.177757129073143, + "learning_rate": 4.6011113756020964e-05, + "loss": 0.011, + "step": 32950 + }, + { + "epoch": 19.18509895227008, + "grad_norm": 0.18435367941856384, + "learning_rate": 4.598364436866066e-05, + "loss": 0.0103, + "step": 32960 + }, + { + "epoch": 19.19091967403958, + "grad_norm": 0.25336459279060364, + "learning_rate": 4.595617620136316e-05, + "loss": 0.013, + "step": 32970 + }, + { + "epoch": 19.19674039580908, + "grad_norm": 0.18264247477054596, + "learning_rate": 4.592870926247257e-05, + "loss": 0.0117, + "step": 32980 + }, + { + "epoch": 19.20256111757858, + "grad_norm": 0.17890670895576477, + "learning_rate": 4.5901243560332594e-05, + "loss": 0.01, + "step": 32990 + }, + { + "epoch": 19.20838183934808, + "grad_norm": 0.17037832736968994, + "learning_rate": 4.587377910328662e-05, + "loss": 0.0091, + "step": 33000 + }, + { + "epoch": 19.214202561117578, + "grad_norm": 0.16089187562465668, + "learning_rate": 4.5846315899677586e-05, + "loss": 0.008, + "step": 33010 + }, + { + "epoch": 19.220023282887077, + "grad_norm": 0.13253901898860931, + "learning_rate": 4.5818853957848114e-05, + "loss": 0.011, + "step": 33020 + }, + { + "epoch": 19.225844004656576, + "grad_norm": 0.13131964206695557, + "learning_rate": 4.579139328614043e-05, + "loss": 0.0089, + "step": 33030 + }, + { + "epoch": 19.231664726426075, + "grad_norm": 0.17554408311843872, + "learning_rate": 4.576393389289633e-05, + "loss": 0.0103, + "step": 33040 + }, + { + "epoch": 19.237485448195578, + "grad_norm": 0.12480992078781128, + "learning_rate": 4.573647578645728e-05, + "loss": 0.0114, + "step": 33050 + }, + { + "epoch": 19.243306169965077, + "grad_norm": 0.136123389005661, + "learning_rate": 4.57090189751643e-05, + "loss": 0.0063, + "step": 33060 + }, + { + "epoch": 19.249126891734576, + "grad_norm": 0.17225012183189392, + "learning_rate": 4.568156346735806e-05, + "loss": 0.0103, + "step": 33070 + }, + { + "epoch": 19.254947613504076, + "grad_norm": 0.12741084396839142, + "learning_rate": 4.565410927137882e-05, + "loss": 0.0077, + "step": 33080 + }, + { + "epoch": 19.260768335273575, + "grad_norm": 0.16318421065807343, + "learning_rate": 4.562665639556644e-05, + "loss": 0.0129, + "step": 33090 + }, + { + "epoch": 19.266589057043074, + "grad_norm": 0.1702069193124771, + "learning_rate": 4.559920484826037e-05, + "loss": 0.0088, + "step": 33100 + }, + { + "epoch": 19.272409778812573, + "grad_norm": 0.16447852551937103, + "learning_rate": 4.5571754637799665e-05, + "loss": 0.011, + "step": 33110 + }, + { + "epoch": 19.278230500582072, + "grad_norm": 0.17250725626945496, + "learning_rate": 4.554430577252298e-05, + "loss": 0.0086, + "step": 33120 + }, + { + "epoch": 19.28405122235157, + "grad_norm": 0.1501537412405014, + "learning_rate": 4.551685826076858e-05, + "loss": 0.0098, + "step": 33130 + }, + { + "epoch": 19.28987194412107, + "grad_norm": 0.14657579362392426, + "learning_rate": 4.5489412110874246e-05, + "loss": 0.0088, + "step": 33140 + }, + { + "epoch": 19.29569266589057, + "grad_norm": 0.2161208540201187, + "learning_rate": 4.5461967331177444e-05, + "loss": 0.0132, + "step": 33150 + }, + { + "epoch": 19.30151338766007, + "grad_norm": 0.22392547130584717, + "learning_rate": 4.5434523930015115e-05, + "loss": 0.0112, + "step": 33160 + }, + { + "epoch": 19.30733410942957, + "grad_norm": 0.15022152662277222, + "learning_rate": 4.540708191572388e-05, + "loss": 0.0104, + "step": 33170 + }, + { + "epoch": 19.313154831199068, + "grad_norm": 0.1724541038274765, + "learning_rate": 4.537964129663991e-05, + "loss": 0.0074, + "step": 33180 + }, + { + "epoch": 19.318975552968567, + "grad_norm": 0.14798492193222046, + "learning_rate": 4.535220208109889e-05, + "loss": 0.011, + "step": 33190 + }, + { + "epoch": 19.324796274738066, + "grad_norm": 0.24224385619163513, + "learning_rate": 4.5324764277436194e-05, + "loss": 0.0082, + "step": 33200 + }, + { + "epoch": 19.330616996507565, + "grad_norm": 0.1699395626783371, + "learning_rate": 4.529732789398664e-05, + "loss": 0.0096, + "step": 33210 + }, + { + "epoch": 19.336437718277068, + "grad_norm": 0.09919935464859009, + "learning_rate": 4.526989293908472e-05, + "loss": 0.0128, + "step": 33220 + }, + { + "epoch": 19.342258440046567, + "grad_norm": 0.17172348499298096, + "learning_rate": 4.524245942106442e-05, + "loss": 0.0092, + "step": 33230 + }, + { + "epoch": 19.348079161816067, + "grad_norm": 0.17725171148777008, + "learning_rate": 4.5215027348259345e-05, + "loss": 0.0123, + "step": 33240 + }, + { + "epoch": 19.353899883585566, + "grad_norm": 0.1760435551404953, + "learning_rate": 4.5187596729002616e-05, + "loss": 0.0103, + "step": 33250 + }, + { + "epoch": 19.359720605355065, + "grad_norm": 0.18450425565242767, + "learning_rate": 4.516016757162693e-05, + "loss": 0.0094, + "step": 33260 + }, + { + "epoch": 19.365541327124564, + "grad_norm": 0.13979190587997437, + "learning_rate": 4.513273988446457e-05, + "loss": 0.0087, + "step": 33270 + }, + { + "epoch": 19.371362048894063, + "grad_norm": 0.14714333415031433, + "learning_rate": 4.5105313675847296e-05, + "loss": 0.0092, + "step": 33280 + }, + { + "epoch": 19.377182770663563, + "grad_norm": 0.17380359768867493, + "learning_rate": 4.5077888954106495e-05, + "loss": 0.0092, + "step": 33290 + }, + { + "epoch": 19.383003492433062, + "grad_norm": 0.19231246411800385, + "learning_rate": 4.505046572757309e-05, + "loss": 0.0107, + "step": 33300 + }, + { + "epoch": 19.38882421420256, + "grad_norm": 0.13914115726947784, + "learning_rate": 4.502304400457749e-05, + "loss": 0.0114, + "step": 33310 + }, + { + "epoch": 19.39464493597206, + "grad_norm": 0.2418595850467682, + "learning_rate": 4.499562379344973e-05, + "loss": 0.0085, + "step": 33320 + }, + { + "epoch": 19.40046565774156, + "grad_norm": 0.17797990143299103, + "learning_rate": 4.4968205102519306e-05, + "loss": 0.0107, + "step": 33330 + }, + { + "epoch": 19.40628637951106, + "grad_norm": 0.191444993019104, + "learning_rate": 4.494078794011532e-05, + "loss": 0.0098, + "step": 33340 + }, + { + "epoch": 19.412107101280558, + "grad_norm": 0.1954161524772644, + "learning_rate": 4.491337231456639e-05, + "loss": 0.0108, + "step": 33350 + }, + { + "epoch": 19.417927823050057, + "grad_norm": 0.11470448225736618, + "learning_rate": 4.4885958234200634e-05, + "loss": 0.0085, + "step": 33360 + }, + { + "epoch": 19.423748544819556, + "grad_norm": 0.1552472859621048, + "learning_rate": 4.485854570734575e-05, + "loss": 0.0092, + "step": 33370 + }, + { + "epoch": 19.429569266589056, + "grad_norm": 0.21879777312278748, + "learning_rate": 4.483113474232891e-05, + "loss": 0.0116, + "step": 33380 + }, + { + "epoch": 19.435389988358555, + "grad_norm": 0.08754558861255646, + "learning_rate": 4.480372534747688e-05, + "loss": 0.0103, + "step": 33390 + }, + { + "epoch": 19.441210710128058, + "grad_norm": 0.21033801138401031, + "learning_rate": 4.477631753111588e-05, + "loss": 0.0152, + "step": 33400 + }, + { + "epoch": 19.447031431897557, + "grad_norm": 0.1562817096710205, + "learning_rate": 4.4748911301571686e-05, + "loss": 0.0095, + "step": 33410 + }, + { + "epoch": 19.452852153667056, + "grad_norm": 0.14322955906391144, + "learning_rate": 4.472150666716961e-05, + "loss": 0.0083, + "step": 33420 + }, + { + "epoch": 19.458672875436555, + "grad_norm": 0.1891438364982605, + "learning_rate": 4.469410363623442e-05, + "loss": 0.0079, + "step": 33430 + }, + { + "epoch": 19.464493597206054, + "grad_norm": 0.15393032133579254, + "learning_rate": 4.466670221709044e-05, + "loss": 0.0171, + "step": 33440 + }, + { + "epoch": 19.470314318975554, + "grad_norm": 0.15306755900382996, + "learning_rate": 4.463930241806154e-05, + "loss": 0.0114, + "step": 33450 + }, + { + "epoch": 19.476135040745053, + "grad_norm": 0.17234595119953156, + "learning_rate": 4.4611904247471006e-05, + "loss": 0.0078, + "step": 33460 + }, + { + "epoch": 19.481955762514552, + "grad_norm": 0.20781399309635162, + "learning_rate": 4.458450771364171e-05, + "loss": 0.0098, + "step": 33470 + }, + { + "epoch": 19.48777648428405, + "grad_norm": 0.1521453857421875, + "learning_rate": 4.4557112824895965e-05, + "loss": 0.01, + "step": 33480 + }, + { + "epoch": 19.49359720605355, + "grad_norm": 0.2025199681520462, + "learning_rate": 4.452971958955563e-05, + "loss": 0.0092, + "step": 33490 + }, + { + "epoch": 19.49941792782305, + "grad_norm": 0.11683536320924759, + "learning_rate": 4.450232801594208e-05, + "loss": 0.0094, + "step": 33500 + }, + { + "epoch": 19.50523864959255, + "grad_norm": 0.15777719020843506, + "learning_rate": 4.447493811237609e-05, + "loss": 0.0096, + "step": 33510 + }, + { + "epoch": 19.511059371362048, + "grad_norm": 0.165725976228714, + "learning_rate": 4.444754988717804e-05, + "loss": 0.0116, + "step": 33520 + }, + { + "epoch": 19.516880093131547, + "grad_norm": 0.15615761280059814, + "learning_rate": 4.442016334866771e-05, + "loss": 0.0147, + "step": 33530 + }, + { + "epoch": 19.522700814901047, + "grad_norm": 0.13292083144187927, + "learning_rate": 4.4392778505164445e-05, + "loss": 0.0076, + "step": 33540 + }, + { + "epoch": 19.528521536670546, + "grad_norm": 0.14525076746940613, + "learning_rate": 4.436539536498702e-05, + "loss": 0.0072, + "step": 33550 + }, + { + "epoch": 19.534342258440045, + "grad_norm": 0.14816559851169586, + "learning_rate": 4.433801393645369e-05, + "loss": 0.0108, + "step": 33560 + }, + { + "epoch": 19.540162980209544, + "grad_norm": 0.26520267128944397, + "learning_rate": 4.431063422788226e-05, + "loss": 0.013, + "step": 33570 + }, + { + "epoch": 19.545983701979047, + "grad_norm": 0.2420707494020462, + "learning_rate": 4.428325624758991e-05, + "loss": 0.0094, + "step": 33580 + }, + { + "epoch": 19.551804423748546, + "grad_norm": 0.2669438123703003, + "learning_rate": 4.4255880003893366e-05, + "loss": 0.0114, + "step": 33590 + }, + { + "epoch": 19.557625145518045, + "grad_norm": 0.20475345849990845, + "learning_rate": 4.422850550510884e-05, + "loss": 0.0086, + "step": 33600 + }, + { + "epoch": 19.563445867287545, + "grad_norm": 0.2124543935060501, + "learning_rate": 4.4201132759551934e-05, + "loss": 0.0129, + "step": 33610 + }, + { + "epoch": 19.569266589057044, + "grad_norm": 0.2262299507856369, + "learning_rate": 4.4173761775537804e-05, + "loss": 0.0124, + "step": 33620 + }, + { + "epoch": 19.575087310826543, + "grad_norm": 0.2671263813972473, + "learning_rate": 4.414639256138099e-05, + "loss": 0.0118, + "step": 33630 + }, + { + "epoch": 19.580908032596042, + "grad_norm": 0.2364172339439392, + "learning_rate": 4.411902512539557e-05, + "loss": 0.0108, + "step": 33640 + }, + { + "epoch": 19.58672875436554, + "grad_norm": 0.17912426590919495, + "learning_rate": 4.4091659475895044e-05, + "loss": 0.0078, + "step": 33650 + }, + { + "epoch": 19.59254947613504, + "grad_norm": 0.12997646629810333, + "learning_rate": 4.406429562119235e-05, + "loss": 0.0101, + "step": 33660 + }, + { + "epoch": 19.59837019790454, + "grad_norm": 0.16725307703018188, + "learning_rate": 4.4036933569599945e-05, + "loss": 0.0105, + "step": 33670 + }, + { + "epoch": 19.60419091967404, + "grad_norm": 0.1190069392323494, + "learning_rate": 4.400957332942965e-05, + "loss": 0.0094, + "step": 33680 + }, + { + "epoch": 19.61001164144354, + "grad_norm": 0.20864811539649963, + "learning_rate": 4.3982214908992844e-05, + "loss": 0.0098, + "step": 33690 + }, + { + "epoch": 19.615832363213038, + "grad_norm": 0.16525353491306305, + "learning_rate": 4.3954858316600235e-05, + "loss": 0.0087, + "step": 33700 + }, + { + "epoch": 19.621653084982537, + "grad_norm": 0.2023632526397705, + "learning_rate": 4.392750356056205e-05, + "loss": 0.0085, + "step": 33710 + }, + { + "epoch": 19.627473806752036, + "grad_norm": 0.15987399220466614, + "learning_rate": 4.390015064918798e-05, + "loss": 0.0084, + "step": 33720 + }, + { + "epoch": 19.633294528521535, + "grad_norm": 0.1891179233789444, + "learning_rate": 4.387279959078705e-05, + "loss": 0.0084, + "step": 33730 + }, + { + "epoch": 19.639115250291034, + "grad_norm": 0.12107190489768982, + "learning_rate": 4.384545039366786e-05, + "loss": 0.0089, + "step": 33740 + }, + { + "epoch": 19.644935972060537, + "grad_norm": 0.16192679107189178, + "learning_rate": 4.381810306613831e-05, + "loss": 0.0143, + "step": 33750 + }, + { + "epoch": 19.650756693830036, + "grad_norm": 0.15036168694496155, + "learning_rate": 4.3790757616505826e-05, + "loss": 0.0117, + "step": 33760 + }, + { + "epoch": 19.656577415599536, + "grad_norm": 0.18178820610046387, + "learning_rate": 4.376341405307725e-05, + "loss": 0.0137, + "step": 33770 + }, + { + "epoch": 19.662398137369035, + "grad_norm": 0.1125609278678894, + "learning_rate": 4.37360723841588e-05, + "loss": 0.0117, + "step": 33780 + }, + { + "epoch": 19.668218859138534, + "grad_norm": 0.19873888790607452, + "learning_rate": 4.370873261805619e-05, + "loss": 0.014, + "step": 33790 + }, + { + "epoch": 19.674039580908033, + "grad_norm": 0.21499282121658325, + "learning_rate": 4.368139476307449e-05, + "loss": 0.014, + "step": 33800 + }, + { + "epoch": 19.679860302677533, + "grad_norm": 0.2103547304868698, + "learning_rate": 4.365405882751822e-05, + "loss": 0.0107, + "step": 33810 + }, + { + "epoch": 19.68568102444703, + "grad_norm": 0.16494931280612946, + "learning_rate": 4.3626724819691326e-05, + "loss": 0.0093, + "step": 33820 + }, + { + "epoch": 19.69150174621653, + "grad_norm": 0.11595741659402847, + "learning_rate": 4.359939274789715e-05, + "loss": 0.0089, + "step": 33830 + }, + { + "epoch": 19.69732246798603, + "grad_norm": 0.11592236161231995, + "learning_rate": 4.357206262043848e-05, + "loss": 0.0119, + "step": 33840 + }, + { + "epoch": 19.70314318975553, + "grad_norm": 0.15784285962581635, + "learning_rate": 4.354473444561745e-05, + "loss": 0.0072, + "step": 33850 + }, + { + "epoch": 19.70896391152503, + "grad_norm": 0.19323843717575073, + "learning_rate": 4.3517408231735644e-05, + "loss": 0.0105, + "step": 33860 + }, + { + "epoch": 19.714784633294528, + "grad_norm": 0.1642087996006012, + "learning_rate": 4.3490083987094086e-05, + "loss": 0.0086, + "step": 33870 + }, + { + "epoch": 19.720605355064027, + "grad_norm": 0.16626384854316711, + "learning_rate": 4.34627617199931e-05, + "loss": 0.0133, + "step": 33880 + }, + { + "epoch": 19.726426076833526, + "grad_norm": 0.2773437798023224, + "learning_rate": 4.3435441438732526e-05, + "loss": 0.0064, + "step": 33890 + }, + { + "epoch": 19.732246798603025, + "grad_norm": 0.09950442612171173, + "learning_rate": 4.340812315161149e-05, + "loss": 0.0092, + "step": 33900 + }, + { + "epoch": 19.738067520372525, + "grad_norm": 0.26822754740715027, + "learning_rate": 4.338080686692859e-05, + "loss": 0.0136, + "step": 33910 + }, + { + "epoch": 19.743888242142027, + "grad_norm": 0.17382164299488068, + "learning_rate": 4.3353492592981816e-05, + "loss": 0.0085, + "step": 33920 + }, + { + "epoch": 19.749708963911527, + "grad_norm": 0.16932591795921326, + "learning_rate": 4.3326180338068485e-05, + "loss": 0.0159, + "step": 33930 + }, + { + "epoch": 19.755529685681026, + "grad_norm": 0.14824414253234863, + "learning_rate": 4.3298870110485356e-05, + "loss": 0.0091, + "step": 33940 + }, + { + "epoch": 19.761350407450525, + "grad_norm": 0.1414097547531128, + "learning_rate": 4.3271561918528567e-05, + "loss": 0.0084, + "step": 33950 + }, + { + "epoch": 19.767171129220024, + "grad_norm": 0.14723606407642365, + "learning_rate": 4.324425577049359e-05, + "loss": 0.0093, + "step": 33960 + }, + { + "epoch": 19.772991850989523, + "grad_norm": 0.12496569752693176, + "learning_rate": 4.321695167467535e-05, + "loss": 0.0108, + "step": 33970 + }, + { + "epoch": 19.778812572759023, + "grad_norm": 0.2125893235206604, + "learning_rate": 4.3189649639368093e-05, + "loss": 0.0147, + "step": 33980 + }, + { + "epoch": 19.784633294528522, + "grad_norm": 0.12255004048347473, + "learning_rate": 4.316234967286547e-05, + "loss": 0.0094, + "step": 33990 + }, + { + "epoch": 19.79045401629802, + "grad_norm": 0.27263957262039185, + "learning_rate": 4.313505178346046e-05, + "loss": 0.0172, + "step": 34000 + }, + { + "epoch": 19.79627473806752, + "grad_norm": 0.15816546976566315, + "learning_rate": 4.3107755979445465e-05, + "loss": 0.0104, + "step": 34010 + }, + { + "epoch": 19.80209545983702, + "grad_norm": 0.12162778526544571, + "learning_rate": 4.308046226911224e-05, + "loss": 0.0089, + "step": 34020 + }, + { + "epoch": 19.80791618160652, + "grad_norm": 0.10801813006401062, + "learning_rate": 4.305317066075185e-05, + "loss": 0.0117, + "step": 34030 + }, + { + "epoch": 19.813736903376018, + "grad_norm": 0.20327617228031158, + "learning_rate": 4.302588116265482e-05, + "loss": 0.0103, + "step": 34040 + }, + { + "epoch": 19.819557625145517, + "grad_norm": 0.1775432676076889, + "learning_rate": 4.299859378311094e-05, + "loss": 0.0116, + "step": 34050 + }, + { + "epoch": 19.825378346915016, + "grad_norm": 0.1823890060186386, + "learning_rate": 4.2971308530409424e-05, + "loss": 0.0084, + "step": 34060 + }, + { + "epoch": 19.831199068684516, + "grad_norm": 0.17319361865520477, + "learning_rate": 4.2944025412838765e-05, + "loss": 0.0082, + "step": 34070 + }, + { + "epoch": 19.837019790454015, + "grad_norm": 0.1853582262992859, + "learning_rate": 4.291674443868689e-05, + "loss": 0.0093, + "step": 34080 + }, + { + "epoch": 19.842840512223514, + "grad_norm": 0.09930511564016342, + "learning_rate": 4.288946561624104e-05, + "loss": 0.0098, + "step": 34090 + }, + { + "epoch": 19.848661233993017, + "grad_norm": 0.1271035224199295, + "learning_rate": 4.2862188953787794e-05, + "loss": 0.0085, + "step": 34100 + }, + { + "epoch": 19.854481955762516, + "grad_norm": 0.18600960075855255, + "learning_rate": 4.283491445961308e-05, + "loss": 0.0099, + "step": 34110 + }, + { + "epoch": 19.860302677532015, + "grad_norm": 0.16477997601032257, + "learning_rate": 4.2807642142002155e-05, + "loss": 0.011, + "step": 34120 + }, + { + "epoch": 19.866123399301514, + "grad_norm": 0.20230868458747864, + "learning_rate": 4.278037200923966e-05, + "loss": 0.0085, + "step": 34130 + }, + { + "epoch": 19.871944121071014, + "grad_norm": 0.2125154733657837, + "learning_rate": 4.275310406960953e-05, + "loss": 0.0168, + "step": 34140 + }, + { + "epoch": 19.877764842840513, + "grad_norm": 0.19766896963119507, + "learning_rate": 4.272583833139502e-05, + "loss": 0.0127, + "step": 34150 + }, + { + "epoch": 19.883585564610012, + "grad_norm": 0.2404792457818985, + "learning_rate": 4.2698574802878794e-05, + "loss": 0.0108, + "step": 34160 + }, + { + "epoch": 19.88940628637951, + "grad_norm": 0.20025165379047394, + "learning_rate": 4.2671313492342734e-05, + "loss": 0.0074, + "step": 34170 + }, + { + "epoch": 19.89522700814901, + "grad_norm": 0.2594536542892456, + "learning_rate": 4.264405440806813e-05, + "loss": 0.0089, + "step": 34180 + }, + { + "epoch": 19.90104772991851, + "grad_norm": 0.1524757295846939, + "learning_rate": 4.26167975583356e-05, + "loss": 0.0157, + "step": 34190 + }, + { + "epoch": 19.90686845168801, + "grad_norm": 0.13177426159381866, + "learning_rate": 4.2589542951425e-05, + "loss": 0.008, + "step": 34200 + }, + { + "epoch": 19.912689173457508, + "grad_norm": 0.19066721200942993, + "learning_rate": 4.2562290595615615e-05, + "loss": 0.0115, + "step": 34210 + }, + { + "epoch": 19.918509895227007, + "grad_norm": 0.16536769270896912, + "learning_rate": 4.2535040499185946e-05, + "loss": 0.0136, + "step": 34220 + }, + { + "epoch": 19.924330616996507, + "grad_norm": 0.2092328667640686, + "learning_rate": 4.250779267041387e-05, + "loss": 0.0106, + "step": 34230 + }, + { + "epoch": 19.930151338766006, + "grad_norm": 0.14909307658672333, + "learning_rate": 4.248054711757657e-05, + "loss": 0.0102, + "step": 34240 + }, + { + "epoch": 19.935972060535505, + "grad_norm": 0.1555938422679901, + "learning_rate": 4.245330384895052e-05, + "loss": 0.0127, + "step": 34250 + }, + { + "epoch": 19.941792782305004, + "grad_norm": 0.12639136612415314, + "learning_rate": 4.242606287281151e-05, + "loss": 0.0148, + "step": 34260 + }, + { + "epoch": 19.947613504074504, + "grad_norm": 0.12190929055213928, + "learning_rate": 4.2398824197434595e-05, + "loss": 0.0159, + "step": 34270 + }, + { + "epoch": 19.953434225844006, + "grad_norm": 0.12750542163848877, + "learning_rate": 4.23715878310942e-05, + "loss": 0.0092, + "step": 34280 + }, + { + "epoch": 19.959254947613505, + "grad_norm": 0.12414013594388962, + "learning_rate": 4.234435378206402e-05, + "loss": 0.009, + "step": 34290 + }, + { + "epoch": 19.965075669383005, + "grad_norm": 0.14311735332012177, + "learning_rate": 4.2317122058617006e-05, + "loss": 0.0108, + "step": 34300 + }, + { + "epoch": 19.970896391152504, + "grad_norm": 0.1308780461549759, + "learning_rate": 4.2289892669025485e-05, + "loss": 0.0113, + "step": 34310 + }, + { + "epoch": 19.976717112922003, + "grad_norm": 0.25151577591896057, + "learning_rate": 4.226266562156097e-05, + "loss": 0.0113, + "step": 34320 + }, + { + "epoch": 19.982537834691502, + "grad_norm": 0.17076249420642853, + "learning_rate": 4.223544092449435e-05, + "loss": 0.0099, + "step": 34330 + }, + { + "epoch": 19.988358556461, + "grad_norm": 0.10826318711042404, + "learning_rate": 4.2208218586095784e-05, + "loss": 0.0095, + "step": 34340 + }, + { + "epoch": 19.9941792782305, + "grad_norm": 0.14375072717666626, + "learning_rate": 4.218099861463466e-05, + "loss": 0.0084, + "step": 34350 + }, + { + "epoch": 20.0, + "grad_norm": 0.0991954356431961, + "learning_rate": 4.215378101837972e-05, + "loss": 0.0119, + "step": 34360 + }, + { + "epoch": 20.0058207217695, + "grad_norm": 0.10785826295614243, + "learning_rate": 4.2126565805598937e-05, + "loss": 0.0094, + "step": 34370 + }, + { + "epoch": 20.011641443539, + "grad_norm": 0.2037353664636612, + "learning_rate": 4.209935298455957e-05, + "loss": 0.0083, + "step": 34380 + }, + { + "epoch": 20.017462165308498, + "grad_norm": 0.1981620192527771, + "learning_rate": 4.207214256352817e-05, + "loss": 0.0083, + "step": 34390 + }, + { + "epoch": 20.023282887077997, + "grad_norm": 0.11035227030515671, + "learning_rate": 4.2044934550770524e-05, + "loss": 0.0129, + "step": 34400 + }, + { + "epoch": 20.029103608847496, + "grad_norm": 0.14006012678146362, + "learning_rate": 4.201772895455174e-05, + "loss": 0.0131, + "step": 34410 + }, + { + "epoch": 20.034924330616995, + "grad_norm": 0.17275595664978027, + "learning_rate": 4.199052578313613e-05, + "loss": 0.0122, + "step": 34420 + }, + { + "epoch": 20.040745052386495, + "grad_norm": 0.15135489404201508, + "learning_rate": 4.1963325044787294e-05, + "loss": 0.0111, + "step": 34430 + }, + { + "epoch": 20.046565774155994, + "grad_norm": 0.14585694670677185, + "learning_rate": 4.193612674776814e-05, + "loss": 0.0088, + "step": 34440 + }, + { + "epoch": 20.052386495925496, + "grad_norm": 0.10141884535551071, + "learning_rate": 4.1908930900340745e-05, + "loss": 0.0098, + "step": 34450 + }, + { + "epoch": 20.058207217694996, + "grad_norm": 0.18232299387454987, + "learning_rate": 4.1881737510766536e-05, + "loss": 0.0093, + "step": 34460 + }, + { + "epoch": 20.064027939464495, + "grad_norm": 0.13559485971927643, + "learning_rate": 4.185454658730609e-05, + "loss": 0.0077, + "step": 34470 + }, + { + "epoch": 20.069848661233994, + "grad_norm": 0.15430961549282074, + "learning_rate": 4.1827358138219355e-05, + "loss": 0.0119, + "step": 34480 + }, + { + "epoch": 20.075669383003493, + "grad_norm": 0.1471460461616516, + "learning_rate": 4.1800172171765404e-05, + "loss": 0.0168, + "step": 34490 + }, + { + "epoch": 20.081490104772993, + "grad_norm": 0.14017048478126526, + "learning_rate": 4.177298869620264e-05, + "loss": 0.0086, + "step": 34500 + }, + { + "epoch": 20.087310826542492, + "grad_norm": 0.14331679046154022, + "learning_rate": 4.1745807719788705e-05, + "loss": 0.0118, + "step": 34510 + }, + { + "epoch": 20.09313154831199, + "grad_norm": 0.14626026153564453, + "learning_rate": 4.1718629250780445e-05, + "loss": 0.0096, + "step": 34520 + }, + { + "epoch": 20.09895227008149, + "grad_norm": 0.10522554069757462, + "learning_rate": 4.1691453297433956e-05, + "loss": 0.0094, + "step": 34530 + }, + { + "epoch": 20.10477299185099, + "grad_norm": 0.27462857961654663, + "learning_rate": 4.166427986800457e-05, + "loss": 0.0106, + "step": 34540 + }, + { + "epoch": 20.11059371362049, + "grad_norm": 0.15869802236557007, + "learning_rate": 4.163710897074688e-05, + "loss": 0.0075, + "step": 34550 + }, + { + "epoch": 20.116414435389988, + "grad_norm": 0.14408349990844727, + "learning_rate": 4.1609940613914686e-05, + "loss": 0.0166, + "step": 34560 + }, + { + "epoch": 20.122235157159487, + "grad_norm": 0.16050852835178375, + "learning_rate": 4.1582774805760996e-05, + "loss": 0.0122, + "step": 34570 + }, + { + "epoch": 20.128055878928986, + "grad_norm": 0.1867092102766037, + "learning_rate": 4.155561155453809e-05, + "loss": 0.0093, + "step": 34580 + }, + { + "epoch": 20.133876600698486, + "grad_norm": 0.16095732152462006, + "learning_rate": 4.15284508684974e-05, + "loss": 0.0111, + "step": 34590 + }, + { + "epoch": 20.139697322467985, + "grad_norm": 0.18379664421081543, + "learning_rate": 4.1501292755889675e-05, + "loss": 0.0133, + "step": 34600 + }, + { + "epoch": 20.145518044237484, + "grad_norm": 0.2024531215429306, + "learning_rate": 4.1474137224964833e-05, + "loss": 0.0098, + "step": 34610 + }, + { + "epoch": 20.151338766006983, + "grad_norm": 0.1967552751302719, + "learning_rate": 4.144698428397197e-05, + "loss": 0.0137, + "step": 34620 + }, + { + "epoch": 20.157159487776486, + "grad_norm": 0.12721017003059387, + "learning_rate": 4.1419833941159466e-05, + "loss": 0.0072, + "step": 34630 + }, + { + "epoch": 20.162980209545985, + "grad_norm": 0.25852081179618835, + "learning_rate": 4.1392686204774846e-05, + "loss": 0.0092, + "step": 34640 + }, + { + "epoch": 20.168800931315484, + "grad_norm": 0.20481155812740326, + "learning_rate": 4.13655410830649e-05, + "loss": 0.0087, + "step": 34650 + }, + { + "epoch": 20.174621653084984, + "grad_norm": 0.2788775563240051, + "learning_rate": 4.1338398584275594e-05, + "loss": 0.0143, + "step": 34660 + }, + { + "epoch": 20.180442374854483, + "grad_norm": 0.12603877484798431, + "learning_rate": 4.1311258716652104e-05, + "loss": 0.0117, + "step": 34670 + }, + { + "epoch": 20.186263096623982, + "grad_norm": 0.1243518739938736, + "learning_rate": 4.128412148843881e-05, + "loss": 0.0084, + "step": 34680 + }, + { + "epoch": 20.19208381839348, + "grad_norm": 0.2213018238544464, + "learning_rate": 4.125698690787926e-05, + "loss": 0.0132, + "step": 34690 + }, + { + "epoch": 20.19790454016298, + "grad_norm": 0.1395307183265686, + "learning_rate": 4.1229854983216245e-05, + "loss": 0.0116, + "step": 34700 + }, + { + "epoch": 20.20372526193248, + "grad_norm": 0.2382606714963913, + "learning_rate": 4.120272572269175e-05, + "loss": 0.0118, + "step": 34710 + }, + { + "epoch": 20.20954598370198, + "grad_norm": 0.13717064261436462, + "learning_rate": 4.117559913454687e-05, + "loss": 0.0111, + "step": 34720 + }, + { + "epoch": 20.215366705471478, + "grad_norm": 0.13907241821289062, + "learning_rate": 4.114847522702201e-05, + "loss": 0.0098, + "step": 34730 + }, + { + "epoch": 20.221187427240977, + "grad_norm": 0.12240081280469894, + "learning_rate": 4.112135400835664e-05, + "loss": 0.0063, + "step": 34740 + }, + { + "epoch": 20.227008149010477, + "grad_norm": 0.12404307723045349, + "learning_rate": 4.109423548678949e-05, + "loss": 0.0063, + "step": 34750 + }, + { + "epoch": 20.232828870779976, + "grad_norm": 0.10768923908472061, + "learning_rate": 4.106711967055848e-05, + "loss": 0.0104, + "step": 34760 + }, + { + "epoch": 20.238649592549475, + "grad_norm": 0.1518455296754837, + "learning_rate": 4.1040006567900636e-05, + "loss": 0.0089, + "step": 34770 + }, + { + "epoch": 20.244470314318974, + "grad_norm": 0.1808595359325409, + "learning_rate": 4.101289618705224e-05, + "loss": 0.0075, + "step": 34780 + }, + { + "epoch": 20.250291036088473, + "grad_norm": 0.22003255784511566, + "learning_rate": 4.0985788536248675e-05, + "loss": 0.0075, + "step": 34790 + }, + { + "epoch": 20.256111757857976, + "grad_norm": 0.18094468116760254, + "learning_rate": 4.095868362372454e-05, + "loss": 0.0072, + "step": 34800 + }, + { + "epoch": 20.261932479627475, + "grad_norm": 0.21144908666610718, + "learning_rate": 4.0931581457713614e-05, + "loss": 0.0094, + "step": 34810 + }, + { + "epoch": 20.267753201396975, + "grad_norm": 0.2093619704246521, + "learning_rate": 4.09044820464488e-05, + "loss": 0.0125, + "step": 34820 + }, + { + "epoch": 20.273573923166474, + "grad_norm": 0.20733238756656647, + "learning_rate": 4.087738539816219e-05, + "loss": 0.0109, + "step": 34830 + }, + { + "epoch": 20.279394644935973, + "grad_norm": 0.3398309051990509, + "learning_rate": 4.085029152108501e-05, + "loss": 0.0131, + "step": 34840 + }, + { + "epoch": 20.285215366705472, + "grad_norm": 0.19109508395195007, + "learning_rate": 4.0823200423447714e-05, + "loss": 0.0112, + "step": 34850 + }, + { + "epoch": 20.29103608847497, + "grad_norm": 0.1330375373363495, + "learning_rate": 4.079611211347981e-05, + "loss": 0.0134, + "step": 34860 + }, + { + "epoch": 20.29685681024447, + "grad_norm": 0.15902957320213318, + "learning_rate": 4.076902659941002e-05, + "loss": 0.008, + "step": 34870 + }, + { + "epoch": 20.30267753201397, + "grad_norm": 0.14774487912654877, + "learning_rate": 4.074194388946624e-05, + "loss": 0.0088, + "step": 34880 + }, + { + "epoch": 20.30849825378347, + "grad_norm": 0.16092664003372192, + "learning_rate": 4.071486399187545e-05, + "loss": 0.0088, + "step": 34890 + }, + { + "epoch": 20.31431897555297, + "grad_norm": 0.14123061299324036, + "learning_rate": 4.0687786914863836e-05, + "loss": 0.0072, + "step": 34900 + }, + { + "epoch": 20.320139697322467, + "grad_norm": 0.15611910820007324, + "learning_rate": 4.0660712666656666e-05, + "loss": 0.0075, + "step": 34910 + }, + { + "epoch": 20.325960419091967, + "grad_norm": 0.1954159289598465, + "learning_rate": 4.0633641255478394e-05, + "loss": 0.0121, + "step": 34920 + }, + { + "epoch": 20.331781140861466, + "grad_norm": 0.11175677925348282, + "learning_rate": 4.0606572689552624e-05, + "loss": 0.0072, + "step": 34930 + }, + { + "epoch": 20.337601862630965, + "grad_norm": 0.22444584965705872, + "learning_rate": 4.0579506977102036e-05, + "loss": 0.013, + "step": 34940 + }, + { + "epoch": 20.343422584400464, + "grad_norm": 0.17769411206245422, + "learning_rate": 4.055244412634849e-05, + "loss": 0.0095, + "step": 34950 + }, + { + "epoch": 20.349243306169964, + "grad_norm": 0.13328899443149567, + "learning_rate": 4.052538414551298e-05, + "loss": 0.0116, + "step": 34960 + }, + { + "epoch": 20.355064027939463, + "grad_norm": 0.10934446007013321, + "learning_rate": 4.0498327042815596e-05, + "loss": 0.0092, + "step": 34970 + }, + { + "epoch": 20.360884749708966, + "grad_norm": 0.11388754844665527, + "learning_rate": 4.047127282647559e-05, + "loss": 0.0084, + "step": 34980 + }, + { + "epoch": 20.366705471478465, + "grad_norm": 0.17465999722480774, + "learning_rate": 4.04442215047113e-05, + "loss": 0.0075, + "step": 34990 + }, + { + "epoch": 20.372526193247964, + "grad_norm": 0.14419563114643097, + "learning_rate": 4.041717308574023e-05, + "loss": 0.007, + "step": 35000 + }, + { + "epoch": 20.378346915017463, + "grad_norm": 0.20996518433094025, + "learning_rate": 4.039012757777893e-05, + "loss": 0.0086, + "step": 35010 + }, + { + "epoch": 20.384167636786962, + "grad_norm": 0.16220876574516296, + "learning_rate": 4.036308498904314e-05, + "loss": 0.0062, + "step": 35020 + }, + { + "epoch": 20.38998835855646, + "grad_norm": 0.13357046246528625, + "learning_rate": 4.033604532774771e-05, + "loss": 0.0117, + "step": 35030 + }, + { + "epoch": 20.39580908032596, + "grad_norm": 0.1650664061307907, + "learning_rate": 4.030900860210652e-05, + "loss": 0.008, + "step": 35040 + }, + { + "epoch": 20.40162980209546, + "grad_norm": 0.11723874509334564, + "learning_rate": 4.028197482033266e-05, + "loss": 0.0103, + "step": 35050 + }, + { + "epoch": 20.40745052386496, + "grad_norm": 0.239778533577919, + "learning_rate": 4.0254943990638246e-05, + "loss": 0.0095, + "step": 35060 + }, + { + "epoch": 20.41327124563446, + "grad_norm": 0.2748855948448181, + "learning_rate": 4.022791612123454e-05, + "loss": 0.0161, + "step": 35070 + }, + { + "epoch": 20.419091967403958, + "grad_norm": 0.17227359116077423, + "learning_rate": 4.020089122033192e-05, + "loss": 0.0117, + "step": 35080 + }, + { + "epoch": 20.424912689173457, + "grad_norm": 0.1550983339548111, + "learning_rate": 4.01738692961398e-05, + "loss": 0.0185, + "step": 35090 + }, + { + "epoch": 20.430733410942956, + "grad_norm": 0.14513827860355377, + "learning_rate": 4.014685035686675e-05, + "loss": 0.0078, + "step": 35100 + }, + { + "epoch": 20.436554132712455, + "grad_norm": 0.15046493709087372, + "learning_rate": 4.011983441072039e-05, + "loss": 0.0089, + "step": 35110 + }, + { + "epoch": 20.442374854481955, + "grad_norm": 0.20018380880355835, + "learning_rate": 4.0092821465907485e-05, + "loss": 0.0132, + "step": 35120 + }, + { + "epoch": 20.448195576251454, + "grad_norm": 0.19426238536834717, + "learning_rate": 4.006581153063383e-05, + "loss": 0.0084, + "step": 35130 + }, + { + "epoch": 20.454016298020953, + "grad_norm": 0.15996426343917847, + "learning_rate": 4.003880461310432e-05, + "loss": 0.0098, + "step": 35140 + }, + { + "epoch": 20.459837019790456, + "grad_norm": 0.29236599802970886, + "learning_rate": 4.001180072152298e-05, + "loss": 0.0112, + "step": 35150 + }, + { + "epoch": 20.465657741559955, + "grad_norm": 0.1524381786584854, + "learning_rate": 3.998479986409285e-05, + "loss": 0.0099, + "step": 35160 + }, + { + "epoch": 20.471478463329454, + "grad_norm": 0.15310294926166534, + "learning_rate": 3.995780204901607e-05, + "loss": 0.0076, + "step": 35170 + }, + { + "epoch": 20.477299185098953, + "grad_norm": 0.11088825017213821, + "learning_rate": 3.993080728449391e-05, + "loss": 0.0075, + "step": 35180 + }, + { + "epoch": 20.483119906868453, + "grad_norm": 0.12772677838802338, + "learning_rate": 3.990381557872661e-05, + "loss": 0.007, + "step": 35190 + }, + { + "epoch": 20.488940628637952, + "grad_norm": 0.11027088761329651, + "learning_rate": 3.987682693991359e-05, + "loss": 0.0085, + "step": 35200 + }, + { + "epoch": 20.49476135040745, + "grad_norm": 0.10262049734592438, + "learning_rate": 3.9849841376253226e-05, + "loss": 0.0061, + "step": 35210 + }, + { + "epoch": 20.50058207217695, + "grad_norm": 0.17721597850322723, + "learning_rate": 3.982285889594306e-05, + "loss": 0.0122, + "step": 35220 + }, + { + "epoch": 20.50640279394645, + "grad_norm": 0.2041778713464737, + "learning_rate": 3.9795879507179665e-05, + "loss": 0.0092, + "step": 35230 + }, + { + "epoch": 20.51222351571595, + "grad_norm": 0.16633857786655426, + "learning_rate": 3.9768903218158634e-05, + "loss": 0.0088, + "step": 35240 + }, + { + "epoch": 20.518044237485448, + "grad_norm": 0.17499056458473206, + "learning_rate": 3.974193003707468e-05, + "loss": 0.0092, + "step": 35250 + }, + { + "epoch": 20.523864959254947, + "grad_norm": 0.2178049087524414, + "learning_rate": 3.971495997212152e-05, + "loss": 0.01, + "step": 35260 + }, + { + "epoch": 20.529685681024446, + "grad_norm": 0.2767927050590515, + "learning_rate": 3.9687993031491985e-05, + "loss": 0.0095, + "step": 35270 + }, + { + "epoch": 20.535506402793946, + "grad_norm": 0.19741927087306976, + "learning_rate": 3.966102922337787e-05, + "loss": 0.0116, + "step": 35280 + }, + { + "epoch": 20.541327124563445, + "grad_norm": 0.15487167239189148, + "learning_rate": 3.963406855597009e-05, + "loss": 0.0081, + "step": 35290 + }, + { + "epoch": 20.547147846332944, + "grad_norm": 0.1307229995727539, + "learning_rate": 3.960711103745861e-05, + "loss": 0.0096, + "step": 35300 + }, + { + "epoch": 20.552968568102443, + "grad_norm": 0.14836256206035614, + "learning_rate": 3.958015667603237e-05, + "loss": 0.0105, + "step": 35310 + }, + { + "epoch": 20.558789289871942, + "grad_norm": 0.14764374494552612, + "learning_rate": 3.955320547987943e-05, + "loss": 0.0094, + "step": 35320 + }, + { + "epoch": 20.564610011641445, + "grad_norm": 0.21289406716823578, + "learning_rate": 3.952625745718681e-05, + "loss": 0.0137, + "step": 35330 + }, + { + "epoch": 20.570430733410944, + "grad_norm": 0.16624881327152252, + "learning_rate": 3.949931261614064e-05, + "loss": 0.0111, + "step": 35340 + }, + { + "epoch": 20.576251455180444, + "grad_norm": 0.1897488534450531, + "learning_rate": 3.947237096492605e-05, + "loss": 0.0076, + "step": 35350 + }, + { + "epoch": 20.582072176949943, + "grad_norm": 0.13864900171756744, + "learning_rate": 3.944543251172719e-05, + "loss": 0.0096, + "step": 35360 + }, + { + "epoch": 20.587892898719442, + "grad_norm": 0.14904916286468506, + "learning_rate": 3.941849726472725e-05, + "loss": 0.0072, + "step": 35370 + }, + { + "epoch": 20.59371362048894, + "grad_norm": 0.18545131385326385, + "learning_rate": 3.939156523210846e-05, + "loss": 0.0092, + "step": 35380 + }, + { + "epoch": 20.59953434225844, + "grad_norm": 0.12229405343532562, + "learning_rate": 3.9364636422052046e-05, + "loss": 0.0123, + "step": 35390 + }, + { + "epoch": 20.60535506402794, + "grad_norm": 0.15603876113891602, + "learning_rate": 3.933771084273828e-05, + "loss": 0.0144, + "step": 35400 + }, + { + "epoch": 20.61117578579744, + "grad_norm": 0.2584415376186371, + "learning_rate": 3.931078850234643e-05, + "loss": 0.0123, + "step": 35410 + }, + { + "epoch": 20.616996507566938, + "grad_norm": 0.15771988034248352, + "learning_rate": 3.928386940905483e-05, + "loss": 0.0086, + "step": 35420 + }, + { + "epoch": 20.622817229336437, + "grad_norm": 0.1366727501153946, + "learning_rate": 3.925695357104073e-05, + "loss": 0.0078, + "step": 35430 + }, + { + "epoch": 20.628637951105937, + "grad_norm": 0.24024035036563873, + "learning_rate": 3.923004099648049e-05, + "loss": 0.0119, + "step": 35440 + }, + { + "epoch": 20.634458672875436, + "grad_norm": 0.16667820513248444, + "learning_rate": 3.920313169354944e-05, + "loss": 0.0089, + "step": 35450 + }, + { + "epoch": 20.640279394644935, + "grad_norm": 0.1388242542743683, + "learning_rate": 3.9176225670421897e-05, + "loss": 0.014, + "step": 35460 + }, + { + "epoch": 20.646100116414434, + "grad_norm": 0.20812128484249115, + "learning_rate": 3.9149322935271224e-05, + "loss": 0.0101, + "step": 35470 + }, + { + "epoch": 20.651920838183933, + "grad_norm": 0.2671215832233429, + "learning_rate": 3.9122423496269725e-05, + "loss": 0.0116, + "step": 35480 + }, + { + "epoch": 20.657741559953433, + "grad_norm": 0.17904581129550934, + "learning_rate": 3.909552736158877e-05, + "loss": 0.0093, + "step": 35490 + }, + { + "epoch": 20.663562281722932, + "grad_norm": 0.15009813010692596, + "learning_rate": 3.90686345393987e-05, + "loss": 0.0073, + "step": 35500 + }, + { + "epoch": 20.669383003492435, + "grad_norm": 0.11975826323032379, + "learning_rate": 3.9041745037868816e-05, + "loss": 0.0092, + "step": 35510 + }, + { + "epoch": 20.675203725261934, + "grad_norm": 0.16417208313941956, + "learning_rate": 3.9014858865167465e-05, + "loss": 0.0134, + "step": 35520 + }, + { + "epoch": 20.681024447031433, + "grad_norm": 0.08676816523075104, + "learning_rate": 3.8987976029461935e-05, + "loss": 0.0144, + "step": 35530 + }, + { + "epoch": 20.686845168800932, + "grad_norm": 0.13197311758995056, + "learning_rate": 3.896109653891853e-05, + "loss": 0.0102, + "step": 35540 + }, + { + "epoch": 20.69266589057043, + "grad_norm": 0.10192885994911194, + "learning_rate": 3.893422040170254e-05, + "loss": 0.0072, + "step": 35550 + }, + { + "epoch": 20.69848661233993, + "grad_norm": 0.17634636163711548, + "learning_rate": 3.8907347625978207e-05, + "loss": 0.0139, + "step": 35560 + }, + { + "epoch": 20.70430733410943, + "grad_norm": 0.1685745120048523, + "learning_rate": 3.88804782199088e-05, + "loss": 0.0122, + "step": 35570 + }, + { + "epoch": 20.71012805587893, + "grad_norm": 0.20464184880256653, + "learning_rate": 3.8853612191656495e-05, + "loss": 0.007, + "step": 35580 + }, + { + "epoch": 20.71594877764843, + "grad_norm": 0.11271538585424423, + "learning_rate": 3.88267495493825e-05, + "loss": 0.0115, + "step": 35590 + }, + { + "epoch": 20.721769499417928, + "grad_norm": 0.13842640817165375, + "learning_rate": 3.8799890301247004e-05, + "loss": 0.013, + "step": 35600 + }, + { + "epoch": 20.727590221187427, + "grad_norm": 0.1920432597398758, + "learning_rate": 3.8773034455409096e-05, + "loss": 0.0098, + "step": 35610 + }, + { + "epoch": 20.733410942956926, + "grad_norm": 0.133915975689888, + "learning_rate": 3.8746182020026904e-05, + "loss": 0.0092, + "step": 35620 + }, + { + "epoch": 20.739231664726425, + "grad_norm": 0.11826014518737793, + "learning_rate": 3.871933300325745e-05, + "loss": 0.0078, + "step": 35630 + }, + { + "epoch": 20.745052386495924, + "grad_norm": 0.27925610542297363, + "learning_rate": 3.869248741325679e-05, + "loss": 0.0106, + "step": 35640 + }, + { + "epoch": 20.750873108265424, + "grad_norm": 0.11953294277191162, + "learning_rate": 3.866564525817992e-05, + "loss": 0.0084, + "step": 35650 + }, + { + "epoch": 20.756693830034923, + "grad_norm": 0.14643016457557678, + "learning_rate": 3.8638806546180725e-05, + "loss": 0.0099, + "step": 35660 + }, + { + "epoch": 20.762514551804422, + "grad_norm": 0.14718012511730194, + "learning_rate": 3.861197128541213e-05, + "loss": 0.0114, + "step": 35670 + }, + { + "epoch": 20.768335273573925, + "grad_norm": 0.10072635859251022, + "learning_rate": 3.858513948402599e-05, + "loss": 0.0127, + "step": 35680 + }, + { + "epoch": 20.774155995343424, + "grad_norm": 0.1636141538619995, + "learning_rate": 3.8558311150173077e-05, + "loss": 0.0189, + "step": 35690 + }, + { + "epoch": 20.779976717112923, + "grad_norm": 0.16112889349460602, + "learning_rate": 3.853148629200312e-05, + "loss": 0.0128, + "step": 35700 + }, + { + "epoch": 20.785797438882422, + "grad_norm": 0.14448554813861847, + "learning_rate": 3.850466491766482e-05, + "loss": 0.0108, + "step": 35710 + }, + { + "epoch": 20.79161816065192, + "grad_norm": 0.16195937991142273, + "learning_rate": 3.847784703530583e-05, + "loss": 0.0064, + "step": 35720 + }, + { + "epoch": 20.79743888242142, + "grad_norm": 0.06760425120592117, + "learning_rate": 3.845103265307266e-05, + "loss": 0.008, + "step": 35730 + }, + { + "epoch": 20.80325960419092, + "grad_norm": 0.11101029813289642, + "learning_rate": 3.842422177911086e-05, + "loss": 0.0071, + "step": 35740 + }, + { + "epoch": 20.80908032596042, + "grad_norm": 0.09687268733978271, + "learning_rate": 3.8397414421564826e-05, + "loss": 0.0114, + "step": 35750 + }, + { + "epoch": 20.81490104772992, + "grad_norm": 0.18128053843975067, + "learning_rate": 3.8370610588577935e-05, + "loss": 0.0104, + "step": 35760 + }, + { + "epoch": 20.820721769499418, + "grad_norm": 0.14886757731437683, + "learning_rate": 3.834381028829251e-05, + "loss": 0.0071, + "step": 35770 + }, + { + "epoch": 20.826542491268917, + "grad_norm": 0.10048999637365341, + "learning_rate": 3.8317013528849745e-05, + "loss": 0.0089, + "step": 35780 + }, + { + "epoch": 20.832363213038416, + "grad_norm": 0.1233435869216919, + "learning_rate": 3.8290220318389815e-05, + "loss": 0.0113, + "step": 35790 + }, + { + "epoch": 20.838183934807915, + "grad_norm": 0.09458698332309723, + "learning_rate": 3.8263430665051746e-05, + "loss": 0.0062, + "step": 35800 + }, + { + "epoch": 20.844004656577415, + "grad_norm": 0.15475121140480042, + "learning_rate": 3.8236644576973554e-05, + "loss": 0.0078, + "step": 35810 + }, + { + "epoch": 20.849825378346914, + "grad_norm": 0.14898055791854858, + "learning_rate": 3.820986206229217e-05, + "loss": 0.0089, + "step": 35820 + }, + { + "epoch": 20.855646100116413, + "grad_norm": 0.16072948276996613, + "learning_rate": 3.8183083129143384e-05, + "loss": 0.0079, + "step": 35830 + }, + { + "epoch": 20.861466821885912, + "grad_norm": 0.14506548643112183, + "learning_rate": 3.815630778566193e-05, + "loss": 0.0186, + "step": 35840 + }, + { + "epoch": 20.867287543655415, + "grad_norm": 0.15295633673667908, + "learning_rate": 3.812953603998145e-05, + "loss": 0.0113, + "step": 35850 + }, + { + "epoch": 20.873108265424914, + "grad_norm": 0.1537076085805893, + "learning_rate": 3.8102767900234504e-05, + "loss": 0.0097, + "step": 35860 + }, + { + "epoch": 20.878928987194413, + "grad_norm": 0.11557379364967346, + "learning_rate": 3.807600337455256e-05, + "loss": 0.011, + "step": 35870 + }, + { + "epoch": 20.884749708963913, + "grad_norm": 0.15416964888572693, + "learning_rate": 3.804924247106593e-05, + "loss": 0.0112, + "step": 35880 + }, + { + "epoch": 20.890570430733412, + "grad_norm": 0.16901399195194244, + "learning_rate": 3.8022485197903925e-05, + "loss": 0.0093, + "step": 35890 + }, + { + "epoch": 20.89639115250291, + "grad_norm": 0.13578738272190094, + "learning_rate": 3.799573156319464e-05, + "loss": 0.0079, + "step": 35900 + }, + { + "epoch": 20.90221187427241, + "grad_norm": 0.1384427845478058, + "learning_rate": 3.796898157506515e-05, + "loss": 0.0103, + "step": 35910 + }, + { + "epoch": 20.90803259604191, + "grad_norm": 0.15974591672420502, + "learning_rate": 3.794223524164143e-05, + "loss": 0.0093, + "step": 35920 + }, + { + "epoch": 20.91385331781141, + "grad_norm": 0.2296055406332016, + "learning_rate": 3.7915492571048245e-05, + "loss": 0.0124, + "step": 35930 + }, + { + "epoch": 20.919674039580908, + "grad_norm": 0.16749154031276703, + "learning_rate": 3.788875357140937e-05, + "loss": 0.0168, + "step": 35940 + }, + { + "epoch": 20.925494761350407, + "grad_norm": 0.22274154424667358, + "learning_rate": 3.786201825084736e-05, + "loss": 0.011, + "step": 35950 + }, + { + "epoch": 20.931315483119906, + "grad_norm": 0.15143555402755737, + "learning_rate": 3.783528661748372e-05, + "loss": 0.009, + "step": 35960 + }, + { + "epoch": 20.937136204889406, + "grad_norm": 0.17640218138694763, + "learning_rate": 3.780855867943882e-05, + "loss": 0.0139, + "step": 35970 + }, + { + "epoch": 20.942956926658905, + "grad_norm": 0.21972867846488953, + "learning_rate": 3.778183444483189e-05, + "loss": 0.0092, + "step": 35980 + }, + { + "epoch": 20.948777648428404, + "grad_norm": 0.20205500721931458, + "learning_rate": 3.775511392178108e-05, + "loss": 0.0109, + "step": 35990 + }, + { + "epoch": 20.954598370197903, + "grad_norm": 0.191153421998024, + "learning_rate": 3.772839711840332e-05, + "loss": 0.0134, + "step": 36000 + }, + { + "epoch": 20.960419091967402, + "grad_norm": 0.22602729499340057, + "learning_rate": 3.7701684042814515e-05, + "loss": 0.013, + "step": 36010 + }, + { + "epoch": 20.9662398137369, + "grad_norm": 0.23730690777301788, + "learning_rate": 3.76749747031294e-05, + "loss": 0.009, + "step": 36020 + }, + { + "epoch": 20.972060535506404, + "grad_norm": 0.13014042377471924, + "learning_rate": 3.764826910746152e-05, + "loss": 0.0166, + "step": 36030 + }, + { + "epoch": 20.977881257275904, + "grad_norm": 0.07751065492630005, + "learning_rate": 3.762156726392338e-05, + "loss": 0.0072, + "step": 36040 + }, + { + "epoch": 20.983701979045403, + "grad_norm": 0.13264507055282593, + "learning_rate": 3.759486918062625e-05, + "loss": 0.013, + "step": 36050 + }, + { + "epoch": 20.989522700814902, + "grad_norm": 0.14907823503017426, + "learning_rate": 3.756817486568033e-05, + "loss": 0.0106, + "step": 36060 + }, + { + "epoch": 20.9953434225844, + "grad_norm": 0.18960502743721008, + "learning_rate": 3.7541484327194654e-05, + "loss": 0.0097, + "step": 36070 + }, + { + "epoch": 21.0011641443539, + "grad_norm": 0.10296875238418579, + "learning_rate": 3.751479757327707e-05, + "loss": 0.0121, + "step": 36080 + }, + { + "epoch": 21.0069848661234, + "grad_norm": 0.24657931923866272, + "learning_rate": 3.7488114612034345e-05, + "loss": 0.0113, + "step": 36090 + }, + { + "epoch": 21.0128055878929, + "grad_norm": 0.15149812400341034, + "learning_rate": 3.7461435451572044e-05, + "loss": 0.012, + "step": 36100 + }, + { + "epoch": 21.018626309662398, + "grad_norm": 0.07431797683238983, + "learning_rate": 3.743476009999459e-05, + "loss": 0.0137, + "step": 36110 + }, + { + "epoch": 21.024447031431897, + "grad_norm": 0.13376514613628387, + "learning_rate": 3.7408088565405245e-05, + "loss": 0.0085, + "step": 36120 + }, + { + "epoch": 21.030267753201397, + "grad_norm": 0.11051466315984726, + "learning_rate": 3.738142085590612e-05, + "loss": 0.0125, + "step": 36130 + }, + { + "epoch": 21.036088474970896, + "grad_norm": 0.16638608276844025, + "learning_rate": 3.7354756979598194e-05, + "loss": 0.0099, + "step": 36140 + }, + { + "epoch": 21.041909196740395, + "grad_norm": 0.1311444789171219, + "learning_rate": 3.7328096944581187e-05, + "loss": 0.0082, + "step": 36150 + }, + { + "epoch": 21.047729918509894, + "grad_norm": 0.12913724780082703, + "learning_rate": 3.730144075895377e-05, + "loss": 0.0106, + "step": 36160 + }, + { + "epoch": 21.053550640279393, + "grad_norm": 0.1184098944067955, + "learning_rate": 3.727478843081335e-05, + "loss": 0.0084, + "step": 36170 + }, + { + "epoch": 21.059371362048893, + "grad_norm": 0.17714090645313263, + "learning_rate": 3.72481399682562e-05, + "loss": 0.0085, + "step": 36180 + }, + { + "epoch": 21.065192083818392, + "grad_norm": 0.15859921276569366, + "learning_rate": 3.722149537937747e-05, + "loss": 0.0112, + "step": 36190 + }, + { + "epoch": 21.07101280558789, + "grad_norm": 0.17926189303398132, + "learning_rate": 3.7194854672271015e-05, + "loss": 0.0074, + "step": 36200 + }, + { + "epoch": 21.076833527357394, + "grad_norm": 0.11560161411762238, + "learning_rate": 3.7168217855029644e-05, + "loss": 0.01, + "step": 36210 + }, + { + "epoch": 21.082654249126893, + "grad_norm": 0.11144976317882538, + "learning_rate": 3.7141584935744856e-05, + "loss": 0.0108, + "step": 36220 + }, + { + "epoch": 21.088474970896392, + "grad_norm": 0.16427995264530182, + "learning_rate": 3.7114955922507055e-05, + "loss": 0.0075, + "step": 36230 + }, + { + "epoch": 21.09429569266589, + "grad_norm": 0.1386810690164566, + "learning_rate": 3.708833082340545e-05, + "loss": 0.012, + "step": 36240 + }, + { + "epoch": 21.10011641443539, + "grad_norm": 0.11247413605451584, + "learning_rate": 3.7061709646528034e-05, + "loss": 0.0064, + "step": 36250 + }, + { + "epoch": 21.10593713620489, + "grad_norm": 0.14582960307598114, + "learning_rate": 3.7035092399961604e-05, + "loss": 0.0089, + "step": 36260 + }, + { + "epoch": 21.11175785797439, + "grad_norm": 0.1191091313958168, + "learning_rate": 3.700847909179177e-05, + "loss": 0.0094, + "step": 36270 + }, + { + "epoch": 21.11757857974389, + "grad_norm": 0.16801577806472778, + "learning_rate": 3.698186973010297e-05, + "loss": 0.0158, + "step": 36280 + }, + { + "epoch": 21.123399301513388, + "grad_norm": 0.1012287363409996, + "learning_rate": 3.695526432297844e-05, + "loss": 0.0086, + "step": 36290 + }, + { + "epoch": 21.129220023282887, + "grad_norm": 0.1927359700202942, + "learning_rate": 3.692866287850017e-05, + "loss": 0.0077, + "step": 36300 + }, + { + "epoch": 21.135040745052386, + "grad_norm": 0.1345120668411255, + "learning_rate": 3.6902065404749006e-05, + "loss": 0.0095, + "step": 36310 + }, + { + "epoch": 21.140861466821885, + "grad_norm": 0.15951013565063477, + "learning_rate": 3.6875471909804516e-05, + "loss": 0.0121, + "step": 36320 + }, + { + "epoch": 21.146682188591384, + "grad_norm": 0.1479579210281372, + "learning_rate": 3.6848882401745135e-05, + "loss": 0.0065, + "step": 36330 + }, + { + "epoch": 21.152502910360884, + "grad_norm": 0.14238564670085907, + "learning_rate": 3.682229688864806e-05, + "loss": 0.0104, + "step": 36340 + }, + { + "epoch": 21.158323632130383, + "grad_norm": 0.09470665454864502, + "learning_rate": 3.6795715378589235e-05, + "loss": 0.0063, + "step": 36350 + }, + { + "epoch": 21.164144353899882, + "grad_norm": 0.13670653104782104, + "learning_rate": 3.676913787964345e-05, + "loss": 0.0126, + "step": 36360 + }, + { + "epoch": 21.16996507566938, + "grad_norm": 0.11854956299066544, + "learning_rate": 3.674256439988423e-05, + "loss": 0.0107, + "step": 36370 + }, + { + "epoch": 21.175785797438884, + "grad_norm": 0.20088693499565125, + "learning_rate": 3.6715994947383904e-05, + "loss": 0.0102, + "step": 36380 + }, + { + "epoch": 21.181606519208383, + "grad_norm": 0.08259875327348709, + "learning_rate": 3.668942953021357e-05, + "loss": 0.0062, + "step": 36390 + }, + { + "epoch": 21.187427240977883, + "grad_norm": 0.11992927640676498, + "learning_rate": 3.66628681564431e-05, + "loss": 0.0081, + "step": 36400 + }, + { + "epoch": 21.19324796274738, + "grad_norm": 0.1927708387374878, + "learning_rate": 3.663631083414114e-05, + "loss": 0.0062, + "step": 36410 + }, + { + "epoch": 21.19906868451688, + "grad_norm": 0.13530509173870087, + "learning_rate": 3.660975757137509e-05, + "loss": 0.0081, + "step": 36420 + }, + { + "epoch": 21.20488940628638, + "grad_norm": 0.1626838594675064, + "learning_rate": 3.658320837621114e-05, + "loss": 0.008, + "step": 36430 + }, + { + "epoch": 21.21071012805588, + "grad_norm": 0.16651299595832825, + "learning_rate": 3.655666325671426e-05, + "loss": 0.0096, + "step": 36440 + }, + { + "epoch": 21.21653084982538, + "grad_norm": 0.18343831598758698, + "learning_rate": 3.65301222209481e-05, + "loss": 0.0089, + "step": 36450 + }, + { + "epoch": 21.222351571594878, + "grad_norm": 0.14767983555793762, + "learning_rate": 3.650358527697519e-05, + "loss": 0.0138, + "step": 36460 + }, + { + "epoch": 21.228172293364377, + "grad_norm": 0.15250419080257416, + "learning_rate": 3.64770524328567e-05, + "loss": 0.0093, + "step": 36470 + }, + { + "epoch": 21.233993015133876, + "grad_norm": 0.17070774734020233, + "learning_rate": 3.645052369665265e-05, + "loss": 0.0091, + "step": 36480 + }, + { + "epoch": 21.239813736903375, + "grad_norm": 0.13078126311302185, + "learning_rate": 3.6423999076421724e-05, + "loss": 0.0103, + "step": 36490 + }, + { + "epoch": 21.245634458672875, + "grad_norm": 0.15749672055244446, + "learning_rate": 3.639747858022142e-05, + "loss": 0.0105, + "step": 36500 + }, + { + "epoch": 21.251455180442374, + "grad_norm": 0.1464647799730301, + "learning_rate": 3.637096221610799e-05, + "loss": 0.0092, + "step": 36510 + }, + { + "epoch": 21.257275902211873, + "grad_norm": 0.12693682312965393, + "learning_rate": 3.634444999213638e-05, + "loss": 0.0063, + "step": 36520 + }, + { + "epoch": 21.263096623981372, + "grad_norm": 0.17056390643119812, + "learning_rate": 3.6317941916360296e-05, + "loss": 0.0112, + "step": 36530 + }, + { + "epoch": 21.26891734575087, + "grad_norm": 0.24555130302906036, + "learning_rate": 3.629143799683221e-05, + "loss": 0.0076, + "step": 36540 + }, + { + "epoch": 21.274738067520374, + "grad_norm": 0.16150304675102234, + "learning_rate": 3.626493824160331e-05, + "loss": 0.0086, + "step": 36550 + }, + { + "epoch": 21.280558789289874, + "grad_norm": 0.18319129943847656, + "learning_rate": 3.623844265872352e-05, + "loss": 0.0052, + "step": 36560 + }, + { + "epoch": 21.286379511059373, + "grad_norm": 0.24680626392364502, + "learning_rate": 3.621195125624149e-05, + "loss": 0.0093, + "step": 36570 + }, + { + "epoch": 21.292200232828872, + "grad_norm": 0.15520747005939484, + "learning_rate": 3.618546404220463e-05, + "loss": 0.0111, + "step": 36580 + }, + { + "epoch": 21.29802095459837, + "grad_norm": 0.14885564148426056, + "learning_rate": 3.615898102465903e-05, + "loss": 0.011, + "step": 36590 + }, + { + "epoch": 21.30384167636787, + "grad_norm": 0.1641659438610077, + "learning_rate": 3.6132502211649544e-05, + "loss": 0.0097, + "step": 36600 + }, + { + "epoch": 21.30966239813737, + "grad_norm": 0.136110320687294, + "learning_rate": 3.610602761121975e-05, + "loss": 0.0085, + "step": 36610 + }, + { + "epoch": 21.31548311990687, + "grad_norm": 0.1204947754740715, + "learning_rate": 3.6079557231411897e-05, + "loss": 0.0099, + "step": 36620 + }, + { + "epoch": 21.321303841676368, + "grad_norm": 0.09129106998443604, + "learning_rate": 3.6053091080267035e-05, + "loss": 0.0065, + "step": 36630 + }, + { + "epoch": 21.327124563445867, + "grad_norm": 0.16981108486652374, + "learning_rate": 3.602662916582483e-05, + "loss": 0.0076, + "step": 36640 + }, + { + "epoch": 21.332945285215366, + "grad_norm": 0.16974744200706482, + "learning_rate": 3.600017149612375e-05, + "loss": 0.0088, + "step": 36650 + }, + { + "epoch": 21.338766006984866, + "grad_norm": 0.18886826932430267, + "learning_rate": 3.5973718079200935e-05, + "loss": 0.0109, + "step": 36660 + }, + { + "epoch": 21.344586728754365, + "grad_norm": 0.1367643028497696, + "learning_rate": 3.5947268923092216e-05, + "loss": 0.008, + "step": 36670 + }, + { + "epoch": 21.350407450523864, + "grad_norm": 0.15932859480381012, + "learning_rate": 3.592082403583216e-05, + "loss": 0.0095, + "step": 36680 + }, + { + "epoch": 21.356228172293363, + "grad_norm": 0.14219152927398682, + "learning_rate": 3.5894383425454004e-05, + "loss": 0.0093, + "step": 36690 + }, + { + "epoch": 21.362048894062863, + "grad_norm": 0.07904931157827377, + "learning_rate": 3.586794709998975e-05, + "loss": 0.0056, + "step": 36700 + }, + { + "epoch": 21.36786961583236, + "grad_norm": 0.13247737288475037, + "learning_rate": 3.584151506747002e-05, + "loss": 0.0081, + "step": 36710 + }, + { + "epoch": 21.37369033760186, + "grad_norm": 0.121693454682827, + "learning_rate": 3.581508733592418e-05, + "loss": 0.0105, + "step": 36720 + }, + { + "epoch": 21.379511059371364, + "grad_norm": 0.12824265658855438, + "learning_rate": 3.5788663913380297e-05, + "loss": 0.0089, + "step": 36730 + }, + { + "epoch": 21.385331781140863, + "grad_norm": 0.1608259528875351, + "learning_rate": 3.576224480786506e-05, + "loss": 0.0141, + "step": 36740 + }, + { + "epoch": 21.391152502910362, + "grad_norm": 0.1170772984623909, + "learning_rate": 3.573583002740393e-05, + "loss": 0.0092, + "step": 36750 + }, + { + "epoch": 21.39697322467986, + "grad_norm": 0.19060520827770233, + "learning_rate": 3.570941958002103e-05, + "loss": 0.0119, + "step": 36760 + }, + { + "epoch": 21.40279394644936, + "grad_norm": 0.14737844467163086, + "learning_rate": 3.568301347373912e-05, + "loss": 0.0075, + "step": 36770 + }, + { + "epoch": 21.40861466821886, + "grad_norm": 0.12083044648170471, + "learning_rate": 3.5656611716579726e-05, + "loss": 0.0086, + "step": 36780 + }, + { + "epoch": 21.41443538998836, + "grad_norm": 0.08978255838155746, + "learning_rate": 3.5630214316562946e-05, + "loss": 0.0172, + "step": 36790 + }, + { + "epoch": 21.42025611175786, + "grad_norm": 0.06108976528048515, + "learning_rate": 3.560382128170766e-05, + "loss": 0.0071, + "step": 36800 + }, + { + "epoch": 21.426076833527357, + "grad_norm": 0.11206282675266266, + "learning_rate": 3.5577432620031374e-05, + "loss": 0.0139, + "step": 36810 + }, + { + "epoch": 21.431897555296857, + "grad_norm": 0.1746027171611786, + "learning_rate": 3.5551048339550216e-05, + "loss": 0.0102, + "step": 36820 + }, + { + "epoch": 21.437718277066356, + "grad_norm": 0.1656407117843628, + "learning_rate": 3.55246684482791e-05, + "loss": 0.0112, + "step": 36830 + }, + { + "epoch": 21.443538998835855, + "grad_norm": 0.12174893915653229, + "learning_rate": 3.5498292954231496e-05, + "loss": 0.0091, + "step": 36840 + }, + { + "epoch": 21.449359720605354, + "grad_norm": 0.2008921504020691, + "learning_rate": 3.54719218654196e-05, + "loss": 0.0093, + "step": 36850 + }, + { + "epoch": 21.455180442374854, + "grad_norm": 0.12837673723697662, + "learning_rate": 3.544555518985425e-05, + "loss": 0.0081, + "step": 36860 + }, + { + "epoch": 21.461001164144353, + "grad_norm": 0.14737357199192047, + "learning_rate": 3.541919293554494e-05, + "loss": 0.0081, + "step": 36870 + }, + { + "epoch": 21.466821885913852, + "grad_norm": 0.27214354276657104, + "learning_rate": 3.539283511049985e-05, + "loss": 0.0106, + "step": 36880 + }, + { + "epoch": 21.47264260768335, + "grad_norm": 0.3114960491657257, + "learning_rate": 3.5366481722725755e-05, + "loss": 0.009, + "step": 36890 + }, + { + "epoch": 21.47846332945285, + "grad_norm": 0.20425312221050262, + "learning_rate": 3.534013278022816e-05, + "loss": 0.0085, + "step": 36900 + }, + { + "epoch": 21.484284051222353, + "grad_norm": 0.3084229528903961, + "learning_rate": 3.531378829101113e-05, + "loss": 0.012, + "step": 36910 + }, + { + "epoch": 21.490104772991852, + "grad_norm": 0.1964176595211029, + "learning_rate": 3.528744826307746e-05, + "loss": 0.0094, + "step": 36920 + }, + { + "epoch": 21.49592549476135, + "grad_norm": 0.2474251091480255, + "learning_rate": 3.5261112704428554e-05, + "loss": 0.0093, + "step": 36930 + }, + { + "epoch": 21.50174621653085, + "grad_norm": 0.2181745022535324, + "learning_rate": 3.523478162306443e-05, + "loss": 0.0077, + "step": 36940 + }, + { + "epoch": 21.50756693830035, + "grad_norm": 0.23887687921524048, + "learning_rate": 3.520845502698381e-05, + "loss": 0.008, + "step": 36950 + }, + { + "epoch": 21.51338766006985, + "grad_norm": 0.19078779220581055, + "learning_rate": 3.5182132924184005e-05, + "loss": 0.0087, + "step": 36960 + }, + { + "epoch": 21.51920838183935, + "grad_norm": 0.15912584960460663, + "learning_rate": 3.5155815322660966e-05, + "loss": 0.0116, + "step": 36970 + }, + { + "epoch": 21.525029103608848, + "grad_norm": 0.1627229005098343, + "learning_rate": 3.512950223040931e-05, + "loss": 0.0085, + "step": 36980 + }, + { + "epoch": 21.530849825378347, + "grad_norm": 0.2544988691806793, + "learning_rate": 3.5103193655422216e-05, + "loss": 0.0087, + "step": 36990 + }, + { + "epoch": 21.536670547147846, + "grad_norm": 0.08738040179014206, + "learning_rate": 3.5076889605691596e-05, + "loss": 0.0094, + "step": 37000 + }, + { + "epoch": 21.542491268917345, + "grad_norm": 0.20363691449165344, + "learning_rate": 3.505059008920787e-05, + "loss": 0.0115, + "step": 37010 + }, + { + "epoch": 21.548311990686845, + "grad_norm": 0.25958022475242615, + "learning_rate": 3.502429511396016e-05, + "loss": 0.0138, + "step": 37020 + }, + { + "epoch": 21.554132712456344, + "grad_norm": 0.12259281426668167, + "learning_rate": 3.4998004687936196e-05, + "loss": 0.0102, + "step": 37030 + }, + { + "epoch": 21.559953434225843, + "grad_norm": 0.15449564158916473, + "learning_rate": 3.497171881912229e-05, + "loss": 0.007, + "step": 37040 + }, + { + "epoch": 21.565774155995342, + "grad_norm": 0.11493966728448868, + "learning_rate": 3.494543751550342e-05, + "loss": 0.0097, + "step": 37050 + }, + { + "epoch": 21.57159487776484, + "grad_norm": 0.11191015690565109, + "learning_rate": 3.491916078506313e-05, + "loss": 0.0103, + "step": 37060 + }, + { + "epoch": 21.57741559953434, + "grad_norm": 0.09532647579908371, + "learning_rate": 3.489288863578361e-05, + "loss": 0.0139, + "step": 37070 + }, + { + "epoch": 21.583236321303843, + "grad_norm": 0.16188345849514008, + "learning_rate": 3.4866621075645646e-05, + "loss": 0.0134, + "step": 37080 + }, + { + "epoch": 21.589057043073343, + "grad_norm": 0.2161346673965454, + "learning_rate": 3.4840358112628614e-05, + "loss": 0.0099, + "step": 37090 + }, + { + "epoch": 21.594877764842842, + "grad_norm": 0.1826017200946808, + "learning_rate": 3.481409975471053e-05, + "loss": 0.0065, + "step": 37100 + }, + { + "epoch": 21.60069848661234, + "grad_norm": 0.12555089592933655, + "learning_rate": 3.4787846009867986e-05, + "loss": 0.0129, + "step": 37110 + }, + { + "epoch": 21.60651920838184, + "grad_norm": 0.24123480916023254, + "learning_rate": 3.476159688607615e-05, + "loss": 0.0109, + "step": 37120 + }, + { + "epoch": 21.61233993015134, + "grad_norm": 0.1964626908302307, + "learning_rate": 3.4735352391308854e-05, + "loss": 0.0075, + "step": 37130 + }, + { + "epoch": 21.61816065192084, + "grad_norm": 0.18868762254714966, + "learning_rate": 3.4709112533538446e-05, + "loss": 0.0104, + "step": 37140 + }, + { + "epoch": 21.623981373690338, + "grad_norm": 0.14308056235313416, + "learning_rate": 3.4682877320735934e-05, + "loss": 0.0044, + "step": 37150 + }, + { + "epoch": 21.629802095459837, + "grad_norm": 0.1629176139831543, + "learning_rate": 3.465664676087085e-05, + "loss": 0.0115, + "step": 37160 + }, + { + "epoch": 21.635622817229336, + "grad_norm": 0.18380184471607208, + "learning_rate": 3.463042086191136e-05, + "loss": 0.0105, + "step": 37170 + }, + { + "epoch": 21.641443538998836, + "grad_norm": 0.19662690162658691, + "learning_rate": 3.460419963182423e-05, + "loss": 0.012, + "step": 37180 + }, + { + "epoch": 21.647264260768335, + "grad_norm": 0.11319395154714584, + "learning_rate": 3.457798307857473e-05, + "loss": 0.0106, + "step": 37190 + }, + { + "epoch": 21.653084982537834, + "grad_norm": 0.2549956142902374, + "learning_rate": 3.455177121012678e-05, + "loss": 0.0106, + "step": 37200 + }, + { + "epoch": 21.658905704307333, + "grad_norm": 0.18213893473148346, + "learning_rate": 3.452556403444285e-05, + "loss": 0.0101, + "step": 37210 + }, + { + "epoch": 21.664726426076832, + "grad_norm": 0.25493454933166504, + "learning_rate": 3.4499361559483975e-05, + "loss": 0.008, + "step": 37220 + }, + { + "epoch": 21.67054714784633, + "grad_norm": 0.1668849140405655, + "learning_rate": 3.44731637932098e-05, + "loss": 0.0069, + "step": 37230 + }, + { + "epoch": 21.67636786961583, + "grad_norm": 0.14671356976032257, + "learning_rate": 3.44469707435785e-05, + "loss": 0.0086, + "step": 37240 + }, + { + "epoch": 21.682188591385334, + "grad_norm": 0.13860267400741577, + "learning_rate": 3.4420782418546835e-05, + "loss": 0.0084, + "step": 37250 + }, + { + "epoch": 21.688009313154833, + "grad_norm": 0.10626638680696487, + "learning_rate": 3.439459882607012e-05, + "loss": 0.0092, + "step": 37260 + }, + { + "epoch": 21.693830034924332, + "grad_norm": 0.16178816556930542, + "learning_rate": 3.436841997410225e-05, + "loss": 0.0074, + "step": 37270 + }, + { + "epoch": 21.69965075669383, + "grad_norm": 0.08834406733512878, + "learning_rate": 3.434224587059567e-05, + "loss": 0.0077, + "step": 37280 + }, + { + "epoch": 21.70547147846333, + "grad_norm": 0.09738211333751678, + "learning_rate": 3.431607652350136e-05, + "loss": 0.0093, + "step": 37290 + }, + { + "epoch": 21.71129220023283, + "grad_norm": 0.08760903775691986, + "learning_rate": 3.428991194076891e-05, + "loss": 0.0077, + "step": 37300 + }, + { + "epoch": 21.71711292200233, + "grad_norm": 0.1255377233028412, + "learning_rate": 3.4263752130346394e-05, + "loss": 0.0079, + "step": 37310 + }, + { + "epoch": 21.722933643771828, + "grad_norm": 0.2213071882724762, + "learning_rate": 3.4237597100180515e-05, + "loss": 0.0086, + "step": 37320 + }, + { + "epoch": 21.728754365541327, + "grad_norm": 0.21287856996059418, + "learning_rate": 3.4211446858216427e-05, + "loss": 0.0092, + "step": 37330 + }, + { + "epoch": 21.734575087310827, + "grad_norm": 0.22559580206871033, + "learning_rate": 3.4185301412397915e-05, + "loss": 0.0199, + "step": 37340 + }, + { + "epoch": 21.740395809080326, + "grad_norm": 0.11354909092187881, + "learning_rate": 3.415916077066729e-05, + "loss": 0.0132, + "step": 37350 + }, + { + "epoch": 21.746216530849825, + "grad_norm": 0.13346216082572937, + "learning_rate": 3.413302494096535e-05, + "loss": 0.0063, + "step": 37360 + }, + { + "epoch": 21.752037252619324, + "grad_norm": 0.11355914175510406, + "learning_rate": 3.410689393123151e-05, + "loss": 0.0101, + "step": 37370 + }, + { + "epoch": 21.757857974388823, + "grad_norm": 0.24918515980243683, + "learning_rate": 3.408076774940364e-05, + "loss": 0.0087, + "step": 37380 + }, + { + "epoch": 21.763678696158323, + "grad_norm": 0.24233485758304596, + "learning_rate": 3.40546464034182e-05, + "loss": 0.0093, + "step": 37390 + }, + { + "epoch": 21.769499417927822, + "grad_norm": 0.09482322633266449, + "learning_rate": 3.4028529901210185e-05, + "loss": 0.0141, + "step": 37400 + }, + { + "epoch": 21.77532013969732, + "grad_norm": 0.13295350968837738, + "learning_rate": 3.4002418250713086e-05, + "loss": 0.0123, + "step": 37410 + }, + { + "epoch": 21.78114086146682, + "grad_norm": 0.15955416858196259, + "learning_rate": 3.3976311459858936e-05, + "loss": 0.0073, + "step": 37420 + }, + { + "epoch": 21.78696158323632, + "grad_norm": 0.22964821755886078, + "learning_rate": 3.395020953657826e-05, + "loss": 0.009, + "step": 37430 + }, + { + "epoch": 21.792782305005822, + "grad_norm": 0.09732450544834137, + "learning_rate": 3.3924112488800165e-05, + "loss": 0.0068, + "step": 37440 + }, + { + "epoch": 21.79860302677532, + "grad_norm": 0.12352339178323746, + "learning_rate": 3.389802032445225e-05, + "loss": 0.0083, + "step": 37450 + }, + { + "epoch": 21.80442374854482, + "grad_norm": 0.14224225282669067, + "learning_rate": 3.38719330514606e-05, + "loss": 0.0075, + "step": 37460 + }, + { + "epoch": 21.81024447031432, + "grad_norm": 0.11341757327318192, + "learning_rate": 3.3845850677749866e-05, + "loss": 0.0095, + "step": 37470 + }, + { + "epoch": 21.81606519208382, + "grad_norm": 0.17247715592384338, + "learning_rate": 3.3819773211243157e-05, + "loss": 0.0077, + "step": 37480 + }, + { + "epoch": 21.82188591385332, + "grad_norm": 0.10120856761932373, + "learning_rate": 3.379370065986213e-05, + "loss": 0.006, + "step": 37490 + }, + { + "epoch": 21.827706635622818, + "grad_norm": 0.16100338101387024, + "learning_rate": 3.3767633031526955e-05, + "loss": 0.0072, + "step": 37500 + }, + { + "epoch": 21.833527357392317, + "grad_norm": 0.10797702521085739, + "learning_rate": 3.374157033415626e-05, + "loss": 0.0064, + "step": 37510 + }, + { + "epoch": 21.839348079161816, + "grad_norm": 0.1938280165195465, + "learning_rate": 3.371551257566723e-05, + "loss": 0.0077, + "step": 37520 + }, + { + "epoch": 21.845168800931315, + "grad_norm": 0.14447607100009918, + "learning_rate": 3.36894597639755e-05, + "loss": 0.0109, + "step": 37530 + }, + { + "epoch": 21.850989522700814, + "grad_norm": 0.13848425447940826, + "learning_rate": 3.366341190699523e-05, + "loss": 0.009, + "step": 37540 + }, + { + "epoch": 21.856810244470314, + "grad_norm": 0.2501564621925354, + "learning_rate": 3.36373690126391e-05, + "loss": 0.0075, + "step": 37550 + }, + { + "epoch": 21.862630966239813, + "grad_norm": 0.16157715022563934, + "learning_rate": 3.3611331088818234e-05, + "loss": 0.011, + "step": 37560 + }, + { + "epoch": 21.868451688009312, + "grad_norm": 0.13123837113380432, + "learning_rate": 3.3585298143442265e-05, + "loss": 0.0107, + "step": 37570 + }, + { + "epoch": 21.87427240977881, + "grad_norm": 0.1708393394947052, + "learning_rate": 3.35592701844193e-05, + "loss": 0.0103, + "step": 37580 + }, + { + "epoch": 21.88009313154831, + "grad_norm": 0.13507458567619324, + "learning_rate": 3.353324721965596e-05, + "loss": 0.0108, + "step": 37590 + }, + { + "epoch": 21.88591385331781, + "grad_norm": 0.20074741542339325, + "learning_rate": 3.350722925705736e-05, + "loss": 0.008, + "step": 37600 + }, + { + "epoch": 21.891734575087312, + "grad_norm": 0.12787960469722748, + "learning_rate": 3.348121630452703e-05, + "loss": 0.0126, + "step": 37610 + }, + { + "epoch": 21.89755529685681, + "grad_norm": 0.22813692688941956, + "learning_rate": 3.3455208369967044e-05, + "loss": 0.009, + "step": 37620 + }, + { + "epoch": 21.90337601862631, + "grad_norm": 0.17291571199893951, + "learning_rate": 3.34292054612779e-05, + "loss": 0.0086, + "step": 37630 + }, + { + "epoch": 21.90919674039581, + "grad_norm": 0.25887903571128845, + "learning_rate": 3.340320758635861e-05, + "loss": 0.0118, + "step": 37640 + }, + { + "epoch": 21.91501746216531, + "grad_norm": 0.14556629955768585, + "learning_rate": 3.337721475310666e-05, + "loss": 0.0091, + "step": 37650 + }, + { + "epoch": 21.92083818393481, + "grad_norm": 0.1562197357416153, + "learning_rate": 3.335122696941795e-05, + "loss": 0.0086, + "step": 37660 + }, + { + "epoch": 21.926658905704308, + "grad_norm": 0.20266146957874298, + "learning_rate": 3.332524424318692e-05, + "loss": 0.007, + "step": 37670 + }, + { + "epoch": 21.932479627473807, + "grad_norm": 0.1357797086238861, + "learning_rate": 3.32992665823064e-05, + "loss": 0.0072, + "step": 37680 + }, + { + "epoch": 21.938300349243306, + "grad_norm": 0.14495985209941864, + "learning_rate": 3.327329399466774e-05, + "loss": 0.011, + "step": 37690 + }, + { + "epoch": 21.944121071012805, + "grad_norm": 0.15738581120967865, + "learning_rate": 3.324732648816072e-05, + "loss": 0.0077, + "step": 37700 + }, + { + "epoch": 21.949941792782305, + "grad_norm": 0.215051531791687, + "learning_rate": 3.322136407067358e-05, + "loss": 0.0062, + "step": 37710 + }, + { + "epoch": 21.955762514551804, + "grad_norm": 0.22480499744415283, + "learning_rate": 3.3195406750093036e-05, + "loss": 0.0084, + "step": 37720 + }, + { + "epoch": 21.961583236321303, + "grad_norm": 0.15371163189411163, + "learning_rate": 3.3169454534304205e-05, + "loss": 0.0076, + "step": 37730 + }, + { + "epoch": 21.967403958090802, + "grad_norm": 0.16587652266025543, + "learning_rate": 3.3143507431190725e-05, + "loss": 0.0108, + "step": 37740 + }, + { + "epoch": 21.9732246798603, + "grad_norm": 0.18056705594062805, + "learning_rate": 3.311756544863459e-05, + "loss": 0.0073, + "step": 37750 + }, + { + "epoch": 21.9790454016298, + "grad_norm": 0.1608642041683197, + "learning_rate": 3.309162859451633e-05, + "loss": 0.0113, + "step": 37760 + }, + { + "epoch": 21.9848661233993, + "grad_norm": 0.12285313010215759, + "learning_rate": 3.306569687671487e-05, + "loss": 0.0072, + "step": 37770 + }, + { + "epoch": 21.990686845168803, + "grad_norm": 0.1362115889787674, + "learning_rate": 3.303977030310756e-05, + "loss": 0.0079, + "step": 37780 + }, + { + "epoch": 21.996507566938302, + "grad_norm": 0.24659082293510437, + "learning_rate": 3.3013848881570245e-05, + "loss": 0.0071, + "step": 37790 + }, + { + "epoch": 22.0023282887078, + "grad_norm": 0.11912588030099869, + "learning_rate": 3.298793261997712e-05, + "loss": 0.0138, + "step": 37800 + }, + { + "epoch": 22.0081490104773, + "grad_norm": 0.1277562975883484, + "learning_rate": 3.2962021526200893e-05, + "loss": 0.0086, + "step": 37810 + }, + { + "epoch": 22.0139697322468, + "grad_norm": 0.12099479883909225, + "learning_rate": 3.293611560811268e-05, + "loss": 0.0096, + "step": 37820 + }, + { + "epoch": 22.0197904540163, + "grad_norm": 0.20841747522354126, + "learning_rate": 3.291021487358199e-05, + "loss": 0.0118, + "step": 37830 + }, + { + "epoch": 22.025611175785798, + "grad_norm": 0.1979352980852127, + "learning_rate": 3.28843193304768e-05, + "loss": 0.0131, + "step": 37840 + }, + { + "epoch": 22.031431897555297, + "grad_norm": 0.1803765594959259, + "learning_rate": 3.2858428986663456e-05, + "loss": 0.0112, + "step": 37850 + }, + { + "epoch": 22.037252619324796, + "grad_norm": 0.13867312669754028, + "learning_rate": 3.283254385000681e-05, + "loss": 0.0069, + "step": 37860 + }, + { + "epoch": 22.043073341094296, + "grad_norm": 0.13759000599384308, + "learning_rate": 3.2806663928370076e-05, + "loss": 0.008, + "step": 37870 + }, + { + "epoch": 22.048894062863795, + "grad_norm": 0.11202666163444519, + "learning_rate": 3.278078922961485e-05, + "loss": 0.0081, + "step": 37880 + }, + { + "epoch": 22.054714784633294, + "grad_norm": 0.11808265000581741, + "learning_rate": 3.275491976160123e-05, + "loss": 0.0068, + "step": 37890 + }, + { + "epoch": 22.060535506402793, + "grad_norm": 0.11243658512830734, + "learning_rate": 3.2729055532187645e-05, + "loss": 0.0064, + "step": 37900 + }, + { + "epoch": 22.066356228172292, + "grad_norm": 0.2295374870300293, + "learning_rate": 3.270319654923097e-05, + "loss": 0.0108, + "step": 37910 + }, + { + "epoch": 22.07217694994179, + "grad_norm": 0.20264501869678497, + "learning_rate": 3.2677342820586506e-05, + "loss": 0.0089, + "step": 37920 + }, + { + "epoch": 22.07799767171129, + "grad_norm": 0.2135523408651352, + "learning_rate": 3.2651494354107905e-05, + "loss": 0.0102, + "step": 37930 + }, + { + "epoch": 22.08381839348079, + "grad_norm": 0.13886304199695587, + "learning_rate": 3.2625651157647266e-05, + "loss": 0.0075, + "step": 37940 + }, + { + "epoch": 22.08963911525029, + "grad_norm": 0.12620025873184204, + "learning_rate": 3.259981323905505e-05, + "loss": 0.0093, + "step": 37950 + }, + { + "epoch": 22.095459837019792, + "grad_norm": 0.11367858946323395, + "learning_rate": 3.257398060618014e-05, + "loss": 0.0073, + "step": 37960 + }, + { + "epoch": 22.10128055878929, + "grad_norm": 0.14410483837127686, + "learning_rate": 3.254815326686983e-05, + "loss": 0.0064, + "step": 37970 + }, + { + "epoch": 22.10710128055879, + "grad_norm": 0.11918853968381882, + "learning_rate": 3.2522331228969774e-05, + "loss": 0.0058, + "step": 37980 + }, + { + "epoch": 22.11292200232829, + "grad_norm": 0.20620185136795044, + "learning_rate": 3.2496514500324006e-05, + "loss": 0.0087, + "step": 37990 + }, + { + "epoch": 22.11874272409779, + "grad_norm": 0.09768659621477127, + "learning_rate": 3.247070308877498e-05, + "loss": 0.0058, + "step": 38000 + }, + { + "epoch": 22.124563445867288, + "grad_norm": 0.12013177573680878, + "learning_rate": 3.2444897002163515e-05, + "loss": 0.0084, + "step": 38010 + }, + { + "epoch": 22.130384167636787, + "grad_norm": 0.17559556663036346, + "learning_rate": 3.241909624832885e-05, + "loss": 0.007, + "step": 38020 + }, + { + "epoch": 22.136204889406287, + "grad_norm": 0.11839792132377625, + "learning_rate": 3.239330083510852e-05, + "loss": 0.0103, + "step": 38030 + }, + { + "epoch": 22.142025611175786, + "grad_norm": 0.12098522484302521, + "learning_rate": 3.236751077033855e-05, + "loss": 0.0081, + "step": 38040 + }, + { + "epoch": 22.147846332945285, + "grad_norm": 0.20817261934280396, + "learning_rate": 3.234172606185322e-05, + "loss": 0.0106, + "step": 38050 + }, + { + "epoch": 22.153667054714784, + "grad_norm": 0.14292237162590027, + "learning_rate": 3.231594671748528e-05, + "loss": 0.01, + "step": 38060 + }, + { + "epoch": 22.159487776484283, + "grad_norm": 0.13491979241371155, + "learning_rate": 3.2290172745065815e-05, + "loss": 0.0065, + "step": 38070 + }, + { + "epoch": 22.165308498253783, + "grad_norm": 0.12541276216506958, + "learning_rate": 3.226440415242426e-05, + "loss": 0.0085, + "step": 38080 + }, + { + "epoch": 22.171129220023282, + "grad_norm": 0.10979664325714111, + "learning_rate": 3.223864094738846e-05, + "loss": 0.0075, + "step": 38090 + }, + { + "epoch": 22.17694994179278, + "grad_norm": 0.1053968146443367, + "learning_rate": 3.221288313778456e-05, + "loss": 0.0059, + "step": 38100 + }, + { + "epoch": 22.18277066356228, + "grad_norm": 0.07114506512880325, + "learning_rate": 3.2187130731437125e-05, + "loss": 0.0063, + "step": 38110 + }, + { + "epoch": 22.18859138533178, + "grad_norm": 0.09099091589450836, + "learning_rate": 3.216138373616905e-05, + "loss": 0.0087, + "step": 38120 + }, + { + "epoch": 22.194412107101282, + "grad_norm": 0.14609907567501068, + "learning_rate": 3.21356421598016e-05, + "loss": 0.0088, + "step": 38130 + }, + { + "epoch": 22.20023282887078, + "grad_norm": 0.2344403713941574, + "learning_rate": 3.210990601015438e-05, + "loss": 0.0093, + "step": 38140 + }, + { + "epoch": 22.20605355064028, + "grad_norm": 0.17477379739284515, + "learning_rate": 3.208417529504535e-05, + "loss": 0.0081, + "step": 38150 + }, + { + "epoch": 22.21187427240978, + "grad_norm": 0.19156326353549957, + "learning_rate": 3.205845002229084e-05, + "loss": 0.01, + "step": 38160 + }, + { + "epoch": 22.21769499417928, + "grad_norm": 0.26499485969543457, + "learning_rate": 3.203273019970547e-05, + "loss": 0.0065, + "step": 38170 + }, + { + "epoch": 22.22351571594878, + "grad_norm": 0.2517027258872986, + "learning_rate": 3.200701583510227e-05, + "loss": 0.01, + "step": 38180 + }, + { + "epoch": 22.229336437718278, + "grad_norm": 0.1032700464129448, + "learning_rate": 3.198130693629261e-05, + "loss": 0.0085, + "step": 38190 + }, + { + "epoch": 22.235157159487777, + "grad_norm": 0.18145860731601715, + "learning_rate": 3.195560351108612e-05, + "loss": 0.0095, + "step": 38200 + }, + { + "epoch": 22.240977881257276, + "grad_norm": 0.1524345427751541, + "learning_rate": 3.1929905567290865e-05, + "loss": 0.0064, + "step": 38210 + }, + { + "epoch": 22.246798603026775, + "grad_norm": 0.1242697536945343, + "learning_rate": 3.1904213112713164e-05, + "loss": 0.0079, + "step": 38220 + }, + { + "epoch": 22.252619324796274, + "grad_norm": 0.12598064541816711, + "learning_rate": 3.187852615515774e-05, + "loss": 0.0104, + "step": 38230 + }, + { + "epoch": 22.258440046565774, + "grad_norm": 0.1425088346004486, + "learning_rate": 3.1852844702427606e-05, + "loss": 0.0057, + "step": 38240 + }, + { + "epoch": 22.264260768335273, + "grad_norm": 0.1627953052520752, + "learning_rate": 3.18271687623241e-05, + "loss": 0.009, + "step": 38250 + }, + { + "epoch": 22.270081490104772, + "grad_norm": 0.1952623426914215, + "learning_rate": 3.1801498342646896e-05, + "loss": 0.011, + "step": 38260 + }, + { + "epoch": 22.27590221187427, + "grad_norm": 0.19170480966567993, + "learning_rate": 3.177583345119398e-05, + "loss": 0.0098, + "step": 38270 + }, + { + "epoch": 22.28172293364377, + "grad_norm": 0.19695155322551727, + "learning_rate": 3.17501740957617e-05, + "loss": 0.0072, + "step": 38280 + }, + { + "epoch": 22.28754365541327, + "grad_norm": 0.1533876210451126, + "learning_rate": 3.172452028414467e-05, + "loss": 0.0085, + "step": 38290 + }, + { + "epoch": 22.29336437718277, + "grad_norm": 0.2413344830274582, + "learning_rate": 3.169887202413583e-05, + "loss": 0.0086, + "step": 38300 + }, + { + "epoch": 22.29918509895227, + "grad_norm": 0.22592076659202576, + "learning_rate": 3.167322932352646e-05, + "loss": 0.0102, + "step": 38310 + }, + { + "epoch": 22.30500582072177, + "grad_norm": 0.10436265915632248, + "learning_rate": 3.164759219010613e-05, + "loss": 0.0082, + "step": 38320 + }, + { + "epoch": 22.31082654249127, + "grad_norm": 0.16613857448101044, + "learning_rate": 3.1621960631662725e-05, + "loss": 0.0085, + "step": 38330 + }, + { + "epoch": 22.31664726426077, + "grad_norm": 0.18556301295757294, + "learning_rate": 3.159633465598245e-05, + "loss": 0.0105, + "step": 38340 + }, + { + "epoch": 22.32246798603027, + "grad_norm": 0.17288188636302948, + "learning_rate": 3.1570714270849767e-05, + "loss": 0.0076, + "step": 38350 + }, + { + "epoch": 22.328288707799768, + "grad_norm": 0.13864701986312866, + "learning_rate": 3.1545099484047516e-05, + "loss": 0.0085, + "step": 38360 + }, + { + "epoch": 22.334109429569267, + "grad_norm": 0.22033382952213287, + "learning_rate": 3.151949030335674e-05, + "loss": 0.0063, + "step": 38370 + }, + { + "epoch": 22.339930151338766, + "grad_norm": 0.11223684996366501, + "learning_rate": 3.149388673655687e-05, + "loss": 0.0093, + "step": 38380 + }, + { + "epoch": 22.345750873108265, + "grad_norm": 0.21824783086776733, + "learning_rate": 3.146828879142559e-05, + "loss": 0.01, + "step": 38390 + }, + { + "epoch": 22.351571594877765, + "grad_norm": 0.1606181263923645, + "learning_rate": 3.1442696475738866e-05, + "loss": 0.0064, + "step": 38400 + }, + { + "epoch": 22.357392316647264, + "grad_norm": 0.22994069755077362, + "learning_rate": 3.141710979727098e-05, + "loss": 0.0069, + "step": 38410 + }, + { + "epoch": 22.363213038416763, + "grad_norm": 0.1404743641614914, + "learning_rate": 3.139152876379447e-05, + "loss": 0.0065, + "step": 38420 + }, + { + "epoch": 22.369033760186262, + "grad_norm": 0.21032197773456573, + "learning_rate": 3.1365953383080214e-05, + "loss": 0.0088, + "step": 38430 + }, + { + "epoch": 22.37485448195576, + "grad_norm": 0.29695650935173035, + "learning_rate": 3.134038366289731e-05, + "loss": 0.0082, + "step": 38440 + }, + { + "epoch": 22.38067520372526, + "grad_norm": 0.1743500679731369, + "learning_rate": 3.131481961101317e-05, + "loss": 0.0116, + "step": 38450 + }, + { + "epoch": 22.38649592549476, + "grad_norm": 0.14662855863571167, + "learning_rate": 3.128926123519349e-05, + "loss": 0.0095, + "step": 38460 + }, + { + "epoch": 22.39231664726426, + "grad_norm": 0.12636899948120117, + "learning_rate": 3.1263708543202194e-05, + "loss": 0.0052, + "step": 38470 + }, + { + "epoch": 22.398137369033762, + "grad_norm": 0.20186039805412292, + "learning_rate": 3.123816154280155e-05, + "loss": 0.0115, + "step": 38480 + }, + { + "epoch": 22.40395809080326, + "grad_norm": 0.16104711592197418, + "learning_rate": 3.121262024175207e-05, + "loss": 0.0088, + "step": 38490 + }, + { + "epoch": 22.40977881257276, + "grad_norm": 0.2026391327381134, + "learning_rate": 3.118708464781248e-05, + "loss": 0.0083, + "step": 38500 + }, + { + "epoch": 22.41559953434226, + "grad_norm": 0.19069211184978485, + "learning_rate": 3.116155476873987e-05, + "loss": 0.0092, + "step": 38510 + }, + { + "epoch": 22.42142025611176, + "grad_norm": 0.12789221107959747, + "learning_rate": 3.11360306122895e-05, + "loss": 0.0085, + "step": 38520 + }, + { + "epoch": 22.427240977881258, + "grad_norm": 0.16926942765712738, + "learning_rate": 3.1110512186214975e-05, + "loss": 0.0085, + "step": 38530 + }, + { + "epoch": 22.433061699650757, + "grad_norm": 0.16523709893226624, + "learning_rate": 3.1084999498268095e-05, + "loss": 0.0078, + "step": 38540 + }, + { + "epoch": 22.438882421420256, + "grad_norm": 0.16105395555496216, + "learning_rate": 3.1059492556198934e-05, + "loss": 0.01, + "step": 38550 + }, + { + "epoch": 22.444703143189756, + "grad_norm": 0.2076508104801178, + "learning_rate": 3.103399136775586e-05, + "loss": 0.0109, + "step": 38560 + }, + { + "epoch": 22.450523864959255, + "grad_norm": 0.11279261112213135, + "learning_rate": 3.100849594068541e-05, + "loss": 0.01, + "step": 38570 + }, + { + "epoch": 22.456344586728754, + "grad_norm": 0.1519061177968979, + "learning_rate": 3.0983006282732484e-05, + "loss": 0.0083, + "step": 38580 + }, + { + "epoch": 22.462165308498253, + "grad_norm": 0.15773610770702362, + "learning_rate": 3.0957522401640116e-05, + "loss": 0.0086, + "step": 38590 + }, + { + "epoch": 22.467986030267753, + "grad_norm": 0.1203511580824852, + "learning_rate": 3.0932044305149645e-05, + "loss": 0.0063, + "step": 38600 + }, + { + "epoch": 22.47380675203725, + "grad_norm": 0.1535172164440155, + "learning_rate": 3.090657200100068e-05, + "loss": 0.0071, + "step": 38610 + }, + { + "epoch": 22.47962747380675, + "grad_norm": 0.17775219678878784, + "learning_rate": 3.088110549693099e-05, + "loss": 0.0082, + "step": 38620 + }, + { + "epoch": 22.48544819557625, + "grad_norm": 0.1130376011133194, + "learning_rate": 3.085564480067667e-05, + "loss": 0.0068, + "step": 38630 + }, + { + "epoch": 22.49126891734575, + "grad_norm": 0.11384267359972, + "learning_rate": 3.0830189919971955e-05, + "loss": 0.0068, + "step": 38640 + }, + { + "epoch": 22.49708963911525, + "grad_norm": 0.13855722546577454, + "learning_rate": 3.080474086254939e-05, + "loss": 0.0088, + "step": 38650 + }, + { + "epoch": 22.50291036088475, + "grad_norm": 0.09889696538448334, + "learning_rate": 3.077929763613975e-05, + "loss": 0.0071, + "step": 38660 + }, + { + "epoch": 22.50873108265425, + "grad_norm": 0.07577607780694962, + "learning_rate": 3.075386024847198e-05, + "loss": 0.0096, + "step": 38670 + }, + { + "epoch": 22.51455180442375, + "grad_norm": 0.06671173870563507, + "learning_rate": 3.072842870727331e-05, + "loss": 0.0069, + "step": 38680 + }, + { + "epoch": 22.52037252619325, + "grad_norm": 0.1505860984325409, + "learning_rate": 3.070300302026916e-05, + "loss": 0.0076, + "step": 38690 + }, + { + "epoch": 22.52619324796275, + "grad_norm": 0.1897744983434677, + "learning_rate": 3.067758319518318e-05, + "loss": 0.0112, + "step": 38700 + }, + { + "epoch": 22.532013969732247, + "grad_norm": 0.14099399745464325, + "learning_rate": 3.065216923973725e-05, + "loss": 0.0073, + "step": 38710 + }, + { + "epoch": 22.537834691501747, + "grad_norm": 0.1574043333530426, + "learning_rate": 3.062676116165145e-05, + "loss": 0.0062, + "step": 38720 + }, + { + "epoch": 22.543655413271246, + "grad_norm": 0.15579453110694885, + "learning_rate": 3.06013589686441e-05, + "loss": 0.0068, + "step": 38730 + }, + { + "epoch": 22.549476135040745, + "grad_norm": 0.1574985235929489, + "learning_rate": 3.05759626684317e-05, + "loss": 0.0076, + "step": 38740 + }, + { + "epoch": 22.555296856810244, + "grad_norm": 0.10082177072763443, + "learning_rate": 3.055057226872896e-05, + "loss": 0.0083, + "step": 38750 + }, + { + "epoch": 22.561117578579744, + "grad_norm": 0.08104535937309265, + "learning_rate": 3.052518777724887e-05, + "loss": 0.0088, + "step": 38760 + }, + { + "epoch": 22.566938300349243, + "grad_norm": 0.1084103137254715, + "learning_rate": 3.04998092017025e-05, + "loss": 0.0069, + "step": 38770 + }, + { + "epoch": 22.572759022118742, + "grad_norm": 0.1399247646331787, + "learning_rate": 3.0474436549799246e-05, + "loss": 0.0107, + "step": 38780 + }, + { + "epoch": 22.57857974388824, + "grad_norm": 0.18985311686992645, + "learning_rate": 3.044906982924661e-05, + "loss": 0.0068, + "step": 38790 + }, + { + "epoch": 22.58440046565774, + "grad_norm": 0.13458320498466492, + "learning_rate": 3.0423709047750337e-05, + "loss": 0.0068, + "step": 38800 + }, + { + "epoch": 22.59022118742724, + "grad_norm": 0.24160286784172058, + "learning_rate": 3.03983542130144e-05, + "loss": 0.0085, + "step": 38810 + }, + { + "epoch": 22.59604190919674, + "grad_norm": 0.1800495982170105, + "learning_rate": 3.0373005332740877e-05, + "loss": 0.0114, + "step": 38820 + }, + { + "epoch": 22.601862630966238, + "grad_norm": 0.15226741135120392, + "learning_rate": 3.034766241463013e-05, + "loss": 0.0073, + "step": 38830 + }, + { + "epoch": 22.60768335273574, + "grad_norm": 0.15186239778995514, + "learning_rate": 3.032232546638064e-05, + "loss": 0.0066, + "step": 38840 + }, + { + "epoch": 22.61350407450524, + "grad_norm": 0.15201212465763092, + "learning_rate": 3.0296994495689114e-05, + "loss": 0.0076, + "step": 38850 + }, + { + "epoch": 22.61932479627474, + "grad_norm": 0.16236385703086853, + "learning_rate": 3.0271669510250444e-05, + "loss": 0.0081, + "step": 38860 + }, + { + "epoch": 22.62514551804424, + "grad_norm": 0.12345056980848312, + "learning_rate": 3.024635051775766e-05, + "loss": 0.0077, + "step": 38870 + }, + { + "epoch": 22.630966239813738, + "grad_norm": 0.18448787927627563, + "learning_rate": 3.022103752590205e-05, + "loss": 0.0112, + "step": 38880 + }, + { + "epoch": 22.636786961583237, + "grad_norm": 0.2191372811794281, + "learning_rate": 3.0195730542372992e-05, + "loss": 0.0095, + "step": 38890 + }, + { + "epoch": 22.642607683352736, + "grad_norm": 0.15287525951862335, + "learning_rate": 3.0170429574858084e-05, + "loss": 0.0062, + "step": 38900 + }, + { + "epoch": 22.648428405122235, + "grad_norm": 0.12737469375133514, + "learning_rate": 3.0145134631043127e-05, + "loss": 0.008, + "step": 38910 + }, + { + "epoch": 22.654249126891735, + "grad_norm": 0.1288687288761139, + "learning_rate": 3.0119845718612018e-05, + "loss": 0.007, + "step": 38920 + }, + { + "epoch": 22.660069848661234, + "grad_norm": 0.16744843125343323, + "learning_rate": 3.009456284524688e-05, + "loss": 0.0104, + "step": 38930 + }, + { + "epoch": 22.665890570430733, + "grad_norm": 0.10451580584049225, + "learning_rate": 3.0069286018627967e-05, + "loss": 0.0089, + "step": 38940 + }, + { + "epoch": 22.671711292200232, + "grad_norm": 0.16577844321727753, + "learning_rate": 3.0044015246433743e-05, + "loss": 0.0128, + "step": 38950 + }, + { + "epoch": 22.67753201396973, + "grad_norm": 0.1632094830274582, + "learning_rate": 3.0018750536340755e-05, + "loss": 0.0119, + "step": 38960 + }, + { + "epoch": 22.68335273573923, + "grad_norm": 0.12207971513271332, + "learning_rate": 2.999349189602378e-05, + "loss": 0.0152, + "step": 38970 + }, + { + "epoch": 22.68917345750873, + "grad_norm": 0.1054726392030716, + "learning_rate": 2.9968239333155733e-05, + "loss": 0.0076, + "step": 38980 + }, + { + "epoch": 22.69499417927823, + "grad_norm": 0.09044294059276581, + "learning_rate": 2.994299285540767e-05, + "loss": 0.0081, + "step": 38990 + }, + { + "epoch": 22.70081490104773, + "grad_norm": 0.1453421413898468, + "learning_rate": 2.9917752470448813e-05, + "loss": 0.0074, + "step": 39000 + }, + { + "epoch": 22.70663562281723, + "grad_norm": 0.11506839096546173, + "learning_rate": 2.9892518185946495e-05, + "loss": 0.0066, + "step": 39010 + }, + { + "epoch": 22.71245634458673, + "grad_norm": 0.1503496766090393, + "learning_rate": 2.986729000956624e-05, + "loss": 0.0104, + "step": 39020 + }, + { + "epoch": 22.71827706635623, + "grad_norm": 0.12743543088436127, + "learning_rate": 2.9842067948971736e-05, + "loss": 0.0082, + "step": 39030 + }, + { + "epoch": 22.72409778812573, + "grad_norm": 0.09971607476472855, + "learning_rate": 2.9816852011824727e-05, + "loss": 0.0063, + "step": 39040 + }, + { + "epoch": 22.729918509895228, + "grad_norm": 0.10578136891126633, + "learning_rate": 2.979164220578519e-05, + "loss": 0.0085, + "step": 39050 + }, + { + "epoch": 22.735739231664727, + "grad_norm": 0.11170454323291779, + "learning_rate": 2.9766438538511165e-05, + "loss": 0.0097, + "step": 39060 + }, + { + "epoch": 22.741559953434226, + "grad_norm": 0.22696131467819214, + "learning_rate": 2.9741241017658873e-05, + "loss": 0.0082, + "step": 39070 + }, + { + "epoch": 22.747380675203726, + "grad_norm": 0.22668980062007904, + "learning_rate": 2.971604965088267e-05, + "loss": 0.0084, + "step": 39080 + }, + { + "epoch": 22.753201396973225, + "grad_norm": 0.17276953160762787, + "learning_rate": 2.9690864445835008e-05, + "loss": 0.0074, + "step": 39090 + }, + { + "epoch": 22.759022118742724, + "grad_norm": 0.11487296968698502, + "learning_rate": 2.966568541016651e-05, + "loss": 0.0159, + "step": 39100 + }, + { + "epoch": 22.764842840512223, + "grad_norm": 0.22137793898582458, + "learning_rate": 2.9640512551525867e-05, + "loss": 0.0113, + "step": 39110 + }, + { + "epoch": 22.770663562281722, + "grad_norm": 0.1834760457277298, + "learning_rate": 2.961534587755995e-05, + "loss": 0.0081, + "step": 39120 + }, + { + "epoch": 22.77648428405122, + "grad_norm": 0.1840173900127411, + "learning_rate": 2.959018539591375e-05, + "loss": 0.0101, + "step": 39130 + }, + { + "epoch": 22.78230500582072, + "grad_norm": 0.1435650736093521, + "learning_rate": 2.9565031114230325e-05, + "loss": 0.0097, + "step": 39140 + }, + { + "epoch": 22.78812572759022, + "grad_norm": 0.16497108340263367, + "learning_rate": 2.9539883040150895e-05, + "loss": 0.0093, + "step": 39150 + }, + { + "epoch": 22.79394644935972, + "grad_norm": 0.15126170217990875, + "learning_rate": 2.9514741181314774e-05, + "loss": 0.0099, + "step": 39160 + }, + { + "epoch": 22.79976717112922, + "grad_norm": 0.18575669825077057, + "learning_rate": 2.94896055453594e-05, + "loss": 0.0053, + "step": 39170 + }, + { + "epoch": 22.80558789289872, + "grad_norm": 0.1400071233510971, + "learning_rate": 2.9464476139920332e-05, + "loss": 0.0088, + "step": 39180 + }, + { + "epoch": 22.81140861466822, + "grad_norm": 0.1167982742190361, + "learning_rate": 2.9439352972631186e-05, + "loss": 0.006, + "step": 39190 + }, + { + "epoch": 22.81722933643772, + "grad_norm": 0.10892492532730103, + "learning_rate": 2.9414236051123757e-05, + "loss": 0.0091, + "step": 39200 + }, + { + "epoch": 22.82305005820722, + "grad_norm": 0.1521719992160797, + "learning_rate": 2.938912538302785e-05, + "loss": 0.0071, + "step": 39210 + }, + { + "epoch": 22.828870779976718, + "grad_norm": 0.1605912297964096, + "learning_rate": 2.9364020975971464e-05, + "loss": 0.0067, + "step": 39220 + }, + { + "epoch": 22.834691501746217, + "grad_norm": 0.2116527557373047, + "learning_rate": 2.9338922837580657e-05, + "loss": 0.0067, + "step": 39230 + }, + { + "epoch": 22.840512223515717, + "grad_norm": 0.21836012601852417, + "learning_rate": 2.931383097547955e-05, + "loss": 0.0085, + "step": 39240 + }, + { + "epoch": 22.846332945285216, + "grad_norm": 0.11949519068002701, + "learning_rate": 2.928874539729043e-05, + "loss": 0.0092, + "step": 39250 + }, + { + "epoch": 22.852153667054715, + "grad_norm": 0.1431536227464676, + "learning_rate": 2.926366611063358e-05, + "loss": 0.0069, + "step": 39260 + }, + { + "epoch": 22.857974388824214, + "grad_norm": 0.13612037897109985, + "learning_rate": 2.9238593123127463e-05, + "loss": 0.0084, + "step": 39270 + }, + { + "epoch": 22.863795110593713, + "grad_norm": 0.14871330559253693, + "learning_rate": 2.9213526442388583e-05, + "loss": 0.007, + "step": 39280 + }, + { + "epoch": 22.869615832363213, + "grad_norm": 0.1225414052605629, + "learning_rate": 2.9188466076031545e-05, + "loss": 0.0066, + "step": 39290 + }, + { + "epoch": 22.875436554132712, + "grad_norm": 0.12385770678520203, + "learning_rate": 2.9163412031669012e-05, + "loss": 0.0083, + "step": 39300 + }, + { + "epoch": 22.88125727590221, + "grad_norm": 0.08214427530765533, + "learning_rate": 2.913836431691175e-05, + "loss": 0.0096, + "step": 39310 + }, + { + "epoch": 22.88707799767171, + "grad_norm": 0.09528204053640366, + "learning_rate": 2.9113322939368583e-05, + "loss": 0.0098, + "step": 39320 + }, + { + "epoch": 22.89289871944121, + "grad_norm": 0.11498425155878067, + "learning_rate": 2.9088287906646427e-05, + "loss": 0.0109, + "step": 39330 + }, + { + "epoch": 22.89871944121071, + "grad_norm": 0.21399593353271484, + "learning_rate": 2.906325922635024e-05, + "loss": 0.0092, + "step": 39340 + }, + { + "epoch": 22.904540162980208, + "grad_norm": 0.1997402161359787, + "learning_rate": 2.903823690608313e-05, + "loss": 0.0066, + "step": 39350 + }, + { + "epoch": 22.91036088474971, + "grad_norm": 0.09238588064908981, + "learning_rate": 2.9013220953446174e-05, + "loss": 0.0079, + "step": 39360 + }, + { + "epoch": 22.91618160651921, + "grad_norm": 0.14103350043296814, + "learning_rate": 2.8988211376038564e-05, + "loss": 0.006, + "step": 39370 + }, + { + "epoch": 22.92200232828871, + "grad_norm": 0.1246882900595665, + "learning_rate": 2.8963208181457564e-05, + "loss": 0.0085, + "step": 39380 + }, + { + "epoch": 22.92782305005821, + "grad_norm": 0.1315031349658966, + "learning_rate": 2.8938211377298453e-05, + "loss": 0.0144, + "step": 39390 + }, + { + "epoch": 22.933643771827708, + "grad_norm": 0.22818732261657715, + "learning_rate": 2.8913220971154652e-05, + "loss": 0.0143, + "step": 39400 + }, + { + "epoch": 22.939464493597207, + "grad_norm": 0.08209316432476044, + "learning_rate": 2.888823697061753e-05, + "loss": 0.0059, + "step": 39410 + }, + { + "epoch": 22.945285215366706, + "grad_norm": 0.13385231792926788, + "learning_rate": 2.8863259383276618e-05, + "loss": 0.0081, + "step": 39420 + }, + { + "epoch": 22.951105937136205, + "grad_norm": 0.1919773817062378, + "learning_rate": 2.8838288216719395e-05, + "loss": 0.0114, + "step": 39430 + }, + { + "epoch": 22.956926658905704, + "grad_norm": 0.11558422446250916, + "learning_rate": 2.8813323478531484e-05, + "loss": 0.0073, + "step": 39440 + }, + { + "epoch": 22.962747380675204, + "grad_norm": 0.29541143774986267, + "learning_rate": 2.8788365176296496e-05, + "loss": 0.0116, + "step": 39450 + }, + { + "epoch": 22.968568102444703, + "grad_norm": 0.12623777985572815, + "learning_rate": 2.876341331759611e-05, + "loss": 0.0052, + "step": 39460 + }, + { + "epoch": 22.974388824214202, + "grad_norm": 0.07503208518028259, + "learning_rate": 2.8738467910010036e-05, + "loss": 0.0127, + "step": 39470 + }, + { + "epoch": 22.9802095459837, + "grad_norm": 0.1958889663219452, + "learning_rate": 2.8713528961116032e-05, + "loss": 0.0092, + "step": 39480 + }, + { + "epoch": 22.9860302677532, + "grad_norm": 0.14741922914981842, + "learning_rate": 2.8688596478489875e-05, + "loss": 0.0113, + "step": 39490 + }, + { + "epoch": 22.9918509895227, + "grad_norm": 0.16296762228012085, + "learning_rate": 2.8663670469705434e-05, + "loss": 0.007, + "step": 39500 + }, + { + "epoch": 22.9976717112922, + "grad_norm": 0.11281244456768036, + "learning_rate": 2.8638750942334546e-05, + "loss": 0.0122, + "step": 39510 + }, + { + "epoch": 23.003492433061698, + "grad_norm": 0.135442852973938, + "learning_rate": 2.8613837903947115e-05, + "loss": 0.0081, + "step": 39520 + }, + { + "epoch": 23.009313154831197, + "grad_norm": 0.19649799168109894, + "learning_rate": 2.858893136211106e-05, + "loss": 0.0107, + "step": 39530 + }, + { + "epoch": 23.0151338766007, + "grad_norm": 0.20682360231876373, + "learning_rate": 2.8564031324392315e-05, + "loss": 0.0096, + "step": 39540 + }, + { + "epoch": 23.0209545983702, + "grad_norm": 0.17060133814811707, + "learning_rate": 2.85391377983549e-05, + "loss": 0.0087, + "step": 39550 + }, + { + "epoch": 23.0267753201397, + "grad_norm": 0.25304800271987915, + "learning_rate": 2.851425079156075e-05, + "loss": 0.0116, + "step": 39560 + }, + { + "epoch": 23.032596041909198, + "grad_norm": 0.1996564269065857, + "learning_rate": 2.848937031156994e-05, + "loss": 0.0085, + "step": 39570 + }, + { + "epoch": 23.038416763678697, + "grad_norm": 0.18514209985733032, + "learning_rate": 2.846449636594044e-05, + "loss": 0.01, + "step": 39580 + }, + { + "epoch": 23.044237485448196, + "grad_norm": 0.11844079941511154, + "learning_rate": 2.843962896222836e-05, + "loss": 0.0078, + "step": 39590 + }, + { + "epoch": 23.050058207217695, + "grad_norm": 0.1337621957063675, + "learning_rate": 2.8414768107987722e-05, + "loss": 0.0087, + "step": 39600 + }, + { + "epoch": 23.055878928987195, + "grad_norm": 0.22897085547447205, + "learning_rate": 2.838991381077061e-05, + "loss": 0.0098, + "step": 39610 + }, + { + "epoch": 23.061699650756694, + "grad_norm": 0.13961221277713776, + "learning_rate": 2.83650660781271e-05, + "loss": 0.0073, + "step": 39620 + }, + { + "epoch": 23.067520372526193, + "grad_norm": 0.10853222012519836, + "learning_rate": 2.8340224917605285e-05, + "loss": 0.0061, + "step": 39630 + }, + { + "epoch": 23.073341094295692, + "grad_norm": 0.16763590276241302, + "learning_rate": 2.831539033675122e-05, + "loss": 0.0094, + "step": 39640 + }, + { + "epoch": 23.07916181606519, + "grad_norm": 0.11376123130321503, + "learning_rate": 2.8290562343109038e-05, + "loss": 0.0129, + "step": 39650 + }, + { + "epoch": 23.08498253783469, + "grad_norm": 0.10598535090684891, + "learning_rate": 2.826574094422082e-05, + "loss": 0.0071, + "step": 39660 + }, + { + "epoch": 23.09080325960419, + "grad_norm": 0.12220023572444916, + "learning_rate": 2.8240926147626645e-05, + "loss": 0.0107, + "step": 39670 + }, + { + "epoch": 23.09662398137369, + "grad_norm": 0.1330631524324417, + "learning_rate": 2.8216117960864586e-05, + "loss": 0.0104, + "step": 39680 + }, + { + "epoch": 23.10244470314319, + "grad_norm": 0.23697632551193237, + "learning_rate": 2.8191316391470703e-05, + "loss": 0.01, + "step": 39690 + }, + { + "epoch": 23.108265424912688, + "grad_norm": 0.12702105939388275, + "learning_rate": 2.816652144697911e-05, + "loss": 0.013, + "step": 39700 + }, + { + "epoch": 23.11408614668219, + "grad_norm": 0.1386582851409912, + "learning_rate": 2.8141733134921783e-05, + "loss": 0.011, + "step": 39710 + }, + { + "epoch": 23.11990686845169, + "grad_norm": 0.2989281117916107, + "learning_rate": 2.811695146282884e-05, + "loss": 0.0102, + "step": 39720 + }, + { + "epoch": 23.12572759022119, + "grad_norm": 0.20389902591705322, + "learning_rate": 2.8092176438228212e-05, + "loss": 0.0092, + "step": 39730 + }, + { + "epoch": 23.131548311990688, + "grad_norm": 0.20204894244670868, + "learning_rate": 2.806740806864598e-05, + "loss": 0.0094, + "step": 39740 + }, + { + "epoch": 23.137369033760187, + "grad_norm": 0.15325139462947845, + "learning_rate": 2.804264636160604e-05, + "loss": 0.007, + "step": 39750 + }, + { + "epoch": 23.143189755529686, + "grad_norm": 0.08593256026506424, + "learning_rate": 2.8017891324630402e-05, + "loss": 0.0062, + "step": 39760 + }, + { + "epoch": 23.149010477299186, + "grad_norm": 0.11192186921834946, + "learning_rate": 2.7993142965238976e-05, + "loss": 0.0096, + "step": 39770 + }, + { + "epoch": 23.154831199068685, + "grad_norm": 0.11001443862915039, + "learning_rate": 2.7968401290949665e-05, + "loss": 0.0076, + "step": 39780 + }, + { + "epoch": 23.160651920838184, + "grad_norm": 0.10410792380571365, + "learning_rate": 2.7943666309278328e-05, + "loss": 0.0078, + "step": 39790 + }, + { + "epoch": 23.166472642607683, + "grad_norm": 0.11119882762432098, + "learning_rate": 2.7918938027738783e-05, + "loss": 0.0088, + "step": 39800 + }, + { + "epoch": 23.172293364377182, + "grad_norm": 0.10674387961626053, + "learning_rate": 2.789421645384287e-05, + "loss": 0.0052, + "step": 39810 + }, + { + "epoch": 23.17811408614668, + "grad_norm": 0.18520310521125793, + "learning_rate": 2.786950159510032e-05, + "loss": 0.0097, + "step": 39820 + }, + { + "epoch": 23.18393480791618, + "grad_norm": 0.14522004127502441, + "learning_rate": 2.7844793459018876e-05, + "loss": 0.0078, + "step": 39830 + }, + { + "epoch": 23.18975552968568, + "grad_norm": 0.173667773604393, + "learning_rate": 2.7820092053104195e-05, + "loss": 0.0075, + "step": 39840 + }, + { + "epoch": 23.19557625145518, + "grad_norm": 0.12881407141685486, + "learning_rate": 2.7795397384859933e-05, + "loss": 0.0057, + "step": 39850 + }, + { + "epoch": 23.20139697322468, + "grad_norm": 0.17921525239944458, + "learning_rate": 2.7770709461787638e-05, + "loss": 0.0076, + "step": 39860 + }, + { + "epoch": 23.207217694994178, + "grad_norm": 0.1568700522184372, + "learning_rate": 2.7746028291386915e-05, + "loss": 0.0062, + "step": 39870 + }, + { + "epoch": 23.213038416763677, + "grad_norm": 0.19151048362255096, + "learning_rate": 2.772135388115519e-05, + "loss": 0.0083, + "step": 39880 + }, + { + "epoch": 23.21885913853318, + "grad_norm": 0.16482406854629517, + "learning_rate": 2.7696686238587945e-05, + "loss": 0.0084, + "step": 39890 + }, + { + "epoch": 23.22467986030268, + "grad_norm": 0.1892799735069275, + "learning_rate": 2.7672025371178505e-05, + "loss": 0.0099, + "step": 39900 + }, + { + "epoch": 23.230500582072178, + "grad_norm": 0.22645002603530884, + "learning_rate": 2.7647371286418238e-05, + "loss": 0.0073, + "step": 39910 + }, + { + "epoch": 23.236321303841677, + "grad_norm": 0.1488988846540451, + "learning_rate": 2.762272399179639e-05, + "loss": 0.0088, + "step": 39920 + }, + { + "epoch": 23.242142025611177, + "grad_norm": 0.07308867573738098, + "learning_rate": 2.7598083494800154e-05, + "loss": 0.0067, + "step": 39930 + }, + { + "epoch": 23.247962747380676, + "grad_norm": 0.1779673844575882, + "learning_rate": 2.7573449802914664e-05, + "loss": 0.0102, + "step": 39940 + }, + { + "epoch": 23.253783469150175, + "grad_norm": 0.15757562220096588, + "learning_rate": 2.7548822923622964e-05, + "loss": 0.0056, + "step": 39950 + }, + { + "epoch": 23.259604190919674, + "grad_norm": 0.10805273056030273, + "learning_rate": 2.752420286440609e-05, + "loss": 0.0078, + "step": 39960 + }, + { + "epoch": 23.265424912689173, + "grad_norm": 0.11519189178943634, + "learning_rate": 2.749958963274295e-05, + "loss": 0.0083, + "step": 39970 + }, + { + "epoch": 23.271245634458673, + "grad_norm": 0.18202421069145203, + "learning_rate": 2.747498323611039e-05, + "loss": 0.01, + "step": 39980 + }, + { + "epoch": 23.277066356228172, + "grad_norm": 0.09930091351270676, + "learning_rate": 2.7450383681983184e-05, + "loss": 0.006, + "step": 39990 + }, + { + "epoch": 23.28288707799767, + "grad_norm": 0.13893099129199982, + "learning_rate": 2.742579097783403e-05, + "loss": 0.0066, + "step": 40000 + }, + { + "epoch": 23.28870779976717, + "grad_norm": 1.0752500295639038, + "learning_rate": 2.7401205131133512e-05, + "loss": -0.1378, + "step": 40010 + }, + { + "epoch": 23.29452852153667, + "grad_norm": 0.6266250014305115, + "learning_rate": 2.7376626149350238e-05, + "loss": -0.3273, + "step": 40020 + }, + { + "epoch": 23.30034924330617, + "grad_norm": 0.3743039071559906, + "learning_rate": 2.735205403995056e-05, + "loss": -0.3896, + "step": 40030 + }, + { + "epoch": 23.306169965075668, + "grad_norm": 0.25254911184310913, + "learning_rate": 2.7327488810398917e-05, + "loss": -0.4159, + "step": 40040 + }, + { + "epoch": 23.311990686845167, + "grad_norm": 0.19838955998420715, + "learning_rate": 2.7302930468157507e-05, + "loss": -0.4296, + "step": 40050 + }, + { + "epoch": 23.31781140861467, + "grad_norm": 0.31474584341049194, + "learning_rate": 2.727837902068655e-05, + "loss": -0.4521, + "step": 40060 + }, + { + "epoch": 23.32363213038417, + "grad_norm": 0.22210608422756195, + "learning_rate": 2.7253834475444123e-05, + "loss": -0.4746, + "step": 40070 + }, + { + "epoch": 23.32945285215367, + "grad_norm": 0.255059152841568, + "learning_rate": 2.7229296839886204e-05, + "loss": -0.4875, + "step": 40080 + }, + { + "epoch": 23.335273573923168, + "grad_norm": 0.19998660683631897, + "learning_rate": 2.720476612146668e-05, + "loss": -0.5021, + "step": 40090 + }, + { + "epoch": 23.341094295692667, + "grad_norm": 0.21656912565231323, + "learning_rate": 2.7180242327637317e-05, + "loss": -0.5113, + "step": 40100 + }, + { + "epoch": 23.346915017462166, + "grad_norm": 0.21734030544757843, + "learning_rate": 2.7155725465847826e-05, + "loss": -0.5266, + "step": 40110 + }, + { + "epoch": 23.352735739231665, + "grad_norm": 0.1843586415052414, + "learning_rate": 2.713121554354578e-05, + "loss": -0.5512, + "step": 40120 + }, + { + "epoch": 23.358556461001164, + "grad_norm": 0.20473359525203705, + "learning_rate": 2.7106712568176628e-05, + "loss": -0.5794, + "step": 40130 + }, + { + "epoch": 23.364377182770664, + "grad_norm": 0.23273389041423798, + "learning_rate": 2.708221654718374e-05, + "loss": -0.6021, + "step": 40140 + }, + { + "epoch": 23.370197904540163, + "grad_norm": 0.17061281204223633, + "learning_rate": 2.7057727488008357e-05, + "loss": -0.6229, + "step": 40150 + }, + { + "epoch": 23.376018626309662, + "grad_norm": 0.16494835913181305, + "learning_rate": 2.703324539808961e-05, + "loss": -0.6397, + "step": 40160 + }, + { + "epoch": 23.38183934807916, + "grad_norm": 0.18994449079036713, + "learning_rate": 2.7008770284864505e-05, + "loss": -0.658, + "step": 40170 + }, + { + "epoch": 23.38766006984866, + "grad_norm": 0.21254129707813263, + "learning_rate": 2.6984302155767916e-05, + "loss": -0.6705, + "step": 40180 + }, + { + "epoch": 23.39348079161816, + "grad_norm": 0.24763697385787964, + "learning_rate": 2.6959841018232683e-05, + "loss": -0.6878, + "step": 40190 + }, + { + "epoch": 23.39930151338766, + "grad_norm": 0.21877112984657288, + "learning_rate": 2.693538687968937e-05, + "loss": -0.7055, + "step": 40200 + }, + { + "epoch": 23.405122235157158, + "grad_norm": 0.20225496590137482, + "learning_rate": 2.6910939747566556e-05, + "loss": -0.7114, + "step": 40210 + }, + { + "epoch": 23.410942956926657, + "grad_norm": 0.16262882947921753, + "learning_rate": 2.6886499629290607e-05, + "loss": -0.7314, + "step": 40220 + }, + { + "epoch": 23.416763678696157, + "grad_norm": 0.16489335894584656, + "learning_rate": 2.6862066532285802e-05, + "loss": -0.741, + "step": 40230 + }, + { + "epoch": 23.42258440046566, + "grad_norm": 0.1990000456571579, + "learning_rate": 2.6837640463974262e-05, + "loss": -0.7409, + "step": 40240 + }, + { + "epoch": 23.42840512223516, + "grad_norm": 0.2889009714126587, + "learning_rate": 2.681322143177596e-05, + "loss": -0.7536, + "step": 40250 + }, + { + "epoch": 23.434225844004658, + "grad_norm": 0.24090786278247833, + "learning_rate": 2.678880944310882e-05, + "loss": -0.7636, + "step": 40260 + }, + { + "epoch": 23.440046565774157, + "grad_norm": 0.2673332095146179, + "learning_rate": 2.6764404505388474e-05, + "loss": -0.7625, + "step": 40270 + }, + { + "epoch": 23.445867287543656, + "grad_norm": 0.1939028948545456, + "learning_rate": 2.6740006626028558e-05, + "loss": -0.7653, + "step": 40280 + }, + { + "epoch": 23.451688009313155, + "grad_norm": 0.13891653716564178, + "learning_rate": 2.671561581244048e-05, + "loss": -0.7756, + "step": 40290 + }, + { + "epoch": 23.457508731082655, + "grad_norm": 0.14423587918281555, + "learning_rate": 2.6691232072033536e-05, + "loss": -0.7765, + "step": 40300 + }, + { + "epoch": 23.463329452852154, + "grad_norm": 0.21951253712177277, + "learning_rate": 2.6666855412214852e-05, + "loss": -0.7847, + "step": 40310 + }, + { + "epoch": 23.469150174621653, + "grad_norm": 0.15967468917369843, + "learning_rate": 2.664248584038942e-05, + "loss": -0.7865, + "step": 40320 + }, + { + "epoch": 23.474970896391152, + "grad_norm": 0.23513752222061157, + "learning_rate": 2.6618123363960047e-05, + "loss": -0.7933, + "step": 40330 + }, + { + "epoch": 23.48079161816065, + "grad_norm": 0.23821642994880676, + "learning_rate": 2.659376799032748e-05, + "loss": -0.7931, + "step": 40340 + }, + { + "epoch": 23.48661233993015, + "grad_norm": 0.19822999835014343, + "learning_rate": 2.6569419726890145e-05, + "loss": -0.796, + "step": 40350 + }, + { + "epoch": 23.49243306169965, + "grad_norm": 0.2181634157896042, + "learning_rate": 2.654507858104447e-05, + "loss": -0.8004, + "step": 40360 + }, + { + "epoch": 23.49825378346915, + "grad_norm": 0.14941799640655518, + "learning_rate": 2.652074456018463e-05, + "loss": -0.8051, + "step": 40370 + }, + { + "epoch": 23.50407450523865, + "grad_norm": 0.19435180723667145, + "learning_rate": 2.6496417671702646e-05, + "loss": -0.8063, + "step": 40380 + }, + { + "epoch": 23.509895227008148, + "grad_norm": 0.20033922791481018, + "learning_rate": 2.6472097922988427e-05, + "loss": -0.8157, + "step": 40390 + }, + { + "epoch": 23.515715948777647, + "grad_norm": 0.1852255016565323, + "learning_rate": 2.6447785321429607e-05, + "loss": -0.8053, + "step": 40400 + }, + { + "epoch": 23.52153667054715, + "grad_norm": 0.1892230063676834, + "learning_rate": 2.6423479874411784e-05, + "loss": -0.8078, + "step": 40410 + }, + { + "epoch": 23.52735739231665, + "grad_norm": 0.24510237574577332, + "learning_rate": 2.6399181589318234e-05, + "loss": -0.8133, + "step": 40420 + }, + { + "epoch": 23.533178114086148, + "grad_norm": 0.22666558623313904, + "learning_rate": 2.6374890473530188e-05, + "loss": -0.8232, + "step": 40430 + }, + { + "epoch": 23.538998835855647, + "grad_norm": 0.2118794322013855, + "learning_rate": 2.635060653442664e-05, + "loss": -0.8211, + "step": 40440 + }, + { + "epoch": 23.544819557625146, + "grad_norm": 0.1567903459072113, + "learning_rate": 2.6326329779384395e-05, + "loss": -0.8226, + "step": 40450 + }, + { + "epoch": 23.550640279394646, + "grad_norm": 0.2162737101316452, + "learning_rate": 2.63020602157781e-05, + "loss": -0.8258, + "step": 40460 + }, + { + "epoch": 23.556461001164145, + "grad_norm": 0.24676918983459473, + "learning_rate": 2.62777978509802e-05, + "loss": -0.8285, + "step": 40470 + }, + { + "epoch": 23.562281722933644, + "grad_norm": 0.19985942542552948, + "learning_rate": 2.6253542692360954e-05, + "loss": -0.8264, + "step": 40480 + }, + { + "epoch": 23.568102444703143, + "grad_norm": 0.21113355457782745, + "learning_rate": 2.6229294747288458e-05, + "loss": -0.8312, + "step": 40490 + }, + { + "epoch": 23.573923166472643, + "grad_norm": 0.16105830669403076, + "learning_rate": 2.6205054023128596e-05, + "loss": -0.8323, + "step": 40500 + }, + { + "epoch": 23.57974388824214, + "grad_norm": 0.16600556671619415, + "learning_rate": 2.6180820527245043e-05, + "loss": -0.8301, + "step": 40510 + }, + { + "epoch": 23.58556461001164, + "grad_norm": 0.21837688982486725, + "learning_rate": 2.6156594266999313e-05, + "loss": -0.8381, + "step": 40520 + }, + { + "epoch": 23.59138533178114, + "grad_norm": 0.28348851203918457, + "learning_rate": 2.6132375249750672e-05, + "loss": -0.8319, + "step": 40530 + }, + { + "epoch": 23.59720605355064, + "grad_norm": 0.1586354374885559, + "learning_rate": 2.6108163482856286e-05, + "loss": -0.8384, + "step": 40540 + }, + { + "epoch": 23.60302677532014, + "grad_norm": 0.2242506444454193, + "learning_rate": 2.6083958973670964e-05, + "loss": -0.8406, + "step": 40550 + }, + { + "epoch": 23.608847497089638, + "grad_norm": 0.18251070380210876, + "learning_rate": 2.6059761729547483e-05, + "loss": -0.839, + "step": 40560 + }, + { + "epoch": 23.614668218859137, + "grad_norm": 0.16035884618759155, + "learning_rate": 2.603557175783624e-05, + "loss": -0.8434, + "step": 40570 + }, + { + "epoch": 23.620488940628636, + "grad_norm": 0.1602831482887268, + "learning_rate": 2.601138906588559e-05, + "loss": -0.8403, + "step": 40580 + }, + { + "epoch": 23.62630966239814, + "grad_norm": 0.18171162903308868, + "learning_rate": 2.598721366104152e-05, + "loss": -0.8431, + "step": 40590 + }, + { + "epoch": 23.63213038416764, + "grad_norm": 0.18738193809986115, + "learning_rate": 2.5963045550647945e-05, + "loss": -0.8391, + "step": 40600 + }, + { + "epoch": 23.637951105937137, + "grad_norm": 0.26754212379455566, + "learning_rate": 2.5938884742046466e-05, + "loss": -0.8416, + "step": 40610 + }, + { + "epoch": 23.643771827706637, + "grad_norm": 0.1761496514081955, + "learning_rate": 2.5914731242576507e-05, + "loss": -0.8458, + "step": 40620 + }, + { + "epoch": 23.649592549476136, + "grad_norm": 0.13786469399929047, + "learning_rate": 2.5890585059575268e-05, + "loss": -0.8433, + "step": 40630 + }, + { + "epoch": 23.655413271245635, + "grad_norm": 0.1746762990951538, + "learning_rate": 2.5866446200377688e-05, + "loss": -0.847, + "step": 40640 + }, + { + "epoch": 23.661233993015134, + "grad_norm": 0.1780836284160614, + "learning_rate": 2.5842314672316566e-05, + "loss": -0.8467, + "step": 40650 + }, + { + "epoch": 23.667054714784634, + "grad_norm": 0.13331246376037598, + "learning_rate": 2.581819048272239e-05, + "loss": -0.8488, + "step": 40660 + }, + { + "epoch": 23.672875436554133, + "grad_norm": 0.21262134611606598, + "learning_rate": 2.5794073638923478e-05, + "loss": -0.8436, + "step": 40670 + }, + { + "epoch": 23.678696158323632, + "grad_norm": 0.21227483451366425, + "learning_rate": 2.576996414824586e-05, + "loss": -0.8389, + "step": 40680 + }, + { + "epoch": 23.68451688009313, + "grad_norm": 0.25781533122062683, + "learning_rate": 2.574586201801339e-05, + "loss": -0.8494, + "step": 40690 + }, + { + "epoch": 23.69033760186263, + "grad_norm": 0.181414395570755, + "learning_rate": 2.572176725554762e-05, + "loss": -0.8501, + "step": 40700 + }, + { + "epoch": 23.69615832363213, + "grad_norm": 0.1843719482421875, + "learning_rate": 2.5697679868167966e-05, + "loss": -0.8488, + "step": 40710 + }, + { + "epoch": 23.70197904540163, + "grad_norm": 0.24774464964866638, + "learning_rate": 2.5673599863191468e-05, + "loss": -0.8485, + "step": 40720 + }, + { + "epoch": 23.707799767171128, + "grad_norm": 0.1406918168067932, + "learning_rate": 2.564952724793306e-05, + "loss": -0.8549, + "step": 40730 + }, + { + "epoch": 23.713620488940627, + "grad_norm": 0.15014059841632843, + "learning_rate": 2.5625462029705306e-05, + "loss": -0.8516, + "step": 40740 + }, + { + "epoch": 23.719441210710126, + "grad_norm": 0.1814449578523636, + "learning_rate": 2.5601404215818624e-05, + "loss": -0.8533, + "step": 40750 + }, + { + "epoch": 23.725261932479626, + "grad_norm": 0.15616007149219513, + "learning_rate": 2.5577353813581144e-05, + "loss": -0.8552, + "step": 40760 + }, + { + "epoch": 23.73108265424913, + "grad_norm": 0.2335229367017746, + "learning_rate": 2.5553310830298733e-05, + "loss": -0.8524, + "step": 40770 + }, + { + "epoch": 23.736903376018628, + "grad_norm": 0.15131917595863342, + "learning_rate": 2.5529275273275012e-05, + "loss": -0.8566, + "step": 40780 + }, + { + "epoch": 23.742724097788127, + "grad_norm": 0.17954735457897186, + "learning_rate": 2.550524714981133e-05, + "loss": -0.862, + "step": 40790 + }, + { + "epoch": 23.748544819557626, + "grad_norm": 0.1873871088027954, + "learning_rate": 2.5481226467206837e-05, + "loss": -0.8583, + "step": 40800 + }, + { + "epoch": 23.754365541327125, + "grad_norm": 0.18092451989650726, + "learning_rate": 2.5457213232758365e-05, + "loss": -0.8603, + "step": 40810 + }, + { + "epoch": 23.760186263096625, + "grad_norm": 0.188015416264534, + "learning_rate": 2.5433207453760498e-05, + "loss": -0.8526, + "step": 40820 + }, + { + "epoch": 23.766006984866124, + "grad_norm": 0.17578986287117004, + "learning_rate": 2.5409209137505552e-05, + "loss": -0.8602, + "step": 40830 + }, + { + "epoch": 23.771827706635623, + "grad_norm": 0.15062300860881805, + "learning_rate": 2.5385218291283597e-05, + "loss": -0.859, + "step": 40840 + }, + { + "epoch": 23.777648428405122, + "grad_norm": 0.19142919778823853, + "learning_rate": 2.5361234922382383e-05, + "loss": -0.8576, + "step": 40850 + }, + { + "epoch": 23.78346915017462, + "grad_norm": 0.16095222532749176, + "learning_rate": 2.533725903808749e-05, + "loss": -0.8617, + "step": 40860 + }, + { + "epoch": 23.78928987194412, + "grad_norm": 0.24393805861473083, + "learning_rate": 2.5313290645682085e-05, + "loss": -0.8603, + "step": 40870 + }, + { + "epoch": 23.79511059371362, + "grad_norm": 0.18414317071437836, + "learning_rate": 2.52893297524472e-05, + "loss": -0.8563, + "step": 40880 + }, + { + "epoch": 23.80093131548312, + "grad_norm": 0.17435234785079956, + "learning_rate": 2.526537636566145e-05, + "loss": -0.8627, + "step": 40890 + }, + { + "epoch": 23.80675203725262, + "grad_norm": 0.20007219910621643, + "learning_rate": 2.5241430492601305e-05, + "loss": -0.8606, + "step": 40900 + }, + { + "epoch": 23.812572759022117, + "grad_norm": 0.1545776128768921, + "learning_rate": 2.5217492140540867e-05, + "loss": -0.861, + "step": 40910 + }, + { + "epoch": 23.818393480791617, + "grad_norm": 0.15901663899421692, + "learning_rate": 2.5193561316751967e-05, + "loss": -0.863, + "step": 40920 + }, + { + "epoch": 23.824214202561116, + "grad_norm": 0.13831442594528198, + "learning_rate": 2.516963802850416e-05, + "loss": -0.8606, + "step": 40930 + }, + { + "epoch": 23.83003492433062, + "grad_norm": 0.18980218470096588, + "learning_rate": 2.5145722283064698e-05, + "loss": -0.863, + "step": 40940 + }, + { + "epoch": 23.835855646100118, + "grad_norm": 0.24028092622756958, + "learning_rate": 2.5121814087698602e-05, + "loss": -0.8644, + "step": 40950 + }, + { + "epoch": 23.841676367869617, + "grad_norm": 0.23487423360347748, + "learning_rate": 2.509791344966848e-05, + "loss": -0.8664, + "step": 40960 + }, + { + "epoch": 23.847497089639116, + "grad_norm": 0.16160792112350464, + "learning_rate": 2.5074020376234768e-05, + "loss": -0.8672, + "step": 40970 + }, + { + "epoch": 23.853317811408616, + "grad_norm": 0.1935427188873291, + "learning_rate": 2.5050134874655534e-05, + "loss": -0.8668, + "step": 40980 + }, + { + "epoch": 23.859138533178115, + "grad_norm": 0.21766267716884613, + "learning_rate": 2.5026256952186566e-05, + "loss": -0.866, + "step": 40990 + }, + { + "epoch": 23.864959254947614, + "grad_norm": 0.20224477350711823, + "learning_rate": 2.5002386616081335e-05, + "loss": -0.8652, + "step": 41000 + }, + { + "epoch": 23.870779976717113, + "grad_norm": 0.15448638796806335, + "learning_rate": 2.497852387359103e-05, + "loss": -0.8575, + "step": 41010 + }, + { + "epoch": 23.876600698486612, + "grad_norm": 0.20826561748981476, + "learning_rate": 2.4954668731964496e-05, + "loss": -0.8677, + "step": 41020 + }, + { + "epoch": 23.88242142025611, + "grad_norm": 0.16250550746917725, + "learning_rate": 2.4930821198448364e-05, + "loss": -0.8661, + "step": 41030 + }, + { + "epoch": 23.88824214202561, + "grad_norm": 0.19614703953266144, + "learning_rate": 2.4906981280286796e-05, + "loss": -0.8667, + "step": 41040 + }, + { + "epoch": 23.89406286379511, + "grad_norm": 0.10170795768499374, + "learning_rate": 2.488314898472179e-05, + "loss": -0.8652, + "step": 41050 + }, + { + "epoch": 23.89988358556461, + "grad_norm": 0.16227512061595917, + "learning_rate": 2.485932431899295e-05, + "loss": -0.8725, + "step": 41060 + }, + { + "epoch": 23.90570430733411, + "grad_norm": 0.21896660327911377, + "learning_rate": 2.4835507290337584e-05, + "loss": -0.8681, + "step": 41070 + }, + { + "epoch": 23.911525029103608, + "grad_norm": 0.16675801575183868, + "learning_rate": 2.4811697905990672e-05, + "loss": -0.8697, + "step": 41080 + }, + { + "epoch": 23.917345750873107, + "grad_norm": 0.14290928840637207, + "learning_rate": 2.4787896173184854e-05, + "loss": -0.8735, + "step": 41090 + }, + { + "epoch": 23.923166472642606, + "grad_norm": 0.140194833278656, + "learning_rate": 2.4764102099150534e-05, + "loss": -0.8637, + "step": 41100 + }, + { + "epoch": 23.92898719441211, + "grad_norm": 0.1563364863395691, + "learning_rate": 2.4740315691115644e-05, + "loss": -0.8664, + "step": 41110 + }, + { + "epoch": 23.934807916181608, + "grad_norm": 0.15423429012298584, + "learning_rate": 2.4716536956305918e-05, + "loss": -0.8644, + "step": 41120 + }, + { + "epoch": 23.940628637951107, + "grad_norm": 0.2101043164730072, + "learning_rate": 2.4692765901944697e-05, + "loss": -0.8683, + "step": 41130 + }, + { + "epoch": 23.946449359720607, + "grad_norm": 0.19062036275863647, + "learning_rate": 2.4669002535253e-05, + "loss": -0.8675, + "step": 41140 + }, + { + "epoch": 23.952270081490106, + "grad_norm": 0.17482665181159973, + "learning_rate": 2.46452468634495e-05, + "loss": -0.8678, + "step": 41150 + }, + { + "epoch": 23.958090803259605, + "grad_norm": 0.2150595635175705, + "learning_rate": 2.462149889375055e-05, + "loss": -0.8673, + "step": 41160 + }, + { + "epoch": 23.963911525029104, + "grad_norm": 0.16187910735607147, + "learning_rate": 2.459775863337014e-05, + "loss": -0.871, + "step": 41170 + }, + { + "epoch": 23.969732246798603, + "grad_norm": 0.17320185899734497, + "learning_rate": 2.4574026089519985e-05, + "loss": -0.8717, + "step": 41180 + }, + { + "epoch": 23.975552968568103, + "grad_norm": 0.21795202791690826, + "learning_rate": 2.4550301269409333e-05, + "loss": -0.8718, + "step": 41190 + }, + { + "epoch": 23.981373690337602, + "grad_norm": 0.18848934769630432, + "learning_rate": 2.4526584180245216e-05, + "loss": -0.8702, + "step": 41200 + }, + { + "epoch": 23.9871944121071, + "grad_norm": 0.15559084713459015, + "learning_rate": 2.4502874829232236e-05, + "loss": -0.8708, + "step": 41210 + }, + { + "epoch": 23.9930151338766, + "grad_norm": 0.2202901691198349, + "learning_rate": 2.447917322357267e-05, + "loss": -0.8743, + "step": 41220 + }, + { + "epoch": 23.9988358556461, + "grad_norm": 0.19890958070755005, + "learning_rate": 2.4455479370466443e-05, + "loss": -0.8696, + "step": 41230 + }, + { + "epoch": 24.0046565774156, + "grad_norm": 0.1819002330303192, + "learning_rate": 2.4431793277111097e-05, + "loss": -0.8739, + "step": 41240 + }, + { + "epoch": 24.010477299185098, + "grad_norm": 0.2575104236602783, + "learning_rate": 2.4408114950701905e-05, + "loss": -0.8763, + "step": 41250 + }, + { + "epoch": 24.016298020954597, + "grad_norm": 0.1504586786031723, + "learning_rate": 2.4384444398431634e-05, + "loss": -0.8747, + "step": 41260 + }, + { + "epoch": 24.022118742724096, + "grad_norm": 0.19249945878982544, + "learning_rate": 2.4360781627490837e-05, + "loss": -0.8754, + "step": 41270 + }, + { + "epoch": 24.027939464493596, + "grad_norm": 0.18297924101352692, + "learning_rate": 2.433712664506762e-05, + "loss": -0.8763, + "step": 41280 + }, + { + "epoch": 24.0337601862631, + "grad_norm": 0.24800291657447815, + "learning_rate": 2.431347945834774e-05, + "loss": -0.8766, + "step": 41290 + }, + { + "epoch": 24.039580908032598, + "grad_norm": 0.19684354960918427, + "learning_rate": 2.428984007451458e-05, + "loss": -0.871, + "step": 41300 + }, + { + "epoch": 24.045401629802097, + "grad_norm": 0.18777970969676971, + "learning_rate": 2.426620850074917e-05, + "loss": -0.8753, + "step": 41310 + }, + { + "epoch": 24.051222351571596, + "grad_norm": 0.15069054067134857, + "learning_rate": 2.424258474423014e-05, + "loss": -0.8709, + "step": 41320 + }, + { + "epoch": 24.057043073341095, + "grad_norm": 0.13938070833683014, + "learning_rate": 2.421896881213382e-05, + "loss": -0.8758, + "step": 41330 + }, + { + "epoch": 24.062863795110594, + "grad_norm": 0.1892063021659851, + "learning_rate": 2.419536071163402e-05, + "loss": -0.8759, + "step": 41340 + }, + { + "epoch": 24.068684516880094, + "grad_norm": 0.18998831510543823, + "learning_rate": 2.417176044990233e-05, + "loss": -0.878, + "step": 41350 + }, + { + "epoch": 24.074505238649593, + "grad_norm": 0.15840953588485718, + "learning_rate": 2.4148168034107855e-05, + "loss": -0.8807, + "step": 41360 + }, + { + "epoch": 24.080325960419092, + "grad_norm": 0.23831865191459656, + "learning_rate": 2.4124583471417355e-05, + "loss": -0.876, + "step": 41370 + }, + { + "epoch": 24.08614668218859, + "grad_norm": 0.2078382819890976, + "learning_rate": 2.41010067689952e-05, + "loss": -0.8744, + "step": 41380 + }, + { + "epoch": 24.09196740395809, + "grad_norm": 0.2185351550579071, + "learning_rate": 2.4077437934003338e-05, + "loss": -0.8753, + "step": 41390 + }, + { + "epoch": 24.09778812572759, + "grad_norm": 0.13611432909965515, + "learning_rate": 2.405387697360143e-05, + "loss": -0.8726, + "step": 41400 + }, + { + "epoch": 24.10360884749709, + "grad_norm": 0.14810550212860107, + "learning_rate": 2.4030323894946595e-05, + "loss": -0.8763, + "step": 41410 + }, + { + "epoch": 24.109429569266588, + "grad_norm": 0.158655047416687, + "learning_rate": 2.40067787051937e-05, + "loss": -0.8772, + "step": 41420 + }, + { + "epoch": 24.115250291036087, + "grad_norm": 0.11800022423267365, + "learning_rate": 2.3983241411495087e-05, + "loss": -0.8782, + "step": 41430 + }, + { + "epoch": 24.121071012805587, + "grad_norm": 0.2306893765926361, + "learning_rate": 2.3959712021000823e-05, + "loss": -0.8751, + "step": 41440 + }, + { + "epoch": 24.126891734575086, + "grad_norm": 0.198311448097229, + "learning_rate": 2.3936190540858495e-05, + "loss": -0.8772, + "step": 41450 + }, + { + "epoch": 24.132712456344585, + "grad_norm": 0.14184606075286865, + "learning_rate": 2.39126769782133e-05, + "loss": -0.876, + "step": 41460 + }, + { + "epoch": 24.138533178114088, + "grad_norm": 0.1189938634634018, + "learning_rate": 2.388917134020805e-05, + "loss": -0.8778, + "step": 41470 + }, + { + "epoch": 24.144353899883587, + "grad_norm": 0.23377232253551483, + "learning_rate": 2.3865673633983128e-05, + "loss": -0.8793, + "step": 41480 + }, + { + "epoch": 24.150174621653086, + "grad_norm": 0.21569983661174774, + "learning_rate": 2.3842183866676492e-05, + "loss": -0.8806, + "step": 41490 + }, + { + "epoch": 24.155995343422585, + "grad_norm": 0.15266533195972443, + "learning_rate": 2.381870204542377e-05, + "loss": -0.88, + "step": 41500 + }, + { + "epoch": 24.161816065192085, + "grad_norm": 0.1893848031759262, + "learning_rate": 2.379522817735808e-05, + "loss": -0.8815, + "step": 41510 + }, + { + "epoch": 24.167636786961584, + "grad_norm": 0.2236000895500183, + "learning_rate": 2.377176226961018e-05, + "loss": -0.8772, + "step": 41520 + }, + { + "epoch": 24.173457508731083, + "grad_norm": 0.14642442762851715, + "learning_rate": 2.3748304329308384e-05, + "loss": -0.8806, + "step": 41530 + }, + { + "epoch": 24.179278230500582, + "grad_norm": 0.2645827829837799, + "learning_rate": 2.372485436357858e-05, + "loss": -0.8772, + "step": 41540 + }, + { + "epoch": 24.18509895227008, + "grad_norm": 0.17316366732120514, + "learning_rate": 2.3701412379544296e-05, + "loss": -0.8782, + "step": 41550 + }, + { + "epoch": 24.19091967403958, + "grad_norm": 0.12081843614578247, + "learning_rate": 2.367797838432653e-05, + "loss": -0.8793, + "step": 41560 + }, + { + "epoch": 24.19674039580908, + "grad_norm": 0.222383052110672, + "learning_rate": 2.3654552385043967e-05, + "loss": -0.8762, + "step": 41570 + }, + { + "epoch": 24.20256111757858, + "grad_norm": 0.14332132041454315, + "learning_rate": 2.3631134388812742e-05, + "loss": -0.8785, + "step": 41580 + }, + { + "epoch": 24.20838183934808, + "grad_norm": 0.18367476761341095, + "learning_rate": 2.3607724402746684e-05, + "loss": -0.8704, + "step": 41590 + }, + { + "epoch": 24.214202561117578, + "grad_norm": 0.1412970870733261, + "learning_rate": 2.35843224339571e-05, + "loss": -0.8816, + "step": 41600 + }, + { + "epoch": 24.220023282887077, + "grad_norm": 0.1505371779203415, + "learning_rate": 2.3560928489552897e-05, + "loss": -0.8834, + "step": 41610 + }, + { + "epoch": 24.225844004656576, + "grad_norm": 0.25607410073280334, + "learning_rate": 2.353754257664053e-05, + "loss": -0.8782, + "step": 41620 + }, + { + "epoch": 24.231664726426075, + "grad_norm": 0.14610373973846436, + "learning_rate": 2.3514164702324037e-05, + "loss": -0.8782, + "step": 41630 + }, + { + "epoch": 24.237485448195578, + "grad_norm": 0.20766331255435944, + "learning_rate": 2.3490794873704963e-05, + "loss": -0.8804, + "step": 41640 + }, + { + "epoch": 24.243306169965077, + "grad_norm": 0.18828237056732178, + "learning_rate": 2.3467433097882496e-05, + "loss": -0.8848, + "step": 41650 + }, + { + "epoch": 24.249126891734576, + "grad_norm": 0.18871372938156128, + "learning_rate": 2.34440793819533e-05, + "loss": -0.8819, + "step": 41660 + }, + { + "epoch": 24.254947613504076, + "grad_norm": 0.13611426949501038, + "learning_rate": 2.3420733733011617e-05, + "loss": -0.8833, + "step": 41670 + }, + { + "epoch": 24.260768335273575, + "grad_norm": 0.1695103794336319, + "learning_rate": 2.3397396158149243e-05, + "loss": -0.8816, + "step": 41680 + }, + { + "epoch": 24.266589057043074, + "grad_norm": 0.15940992534160614, + "learning_rate": 2.3374066664455498e-05, + "loss": -0.8801, + "step": 41690 + }, + { + "epoch": 24.272409778812573, + "grad_norm": 0.1502213329076767, + "learning_rate": 2.3350745259017315e-05, + "loss": -0.8793, + "step": 41700 + }, + { + "epoch": 24.278230500582072, + "grad_norm": 0.16589610278606415, + "learning_rate": 2.332743194891906e-05, + "loss": -0.8819, + "step": 41710 + }, + { + "epoch": 24.28405122235157, + "grad_norm": 0.12828129529953003, + "learning_rate": 2.330412674124276e-05, + "loss": -0.8835, + "step": 41720 + }, + { + "epoch": 24.28987194412107, + "grad_norm": 0.11107316613197327, + "learning_rate": 2.328082964306786e-05, + "loss": -0.8851, + "step": 41730 + }, + { + "epoch": 24.29569266589057, + "grad_norm": 0.11426285654306412, + "learning_rate": 2.325754066147145e-05, + "loss": -0.8816, + "step": 41740 + }, + { + "epoch": 24.30151338766007, + "grad_norm": 0.12760719656944275, + "learning_rate": 2.32342598035281e-05, + "loss": -0.8841, + "step": 41750 + }, + { + "epoch": 24.30733410942957, + "grad_norm": 0.1943366676568985, + "learning_rate": 2.321098707630991e-05, + "loss": -0.8804, + "step": 41760 + }, + { + "epoch": 24.313154831199068, + "grad_norm": 0.1593681126832962, + "learning_rate": 2.318772248688652e-05, + "loss": -0.8855, + "step": 41770 + }, + { + "epoch": 24.318975552968567, + "grad_norm": 0.22510388493537903, + "learning_rate": 2.3164466042325107e-05, + "loss": -0.8848, + "step": 41780 + }, + { + "epoch": 24.324796274738066, + "grad_norm": 0.20559056103229523, + "learning_rate": 2.3141217749690353e-05, + "loss": -0.8831, + "step": 41790 + }, + { + "epoch": 24.330616996507565, + "grad_norm": 0.1346869021654129, + "learning_rate": 2.3117977616044466e-05, + "loss": -0.8858, + "step": 41800 + }, + { + "epoch": 24.336437718277068, + "grad_norm": 0.17354904115200043, + "learning_rate": 2.309474564844722e-05, + "loss": -0.8828, + "step": 41810 + }, + { + "epoch": 24.342258440046567, + "grad_norm": 0.12720797955989838, + "learning_rate": 2.307152185395585e-05, + "loss": -0.8824, + "step": 41820 + }, + { + "epoch": 24.348079161816067, + "grad_norm": 0.17500954866409302, + "learning_rate": 2.3048306239625144e-05, + "loss": -0.8829, + "step": 41830 + }, + { + "epoch": 24.353899883585566, + "grad_norm": 0.21863694489002228, + "learning_rate": 2.3025098812507378e-05, + "loss": -0.8794, + "step": 41840 + }, + { + "epoch": 24.359720605355065, + "grad_norm": 0.2451930195093155, + "learning_rate": 2.3001899579652366e-05, + "loss": -0.8857, + "step": 41850 + }, + { + "epoch": 24.365541327124564, + "grad_norm": 0.1310482770204544, + "learning_rate": 2.2978708548107393e-05, + "loss": -0.8859, + "step": 41860 + }, + { + "epoch": 24.371362048894063, + "grad_norm": 0.17690321803092957, + "learning_rate": 2.2955525724917348e-05, + "loss": -0.8866, + "step": 41870 + }, + { + "epoch": 24.377182770663563, + "grad_norm": 0.17996254563331604, + "learning_rate": 2.2932351117124477e-05, + "loss": -0.8844, + "step": 41880 + }, + { + "epoch": 24.383003492433062, + "grad_norm": 0.15114261209964752, + "learning_rate": 2.29091847317687e-05, + "loss": -0.8844, + "step": 41890 + }, + { + "epoch": 24.38882421420256, + "grad_norm": 0.14420489966869354, + "learning_rate": 2.2886026575887277e-05, + "loss": -0.8853, + "step": 41900 + }, + { + "epoch": 24.39464493597206, + "grad_norm": 0.2505515217781067, + "learning_rate": 2.2862876656515094e-05, + "loss": -0.8806, + "step": 41910 + }, + { + "epoch": 24.40046565774156, + "grad_norm": 0.2225228101015091, + "learning_rate": 2.2839734980684464e-05, + "loss": -0.8862, + "step": 41920 + }, + { + "epoch": 24.40628637951106, + "grad_norm": 0.16918110847473145, + "learning_rate": 2.281660155542522e-05, + "loss": -0.88, + "step": 41930 + }, + { + "epoch": 24.412107101280558, + "grad_norm": 0.19034512341022491, + "learning_rate": 2.279347638776469e-05, + "loss": -0.8843, + "step": 41940 + }, + { + "epoch": 24.417927823050057, + "grad_norm": 0.28683772683143616, + "learning_rate": 2.2770359484727665e-05, + "loss": -0.8812, + "step": 41950 + }, + { + "epoch": 24.423748544819556, + "grad_norm": 0.20060782134532928, + "learning_rate": 2.27472508533365e-05, + "loss": -0.886, + "step": 41960 + }, + { + "epoch": 24.429569266589056, + "grad_norm": 0.23340852558612823, + "learning_rate": 2.2724150500610948e-05, + "loss": -0.8782, + "step": 41970 + }, + { + "epoch": 24.435389988358555, + "grad_norm": 0.1665811985731125, + "learning_rate": 2.2701058433568302e-05, + "loss": -0.8824, + "step": 41980 + }, + { + "epoch": 24.441210710128058, + "grad_norm": 0.22384290397167206, + "learning_rate": 2.2677974659223318e-05, + "loss": -0.8786, + "step": 41990 + }, + { + "epoch": 24.447031431897557, + "grad_norm": 0.22740007936954498, + "learning_rate": 2.2654899184588235e-05, + "loss": -0.8801, + "step": 42000 + }, + { + "epoch": 24.452852153667056, + "grad_norm": 0.233863964676857, + "learning_rate": 2.2631832016672756e-05, + "loss": -0.889, + "step": 42010 + }, + { + "epoch": 24.458672875436555, + "grad_norm": 0.210628941655159, + "learning_rate": 2.2608773162484127e-05, + "loss": -0.8851, + "step": 42020 + }, + { + "epoch": 24.464493597206054, + "grad_norm": 0.1453627645969391, + "learning_rate": 2.2585722629026958e-05, + "loss": -0.8912, + "step": 42030 + }, + { + "epoch": 24.470314318975554, + "grad_norm": 0.15318819880485535, + "learning_rate": 2.2562680423303457e-05, + "loss": -0.8872, + "step": 42040 + }, + { + "epoch": 24.476135040745053, + "grad_norm": 0.12134958058595657, + "learning_rate": 2.2539646552313165e-05, + "loss": -0.8804, + "step": 42050 + }, + { + "epoch": 24.481955762514552, + "grad_norm": 0.16587777435779572, + "learning_rate": 2.251662102305322e-05, + "loss": -0.8874, + "step": 42060 + }, + { + "epoch": 24.48777648428405, + "grad_norm": 0.13109616935253143, + "learning_rate": 2.2493603842518152e-05, + "loss": -0.8905, + "step": 42070 + }, + { + "epoch": 24.49359720605355, + "grad_norm": 0.15754318237304688, + "learning_rate": 2.2470595017699974e-05, + "loss": -0.889, + "step": 42080 + }, + { + "epoch": 24.49941792782305, + "grad_norm": 0.16642457246780396, + "learning_rate": 2.244759455558816e-05, + "loss": -0.884, + "step": 42090 + }, + { + "epoch": 24.50523864959255, + "grad_norm": 0.16331997513771057, + "learning_rate": 2.2424602463169614e-05, + "loss": -0.8848, + "step": 42100 + }, + { + "epoch": 24.511059371362048, + "grad_norm": 0.13767080008983612, + "learning_rate": 2.2401618747428776e-05, + "loss": -0.8868, + "step": 42110 + }, + { + "epoch": 24.516880093131547, + "grad_norm": 0.14464673399925232, + "learning_rate": 2.237864341534747e-05, + "loss": -0.8877, + "step": 42120 + }, + { + "epoch": 24.522700814901047, + "grad_norm": 0.11473821103572845, + "learning_rate": 2.2355676473904998e-05, + "loss": -0.8866, + "step": 42130 + }, + { + "epoch": 24.528521536670546, + "grad_norm": 0.14129987359046936, + "learning_rate": 2.2332717930078108e-05, + "loss": -0.888, + "step": 42140 + }, + { + "epoch": 24.534342258440045, + "grad_norm": 0.18053220212459564, + "learning_rate": 2.2309767790840992e-05, + "loss": -0.884, + "step": 42150 + }, + { + "epoch": 24.540162980209544, + "grad_norm": 0.1536279171705246, + "learning_rate": 2.228682606316529e-05, + "loss": -0.8872, + "step": 42160 + }, + { + "epoch": 24.545983701979047, + "grad_norm": 0.2006060928106308, + "learning_rate": 2.2263892754020138e-05, + "loss": -0.8891, + "step": 42170 + }, + { + "epoch": 24.551804423748546, + "grad_norm": 0.18933451175689697, + "learning_rate": 2.2240967870372004e-05, + "loss": -0.8896, + "step": 42180 + }, + { + "epoch": 24.557625145518045, + "grad_norm": 0.17670978605747223, + "learning_rate": 2.2218051419184933e-05, + "loss": -0.8857, + "step": 42190 + }, + { + "epoch": 24.563445867287545, + "grad_norm": 0.22169309854507446, + "learning_rate": 2.219514340742026e-05, + "loss": -0.8863, + "step": 42200 + }, + { + "epoch": 24.569266589057044, + "grad_norm": 0.20336444675922394, + "learning_rate": 2.2172243842036898e-05, + "loss": -0.8881, + "step": 42210 + }, + { + "epoch": 24.575087310826543, + "grad_norm": 0.2076411098241806, + "learning_rate": 2.2149352729991107e-05, + "loss": -0.8867, + "step": 42220 + }, + { + "epoch": 24.580908032596042, + "grad_norm": 0.13487738370895386, + "learning_rate": 2.2126470078236605e-05, + "loss": -0.8877, + "step": 42230 + }, + { + "epoch": 24.58672875436554, + "grad_norm": 0.18642456829547882, + "learning_rate": 2.2103595893724533e-05, + "loss": -0.8924, + "step": 42240 + }, + { + "epoch": 24.59254947613504, + "grad_norm": 0.12915737926959991, + "learning_rate": 2.208073018340345e-05, + "loss": -0.8901, + "step": 42250 + }, + { + "epoch": 24.59837019790454, + "grad_norm": 0.16433660686016083, + "learning_rate": 2.2057872954219405e-05, + "loss": -0.8874, + "step": 42260 + }, + { + "epoch": 24.60419091967404, + "grad_norm": 0.2386990338563919, + "learning_rate": 2.203502421311575e-05, + "loss": -0.8837, + "step": 42270 + }, + { + "epoch": 24.61001164144354, + "grad_norm": 0.25622403621673584, + "learning_rate": 2.2012183967033388e-05, + "loss": -0.8927, + "step": 42280 + }, + { + "epoch": 24.615832363213038, + "grad_norm": 0.1777285933494568, + "learning_rate": 2.198935222291056e-05, + "loss": -0.8856, + "step": 42290 + }, + { + "epoch": 24.621653084982537, + "grad_norm": 0.20499466359615326, + "learning_rate": 2.1966528987682948e-05, + "loss": -0.8883, + "step": 42300 + }, + { + "epoch": 24.627473806752036, + "grad_norm": 0.2180076390504837, + "learning_rate": 2.194371426828365e-05, + "loss": -0.886, + "step": 42310 + }, + { + "epoch": 24.633294528521535, + "grad_norm": 0.2172173410654068, + "learning_rate": 2.192090807164317e-05, + "loss": -0.8886, + "step": 42320 + }, + { + "epoch": 24.639115250291034, + "grad_norm": 0.1404149830341339, + "learning_rate": 2.1898110404689422e-05, + "loss": -0.8904, + "step": 42330 + }, + { + "epoch": 24.644935972060537, + "grad_norm": 0.20864978432655334, + "learning_rate": 2.1875321274347776e-05, + "loss": -0.8875, + "step": 42340 + }, + { + "epoch": 24.650756693830036, + "grad_norm": 0.13264626264572144, + "learning_rate": 2.18525406875409e-05, + "loss": -0.8873, + "step": 42350 + }, + { + "epoch": 24.656577415599536, + "grad_norm": 0.19632890820503235, + "learning_rate": 2.1829768651188997e-05, + "loss": -0.8917, + "step": 42360 + }, + { + "epoch": 24.662398137369035, + "grad_norm": 0.21275867521762848, + "learning_rate": 2.180700517220958e-05, + "loss": -0.8945, + "step": 42370 + }, + { + "epoch": 24.668218859138534, + "grad_norm": 0.16442640125751495, + "learning_rate": 2.1784250257517603e-05, + "loss": -0.8917, + "step": 42380 + }, + { + "epoch": 24.674039580908033, + "grad_norm": 0.22212012112140656, + "learning_rate": 2.1761503914025406e-05, + "loss": -0.8895, + "step": 42390 + }, + { + "epoch": 24.679860302677533, + "grad_norm": 0.14920106530189514, + "learning_rate": 2.1738766148642705e-05, + "loss": -0.8898, + "step": 42400 + }, + { + "epoch": 24.68568102444703, + "grad_norm": 0.14772504568099976, + "learning_rate": 2.1716036968276683e-05, + "loss": -0.8907, + "step": 42410 + }, + { + "epoch": 24.69150174621653, + "grad_norm": 0.12381414324045181, + "learning_rate": 2.1693316379831808e-05, + "loss": -0.8933, + "step": 42420 + }, + { + "epoch": 24.69732246798603, + "grad_norm": 0.2268947958946228, + "learning_rate": 2.1670604390210037e-05, + "loss": -0.8905, + "step": 42430 + }, + { + "epoch": 24.70314318975553, + "grad_norm": 0.18359869718551636, + "learning_rate": 2.1647901006310656e-05, + "loss": -0.8887, + "step": 42440 + }, + { + "epoch": 24.70896391152503, + "grad_norm": 0.20720981061458588, + "learning_rate": 2.1625206235030353e-05, + "loss": -0.8903, + "step": 42450 + }, + { + "epoch": 24.714784633294528, + "grad_norm": 0.19547437131404877, + "learning_rate": 2.160252008326321e-05, + "loss": -0.8853, + "step": 42460 + }, + { + "epoch": 24.720605355064027, + "grad_norm": 0.2014286071062088, + "learning_rate": 2.157984255790067e-05, + "loss": -0.8889, + "step": 42470 + }, + { + "epoch": 24.726426076833526, + "grad_norm": 0.22717122733592987, + "learning_rate": 2.1557173665831553e-05, + "loss": -0.8912, + "step": 42480 + }, + { + "epoch": 24.732246798603025, + "grad_norm": 0.18325921893119812, + "learning_rate": 2.153451341394212e-05, + "loss": -0.8878, + "step": 42490 + }, + { + "epoch": 24.738067520372525, + "grad_norm": 0.13160070776939392, + "learning_rate": 2.151186180911589e-05, + "loss": -0.8929, + "step": 42500 + }, + { + "epoch": 24.743888242142027, + "grad_norm": 0.17496825754642487, + "learning_rate": 2.1489218858233877e-05, + "loss": -0.8926, + "step": 42510 + }, + { + "epoch": 24.749708963911527, + "grad_norm": 0.15485955774784088, + "learning_rate": 2.1466584568174392e-05, + "loss": -0.8893, + "step": 42520 + }, + { + "epoch": 24.755529685681026, + "grad_norm": 0.3025606572628021, + "learning_rate": 2.1443958945813132e-05, + "loss": -0.8878, + "step": 42530 + }, + { + "epoch": 24.761350407450525, + "grad_norm": 0.22887396812438965, + "learning_rate": 2.1421341998023163e-05, + "loss": -0.8876, + "step": 42540 + }, + { + "epoch": 24.767171129220024, + "grad_norm": 0.20519933104515076, + "learning_rate": 2.139873373167491e-05, + "loss": -0.8885, + "step": 42550 + }, + { + "epoch": 24.772991850989523, + "grad_norm": 0.12482722103595734, + "learning_rate": 2.13761341536362e-05, + "loss": -0.8913, + "step": 42560 + }, + { + "epoch": 24.778812572759023, + "grad_norm": 0.20697550475597382, + "learning_rate": 2.1353543270772136e-05, + "loss": -0.8916, + "step": 42570 + }, + { + "epoch": 24.784633294528522, + "grad_norm": 0.1363937258720398, + "learning_rate": 2.1330961089945297e-05, + "loss": -0.8919, + "step": 42580 + }, + { + "epoch": 24.79045401629802, + "grad_norm": 0.15499214828014374, + "learning_rate": 2.130838761801548e-05, + "loss": -0.8869, + "step": 42590 + }, + { + "epoch": 24.79627473806752, + "grad_norm": 0.16102837026119232, + "learning_rate": 2.1285822861839966e-05, + "loss": -0.8883, + "step": 42600 + }, + { + "epoch": 24.80209545983702, + "grad_norm": 0.14304694533348083, + "learning_rate": 2.126326682827331e-05, + "loss": -0.8926, + "step": 42610 + }, + { + "epoch": 24.80791618160652, + "grad_norm": 0.2676279842853546, + "learning_rate": 2.124071952416744e-05, + "loss": -0.8927, + "step": 42620 + }, + { + "epoch": 24.813736903376018, + "grad_norm": 0.1727117896080017, + "learning_rate": 2.1218180956371634e-05, + "loss": -0.8945, + "step": 42630 + }, + { + "epoch": 24.819557625145517, + "grad_norm": 0.13453535735607147, + "learning_rate": 2.119565113173252e-05, + "loss": -0.8876, + "step": 42640 + }, + { + "epoch": 24.825378346915016, + "grad_norm": 0.14857132732868195, + "learning_rate": 2.1173130057094033e-05, + "loss": -0.8931, + "step": 42650 + }, + { + "epoch": 24.831199068684516, + "grad_norm": 0.14539793133735657, + "learning_rate": 2.115061773929753e-05, + "loss": -0.895, + "step": 42660 + }, + { + "epoch": 24.837019790454015, + "grad_norm": 0.2453879415988922, + "learning_rate": 2.1128114185181623e-05, + "loss": -0.8927, + "step": 42670 + }, + { + "epoch": 24.842840512223514, + "grad_norm": 0.13431015610694885, + "learning_rate": 2.1105619401582317e-05, + "loss": -0.8931, + "step": 42680 + }, + { + "epoch": 24.848661233993017, + "grad_norm": 0.15409038960933685, + "learning_rate": 2.1083133395332928e-05, + "loss": -0.8958, + "step": 42690 + }, + { + "epoch": 24.854481955762516, + "grad_norm": 0.15011639893054962, + "learning_rate": 2.1060656173264082e-05, + "loss": -0.8928, + "step": 42700 + }, + { + "epoch": 24.860302677532015, + "grad_norm": 0.23532602190971375, + "learning_rate": 2.103818774220383e-05, + "loss": -0.8855, + "step": 42710 + }, + { + "epoch": 24.866123399301514, + "grad_norm": 0.263285756111145, + "learning_rate": 2.1015728108977412e-05, + "loss": -0.8885, + "step": 42720 + }, + { + "epoch": 24.871944121071014, + "grad_norm": 0.20518146455287933, + "learning_rate": 2.0993277280407548e-05, + "loss": -0.8881, + "step": 42730 + }, + { + "epoch": 24.877764842840513, + "grad_norm": 0.17654526233673096, + "learning_rate": 2.0970835263314132e-05, + "loss": -0.8941, + "step": 42740 + }, + { + "epoch": 24.883585564610012, + "grad_norm": 0.16132989525794983, + "learning_rate": 2.094840206451451e-05, + "loss": -0.8936, + "step": 42750 + }, + { + "epoch": 24.88940628637951, + "grad_norm": 0.17299222946166992, + "learning_rate": 2.0925977690823273e-05, + "loss": -0.8937, + "step": 42760 + }, + { + "epoch": 24.89522700814901, + "grad_norm": 0.16251596808433533, + "learning_rate": 2.0903562149052364e-05, + "loss": -0.8921, + "step": 42770 + }, + { + "epoch": 24.90104772991851, + "grad_norm": 0.18200355768203735, + "learning_rate": 2.0881155446011025e-05, + "loss": -0.8957, + "step": 42780 + }, + { + "epoch": 24.90686845168801, + "grad_norm": 0.181378573179245, + "learning_rate": 2.0858757588505823e-05, + "loss": -0.8909, + "step": 42790 + }, + { + "epoch": 24.912689173457508, + "grad_norm": 0.2120579481124878, + "learning_rate": 2.0836368583340622e-05, + "loss": -0.8953, + "step": 42800 + }, + { + "epoch": 24.918509895227007, + "grad_norm": 0.16776838898658752, + "learning_rate": 2.081398843731664e-05, + "loss": -0.8923, + "step": 42810 + }, + { + "epoch": 24.924330616996507, + "grad_norm": 0.27273163199424744, + "learning_rate": 2.0791617157232357e-05, + "loss": -0.8885, + "step": 42820 + }, + { + "epoch": 24.930151338766006, + "grad_norm": 0.17069941759109497, + "learning_rate": 2.0769254749883576e-05, + "loss": -0.8906, + "step": 42830 + }, + { + "epoch": 24.935972060535505, + "grad_norm": 0.17869316041469574, + "learning_rate": 2.0746901222063415e-05, + "loss": -0.8897, + "step": 42840 + }, + { + "epoch": 24.941792782305004, + "grad_norm": 0.14156574010849, + "learning_rate": 2.072455658056226e-05, + "loss": -0.8928, + "step": 42850 + }, + { + "epoch": 24.947613504074504, + "grad_norm": 0.20698487758636475, + "learning_rate": 2.0702220832167873e-05, + "loss": -0.8925, + "step": 42860 + }, + { + "epoch": 24.953434225844006, + "grad_norm": 0.17548541724681854, + "learning_rate": 2.0679893983665205e-05, + "loss": -0.8937, + "step": 42870 + }, + { + "epoch": 24.959254947613505, + "grad_norm": 0.2238251119852066, + "learning_rate": 2.0657576041836622e-05, + "loss": -0.8877, + "step": 42880 + }, + { + "epoch": 24.965075669383005, + "grad_norm": 0.15732935070991516, + "learning_rate": 2.0635267013461666e-05, + "loss": -0.8861, + "step": 42890 + }, + { + "epoch": 24.970896391152504, + "grad_norm": 0.2354145497083664, + "learning_rate": 2.061296690531728e-05, + "loss": -0.8906, + "step": 42900 + }, + { + "epoch": 24.976717112922003, + "grad_norm": 0.1719089299440384, + "learning_rate": 2.0590675724177622e-05, + "loss": -0.8959, + "step": 42910 + }, + { + "epoch": 24.982537834691502, + "grad_norm": 0.13205577433109283, + "learning_rate": 2.0568393476814167e-05, + "loss": -0.8945, + "step": 42920 + }, + { + "epoch": 24.988358556461, + "grad_norm": 0.1742578148841858, + "learning_rate": 2.0546120169995685e-05, + "loss": -0.8945, + "step": 42930 + }, + { + "epoch": 24.9941792782305, + "grad_norm": 0.18581975996494293, + "learning_rate": 2.0523855810488214e-05, + "loss": -0.8937, + "step": 42940 + }, + { + "epoch": 25.0, + "grad_norm": 0.21049164235591888, + "learning_rate": 2.050160040505505e-05, + "loss": -0.8936, + "step": 42950 + }, + { + "epoch": 25.0058207217695, + "grad_norm": 0.24154169857501984, + "learning_rate": 2.0479353960456843e-05, + "loss": -0.8929, + "step": 42960 + }, + { + "epoch": 25.011641443539, + "grad_norm": 0.17096593976020813, + "learning_rate": 2.0457116483451456e-05, + "loss": -0.8952, + "step": 42970 + }, + { + "epoch": 25.017462165308498, + "grad_norm": 0.1951345056295395, + "learning_rate": 2.0434887980794043e-05, + "loss": -0.8934, + "step": 42980 + }, + { + "epoch": 25.023282887077997, + "grad_norm": 0.20733292400836945, + "learning_rate": 2.0412668459237043e-05, + "loss": -0.8933, + "step": 42990 + }, + { + "epoch": 25.029103608847496, + "grad_norm": 0.2120080143213272, + "learning_rate": 2.039045792553016e-05, + "loss": -0.8948, + "step": 43000 + }, + { + "epoch": 25.034924330616995, + "grad_norm": 0.20645703375339508, + "learning_rate": 2.036825638642036e-05, + "loss": -0.8928, + "step": 43010 + }, + { + "epoch": 25.040745052386495, + "grad_norm": 0.2567991018295288, + "learning_rate": 2.0346063848651868e-05, + "loss": -0.8934, + "step": 43020 + }, + { + "epoch": 25.046565774155994, + "grad_norm": 0.23641207814216614, + "learning_rate": 2.0323880318966254e-05, + "loss": -0.8953, + "step": 43030 + }, + { + "epoch": 25.052386495925496, + "grad_norm": 0.1773327887058258, + "learning_rate": 2.030170580410221e-05, + "loss": -0.8942, + "step": 43040 + }, + { + "epoch": 25.058207217694996, + "grad_norm": 0.1505250334739685, + "learning_rate": 2.0279540310795837e-05, + "loss": -0.8964, + "step": 43050 + }, + { + "epoch": 25.064027939464495, + "grad_norm": 0.13543391227722168, + "learning_rate": 2.0257383845780365e-05, + "loss": -0.8966, + "step": 43060 + }, + { + "epoch": 25.069848661233994, + "grad_norm": 0.17616768181324005, + "learning_rate": 2.0235236415786384e-05, + "loss": -0.8943, + "step": 43070 + }, + { + "epoch": 25.075669383003493, + "grad_norm": 0.1071346253156662, + "learning_rate": 2.021309802754169e-05, + "loss": -0.8977, + "step": 43080 + }, + { + "epoch": 25.081490104772993, + "grad_norm": 0.11182286590337753, + "learning_rate": 2.0190968687771332e-05, + "loss": -0.8987, + "step": 43090 + }, + { + "epoch": 25.087310826542492, + "grad_norm": 0.25623956322669983, + "learning_rate": 2.016884840319763e-05, + "loss": -0.8932, + "step": 43100 + }, + { + "epoch": 25.09313154831199, + "grad_norm": 0.29083776473999023, + "learning_rate": 2.0146737180540122e-05, + "loss": -0.8939, + "step": 43110 + }, + { + "epoch": 25.09895227008149, + "grad_norm": 0.15025320649147034, + "learning_rate": 2.012463502651564e-05, + "loss": -0.8963, + "step": 43120 + }, + { + "epoch": 25.10477299185099, + "grad_norm": 0.1455255150794983, + "learning_rate": 2.0102541947838228e-05, + "loss": -0.8948, + "step": 43130 + }, + { + "epoch": 25.11059371362049, + "grad_norm": 0.1255018264055252, + "learning_rate": 2.0080457951219173e-05, + "loss": -0.8973, + "step": 43140 + }, + { + "epoch": 25.116414435389988, + "grad_norm": 0.14009855687618256, + "learning_rate": 2.0058383043367017e-05, + "loss": -0.8932, + "step": 43150 + }, + { + "epoch": 25.122235157159487, + "grad_norm": 0.1761932224035263, + "learning_rate": 2.0036317230987528e-05, + "loss": -0.8961, + "step": 43160 + }, + { + "epoch": 25.128055878928986, + "grad_norm": 0.1959724724292755, + "learning_rate": 2.0014260520783696e-05, + "loss": -0.893, + "step": 43170 + }, + { + "epoch": 25.133876600698486, + "grad_norm": 0.2104731649160385, + "learning_rate": 1.9992212919455834e-05, + "loss": -0.8956, + "step": 43180 + }, + { + "epoch": 25.139697322467985, + "grad_norm": 0.18181000649929047, + "learning_rate": 1.9970174433701333e-05, + "loss": -0.8934, + "step": 43190 + }, + { + "epoch": 25.145518044237484, + "grad_norm": 0.21872636675834656, + "learning_rate": 1.9948145070214992e-05, + "loss": -0.9001, + "step": 43200 + }, + { + "epoch": 25.151338766006983, + "grad_norm": 0.13743829727172852, + "learning_rate": 1.9926124835688663e-05, + "loss": -0.8933, + "step": 43210 + }, + { + "epoch": 25.157159487776486, + "grad_norm": 0.18575818836688995, + "learning_rate": 1.9904113736811576e-05, + "loss": -0.8968, + "step": 43220 + }, + { + "epoch": 25.162980209545985, + "grad_norm": 0.1530267596244812, + "learning_rate": 1.9882111780270096e-05, + "loss": -0.8949, + "step": 43230 + }, + { + "epoch": 25.168800931315484, + "grad_norm": 0.16051283478736877, + "learning_rate": 1.986011897274784e-05, + "loss": -0.8956, + "step": 43240 + }, + { + "epoch": 25.174621653084984, + "grad_norm": 0.17037741839885712, + "learning_rate": 1.983813532092565e-05, + "loss": -0.8943, + "step": 43250 + }, + { + "epoch": 25.180442374854483, + "grad_norm": 0.1848468780517578, + "learning_rate": 1.981616083148155e-05, + "loss": -0.8981, + "step": 43260 + }, + { + "epoch": 25.186263096623982, + "grad_norm": 0.17496776580810547, + "learning_rate": 1.9794195511090845e-05, + "loss": -0.8986, + "step": 43270 + }, + { + "epoch": 25.19208381839348, + "grad_norm": 0.16108722984790802, + "learning_rate": 1.977223936642601e-05, + "loss": -0.8968, + "step": 43280 + }, + { + "epoch": 25.19790454016298, + "grad_norm": 0.1453387290239334, + "learning_rate": 1.975029240415674e-05, + "loss": -0.8961, + "step": 43290 + }, + { + "epoch": 25.20372526193248, + "grad_norm": 0.18237248063087463, + "learning_rate": 1.9728354630949936e-05, + "loss": -0.8939, + "step": 43300 + }, + { + "epoch": 25.20954598370198, + "grad_norm": 0.1262328028678894, + "learning_rate": 1.9706426053469716e-05, + "loss": -0.8955, + "step": 43310 + }, + { + "epoch": 25.215366705471478, + "grad_norm": 0.14936144649982452, + "learning_rate": 1.9684506678377396e-05, + "loss": -0.8966, + "step": 43320 + }, + { + "epoch": 25.221187427240977, + "grad_norm": 0.24220485985279083, + "learning_rate": 1.9662596512331544e-05, + "loss": -0.8985, + "step": 43330 + }, + { + "epoch": 25.227008149010477, + "grad_norm": 0.15953294932842255, + "learning_rate": 1.964069556198782e-05, + "loss": -0.8955, + "step": 43340 + }, + { + "epoch": 25.232828870779976, + "grad_norm": 0.11397083103656769, + "learning_rate": 1.9618803833999232e-05, + "loss": -0.893, + "step": 43350 + }, + { + "epoch": 25.238649592549475, + "grad_norm": 0.18657389283180237, + "learning_rate": 1.9596921335015838e-05, + "loss": -0.8984, + "step": 43360 + }, + { + "epoch": 25.244470314318974, + "grad_norm": 0.21229609847068787, + "learning_rate": 1.957504807168501e-05, + "loss": -0.8949, + "step": 43370 + }, + { + "epoch": 25.250291036088473, + "grad_norm": 0.16117025911808014, + "learning_rate": 1.9553184050651253e-05, + "loss": -0.8999, + "step": 43380 + }, + { + "epoch": 25.256111757857976, + "grad_norm": 0.21646271646022797, + "learning_rate": 1.953132927855628e-05, + "loss": -0.8936, + "step": 43390 + }, + { + "epoch": 25.261932479627475, + "grad_norm": 0.24849197268486023, + "learning_rate": 1.9509483762038995e-05, + "loss": -0.8956, + "step": 43400 + }, + { + "epoch": 25.267753201396975, + "grad_norm": 0.16092199087142944, + "learning_rate": 1.9487647507735467e-05, + "loss": -0.896, + "step": 43410 + }, + { + "epoch": 25.273573923166474, + "grad_norm": 0.1854458749294281, + "learning_rate": 1.9465820522279032e-05, + "loss": -0.8987, + "step": 43420 + }, + { + "epoch": 25.279394644935973, + "grad_norm": 0.19335399568080902, + "learning_rate": 1.9444002812300078e-05, + "loss": -0.893, + "step": 43430 + }, + { + "epoch": 25.285215366705472, + "grad_norm": 0.24732555449008942, + "learning_rate": 1.94221943844263e-05, + "loss": -0.8996, + "step": 43440 + }, + { + "epoch": 25.29103608847497, + "grad_norm": 0.2226506918668747, + "learning_rate": 1.9400395245282515e-05, + "loss": -0.8977, + "step": 43450 + }, + { + "epoch": 25.29685681024447, + "grad_norm": 0.3023942708969116, + "learning_rate": 1.937860540149071e-05, + "loss": -0.8972, + "step": 43460 + }, + { + "epoch": 25.30267753201397, + "grad_norm": 0.17203031480312347, + "learning_rate": 1.9356824859670082e-05, + "loss": -0.8977, + "step": 43470 + }, + { + "epoch": 25.30849825378347, + "grad_norm": 0.12067936360836029, + "learning_rate": 1.9335053626436967e-05, + "loss": -0.8965, + "step": 43480 + }, + { + "epoch": 25.31431897555297, + "grad_norm": 0.12339016795158386, + "learning_rate": 1.9313291708404885e-05, + "loss": -0.8964, + "step": 43490 + }, + { + "epoch": 25.320139697322467, + "grad_norm": 0.13252127170562744, + "learning_rate": 1.9291539112184587e-05, + "loss": -0.8966, + "step": 43500 + }, + { + "epoch": 25.325960419091967, + "grad_norm": 0.15816786885261536, + "learning_rate": 1.9269795844383854e-05, + "loss": -0.8986, + "step": 43510 + }, + { + "epoch": 25.331781140861466, + "grad_norm": 0.15480077266693115, + "learning_rate": 1.9248061911607777e-05, + "loss": -0.9, + "step": 43520 + }, + { + "epoch": 25.337601862630965, + "grad_norm": 0.18072064220905304, + "learning_rate": 1.9226337320458538e-05, + "loss": -0.896, + "step": 43530 + }, + { + "epoch": 25.343422584400464, + "grad_norm": 0.11614232510328293, + "learning_rate": 1.9204622077535488e-05, + "loss": -0.8987, + "step": 43540 + }, + { + "epoch": 25.349243306169964, + "grad_norm": 0.20247581601142883, + "learning_rate": 1.9182916189435147e-05, + "loss": -0.8968, + "step": 43550 + }, + { + "epoch": 25.355064027939463, + "grad_norm": 0.18061839044094086, + "learning_rate": 1.916121966275117e-05, + "loss": -0.8955, + "step": 43560 + }, + { + "epoch": 25.360884749708966, + "grad_norm": 0.14140599966049194, + "learning_rate": 1.9139532504074443e-05, + "loss": -0.8992, + "step": 43570 + }, + { + "epoch": 25.366705471478465, + "grad_norm": 0.23890148103237152, + "learning_rate": 1.9117854719992885e-05, + "loss": -0.8996, + "step": 43580 + }, + { + "epoch": 25.372526193247964, + "grad_norm": 0.189350888133049, + "learning_rate": 1.9096186317091687e-05, + "loss": -0.9003, + "step": 43590 + }, + { + "epoch": 25.378346915017463, + "grad_norm": 0.20088566839694977, + "learning_rate": 1.9074527301953116e-05, + "loss": -0.8985, + "step": 43600 + }, + { + "epoch": 25.384167636786962, + "grad_norm": 0.21176379919052124, + "learning_rate": 1.9052877681156607e-05, + "loss": -0.8963, + "step": 43610 + }, + { + "epoch": 25.38998835855646, + "grad_norm": 0.23047681152820587, + "learning_rate": 1.903123746127875e-05, + "loss": -0.8944, + "step": 43620 + }, + { + "epoch": 25.39580908032596, + "grad_norm": 0.10237433761358261, + "learning_rate": 1.900960664889327e-05, + "loss": -0.8994, + "step": 43630 + }, + { + "epoch": 25.40162980209546, + "grad_norm": 0.139192134141922, + "learning_rate": 1.8987985250571015e-05, + "loss": -0.8999, + "step": 43640 + }, + { + "epoch": 25.40745052386496, + "grad_norm": 0.33107271790504456, + "learning_rate": 1.8966373272880054e-05, + "loss": -0.8959, + "step": 43650 + }, + { + "epoch": 25.41327124563446, + "grad_norm": 0.25121796131134033, + "learning_rate": 1.8944770722385462e-05, + "loss": -0.8968, + "step": 43660 + }, + { + "epoch": 25.419091967403958, + "grad_norm": 0.23279426991939545, + "learning_rate": 1.8923177605649576e-05, + "loss": -0.898, + "step": 43670 + }, + { + "epoch": 25.424912689173457, + "grad_norm": 0.2859823703765869, + "learning_rate": 1.8901593929231802e-05, + "loss": -0.8976, + "step": 43680 + }, + { + "epoch": 25.430733410942956, + "grad_norm": 0.16752013564109802, + "learning_rate": 1.8880019699688684e-05, + "loss": -0.8978, + "step": 43690 + }, + { + "epoch": 25.436554132712455, + "grad_norm": 0.21888402104377747, + "learning_rate": 1.8858454923573904e-05, + "loss": -0.8959, + "step": 43700 + }, + { + "epoch": 25.442374854481955, + "grad_norm": 0.20232529938220978, + "learning_rate": 1.8836899607438253e-05, + "loss": -0.8981, + "step": 43710 + }, + { + "epoch": 25.448195576251454, + "grad_norm": 0.17275553941726685, + "learning_rate": 1.8815353757829723e-05, + "loss": -0.8961, + "step": 43720 + }, + { + "epoch": 25.454016298020953, + "grad_norm": 0.16866664588451385, + "learning_rate": 1.879381738129331e-05, + "loss": -0.8974, + "step": 43730 + }, + { + "epoch": 25.459837019790456, + "grad_norm": 0.21246349811553955, + "learning_rate": 1.8772290484371236e-05, + "loss": -0.8997, + "step": 43740 + }, + { + "epoch": 25.465657741559955, + "grad_norm": 0.23279593884944916, + "learning_rate": 1.8750773073602795e-05, + "loss": -0.9012, + "step": 43750 + }, + { + "epoch": 25.471478463329454, + "grad_norm": 0.15429680049419403, + "learning_rate": 1.8729265155524405e-05, + "loss": -0.8989, + "step": 43760 + }, + { + "epoch": 25.477299185098953, + "grad_norm": 0.18156953155994415, + "learning_rate": 1.8707766736669607e-05, + "loss": -0.8987, + "step": 43770 + }, + { + "epoch": 25.483119906868453, + "grad_norm": 0.14735811948776245, + "learning_rate": 1.8686277823569055e-05, + "loss": -0.8992, + "step": 43780 + }, + { + "epoch": 25.488940628637952, + "grad_norm": 0.13503503799438477, + "learning_rate": 1.8664798422750484e-05, + "loss": -0.9043, + "step": 43790 + }, + { + "epoch": 25.49476135040745, + "grad_norm": 0.20476476848125458, + "learning_rate": 1.8643328540738832e-05, + "loss": -0.8989, + "step": 43800 + }, + { + "epoch": 25.50058207217695, + "grad_norm": 0.2312810719013214, + "learning_rate": 1.862186818405601e-05, + "loss": -0.8977, + "step": 43810 + }, + { + "epoch": 25.50640279394645, + "grad_norm": 0.13556796312332153, + "learning_rate": 1.8600417359221156e-05, + "loss": -0.8994, + "step": 43820 + }, + { + "epoch": 25.51222351571595, + "grad_norm": 0.14357464015483856, + "learning_rate": 1.8578976072750454e-05, + "loss": -0.9023, + "step": 43830 + }, + { + "epoch": 25.518044237485448, + "grad_norm": 0.16087904572486877, + "learning_rate": 1.8557544331157194e-05, + "loss": -0.9007, + "step": 43840 + }, + { + "epoch": 25.523864959254947, + "grad_norm": 0.21643376350402832, + "learning_rate": 1.8536122140951785e-05, + "loss": -0.8952, + "step": 43850 + }, + { + "epoch": 25.529685681024446, + "grad_norm": 0.21091608703136444, + "learning_rate": 1.8514709508641688e-05, + "loss": -0.8976, + "step": 43860 + }, + { + "epoch": 25.535506402793946, + "grad_norm": 0.2371700406074524, + "learning_rate": 1.8493306440731555e-05, + "loss": -0.8984, + "step": 43870 + }, + { + "epoch": 25.541327124563445, + "grad_norm": 0.2666894495487213, + "learning_rate": 1.8471912943723013e-05, + "loss": -0.8979, + "step": 43880 + }, + { + "epoch": 25.547147846332944, + "grad_norm": 0.13442561030387878, + "learning_rate": 1.8450529024114894e-05, + "loss": -0.9021, + "step": 43890 + }, + { + "epoch": 25.552968568102443, + "grad_norm": 0.1355966329574585, + "learning_rate": 1.842915468840301e-05, + "loss": -0.8982, + "step": 43900 + }, + { + "epoch": 25.558789289871942, + "grad_norm": 0.14141523838043213, + "learning_rate": 1.840778994308037e-05, + "loss": -0.8981, + "step": 43910 + }, + { + "epoch": 25.564610011641445, + "grad_norm": 0.17625558376312256, + "learning_rate": 1.8386434794637004e-05, + "loss": -0.9027, + "step": 43920 + }, + { + "epoch": 25.570430733410944, + "grad_norm": 0.19308961927890778, + "learning_rate": 1.8365089249560034e-05, + "loss": -0.9019, + "step": 43930 + }, + { + "epoch": 25.576251455180444, + "grad_norm": 0.21293993294239044, + "learning_rate": 1.8343753314333683e-05, + "loss": -0.899, + "step": 43940 + }, + { + "epoch": 25.582072176949943, + "grad_norm": 0.24354177713394165, + "learning_rate": 1.8322426995439236e-05, + "loss": -0.8983, + "step": 43950 + }, + { + "epoch": 25.587892898719442, + "grad_norm": 0.18977557122707367, + "learning_rate": 1.8301110299355058e-05, + "loss": -0.8912, + "step": 43960 + }, + { + "epoch": 25.59371362048894, + "grad_norm": 0.22196586430072784, + "learning_rate": 1.8279803232556625e-05, + "loss": -0.8982, + "step": 43970 + }, + { + "epoch": 25.59953434225844, + "grad_norm": 0.1757989376783371, + "learning_rate": 1.8258505801516444e-05, + "loss": -0.8992, + "step": 43980 + }, + { + "epoch": 25.60535506402794, + "grad_norm": 0.1767387092113495, + "learning_rate": 1.8237218012704117e-05, + "loss": -0.8998, + "step": 43990 + }, + { + "epoch": 25.61117578579744, + "grad_norm": 0.13121740520000458, + "learning_rate": 1.821593987258631e-05, + "loss": -0.8999, + "step": 44000 + }, + { + "epoch": 25.616996507566938, + "grad_norm": 0.16137968003749847, + "learning_rate": 1.8194671387626744e-05, + "loss": -0.899, + "step": 44010 + }, + { + "epoch": 25.622817229336437, + "grad_norm": 0.14561280608177185, + "learning_rate": 1.8173412564286276e-05, + "loss": -0.8995, + "step": 44020 + }, + { + "epoch": 25.628637951105937, + "grad_norm": 0.14247018098831177, + "learning_rate": 1.8152163409022697e-05, + "loss": -0.8962, + "step": 44030 + }, + { + "epoch": 25.634458672875436, + "grad_norm": 0.14477510750293732, + "learning_rate": 1.8130923928291023e-05, + "loss": -0.8996, + "step": 44040 + }, + { + "epoch": 25.640279394644935, + "grad_norm": 0.27179285883903503, + "learning_rate": 1.8109694128543163e-05, + "loss": -0.8979, + "step": 44050 + }, + { + "epoch": 25.646100116414434, + "grad_norm": 0.18149088323116302, + "learning_rate": 1.8088474016228237e-05, + "loss": -0.9008, + "step": 44060 + }, + { + "epoch": 25.651920838183933, + "grad_norm": 0.1808999478816986, + "learning_rate": 1.8067263597792328e-05, + "loss": -0.9004, + "step": 44070 + }, + { + "epoch": 25.657741559953433, + "grad_norm": 0.17480230331420898, + "learning_rate": 1.80460628796786e-05, + "loss": -0.9006, + "step": 44080 + }, + { + "epoch": 25.663562281722932, + "grad_norm": 0.23863227665424347, + "learning_rate": 1.8024871868327276e-05, + "loss": -0.9029, + "step": 44090 + }, + { + "epoch": 25.669383003492435, + "grad_norm": 0.13059446215629578, + "learning_rate": 1.8003690570175608e-05, + "loss": -0.9013, + "step": 44100 + }, + { + "epoch": 25.675203725261934, + "grad_norm": 0.1525738388299942, + "learning_rate": 1.7982518991657943e-05, + "loss": -0.9001, + "step": 44110 + }, + { + "epoch": 25.681024447031433, + "grad_norm": 0.15183000266551971, + "learning_rate": 1.7961357139205643e-05, + "loss": -0.9027, + "step": 44120 + }, + { + "epoch": 25.686845168800932, + "grad_norm": 0.15770679712295532, + "learning_rate": 1.7940205019247108e-05, + "loss": -0.9019, + "step": 44130 + }, + { + "epoch": 25.69266589057043, + "grad_norm": 0.20435687899589539, + "learning_rate": 1.79190626382078e-05, + "loss": -0.9006, + "step": 44140 + }, + { + "epoch": 25.69848661233993, + "grad_norm": 0.22770267724990845, + "learning_rate": 1.7897930002510215e-05, + "loss": -0.8977, + "step": 44150 + }, + { + "epoch": 25.70430733410943, + "grad_norm": 0.17930129170417786, + "learning_rate": 1.787680711857387e-05, + "loss": -0.8992, + "step": 44160 + }, + { + "epoch": 25.71012805587893, + "grad_norm": 0.1609847992658615, + "learning_rate": 1.7855693992815398e-05, + "loss": -0.9005, + "step": 44170 + }, + { + "epoch": 25.71594877764843, + "grad_norm": 0.15502429008483887, + "learning_rate": 1.7834590631648328e-05, + "loss": -0.9018, + "step": 44180 + }, + { + "epoch": 25.721769499417928, + "grad_norm": 0.25188785791397095, + "learning_rate": 1.7813497041483384e-05, + "loss": -0.9, + "step": 44190 + }, + { + "epoch": 25.727590221187427, + "grad_norm": 0.26399296522140503, + "learning_rate": 1.779241322872817e-05, + "loss": -0.9016, + "step": 44200 + }, + { + "epoch": 25.733410942956926, + "grad_norm": 0.16664022207260132, + "learning_rate": 1.777133919978744e-05, + "loss": -0.9016, + "step": 44210 + }, + { + "epoch": 25.739231664726425, + "grad_norm": 0.16371795535087585, + "learning_rate": 1.7750274961062912e-05, + "loss": -0.8951, + "step": 44220 + }, + { + "epoch": 25.745052386495924, + "grad_norm": 0.18301519751548767, + "learning_rate": 1.772922051895335e-05, + "loss": -0.9035, + "step": 44230 + }, + { + "epoch": 25.750873108265424, + "grad_norm": 0.11967914551496506, + "learning_rate": 1.770817587985453e-05, + "loss": -0.9031, + "step": 44240 + }, + { + "epoch": 25.756693830034923, + "grad_norm": 0.17736919224262238, + "learning_rate": 1.7687141050159246e-05, + "loss": -0.9026, + "step": 44250 + }, + { + "epoch": 25.762514551804422, + "grad_norm": 0.13769148290157318, + "learning_rate": 1.7666116036257375e-05, + "loss": -0.9038, + "step": 44260 + }, + { + "epoch": 25.768335273573925, + "grad_norm": 0.15802176296710968, + "learning_rate": 1.764510084453569e-05, + "loss": -0.9045, + "step": 44270 + }, + { + "epoch": 25.774155995343424, + "grad_norm": 0.18553519248962402, + "learning_rate": 1.76240954813781e-05, + "loss": -0.8993, + "step": 44280 + }, + { + "epoch": 25.779976717112923, + "grad_norm": 0.20421287417411804, + "learning_rate": 1.7603099953165476e-05, + "loss": -0.9037, + "step": 44290 + }, + { + "epoch": 25.785797438882422, + "grad_norm": 0.2080332189798355, + "learning_rate": 1.7582114266275683e-05, + "loss": -0.9008, + "step": 44300 + }, + { + "epoch": 25.79161816065192, + "grad_norm": 0.11586529016494751, + "learning_rate": 1.756113842708364e-05, + "loss": -0.9005, + "step": 44310 + }, + { + "epoch": 25.79743888242142, + "grad_norm": 0.15496917068958282, + "learning_rate": 1.7540172441961245e-05, + "loss": -0.9016, + "step": 44320 + }, + { + "epoch": 25.80325960419092, + "grad_norm": 0.13993003964424133, + "learning_rate": 1.7519216317277387e-05, + "loss": -0.9022, + "step": 44330 + }, + { + "epoch": 25.80908032596042, + "grad_norm": 0.2056097388267517, + "learning_rate": 1.7498270059398046e-05, + "loss": -0.9012, + "step": 44340 + }, + { + "epoch": 25.81490104772992, + "grad_norm": 0.18284575641155243, + "learning_rate": 1.7477333674686062e-05, + "loss": -0.9015, + "step": 44350 + }, + { + "epoch": 25.820721769499418, + "grad_norm": 0.1905841827392578, + "learning_rate": 1.745640716950142e-05, + "loss": -0.9004, + "step": 44360 + }, + { + "epoch": 25.826542491268917, + "grad_norm": 0.21671204268932343, + "learning_rate": 1.7435490550201017e-05, + "loss": -0.9015, + "step": 44370 + }, + { + "epoch": 25.832363213038416, + "grad_norm": 0.10787954926490784, + "learning_rate": 1.7414583823138762e-05, + "loss": -0.9058, + "step": 44380 + }, + { + "epoch": 25.838183934807915, + "grad_norm": 0.12228470295667648, + "learning_rate": 1.739368699466558e-05, + "loss": -0.9023, + "step": 44390 + }, + { + "epoch": 25.844004656577415, + "grad_norm": 0.13481484353542328, + "learning_rate": 1.737280007112935e-05, + "loss": -0.9031, + "step": 44400 + }, + { + "epoch": 25.849825378346914, + "grad_norm": 0.1456834375858307, + "learning_rate": 1.735192305887502e-05, + "loss": -0.9054, + "step": 44410 + }, + { + "epoch": 25.855646100116413, + "grad_norm": 0.13561305403709412, + "learning_rate": 1.733105596424441e-05, + "loss": -0.9025, + "step": 44420 + }, + { + "epoch": 25.861466821885912, + "grad_norm": 0.11761511117219925, + "learning_rate": 1.7310198793576437e-05, + "loss": -0.8985, + "step": 44430 + }, + { + "epoch": 25.867287543655415, + "grad_norm": 0.1711665242910385, + "learning_rate": 1.7289351553206952e-05, + "loss": -0.9002, + "step": 44440 + }, + { + "epoch": 25.873108265424914, + "grad_norm": 0.21862252056598663, + "learning_rate": 1.7268514249468788e-05, + "loss": -0.8984, + "step": 44450 + }, + { + "epoch": 25.878928987194413, + "grad_norm": 0.15989528596401215, + "learning_rate": 1.7247686888691765e-05, + "loss": -0.9024, + "step": 44460 + }, + { + "epoch": 25.884749708963913, + "grad_norm": 0.16217459738254547, + "learning_rate": 1.7226869477202694e-05, + "loss": -0.9019, + "step": 44470 + }, + { + "epoch": 25.890570430733412, + "grad_norm": 0.22466795146465302, + "learning_rate": 1.7206062021325336e-05, + "loss": -0.8984, + "step": 44480 + }, + { + "epoch": 25.89639115250291, + "grad_norm": 0.13362249732017517, + "learning_rate": 1.7185264527380502e-05, + "loss": -0.901, + "step": 44490 + }, + { + "epoch": 25.90221187427241, + "grad_norm": 0.1546936333179474, + "learning_rate": 1.716447700168584e-05, + "loss": -0.9038, + "step": 44500 + }, + { + "epoch": 25.90803259604191, + "grad_norm": 0.17165222764015198, + "learning_rate": 1.714369945055611e-05, + "loss": -0.8979, + "step": 44510 + }, + { + "epoch": 25.91385331781141, + "grad_norm": 0.1378115862607956, + "learning_rate": 1.7122931880302968e-05, + "loss": -0.9049, + "step": 44520 + }, + { + "epoch": 25.919674039580908, + "grad_norm": 0.28111082315444946, + "learning_rate": 1.710217429723505e-05, + "loss": -0.9001, + "step": 44530 + }, + { + "epoch": 25.925494761350407, + "grad_norm": 0.11401087790727615, + "learning_rate": 1.7081426707657972e-05, + "loss": -0.9005, + "step": 44540 + }, + { + "epoch": 25.931315483119906, + "grad_norm": 0.18489351868629456, + "learning_rate": 1.7060689117874275e-05, + "loss": -0.8991, + "step": 44550 + }, + { + "epoch": 25.937136204889406, + "grad_norm": 0.12630857527256012, + "learning_rate": 1.703996153418354e-05, + "loss": -0.9015, + "step": 44560 + }, + { + "epoch": 25.942956926658905, + "grad_norm": 0.20100398361682892, + "learning_rate": 1.7019243962882205e-05, + "loss": -0.9032, + "step": 44570 + }, + { + "epoch": 25.948777648428404, + "grad_norm": 0.12864486873149872, + "learning_rate": 1.6998536410263754e-05, + "loss": -0.905, + "step": 44580 + }, + { + "epoch": 25.954598370197903, + "grad_norm": 0.12079174816608429, + "learning_rate": 1.6977838882618596e-05, + "loss": -0.9025, + "step": 44590 + }, + { + "epoch": 25.960419091967402, + "grad_norm": 0.24346397817134857, + "learning_rate": 1.6957151386234088e-05, + "loss": -0.9025, + "step": 44600 + }, + { + "epoch": 25.9662398137369, + "grad_norm": 0.13108475506305695, + "learning_rate": 1.6936473927394536e-05, + "loss": -0.9043, + "step": 44610 + }, + { + "epoch": 25.972060535506404, + "grad_norm": 0.16830994188785553, + "learning_rate": 1.6915806512381222e-05, + "loss": -0.9013, + "step": 44620 + }, + { + "epoch": 25.977881257275904, + "grad_norm": 0.17529787123203278, + "learning_rate": 1.6895149147472344e-05, + "loss": -0.9017, + "step": 44630 + }, + { + "epoch": 25.983701979045403, + "grad_norm": 0.18474380671977997, + "learning_rate": 1.6874501838943073e-05, + "loss": -0.9004, + "step": 44640 + }, + { + "epoch": 25.989522700814902, + "grad_norm": 0.23914489150047302, + "learning_rate": 1.6853864593065506e-05, + "loss": -0.9031, + "step": 44650 + }, + { + "epoch": 25.9953434225844, + "grad_norm": 0.263138085603714, + "learning_rate": 1.683323741610871e-05, + "loss": -0.903, + "step": 44660 + }, + { + "epoch": 26.0011641443539, + "grad_norm": 0.24518807232379913, + "learning_rate": 1.6812620314338674e-05, + "loss": -0.9, + "step": 44670 + }, + { + "epoch": 26.0069848661234, + "grad_norm": 0.17231781780719757, + "learning_rate": 1.6792013294018326e-05, + "loss": -0.9042, + "step": 44680 + }, + { + "epoch": 26.0128055878929, + "grad_norm": 0.12328848987817764, + "learning_rate": 1.6771416361407526e-05, + "loss": -0.9023, + "step": 44690 + }, + { + "epoch": 26.018626309662398, + "grad_norm": 0.19853518903255463, + "learning_rate": 1.675082952276308e-05, + "loss": -0.9039, + "step": 44700 + }, + { + "epoch": 26.024447031431897, + "grad_norm": 0.22640854120254517, + "learning_rate": 1.6730252784338757e-05, + "loss": -0.9025, + "step": 44710 + }, + { + "epoch": 26.030267753201397, + "grad_norm": 0.14526993036270142, + "learning_rate": 1.6709686152385166e-05, + "loss": -0.903, + "step": 44720 + }, + { + "epoch": 26.036088474970896, + "grad_norm": 0.21856622397899628, + "learning_rate": 1.668912963314998e-05, + "loss": -0.9017, + "step": 44730 + }, + { + "epoch": 26.041909196740395, + "grad_norm": 0.14481927454471588, + "learning_rate": 1.6668583232877653e-05, + "loss": -0.9023, + "step": 44740 + }, + { + "epoch": 26.047729918509894, + "grad_norm": 0.13521859049797058, + "learning_rate": 1.6648046957809698e-05, + "loss": -0.8994, + "step": 44750 + }, + { + "epoch": 26.053550640279393, + "grad_norm": 0.12624657154083252, + "learning_rate": 1.6627520814184462e-05, + "loss": -0.9013, + "step": 44760 + }, + { + "epoch": 26.059371362048893, + "grad_norm": 0.143196240067482, + "learning_rate": 1.660700480823726e-05, + "loss": -0.9035, + "step": 44770 + }, + { + "epoch": 26.065192083818392, + "grad_norm": 0.16166266798973083, + "learning_rate": 1.65864989462003e-05, + "loss": -0.9034, + "step": 44780 + }, + { + "epoch": 26.07101280558789, + "grad_norm": 0.18452376127243042, + "learning_rate": 1.656600323430273e-05, + "loss": -0.8999, + "step": 44790 + }, + { + "epoch": 26.076833527357394, + "grad_norm": 0.18314135074615479, + "learning_rate": 1.654551767877059e-05, + "loss": -0.9051, + "step": 44800 + }, + { + "epoch": 26.082654249126893, + "grad_norm": 0.2078206092119217, + "learning_rate": 1.6525042285826874e-05, + "loss": -0.8994, + "step": 44810 + }, + { + "epoch": 26.088474970896392, + "grad_norm": 0.18336082994937897, + "learning_rate": 1.6504577061691468e-05, + "loss": -0.8985, + "step": 44820 + }, + { + "epoch": 26.09429569266589, + "grad_norm": 0.20337426662445068, + "learning_rate": 1.6484122012581143e-05, + "loss": -0.9003, + "step": 44830 + }, + { + "epoch": 26.10011641443539, + "grad_norm": 0.12845034897327423, + "learning_rate": 1.6463677144709623e-05, + "loss": -0.9039, + "step": 44840 + }, + { + "epoch": 26.10593713620489, + "grad_norm": 0.1267399936914444, + "learning_rate": 1.6443242464287493e-05, + "loss": -0.9036, + "step": 44850 + }, + { + "epoch": 26.11175785797439, + "grad_norm": 0.15360620617866516, + "learning_rate": 1.642281797752232e-05, + "loss": -0.9037, + "step": 44860 + }, + { + "epoch": 26.11757857974389, + "grad_norm": 0.18220213055610657, + "learning_rate": 1.6402403690618456e-05, + "loss": -0.9045, + "step": 44870 + }, + { + "epoch": 26.123399301513388, + "grad_norm": 0.12694096565246582, + "learning_rate": 1.6381999609777295e-05, + "loss": -0.9063, + "step": 44880 + }, + { + "epoch": 26.129220023282887, + "grad_norm": 0.14208658039569855, + "learning_rate": 1.6361605741196983e-05, + "loss": -0.8941, + "step": 44890 + }, + { + "epoch": 26.135040745052386, + "grad_norm": 0.19661974906921387, + "learning_rate": 1.63412220910727e-05, + "loss": -0.9018, + "step": 44900 + }, + { + "epoch": 26.140861466821885, + "grad_norm": 0.11467096954584122, + "learning_rate": 1.6320848665596433e-05, + "loss": -0.902, + "step": 44910 + }, + { + "epoch": 26.146682188591384, + "grad_norm": 0.1357603222131729, + "learning_rate": 1.6300485470957095e-05, + "loss": -0.9006, + "step": 44920 + }, + { + "epoch": 26.152502910360884, + "grad_norm": 0.16231565177440643, + "learning_rate": 1.6280132513340483e-05, + "loss": -0.904, + "step": 44930 + }, + { + "epoch": 26.158323632130383, + "grad_norm": 0.15253892540931702, + "learning_rate": 1.62597897989293e-05, + "loss": -0.9058, + "step": 44940 + }, + { + "epoch": 26.164144353899882, + "grad_norm": 0.14945118129253387, + "learning_rate": 1.623945733390309e-05, + "loss": -0.9028, + "step": 44950 + }, + { + "epoch": 26.16996507566938, + "grad_norm": 0.19902539253234863, + "learning_rate": 1.6219135124438374e-05, + "loss": -0.9045, + "step": 44960 + }, + { + "epoch": 26.175785797438884, + "grad_norm": 0.14566928148269653, + "learning_rate": 1.6198823176708465e-05, + "loss": -0.9066, + "step": 44970 + }, + { + "epoch": 26.181606519208383, + "grad_norm": 0.2425704449415207, + "learning_rate": 1.6178521496883613e-05, + "loss": -0.9005, + "step": 44980 + }, + { + "epoch": 26.187427240977883, + "grad_norm": 0.20865987241268158, + "learning_rate": 1.6158230091130926e-05, + "loss": -0.9045, + "step": 44990 + }, + { + "epoch": 26.19324796274738, + "grad_norm": 0.1540181189775467, + "learning_rate": 1.613794896561438e-05, + "loss": -0.9064, + "step": 45000 + }, + { + "epoch": 26.19906868451688, + "grad_norm": 0.23082074522972107, + "learning_rate": 1.6117678126494894e-05, + "loss": -0.9034, + "step": 45010 + }, + { + "epoch": 26.20488940628638, + "grad_norm": 0.172967329621315, + "learning_rate": 1.6097417579930153e-05, + "loss": -0.8982, + "step": 45020 + }, + { + "epoch": 26.21071012805588, + "grad_norm": 0.16967914998531342, + "learning_rate": 1.6077167332074834e-05, + "loss": -0.901, + "step": 45030 + }, + { + "epoch": 26.21653084982538, + "grad_norm": 0.16411367058753967, + "learning_rate": 1.605692738908037e-05, + "loss": -0.9057, + "step": 45040 + }, + { + "epoch": 26.222351571594878, + "grad_norm": 0.13190488517284393, + "learning_rate": 1.6036697757095176e-05, + "loss": -0.9022, + "step": 45050 + }, + { + "epoch": 26.228172293364377, + "grad_norm": 0.12421392649412155, + "learning_rate": 1.6016478442264428e-05, + "loss": -0.904, + "step": 45060 + }, + { + "epoch": 26.233993015133876, + "grad_norm": 0.15398389101028442, + "learning_rate": 1.599626945073026e-05, + "loss": -0.9044, + "step": 45070 + }, + { + "epoch": 26.239813736903375, + "grad_norm": 0.2252531349658966, + "learning_rate": 1.597607078863162e-05, + "loss": -0.9054, + "step": 45080 + }, + { + "epoch": 26.245634458672875, + "grad_norm": 0.13632124662399292, + "learning_rate": 1.595588246210432e-05, + "loss": -0.9049, + "step": 45090 + }, + { + "epoch": 26.251455180442374, + "grad_norm": 0.17892958223819733, + "learning_rate": 1.5935704477281048e-05, + "loss": -0.9069, + "step": 45100 + }, + { + "epoch": 26.257275902211873, + "grad_norm": 0.18510201573371887, + "learning_rate": 1.5915536840291323e-05, + "loss": -0.9059, + "step": 45110 + }, + { + "epoch": 26.263096623981372, + "grad_norm": 0.10917680710554123, + "learning_rate": 1.5895379557261576e-05, + "loss": -0.906, + "step": 45120 + }, + { + "epoch": 26.26891734575087, + "grad_norm": 0.2147781252861023, + "learning_rate": 1.5875232634315033e-05, + "loss": -0.9039, + "step": 45130 + }, + { + "epoch": 26.274738067520374, + "grad_norm": 0.16041745245456696, + "learning_rate": 1.5855096077571812e-05, + "loss": -0.9018, + "step": 45140 + }, + { + "epoch": 26.280558789289874, + "grad_norm": 0.20287580788135529, + "learning_rate": 1.5834969893148855e-05, + "loss": -0.9043, + "step": 45150 + }, + { + "epoch": 26.286379511059373, + "grad_norm": 0.16251981258392334, + "learning_rate": 1.581485408715997e-05, + "loss": -0.9062, + "step": 45160 + }, + { + "epoch": 26.292200232828872, + "grad_norm": 0.15144729614257812, + "learning_rate": 1.5794748665715785e-05, + "loss": -0.9046, + "step": 45170 + }, + { + "epoch": 26.29802095459837, + "grad_norm": 0.16405853629112244, + "learning_rate": 1.5774653634923857e-05, + "loss": -0.9066, + "step": 45180 + }, + { + "epoch": 26.30384167636787, + "grad_norm": 0.1574004739522934, + "learning_rate": 1.575456900088845e-05, + "loss": -0.9055, + "step": 45190 + }, + { + "epoch": 26.30966239813737, + "grad_norm": 0.09662231802940369, + "learning_rate": 1.5734494769710816e-05, + "loss": -0.901, + "step": 45200 + }, + { + "epoch": 26.31548311990687, + "grad_norm": 0.19008755683898926, + "learning_rate": 1.5714430947488912e-05, + "loss": -0.9082, + "step": 45210 + }, + { + "epoch": 26.321303841676368, + "grad_norm": 0.1978907287120819, + "learning_rate": 1.5694377540317645e-05, + "loss": -0.9024, + "step": 45220 + }, + { + "epoch": 26.327124563445867, + "grad_norm": 0.21955063939094543, + "learning_rate": 1.5674334554288694e-05, + "loss": -0.9076, + "step": 45230 + }, + { + "epoch": 26.332945285215366, + "grad_norm": 0.1727902740240097, + "learning_rate": 1.5654301995490582e-05, + "loss": -0.9056, + "step": 45240 + }, + { + "epoch": 26.338766006984866, + "grad_norm": 0.16749797761440277, + "learning_rate": 1.5634279870008685e-05, + "loss": -0.9077, + "step": 45250 + }, + { + "epoch": 26.344586728754365, + "grad_norm": 0.11758963763713837, + "learning_rate": 1.5614268183925174e-05, + "loss": -0.9045, + "step": 45260 + }, + { + "epoch": 26.350407450523864, + "grad_norm": 0.1228354349732399, + "learning_rate": 1.5594266943319097e-05, + "loss": -0.8972, + "step": 45270 + }, + { + "epoch": 26.356228172293363, + "grad_norm": 0.14734113216400146, + "learning_rate": 1.5574276154266294e-05, + "loss": -0.9069, + "step": 45280 + }, + { + "epoch": 26.362048894062863, + "grad_norm": 0.11730772256851196, + "learning_rate": 1.5554295822839437e-05, + "loss": -0.9063, + "step": 45290 + }, + { + "epoch": 26.36786961583236, + "grad_norm": 0.1903197169303894, + "learning_rate": 1.5534325955108025e-05, + "loss": -0.9079, + "step": 45300 + }, + { + "epoch": 26.37369033760186, + "grad_norm": 0.11666873842477798, + "learning_rate": 1.5514366557138373e-05, + "loss": -0.9061, + "step": 45310 + }, + { + "epoch": 26.379511059371364, + "grad_norm": 0.12967248260974884, + "learning_rate": 1.5494417634993602e-05, + "loss": -0.9035, + "step": 45320 + }, + { + "epoch": 26.385331781140863, + "grad_norm": 0.24469593167304993, + "learning_rate": 1.547447919473372e-05, + "loss": -0.9017, + "step": 45330 + }, + { + "epoch": 26.391152502910362, + "grad_norm": 0.14849571883678436, + "learning_rate": 1.5454551242415434e-05, + "loss": -0.9074, + "step": 45340 + }, + { + "epoch": 26.39697322467986, + "grad_norm": 0.14963647723197937, + "learning_rate": 1.543463378409239e-05, + "loss": -0.9063, + "step": 45350 + }, + { + "epoch": 26.40279394644936, + "grad_norm": 0.15437722206115723, + "learning_rate": 1.541472682581493e-05, + "loss": -0.8986, + "step": 45360 + }, + { + "epoch": 26.40861466821886, + "grad_norm": 0.16761434078216553, + "learning_rate": 1.5394830373630298e-05, + "loss": -0.905, + "step": 45370 + }, + { + "epoch": 26.41443538998836, + "grad_norm": 0.2150658369064331, + "learning_rate": 1.5374944433582506e-05, + "loss": -0.904, + "step": 45380 + }, + { + "epoch": 26.42025611175786, + "grad_norm": 0.1419554054737091, + "learning_rate": 1.5355069011712375e-05, + "loss": -0.9031, + "step": 45390 + }, + { + "epoch": 26.426076833527357, + "grad_norm": 0.23524726927280426, + "learning_rate": 1.5335204114057526e-05, + "loss": -0.8989, + "step": 45400 + }, + { + "epoch": 26.431897555296857, + "grad_norm": 0.1655990183353424, + "learning_rate": 1.5315349746652387e-05, + "loss": -0.9052, + "step": 45410 + }, + { + "epoch": 26.437718277066356, + "grad_norm": 0.12788672745227814, + "learning_rate": 1.5295505915528212e-05, + "loss": -0.9058, + "step": 45420 + }, + { + "epoch": 26.443538998835855, + "grad_norm": 0.15461833775043488, + "learning_rate": 1.5275672626713024e-05, + "loss": -0.9013, + "step": 45430 + }, + { + "epoch": 26.449359720605354, + "grad_norm": 0.147592693567276, + "learning_rate": 1.5255849886231643e-05, + "loss": -0.9036, + "step": 45440 + }, + { + "epoch": 26.455180442374854, + "grad_norm": 0.12829340994358063, + "learning_rate": 1.523603770010571e-05, + "loss": -0.9022, + "step": 45450 + }, + { + "epoch": 26.461001164144353, + "grad_norm": 0.26971668004989624, + "learning_rate": 1.521623607435363e-05, + "loss": -0.9078, + "step": 45460 + }, + { + "epoch": 26.466821885913852, + "grad_norm": 0.12286767363548279, + "learning_rate": 1.5196445014990612e-05, + "loss": -0.9057, + "step": 45470 + }, + { + "epoch": 26.47264260768335, + "grad_norm": 0.14054276049137115, + "learning_rate": 1.5176664528028672e-05, + "loss": -0.9061, + "step": 45480 + }, + { + "epoch": 26.47846332945285, + "grad_norm": 0.122809499502182, + "learning_rate": 1.5156894619476574e-05, + "loss": -0.906, + "step": 45490 + }, + { + "epoch": 26.484284051222353, + "grad_norm": 0.12261772900819778, + "learning_rate": 1.5137135295339938e-05, + "loss": -0.9008, + "step": 45500 + }, + { + "epoch": 26.490104772991852, + "grad_norm": 0.15882077813148499, + "learning_rate": 1.5117386561621073e-05, + "loss": -0.9083, + "step": 45510 + }, + { + "epoch": 26.49592549476135, + "grad_norm": 0.15479573607444763, + "learning_rate": 1.5097648424319167e-05, + "loss": -0.9053, + "step": 45520 + }, + { + "epoch": 26.50174621653085, + "grad_norm": 0.1308288723230362, + "learning_rate": 1.5077920889430119e-05, + "loss": -0.9045, + "step": 45530 + }, + { + "epoch": 26.50756693830035, + "grad_norm": 0.165984645485878, + "learning_rate": 1.5058203962946644e-05, + "loss": -0.9054, + "step": 45540 + }, + { + "epoch": 26.51338766006985, + "grad_norm": 0.18286146223545074, + "learning_rate": 1.503849765085822e-05, + "loss": -0.9079, + "step": 45550 + }, + { + "epoch": 26.51920838183935, + "grad_norm": 0.16409769654273987, + "learning_rate": 1.501880195915109e-05, + "loss": -0.903, + "step": 45560 + }, + { + "epoch": 26.525029103608848, + "grad_norm": 0.15661077201366425, + "learning_rate": 1.499911689380833e-05, + "loss": -0.899, + "step": 45570 + }, + { + "epoch": 26.530849825378347, + "grad_norm": 0.10852675884962082, + "learning_rate": 1.4979442460809683e-05, + "loss": -0.9101, + "step": 45580 + }, + { + "epoch": 26.536670547147846, + "grad_norm": 0.1301882565021515, + "learning_rate": 1.4959778666131763e-05, + "loss": -0.9049, + "step": 45590 + }, + { + "epoch": 26.542491268917345, + "grad_norm": 0.24195288121700287, + "learning_rate": 1.4940125515747905e-05, + "loss": -0.9046, + "step": 45600 + }, + { + "epoch": 26.548311990686845, + "grad_norm": 0.17425595223903656, + "learning_rate": 1.4920483015628211e-05, + "loss": -0.9062, + "step": 45610 + }, + { + "epoch": 26.554132712456344, + "grad_norm": 0.21477054059505463, + "learning_rate": 1.490085117173956e-05, + "loss": -0.9035, + "step": 45620 + }, + { + "epoch": 26.559953434225843, + "grad_norm": 0.17467063665390015, + "learning_rate": 1.488122999004558e-05, + "loss": -0.9084, + "step": 45630 + }, + { + "epoch": 26.565774155995342, + "grad_norm": 0.12266865372657776, + "learning_rate": 1.486161947650666e-05, + "loss": -0.9041, + "step": 45640 + }, + { + "epoch": 26.57159487776484, + "grad_norm": 0.18349255621433258, + "learning_rate": 1.4842019637079995e-05, + "loss": -0.9056, + "step": 45650 + }, + { + "epoch": 26.57741559953434, + "grad_norm": 0.2286103069782257, + "learning_rate": 1.482243047771944e-05, + "loss": -0.9057, + "step": 45660 + }, + { + "epoch": 26.583236321303843, + "grad_norm": 0.13224361836910248, + "learning_rate": 1.4802852004375712e-05, + "loss": -0.9087, + "step": 45670 + }, + { + "epoch": 26.589057043073343, + "grad_norm": 0.10444850474596024, + "learning_rate": 1.4783284222996218e-05, + "loss": -0.9061, + "step": 45680 + }, + { + "epoch": 26.594877764842842, + "grad_norm": 0.10349296778440475, + "learning_rate": 1.4763727139525135e-05, + "loss": -0.9076, + "step": 45690 + }, + { + "epoch": 26.60069848661234, + "grad_norm": 0.16774600744247437, + "learning_rate": 1.4744180759903392e-05, + "loss": -0.9072, + "step": 45700 + }, + { + "epoch": 26.60651920838184, + "grad_norm": 0.23411133885383606, + "learning_rate": 1.4724645090068635e-05, + "loss": -0.9047, + "step": 45710 + }, + { + "epoch": 26.61233993015134, + "grad_norm": 0.14863178133964539, + "learning_rate": 1.4705120135955341e-05, + "loss": -0.9076, + "step": 45720 + }, + { + "epoch": 26.61816065192084, + "grad_norm": 0.10922718048095703, + "learning_rate": 1.4685605903494614e-05, + "loss": -0.9065, + "step": 45730 + }, + { + "epoch": 26.623981373690338, + "grad_norm": 0.19279982149600983, + "learning_rate": 1.46661023986144e-05, + "loss": -0.9059, + "step": 45740 + }, + { + "epoch": 26.629802095459837, + "grad_norm": 0.16423402726650238, + "learning_rate": 1.4646609627239344e-05, + "loss": -0.904, + "step": 45750 + }, + { + "epoch": 26.635622817229336, + "grad_norm": 0.1260315328836441, + "learning_rate": 1.4627127595290835e-05, + "loss": -0.9041, + "step": 45760 + }, + { + "epoch": 26.641443538998836, + "grad_norm": 0.22127339243888855, + "learning_rate": 1.460765630868699e-05, + "loss": -0.9076, + "step": 45770 + }, + { + "epoch": 26.647264260768335, + "grad_norm": 0.15456324815750122, + "learning_rate": 1.4588195773342678e-05, + "loss": -0.9042, + "step": 45780 + }, + { + "epoch": 26.653084982537834, + "grad_norm": 0.1572858989238739, + "learning_rate": 1.4568745995169485e-05, + "loss": -0.9076, + "step": 45790 + }, + { + "epoch": 26.658905704307333, + "grad_norm": 0.17465317249298096, + "learning_rate": 1.4549306980075778e-05, + "loss": -0.9073, + "step": 45800 + }, + { + "epoch": 26.664726426076832, + "grad_norm": 0.1722448319196701, + "learning_rate": 1.4529878733966557e-05, + "loss": -0.9087, + "step": 45810 + }, + { + "epoch": 26.67054714784633, + "grad_norm": 0.11421433091163635, + "learning_rate": 1.4510461262743658e-05, + "loss": -0.9037, + "step": 45820 + }, + { + "epoch": 26.67636786961583, + "grad_norm": 0.16881956160068512, + "learning_rate": 1.4491054572305585e-05, + "loss": -0.903, + "step": 45830 + }, + { + "epoch": 26.682188591385334, + "grad_norm": 0.1391844004392624, + "learning_rate": 1.4471658668547566e-05, + "loss": -0.9093, + "step": 45840 + }, + { + "epoch": 26.688009313154833, + "grad_norm": 0.2235574722290039, + "learning_rate": 1.4452273557361579e-05, + "loss": -0.9031, + "step": 45850 + }, + { + "epoch": 26.693830034924332, + "grad_norm": 0.19576382637023926, + "learning_rate": 1.4432899244636282e-05, + "loss": -0.9087, + "step": 45860 + }, + { + "epoch": 26.69965075669383, + "grad_norm": 0.3001061677932739, + "learning_rate": 1.4413535736257134e-05, + "loss": -0.907, + "step": 45870 + }, + { + "epoch": 26.70547147846333, + "grad_norm": 0.15079668164253235, + "learning_rate": 1.439418303810619e-05, + "loss": -0.9085, + "step": 45880 + }, + { + "epoch": 26.71129220023283, + "grad_norm": 0.24969621002674103, + "learning_rate": 1.4374841156062352e-05, + "loss": -0.9055, + "step": 45890 + }, + { + "epoch": 26.71711292200233, + "grad_norm": 0.23943881690502167, + "learning_rate": 1.4355510096001112e-05, + "loss": -0.9085, + "step": 45900 + }, + { + "epoch": 26.722933643771828, + "grad_norm": 0.15335644781589508, + "learning_rate": 1.4336189863794786e-05, + "loss": -0.9049, + "step": 45910 + }, + { + "epoch": 26.728754365541327, + "grad_norm": 0.12585611641407013, + "learning_rate": 1.4316880465312327e-05, + "loss": -0.9082, + "step": 45920 + }, + { + "epoch": 26.734575087310827, + "grad_norm": 0.14489194750785828, + "learning_rate": 1.4297581906419426e-05, + "loss": -0.908, + "step": 45930 + }, + { + "epoch": 26.740395809080326, + "grad_norm": 0.1264864206314087, + "learning_rate": 1.4278294192978475e-05, + "loss": -0.9095, + "step": 45940 + }, + { + "epoch": 26.746216530849825, + "grad_norm": 0.22275523841381073, + "learning_rate": 1.4259017330848574e-05, + "loss": -0.9079, + "step": 45950 + }, + { + "epoch": 26.752037252619324, + "grad_norm": 0.14767040312290192, + "learning_rate": 1.4239751325885498e-05, + "loss": -0.9085, + "step": 45960 + }, + { + "epoch": 26.757857974388823, + "grad_norm": 0.14452306926250458, + "learning_rate": 1.4220496183941795e-05, + "loss": -0.909, + "step": 45970 + }, + { + "epoch": 26.763678696158323, + "grad_norm": 0.2125113159418106, + "learning_rate": 1.4201251910866648e-05, + "loss": -0.9072, + "step": 45980 + }, + { + "epoch": 26.769499417927822, + "grad_norm": 0.1639367938041687, + "learning_rate": 1.4182018512505957e-05, + "loss": -0.908, + "step": 45990 + }, + { + "epoch": 26.77532013969732, + "grad_norm": 0.19993168115615845, + "learning_rate": 1.4162795994702327e-05, + "loss": -0.9056, + "step": 46000 + }, + { + "epoch": 26.78114086146682, + "grad_norm": 0.16042105853557587, + "learning_rate": 1.4143584363295032e-05, + "loss": -0.9061, + "step": 46010 + }, + { + "epoch": 26.78696158323632, + "grad_norm": 0.20583273470401764, + "learning_rate": 1.4124383624120101e-05, + "loss": -0.9061, + "step": 46020 + }, + { + "epoch": 26.792782305005822, + "grad_norm": 0.264212042093277, + "learning_rate": 1.4105193783010151e-05, + "loss": -0.905, + "step": 46030 + }, + { + "epoch": 26.79860302677532, + "grad_norm": 0.19119049608707428, + "learning_rate": 1.4086014845794621e-05, + "loss": -0.9068, + "step": 46040 + }, + { + "epoch": 26.80442374854482, + "grad_norm": 0.19850145280361176, + "learning_rate": 1.4066846818299489e-05, + "loss": -0.9072, + "step": 46050 + }, + { + "epoch": 26.81024447031432, + "grad_norm": 0.1133117750287056, + "learning_rate": 1.4047689706347555e-05, + "loss": -0.9059, + "step": 46060 + }, + { + "epoch": 26.81606519208382, + "grad_norm": 0.19226863980293274, + "learning_rate": 1.402854351575822e-05, + "loss": -0.9059, + "step": 46070 + }, + { + "epoch": 26.82188591385332, + "grad_norm": 0.20253528654575348, + "learning_rate": 1.4009408252347588e-05, + "loss": -0.9072, + "step": 46080 + }, + { + "epoch": 26.827706635622818, + "grad_norm": 0.19544467329978943, + "learning_rate": 1.399028392192846e-05, + "loss": -0.9061, + "step": 46090 + }, + { + "epoch": 26.833527357392317, + "grad_norm": 0.09557852149009705, + "learning_rate": 1.397117053031029e-05, + "loss": -0.9024, + "step": 46100 + }, + { + "epoch": 26.839348079161816, + "grad_norm": 0.18083523213863373, + "learning_rate": 1.3952068083299213e-05, + "loss": -0.9079, + "step": 46110 + }, + { + "epoch": 26.845168800931315, + "grad_norm": 0.15630535781383514, + "learning_rate": 1.3932976586698082e-05, + "loss": -0.9074, + "step": 46120 + }, + { + "epoch": 26.850989522700814, + "grad_norm": 0.20414091646671295, + "learning_rate": 1.3913896046306363e-05, + "loss": -0.9095, + "step": 46130 + }, + { + "epoch": 26.856810244470314, + "grad_norm": 0.2078181356191635, + "learning_rate": 1.389482646792023e-05, + "loss": -0.9015, + "step": 46140 + }, + { + "epoch": 26.862630966239813, + "grad_norm": 0.16789180040359497, + "learning_rate": 1.387576785733251e-05, + "loss": -0.9048, + "step": 46150 + }, + { + "epoch": 26.868451688009312, + "grad_norm": 0.2383832186460495, + "learning_rate": 1.3856720220332703e-05, + "loss": -0.901, + "step": 46160 + }, + { + "epoch": 26.87427240977881, + "grad_norm": 0.1781298667192459, + "learning_rate": 1.383768356270701e-05, + "loss": -0.9083, + "step": 46170 + }, + { + "epoch": 26.88009313154831, + "grad_norm": 0.2301635444164276, + "learning_rate": 1.3818657890238207e-05, + "loss": -0.9062, + "step": 46180 + }, + { + "epoch": 26.88591385331781, + "grad_norm": 0.16831891238689423, + "learning_rate": 1.3799643208705859e-05, + "loss": -0.9087, + "step": 46190 + }, + { + "epoch": 26.891734575087312, + "grad_norm": 0.16371174156665802, + "learning_rate": 1.3780639523886058e-05, + "loss": -0.9013, + "step": 46200 + }, + { + "epoch": 26.89755529685681, + "grad_norm": 0.17336229979991913, + "learning_rate": 1.3761646841551668e-05, + "loss": -0.9054, + "step": 46210 + }, + { + "epoch": 26.90337601862631, + "grad_norm": 0.14127035439014435, + "learning_rate": 1.3742665167472146e-05, + "loss": -0.9093, + "step": 46220 + }, + { + "epoch": 26.90919674039581, + "grad_norm": 0.21148273348808289, + "learning_rate": 1.372369450741363e-05, + "loss": -0.9072, + "step": 46230 + }, + { + "epoch": 26.91501746216531, + "grad_norm": 0.10683184117078781, + "learning_rate": 1.3704734867138901e-05, + "loss": -0.9088, + "step": 46240 + }, + { + "epoch": 26.92083818393481, + "grad_norm": 0.11817853152751923, + "learning_rate": 1.36857862524074e-05, + "loss": -0.907, + "step": 46250 + }, + { + "epoch": 26.926658905704308, + "grad_norm": 0.14478178322315216, + "learning_rate": 1.3666848668975213e-05, + "loss": -0.9049, + "step": 46260 + }, + { + "epoch": 26.932479627473807, + "grad_norm": 0.18441177904605865, + "learning_rate": 1.3647922122595063e-05, + "loss": -0.9079, + "step": 46270 + }, + { + "epoch": 26.938300349243306, + "grad_norm": 0.15721912682056427, + "learning_rate": 1.3629006619016366e-05, + "loss": -0.9022, + "step": 46280 + }, + { + "epoch": 26.944121071012805, + "grad_norm": 0.13703621923923492, + "learning_rate": 1.3610102163985139e-05, + "loss": -0.9068, + "step": 46290 + }, + { + "epoch": 26.949941792782305, + "grad_norm": 0.12962517142295837, + "learning_rate": 1.3591208763244057e-05, + "loss": -0.9088, + "step": 46300 + }, + { + "epoch": 26.955762514551804, + "grad_norm": 0.11992615461349487, + "learning_rate": 1.3572326422532428e-05, + "loss": -0.9025, + "step": 46310 + }, + { + "epoch": 26.961583236321303, + "grad_norm": 0.18850590288639069, + "learning_rate": 1.355345514758622e-05, + "loss": -0.907, + "step": 46320 + }, + { + "epoch": 26.967403958090802, + "grad_norm": 0.16217787563800812, + "learning_rate": 1.3534594944138007e-05, + "loss": -0.9069, + "step": 46330 + }, + { + "epoch": 26.9732246798603, + "grad_norm": 0.24701622128486633, + "learning_rate": 1.3515745817917069e-05, + "loss": -0.908, + "step": 46340 + }, + { + "epoch": 26.9790454016298, + "grad_norm": 0.19055300951004028, + "learning_rate": 1.3496907774649208e-05, + "loss": -0.9088, + "step": 46350 + }, + { + "epoch": 26.9848661233993, + "grad_norm": 0.1611051708459854, + "learning_rate": 1.3478080820056987e-05, + "loss": -0.905, + "step": 46360 + }, + { + "epoch": 26.990686845168803, + "grad_norm": 0.14815177023410797, + "learning_rate": 1.3459264959859474e-05, + "loss": -0.9087, + "step": 46370 + }, + { + "epoch": 26.996507566938302, + "grad_norm": 0.19570867717266083, + "learning_rate": 1.3440460199772487e-05, + "loss": -0.9093, + "step": 46380 + }, + { + "epoch": 27.0023282887078, + "grad_norm": 0.2589295208454132, + "learning_rate": 1.3421666545508382e-05, + "loss": -0.9065, + "step": 46390 + }, + { + "epoch": 27.0081490104773, + "grad_norm": 0.1427881270647049, + "learning_rate": 1.3402884002776194e-05, + "loss": -0.9096, + "step": 46400 + }, + { + "epoch": 27.0139697322468, + "grad_norm": 0.1904592216014862, + "learning_rate": 1.3384112577281555e-05, + "loss": -0.9064, + "step": 46410 + }, + { + "epoch": 27.0197904540163, + "grad_norm": 0.1459612101316452, + "learning_rate": 1.3365352274726711e-05, + "loss": -0.9065, + "step": 46420 + }, + { + "epoch": 27.025611175785798, + "grad_norm": 0.17565129697322845, + "learning_rate": 1.3346603100810578e-05, + "loss": -0.9041, + "step": 46430 + }, + { + "epoch": 27.031431897555297, + "grad_norm": 0.15883313119411469, + "learning_rate": 1.3327865061228645e-05, + "loss": -0.9101, + "step": 46440 + }, + { + "epoch": 27.037252619324796, + "grad_norm": 0.1386631727218628, + "learning_rate": 1.330913816167304e-05, + "loss": -0.8996, + "step": 46450 + }, + { + "epoch": 27.043073341094296, + "grad_norm": 0.17823699116706848, + "learning_rate": 1.3290422407832492e-05, + "loss": -0.9057, + "step": 46460 + }, + { + "epoch": 27.048894062863795, + "grad_norm": 0.16669560968875885, + "learning_rate": 1.3271717805392354e-05, + "loss": -0.908, + "step": 46470 + }, + { + "epoch": 27.054714784633294, + "grad_norm": 0.12887218594551086, + "learning_rate": 1.3253024360034582e-05, + "loss": -0.9087, + "step": 46480 + }, + { + "epoch": 27.060535506402793, + "grad_norm": 0.10041362792253494, + "learning_rate": 1.323434207743779e-05, + "loss": -0.9039, + "step": 46490 + }, + { + "epoch": 27.066356228172292, + "grad_norm": 0.13868963718414307, + "learning_rate": 1.3215670963277105e-05, + "loss": -0.9059, + "step": 46500 + }, + { + "epoch": 27.07217694994179, + "grad_norm": 0.1289583146572113, + "learning_rate": 1.3197011023224376e-05, + "loss": -0.9127, + "step": 46510 + }, + { + "epoch": 27.07799767171129, + "grad_norm": 0.2643381953239441, + "learning_rate": 1.3178362262947941e-05, + "loss": -0.901, + "step": 46520 + }, + { + "epoch": 27.08381839348079, + "grad_norm": 0.1530427783727646, + "learning_rate": 1.3159724688112845e-05, + "loss": -0.9075, + "step": 46530 + }, + { + "epoch": 27.08963911525029, + "grad_norm": 0.24934197962284088, + "learning_rate": 1.3141098304380683e-05, + "loss": -0.909, + "step": 46540 + }, + { + "epoch": 27.095459837019792, + "grad_norm": 0.16303297877311707, + "learning_rate": 1.3122483117409651e-05, + "loss": -0.9047, + "step": 46550 + }, + { + "epoch": 27.10128055878929, + "grad_norm": 0.2425752878189087, + "learning_rate": 1.3103879132854552e-05, + "loss": -0.905, + "step": 46560 + }, + { + "epoch": 27.10710128055879, + "grad_norm": 0.13601192831993103, + "learning_rate": 1.3085286356366771e-05, + "loss": -0.907, + "step": 46570 + }, + { + "epoch": 27.11292200232829, + "grad_norm": 0.19126634299755096, + "learning_rate": 1.3066704793594337e-05, + "loss": -0.9067, + "step": 46580 + }, + { + "epoch": 27.11874272409779, + "grad_norm": 0.17387807369232178, + "learning_rate": 1.3048134450181816e-05, + "loss": -0.9081, + "step": 46590 + }, + { + "epoch": 27.124563445867288, + "grad_norm": 0.160071462392807, + "learning_rate": 1.3029575331770394e-05, + "loss": -0.9067, + "step": 46600 + }, + { + "epoch": 27.130384167636787, + "grad_norm": 0.24049833416938782, + "learning_rate": 1.3011027443997837e-05, + "loss": -0.9112, + "step": 46610 + }, + { + "epoch": 27.136204889406287, + "grad_norm": 0.1603558212518692, + "learning_rate": 1.2992490792498507e-05, + "loss": -0.908, + "step": 46620 + }, + { + "epoch": 27.142025611175786, + "grad_norm": 0.1539347767829895, + "learning_rate": 1.297396538290333e-05, + "loss": -0.907, + "step": 46630 + }, + { + "epoch": 27.147846332945285, + "grad_norm": 0.16767100989818573, + "learning_rate": 1.2955451220839888e-05, + "loss": -0.9118, + "step": 46640 + }, + { + "epoch": 27.153667054714784, + "grad_norm": 0.2736862301826477, + "learning_rate": 1.2936948311932223e-05, + "loss": -0.906, + "step": 46650 + }, + { + "epoch": 27.159487776484283, + "grad_norm": 0.15061573684215546, + "learning_rate": 1.2918456661801104e-05, + "loss": -0.9117, + "step": 46660 + }, + { + "epoch": 27.165308498253783, + "grad_norm": 0.11831748485565186, + "learning_rate": 1.2899976276063736e-05, + "loss": -0.9081, + "step": 46670 + }, + { + "epoch": 27.171129220023282, + "grad_norm": 0.17304320633411407, + "learning_rate": 1.2881507160334022e-05, + "loss": -0.9064, + "step": 46680 + }, + { + "epoch": 27.17694994179278, + "grad_norm": 0.1202363446354866, + "learning_rate": 1.286304932022238e-05, + "loss": -0.9112, + "step": 46690 + }, + { + "epoch": 27.18277066356228, + "grad_norm": 0.21698059141635895, + "learning_rate": 1.2844602761335806e-05, + "loss": -0.9065, + "step": 46700 + }, + { + "epoch": 27.18859138533178, + "grad_norm": 0.15559931099414825, + "learning_rate": 1.2826167489277885e-05, + "loss": -0.9097, + "step": 46710 + }, + { + "epoch": 27.194412107101282, + "grad_norm": 0.20066799223423004, + "learning_rate": 1.2807743509648745e-05, + "loss": -0.9115, + "step": 46720 + }, + { + "epoch": 27.20023282887078, + "grad_norm": 0.13137221336364746, + "learning_rate": 1.2789330828045149e-05, + "loss": -0.9065, + "step": 46730 + }, + { + "epoch": 27.20605355064028, + "grad_norm": 0.12020179629325867, + "learning_rate": 1.2770929450060332e-05, + "loss": -0.9049, + "step": 46740 + }, + { + "epoch": 27.21187427240978, + "grad_norm": 0.16420799493789673, + "learning_rate": 1.2752539381284184e-05, + "loss": -0.9055, + "step": 46750 + }, + { + "epoch": 27.21769499417928, + "grad_norm": 0.1826540231704712, + "learning_rate": 1.273416062730311e-05, + "loss": -0.9087, + "step": 46760 + }, + { + "epoch": 27.22351571594878, + "grad_norm": 0.12244300544261932, + "learning_rate": 1.2715793193700088e-05, + "loss": -0.9081, + "step": 46770 + }, + { + "epoch": 27.229336437718278, + "grad_norm": 0.12856335937976837, + "learning_rate": 1.2697437086054664e-05, + "loss": -0.9044, + "step": 46780 + }, + { + "epoch": 27.235157159487777, + "grad_norm": 0.26708874106407166, + "learning_rate": 1.2679092309942937e-05, + "loss": -0.9028, + "step": 46790 + }, + { + "epoch": 27.240977881257276, + "grad_norm": 0.1430046558380127, + "learning_rate": 1.266075887093755e-05, + "loss": -0.9126, + "step": 46800 + }, + { + "epoch": 27.246798603026775, + "grad_norm": 0.2508801221847534, + "learning_rate": 1.2642436774607757e-05, + "loss": -0.9073, + "step": 46810 + }, + { + "epoch": 27.252619324796274, + "grad_norm": 0.1241069883108139, + "learning_rate": 1.2624126026519278e-05, + "loss": -0.9123, + "step": 46820 + }, + { + "epoch": 27.258440046565774, + "grad_norm": 0.16908986866474152, + "learning_rate": 1.2605826632234474e-05, + "loss": -0.9093, + "step": 46830 + }, + { + "epoch": 27.264260768335273, + "grad_norm": 0.11288688331842422, + "learning_rate": 1.2587538597312198e-05, + "loss": -0.9098, + "step": 46840 + }, + { + "epoch": 27.270081490104772, + "grad_norm": 0.13267141580581665, + "learning_rate": 1.2569261927307884e-05, + "loss": -0.9041, + "step": 46850 + }, + { + "epoch": 27.27590221187427, + "grad_norm": 0.13112938404083252, + "learning_rate": 1.2550996627773493e-05, + "loss": -0.9086, + "step": 46860 + }, + { + "epoch": 27.28172293364377, + "grad_norm": 0.29286783933639526, + "learning_rate": 1.2532742704257527e-05, + "loss": -0.9068, + "step": 46870 + }, + { + "epoch": 27.28754365541327, + "grad_norm": 0.14779061079025269, + "learning_rate": 1.2514500162305087e-05, + "loss": -0.9104, + "step": 46880 + }, + { + "epoch": 27.29336437718277, + "grad_norm": 0.09489723294973373, + "learning_rate": 1.2496269007457728e-05, + "loss": -0.9077, + "step": 46890 + }, + { + "epoch": 27.29918509895227, + "grad_norm": 0.14635400474071503, + "learning_rate": 1.2478049245253625e-05, + "loss": -0.9117, + "step": 46900 + }, + { + "epoch": 27.30500582072177, + "grad_norm": 0.18830040097236633, + "learning_rate": 1.2459840881227459e-05, + "loss": -0.9071, + "step": 46910 + }, + { + "epoch": 27.31082654249127, + "grad_norm": 0.1583046168088913, + "learning_rate": 1.2441643920910435e-05, + "loss": -0.9081, + "step": 46920 + }, + { + "epoch": 27.31664726426077, + "grad_norm": 0.15653373301029205, + "learning_rate": 1.2423458369830322e-05, + "loss": -0.9112, + "step": 46930 + }, + { + "epoch": 27.32246798603027, + "grad_norm": 0.13022825121879578, + "learning_rate": 1.2405284233511406e-05, + "loss": -0.9077, + "step": 46940 + }, + { + "epoch": 27.328288707799768, + "grad_norm": 0.12178753316402435, + "learning_rate": 1.2387121517474487e-05, + "loss": -0.9105, + "step": 46950 + }, + { + "epoch": 27.334109429569267, + "grad_norm": 0.13452333211898804, + "learning_rate": 1.2368970227236975e-05, + "loss": -0.9081, + "step": 46960 + }, + { + "epoch": 27.339930151338766, + "grad_norm": 0.20323079824447632, + "learning_rate": 1.2350830368312688e-05, + "loss": -0.909, + "step": 46970 + }, + { + "epoch": 27.345750873108265, + "grad_norm": 0.2170499563217163, + "learning_rate": 1.2332701946212083e-05, + "loss": -0.907, + "step": 46980 + }, + { + "epoch": 27.351571594877765, + "grad_norm": 0.19262540340423584, + "learning_rate": 1.2314584966442077e-05, + "loss": -0.9069, + "step": 46990 + }, + { + "epoch": 27.357392316647264, + "grad_norm": 0.10018812865018845, + "learning_rate": 1.2296479434506136e-05, + "loss": -0.9091, + "step": 47000 + }, + { + "epoch": 27.363213038416763, + "grad_norm": 0.09829561412334442, + "learning_rate": 1.2278385355904232e-05, + "loss": -0.9125, + "step": 47010 + }, + { + "epoch": 27.369033760186262, + "grad_norm": 0.16111548244953156, + "learning_rate": 1.2260302736132867e-05, + "loss": -0.9133, + "step": 47020 + }, + { + "epoch": 27.37485448195576, + "grad_norm": 0.14569193124771118, + "learning_rate": 1.2242231580685098e-05, + "loss": -0.9077, + "step": 47030 + }, + { + "epoch": 27.38067520372526, + "grad_norm": 0.15395426750183105, + "learning_rate": 1.2224171895050413e-05, + "loss": -0.9081, + "step": 47040 + }, + { + "epoch": 27.38649592549476, + "grad_norm": 0.21509802341461182, + "learning_rate": 1.2206123684714903e-05, + "loss": -0.9092, + "step": 47050 + }, + { + "epoch": 27.39231664726426, + "grad_norm": 0.24803653359413147, + "learning_rate": 1.2188086955161132e-05, + "loss": -0.9095, + "step": 47060 + }, + { + "epoch": 27.398137369033762, + "grad_norm": 0.12137731909751892, + "learning_rate": 1.2170061711868175e-05, + "loss": -0.9105, + "step": 47070 + }, + { + "epoch": 27.40395809080326, + "grad_norm": 0.1368298977613449, + "learning_rate": 1.215204796031163e-05, + "loss": -0.909, + "step": 47080 + }, + { + "epoch": 27.40977881257276, + "grad_norm": 0.15048974752426147, + "learning_rate": 1.2134045705963599e-05, + "loss": -0.9128, + "step": 47090 + }, + { + "epoch": 27.41559953434226, + "grad_norm": 0.14603832364082336, + "learning_rate": 1.2116054954292689e-05, + "loss": -0.9111, + "step": 47100 + }, + { + "epoch": 27.42142025611176, + "grad_norm": 0.11973622441291809, + "learning_rate": 1.2098075710764011e-05, + "loss": -0.9095, + "step": 47110 + }, + { + "epoch": 27.427240977881258, + "grad_norm": 0.12881748378276825, + "learning_rate": 1.2080107980839183e-05, + "loss": -0.9089, + "step": 47120 + }, + { + "epoch": 27.433061699650757, + "grad_norm": 0.17233961820602417, + "learning_rate": 1.2062151769976343e-05, + "loss": -0.9084, + "step": 47130 + }, + { + "epoch": 27.438882421420256, + "grad_norm": 0.1346137523651123, + "learning_rate": 1.204420708363011e-05, + "loss": -0.9103, + "step": 47140 + }, + { + "epoch": 27.444703143189756, + "grad_norm": 0.12929081916809082, + "learning_rate": 1.2026273927251597e-05, + "loss": -0.9049, + "step": 47150 + }, + { + "epoch": 27.450523864959255, + "grad_norm": 0.2083767205476761, + "learning_rate": 1.2008352306288424e-05, + "loss": -0.9089, + "step": 47160 + }, + { + "epoch": 27.456344586728754, + "grad_norm": 0.19664974510669708, + "learning_rate": 1.1990442226184695e-05, + "loss": -0.9119, + "step": 47170 + }, + { + "epoch": 27.462165308498253, + "grad_norm": 0.2938607633113861, + "learning_rate": 1.1972543692381066e-05, + "loss": -0.9084, + "step": 47180 + }, + { + "epoch": 27.467986030267753, + "grad_norm": 0.17660462856292725, + "learning_rate": 1.1954656710314576e-05, + "loss": -0.9097, + "step": 47190 + }, + { + "epoch": 27.47380675203725, + "grad_norm": 0.18063358962535858, + "learning_rate": 1.1936781285418875e-05, + "loss": -0.9102, + "step": 47200 + }, + { + "epoch": 27.47962747380675, + "grad_norm": 0.13784624636173248, + "learning_rate": 1.1918917423123993e-05, + "loss": -0.9082, + "step": 47210 + }, + { + "epoch": 27.48544819557625, + "grad_norm": 0.18717707693576813, + "learning_rate": 1.1901065128856537e-05, + "loss": -0.9072, + "step": 47220 + }, + { + "epoch": 27.49126891734575, + "grad_norm": 0.1806868314743042, + "learning_rate": 1.1883224408039551e-05, + "loss": -0.9122, + "step": 47230 + }, + { + "epoch": 27.49708963911525, + "grad_norm": 0.20412111282348633, + "learning_rate": 1.1865395266092578e-05, + "loss": -0.91, + "step": 47240 + }, + { + "epoch": 27.50291036088475, + "grad_norm": 0.14323030412197113, + "learning_rate": 1.1847577708431633e-05, + "loss": -0.9092, + "step": 47250 + }, + { + "epoch": 27.50873108265425, + "grad_norm": 0.13794508576393127, + "learning_rate": 1.1829771740469225e-05, + "loss": -0.9118, + "step": 47260 + }, + { + "epoch": 27.51455180442375, + "grad_norm": 0.09846614301204681, + "learning_rate": 1.1811977367614324e-05, + "loss": -0.9133, + "step": 47270 + }, + { + "epoch": 27.52037252619325, + "grad_norm": 0.13081300258636475, + "learning_rate": 1.1794194595272412e-05, + "loss": -0.8992, + "step": 47280 + }, + { + "epoch": 27.52619324796275, + "grad_norm": 0.16442309319972992, + "learning_rate": 1.1776423428845423e-05, + "loss": -0.9109, + "step": 47290 + }, + { + "epoch": 27.532013969732247, + "grad_norm": 0.12700335681438446, + "learning_rate": 1.1758663873731756e-05, + "loss": -0.9111, + "step": 47300 + }, + { + "epoch": 27.537834691501747, + "grad_norm": 0.18307413160800934, + "learning_rate": 1.1740915935326302e-05, + "loss": -0.9076, + "step": 47310 + }, + { + "epoch": 27.543655413271246, + "grad_norm": 0.13114766776561737, + "learning_rate": 1.1723179619020396e-05, + "loss": -0.9116, + "step": 47320 + }, + { + "epoch": 27.549476135040745, + "grad_norm": 0.16384834051132202, + "learning_rate": 1.1705454930201914e-05, + "loss": -0.9098, + "step": 47330 + }, + { + "epoch": 27.555296856810244, + "grad_norm": 0.10680359601974487, + "learning_rate": 1.1687741874255087e-05, + "loss": -0.9071, + "step": 47340 + }, + { + "epoch": 27.561117578579744, + "grad_norm": 0.14882852137088776, + "learning_rate": 1.1670040456560728e-05, + "loss": -0.9114, + "step": 47350 + }, + { + "epoch": 27.566938300349243, + "grad_norm": 0.28041988611221313, + "learning_rate": 1.1652350682496005e-05, + "loss": -0.9065, + "step": 47360 + }, + { + "epoch": 27.572759022118742, + "grad_norm": 0.16075098514556885, + "learning_rate": 1.163467255743465e-05, + "loss": -0.9103, + "step": 47370 + }, + { + "epoch": 27.57857974388824, + "grad_norm": 0.16613203287124634, + "learning_rate": 1.1617006086746796e-05, + "loss": -0.9074, + "step": 47380 + }, + { + "epoch": 27.58440046565774, + "grad_norm": 0.1311594545841217, + "learning_rate": 1.1599351275799047e-05, + "loss": -0.911, + "step": 47390 + }, + { + "epoch": 27.59022118742724, + "grad_norm": 0.1543235182762146, + "learning_rate": 1.1581708129954466e-05, + "loss": -0.9105, + "step": 47400 + }, + { + "epoch": 27.59604190919674, + "grad_norm": 0.10766025632619858, + "learning_rate": 1.1564076654572587e-05, + "loss": -0.9125, + "step": 47410 + }, + { + "epoch": 27.601862630966238, + "grad_norm": 0.14061790704727173, + "learning_rate": 1.1546456855009358e-05, + "loss": -0.9125, + "step": 47420 + }, + { + "epoch": 27.60768335273574, + "grad_norm": 0.15391355752944946, + "learning_rate": 1.1528848736617248e-05, + "loss": -0.9083, + "step": 47430 + }, + { + "epoch": 27.61350407450524, + "grad_norm": 0.1543714702129364, + "learning_rate": 1.1511252304745112e-05, + "loss": -0.9114, + "step": 47440 + }, + { + "epoch": 27.61932479627474, + "grad_norm": 0.16057546436786652, + "learning_rate": 1.1493667564738297e-05, + "loss": -0.8983, + "step": 47450 + }, + { + "epoch": 27.62514551804424, + "grad_norm": 0.10791870951652527, + "learning_rate": 1.1476094521938574e-05, + "loss": -0.9125, + "step": 47460 + }, + { + "epoch": 27.630966239813738, + "grad_norm": 0.13444608449935913, + "learning_rate": 1.1458533181684167e-05, + "loss": -0.9062, + "step": 47470 + }, + { + "epoch": 27.636786961583237, + "grad_norm": 0.13472290337085724, + "learning_rate": 1.1440983549309753e-05, + "loss": -0.9111, + "step": 47480 + }, + { + "epoch": 27.642607683352736, + "grad_norm": 0.11249946802854538, + "learning_rate": 1.1423445630146434e-05, + "loss": -0.915, + "step": 47490 + }, + { + "epoch": 27.648428405122235, + "grad_norm": 0.2326815128326416, + "learning_rate": 1.1405919429521799e-05, + "loss": -0.9058, + "step": 47500 + }, + { + "epoch": 27.654249126891735, + "grad_norm": 0.13547134399414062, + "learning_rate": 1.1388404952759802e-05, + "loss": -0.9096, + "step": 47510 + }, + { + "epoch": 27.660069848661234, + "grad_norm": 0.14744317531585693, + "learning_rate": 1.1370902205180923e-05, + "loss": -0.911, + "step": 47520 + }, + { + "epoch": 27.665890570430733, + "grad_norm": 0.1809452325105667, + "learning_rate": 1.1353411192101987e-05, + "loss": -0.9086, + "step": 47530 + }, + { + "epoch": 27.671711292200232, + "grad_norm": 0.13469280302524567, + "learning_rate": 1.133593191883634e-05, + "loss": -0.9109, + "step": 47540 + }, + { + "epoch": 27.67753201396973, + "grad_norm": 0.15960343182086945, + "learning_rate": 1.1318464390693711e-05, + "loss": -0.9113, + "step": 47550 + }, + { + "epoch": 27.68335273573923, + "grad_norm": 0.1350407749414444, + "learning_rate": 1.1301008612980257e-05, + "loss": -0.9096, + "step": 47560 + }, + { + "epoch": 27.68917345750873, + "grad_norm": 0.2229999303817749, + "learning_rate": 1.128356459099863e-05, + "loss": -0.9082, + "step": 47570 + }, + { + "epoch": 27.69499417927823, + "grad_norm": 0.23042979836463928, + "learning_rate": 1.1266132330047802e-05, + "loss": -0.9106, + "step": 47580 + }, + { + "epoch": 27.70081490104773, + "grad_norm": 0.11122092604637146, + "learning_rate": 1.1248711835423281e-05, + "loss": -0.9071, + "step": 47590 + }, + { + "epoch": 27.70663562281723, + "grad_norm": 0.11016915738582611, + "learning_rate": 1.123130311241693e-05, + "loss": -0.9112, + "step": 47600 + }, + { + "epoch": 27.71245634458673, + "grad_norm": 0.14535239338874817, + "learning_rate": 1.1213906166317068e-05, + "loss": -0.9103, + "step": 47610 + }, + { + "epoch": 27.71827706635623, + "grad_norm": 0.3230452239513397, + "learning_rate": 1.1196521002408427e-05, + "loss": -0.9058, + "step": 47620 + }, + { + "epoch": 27.72409778812573, + "grad_norm": 0.09956876188516617, + "learning_rate": 1.1179147625972159e-05, + "loss": -0.9092, + "step": 47630 + }, + { + "epoch": 27.729918509895228, + "grad_norm": 0.12210952490568161, + "learning_rate": 1.1161786042285822e-05, + "loss": -0.9136, + "step": 47640 + }, + { + "epoch": 27.735739231664727, + "grad_norm": 0.1475231945514679, + "learning_rate": 1.1144436256623447e-05, + "loss": -0.9101, + "step": 47650 + }, + { + "epoch": 27.741559953434226, + "grad_norm": 0.19883428514003754, + "learning_rate": 1.1127098274255392e-05, + "loss": -0.9102, + "step": 47660 + }, + { + "epoch": 27.747380675203726, + "grad_norm": 0.1261066198348999, + "learning_rate": 1.1109772100448512e-05, + "loss": -0.9116, + "step": 47670 + }, + { + "epoch": 27.753201396973225, + "grad_norm": 0.18879994750022888, + "learning_rate": 1.1092457740466033e-05, + "loss": -0.9102, + "step": 47680 + }, + { + "epoch": 27.759022118742724, + "grad_norm": 0.20263248682022095, + "learning_rate": 1.10751551995676e-05, + "loss": -0.9047, + "step": 47690 + }, + { + "epoch": 27.764842840512223, + "grad_norm": 0.1828291118144989, + "learning_rate": 1.1057864483009262e-05, + "loss": -0.9097, + "step": 47700 + }, + { + "epoch": 27.770663562281722, + "grad_norm": 0.16034269332885742, + "learning_rate": 1.1040585596043473e-05, + "loss": -0.9097, + "step": 47710 + }, + { + "epoch": 27.77648428405122, + "grad_norm": 0.14734262228012085, + "learning_rate": 1.1023318543919148e-05, + "loss": -0.9105, + "step": 47720 + }, + { + "epoch": 27.78230500582072, + "grad_norm": 0.15923982858657837, + "learning_rate": 1.10060633318815e-05, + "loss": -0.9096, + "step": 47730 + }, + { + "epoch": 27.78812572759022, + "grad_norm": 0.11951206624507904, + "learning_rate": 1.0988819965172248e-05, + "loss": -0.9113, + "step": 47740 + }, + { + "epoch": 27.79394644935972, + "grad_norm": 0.1557607501745224, + "learning_rate": 1.0971588449029462e-05, + "loss": -0.9059, + "step": 47750 + }, + { + "epoch": 27.79976717112922, + "grad_norm": 0.2614199221134186, + "learning_rate": 1.095436878868762e-05, + "loss": -0.9035, + "step": 47760 + }, + { + "epoch": 27.80558789289872, + "grad_norm": 0.1301211267709732, + "learning_rate": 1.0937160989377598e-05, + "loss": -0.91, + "step": 47770 + }, + { + "epoch": 27.81140861466822, + "grad_norm": 0.1285940259695053, + "learning_rate": 1.0919965056326676e-05, + "loss": -0.9111, + "step": 47780 + }, + { + "epoch": 27.81722933643772, + "grad_norm": 0.10670747607946396, + "learning_rate": 1.0902780994758504e-05, + "loss": -0.9118, + "step": 47790 + }, + { + "epoch": 27.82305005820722, + "grad_norm": 0.13414332270622253, + "learning_rate": 1.0885608809893193e-05, + "loss": -0.9121, + "step": 47800 + }, + { + "epoch": 27.828870779976718, + "grad_norm": 0.19114072620868683, + "learning_rate": 1.0868448506947142e-05, + "loss": -0.9132, + "step": 47810 + }, + { + "epoch": 27.834691501746217, + "grad_norm": 0.22775499522686005, + "learning_rate": 1.0851300091133243e-05, + "loss": -0.9089, + "step": 47820 + }, + { + "epoch": 27.840512223515717, + "grad_norm": 0.16844883561134338, + "learning_rate": 1.083416356766071e-05, + "loss": -0.9122, + "step": 47830 + }, + { + "epoch": 27.846332945285216, + "grad_norm": 0.11471306532621384, + "learning_rate": 1.0817038941735175e-05, + "loss": -0.9079, + "step": 47840 + }, + { + "epoch": 27.852153667054715, + "grad_norm": 0.18072226643562317, + "learning_rate": 1.0799926218558642e-05, + "loss": -0.9105, + "step": 47850 + }, + { + "epoch": 27.857974388824214, + "grad_norm": 0.2161884903907776, + "learning_rate": 1.0782825403329488e-05, + "loss": -0.91, + "step": 47860 + }, + { + "epoch": 27.863795110593713, + "grad_norm": 0.2312365621328354, + "learning_rate": 1.076573650124254e-05, + "loss": -0.9088, + "step": 47870 + }, + { + "epoch": 27.869615832363213, + "grad_norm": 0.15416762232780457, + "learning_rate": 1.0748659517488891e-05, + "loss": -0.909, + "step": 47880 + }, + { + "epoch": 27.875436554132712, + "grad_norm": 0.12388164550065994, + "learning_rate": 1.0731594457256138e-05, + "loss": -0.9099, + "step": 47890 + }, + { + "epoch": 27.88125727590221, + "grad_norm": 0.11231181770563126, + "learning_rate": 1.0714541325728139e-05, + "loss": -0.9101, + "step": 47900 + }, + { + "epoch": 27.88707799767171, + "grad_norm": 0.18757276237010956, + "learning_rate": 1.0697500128085231e-05, + "loss": -0.9122, + "step": 47910 + }, + { + "epoch": 27.89289871944121, + "grad_norm": 0.1736993044614792, + "learning_rate": 1.0680470869504055e-05, + "loss": -0.9123, + "step": 47920 + }, + { + "epoch": 27.89871944121071, + "grad_norm": 0.16929595172405243, + "learning_rate": 1.066345355515766e-05, + "loss": -0.9118, + "step": 47930 + }, + { + "epoch": 27.904540162980208, + "grad_norm": 0.20394858717918396, + "learning_rate": 1.0646448190215453e-05, + "loss": -0.9159, + "step": 47940 + }, + { + "epoch": 27.91036088474971, + "grad_norm": 0.16551895439624786, + "learning_rate": 1.0629454779843217e-05, + "loss": -0.9115, + "step": 47950 + }, + { + "epoch": 27.91618160651921, + "grad_norm": 0.10871101170778275, + "learning_rate": 1.0612473329203082e-05, + "loss": -0.9126, + "step": 47960 + }, + { + "epoch": 27.92200232828871, + "grad_norm": 0.22053088247776031, + "learning_rate": 1.0595503843453596e-05, + "loss": -0.904, + "step": 47970 + }, + { + "epoch": 27.92782305005821, + "grad_norm": 0.16558924317359924, + "learning_rate": 1.0578546327749634e-05, + "loss": -0.9075, + "step": 47980 + }, + { + "epoch": 27.933643771827708, + "grad_norm": 0.10109088569879532, + "learning_rate": 1.0561600787242425e-05, + "loss": -0.9099, + "step": 47990 + }, + { + "epoch": 27.939464493597207, + "grad_norm": 0.09693928062915802, + "learning_rate": 1.0544667227079591e-05, + "loss": -0.9088, + "step": 48000 + }, + { + "epoch": 27.945285215366706, + "grad_norm": 0.1325165182352066, + "learning_rate": 1.0527745652405085e-05, + "loss": -0.9101, + "step": 48010 + }, + { + "epoch": 27.951105937136205, + "grad_norm": 0.1210135966539383, + "learning_rate": 1.051083606835927e-05, + "loss": -0.912, + "step": 48020 + }, + { + "epoch": 27.956926658905704, + "grad_norm": 0.12225443124771118, + "learning_rate": 1.049393848007878e-05, + "loss": -0.9092, + "step": 48030 + }, + { + "epoch": 27.962747380675204, + "grad_norm": 0.2052718698978424, + "learning_rate": 1.0477052892696709e-05, + "loss": -0.9087, + "step": 48040 + }, + { + "epoch": 27.968568102444703, + "grad_norm": 0.12743207812309265, + "learning_rate": 1.0460179311342394e-05, + "loss": -0.9072, + "step": 48050 + }, + { + "epoch": 27.974388824214202, + "grad_norm": 0.1389906108379364, + "learning_rate": 1.0443317741141634e-05, + "loss": -0.9133, + "step": 48060 + }, + { + "epoch": 27.9802095459837, + "grad_norm": 0.1761658489704132, + "learning_rate": 1.0426468187216514e-05, + "loss": -0.9095, + "step": 48070 + }, + { + "epoch": 27.9860302677532, + "grad_norm": 0.13265252113342285, + "learning_rate": 1.0409630654685477e-05, + "loss": -0.9144, + "step": 48080 + }, + { + "epoch": 27.9918509895227, + "grad_norm": 0.11906762421131134, + "learning_rate": 1.039280514866332e-05, + "loss": -0.9133, + "step": 48090 + }, + { + "epoch": 27.9976717112922, + "grad_norm": 0.13599999248981476, + "learning_rate": 1.0375991674261198e-05, + "loss": -0.9107, + "step": 48100 + }, + { + "epoch": 28.003492433061698, + "grad_norm": 0.11717156320810318, + "learning_rate": 1.0359190236586575e-05, + "loss": -0.9092, + "step": 48110 + }, + { + "epoch": 28.009313154831197, + "grad_norm": 0.13025692105293274, + "learning_rate": 1.0342400840743322e-05, + "loss": -0.913, + "step": 48120 + }, + { + "epoch": 28.0151338766007, + "grad_norm": 0.2543139159679413, + "learning_rate": 1.0325623491831593e-05, + "loss": -0.9134, + "step": 48130 + }, + { + "epoch": 28.0209545983702, + "grad_norm": 0.1712450236082077, + "learning_rate": 1.0308858194947906e-05, + "loss": -0.911, + "step": 48140 + }, + { + "epoch": 28.0267753201397, + "grad_norm": 0.15015901625156403, + "learning_rate": 1.0292104955185111e-05, + "loss": -0.9071, + "step": 48150 + }, + { + "epoch": 28.032596041909198, + "grad_norm": 0.14801348745822906, + "learning_rate": 1.0275363777632396e-05, + "loss": -0.9132, + "step": 48160 + }, + { + "epoch": 28.038416763678697, + "grad_norm": 0.10696122795343399, + "learning_rate": 1.0258634667375321e-05, + "loss": -0.9056, + "step": 48170 + }, + { + "epoch": 28.044237485448196, + "grad_norm": 0.2104111760854721, + "learning_rate": 1.02419176294957e-05, + "loss": -0.913, + "step": 48180 + }, + { + "epoch": 28.050058207217695, + "grad_norm": 0.09692558646202087, + "learning_rate": 1.0225212669071782e-05, + "loss": -0.9127, + "step": 48190 + }, + { + "epoch": 28.055878928987195, + "grad_norm": 0.15479691326618195, + "learning_rate": 1.0208519791178029e-05, + "loss": -0.914, + "step": 48200 + }, + { + "epoch": 28.061699650756694, + "grad_norm": 0.16867990791797638, + "learning_rate": 1.019183900088535e-05, + "loss": -0.9116, + "step": 48210 + }, + { + "epoch": 28.067520372526193, + "grad_norm": 0.15966542065143585, + "learning_rate": 1.0175170303260906e-05, + "loss": -0.9111, + "step": 48220 + }, + { + "epoch": 28.073341094295692, + "grad_norm": 0.15639086067676544, + "learning_rate": 1.0158513703368206e-05, + "loss": -0.9114, + "step": 48230 + }, + { + "epoch": 28.07916181606519, + "grad_norm": 0.14679104089736938, + "learning_rate": 1.0141869206267095e-05, + "loss": -0.9127, + "step": 48240 + }, + { + "epoch": 28.08498253783469, + "grad_norm": 0.13620731234550476, + "learning_rate": 1.0125236817013723e-05, + "loss": -0.9096, + "step": 48250 + }, + { + "epoch": 28.09080325960419, + "grad_norm": 0.16643795371055603, + "learning_rate": 1.010861654066056e-05, + "loss": -0.9125, + "step": 48260 + }, + { + "epoch": 28.09662398137369, + "grad_norm": 0.1446102410554886, + "learning_rate": 1.0092008382256434e-05, + "loss": -0.9112, + "step": 48270 + }, + { + "epoch": 28.10244470314319, + "grad_norm": 0.1829783022403717, + "learning_rate": 1.0075412346846458e-05, + "loss": -0.9123, + "step": 48280 + }, + { + "epoch": 28.108265424912688, + "grad_norm": 0.16033944487571716, + "learning_rate": 1.0058828439472056e-05, + "loss": -0.9126, + "step": 48290 + }, + { + "epoch": 28.11408614668219, + "grad_norm": 0.21303045749664307, + "learning_rate": 1.0042256665170996e-05, + "loss": -0.9117, + "step": 48300 + }, + { + "epoch": 28.11990686845169, + "grad_norm": 0.16936951875686646, + "learning_rate": 1.0025697028977332e-05, + "loss": -0.9103, + "step": 48310 + }, + { + "epoch": 28.12572759022119, + "grad_norm": 0.16854043304920197, + "learning_rate": 1.0009149535921454e-05, + "loss": -0.911, + "step": 48320 + }, + { + "epoch": 28.131548311990688, + "grad_norm": 0.10770110040903091, + "learning_rate": 9.992614191030031e-06, + "loss": -0.9155, + "step": 48330 + }, + { + "epoch": 28.137369033760187, + "grad_norm": 0.13924069702625275, + "learning_rate": 9.976090999326115e-06, + "loss": -0.9104, + "step": 48340 + }, + { + "epoch": 28.143189755529686, + "grad_norm": 0.11061007529497147, + "learning_rate": 9.959579965828952e-06, + "loss": -0.9151, + "step": 48350 + }, + { + "epoch": 28.149010477299186, + "grad_norm": 0.12236785888671875, + "learning_rate": 9.943081095554218e-06, + "loss": -0.913, + "step": 48360 + }, + { + "epoch": 28.154831199068685, + "grad_norm": 0.13726650178432465, + "learning_rate": 9.926594393513783e-06, + "loss": -0.9094, + "step": 48370 + }, + { + "epoch": 28.160651920838184, + "grad_norm": 0.11334909498691559, + "learning_rate": 9.910119864715906e-06, + "loss": -0.9122, + "step": 48380 + }, + { + "epoch": 28.166472642607683, + "grad_norm": 0.10651280730962753, + "learning_rate": 9.8936575141651e-06, + "loss": -0.9136, + "step": 48390 + }, + { + "epoch": 28.172293364377182, + "grad_norm": 0.20472390949726105, + "learning_rate": 9.877207346862194e-06, + "loss": -0.9105, + "step": 48400 + }, + { + "epoch": 28.17811408614668, + "grad_norm": 0.15068666636943817, + "learning_rate": 9.860769367804312e-06, + "loss": -0.9109, + "step": 48410 + }, + { + "epoch": 28.18393480791618, + "grad_norm": 0.11606143414974213, + "learning_rate": 9.844343581984877e-06, + "loss": -0.9123, + "step": 48420 + }, + { + "epoch": 28.18975552968568, + "grad_norm": 0.25576820969581604, + "learning_rate": 9.82792999439362e-06, + "loss": -0.9118, + "step": 48430 + }, + { + "epoch": 28.19557625145518, + "grad_norm": 0.17919766902923584, + "learning_rate": 9.811528610016546e-06, + "loss": -0.9129, + "step": 48440 + }, + { + "epoch": 28.20139697322468, + "grad_norm": 0.15240547060966492, + "learning_rate": 9.79513943383597e-06, + "loss": -0.9104, + "step": 48450 + }, + { + "epoch": 28.207217694994178, + "grad_norm": 0.13285811245441437, + "learning_rate": 9.778762470830489e-06, + "loss": -0.9084, + "step": 48460 + }, + { + "epoch": 28.213038416763677, + "grad_norm": 0.14376430213451385, + "learning_rate": 9.762397725974982e-06, + "loss": -0.9105, + "step": 48470 + }, + { + "epoch": 28.21885913853318, + "grad_norm": 0.2081788331270218, + "learning_rate": 9.746045204240622e-06, + "loss": -0.9108, + "step": 48480 + }, + { + "epoch": 28.22467986030268, + "grad_norm": 0.14933370053768158, + "learning_rate": 9.729704910594917e-06, + "loss": -0.912, + "step": 48490 + }, + { + "epoch": 28.230500582072178, + "grad_norm": 0.19576548039913177, + "learning_rate": 9.713376850001554e-06, + "loss": -0.9108, + "step": 48500 + }, + { + "epoch": 28.236321303841677, + "grad_norm": 0.10745595395565033, + "learning_rate": 9.697061027420622e-06, + "loss": -0.9122, + "step": 48510 + }, + { + "epoch": 28.242142025611177, + "grad_norm": 0.11604063957929611, + "learning_rate": 9.680757447808385e-06, + "loss": -0.911, + "step": 48520 + }, + { + "epoch": 28.247962747380676, + "grad_norm": 0.2882680892944336, + "learning_rate": 9.664466116117488e-06, + "loss": -0.9141, + "step": 48530 + }, + { + "epoch": 28.253783469150175, + "grad_norm": 0.14611229300498962, + "learning_rate": 9.64818703729678e-06, + "loss": -0.9106, + "step": 48540 + }, + { + "epoch": 28.259604190919674, + "grad_norm": 0.19577571749687195, + "learning_rate": 9.631920216291423e-06, + "loss": -0.9137, + "step": 48550 + }, + { + "epoch": 28.265424912689173, + "grad_norm": 0.21818296611309052, + "learning_rate": 9.615665658042849e-06, + "loss": -0.9106, + "step": 48560 + }, + { + "epoch": 28.271245634458673, + "grad_norm": 0.1900242269039154, + "learning_rate": 9.599423367488747e-06, + "loss": -0.9101, + "step": 48570 + }, + { + "epoch": 28.277066356228172, + "grad_norm": 0.1503566950559616, + "learning_rate": 9.583193349563124e-06, + "loss": -0.9125, + "step": 48580 + }, + { + "epoch": 28.28288707799767, + "grad_norm": 0.1747465431690216, + "learning_rate": 9.566975609196216e-06, + "loss": -0.9103, + "step": 48590 + }, + { + "epoch": 28.28870779976717, + "grad_norm": 0.15207456052303314, + "learning_rate": 9.550770151314548e-06, + "loss": -0.9127, + "step": 48600 + }, + { + "epoch": 28.29452852153667, + "grad_norm": 0.18898512423038483, + "learning_rate": 9.53457698084091e-06, + "loss": -0.912, + "step": 48610 + }, + { + "epoch": 28.30034924330617, + "grad_norm": 0.08084643632173538, + "learning_rate": 9.518396102694355e-06, + "loss": -0.9152, + "step": 48620 + }, + { + "epoch": 28.306169965075668, + "grad_norm": 0.11705009639263153, + "learning_rate": 9.502227521790198e-06, + "loss": -0.9086, + "step": 48630 + }, + { + "epoch": 28.311990686845167, + "grad_norm": 0.14263254404067993, + "learning_rate": 9.486071243040063e-06, + "loss": -0.9094, + "step": 48640 + }, + { + "epoch": 28.31781140861467, + "grad_norm": 0.14929307997226715, + "learning_rate": 9.469927271351747e-06, + "loss": -0.9125, + "step": 48650 + }, + { + "epoch": 28.32363213038417, + "grad_norm": 0.17452040314674377, + "learning_rate": 9.453795611629419e-06, + "loss": -0.9143, + "step": 48660 + }, + { + "epoch": 28.32945285215367, + "grad_norm": 0.18064887821674347, + "learning_rate": 9.437676268773399e-06, + "loss": -0.9143, + "step": 48670 + }, + { + "epoch": 28.335273573923168, + "grad_norm": 0.15509484708309174, + "learning_rate": 9.421569247680357e-06, + "loss": -0.9142, + "step": 48680 + }, + { + "epoch": 28.341094295692667, + "grad_norm": 0.15254785120487213, + "learning_rate": 9.40547455324316e-06, + "loss": -0.9112, + "step": 48690 + }, + { + "epoch": 28.346915017462166, + "grad_norm": 0.2016027867794037, + "learning_rate": 9.389392190350965e-06, + "loss": -0.9112, + "step": 48700 + }, + { + "epoch": 28.352735739231665, + "grad_norm": 0.13773711025714874, + "learning_rate": 9.373322163889153e-06, + "loss": -0.9091, + "step": 48710 + }, + { + "epoch": 28.358556461001164, + "grad_norm": 0.32073184847831726, + "learning_rate": 9.357264478739375e-06, + "loss": -0.9129, + "step": 48720 + }, + { + "epoch": 28.364377182770664, + "grad_norm": 0.1313478648662567, + "learning_rate": 9.341219139779567e-06, + "loss": -0.9155, + "step": 48730 + }, + { + "epoch": 28.370197904540163, + "grad_norm": 0.11660179495811462, + "learning_rate": 9.325186151883824e-06, + "loss": -0.9169, + "step": 48740 + }, + { + "epoch": 28.376018626309662, + "grad_norm": 0.15505747497081757, + "learning_rate": 9.30916551992258e-06, + "loss": -0.916, + "step": 48750 + }, + { + "epoch": 28.38183934807916, + "grad_norm": 0.2210179716348648, + "learning_rate": 9.293157248762479e-06, + "loss": -0.9112, + "step": 48760 + }, + { + "epoch": 28.38766006984866, + "grad_norm": 0.1603744924068451, + "learning_rate": 9.2771613432664e-06, + "loss": -0.9121, + "step": 48770 + }, + { + "epoch": 28.39348079161816, + "grad_norm": 0.1043265163898468, + "learning_rate": 9.261177808293481e-06, + "loss": -0.9043, + "step": 48780 + }, + { + "epoch": 28.39930151338766, + "grad_norm": 0.1470334827899933, + "learning_rate": 9.245206648699096e-06, + "loss": -0.9146, + "step": 48790 + }, + { + "epoch": 28.405122235157158, + "grad_norm": 0.1998933106660843, + "learning_rate": 9.22924786933485e-06, + "loss": -0.9114, + "step": 48800 + }, + { + "epoch": 28.410942956926657, + "grad_norm": 0.24749688804149628, + "learning_rate": 9.213301475048642e-06, + "loss": -0.9139, + "step": 48810 + }, + { + "epoch": 28.416763678696157, + "grad_norm": 0.2101418375968933, + "learning_rate": 9.197367470684504e-06, + "loss": -0.9125, + "step": 48820 + }, + { + "epoch": 28.42258440046566, + "grad_norm": 0.13888218998908997, + "learning_rate": 9.181445861082816e-06, + "loss": -0.9122, + "step": 48830 + }, + { + "epoch": 28.42840512223516, + "grad_norm": 0.2452528029680252, + "learning_rate": 9.16553665108012e-06, + "loss": -0.9043, + "step": 48840 + }, + { + "epoch": 28.434225844004658, + "grad_norm": 0.2214711606502533, + "learning_rate": 9.149639845509223e-06, + "loss": -0.9138, + "step": 48850 + }, + { + "epoch": 28.440046565774157, + "grad_norm": 0.21195998787879944, + "learning_rate": 9.133755449199144e-06, + "loss": -0.9129, + "step": 48860 + }, + { + "epoch": 28.445867287543656, + "grad_norm": 0.14791597425937653, + "learning_rate": 9.117883466975135e-06, + "loss": -0.9122, + "step": 48870 + }, + { + "epoch": 28.451688009313155, + "grad_norm": 0.15283803641796112, + "learning_rate": 9.10202390365873e-06, + "loss": -0.916, + "step": 48880 + }, + { + "epoch": 28.457508731082655, + "grad_norm": 0.12152280658483505, + "learning_rate": 9.086176764067583e-06, + "loss": -0.912, + "step": 48890 + }, + { + "epoch": 28.463329452852154, + "grad_norm": 0.2724970579147339, + "learning_rate": 9.070342053015684e-06, + "loss": -0.9135, + "step": 48900 + }, + { + "epoch": 28.469150174621653, + "grad_norm": 0.23542506992816925, + "learning_rate": 9.054519775313187e-06, + "loss": -0.9126, + "step": 48910 + }, + { + "epoch": 28.474970896391152, + "grad_norm": 0.10625530034303665, + "learning_rate": 9.038709935766476e-06, + "loss": -0.9149, + "step": 48920 + }, + { + "epoch": 28.48079161816065, + "grad_norm": 0.12450497597455978, + "learning_rate": 9.02291253917817e-06, + "loss": -0.9085, + "step": 48930 + }, + { + "epoch": 28.48661233993015, + "grad_norm": 0.15523681044578552, + "learning_rate": 9.007127590347091e-06, + "loss": -0.9116, + "step": 48940 + }, + { + "epoch": 28.49243306169965, + "grad_norm": 0.27601155638694763, + "learning_rate": 8.991355094068288e-06, + "loss": -0.9132, + "step": 48950 + }, + { + "epoch": 28.49825378346915, + "grad_norm": 0.11711965501308441, + "learning_rate": 8.975595055133062e-06, + "loss": -0.9125, + "step": 48960 + }, + { + "epoch": 28.50407450523865, + "grad_norm": 0.1478494554758072, + "learning_rate": 8.959847478328848e-06, + "loss": -0.913, + "step": 48970 + }, + { + "epoch": 28.509895227008148, + "grad_norm": 0.14590327441692352, + "learning_rate": 8.944112368439378e-06, + "loss": -0.9134, + "step": 48980 + }, + { + "epoch": 28.515715948777647, + "grad_norm": 0.12866534292697906, + "learning_rate": 8.928389730244552e-06, + "loss": -0.9157, + "step": 48990 + }, + { + "epoch": 28.52153667054715, + "grad_norm": 0.14917799830436707, + "learning_rate": 8.912679568520494e-06, + "loss": -0.9113, + "step": 49000 + }, + { + "epoch": 28.52735739231665, + "grad_norm": 0.20268292725086212, + "learning_rate": 8.896981888039534e-06, + "loss": -0.9134, + "step": 49010 + }, + { + "epoch": 28.533178114086148, + "grad_norm": 0.1524663269519806, + "learning_rate": 8.881296693570201e-06, + "loss": -0.9153, + "step": 49020 + }, + { + "epoch": 28.538998835855647, + "grad_norm": 0.12282276898622513, + "learning_rate": 8.865623989877281e-06, + "loss": -0.9128, + "step": 49030 + }, + { + "epoch": 28.544819557625146, + "grad_norm": 0.11268522590398788, + "learning_rate": 8.849963781721681e-06, + "loss": -0.9139, + "step": 49040 + }, + { + "epoch": 28.550640279394646, + "grad_norm": 0.1592005044221878, + "learning_rate": 8.834316073860588e-06, + "loss": -0.9131, + "step": 49050 + }, + { + "epoch": 28.556461001164145, + "grad_norm": 0.15728233754634857, + "learning_rate": 8.818680871047357e-06, + "loss": -0.9098, + "step": 49060 + }, + { + "epoch": 28.562281722933644, + "grad_norm": 0.15707333385944366, + "learning_rate": 8.803058178031549e-06, + "loss": -0.9141, + "step": 49070 + }, + { + "epoch": 28.568102444703143, + "grad_norm": 0.1308845728635788, + "learning_rate": 8.787447999558922e-06, + "loss": -0.9111, + "step": 49080 + }, + { + "epoch": 28.573923166472643, + "grad_norm": 0.15775470435619354, + "learning_rate": 8.77185034037144e-06, + "loss": -0.9119, + "step": 49090 + }, + { + "epoch": 28.57974388824214, + "grad_norm": 0.14140154421329498, + "learning_rate": 8.756265205207259e-06, + "loss": -0.9159, + "step": 49100 + }, + { + "epoch": 28.58556461001164, + "grad_norm": 0.11997009068727493, + "learning_rate": 8.740692598800732e-06, + "loss": -0.9167, + "step": 49110 + }, + { + "epoch": 28.59138533178114, + "grad_norm": 0.17119885981082916, + "learning_rate": 8.72513252588239e-06, + "loss": -0.913, + "step": 49120 + }, + { + "epoch": 28.59720605355064, + "grad_norm": 0.1839076727628708, + "learning_rate": 8.709584991178998e-06, + "loss": -0.9161, + "step": 49130 + }, + { + "epoch": 28.60302677532014, + "grad_norm": 0.10456859320402145, + "learning_rate": 8.694049999413479e-06, + "loss": -0.9144, + "step": 49140 + }, + { + "epoch": 28.608847497089638, + "grad_norm": 0.20494695007801056, + "learning_rate": 8.678527555304945e-06, + "loss": -0.9148, + "step": 49150 + }, + { + "epoch": 28.614668218859137, + "grad_norm": 0.11583708226680756, + "learning_rate": 8.663017663568712e-06, + "loss": -0.9158, + "step": 49160 + }, + { + "epoch": 28.620488940628636, + "grad_norm": 0.16919799149036407, + "learning_rate": 8.647520328916259e-06, + "loss": -0.9115, + "step": 49170 + }, + { + "epoch": 28.62630966239814, + "grad_norm": 0.13450080156326294, + "learning_rate": 8.632035556055307e-06, + "loss": -0.91, + "step": 49180 + }, + { + "epoch": 28.63213038416764, + "grad_norm": 0.1725490838289261, + "learning_rate": 8.616563349689672e-06, + "loss": -0.9024, + "step": 49190 + }, + { + "epoch": 28.637951105937137, + "grad_norm": 0.16478337347507477, + "learning_rate": 8.601103714519448e-06, + "loss": -0.9121, + "step": 49200 + }, + { + "epoch": 28.643771827706637, + "grad_norm": 0.1924445480108261, + "learning_rate": 8.58565665524082e-06, + "loss": -0.9151, + "step": 49210 + }, + { + "epoch": 28.649592549476136, + "grad_norm": 0.1535215526819229, + "learning_rate": 8.570222176546222e-06, + "loss": -0.9123, + "step": 49220 + }, + { + "epoch": 28.655413271245635, + "grad_norm": 0.23850594460964203, + "learning_rate": 8.554800283124242e-06, + "loss": -0.9111, + "step": 49230 + }, + { + "epoch": 28.661233993015134, + "grad_norm": 0.1396772414445877, + "learning_rate": 8.539390979659639e-06, + "loss": -0.9134, + "step": 49240 + }, + { + "epoch": 28.667054714784634, + "grad_norm": 0.16501620411872864, + "learning_rate": 8.523994270833352e-06, + "loss": -0.9126, + "step": 49250 + }, + { + "epoch": 28.672875436554133, + "grad_norm": 0.10756374895572662, + "learning_rate": 8.5086101613225e-06, + "loss": -0.9116, + "step": 49260 + }, + { + "epoch": 28.678696158323632, + "grad_norm": 0.16335222125053406, + "learning_rate": 8.493238655800346e-06, + "loss": -0.9144, + "step": 49270 + }, + { + "epoch": 28.68451688009313, + "grad_norm": 0.13042739033699036, + "learning_rate": 8.47787975893638e-06, + "loss": -0.9122, + "step": 49280 + }, + { + "epoch": 28.69033760186263, + "grad_norm": 0.18581433594226837, + "learning_rate": 8.462533475396211e-06, + "loss": -0.9132, + "step": 49290 + }, + { + "epoch": 28.69615832363213, + "grad_norm": 0.20709781348705292, + "learning_rate": 8.447199809841643e-06, + "loss": -0.9145, + "step": 49300 + }, + { + "epoch": 28.70197904540163, + "grad_norm": 0.14305689930915833, + "learning_rate": 8.431878766930635e-06, + "loss": -0.9126, + "step": 49310 + }, + { + "epoch": 28.707799767171128, + "grad_norm": 0.1700160950422287, + "learning_rate": 8.416570351317304e-06, + "loss": -0.9135, + "step": 49320 + }, + { + "epoch": 28.713620488940627, + "grad_norm": 0.1306789368391037, + "learning_rate": 8.401274567651973e-06, + "loss": -0.9128, + "step": 49330 + }, + { + "epoch": 28.719441210710126, + "grad_norm": 0.14292028546333313, + "learning_rate": 8.385991420581058e-06, + "loss": -0.9135, + "step": 49340 + }, + { + "epoch": 28.725261932479626, + "grad_norm": 0.15544089674949646, + "learning_rate": 8.370720914747215e-06, + "loss": -0.9132, + "step": 49350 + }, + { + "epoch": 28.73108265424913, + "grad_norm": 0.15032094717025757, + "learning_rate": 8.355463054789181e-06, + "loss": -0.9135, + "step": 49360 + }, + { + "epoch": 28.736903376018628, + "grad_norm": 0.2114156037569046, + "learning_rate": 8.340217845341919e-06, + "loss": -0.91, + "step": 49370 + }, + { + "epoch": 28.742724097788127, + "grad_norm": 0.20471908152103424, + "learning_rate": 8.324985291036514e-06, + "loss": -0.912, + "step": 49380 + }, + { + "epoch": 28.748544819557626, + "grad_norm": 0.16400252282619476, + "learning_rate": 8.309765396500213e-06, + "loss": -0.9135, + "step": 49390 + }, + { + "epoch": 28.754365541327125, + "grad_norm": 0.11759121716022491, + "learning_rate": 8.294558166356419e-06, + "loss": -0.9175, + "step": 49400 + }, + { + "epoch": 28.760186263096625, + "grad_norm": 0.13738496601581573, + "learning_rate": 8.279363605224683e-06, + "loss": -0.9167, + "step": 49410 + }, + { + "epoch": 28.766006984866124, + "grad_norm": 0.10930140316486359, + "learning_rate": 8.264181717720704e-06, + "loss": -0.9153, + "step": 49420 + }, + { + "epoch": 28.771827706635623, + "grad_norm": 0.22198960185050964, + "learning_rate": 8.249012508456361e-06, + "loss": -0.9074, + "step": 49430 + }, + { + "epoch": 28.777648428405122, + "grad_norm": 0.15585604310035706, + "learning_rate": 8.233855982039646e-06, + "loss": -0.914, + "step": 49440 + }, + { + "epoch": 28.78346915017462, + "grad_norm": 0.1602051854133606, + "learning_rate": 8.218712143074708e-06, + "loss": -0.9143, + "step": 49450 + }, + { + "epoch": 28.78928987194412, + "grad_norm": 0.10366084426641464, + "learning_rate": 8.203580996161858e-06, + "loss": -0.9171, + "step": 49460 + }, + { + "epoch": 28.79511059371362, + "grad_norm": 0.21284720301628113, + "learning_rate": 8.188462545897512e-06, + "loss": -0.9127, + "step": 49470 + }, + { + "epoch": 28.80093131548312, + "grad_norm": 0.15986058115959167, + "learning_rate": 8.173356796874304e-06, + "loss": -0.9144, + "step": 49480 + }, + { + "epoch": 28.80675203725262, + "grad_norm": 0.1211617961525917, + "learning_rate": 8.158263753680906e-06, + "loss": -0.9124, + "step": 49490 + }, + { + "epoch": 28.812572759022117, + "grad_norm": 0.14951589703559875, + "learning_rate": 8.143183420902239e-06, + "loss": -0.9138, + "step": 49500 + }, + { + "epoch": 28.818393480791617, + "grad_norm": 0.2017548382282257, + "learning_rate": 8.128115803119258e-06, + "loss": -0.9153, + "step": 49510 + }, + { + "epoch": 28.824214202561116, + "grad_norm": 0.17094570398330688, + "learning_rate": 8.11306090490916e-06, + "loss": -0.9163, + "step": 49520 + }, + { + "epoch": 28.83003492433062, + "grad_norm": 0.13401898741722107, + "learning_rate": 8.098018730845169e-06, + "loss": -0.9166, + "step": 49530 + }, + { + "epoch": 28.835855646100118, + "grad_norm": 0.12099941819906235, + "learning_rate": 8.082989285496745e-06, + "loss": -0.9159, + "step": 49540 + }, + { + "epoch": 28.841676367869617, + "grad_norm": 0.12130098789930344, + "learning_rate": 8.067972573429416e-06, + "loss": -0.9137, + "step": 49550 + }, + { + "epoch": 28.847497089639116, + "grad_norm": 0.1626567542552948, + "learning_rate": 8.052968599204874e-06, + "loss": -0.9143, + "step": 49560 + }, + { + "epoch": 28.853317811408616, + "grad_norm": 0.17395052313804626, + "learning_rate": 8.037977367380922e-06, + "loss": -0.9164, + "step": 49570 + }, + { + "epoch": 28.859138533178115, + "grad_norm": 0.1439186930656433, + "learning_rate": 8.022998882511495e-06, + "loss": -0.9123, + "step": 49580 + }, + { + "epoch": 28.864959254947614, + "grad_norm": 0.12460681796073914, + "learning_rate": 8.008033149146677e-06, + "loss": -0.9111, + "step": 49590 + }, + { + "epoch": 28.870779976717113, + "grad_norm": 0.19752290844917297, + "learning_rate": 7.993080171832656e-06, + "loss": -0.9103, + "step": 49600 + }, + { + "epoch": 28.876600698486612, + "grad_norm": 0.17972655594348907, + "learning_rate": 7.978139955111752e-06, + "loss": -0.9163, + "step": 49610 + }, + { + "epoch": 28.88242142025611, + "grad_norm": 0.09808167070150375, + "learning_rate": 7.9632125035224e-06, + "loss": -0.9128, + "step": 49620 + }, + { + "epoch": 28.88824214202561, + "grad_norm": 0.1992465853691101, + "learning_rate": 7.948297821599177e-06, + "loss": -0.9153, + "step": 49630 + }, + { + "epoch": 28.89406286379511, + "grad_norm": 0.10171467065811157, + "learning_rate": 7.933395913872755e-06, + "loss": -0.9161, + "step": 49640 + }, + { + "epoch": 28.89988358556461, + "grad_norm": 0.1720830500125885, + "learning_rate": 7.918506784869972e-06, + "loss": -0.9143, + "step": 49650 + }, + { + "epoch": 28.90570430733411, + "grad_norm": 0.1254064440727234, + "learning_rate": 7.903630439113707e-06, + "loss": -0.9155, + "step": 49660 + }, + { + "epoch": 28.911525029103608, + "grad_norm": 0.14695367217063904, + "learning_rate": 7.888766881123044e-06, + "loss": -0.9148, + "step": 49670 + }, + { + "epoch": 28.917345750873107, + "grad_norm": 0.1459064930677414, + "learning_rate": 7.873916115413099e-06, + "loss": -0.9171, + "step": 49680 + }, + { + "epoch": 28.923166472642606, + "grad_norm": 0.15973661839962006, + "learning_rate": 7.85907814649518e-06, + "loss": -0.9046, + "step": 49690 + }, + { + "epoch": 28.92898719441211, + "grad_norm": 0.1492578685283661, + "learning_rate": 7.844252978876649e-06, + "loss": -0.9138, + "step": 49700 + }, + { + "epoch": 28.934807916181608, + "grad_norm": 0.21898388862609863, + "learning_rate": 7.829440617061001e-06, + "loss": -0.9116, + "step": 49710 + }, + { + "epoch": 28.940628637951107, + "grad_norm": 0.09817763417959213, + "learning_rate": 7.814641065547851e-06, + "loss": -0.9106, + "step": 49720 + }, + { + "epoch": 28.946449359720607, + "grad_norm": 0.0998048484325409, + "learning_rate": 7.79985432883289e-06, + "loss": -0.918, + "step": 49730 + }, + { + "epoch": 28.952270081490106, + "grad_norm": 0.13371333479881287, + "learning_rate": 7.78508041140797e-06, + "loss": -0.9164, + "step": 49740 + }, + { + "epoch": 28.958090803259605, + "grad_norm": 0.21125410497188568, + "learning_rate": 7.770319317760993e-06, + "loss": -0.9117, + "step": 49750 + }, + { + "epoch": 28.963911525029104, + "grad_norm": 0.12158355116844177, + "learning_rate": 7.755571052376004e-06, + "loss": -0.9123, + "step": 49760 + }, + { + "epoch": 28.969732246798603, + "grad_norm": 0.1531154364347458, + "learning_rate": 7.740835619733128e-06, + "loss": -0.9136, + "step": 49770 + }, + { + "epoch": 28.975552968568103, + "grad_norm": 0.11023304611444473, + "learning_rate": 7.726113024308601e-06, + "loss": -0.9129, + "step": 49780 + }, + { + "epoch": 28.981373690337602, + "grad_norm": 0.14456228911876678, + "learning_rate": 7.711403270574746e-06, + "loss": -0.9153, + "step": 49790 + }, + { + "epoch": 28.9871944121071, + "grad_norm": 0.09951094537973404, + "learning_rate": 7.696706363000039e-06, + "loss": -0.9155, + "step": 49800 + }, + { + "epoch": 28.9930151338766, + "grad_norm": 0.13764601945877075, + "learning_rate": 7.682022306048959e-06, + "loss": -0.9178, + "step": 49810 + }, + { + "epoch": 28.9988358556461, + "grad_norm": 0.1847054809331894, + "learning_rate": 7.667351104182186e-06, + "loss": -0.9126, + "step": 49820 + }, + { + "epoch": 29.0046565774156, + "grad_norm": 0.11696567386388779, + "learning_rate": 7.652692761856395e-06, + "loss": -0.9164, + "step": 49830 + }, + { + "epoch": 29.010477299185098, + "grad_norm": 0.12810029089450836, + "learning_rate": 7.63804728352444e-06, + "loss": -0.9151, + "step": 49840 + }, + { + "epoch": 29.016298020954597, + "grad_norm": 0.18552474677562714, + "learning_rate": 7.623414673635215e-06, + "loss": -0.9159, + "step": 49850 + }, + { + "epoch": 29.022118742724096, + "grad_norm": 0.29491063952445984, + "learning_rate": 7.608794936633723e-06, + "loss": -0.9098, + "step": 49860 + }, + { + "epoch": 29.027939464493596, + "grad_norm": 0.1790449321269989, + "learning_rate": 7.594188076961056e-06, + "loss": -0.9124, + "step": 49870 + }, + { + "epoch": 29.0337601862631, + "grad_norm": 0.14321522414684296, + "learning_rate": 7.579594099054382e-06, + "loss": -0.9118, + "step": 49880 + }, + { + "epoch": 29.039580908032598, + "grad_norm": 0.1364687979221344, + "learning_rate": 7.565013007346983e-06, + "loss": -0.9138, + "step": 49890 + }, + { + "epoch": 29.045401629802097, + "grad_norm": 0.193926602602005, + "learning_rate": 7.5504448062682035e-06, + "loss": -0.915, + "step": 49900 + }, + { + "epoch": 29.051222351571596, + "grad_norm": 0.16509613394737244, + "learning_rate": 7.53588950024347e-06, + "loss": -0.9138, + "step": 49910 + }, + { + "epoch": 29.057043073341095, + "grad_norm": 0.1458844095468521, + "learning_rate": 7.5213470936943145e-06, + "loss": -0.9168, + "step": 49920 + }, + { + "epoch": 29.062863795110594, + "grad_norm": 0.12075694650411606, + "learning_rate": 7.506817591038323e-06, + "loss": -0.915, + "step": 49930 + }, + { + "epoch": 29.068684516880094, + "grad_norm": 0.12439486384391785, + "learning_rate": 7.492300996689183e-06, + "loss": -0.9053, + "step": 49940 + }, + { + "epoch": 29.074505238649593, + "grad_norm": 0.21911506354808807, + "learning_rate": 7.477797315056645e-06, + "loss": -0.9147, + "step": 49950 + }, + { + "epoch": 29.080325960419092, + "grad_norm": 0.19950458407402039, + "learning_rate": 7.463306550546539e-06, + "loss": -0.915, + "step": 49960 + }, + { + "epoch": 29.08614668218859, + "grad_norm": 0.1747719645500183, + "learning_rate": 7.448828707560812e-06, + "loss": -0.9133, + "step": 49970 + }, + { + "epoch": 29.09196740395809, + "grad_norm": 0.16532284021377563, + "learning_rate": 7.4343637904974e-06, + "loss": -0.9143, + "step": 49980 + }, + { + "epoch": 29.09778812572759, + "grad_norm": 0.1371060311794281, + "learning_rate": 7.419911803750401e-06, + "loss": -0.9145, + "step": 49990 + }, + { + "epoch": 29.10360884749709, + "grad_norm": 0.2528989911079407, + "learning_rate": 7.405472751709935e-06, + "loss": -0.9077, + "step": 50000 + } + ], + "logging_steps": 10, + "max_steps": 60000, + "num_input_tokens_seen": 0, + "num_train_epochs": 35, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}