{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 29.12, "eval_steps": 100, "global_step": 182000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 1.1145988702774048, "learning_rate": 5.94e-05, "loss": 129.2138, "step": 100 }, { "epoch": 0.032, "grad_norm": 0.3314463794231415, "learning_rate": 0.0001194, "loss": 147.1265, "step": 200 }, { "epoch": 0.048, "grad_norm": 0.30200499296188354, "learning_rate": 0.00017939999999999997, "loss": 147.1375, "step": 300 }, { "epoch": 0.064, "grad_norm": 0.20890414714813232, "learning_rate": 0.0002394, "loss": 141.107, "step": 400 }, { "epoch": 0.08, "grad_norm": 0.19977182149887085, "learning_rate": 0.00029939999999999996, "loss": 130.2311, "step": 500 }, { "epoch": 0.096, "grad_norm": 0.1718936711549759, "learning_rate": 0.00029999762390495616, "loss": 116.9488, "step": 600 }, { "epoch": 0.112, "grad_norm": 0.21659506857395172, "learning_rate": 0.00029999522380895233, "loss": 106.3702, "step": 700 }, { "epoch": 0.128, "grad_norm": 0.19612713158130646, "learning_rate": 0.0002999928237129485, "loss": 98.8033, "step": 800 }, { "epoch": 0.144, "grad_norm": 0.18958421051502228, "learning_rate": 0.00029999042361694467, "loss": 94.6761, "step": 900 }, { "epoch": 0.16, "grad_norm": 0.25341877341270447, "learning_rate": 0.00029998802352094084, "loss": 88.2629, "step": 1000 }, { "epoch": 0.176, "grad_norm": 0.1762186735868454, "learning_rate": 0.000299985623424937, "loss": 87.4362, "step": 1100 }, { "epoch": 0.192, "grad_norm": 0.23407000303268433, "learning_rate": 0.0002999832233289331, "loss": 85.7211, "step": 1200 }, { "epoch": 0.208, "grad_norm": 0.23202084004878998, "learning_rate": 0.0002999808232329293, "loss": 81.4749, "step": 1300 }, { "epoch": 0.224, "grad_norm": 0.1819111853837967, "learning_rate": 0.00029997842313692546, "loss": 80.3999, "step": 1400 }, { "epoch": 0.24, "grad_norm": 0.16154050827026367, "learning_rate": 0.00029997602304092163, "loss": 80.5113, "step": 1500 }, { "epoch": 0.256, "grad_norm": 0.20147816836833954, "learning_rate": 0.0002999736229449178, "loss": 77.4306, "step": 1600 }, { "epoch": 0.272, "grad_norm": 0.2032860815525055, "learning_rate": 0.0002999712228489139, "loss": 76.3299, "step": 1700 }, { "epoch": 0.288, "grad_norm": 0.20103086531162262, "learning_rate": 0.0002999688227529101, "loss": 77.0755, "step": 1800 }, { "epoch": 0.304, "grad_norm": 0.1930929720401764, "learning_rate": 0.00029996642265690625, "loss": 74.2643, "step": 1900 }, { "epoch": 0.32, "grad_norm": 0.21013671159744263, "learning_rate": 0.0002999640225609024, "loss": 75.9168, "step": 2000 }, { "epoch": 0.336, "grad_norm": 0.2554585635662079, "learning_rate": 0.0002999616224648986, "loss": 75.2005, "step": 2100 }, { "epoch": 0.352, "grad_norm": 0.21000510454177856, "learning_rate": 0.00029995922236889476, "loss": 74.1565, "step": 2200 }, { "epoch": 0.368, "grad_norm": 0.2096049040555954, "learning_rate": 0.0002999568222728909, "loss": 73.3684, "step": 2300 }, { "epoch": 0.384, "grad_norm": 0.2806188464164734, "learning_rate": 0.00029995442217688705, "loss": 73.9772, "step": 2400 }, { "epoch": 0.4, "grad_norm": 0.17476481199264526, "learning_rate": 0.0002999520220808832, "loss": 73.7125, "step": 2500 }, { "epoch": 0.416, "grad_norm": 0.26867198944091797, "learning_rate": 0.0002999496219848794, "loss": 72.5119, "step": 2600 }, { "epoch": 0.432, "grad_norm": 0.1896703690290451, "learning_rate": 0.00029994722188887555, "loss": 72.6918, "step": 2700 }, { "epoch": 0.448, "grad_norm": 0.2521280348300934, "learning_rate": 0.00029994482179287167, "loss": 72.1229, "step": 2800 }, { "epoch": 0.464, "grad_norm": 0.20409554243087769, "learning_rate": 0.00029994242169686784, "loss": 72.3524, "step": 2900 }, { "epoch": 0.48, "grad_norm": 0.1911861002445221, "learning_rate": 0.000299940021600864, "loss": 70.9714, "step": 3000 }, { "epoch": 0.496, "grad_norm": 0.21338903903961182, "learning_rate": 0.0002999376215048602, "loss": 69.5716, "step": 3100 }, { "epoch": 0.512, "grad_norm": 0.20922720432281494, "learning_rate": 0.00029993522140885634, "loss": 70.1812, "step": 3200 }, { "epoch": 0.528, "grad_norm": 0.2678331434726715, "learning_rate": 0.0002999328213128525, "loss": 68.8041, "step": 3300 }, { "epoch": 0.544, "grad_norm": 0.25610026717185974, "learning_rate": 0.00029993042121684863, "loss": 71.186, "step": 3400 }, { "epoch": 0.56, "grad_norm": 0.23267875611782074, "learning_rate": 0.0002999280211208448, "loss": 68.9921, "step": 3500 }, { "epoch": 0.576, "grad_norm": 0.23876765370368958, "learning_rate": 0.00029992562102484097, "loss": 69.738, "step": 3600 }, { "epoch": 0.592, "grad_norm": 0.1865028291940689, "learning_rate": 0.00029992322092883714, "loss": 68.9813, "step": 3700 }, { "epoch": 0.608, "grad_norm": 0.21735595166683197, "learning_rate": 0.0002999208208328333, "loss": 67.5755, "step": 3800 }, { "epoch": 0.624, "grad_norm": 0.16909943521022797, "learning_rate": 0.0002999184207368294, "loss": 66.3015, "step": 3900 }, { "epoch": 0.64, "grad_norm": 0.19918648898601532, "learning_rate": 0.0002999160206408256, "loss": 67.3844, "step": 4000 }, { "epoch": 0.656, "grad_norm": 0.22282840311527252, "learning_rate": 0.00029991362054482176, "loss": 66.0008, "step": 4100 }, { "epoch": 0.672, "grad_norm": 0.19900047779083252, "learning_rate": 0.00029991122044881793, "loss": 66.029, "step": 4200 }, { "epoch": 0.688, "grad_norm": 0.2067142128944397, "learning_rate": 0.0002999088203528141, "loss": 65.7196, "step": 4300 }, { "epoch": 0.704, "grad_norm": 0.24062038958072662, "learning_rate": 0.00029990642025681027, "loss": 66.7571, "step": 4400 }, { "epoch": 0.72, "grad_norm": 0.2454902082681656, "learning_rate": 0.0002999040201608064, "loss": 65.7736, "step": 4500 }, { "epoch": 0.736, "grad_norm": 0.24499955773353577, "learning_rate": 0.00029990162006480255, "loss": 65.498, "step": 4600 }, { "epoch": 0.752, "grad_norm": 0.2421354055404663, "learning_rate": 0.0002998992199687987, "loss": 65.9207, "step": 4700 }, { "epoch": 0.768, "grad_norm": 0.1900254338979721, "learning_rate": 0.0002998968198727949, "loss": 63.4017, "step": 4800 }, { "epoch": 0.784, "grad_norm": 0.21995197236537933, "learning_rate": 0.00029989441977679106, "loss": 65.4319, "step": 4900 }, { "epoch": 0.8, "grad_norm": 0.2170778065919876, "learning_rate": 0.00029989201968078717, "loss": 64.1503, "step": 5000 }, { "epoch": 0.816, "grad_norm": 0.29141783714294434, "learning_rate": 0.00029988961958478334, "loss": 63.4509, "step": 5100 }, { "epoch": 0.832, "grad_norm": 0.2149534821510315, "learning_rate": 0.0002998872194887795, "loss": 63.8549, "step": 5200 }, { "epoch": 0.848, "grad_norm": 0.2090325504541397, "learning_rate": 0.0002998848193927757, "loss": 62.5135, "step": 5300 }, { "epoch": 0.864, "grad_norm": 0.19093327224254608, "learning_rate": 0.00029988241929677185, "loss": 64.1856, "step": 5400 }, { "epoch": 0.88, "grad_norm": 0.24676312506198883, "learning_rate": 0.000299880019200768, "loss": 62.8992, "step": 5500 }, { "epoch": 0.896, "grad_norm": 0.2047237902879715, "learning_rate": 0.00029987761910476413, "loss": 63.5, "step": 5600 }, { "epoch": 0.912, "grad_norm": 0.2169736623764038, "learning_rate": 0.0002998752190087603, "loss": 63.2706, "step": 5700 }, { "epoch": 0.928, "grad_norm": 0.2212333083152771, "learning_rate": 0.00029987281891275647, "loss": 62.8563, "step": 5800 }, { "epoch": 0.944, "grad_norm": 0.22105100750923157, "learning_rate": 0.00029987041881675264, "loss": 61.4049, "step": 5900 }, { "epoch": 0.96, "grad_norm": 0.21934692561626434, "learning_rate": 0.0002998680187207488, "loss": 61.2102, "step": 6000 }, { "epoch": 0.976, "grad_norm": 0.231471449136734, "learning_rate": 0.0002998656186247449, "loss": 61.161, "step": 6100 }, { "epoch": 0.992, "grad_norm": 0.20244845747947693, "learning_rate": 0.0002998632185287411, "loss": 61.5284, "step": 6200 }, { "epoch": 1.008, "grad_norm": 0.31659385561943054, "learning_rate": 0.00029986081843273726, "loss": 59.6197, "step": 6300 }, { "epoch": 1.024, "grad_norm": 0.22351042926311493, "learning_rate": 0.00029985841833673343, "loss": 60.8731, "step": 6400 }, { "epoch": 1.04, "grad_norm": 0.20470276474952698, "learning_rate": 0.0002998560182407296, "loss": 60.5648, "step": 6500 }, { "epoch": 1.056, "grad_norm": 0.17768125236034393, "learning_rate": 0.00029985361814472577, "loss": 59.2689, "step": 6600 }, { "epoch": 1.072, "grad_norm": 0.20775848627090454, "learning_rate": 0.0002998512180487219, "loss": 58.2776, "step": 6700 }, { "epoch": 1.088, "grad_norm": 0.2682810127735138, "learning_rate": 0.00029984881795271806, "loss": 60.5164, "step": 6800 }, { "epoch": 1.104, "grad_norm": 0.22458679974079132, "learning_rate": 0.0002998464178567142, "loss": 60.1217, "step": 6900 }, { "epoch": 1.12, "grad_norm": 0.22781415283679962, "learning_rate": 0.0002998440177607104, "loss": 58.191, "step": 7000 }, { "epoch": 1.1360000000000001, "grad_norm": 0.2532273232936859, "learning_rate": 0.00029984161766470656, "loss": 58.8972, "step": 7100 }, { "epoch": 1.152, "grad_norm": 0.2014983743429184, "learning_rate": 0.00029983921756870273, "loss": 58.7748, "step": 7200 }, { "epoch": 1.168, "grad_norm": 0.19773030281066895, "learning_rate": 0.0002998368174726989, "loss": 57.9689, "step": 7300 }, { "epoch": 1.184, "grad_norm": 0.245356023311615, "learning_rate": 0.00029983441737669507, "loss": 57.855, "step": 7400 }, { "epoch": 1.2, "grad_norm": 0.2565186023712158, "learning_rate": 0.00029983201728069124, "loss": 56.8152, "step": 7500 }, { "epoch": 1.216, "grad_norm": 0.17781591415405273, "learning_rate": 0.00029982961718468735, "loss": 55.2139, "step": 7600 }, { "epoch": 1.232, "grad_norm": 0.21849973499774933, "learning_rate": 0.0002998272170886835, "loss": 55.9843, "step": 7700 }, { "epoch": 1.248, "grad_norm": 0.17623578011989594, "learning_rate": 0.0002998248169926797, "loss": 57.3084, "step": 7800 }, { "epoch": 1.264, "grad_norm": 0.22286267578601837, "learning_rate": 0.00029982241689667586, "loss": 56.4191, "step": 7900 }, { "epoch": 1.28, "grad_norm": 0.20891787111759186, "learning_rate": 0.00029982001680067203, "loss": 56.4775, "step": 8000 }, { "epoch": 1.296, "grad_norm": 0.19925983250141144, "learning_rate": 0.00029981761670466815, "loss": 55.0521, "step": 8100 }, { "epoch": 1.312, "grad_norm": 0.22015956044197083, "learning_rate": 0.0002998152166086643, "loss": 55.6771, "step": 8200 }, { "epoch": 1.328, "grad_norm": 0.24997876584529877, "learning_rate": 0.0002998128165126605, "loss": 53.8931, "step": 8300 }, { "epoch": 1.3439999999999999, "grad_norm": 0.2933981418609619, "learning_rate": 0.00029981041641665665, "loss": 56.6028, "step": 8400 }, { "epoch": 1.3599999999999999, "grad_norm": 0.1963578313589096, "learning_rate": 0.0002998080163206528, "loss": 54.5404, "step": 8500 }, { "epoch": 1.376, "grad_norm": 0.21487855911254883, "learning_rate": 0.000299805616224649, "loss": 54.2586, "step": 8600 }, { "epoch": 1.392, "grad_norm": 0.21776583790779114, "learning_rate": 0.0002998032161286451, "loss": 53.9896, "step": 8700 }, { "epoch": 1.408, "grad_norm": 0.2172229140996933, "learning_rate": 0.0002998008160326413, "loss": 53.8424, "step": 8800 }, { "epoch": 1.424, "grad_norm": 0.23105138540267944, "learning_rate": 0.00029979841593663745, "loss": 54.1874, "step": 8900 }, { "epoch": 1.44, "grad_norm": 0.18797878921031952, "learning_rate": 0.0002997960158406336, "loss": 53.3869, "step": 9000 }, { "epoch": 1.456, "grad_norm": 0.20597319304943085, "learning_rate": 0.0002997936157446298, "loss": 53.7132, "step": 9100 }, { "epoch": 1.472, "grad_norm": 0.21674391627311707, "learning_rate": 0.00029979121564862595, "loss": 52.2728, "step": 9200 }, { "epoch": 1.488, "grad_norm": 0.2250959277153015, "learning_rate": 0.00029978881555262207, "loss": 53.3457, "step": 9300 }, { "epoch": 1.504, "grad_norm": 0.19289842247962952, "learning_rate": 0.00029978641545661824, "loss": 52.898, "step": 9400 }, { "epoch": 1.52, "grad_norm": 0.2215307652950287, "learning_rate": 0.0002997840153606144, "loss": 52.8446, "step": 9500 }, { "epoch": 1.536, "grad_norm": 0.19949446618556976, "learning_rate": 0.0002997816152646106, "loss": 51.9649, "step": 9600 }, { "epoch": 1.552, "grad_norm": 0.1753661036491394, "learning_rate": 0.00029977921516860675, "loss": 51.5562, "step": 9700 }, { "epoch": 1.568, "grad_norm": 0.22938130795955658, "learning_rate": 0.00029977681507260286, "loss": 52.4538, "step": 9800 }, { "epoch": 1.584, "grad_norm": 0.255227655172348, "learning_rate": 0.00029977441497659903, "loss": 50.8902, "step": 9900 }, { "epoch": 1.6, "grad_norm": 0.24369871616363525, "learning_rate": 0.0002997720148805952, "loss": 50.8092, "step": 10000 }, { "epoch": 1.616, "grad_norm": 0.22126376628875732, "learning_rate": 0.0002997696387855514, "loss": 51.0513, "step": 10100 }, { "epoch": 1.6320000000000001, "grad_norm": 0.199215367436409, "learning_rate": 0.00029976723868954756, "loss": 49.6234, "step": 10200 }, { "epoch": 1.6480000000000001, "grad_norm": 0.22058773040771484, "learning_rate": 0.0002997648385935437, "loss": 51.2333, "step": 10300 }, { "epoch": 1.6640000000000001, "grad_norm": 0.26106688380241394, "learning_rate": 0.0002997624384975399, "loss": 49.6582, "step": 10400 }, { "epoch": 1.6800000000000002, "grad_norm": 0.23437049984931946, "learning_rate": 0.00029976003840153606, "loss": 49.6097, "step": 10500 }, { "epoch": 1.696, "grad_norm": 0.1709340363740921, "learning_rate": 0.00029975763830553223, "loss": 49.9149, "step": 10600 }, { "epoch": 1.712, "grad_norm": 0.2278878539800644, "learning_rate": 0.00029975523820952835, "loss": 50.2495, "step": 10700 }, { "epoch": 1.728, "grad_norm": 0.25324809551239014, "learning_rate": 0.0002997528381135245, "loss": 48.3701, "step": 10800 }, { "epoch": 1.744, "grad_norm": 0.21413564682006836, "learning_rate": 0.0002997504380175207, "loss": 48.8447, "step": 10900 }, { "epoch": 1.76, "grad_norm": 0.2975509464740753, "learning_rate": 0.00029974803792151686, "loss": 50.0095, "step": 11000 }, { "epoch": 1.776, "grad_norm": 0.19792191684246063, "learning_rate": 0.00029974566182647304, "loss": 49.2986, "step": 11100 }, { "epoch": 1.792, "grad_norm": 0.2350345253944397, "learning_rate": 0.0002997432617304692, "loss": 48.7027, "step": 11200 }, { "epoch": 1.808, "grad_norm": 0.19396322965621948, "learning_rate": 0.00029974086163446533, "loss": 47.9713, "step": 11300 }, { "epoch": 1.8239999999999998, "grad_norm": 0.2414630949497223, "learning_rate": 0.0002997384615384615, "loss": 48.7363, "step": 11400 }, { "epoch": 1.8399999999999999, "grad_norm": 0.2678147554397583, "learning_rate": 0.00029973606144245767, "loss": 48.4818, "step": 11500 }, { "epoch": 1.8559999999999999, "grad_norm": 0.19563674926757812, "learning_rate": 0.00029973366134645384, "loss": 48.2693, "step": 11600 }, { "epoch": 1.8719999999999999, "grad_norm": 0.22531713545322418, "learning_rate": 0.00029973126125045, "loss": 47.758, "step": 11700 }, { "epoch": 1.888, "grad_norm": 0.22199738025665283, "learning_rate": 0.0002997288611544461, "loss": 46.9644, "step": 11800 }, { "epoch": 1.904, "grad_norm": 0.253896027803421, "learning_rate": 0.0002997264610584423, "loss": 46.5968, "step": 11900 }, { "epoch": 1.92, "grad_norm": 0.18806882202625275, "learning_rate": 0.00029972406096243846, "loss": 48.2712, "step": 12000 }, { "epoch": 1.936, "grad_norm": 0.22023610770702362, "learning_rate": 0.00029972166086643463, "loss": 47.2612, "step": 12100 }, { "epoch": 1.952, "grad_norm": 0.213795468211174, "learning_rate": 0.0002997192607704308, "loss": 45.9592, "step": 12200 }, { "epoch": 1.968, "grad_norm": 0.19787845015525818, "learning_rate": 0.00029971686067442697, "loss": 47.5647, "step": 12300 }, { "epoch": 1.984, "grad_norm": 0.19648146629333496, "learning_rate": 0.0002997144605784231, "loss": 46.8397, "step": 12400 }, { "epoch": 2.0, "grad_norm": 0.1904546618461609, "learning_rate": 0.00029971206048241925, "loss": 46.2783, "step": 12500 }, { "epoch": 2.016, "grad_norm": 0.23515231907367706, "learning_rate": 0.0002997096603864154, "loss": 46.5475, "step": 12600 }, { "epoch": 2.032, "grad_norm": 0.21483579277992249, "learning_rate": 0.0002997072602904116, "loss": 44.2442, "step": 12700 }, { "epoch": 2.048, "grad_norm": 0.2563657760620117, "learning_rate": 0.00029970486019440776, "loss": 46.1955, "step": 12800 }, { "epoch": 2.064, "grad_norm": 0.20812326669692993, "learning_rate": 0.00029970246009840387, "loss": 45.5704, "step": 12900 }, { "epoch": 2.08, "grad_norm": 0.2190365344285965, "learning_rate": 0.00029970006000240004, "loss": 45.7909, "step": 13000 }, { "epoch": 2.096, "grad_norm": 0.2379041463136673, "learning_rate": 0.0002996976599063962, "loss": 46.2324, "step": 13100 }, { "epoch": 2.112, "grad_norm": 0.2170909345149994, "learning_rate": 0.0002996952598103924, "loss": 44.766, "step": 13200 }, { "epoch": 2.128, "grad_norm": 0.15927261114120483, "learning_rate": 0.00029969285971438855, "loss": 43.669, "step": 13300 }, { "epoch": 2.144, "grad_norm": 0.22271278500556946, "learning_rate": 0.0002996904596183847, "loss": 45.0739, "step": 13400 }, { "epoch": 2.16, "grad_norm": 0.17792785167694092, "learning_rate": 0.0002996880595223809, "loss": 43.8963, "step": 13500 }, { "epoch": 2.176, "grad_norm": 0.28457048535346985, "learning_rate": 0.00029968565942637706, "loss": 44.6317, "step": 13600 }, { "epoch": 2.192, "grad_norm": 0.19491800665855408, "learning_rate": 0.0002996832593303732, "loss": 43.8541, "step": 13700 }, { "epoch": 2.208, "grad_norm": 0.21633195877075195, "learning_rate": 0.00029968085923436934, "loss": 43.2844, "step": 13800 }, { "epoch": 2.224, "grad_norm": 0.2146127074956894, "learning_rate": 0.0002996784591383655, "loss": 45.0415, "step": 13900 }, { "epoch": 2.24, "grad_norm": 0.2204289436340332, "learning_rate": 0.0002996760590423617, "loss": 44.2757, "step": 14000 }, { "epoch": 2.2560000000000002, "grad_norm": 0.3051868677139282, "learning_rate": 0.00029967365894635785, "loss": 42.7227, "step": 14100 }, { "epoch": 2.2720000000000002, "grad_norm": 0.23641665279865265, "learning_rate": 0.000299671258850354, "loss": 44.0578, "step": 14200 }, { "epoch": 2.288, "grad_norm": 0.18554934859275818, "learning_rate": 0.0002996688587543502, "loss": 42.5159, "step": 14300 }, { "epoch": 2.304, "grad_norm": 0.24741467833518982, "learning_rate": 0.0002996664586583463, "loss": 42.9106, "step": 14400 }, { "epoch": 2.32, "grad_norm": 0.18483412265777588, "learning_rate": 0.00029966405856234247, "loss": 42.2459, "step": 14500 }, { "epoch": 2.336, "grad_norm": 0.24359823763370514, "learning_rate": 0.00029966165846633864, "loss": 42.6733, "step": 14600 }, { "epoch": 2.352, "grad_norm": 0.20456752181053162, "learning_rate": 0.0002996592583703348, "loss": 41.5754, "step": 14700 }, { "epoch": 2.368, "grad_norm": 0.24165822565555573, "learning_rate": 0.000299656858274331, "loss": 43.6988, "step": 14800 }, { "epoch": 2.384, "grad_norm": 0.20422741770744324, "learning_rate": 0.0002996544581783271, "loss": 41.9116, "step": 14900 }, { "epoch": 2.4, "grad_norm": 0.2413185089826584, "learning_rate": 0.00029965205808232326, "loss": 41.8573, "step": 15000 }, { "epoch": 2.416, "grad_norm": 0.20443005859851837, "learning_rate": 0.00029964968198727945, "loss": 42.3368, "step": 15100 }, { "epoch": 2.432, "grad_norm": 0.21270470321178436, "learning_rate": 0.0002996472818912756, "loss": 40.336, "step": 15200 }, { "epoch": 2.448, "grad_norm": 0.21689313650131226, "learning_rate": 0.0002996448817952718, "loss": 40.5125, "step": 15300 }, { "epoch": 2.464, "grad_norm": 0.25577059388160706, "learning_rate": 0.00029964248169926796, "loss": 40.5761, "step": 15400 }, { "epoch": 2.48, "grad_norm": 0.2624509930610657, "learning_rate": 0.0002996400816032641, "loss": 40.3047, "step": 15500 }, { "epoch": 2.496, "grad_norm": 0.225455641746521, "learning_rate": 0.00029963768150726024, "loss": 40.3576, "step": 15600 }, { "epoch": 2.512, "grad_norm": 0.18313691020011902, "learning_rate": 0.0002996352814112564, "loss": 41.113, "step": 15700 }, { "epoch": 2.528, "grad_norm": 0.21272344887256622, "learning_rate": 0.0002996328813152526, "loss": 41.2563, "step": 15800 }, { "epoch": 2.544, "grad_norm": 0.23525486886501312, "learning_rate": 0.00029963048121924875, "loss": 41.2227, "step": 15900 }, { "epoch": 2.56, "grad_norm": 0.226985365152359, "learning_rate": 0.00029962808112324487, "loss": 40.6251, "step": 16000 }, { "epoch": 2.576, "grad_norm": 0.20422585308551788, "learning_rate": 0.00029962568102724103, "loss": 40.6449, "step": 16100 }, { "epoch": 2.592, "grad_norm": 0.18906068801879883, "learning_rate": 0.0002996232809312372, "loss": 39.5927, "step": 16200 }, { "epoch": 2.608, "grad_norm": 0.21180450916290283, "learning_rate": 0.0002996208808352334, "loss": 39.7467, "step": 16300 }, { "epoch": 2.624, "grad_norm": 0.2399897575378418, "learning_rate": 0.00029961848073922954, "loss": 38.9522, "step": 16400 }, { "epoch": 2.64, "grad_norm": 0.1941596120595932, "learning_rate": 0.0002996160806432257, "loss": 39.5798, "step": 16500 }, { "epoch": 2.656, "grad_norm": 0.19715790450572968, "learning_rate": 0.0002996136805472218, "loss": 39.9061, "step": 16600 }, { "epoch": 2.672, "grad_norm": 0.22090336680412292, "learning_rate": 0.00029961128045121805, "loss": 39.6083, "step": 16700 }, { "epoch": 2.6879999999999997, "grad_norm": 0.26035964488983154, "learning_rate": 0.00029960890435617424, "loss": 39.3414, "step": 16800 }, { "epoch": 2.7039999999999997, "grad_norm": 0.21888568997383118, "learning_rate": 0.00029960650426017035, "loss": 38.3817, "step": 16900 }, { "epoch": 2.7199999999999998, "grad_norm": 0.29924601316452026, "learning_rate": 0.0002996041041641665, "loss": 38.3896, "step": 17000 }, { "epoch": 2.7359999999999998, "grad_norm": 0.20395514369010925, "learning_rate": 0.0002996017040681627, "loss": 38.8915, "step": 17100 }, { "epoch": 2.752, "grad_norm": 0.20730023086071014, "learning_rate": 0.00029959930397215886, "loss": 38.9281, "step": 17200 }, { "epoch": 2.768, "grad_norm": 0.23472309112548828, "learning_rate": 0.00029959690387615503, "loss": 39.371, "step": 17300 }, { "epoch": 2.784, "grad_norm": 0.2272721529006958, "learning_rate": 0.0002995945037801512, "loss": 38.7238, "step": 17400 }, { "epoch": 2.8, "grad_norm": 0.20280113816261292, "learning_rate": 0.0002995921036841473, "loss": 38.1639, "step": 17500 }, { "epoch": 2.816, "grad_norm": 0.21985846757888794, "learning_rate": 0.0002995897035881435, "loss": 38.2459, "step": 17600 }, { "epoch": 2.832, "grad_norm": 0.22791948914527893, "learning_rate": 0.00029958730349213965, "loss": 38.365, "step": 17700 }, { "epoch": 2.848, "grad_norm": 0.218161940574646, "learning_rate": 0.0002995849033961358, "loss": 37.7998, "step": 17800 }, { "epoch": 2.864, "grad_norm": 0.23389916121959686, "learning_rate": 0.000299582503300132, "loss": 38.0078, "step": 17900 }, { "epoch": 2.88, "grad_norm": 0.20153094828128815, "learning_rate": 0.0002995801032041281, "loss": 37.1053, "step": 18000 }, { "epoch": 2.896, "grad_norm": 0.231399804353714, "learning_rate": 0.0002995777031081243, "loss": 37.6589, "step": 18100 }, { "epoch": 2.912, "grad_norm": 0.19814245402812958, "learning_rate": 0.00029957530301212044, "loss": 36.8171, "step": 18200 }, { "epoch": 2.928, "grad_norm": 0.22390811145305634, "learning_rate": 0.0002995729029161166, "loss": 36.6616, "step": 18300 }, { "epoch": 2.944, "grad_norm": 0.19958479702472687, "learning_rate": 0.0002995705028201128, "loss": 36.0232, "step": 18400 }, { "epoch": 2.96, "grad_norm": 0.1972126066684723, "learning_rate": 0.00029956810272410895, "loss": 36.5331, "step": 18500 }, { "epoch": 2.976, "grad_norm": 0.18196193873882294, "learning_rate": 0.00029956570262810507, "loss": 36.8888, "step": 18600 }, { "epoch": 2.992, "grad_norm": 0.17047256231307983, "learning_rate": 0.00029956330253210124, "loss": 36.5987, "step": 18700 }, { "epoch": 3.008, "grad_norm": 0.22138766944408417, "learning_rate": 0.0002995609024360974, "loss": 36.2777, "step": 18800 }, { "epoch": 3.024, "grad_norm": 0.22713051736354828, "learning_rate": 0.0002995585023400936, "loss": 35.768, "step": 18900 }, { "epoch": 3.04, "grad_norm": 0.1997511237859726, "learning_rate": 0.00029955610224408974, "loss": 35.872, "step": 19000 }, { "epoch": 3.056, "grad_norm": 0.19796296954154968, "learning_rate": 0.00029955370214808586, "loss": 34.8971, "step": 19100 }, { "epoch": 3.072, "grad_norm": 0.1922471821308136, "learning_rate": 0.00029955130205208203, "loss": 35.4181, "step": 19200 }, { "epoch": 3.088, "grad_norm": 0.18493038415908813, "learning_rate": 0.0002995489019560782, "loss": 36.3712, "step": 19300 }, { "epoch": 3.104, "grad_norm": 0.22148194909095764, "learning_rate": 0.00029954650186007437, "loss": 34.5266, "step": 19400 }, { "epoch": 3.12, "grad_norm": 0.19701820611953735, "learning_rate": 0.00029954410176407054, "loss": 35.2642, "step": 19500 }, { "epoch": 3.136, "grad_norm": 0.1763058602809906, "learning_rate": 0.0002995417016680667, "loss": 36.1582, "step": 19600 }, { "epoch": 3.152, "grad_norm": 0.2792583107948303, "learning_rate": 0.0002995393015720628, "loss": 34.755, "step": 19700 }, { "epoch": 3.168, "grad_norm": 0.20418234169483185, "learning_rate": 0.00029953690147605904, "loss": 34.5373, "step": 19800 }, { "epoch": 3.184, "grad_norm": 0.24839259684085846, "learning_rate": 0.0002995345013800552, "loss": 34.5007, "step": 19900 }, { "epoch": 3.2, "grad_norm": 0.22200001776218414, "learning_rate": 0.00029953210128405133, "loss": 34.8183, "step": 20000 }, { "epoch": 3.216, "grad_norm": 0.2371726781129837, "learning_rate": 0.0002995297011880475, "loss": 34.0164, "step": 20100 }, { "epoch": 3.232, "grad_norm": 0.21370230615139008, "learning_rate": 0.00029952730109204367, "loss": 34.8268, "step": 20200 }, { "epoch": 3.248, "grad_norm": 0.20940592885017395, "learning_rate": 0.00029952490099603983, "loss": 33.8475, "step": 20300 }, { "epoch": 3.2640000000000002, "grad_norm": 0.18580414354801178, "learning_rate": 0.000299522500900036, "loss": 33.8718, "step": 20400 }, { "epoch": 3.2800000000000002, "grad_norm": 0.2200319468975067, "learning_rate": 0.0002995201008040322, "loss": 33.9083, "step": 20500 }, { "epoch": 3.296, "grad_norm": 0.18141067028045654, "learning_rate": 0.0002995177007080283, "loss": 33.2878, "step": 20600 }, { "epoch": 3.312, "grad_norm": 0.24104055762290955, "learning_rate": 0.00029951530061202446, "loss": 34.4549, "step": 20700 }, { "epoch": 3.328, "grad_norm": 0.22455894947052002, "learning_rate": 0.0002995129005160206, "loss": 33.2184, "step": 20800 }, { "epoch": 3.344, "grad_norm": 0.19662746787071228, "learning_rate": 0.0002995105244209768, "loss": 33.836, "step": 20900 }, { "epoch": 3.36, "grad_norm": 0.2322922796010971, "learning_rate": 0.000299508124324973, "loss": 33.1089, "step": 21000 }, { "epoch": 3.376, "grad_norm": 0.2140241116285324, "learning_rate": 0.0002995057482299292, "loss": 32.8205, "step": 21100 }, { "epoch": 3.392, "grad_norm": 0.19320878386497498, "learning_rate": 0.00029950334813392534, "loss": 32.8251, "step": 21200 }, { "epoch": 3.408, "grad_norm": 0.18298691511154175, "learning_rate": 0.0002995009480379215, "loss": 33.2469, "step": 21300 }, { "epoch": 3.424, "grad_norm": 0.22385163605213165, "learning_rate": 0.0002994985479419177, "loss": 32.4997, "step": 21400 }, { "epoch": 3.44, "grad_norm": 0.2047736793756485, "learning_rate": 0.0002994961478459138, "loss": 33.5516, "step": 21500 }, { "epoch": 3.456, "grad_norm": 0.242600679397583, "learning_rate": 0.00029949374774990996, "loss": 33.4754, "step": 21600 }, { "epoch": 3.472, "grad_norm": 0.21438950300216675, "learning_rate": 0.00029949134765390613, "loss": 33.2636, "step": 21700 }, { "epoch": 3.488, "grad_norm": 0.16991284489631653, "learning_rate": 0.0002994889475579023, "loss": 32.2435, "step": 21800 }, { "epoch": 3.504, "grad_norm": 0.21854659914970398, "learning_rate": 0.00029948654746189847, "loss": 32.986, "step": 21900 }, { "epoch": 3.52, "grad_norm": 0.22860901057720184, "learning_rate": 0.0002994841473658946, "loss": 32.1887, "step": 22000 }, { "epoch": 3.536, "grad_norm": 0.20433278381824493, "learning_rate": 0.00029948174726989076, "loss": 32.1502, "step": 22100 }, { "epoch": 3.552, "grad_norm": 0.19475246965885162, "learning_rate": 0.0002994793471738869, "loss": 32.0844, "step": 22200 }, { "epoch": 3.568, "grad_norm": 0.20006608963012695, "learning_rate": 0.0002994769470778831, "loss": 32.5956, "step": 22300 }, { "epoch": 3.584, "grad_norm": 0.17535006999969482, "learning_rate": 0.00029947454698187926, "loss": 32.1812, "step": 22400 }, { "epoch": 3.6, "grad_norm": 0.22252418100833893, "learning_rate": 0.00029947214688587543, "loss": 30.6041, "step": 22500 }, { "epoch": 3.616, "grad_norm": 0.18110983073711395, "learning_rate": 0.00029946974678987155, "loss": 31.7236, "step": 22600 }, { "epoch": 3.632, "grad_norm": 0.227754145860672, "learning_rate": 0.0002994673466938677, "loss": 31.2323, "step": 22700 }, { "epoch": 3.648, "grad_norm": 0.19320198893547058, "learning_rate": 0.0002994649465978639, "loss": 31.4608, "step": 22800 }, { "epoch": 3.664, "grad_norm": 0.17932754755020142, "learning_rate": 0.00029946254650186006, "loss": 31.9613, "step": 22900 }, { "epoch": 3.68, "grad_norm": 0.19677236676216125, "learning_rate": 0.0002994601464058562, "loss": 30.9284, "step": 23000 }, { "epoch": 3.6959999999999997, "grad_norm": 0.22562915086746216, "learning_rate": 0.00029945774630985234, "loss": 30.7692, "step": 23100 }, { "epoch": 3.7119999999999997, "grad_norm": 0.19202880561351776, "learning_rate": 0.0002994553462138485, "loss": 31.2991, "step": 23200 }, { "epoch": 3.7279999999999998, "grad_norm": 0.22251880168914795, "learning_rate": 0.0002994529461178447, "loss": 29.574, "step": 23300 }, { "epoch": 3.7439999999999998, "grad_norm": 0.18705110251903534, "learning_rate": 0.00029945054602184085, "loss": 30.2693, "step": 23400 }, { "epoch": 3.76, "grad_norm": 0.18061533570289612, "learning_rate": 0.000299448145925837, "loss": 30.0086, "step": 23500 }, { "epoch": 3.776, "grad_norm": 0.23449186980724335, "learning_rate": 0.0002994457458298332, "loss": 29.9262, "step": 23600 }, { "epoch": 3.792, "grad_norm": 0.20259559154510498, "learning_rate": 0.0002994433457338293, "loss": 30.0139, "step": 23700 }, { "epoch": 3.808, "grad_norm": 0.21019335091114044, "learning_rate": 0.00029944094563782547, "loss": 30.853, "step": 23800 }, { "epoch": 3.824, "grad_norm": 0.17927643656730652, "learning_rate": 0.00029943854554182164, "loss": 30.7392, "step": 23900 }, { "epoch": 3.84, "grad_norm": 0.18862564861774445, "learning_rate": 0.0002994361454458178, "loss": 29.3096, "step": 24000 }, { "epoch": 3.856, "grad_norm": 0.22294782102108002, "learning_rate": 0.000299433745349814, "loss": 30.2642, "step": 24100 }, { "epoch": 3.872, "grad_norm": 0.20843671262264252, "learning_rate": 0.0002994313452538101, "loss": 29.4115, "step": 24200 }, { "epoch": 3.888, "grad_norm": 0.19081708788871765, "learning_rate": 0.00029942894515780626, "loss": 30.0382, "step": 24300 }, { "epoch": 3.904, "grad_norm": 0.18849343061447144, "learning_rate": 0.00029942654506180243, "loss": 29.6371, "step": 24400 }, { "epoch": 3.92, "grad_norm": 0.2084178924560547, "learning_rate": 0.0002994241449657986, "loss": 29.5353, "step": 24500 }, { "epoch": 3.936, "grad_norm": 0.179380401968956, "learning_rate": 0.00029942174486979477, "loss": 29.1119, "step": 24600 }, { "epoch": 3.952, "grad_norm": 0.2312467098236084, "learning_rate": 0.00029941934477379094, "loss": 29.3352, "step": 24700 }, { "epoch": 3.968, "grad_norm": 0.19268761575222015, "learning_rate": 0.00029941694467778705, "loss": 29.1584, "step": 24800 }, { "epoch": 3.984, "grad_norm": 0.19523601233959198, "learning_rate": 0.0002994145445817832, "loss": 29.3122, "step": 24900 }, { "epoch": 4.0, "grad_norm": 0.18007320165634155, "learning_rate": 0.0002994121444857794, "loss": 29.1468, "step": 25000 }, { "epoch": 4.016, "grad_norm": 0.19717352092266083, "learning_rate": 0.00029940974438977556, "loss": 29.2291, "step": 25100 }, { "epoch": 4.032, "grad_norm": 0.18931248784065247, "learning_rate": 0.00029940736829473175, "loss": 28.4476, "step": 25200 }, { "epoch": 4.048, "grad_norm": 0.17574016749858856, "learning_rate": 0.0002994049681987279, "loss": 27.6189, "step": 25300 }, { "epoch": 4.064, "grad_norm": 0.19395378232002258, "learning_rate": 0.0002994025681027241, "loss": 28.3701, "step": 25400 }, { "epoch": 4.08, "grad_norm": 0.1916889250278473, "learning_rate": 0.00029940016800672026, "loss": 28.3605, "step": 25500 }, { "epoch": 4.096, "grad_norm": 0.229524627327919, "learning_rate": 0.0002993977679107164, "loss": 27.7045, "step": 25600 }, { "epoch": 4.112, "grad_norm": 0.191976860165596, "learning_rate": 0.00029939536781471254, "loss": 27.6015, "step": 25700 }, { "epoch": 4.128, "grad_norm": 0.20611730217933655, "learning_rate": 0.0002993929917196688, "loss": 27.3844, "step": 25800 }, { "epoch": 4.144, "grad_norm": 0.21954050660133362, "learning_rate": 0.00029939059162366495, "loss": 27.6474, "step": 25900 }, { "epoch": 4.16, "grad_norm": 0.23369371891021729, "learning_rate": 0.00029938819152766107, "loss": 27.0846, "step": 26000 }, { "epoch": 4.176, "grad_norm": 0.19088931381702423, "learning_rate": 0.00029938579143165724, "loss": 27.0919, "step": 26100 }, { "epoch": 4.192, "grad_norm": 0.16385389864444733, "learning_rate": 0.0002993833913356534, "loss": 26.7928, "step": 26200 }, { "epoch": 4.208, "grad_norm": 0.22816230356693268, "learning_rate": 0.0002993809912396496, "loss": 26.597, "step": 26300 }, { "epoch": 4.224, "grad_norm": 0.22640523314476013, "learning_rate": 0.00029937859114364574, "loss": 26.6011, "step": 26400 }, { "epoch": 4.24, "grad_norm": 0.18119996786117554, "learning_rate": 0.0002993761910476419, "loss": 26.8414, "step": 26500 }, { "epoch": 4.256, "grad_norm": 0.2026926428079605, "learning_rate": 0.00029937379095163803, "loss": 26.9172, "step": 26600 }, { "epoch": 4.272, "grad_norm": 0.20275373756885529, "learning_rate": 0.0002993713908556342, "loss": 26.6568, "step": 26700 }, { "epoch": 4.288, "grad_norm": 0.2261670082807541, "learning_rate": 0.00029936899075963037, "loss": 27.1839, "step": 26800 }, { "epoch": 4.304, "grad_norm": 0.18411505222320557, "learning_rate": 0.00029936659066362654, "loss": 26.4785, "step": 26900 }, { "epoch": 4.32, "grad_norm": 0.2916317582130432, "learning_rate": 0.0002993641905676227, "loss": 26.5309, "step": 27000 }, { "epoch": 4.336, "grad_norm": 0.18537244200706482, "learning_rate": 0.0002993617904716188, "loss": 27.1665, "step": 27100 }, { "epoch": 4.352, "grad_norm": 0.16285920143127441, "learning_rate": 0.000299359390375615, "loss": 27.2424, "step": 27200 }, { "epoch": 4.368, "grad_norm": 0.15773992240428925, "learning_rate": 0.00029935699027961116, "loss": 26.5359, "step": 27300 }, { "epoch": 4.384, "grad_norm": 0.18703384697437286, "learning_rate": 0.00029935459018360733, "loss": 27.342, "step": 27400 }, { "epoch": 4.4, "grad_norm": 0.18335498869419098, "learning_rate": 0.0002993521900876035, "loss": 27.0257, "step": 27500 }, { "epoch": 4.416, "grad_norm": 0.19414934515953064, "learning_rate": 0.00029934978999159967, "loss": 26.2998, "step": 27600 }, { "epoch": 4.432, "grad_norm": 0.20599210262298584, "learning_rate": 0.0002993473898955958, "loss": 25.9369, "step": 27700 }, { "epoch": 4.448, "grad_norm": 0.27044299244880676, "learning_rate": 0.00029934498979959195, "loss": 26.4132, "step": 27800 }, { "epoch": 4.464, "grad_norm": 0.22304300963878632, "learning_rate": 0.0002993425897035881, "loss": 26.2685, "step": 27900 }, { "epoch": 4.48, "grad_norm": 0.20784711837768555, "learning_rate": 0.0002993401896075843, "loss": 25.336, "step": 28000 }, { "epoch": 4.496, "grad_norm": 0.2017608880996704, "learning_rate": 0.00029933778951158046, "loss": 26.1331, "step": 28100 }, { "epoch": 4.5120000000000005, "grad_norm": 0.18563418090343475, "learning_rate": 0.0002993353894155766, "loss": 25.6813, "step": 28200 }, { "epoch": 4.5280000000000005, "grad_norm": 0.21515151858329773, "learning_rate": 0.00029933298931957274, "loss": 26.2951, "step": 28300 }, { "epoch": 4.5440000000000005, "grad_norm": 0.20512834191322327, "learning_rate": 0.0002993305892235689, "loss": 25.2256, "step": 28400 }, { "epoch": 4.5600000000000005, "grad_norm": 0.23129431903362274, "learning_rate": 0.0002993281891275651, "loss": 25.7071, "step": 28500 }, { "epoch": 4.576, "grad_norm": 0.18308007717132568, "learning_rate": 0.00029932578903156125, "loss": 25.5192, "step": 28600 }, { "epoch": 4.592, "grad_norm": 0.217178076505661, "learning_rate": 0.0002993233889355574, "loss": 25.349, "step": 28700 }, { "epoch": 4.608, "grad_norm": 0.18590569496154785, "learning_rate": 0.00029932098883955353, "loss": 25.2593, "step": 28800 }, { "epoch": 4.624, "grad_norm": 0.20052315294742584, "learning_rate": 0.0002993185887435497, "loss": 24.8334, "step": 28900 }, { "epoch": 4.64, "grad_norm": 0.21725590527057648, "learning_rate": 0.00029931621264850595, "loss": 24.6134, "step": 29000 }, { "epoch": 4.656, "grad_norm": 0.23973499238491058, "learning_rate": 0.00029931381255250206, "loss": 24.8209, "step": 29100 }, { "epoch": 4.672, "grad_norm": 0.20804470777511597, "learning_rate": 0.00029931141245649823, "loss": 25.0912, "step": 29200 }, { "epoch": 4.688, "grad_norm": 0.17555804550647736, "learning_rate": 0.0002993090363614544, "loss": 25.1723, "step": 29300 }, { "epoch": 4.704, "grad_norm": 0.17459039390087128, "learning_rate": 0.0002993066362654506, "loss": 24.5282, "step": 29400 }, { "epoch": 4.72, "grad_norm": 0.211078941822052, "learning_rate": 0.00029930423616944676, "loss": 24.6043, "step": 29500 }, { "epoch": 4.736, "grad_norm": 0.16957704722881317, "learning_rate": 0.0002993018360734429, "loss": 24.7947, "step": 29600 }, { "epoch": 4.752, "grad_norm": 0.2855212092399597, "learning_rate": 0.00029929943597743904, "loss": 24.5785, "step": 29700 }, { "epoch": 4.768, "grad_norm": 0.19777260720729828, "learning_rate": 0.0002992970358814352, "loss": 24.4989, "step": 29800 }, { "epoch": 4.784, "grad_norm": 0.17237554490566254, "learning_rate": 0.0002992946357854314, "loss": 24.6684, "step": 29900 }, { "epoch": 4.8, "grad_norm": 0.1824658066034317, "learning_rate": 0.00029929223568942755, "loss": 24.934, "step": 30000 }, { "epoch": 4.816, "grad_norm": 0.19774967432022095, "learning_rate": 0.0002992898355934237, "loss": 24.4343, "step": 30100 }, { "epoch": 4.832, "grad_norm": 0.2127138376235962, "learning_rate": 0.00029928743549741983, "loss": 24.7444, "step": 30200 }, { "epoch": 4.848, "grad_norm": 0.21794643998146057, "learning_rate": 0.000299285035401416, "loss": 25.2811, "step": 30300 }, { "epoch": 4.864, "grad_norm": 0.178062304854393, "learning_rate": 0.00029928263530541217, "loss": 24.9453, "step": 30400 }, { "epoch": 4.88, "grad_norm": 0.22796912491321564, "learning_rate": 0.00029928023520940834, "loss": 23.9367, "step": 30500 }, { "epoch": 4.896, "grad_norm": 0.18951456248760223, "learning_rate": 0.0002992778351134045, "loss": 23.7658, "step": 30600 }, { "epoch": 4.912, "grad_norm": 0.24202126264572144, "learning_rate": 0.0002992754350174007, "loss": 23.9004, "step": 30700 }, { "epoch": 4.928, "grad_norm": 0.19269002974033356, "learning_rate": 0.0002992730349213968, "loss": 23.2493, "step": 30800 }, { "epoch": 4.944, "grad_norm": 0.1657482087612152, "learning_rate": 0.00029927063482539296, "loss": 23.8883, "step": 30900 }, { "epoch": 4.96, "grad_norm": 0.151734858751297, "learning_rate": 0.00029926823472938913, "loss": 23.7884, "step": 31000 }, { "epoch": 4.976, "grad_norm": 0.2854020595550537, "learning_rate": 0.0002992658346333853, "loss": 24.1054, "step": 31100 }, { "epoch": 4.992, "grad_norm": 0.17750577628612518, "learning_rate": 0.00029926343453738147, "loss": 23.6583, "step": 31200 }, { "epoch": 5.008, "grad_norm": 0.17882367968559265, "learning_rate": 0.00029926103444137764, "loss": 23.4828, "step": 31300 }, { "epoch": 5.024, "grad_norm": 0.17182889580726624, "learning_rate": 0.0002992586343453738, "loss": 22.8774, "step": 31400 }, { "epoch": 5.04, "grad_norm": 0.20355378091335297, "learning_rate": 0.00029925623424937, "loss": 23.3064, "step": 31500 }, { "epoch": 5.056, "grad_norm": 0.21614141762256622, "learning_rate": 0.00029925383415336615, "loss": 22.8978, "step": 31600 }, { "epoch": 5.072, "grad_norm": 0.20654118061065674, "learning_rate": 0.00029925143405736226, "loss": 24.0182, "step": 31700 }, { "epoch": 5.088, "grad_norm": 0.17882691323757172, "learning_rate": 0.00029924903396135843, "loss": 22.8556, "step": 31800 }, { "epoch": 5.104, "grad_norm": 0.16477125883102417, "learning_rate": 0.0002992466338653546, "loss": 22.63, "step": 31900 }, { "epoch": 5.12, "grad_norm": 0.15241862833499908, "learning_rate": 0.00029924423376935077, "loss": 22.9513, "step": 32000 }, { "epoch": 5.136, "grad_norm": 0.17560409009456635, "learning_rate": 0.00029924183367334694, "loss": 22.808, "step": 32100 }, { "epoch": 5.152, "grad_norm": 0.18167634308338165, "learning_rate": 0.00029923943357734305, "loss": 23.0177, "step": 32200 }, { "epoch": 5.168, "grad_norm": 0.18328386545181274, "learning_rate": 0.0002992370334813392, "loss": 22.5144, "step": 32300 }, { "epoch": 5.184, "grad_norm": 0.20202048122882843, "learning_rate": 0.0002992346333853354, "loss": 23.1037, "step": 32400 }, { "epoch": 5.2, "grad_norm": 0.20026326179504395, "learning_rate": 0.00029923223328933156, "loss": 22.3593, "step": 32500 }, { "epoch": 5.216, "grad_norm": 0.1727285534143448, "learning_rate": 0.00029922983319332773, "loss": 22.214, "step": 32600 }, { "epoch": 5.232, "grad_norm": 0.1824960708618164, "learning_rate": 0.0002992274330973239, "loss": 22.2179, "step": 32700 }, { "epoch": 5.248, "grad_norm": 0.19371069967746735, "learning_rate": 0.00029922503300132, "loss": 22.453, "step": 32800 }, { "epoch": 5.264, "grad_norm": 0.22930407524108887, "learning_rate": 0.0002992226329053162, "loss": 22.1665, "step": 32900 }, { "epoch": 5.28, "grad_norm": 0.20372043550014496, "learning_rate": 0.00029922023280931235, "loss": 22.1181, "step": 33000 }, { "epoch": 5.296, "grad_norm": 0.20339564979076385, "learning_rate": 0.0002992178327133085, "loss": 22.5446, "step": 33100 }, { "epoch": 5.312, "grad_norm": 0.2182660847902298, "learning_rate": 0.0002992154326173047, "loss": 22.3062, "step": 33200 }, { "epoch": 5.328, "grad_norm": 0.18666419386863708, "learning_rate": 0.0002992130325213008, "loss": 22.0127, "step": 33300 }, { "epoch": 5.344, "grad_norm": 0.2193373292684555, "learning_rate": 0.000299210632425297, "loss": 22.1167, "step": 33400 }, { "epoch": 5.36, "grad_norm": 0.19642606377601624, "learning_rate": 0.00029920823232929315, "loss": 21.8393, "step": 33500 }, { "epoch": 5.376, "grad_norm": 0.24106252193450928, "learning_rate": 0.0002992058322332893, "loss": 21.7386, "step": 33600 }, { "epoch": 5.392, "grad_norm": 0.17611666023731232, "learning_rate": 0.0002992034321372855, "loss": 22.1787, "step": 33700 }, { "epoch": 5.408, "grad_norm": 0.23640978336334229, "learning_rate": 0.00029920103204128165, "loss": 21.5912, "step": 33800 }, { "epoch": 5.424, "grad_norm": 0.19579695165157318, "learning_rate": 0.00029919863194527777, "loss": 22.1147, "step": 33900 }, { "epoch": 5.44, "grad_norm": 0.18251273036003113, "learning_rate": 0.00029919623184927394, "loss": 21.8284, "step": 34000 }, { "epoch": 5.456, "grad_norm": 0.2099759876728058, "learning_rate": 0.0002991938317532701, "loss": 21.5234, "step": 34100 }, { "epoch": 5.4719999999999995, "grad_norm": 0.21391774713993073, "learning_rate": 0.0002991914316572663, "loss": 21.1876, "step": 34200 }, { "epoch": 5.4879999999999995, "grad_norm": 0.17656175792217255, "learning_rate": 0.00029918903156126244, "loss": 21.7905, "step": 34300 }, { "epoch": 5.504, "grad_norm": 0.1752483993768692, "learning_rate": 0.00029918663146525856, "loss": 20.9481, "step": 34400 }, { "epoch": 5.52, "grad_norm": 0.29879820346832275, "learning_rate": 0.00029918423136925473, "loss": 21.2073, "step": 34500 }, { "epoch": 5.536, "grad_norm": 0.1947035789489746, "learning_rate": 0.0002991818312732509, "loss": 21.0199, "step": 34600 }, { "epoch": 5.552, "grad_norm": 0.15402550995349884, "learning_rate": 0.00029917943117724707, "loss": 21.4862, "step": 34700 }, { "epoch": 5.568, "grad_norm": 0.21479055285453796, "learning_rate": 0.00029917703108124324, "loss": 20.3479, "step": 34800 }, { "epoch": 5.584, "grad_norm": 0.15968792140483856, "learning_rate": 0.0002991746309852394, "loss": 20.8151, "step": 34900 }, { "epoch": 5.6, "grad_norm": 0.16876402497291565, "learning_rate": 0.0002991722308892355, "loss": 21.8482, "step": 35000 }, { "epoch": 5.616, "grad_norm": 0.16191044449806213, "learning_rate": 0.0002991698307932317, "loss": 21.4486, "step": 35100 }, { "epoch": 5.632, "grad_norm": 0.20595960319042206, "learning_rate": 0.00029916743069722786, "loss": 21.7225, "step": 35200 }, { "epoch": 5.648, "grad_norm": 0.1939288079738617, "learning_rate": 0.00029916503060122403, "loss": 21.0107, "step": 35300 }, { "epoch": 5.664, "grad_norm": 0.20212168991565704, "learning_rate": 0.0002991626305052202, "loss": 20.4026, "step": 35400 }, { "epoch": 5.68, "grad_norm": 0.1956707388162613, "learning_rate": 0.0002991602544101764, "loss": 20.9491, "step": 35500 }, { "epoch": 5.696, "grad_norm": 0.22702528536319733, "learning_rate": 0.00029915785431417256, "loss": 21.12, "step": 35600 }, { "epoch": 5.712, "grad_norm": 0.19706673920154572, "learning_rate": 0.00029915547821912874, "loss": 21.5166, "step": 35700 }, { "epoch": 5.728, "grad_norm": 0.18108151853084564, "learning_rate": 0.0002991530781231249, "loss": 20.4059, "step": 35800 }, { "epoch": 5.744, "grad_norm": 0.1714268922805786, "learning_rate": 0.00029915067802712103, "loss": 20.2456, "step": 35900 }, { "epoch": 5.76, "grad_norm": 0.1415804773569107, "learning_rate": 0.0002991482779311172, "loss": 20.3176, "step": 36000 }, { "epoch": 5.776, "grad_norm": 0.1928543597459793, "learning_rate": 0.00029914587783511337, "loss": 20.797, "step": 36100 }, { "epoch": 5.792, "grad_norm": 0.17042042315006256, "learning_rate": 0.00029914347773910954, "loss": 20.2684, "step": 36200 }, { "epoch": 5.808, "grad_norm": 0.1929057389497757, "learning_rate": 0.0002991410776431057, "loss": 19.7169, "step": 36300 }, { "epoch": 5.824, "grad_norm": 0.19770380854606628, "learning_rate": 0.0002991386775471018, "loss": 20.3972, "step": 36400 }, { "epoch": 5.84, "grad_norm": 0.19927264750003815, "learning_rate": 0.000299136277451098, "loss": 20.3105, "step": 36500 }, { "epoch": 5.856, "grad_norm": 0.2222350686788559, "learning_rate": 0.00029913387735509416, "loss": 20.3396, "step": 36600 }, { "epoch": 5.872, "grad_norm": 0.15629681944847107, "learning_rate": 0.00029913147725909033, "loss": 19.7281, "step": 36700 }, { "epoch": 5.888, "grad_norm": 0.1714082509279251, "learning_rate": 0.0002991290771630865, "loss": 20.2121, "step": 36800 }, { "epoch": 5.904, "grad_norm": 0.19152860343456268, "learning_rate": 0.00029912667706708267, "loss": 20.3316, "step": 36900 }, { "epoch": 5.92, "grad_norm": 0.18097779154777527, "learning_rate": 0.0002991242769710788, "loss": 19.9225, "step": 37000 }, { "epoch": 5.936, "grad_norm": 0.21503089368343353, "learning_rate": 0.00029912187687507495, "loss": 20.3151, "step": 37100 }, { "epoch": 5.952, "grad_norm": 0.16976934671401978, "learning_rate": 0.0002991194767790711, "loss": 20.4782, "step": 37200 }, { "epoch": 5.968, "grad_norm": 0.1788826435804367, "learning_rate": 0.0002991170766830673, "loss": 19.616, "step": 37300 }, { "epoch": 5.984, "grad_norm": 0.17762643098831177, "learning_rate": 0.00029911467658706346, "loss": 19.4074, "step": 37400 }, { "epoch": 6.0, "grad_norm": 0.19231481850147247, "learning_rate": 0.0002991122764910596, "loss": 19.3966, "step": 37500 }, { "epoch": 6.016, "grad_norm": 0.2067825198173523, "learning_rate": 0.0002991098763950558, "loss": 19.6924, "step": 37600 }, { "epoch": 6.032, "grad_norm": 0.1930302083492279, "learning_rate": 0.00029910747629905196, "loss": 19.765, "step": 37700 }, { "epoch": 6.048, "grad_norm": 0.2076890915632248, "learning_rate": 0.00029910507620304813, "loss": 19.0516, "step": 37800 }, { "epoch": 6.064, "grad_norm": 0.2006111741065979, "learning_rate": 0.00029910267610704425, "loss": 19.1025, "step": 37900 }, { "epoch": 6.08, "grad_norm": 0.1836411952972412, "learning_rate": 0.0002991002760110404, "loss": 19.3714, "step": 38000 }, { "epoch": 6.096, "grad_norm": 0.1817934662103653, "learning_rate": 0.0002990978759150366, "loss": 19.1752, "step": 38100 }, { "epoch": 6.112, "grad_norm": 0.18150608241558075, "learning_rate": 0.00029909547581903276, "loss": 19.5865, "step": 38200 }, { "epoch": 6.128, "grad_norm": 0.3108033835887909, "learning_rate": 0.0002990930757230289, "loss": 19.3632, "step": 38300 }, { "epoch": 6.144, "grad_norm": 0.18861189484596252, "learning_rate": 0.00029909067562702504, "loss": 19.9617, "step": 38400 }, { "epoch": 6.16, "grad_norm": 0.16909874975681305, "learning_rate": 0.0002990882755310212, "loss": 19.8722, "step": 38500 }, { "epoch": 6.176, "grad_norm": 0.16401100158691406, "learning_rate": 0.0002990858754350174, "loss": 19.3652, "step": 38600 }, { "epoch": 6.192, "grad_norm": 0.17053301632404327, "learning_rate": 0.00029908347533901355, "loss": 19.4264, "step": 38700 }, { "epoch": 6.208, "grad_norm": 0.18607936799526215, "learning_rate": 0.0002990810752430097, "loss": 19.3128, "step": 38800 }, { "epoch": 6.224, "grad_norm": 0.2513495087623596, "learning_rate": 0.0002990786751470059, "loss": 20.1134, "step": 38900 }, { "epoch": 6.24, "grad_norm": 0.21938976645469666, "learning_rate": 0.000299076275051002, "loss": 19.5682, "step": 39000 }, { "epoch": 6.256, "grad_norm": 0.21253296732902527, "learning_rate": 0.00029907387495499817, "loss": 18.7325, "step": 39100 }, { "epoch": 6.272, "grad_norm": 0.21298116445541382, "learning_rate": 0.00029907147485899434, "loss": 19.0698, "step": 39200 }, { "epoch": 6.288, "grad_norm": 0.17804065346717834, "learning_rate": 0.0002990690747629905, "loss": 18.3022, "step": 39300 }, { "epoch": 6.304, "grad_norm": 0.31990084052085876, "learning_rate": 0.0002990666986679467, "loss": 18.9093, "step": 39400 }, { "epoch": 6.32, "grad_norm": 0.17742526531219482, "learning_rate": 0.0002990642985719428, "loss": 18.6614, "step": 39500 }, { "epoch": 6.336, "grad_norm": 0.20601534843444824, "learning_rate": 0.000299061898475939, "loss": 19.6871, "step": 39600 }, { "epoch": 6.352, "grad_norm": 0.16021846234798431, "learning_rate": 0.00029905949837993515, "loss": 18.6417, "step": 39700 }, { "epoch": 6.368, "grad_norm": 0.1588086634874344, "learning_rate": 0.0002990570982839313, "loss": 18.3146, "step": 39800 }, { "epoch": 6.384, "grad_norm": 0.21372877061367035, "learning_rate": 0.0002990546981879275, "loss": 19.0519, "step": 39900 }, { "epoch": 6.4, "grad_norm": 0.18066450953483582, "learning_rate": 0.00029905229809192366, "loss": 19.2848, "step": 40000 }, { "epoch": 6.416, "grad_norm": 0.23790153861045837, "learning_rate": 0.0002990498979959198, "loss": 18.7495, "step": 40100 }, { "epoch": 6.432, "grad_norm": 0.21764115989208221, "learning_rate": 0.00029904749789991594, "loss": 18.5835, "step": 40200 }, { "epoch": 6.448, "grad_norm": 0.18615952134132385, "learning_rate": 0.0002990450978039121, "loss": 17.9751, "step": 40300 }, { "epoch": 6.464, "grad_norm": 0.1657874882221222, "learning_rate": 0.0002990426977079083, "loss": 18.5635, "step": 40400 }, { "epoch": 6.48, "grad_norm": 0.3158019185066223, "learning_rate": 0.00029904029761190445, "loss": 18.6618, "step": 40500 }, { "epoch": 6.496, "grad_norm": 0.2320430427789688, "learning_rate": 0.0002990378975159006, "loss": 18.2968, "step": 40600 }, { "epoch": 6.5120000000000005, "grad_norm": 0.20868684351444244, "learning_rate": 0.0002990354974198968, "loss": 18.595, "step": 40700 }, { "epoch": 6.5280000000000005, "grad_norm": 0.2185734063386917, "learning_rate": 0.00029903309732389296, "loss": 17.9672, "step": 40800 }, { "epoch": 6.5440000000000005, "grad_norm": 0.22871826589107513, "learning_rate": 0.0002990306972278891, "loss": 18.0843, "step": 40900 }, { "epoch": 6.5600000000000005, "grad_norm": 0.16801375150680542, "learning_rate": 0.00029902829713188524, "loss": 18.138, "step": 41000 }, { "epoch": 6.576, "grad_norm": 0.17401717603206635, "learning_rate": 0.0002990258970358814, "loss": 18.7431, "step": 41100 }, { "epoch": 6.592, "grad_norm": 0.17664673924446106, "learning_rate": 0.0002990234969398776, "loss": 17.966, "step": 41200 }, { "epoch": 6.608, "grad_norm": 0.2024875283241272, "learning_rate": 0.00029902109684387375, "loss": 17.9339, "step": 41300 }, { "epoch": 6.624, "grad_norm": 0.19322896003723145, "learning_rate": 0.0002990186967478699, "loss": 18.5554, "step": 41400 }, { "epoch": 6.64, "grad_norm": 0.2797154188156128, "learning_rate": 0.00029901629665186603, "loss": 17.5192, "step": 41500 }, { "epoch": 6.656, "grad_norm": 0.2197944074869156, "learning_rate": 0.0002990138965558622, "loss": 18.4582, "step": 41600 }, { "epoch": 6.672, "grad_norm": 0.18805234134197235, "learning_rate": 0.00029901149645985837, "loss": 17.9245, "step": 41700 }, { "epoch": 6.688, "grad_norm": 0.14986388385295868, "learning_rate": 0.00029900909636385454, "loss": 17.7746, "step": 41800 }, { "epoch": 6.704, "grad_norm": 0.26323381066322327, "learning_rate": 0.0002990066962678507, "loss": 17.6134, "step": 41900 }, { "epoch": 6.72, "grad_norm": 0.1791141778230667, "learning_rate": 0.0002990042961718469, "loss": 17.7648, "step": 42000 }, { "epoch": 6.736, "grad_norm": 0.22629794478416443, "learning_rate": 0.000299001920076803, "loss": 18.2337, "step": 42100 }, { "epoch": 6.752, "grad_norm": 0.17983581125736237, "learning_rate": 0.0002989995199807992, "loss": 17.4193, "step": 42200 }, { "epoch": 6.768, "grad_norm": 0.17379482090473175, "learning_rate": 0.00029899711988479535, "loss": 17.9815, "step": 42300 }, { "epoch": 6.784, "grad_norm": 0.2074684351682663, "learning_rate": 0.0002989947197887915, "loss": 17.898, "step": 42400 }, { "epoch": 6.8, "grad_norm": 0.16909289360046387, "learning_rate": 0.0002989923196927877, "loss": 17.7292, "step": 42500 }, { "epoch": 6.816, "grad_norm": 0.184371218085289, "learning_rate": 0.00029898991959678386, "loss": 18.0706, "step": 42600 }, { "epoch": 6.832, "grad_norm": 0.17724382877349854, "learning_rate": 0.00029898751950078, "loss": 17.9871, "step": 42700 }, { "epoch": 6.848, "grad_norm": 0.2286718785762787, "learning_rate": 0.00029898511940477614, "loss": 17.5911, "step": 42800 }, { "epoch": 6.864, "grad_norm": 0.2002006471157074, "learning_rate": 0.0002989827193087723, "loss": 17.4336, "step": 42900 }, { "epoch": 6.88, "grad_norm": 0.20236457884311676, "learning_rate": 0.0002989803192127685, "loss": 17.0849, "step": 43000 }, { "epoch": 6.896, "grad_norm": 0.23483681678771973, "learning_rate": 0.00029897791911676465, "loss": 17.7893, "step": 43100 }, { "epoch": 6.912, "grad_norm": 0.18751464784145355, "learning_rate": 0.00029897551902076077, "loss": 17.4798, "step": 43200 }, { "epoch": 6.928, "grad_norm": 0.17341011762619019, "learning_rate": 0.00029897311892475694, "loss": 17.7278, "step": 43300 }, { "epoch": 6.944, "grad_norm": 0.15160439908504486, "learning_rate": 0.0002989707188287531, "loss": 17.4948, "step": 43400 }, { "epoch": 6.96, "grad_norm": 0.19316324591636658, "learning_rate": 0.0002989683187327493, "loss": 17.3409, "step": 43500 }, { "epoch": 6.976, "grad_norm": 0.1800646036863327, "learning_rate": 0.00029896591863674544, "loss": 17.5152, "step": 43600 }, { "epoch": 6.992, "grad_norm": 0.19359643757343292, "learning_rate": 0.0002989635185407416, "loss": 17.2701, "step": 43700 }, { "epoch": 7.008, "grad_norm": 0.21103709936141968, "learning_rate": 0.0002989611184447378, "loss": 17.0028, "step": 43800 }, { "epoch": 7.024, "grad_norm": 0.18972234427928925, "learning_rate": 0.00029895871834873395, "loss": 16.8714, "step": 43900 }, { "epoch": 7.04, "grad_norm": 0.16335220634937286, "learning_rate": 0.0002989563182527301, "loss": 17.1409, "step": 44000 }, { "epoch": 7.056, "grad_norm": 0.16595561802387238, "learning_rate": 0.00029895391815672624, "loss": 17.1677, "step": 44100 }, { "epoch": 7.072, "grad_norm": 0.1885690540075302, "learning_rate": 0.0002989515180607224, "loss": 17.1327, "step": 44200 }, { "epoch": 7.088, "grad_norm": 0.16525697708129883, "learning_rate": 0.0002989491179647186, "loss": 17.0265, "step": 44300 }, { "epoch": 7.104, "grad_norm": 0.17798613011837006, "learning_rate": 0.00029894671786871474, "loss": 16.5858, "step": 44400 }, { "epoch": 7.12, "grad_norm": 0.17442761361598969, "learning_rate": 0.0002989443177727109, "loss": 16.7029, "step": 44500 }, { "epoch": 7.136, "grad_norm": 0.17014281451702118, "learning_rate": 0.0002989419176767071, "loss": 16.3283, "step": 44600 }, { "epoch": 7.152, "grad_norm": 0.21125547587871552, "learning_rate": 0.0002989395175807032, "loss": 17.0964, "step": 44700 }, { "epoch": 7.168, "grad_norm": 0.15473531186580658, "learning_rate": 0.00029893711748469937, "loss": 17.2634, "step": 44800 }, { "epoch": 7.184, "grad_norm": 0.22423428297042847, "learning_rate": 0.00029893471738869553, "loss": 16.6492, "step": 44900 }, { "epoch": 7.2, "grad_norm": 0.23651999235153198, "learning_rate": 0.0002989323172926917, "loss": 17.2672, "step": 45000 }, { "epoch": 7.216, "grad_norm": 0.18389280140399933, "learning_rate": 0.00029892991719668787, "loss": 16.3061, "step": 45100 }, { "epoch": 7.232, "grad_norm": 0.19786329567432404, "learning_rate": 0.000298927517100684, "loss": 16.7178, "step": 45200 }, { "epoch": 7.248, "grad_norm": 0.1748264580965042, "learning_rate": 0.00029892511700468016, "loss": 16.8728, "step": 45300 }, { "epoch": 7.264, "grad_norm": 0.17337900400161743, "learning_rate": 0.0002989227169086763, "loss": 16.143, "step": 45400 }, { "epoch": 7.28, "grad_norm": 0.1627172827720642, "learning_rate": 0.0002989203168126725, "loss": 16.677, "step": 45500 }, { "epoch": 7.296, "grad_norm": 0.18607047200202942, "learning_rate": 0.00029891791671666866, "loss": 16.6493, "step": 45600 }, { "epoch": 7.312, "grad_norm": 0.17733363807201385, "learning_rate": 0.00029891551662066483, "loss": 16.8518, "step": 45700 }, { "epoch": 7.328, "grad_norm": 0.17257067561149597, "learning_rate": 0.00029891311652466095, "loss": 16.7963, "step": 45800 }, { "epoch": 7.344, "grad_norm": 0.22989864647388458, "learning_rate": 0.0002989107164286571, "loss": 16.6846, "step": 45900 }, { "epoch": 7.36, "grad_norm": 0.1924850195646286, "learning_rate": 0.0002989083163326533, "loss": 16.7258, "step": 46000 }, { "epoch": 7.376, "grad_norm": 0.15162524580955505, "learning_rate": 0.00029890591623664946, "loss": 16.0529, "step": 46100 }, { "epoch": 7.392, "grad_norm": 0.19990018010139465, "learning_rate": 0.00029890354014160564, "loss": 16.3768, "step": 46200 }, { "epoch": 7.408, "grad_norm": 0.1724652647972107, "learning_rate": 0.00029890114004560176, "loss": 17.0495, "step": 46300 }, { "epoch": 7.424, "grad_norm": 0.1920676976442337, "learning_rate": 0.00029889873994959793, "loss": 16.1202, "step": 46400 }, { "epoch": 7.44, "grad_norm": 0.1957552433013916, "learning_rate": 0.00029889636385455417, "loss": 16.413, "step": 46500 }, { "epoch": 7.456, "grad_norm": 0.14071592688560486, "learning_rate": 0.00029889396375855034, "loss": 15.732, "step": 46600 }, { "epoch": 7.4719999999999995, "grad_norm": 0.1833236664533615, "learning_rate": 0.00029889156366254646, "loss": 16.7192, "step": 46700 }, { "epoch": 7.4879999999999995, "grad_norm": 0.2189483791589737, "learning_rate": 0.0002988891635665426, "loss": 16.0979, "step": 46800 }, { "epoch": 7.504, "grad_norm": 0.17360301315784454, "learning_rate": 0.0002988867634705388, "loss": 15.8968, "step": 46900 }, { "epoch": 7.52, "grad_norm": 0.1952562779188156, "learning_rate": 0.00029888436337453496, "loss": 15.9731, "step": 47000 }, { "epoch": 7.536, "grad_norm": 0.1601036638021469, "learning_rate": 0.00029888196327853113, "loss": 16.392, "step": 47100 }, { "epoch": 7.552, "grad_norm": 0.17277076840400696, "learning_rate": 0.00029887956318252725, "loss": 15.9779, "step": 47200 }, { "epoch": 7.568, "grad_norm": 0.1868811696767807, "learning_rate": 0.0002988771630865234, "loss": 15.5355, "step": 47300 }, { "epoch": 7.584, "grad_norm": 0.2078930139541626, "learning_rate": 0.00029887478699147966, "loss": 15.8833, "step": 47400 }, { "epoch": 7.6, "grad_norm": 0.17647911608219147, "learning_rate": 0.0002988723868954758, "loss": 16.0442, "step": 47500 }, { "epoch": 7.616, "grad_norm": 0.20268210768699646, "learning_rate": 0.00029886998679947194, "loss": 16.1957, "step": 47600 }, { "epoch": 7.632, "grad_norm": 0.1820913553237915, "learning_rate": 0.0002988675867034681, "loss": 15.8208, "step": 47700 }, { "epoch": 7.648, "grad_norm": 0.2001231610774994, "learning_rate": 0.0002988651866074643, "loss": 16.1706, "step": 47800 }, { "epoch": 7.664, "grad_norm": 0.18558456003665924, "learning_rate": 0.00029886278651146045, "loss": 15.9747, "step": 47900 }, { "epoch": 7.68, "grad_norm": 0.17034992575645447, "learning_rate": 0.0002988603864154566, "loss": 16.4537, "step": 48000 }, { "epoch": 7.696, "grad_norm": 0.16974206268787384, "learning_rate": 0.00029885798631945274, "loss": 15.5116, "step": 48100 }, { "epoch": 7.712, "grad_norm": 0.1771545112133026, "learning_rate": 0.0002988555862234489, "loss": 15.8605, "step": 48200 }, { "epoch": 7.728, "grad_norm": 0.17756806313991547, "learning_rate": 0.0002988531861274451, "loss": 15.8965, "step": 48300 }, { "epoch": 7.744, "grad_norm": 0.20773237943649292, "learning_rate": 0.00029885078603144124, "loss": 15.1184, "step": 48400 }, { "epoch": 7.76, "grad_norm": 0.18383237719535828, "learning_rate": 0.0002988483859354374, "loss": 16.0467, "step": 48500 }, { "epoch": 7.776, "grad_norm": 0.18748898804187775, "learning_rate": 0.0002988459858394336, "loss": 15.3286, "step": 48600 }, { "epoch": 7.792, "grad_norm": 0.2877133786678314, "learning_rate": 0.0002988435857434297, "loss": 15.8562, "step": 48700 }, { "epoch": 7.808, "grad_norm": 0.168177530169487, "learning_rate": 0.00029884118564742587, "loss": 15.8613, "step": 48800 }, { "epoch": 7.824, "grad_norm": 0.18536759912967682, "learning_rate": 0.00029883878555142203, "loss": 15.8204, "step": 48900 }, { "epoch": 7.84, "grad_norm": 0.15699341893196106, "learning_rate": 0.0002988363854554182, "loss": 15.6026, "step": 49000 }, { "epoch": 7.856, "grad_norm": 0.17730812728405, "learning_rate": 0.0002988339853594144, "loss": 15.5268, "step": 49100 }, { "epoch": 7.872, "grad_norm": 0.16140446066856384, "learning_rate": 0.0002988315852634105, "loss": 15.3766, "step": 49200 }, { "epoch": 7.888, "grad_norm": 0.16114762425422668, "learning_rate": 0.00029882918516740666, "loss": 15.8614, "step": 49300 }, { "epoch": 7.904, "grad_norm": 0.19132892787456512, "learning_rate": 0.0002988267850714028, "loss": 15.4026, "step": 49400 }, { "epoch": 7.92, "grad_norm": 0.190206840634346, "learning_rate": 0.000298824384975399, "loss": 15.42, "step": 49500 }, { "epoch": 7.936, "grad_norm": 0.18264752626419067, "learning_rate": 0.00029882198487939516, "loss": 15.5455, "step": 49600 }, { "epoch": 7.952, "grad_norm": 0.1774350255727768, "learning_rate": 0.00029881958478339133, "loss": 15.7328, "step": 49700 }, { "epoch": 7.968, "grad_norm": 0.1655503213405609, "learning_rate": 0.00029881718468738745, "loss": 15.5836, "step": 49800 }, { "epoch": 7.984, "grad_norm": 0.18890833854675293, "learning_rate": 0.0002988147845913836, "loss": 15.4838, "step": 49900 }, { "epoch": 8.0, "grad_norm": 0.1880652904510498, "learning_rate": 0.0002988123844953798, "loss": 15.2114, "step": 50000 }, { "epoch": 8.016, "grad_norm": 0.18285752832889557, "learning_rate": 0.00029880998439937596, "loss": 14.9511, "step": 50100 }, { "epoch": 8.032, "grad_norm": 0.19436243176460266, "learning_rate": 0.0002988075843033721, "loss": 15.4968, "step": 50200 }, { "epoch": 8.048, "grad_norm": 0.1822815239429474, "learning_rate": 0.00029880518420736824, "loss": 14.7632, "step": 50300 }, { "epoch": 8.064, "grad_norm": 0.16189494729042053, "learning_rate": 0.0002988027841113644, "loss": 14.937, "step": 50400 }, { "epoch": 8.08, "grad_norm": 0.152993842959404, "learning_rate": 0.0002988003840153606, "loss": 14.676, "step": 50500 }, { "epoch": 8.096, "grad_norm": 0.2119678407907486, "learning_rate": 0.00029879798391935675, "loss": 15.725, "step": 50600 }, { "epoch": 8.112, "grad_norm": 0.22487041354179382, "learning_rate": 0.0002987955838233529, "loss": 15.0505, "step": 50700 }, { "epoch": 8.128, "grad_norm": 0.16072215139865875, "learning_rate": 0.0002987931837273491, "loss": 15.4103, "step": 50800 }, { "epoch": 8.144, "grad_norm": 0.16657765209674835, "learning_rate": 0.0002987907836313452, "loss": 14.7139, "step": 50900 }, { "epoch": 8.16, "grad_norm": 0.15327660739421844, "learning_rate": 0.00029878838353534137, "loss": 14.6325, "step": 51000 }, { "epoch": 8.176, "grad_norm": 0.20472773909568787, "learning_rate": 0.00029878598343933754, "loss": 14.7217, "step": 51100 }, { "epoch": 8.192, "grad_norm": 0.214088574051857, "learning_rate": 0.0002987835833433337, "loss": 14.121, "step": 51200 }, { "epoch": 8.208, "grad_norm": 0.20903360843658447, "learning_rate": 0.0002987811832473299, "loss": 15.1448, "step": 51300 }, { "epoch": 8.224, "grad_norm": 0.20621182024478912, "learning_rate": 0.000298778783151326, "loss": 14.7588, "step": 51400 }, { "epoch": 8.24, "grad_norm": 0.18515250086784363, "learning_rate": 0.00029877638305532216, "loss": 15.3639, "step": 51500 }, { "epoch": 8.256, "grad_norm": 0.17146657407283783, "learning_rate": 0.00029877398295931833, "loss": 14.4964, "step": 51600 }, { "epoch": 8.272, "grad_norm": 0.18953190743923187, "learning_rate": 0.0002987715828633145, "loss": 14.5639, "step": 51700 }, { "epoch": 8.288, "grad_norm": 0.17434297502040863, "learning_rate": 0.0002987692067682707, "loss": 15.2875, "step": 51800 }, { "epoch": 8.304, "grad_norm": 0.16686853766441345, "learning_rate": 0.00029876680667226686, "loss": 14.4679, "step": 51900 }, { "epoch": 8.32, "grad_norm": 0.14394892752170563, "learning_rate": 0.00029876440657626303, "loss": 14.5162, "step": 52000 }, { "epoch": 8.336, "grad_norm": 0.20816083252429962, "learning_rate": 0.0002987620064802592, "loss": 15.2646, "step": 52100 }, { "epoch": 8.352, "grad_norm": 0.16660048067569733, "learning_rate": 0.00029875960638425537, "loss": 15.0214, "step": 52200 }, { "epoch": 8.368, "grad_norm": 0.16948403418064117, "learning_rate": 0.0002987572062882515, "loss": 14.7227, "step": 52300 }, { "epoch": 8.384, "grad_norm": 0.15360529720783234, "learning_rate": 0.00029875480619224765, "loss": 14.8453, "step": 52400 }, { "epoch": 8.4, "grad_norm": 0.1730951964855194, "learning_rate": 0.0002987524060962438, "loss": 14.6784, "step": 52500 }, { "epoch": 8.416, "grad_norm": 0.1714763641357422, "learning_rate": 0.00029875000600024, "loss": 14.3347, "step": 52600 }, { "epoch": 8.432, "grad_norm": 0.21991823613643646, "learning_rate": 0.00029874760590423616, "loss": 14.7373, "step": 52700 }, { "epoch": 8.448, "grad_norm": 0.26085495948791504, "learning_rate": 0.00029874520580823233, "loss": 14.4799, "step": 52800 }, { "epoch": 8.464, "grad_norm": 0.15623599290847778, "learning_rate": 0.00029874280571222844, "loss": 14.9737, "step": 52900 }, { "epoch": 8.48, "grad_norm": 0.14685533940792084, "learning_rate": 0.0002987404056162246, "loss": 14.4126, "step": 53000 }, { "epoch": 8.496, "grad_norm": 0.19048573076725006, "learning_rate": 0.0002987380055202208, "loss": 14.6049, "step": 53100 }, { "epoch": 8.512, "grad_norm": 0.15729829668998718, "learning_rate": 0.00029873560542421695, "loss": 14.8894, "step": 53200 }, { "epoch": 8.528, "grad_norm": 0.18257932364940643, "learning_rate": 0.0002987332053282131, "loss": 14.3249, "step": 53300 }, { "epoch": 8.544, "grad_norm": 0.20492464303970337, "learning_rate": 0.00029873080523220923, "loss": 15.0053, "step": 53400 }, { "epoch": 8.56, "grad_norm": 0.22026245296001434, "learning_rate": 0.0002987284051362054, "loss": 14.1141, "step": 53500 }, { "epoch": 8.576, "grad_norm": 0.16078276932239532, "learning_rate": 0.00029872600504020157, "loss": 14.3822, "step": 53600 }, { "epoch": 8.592, "grad_norm": 0.19619469344615936, "learning_rate": 0.00029872360494419774, "loss": 14.3099, "step": 53700 }, { "epoch": 8.608, "grad_norm": 0.15051692724227905, "learning_rate": 0.0002987212048481939, "loss": 13.7999, "step": 53800 }, { "epoch": 8.624, "grad_norm": 0.19525863230228424, "learning_rate": 0.0002987188047521901, "loss": 14.3567, "step": 53900 }, { "epoch": 8.64, "grad_norm": 0.16883693635463715, "learning_rate": 0.0002987164046561862, "loss": 13.3731, "step": 54000 }, { "epoch": 8.656, "grad_norm": 0.1703290492296219, "learning_rate": 0.00029871400456018236, "loss": 13.8462, "step": 54100 }, { "epoch": 8.672, "grad_norm": 0.18907932937145233, "learning_rate": 0.00029871160446417853, "loss": 14.5297, "step": 54200 }, { "epoch": 8.688, "grad_norm": 0.16260308027267456, "learning_rate": 0.0002987092043681747, "loss": 14.0573, "step": 54300 }, { "epoch": 8.704, "grad_norm": 0.1732938140630722, "learning_rate": 0.0002987068282731309, "loss": 14.1114, "step": 54400 }, { "epoch": 8.72, "grad_norm": 0.20591895282268524, "learning_rate": 0.00029870442817712706, "loss": 13.7101, "step": 54500 }, { "epoch": 8.736, "grad_norm": 0.1871296912431717, "learning_rate": 0.00029870202808112323, "loss": 14.539, "step": 54600 }, { "epoch": 8.752, "grad_norm": 0.15711694955825806, "learning_rate": 0.0002986996279851194, "loss": 14.4353, "step": 54700 }, { "epoch": 8.768, "grad_norm": 0.1790015697479248, "learning_rate": 0.00029869722788911557, "loss": 14.4861, "step": 54800 }, { "epoch": 8.784, "grad_norm": 0.1903577744960785, "learning_rate": 0.0002986948277931117, "loss": 14.2582, "step": 54900 }, { "epoch": 8.8, "grad_norm": 0.18150964379310608, "learning_rate": 0.00029869242769710785, "loss": 13.9522, "step": 55000 }, { "epoch": 8.816, "grad_norm": 0.17604489624500275, "learning_rate": 0.000298690027601104, "loss": 14.4482, "step": 55100 }, { "epoch": 8.832, "grad_norm": 0.18487071990966797, "learning_rate": 0.0002986876275051002, "loss": 13.9656, "step": 55200 }, { "epoch": 8.848, "grad_norm": 0.15276212990283966, "learning_rate": 0.00029868522740909636, "loss": 14.2513, "step": 55300 }, { "epoch": 8.864, "grad_norm": 0.19339829683303833, "learning_rate": 0.0002986828273130925, "loss": 13.7151, "step": 55400 }, { "epoch": 8.88, "grad_norm": 0.14462265372276306, "learning_rate": 0.00029868042721708864, "loss": 13.8859, "step": 55500 }, { "epoch": 8.896, "grad_norm": 0.16163522005081177, "learning_rate": 0.0002986780271210848, "loss": 13.7567, "step": 55600 }, { "epoch": 8.912, "grad_norm": 0.15859289467334747, "learning_rate": 0.000298675627025081, "loss": 14.4693, "step": 55700 }, { "epoch": 8.928, "grad_norm": 0.1641652137041092, "learning_rate": 0.00029867322692907715, "loss": 13.6118, "step": 55800 }, { "epoch": 8.943999999999999, "grad_norm": 0.18410654366016388, "learning_rate": 0.0002986708268330733, "loss": 14.3033, "step": 55900 }, { "epoch": 8.96, "grad_norm": 0.18847694993019104, "learning_rate": 0.00029866842673706944, "loss": 13.2935, "step": 56000 }, { "epoch": 8.975999999999999, "grad_norm": 0.15224353969097137, "learning_rate": 0.0002986660266410656, "loss": 13.6185, "step": 56100 }, { "epoch": 8.992, "grad_norm": 0.15307171642780304, "learning_rate": 0.0002986636265450618, "loss": 13.9229, "step": 56200 }, { "epoch": 9.008, "grad_norm": 0.1455143541097641, "learning_rate": 0.00029866122644905794, "loss": 13.9716, "step": 56300 }, { "epoch": 9.024, "grad_norm": 0.18889980018138885, "learning_rate": 0.0002986588263530541, "loss": 13.8509, "step": 56400 }, { "epoch": 9.04, "grad_norm": 0.19757011532783508, "learning_rate": 0.0002986564262570502, "loss": 14.0519, "step": 56500 }, { "epoch": 9.056, "grad_norm": 0.18008406460285187, "learning_rate": 0.00029865405016200647, "loss": 13.1833, "step": 56600 }, { "epoch": 9.072, "grad_norm": 0.1602972447872162, "learning_rate": 0.00029865165006600264, "loss": 13.2838, "step": 56700 }, { "epoch": 9.088, "grad_norm": 0.17582525312900543, "learning_rate": 0.0002986492499699988, "loss": 13.898, "step": 56800 }, { "epoch": 9.104, "grad_norm": 0.15762995183467865, "learning_rate": 0.0002986468498739949, "loss": 13.5733, "step": 56900 }, { "epoch": 9.12, "grad_norm": 0.1670118272304535, "learning_rate": 0.0002986444497779911, "loss": 13.5845, "step": 57000 }, { "epoch": 9.136, "grad_norm": 0.18542303144931793, "learning_rate": 0.00029864204968198726, "loss": 13.9615, "step": 57100 }, { "epoch": 9.152, "grad_norm": 0.18144281208515167, "learning_rate": 0.00029863964958598343, "loss": 13.0945, "step": 57200 }, { "epoch": 9.168, "grad_norm": 0.18359419703483582, "learning_rate": 0.0002986372494899796, "loss": 13.4529, "step": 57300 }, { "epoch": 9.184, "grad_norm": 0.2034582495689392, "learning_rate": 0.0002986348493939757, "loss": 13.2086, "step": 57400 }, { "epoch": 9.2, "grad_norm": 0.1561286300420761, "learning_rate": 0.0002986324492979719, "loss": 13.5699, "step": 57500 }, { "epoch": 9.216, "grad_norm": 0.2128494530916214, "learning_rate": 0.00029863004920196805, "loss": 13.7906, "step": 57600 }, { "epoch": 9.232, "grad_norm": 0.18951255083084106, "learning_rate": 0.0002986276491059642, "loss": 13.4684, "step": 57700 }, { "epoch": 9.248, "grad_norm": 0.14849476516246796, "learning_rate": 0.0002986252490099604, "loss": 13.6832, "step": 57800 }, { "epoch": 9.264, "grad_norm": 0.19169315695762634, "learning_rate": 0.00029862284891395656, "loss": 12.9751, "step": 57900 }, { "epoch": 9.28, "grad_norm": 0.219793900847435, "learning_rate": 0.0002986204488179527, "loss": 13.4069, "step": 58000 }, { "epoch": 9.296, "grad_norm": 0.2139630764722824, "learning_rate": 0.00029861804872194884, "loss": 12.9185, "step": 58100 }, { "epoch": 9.312, "grad_norm": 0.1722664088010788, "learning_rate": 0.000298615648625945, "loss": 13.4876, "step": 58200 }, { "epoch": 9.328, "grad_norm": 0.15841473639011383, "learning_rate": 0.0002986132485299412, "loss": 13.481, "step": 58300 }, { "epoch": 9.344, "grad_norm": 0.17484904825687408, "learning_rate": 0.00029861084843393735, "loss": 13.5925, "step": 58400 }, { "epoch": 9.36, "grad_norm": 0.20388108491897583, "learning_rate": 0.00029860844833793347, "loss": 13.2549, "step": 58500 }, { "epoch": 9.376, "grad_norm": 0.17959387600421906, "learning_rate": 0.00029860604824192964, "loss": 13.571, "step": 58600 }, { "epoch": 9.392, "grad_norm": 0.1830485612154007, "learning_rate": 0.0002986036481459258, "loss": 13.0808, "step": 58700 }, { "epoch": 9.408, "grad_norm": 0.1935325413942337, "learning_rate": 0.000298601248049922, "loss": 12.9193, "step": 58800 }, { "epoch": 9.424, "grad_norm": 0.22928985953330994, "learning_rate": 0.00029859884795391814, "loss": 12.9233, "step": 58900 }, { "epoch": 9.44, "grad_norm": 0.17562927305698395, "learning_rate": 0.0002985964478579143, "loss": 13.0933, "step": 59000 }, { "epoch": 9.456, "grad_norm": 0.21014900505542755, "learning_rate": 0.00029859404776191043, "loss": 12.9421, "step": 59100 }, { "epoch": 9.472, "grad_norm": 0.16698358952999115, "learning_rate": 0.0002985916476659066, "loss": 13.6465, "step": 59200 }, { "epoch": 9.488, "grad_norm": 0.15990376472473145, "learning_rate": 0.00029858924756990277, "loss": 12.9832, "step": 59300 }, { "epoch": 9.504, "grad_norm": 0.21185587346553802, "learning_rate": 0.00029858684747389894, "loss": 13.3695, "step": 59400 }, { "epoch": 9.52, "grad_norm": 0.16105149686336517, "learning_rate": 0.0002985844473778951, "loss": 13.0733, "step": 59500 }, { "epoch": 9.536, "grad_norm": 0.22624213993549347, "learning_rate": 0.0002985820472818912, "loss": 13.2586, "step": 59600 }, { "epoch": 9.552, "grad_norm": 0.1732643097639084, "learning_rate": 0.0002985796471858874, "loss": 12.9246, "step": 59700 }, { "epoch": 9.568, "grad_norm": 0.18406638503074646, "learning_rate": 0.00029857724708988356, "loss": 13.4556, "step": 59800 }, { "epoch": 9.584, "grad_norm": 0.18207241594791412, "learning_rate": 0.0002985748709948398, "loss": 12.8405, "step": 59900 }, { "epoch": 9.6, "grad_norm": 0.14808227121829987, "learning_rate": 0.0002985724708988359, "loss": 13.0075, "step": 60000 }, { "epoch": 9.616, "grad_norm": 0.1976134330034256, "learning_rate": 0.0002985700708028321, "loss": 12.687, "step": 60100 }, { "epoch": 9.632, "grad_norm": 0.1712380349636078, "learning_rate": 0.00029856767070682825, "loss": 13.003, "step": 60200 }, { "epoch": 9.648, "grad_norm": 0.1509382426738739, "learning_rate": 0.0002985652706108244, "loss": 13.0863, "step": 60300 }, { "epoch": 9.664, "grad_norm": 0.1992410570383072, "learning_rate": 0.0002985628705148206, "loss": 13.1396, "step": 60400 }, { "epoch": 9.68, "grad_norm": 0.19914288818836212, "learning_rate": 0.0002985604704188167, "loss": 13.0716, "step": 60500 }, { "epoch": 9.696, "grad_norm": 0.17157557606697083, "learning_rate": 0.0002985580703228129, "loss": 12.5376, "step": 60600 }, { "epoch": 9.712, "grad_norm": 0.14820295572280884, "learning_rate": 0.00029855567022680905, "loss": 12.9209, "step": 60700 }, { "epoch": 9.728, "grad_norm": 0.17262442409992218, "learning_rate": 0.0002985532701308052, "loss": 13.3595, "step": 60800 }, { "epoch": 9.744, "grad_norm": 0.1804870218038559, "learning_rate": 0.0002985508700348014, "loss": 13.0037, "step": 60900 }, { "epoch": 9.76, "grad_norm": 0.1507444977760315, "learning_rate": 0.00029854846993879755, "loss": 12.5568, "step": 61000 }, { "epoch": 9.776, "grad_norm": 0.17809054255485535, "learning_rate": 0.00029854606984279367, "loss": 12.9826, "step": 61100 }, { "epoch": 9.792, "grad_norm": 0.25455987453460693, "learning_rate": 0.00029854366974678984, "loss": 12.5432, "step": 61200 }, { "epoch": 9.808, "grad_norm": 0.15175747871398926, "learning_rate": 0.000298541269650786, "loss": 12.9513, "step": 61300 }, { "epoch": 9.824, "grad_norm": 0.22233819961547852, "learning_rate": 0.0002985388695547822, "loss": 13.2744, "step": 61400 }, { "epoch": 9.84, "grad_norm": 0.1534196138381958, "learning_rate": 0.00029853646945877835, "loss": 12.4878, "step": 61500 }, { "epoch": 9.856, "grad_norm": 0.17612405121326447, "learning_rate": 0.00029853406936277446, "loss": 12.6281, "step": 61600 }, { "epoch": 9.872, "grad_norm": 0.14971201121807098, "learning_rate": 0.00029853166926677063, "loss": 12.4393, "step": 61700 }, { "epoch": 9.888, "grad_norm": 0.15717633068561554, "learning_rate": 0.0002985292691707668, "loss": 12.6903, "step": 61800 }, { "epoch": 9.904, "grad_norm": 0.1695670634508133, "learning_rate": 0.00029852686907476297, "loss": 12.9557, "step": 61900 }, { "epoch": 9.92, "grad_norm": 0.16429013013839722, "learning_rate": 0.00029852446897875914, "loss": 12.9804, "step": 62000 }, { "epoch": 9.936, "grad_norm": 0.1919148713350296, "learning_rate": 0.0002985220688827553, "loss": 12.8735, "step": 62100 }, { "epoch": 9.952, "grad_norm": 0.1977461278438568, "learning_rate": 0.0002985196687867514, "loss": 12.6665, "step": 62200 }, { "epoch": 9.968, "grad_norm": 0.3409396708011627, "learning_rate": 0.0002985172686907476, "loss": 11.9422, "step": 62300 }, { "epoch": 9.984, "grad_norm": 0.1977001428604126, "learning_rate": 0.00029851486859474376, "loss": 13.392, "step": 62400 }, { "epoch": 10.0, "grad_norm": 0.19805894792079926, "learning_rate": 0.00029851246849873993, "loss": 12.3432, "step": 62500 }, { "epoch": 10.016, "grad_norm": 0.1851508915424347, "learning_rate": 0.0002985100684027361, "loss": 12.8953, "step": 62600 }, { "epoch": 10.032, "grad_norm": 0.15137746930122375, "learning_rate": 0.0002985076683067322, "loss": 12.8256, "step": 62700 }, { "epoch": 10.048, "grad_norm": 0.1815025508403778, "learning_rate": 0.00029850529221168846, "loss": 12.2427, "step": 62800 }, { "epoch": 10.064, "grad_norm": 0.282045841217041, "learning_rate": 0.0002985028921156846, "loss": 12.5777, "step": 62900 }, { "epoch": 10.08, "grad_norm": 0.19669105112552643, "learning_rate": 0.0002985004920196808, "loss": 12.85, "step": 63000 }, { "epoch": 10.096, "grad_norm": 0.1557861566543579, "learning_rate": 0.0002984980919236769, "loss": 12.6325, "step": 63100 }, { "epoch": 10.112, "grad_norm": 0.16353458166122437, "learning_rate": 0.0002984956918276731, "loss": 12.5578, "step": 63200 }, { "epoch": 10.128, "grad_norm": 0.19124484062194824, "learning_rate": 0.00029849329173166925, "loss": 12.8784, "step": 63300 }, { "epoch": 10.144, "grad_norm": 0.16097944974899292, "learning_rate": 0.0002984908916356654, "loss": 11.7994, "step": 63400 }, { "epoch": 10.16, "grad_norm": 0.155614972114563, "learning_rate": 0.0002984884915396616, "loss": 11.9617, "step": 63500 }, { "epoch": 10.176, "grad_norm": 0.19013510644435883, "learning_rate": 0.0002984860914436577, "loss": 12.1663, "step": 63600 }, { "epoch": 10.192, "grad_norm": 0.21610714495182037, "learning_rate": 0.00029848369134765387, "loss": 12.2304, "step": 63700 }, { "epoch": 10.208, "grad_norm": 0.15554966032505035, "learning_rate": 0.00029848129125165004, "loss": 11.9337, "step": 63800 }, { "epoch": 10.224, "grad_norm": 0.14373019337654114, "learning_rate": 0.0002984788911556462, "loss": 12.5049, "step": 63900 }, { "epoch": 10.24, "grad_norm": 0.197763592004776, "learning_rate": 0.0002984764910596424, "loss": 12.2087, "step": 64000 }, { "epoch": 10.256, "grad_norm": 0.1522061824798584, "learning_rate": 0.00029847409096363855, "loss": 12.475, "step": 64100 }, { "epoch": 10.272, "grad_norm": 0.15849411487579346, "learning_rate": 0.00029847169086763466, "loss": 12.1301, "step": 64200 }, { "epoch": 10.288, "grad_norm": 0.1680125594139099, "learning_rate": 0.00029846929077163083, "loss": 12.2041, "step": 64300 }, { "epoch": 10.304, "grad_norm": 0.17618972063064575, "learning_rate": 0.000298466890675627, "loss": 12.1634, "step": 64400 }, { "epoch": 10.32, "grad_norm": 0.19345271587371826, "learning_rate": 0.00029846449057962317, "loss": 12.0509, "step": 64500 }, { "epoch": 10.336, "grad_norm": 0.15981802344322205, "learning_rate": 0.00029846209048361934, "loss": 11.879, "step": 64600 }, { "epoch": 10.352, "grad_norm": 0.1640341877937317, "learning_rate": 0.00029845969038761545, "loss": 12.3471, "step": 64700 }, { "epoch": 10.368, "grad_norm": 0.1751720905303955, "learning_rate": 0.0002984572902916116, "loss": 11.7085, "step": 64800 }, { "epoch": 10.384, "grad_norm": 0.15203487873077393, "learning_rate": 0.00029845491419656787, "loss": 11.9901, "step": 64900 }, { "epoch": 10.4, "grad_norm": 0.1836910843849182, "learning_rate": 0.00029845251410056403, "loss": 11.5864, "step": 65000 }, { "epoch": 10.416, "grad_norm": 0.2329769879579544, "learning_rate": 0.00029845011400456015, "loss": 11.8386, "step": 65100 }, { "epoch": 10.432, "grad_norm": 0.25904643535614014, "learning_rate": 0.0002984477139085563, "loss": 11.6842, "step": 65200 }, { "epoch": 10.448, "grad_norm": 0.16373856365680695, "learning_rate": 0.0002984453138125525, "loss": 11.9861, "step": 65300 }, { "epoch": 10.464, "grad_norm": 0.1684304028749466, "learning_rate": 0.00029844291371654866, "loss": 12.1751, "step": 65400 }, { "epoch": 10.48, "grad_norm": 0.1975129395723343, "learning_rate": 0.0002984405136205448, "loss": 11.9744, "step": 65500 }, { "epoch": 10.496, "grad_norm": 0.144730344414711, "learning_rate": 0.00029843811352454094, "loss": 11.7554, "step": 65600 }, { "epoch": 10.512, "grad_norm": 0.21416126191616058, "learning_rate": 0.0002984357134285371, "loss": 11.7885, "step": 65700 }, { "epoch": 10.528, "grad_norm": 0.1401461511850357, "learning_rate": 0.0002984333133325333, "loss": 12.2278, "step": 65800 }, { "epoch": 10.544, "grad_norm": 0.15199688076972961, "learning_rate": 0.00029843091323652945, "loss": 12.0611, "step": 65900 }, { "epoch": 10.56, "grad_norm": 0.16079574823379517, "learning_rate": 0.0002984285131405256, "loss": 11.3473, "step": 66000 }, { "epoch": 10.576, "grad_norm": 0.14441320300102234, "learning_rate": 0.0002984261130445218, "loss": 11.5284, "step": 66100 }, { "epoch": 10.592, "grad_norm": 0.1676328480243683, "learning_rate": 0.0002984237129485179, "loss": 11.6487, "step": 66200 }, { "epoch": 10.608, "grad_norm": 0.13956011831760406, "learning_rate": 0.00029842131285251407, "loss": 11.772, "step": 66300 }, { "epoch": 10.624, "grad_norm": 0.17723798751831055, "learning_rate": 0.00029841891275651024, "loss": 11.7424, "step": 66400 }, { "epoch": 10.64, "grad_norm": 0.18211066722869873, "learning_rate": 0.0002984165126605064, "loss": 11.9263, "step": 66500 }, { "epoch": 10.656, "grad_norm": 0.18465609848499298, "learning_rate": 0.0002984141125645026, "loss": 12.1533, "step": 66600 }, { "epoch": 10.672, "grad_norm": 0.15032535791397095, "learning_rate": 0.0002984117124684987, "loss": 11.8711, "step": 66700 }, { "epoch": 10.688, "grad_norm": 0.25048136711120605, "learning_rate": 0.00029840931237249486, "loss": 12.1925, "step": 66800 }, { "epoch": 10.704, "grad_norm": 0.17632503807544708, "learning_rate": 0.00029840691227649103, "loss": 12.0652, "step": 66900 }, { "epoch": 10.72, "grad_norm": 0.17492571473121643, "learning_rate": 0.0002984045121804872, "loss": 12.3961, "step": 67000 }, { "epoch": 10.736, "grad_norm": 0.17848367989063263, "learning_rate": 0.00029840211208448337, "loss": 12.0021, "step": 67100 }, { "epoch": 10.752, "grad_norm": 0.23175941407680511, "learning_rate": 0.00029839971198847954, "loss": 11.4583, "step": 67200 }, { "epoch": 10.768, "grad_norm": 0.24281519651412964, "learning_rate": 0.0002983973358934357, "loss": 12.0376, "step": 67300 }, { "epoch": 10.784, "grad_norm": 0.18129272758960724, "learning_rate": 0.00029839493579743184, "loss": 12.1892, "step": 67400 }, { "epoch": 10.8, "grad_norm": 0.1454136222600937, "learning_rate": 0.000298392535701428, "loss": 11.9333, "step": 67500 }, { "epoch": 10.816, "grad_norm": 0.12412439286708832, "learning_rate": 0.0002983901356054242, "loss": 11.0441, "step": 67600 }, { "epoch": 10.832, "grad_norm": 0.19814914464950562, "learning_rate": 0.00029838773550942035, "loss": 11.4348, "step": 67700 }, { "epoch": 10.848, "grad_norm": 0.2250308245420456, "learning_rate": 0.0002983853354134165, "loss": 11.723, "step": 67800 }, { "epoch": 10.864, "grad_norm": 0.1328551471233368, "learning_rate": 0.0002983829353174127, "loss": 11.4324, "step": 67900 }, { "epoch": 10.88, "grad_norm": 0.2366170883178711, "learning_rate": 0.00029838053522140886, "loss": 12.1462, "step": 68000 }, { "epoch": 10.896, "grad_norm": 0.20911742746829987, "learning_rate": 0.00029837813512540503, "loss": 11.6067, "step": 68100 }, { "epoch": 10.912, "grad_norm": 0.1770290583372116, "learning_rate": 0.00029837573502940114, "loss": 11.9299, "step": 68200 }, { "epoch": 10.928, "grad_norm": 0.21429571509361267, "learning_rate": 0.0002983733349333973, "loss": 11.3683, "step": 68300 }, { "epoch": 10.943999999999999, "grad_norm": 0.1542270928621292, "learning_rate": 0.0002983709348373935, "loss": 11.3472, "step": 68400 }, { "epoch": 10.96, "grad_norm": 0.2420985847711563, "learning_rate": 0.00029836853474138965, "loss": 11.5805, "step": 68500 }, { "epoch": 10.975999999999999, "grad_norm": 0.17665143311023712, "learning_rate": 0.0002983661346453858, "loss": 11.7406, "step": 68600 }, { "epoch": 10.992, "grad_norm": 0.26210835576057434, "learning_rate": 0.00029836373454938193, "loss": 11.7457, "step": 68700 }, { "epoch": 11.008, "grad_norm": 0.14472606778144836, "learning_rate": 0.0002983613344533781, "loss": 11.4662, "step": 68800 }, { "epoch": 11.024, "grad_norm": 0.17449091374874115, "learning_rate": 0.0002983589343573743, "loss": 11.0297, "step": 68900 }, { "epoch": 11.04, "grad_norm": 0.15488724410533905, "learning_rate": 0.00029835653426137044, "loss": 11.792, "step": 69000 }, { "epoch": 11.056, "grad_norm": 0.1447325348854065, "learning_rate": 0.0002983541341653666, "loss": 11.4483, "step": 69100 }, { "epoch": 11.072, "grad_norm": 0.17111489176750183, "learning_rate": 0.0002983517340693628, "loss": 11.1499, "step": 69200 }, { "epoch": 11.088, "grad_norm": 0.17446951568126678, "learning_rate": 0.0002983493339733589, "loss": 10.6961, "step": 69300 }, { "epoch": 11.104, "grad_norm": 0.1421278566122055, "learning_rate": 0.00029834693387735506, "loss": 11.4794, "step": 69400 }, { "epoch": 11.12, "grad_norm": 0.17439322173595428, "learning_rate": 0.00029834455778231125, "loss": 11.0965, "step": 69500 }, { "epoch": 11.136, "grad_norm": 0.16200323402881622, "learning_rate": 0.0002983421576863074, "loss": 11.1367, "step": 69600 }, { "epoch": 11.152, "grad_norm": 0.3391527831554413, "learning_rate": 0.0002983397575903036, "loss": 10.7709, "step": 69700 }, { "epoch": 11.168, "grad_norm": 0.18793489038944244, "learning_rate": 0.0002983373574942997, "loss": 11.1479, "step": 69800 }, { "epoch": 11.184, "grad_norm": 0.1996636688709259, "learning_rate": 0.0002983349573982959, "loss": 11.8347, "step": 69900 }, { "epoch": 11.2, "grad_norm": 0.166090190410614, "learning_rate": 0.00029833255730229205, "loss": 10.9514, "step": 70000 }, { "epoch": 11.216, "grad_norm": 0.17243006825447083, "learning_rate": 0.0002983301572062882, "loss": 11.2505, "step": 70100 }, { "epoch": 11.232, "grad_norm": 0.17860250174999237, "learning_rate": 0.0002983277571102844, "loss": 11.023, "step": 70200 }, { "epoch": 11.248, "grad_norm": 0.13896320760250092, "learning_rate": 0.00029832535701428055, "loss": 11.092, "step": 70300 }, { "epoch": 11.264, "grad_norm": 0.20008546113967896, "learning_rate": 0.00029832295691827667, "loss": 11.2161, "step": 70400 }, { "epoch": 11.28, "grad_norm": 0.14014984667301178, "learning_rate": 0.00029832055682227284, "loss": 11.315, "step": 70500 }, { "epoch": 11.296, "grad_norm": 0.16158168017864227, "learning_rate": 0.000298318156726269, "loss": 11.3935, "step": 70600 }, { "epoch": 11.312, "grad_norm": 0.15444719791412354, "learning_rate": 0.0002983157566302652, "loss": 10.9662, "step": 70700 }, { "epoch": 11.328, "grad_norm": 0.21788270771503448, "learning_rate": 0.00029831335653426134, "loss": 11.4848, "step": 70800 }, { "epoch": 11.344, "grad_norm": 0.17685194313526154, "learning_rate": 0.0002983109564382575, "loss": 11.3436, "step": 70900 }, { "epoch": 11.36, "grad_norm": 0.15553423762321472, "learning_rate": 0.0002983085563422537, "loss": 11.1136, "step": 71000 }, { "epoch": 11.376, "grad_norm": 0.1547129899263382, "learning_rate": 0.00029830615624624985, "loss": 10.7924, "step": 71100 }, { "epoch": 11.392, "grad_norm": 0.1907842457294464, "learning_rate": 0.000298303756150246, "loss": 10.9726, "step": 71200 }, { "epoch": 11.408, "grad_norm": 0.15053051710128784, "learning_rate": 0.00029830135605424214, "loss": 12.0626, "step": 71300 }, { "epoch": 11.424, "grad_norm": 0.14403216540813446, "learning_rate": 0.0002982989559582383, "loss": 11.428, "step": 71400 }, { "epoch": 11.44, "grad_norm": 0.15850169956684113, "learning_rate": 0.0002982965558622345, "loss": 11.1033, "step": 71500 }, { "epoch": 11.456, "grad_norm": 0.18223829567432404, "learning_rate": 0.00029829417976719066, "loss": 11.5088, "step": 71600 }, { "epoch": 11.472, "grad_norm": 0.18121246993541718, "learning_rate": 0.00029829177967118683, "loss": 11.0869, "step": 71700 }, { "epoch": 11.488, "grad_norm": 0.1591707020998001, "learning_rate": 0.00029828937957518295, "loss": 10.5898, "step": 71800 }, { "epoch": 11.504, "grad_norm": 0.1652923971414566, "learning_rate": 0.0002982869794791791, "loss": 11.3647, "step": 71900 }, { "epoch": 11.52, "grad_norm": 0.1930815577507019, "learning_rate": 0.0002982845793831753, "loss": 11.4873, "step": 72000 }, { "epoch": 11.536, "grad_norm": 0.1646055281162262, "learning_rate": 0.00029828217928717145, "loss": 11.3799, "step": 72100 }, { "epoch": 11.552, "grad_norm": 0.19326475262641907, "learning_rate": 0.0002982797791911676, "loss": 10.8387, "step": 72200 }, { "epoch": 11.568, "grad_norm": 0.23909342288970947, "learning_rate": 0.0002982773790951638, "loss": 10.757, "step": 72300 }, { "epoch": 11.584, "grad_norm": 0.1616702377796173, "learning_rate": 0.0002982749789991599, "loss": 10.7907, "step": 72400 }, { "epoch": 11.6, "grad_norm": 0.16581912338733673, "learning_rate": 0.0002982725789031561, "loss": 10.8977, "step": 72500 }, { "epoch": 11.616, "grad_norm": 0.1478215605020523, "learning_rate": 0.00029827017880715225, "loss": 10.9325, "step": 72600 }, { "epoch": 11.632, "grad_norm": 0.2693212628364563, "learning_rate": 0.0002982677787111484, "loss": 11.2731, "step": 72700 }, { "epoch": 11.648, "grad_norm": 0.15163065493106842, "learning_rate": 0.0002982653786151446, "loss": 11.0141, "step": 72800 }, { "epoch": 11.664, "grad_norm": 0.15364685654640198, "learning_rate": 0.00029826297851914075, "loss": 10.6781, "step": 72900 }, { "epoch": 11.68, "grad_norm": 0.1410771906375885, "learning_rate": 0.00029826057842313687, "loss": 11.0262, "step": 73000 }, { "epoch": 11.696, "grad_norm": 0.2245720773935318, "learning_rate": 0.00029825817832713304, "loss": 11.51, "step": 73100 }, { "epoch": 11.712, "grad_norm": 0.17434003949165344, "learning_rate": 0.0002982557782311292, "loss": 10.7819, "step": 73200 }, { "epoch": 11.728, "grad_norm": 0.13878166675567627, "learning_rate": 0.0002982534021360854, "loss": 10.8833, "step": 73300 }, { "epoch": 11.744, "grad_norm": 0.13650259375572205, "learning_rate": 0.00029825100204008157, "loss": 11.0158, "step": 73400 }, { "epoch": 11.76, "grad_norm": 0.22818398475646973, "learning_rate": 0.00029824860194407773, "loss": 10.8819, "step": 73500 }, { "epoch": 11.776, "grad_norm": 0.14601178467273712, "learning_rate": 0.0002982462018480739, "loss": 10.0593, "step": 73600 }, { "epoch": 11.792, "grad_norm": 0.2245131880044937, "learning_rate": 0.00029824380175207007, "loss": 10.6634, "step": 73700 }, { "epoch": 11.808, "grad_norm": 1.000320553779602, "learning_rate": 0.0002982414016560662, "loss": 10.961, "step": 73800 }, { "epoch": 11.824, "grad_norm": 0.18026384711265564, "learning_rate": 0.00029823900156006236, "loss": 11.1536, "step": 73900 }, { "epoch": 11.84, "grad_norm": 0.15758727490901947, "learning_rate": 0.0002982366014640585, "loss": 10.6586, "step": 74000 }, { "epoch": 11.856, "grad_norm": 0.19163353741168976, "learning_rate": 0.0002982342013680547, "loss": 11.0334, "step": 74100 }, { "epoch": 11.872, "grad_norm": 0.11467296630144119, "learning_rate": 0.00029823180127205086, "loss": 10.8224, "step": 74200 }, { "epoch": 11.888, "grad_norm": 0.15869416296482086, "learning_rate": 0.00029822940117604703, "loss": 10.4906, "step": 74300 }, { "epoch": 11.904, "grad_norm": 0.1966274380683899, "learning_rate": 0.00029822700108004315, "loss": 10.4152, "step": 74400 }, { "epoch": 11.92, "grad_norm": 0.16446225345134735, "learning_rate": 0.0002982246009840393, "loss": 10.4887, "step": 74500 }, { "epoch": 11.936, "grad_norm": 0.16940893232822418, "learning_rate": 0.0002982222008880355, "loss": 10.39, "step": 74600 }, { "epoch": 11.952, "grad_norm": 0.1838199496269226, "learning_rate": 0.00029821980079203166, "loss": 10.384, "step": 74700 }, { "epoch": 11.968, "grad_norm": 0.17523860931396484, "learning_rate": 0.0002982174006960278, "loss": 10.8568, "step": 74800 }, { "epoch": 11.984, "grad_norm": 0.1432792991399765, "learning_rate": 0.000298215000600024, "loss": 10.3596, "step": 74900 }, { "epoch": 12.0, "grad_norm": 0.20020250976085663, "learning_rate": 0.0002982126005040201, "loss": 10.14, "step": 75000 }, { "epoch": 12.016, "grad_norm": 0.19777518510818481, "learning_rate": 0.0002982102004080163, "loss": 10.9224, "step": 75100 }, { "epoch": 12.032, "grad_norm": 0.17126210033893585, "learning_rate": 0.00029820780031201245, "loss": 10.5306, "step": 75200 }, { "epoch": 12.048, "grad_norm": 0.16797253489494324, "learning_rate": 0.0002982054002160086, "loss": 10.8089, "step": 75300 }, { "epoch": 12.064, "grad_norm": 0.20862014591693878, "learning_rate": 0.0002982030001200048, "loss": 10.4757, "step": 75400 }, { "epoch": 12.08, "grad_norm": 0.18397895991802216, "learning_rate": 0.0002982006000240009, "loss": 9.9135, "step": 75500 }, { "epoch": 12.096, "grad_norm": 0.16641663014888763, "learning_rate": 0.00029819819992799707, "loss": 10.6077, "step": 75600 }, { "epoch": 12.112, "grad_norm": 0.16870319843292236, "learning_rate": 0.00029819579983199324, "loss": 10.5788, "step": 75700 }, { "epoch": 12.128, "grad_norm": 0.16674315929412842, "learning_rate": 0.0002981933997359894, "loss": 10.7791, "step": 75800 }, { "epoch": 12.144, "grad_norm": 0.1637590378522873, "learning_rate": 0.0002981909996399856, "loss": 10.0084, "step": 75900 }, { "epoch": 12.16, "grad_norm": 0.16165070235729218, "learning_rate": 0.00029818859954398175, "loss": 10.7957, "step": 76000 }, { "epoch": 12.176, "grad_norm": 0.1414174884557724, "learning_rate": 0.00029818619944797786, "loss": 9.8668, "step": 76100 }, { "epoch": 12.192, "grad_norm": 0.1490393877029419, "learning_rate": 0.00029818379935197403, "loss": 10.5844, "step": 76200 }, { "epoch": 12.208, "grad_norm": 0.15608841180801392, "learning_rate": 0.0002981813992559702, "loss": 10.7121, "step": 76300 }, { "epoch": 12.224, "grad_norm": 0.1658240258693695, "learning_rate": 0.00029817899915996637, "loss": 10.4018, "step": 76400 }, { "epoch": 12.24, "grad_norm": 0.1533997803926468, "learning_rate": 0.00029817659906396254, "loss": 10.0445, "step": 76500 }, { "epoch": 12.256, "grad_norm": 0.14606164395809174, "learning_rate": 0.00029817419896795865, "loss": 10.8624, "step": 76600 }, { "epoch": 12.272, "grad_norm": 0.1926526576280594, "learning_rate": 0.0002981717988719548, "loss": 9.9639, "step": 76700 }, { "epoch": 12.288, "grad_norm": 0.16846922039985657, "learning_rate": 0.000298169398775951, "loss": 10.4076, "step": 76800 }, { "epoch": 12.304, "grad_norm": 0.1497686505317688, "learning_rate": 0.00029816699867994716, "loss": 10.3741, "step": 76900 }, { "epoch": 12.32, "grad_norm": 0.17146418988704681, "learning_rate": 0.00029816459858394333, "loss": 10.6163, "step": 77000 }, { "epoch": 12.336, "grad_norm": 0.169904425740242, "learning_rate": 0.0002981621984879395, "loss": 10.0631, "step": 77100 }, { "epoch": 12.352, "grad_norm": 0.15850874781608582, "learning_rate": 0.00029815979839193567, "loss": 10.0799, "step": 77200 }, { "epoch": 12.368, "grad_norm": 0.15920597314834595, "learning_rate": 0.00029815739829593184, "loss": 9.6119, "step": 77300 }, { "epoch": 12.384, "grad_norm": 0.2246374636888504, "learning_rate": 0.000298154998199928, "loss": 10.3029, "step": 77400 }, { "epoch": 12.4, "grad_norm": 0.168796569108963, "learning_rate": 0.0002981525981039241, "loss": 10.3374, "step": 77500 }, { "epoch": 12.416, "grad_norm": 0.1864066869020462, "learning_rate": 0.0002981501980079203, "loss": 10.0087, "step": 77600 }, { "epoch": 12.432, "grad_norm": 0.14401012659072876, "learning_rate": 0.0002981478219128765, "loss": 10.1803, "step": 77700 }, { "epoch": 12.448, "grad_norm": 0.1375201791524887, "learning_rate": 0.00029814542181687265, "loss": 9.911, "step": 77800 }, { "epoch": 12.464, "grad_norm": 0.1398741900920868, "learning_rate": 0.0002981430217208688, "loss": 10.261, "step": 77900 }, { "epoch": 12.48, "grad_norm": 0.15873165428638458, "learning_rate": 0.000298140621624865, "loss": 10.7101, "step": 78000 }, { "epoch": 12.496, "grad_norm": 0.1714644730091095, "learning_rate": 0.0002981382215288611, "loss": 10.1714, "step": 78100 }, { "epoch": 12.512, "grad_norm": 0.1591562181711197, "learning_rate": 0.00029813582143285727, "loss": 10.1645, "step": 78200 }, { "epoch": 12.528, "grad_norm": 0.18264716863632202, "learning_rate": 0.00029813342133685344, "loss": 10.3564, "step": 78300 }, { "epoch": 12.544, "grad_norm": 0.1514509618282318, "learning_rate": 0.0002981310212408496, "loss": 10.0476, "step": 78400 }, { "epoch": 12.56, "grad_norm": 0.19021818041801453, "learning_rate": 0.0002981286211448458, "loss": 10.2492, "step": 78500 }, { "epoch": 12.576, "grad_norm": 0.21221980452537537, "learning_rate": 0.0002981262210488419, "loss": 9.7379, "step": 78600 }, { "epoch": 12.592, "grad_norm": 0.16575005650520325, "learning_rate": 0.00029812382095283806, "loss": 10.237, "step": 78700 }, { "epoch": 12.608, "grad_norm": 0.12602052092552185, "learning_rate": 0.00029812142085683423, "loss": 10.0729, "step": 78800 }, { "epoch": 12.624, "grad_norm": 0.23105710744857788, "learning_rate": 0.0002981190207608304, "loss": 9.8609, "step": 78900 }, { "epoch": 12.64, "grad_norm": 0.29600638151168823, "learning_rate": 0.00029811662066482657, "loss": 9.8653, "step": 79000 }, { "epoch": 12.656, "grad_norm": 0.19172607362270355, "learning_rate": 0.00029811422056882274, "loss": 9.8614, "step": 79100 }, { "epoch": 12.672, "grad_norm": 0.1930418759584427, "learning_rate": 0.00029811182047281886, "loss": 10.0208, "step": 79200 }, { "epoch": 12.688, "grad_norm": 0.12393278628587723, "learning_rate": 0.000298109420376815, "loss": 10.349, "step": 79300 }, { "epoch": 12.704, "grad_norm": 0.1565830409526825, "learning_rate": 0.0002981070202808112, "loss": 10.5402, "step": 79400 }, { "epoch": 12.72, "grad_norm": 0.13968247175216675, "learning_rate": 0.00029810462018480736, "loss": 9.9296, "step": 79500 }, { "epoch": 12.736, "grad_norm": 0.17765802145004272, "learning_rate": 0.00029810222008880353, "loss": 9.8002, "step": 79600 }, { "epoch": 12.752, "grad_norm": 0.23838719725608826, "learning_rate": 0.00029809981999279965, "loss": 9.8636, "step": 79700 }, { "epoch": 12.768, "grad_norm": 0.23086270689964294, "learning_rate": 0.0002980974438977559, "loss": 9.9585, "step": 79800 }, { "epoch": 12.784, "grad_norm": 0.14923255145549774, "learning_rate": 0.00029809504380175206, "loss": 9.5379, "step": 79900 }, { "epoch": 12.8, "grad_norm": 0.1599462628364563, "learning_rate": 0.00029809264370574823, "loss": 9.641, "step": 80000 }, { "epoch": 12.816, "grad_norm": 0.1716078370809555, "learning_rate": 0.00029809024360974434, "loss": 9.8697, "step": 80100 }, { "epoch": 12.832, "grad_norm": 0.19052661955356598, "learning_rate": 0.0002980878435137405, "loss": 9.6785, "step": 80200 }, { "epoch": 12.848, "grad_norm": 0.15575654804706573, "learning_rate": 0.0002980854434177367, "loss": 9.9394, "step": 80300 }, { "epoch": 12.864, "grad_norm": 0.19439518451690674, "learning_rate": 0.00029808304332173285, "loss": 9.5522, "step": 80400 }, { "epoch": 12.88, "grad_norm": 0.17798827588558197, "learning_rate": 0.000298080643225729, "loss": 9.9453, "step": 80500 }, { "epoch": 12.896, "grad_norm": 0.16586044430732727, "learning_rate": 0.00029807824312972513, "loss": 9.8505, "step": 80600 }, { "epoch": 12.912, "grad_norm": 0.15794214606285095, "learning_rate": 0.0002980758430337213, "loss": 10.0497, "step": 80700 }, { "epoch": 12.928, "grad_norm": 0.1685098111629486, "learning_rate": 0.0002980734429377175, "loss": 10.2658, "step": 80800 }, { "epoch": 12.943999999999999, "grad_norm": 0.16599301993846893, "learning_rate": 0.00029807104284171364, "loss": 9.837, "step": 80900 }, { "epoch": 12.96, "grad_norm": 0.14692434668540955, "learning_rate": 0.0002980686427457098, "loss": 10.1817, "step": 81000 }, { "epoch": 12.975999999999999, "grad_norm": 0.15374502539634705, "learning_rate": 0.000298066242649706, "loss": 10.1231, "step": 81100 }, { "epoch": 12.992, "grad_norm": 0.1369294375181198, "learning_rate": 0.0002980638425537021, "loss": 9.8245, "step": 81200 }, { "epoch": 13.008, "grad_norm": 0.20259645581245422, "learning_rate": 0.00029806144245769826, "loss": 9.7027, "step": 81300 }, { "epoch": 13.024, "grad_norm": 0.1258879452943802, "learning_rate": 0.00029805904236169443, "loss": 9.8863, "step": 81400 }, { "epoch": 13.04, "grad_norm": 0.14773085713386536, "learning_rate": 0.0002980566422656906, "loss": 9.4255, "step": 81500 }, { "epoch": 13.056, "grad_norm": 0.17212265729904175, "learning_rate": 0.00029805424216968677, "loss": 10.0506, "step": 81600 }, { "epoch": 13.072, "grad_norm": 0.179426372051239, "learning_rate": 0.0002980518420736829, "loss": 9.5137, "step": 81700 }, { "epoch": 13.088, "grad_norm": 0.15935377776622772, "learning_rate": 0.00029804944197767906, "loss": 9.3141, "step": 81800 }, { "epoch": 13.104, "grad_norm": 0.17460429668426514, "learning_rate": 0.0002980470418816752, "loss": 9.8005, "step": 81900 }, { "epoch": 13.12, "grad_norm": 0.20005491375923157, "learning_rate": 0.0002980446417856714, "loss": 9.7239, "step": 82000 }, { "epoch": 13.136, "grad_norm": 0.15051016211509705, "learning_rate": 0.00029804224168966756, "loss": 10.214, "step": 82100 }, { "epoch": 13.152, "grad_norm": 0.16659046709537506, "learning_rate": 0.00029803984159366373, "loss": 9.4695, "step": 82200 }, { "epoch": 13.168, "grad_norm": 0.16346730291843414, "learning_rate": 0.00029803744149765985, "loss": 9.5839, "step": 82300 }, { "epoch": 13.184, "grad_norm": 0.16145597398281097, "learning_rate": 0.000298035041401656, "loss": 9.2663, "step": 82400 }, { "epoch": 13.2, "grad_norm": 0.13834603130817413, "learning_rate": 0.00029803266530661226, "loss": 9.6926, "step": 82500 }, { "epoch": 13.216, "grad_norm": 0.17841538786888123, "learning_rate": 0.0002980302652106084, "loss": 9.4752, "step": 82600 }, { "epoch": 13.232, "grad_norm": 0.14639347791671753, "learning_rate": 0.00029802786511460454, "loss": 9.9606, "step": 82700 }, { "epoch": 13.248, "grad_norm": 0.15291540324687958, "learning_rate": 0.0002980254650186007, "loss": 9.9284, "step": 82800 }, { "epoch": 13.264, "grad_norm": 0.15908333659172058, "learning_rate": 0.0002980230649225969, "loss": 9.5464, "step": 82900 }, { "epoch": 13.28, "grad_norm": 0.16768860816955566, "learning_rate": 0.00029802066482659305, "loss": 10.2164, "step": 83000 }, { "epoch": 13.296, "grad_norm": 0.18221326172351837, "learning_rate": 0.0002980182647305892, "loss": 9.6566, "step": 83100 }, { "epoch": 13.312, "grad_norm": 0.13944192230701447, "learning_rate": 0.00029801586463458534, "loss": 9.4149, "step": 83200 }, { "epoch": 13.328, "grad_norm": 0.20090098679065704, "learning_rate": 0.0002980134645385815, "loss": 9.1968, "step": 83300 }, { "epoch": 13.344, "grad_norm": 0.17636704444885254, "learning_rate": 0.0002980110644425777, "loss": 9.4497, "step": 83400 }, { "epoch": 13.36, "grad_norm": 0.19672048091888428, "learning_rate": 0.00029800866434657384, "loss": 9.3083, "step": 83500 }, { "epoch": 13.376, "grad_norm": 0.1991618573665619, "learning_rate": 0.00029800626425057, "loss": 9.59, "step": 83600 }, { "epoch": 13.392, "grad_norm": 0.17260773479938507, "learning_rate": 0.00029800386415456613, "loss": 9.9553, "step": 83700 }, { "epoch": 13.408, "grad_norm": 0.13101576268672943, "learning_rate": 0.0002980014640585623, "loss": 10.0732, "step": 83800 }, { "epoch": 13.424, "grad_norm": 0.16349157691001892, "learning_rate": 0.00029799906396255847, "loss": 9.8363, "step": 83900 }, { "epoch": 13.44, "grad_norm": 0.1792200654745102, "learning_rate": 0.00029799666386655464, "loss": 9.9456, "step": 84000 }, { "epoch": 13.456, "grad_norm": 0.13476693630218506, "learning_rate": 0.0002979942637705508, "loss": 9.4642, "step": 84100 }, { "epoch": 13.472, "grad_norm": 0.17343075573444366, "learning_rate": 0.000297991863674547, "loss": 9.4041, "step": 84200 }, { "epoch": 13.488, "grad_norm": 0.16127794981002808, "learning_rate": 0.0002979894635785431, "loss": 9.2465, "step": 84300 }, { "epoch": 13.504, "grad_norm": 0.14993996918201447, "learning_rate": 0.00029798706348253926, "loss": 9.5946, "step": 84400 }, { "epoch": 13.52, "grad_norm": 0.21931160986423492, "learning_rate": 0.0002979846873874955, "loss": 9.5796, "step": 84500 }, { "epoch": 13.536, "grad_norm": 0.15303994715213776, "learning_rate": 0.0002979822872914916, "loss": 9.4222, "step": 84600 }, { "epoch": 13.552, "grad_norm": 0.1905248612165451, "learning_rate": 0.0002979798871954878, "loss": 9.4192, "step": 84700 }, { "epoch": 13.568, "grad_norm": 0.17656217515468597, "learning_rate": 0.00029797748709948395, "loss": 9.685, "step": 84800 }, { "epoch": 13.584, "grad_norm": 0.31464865803718567, "learning_rate": 0.0002979750870034801, "loss": 9.4839, "step": 84900 }, { "epoch": 13.6, "grad_norm": 0.20140250027179718, "learning_rate": 0.0002979726869074763, "loss": 9.4393, "step": 85000 }, { "epoch": 13.616, "grad_norm": 0.1453031599521637, "learning_rate": 0.00029797028681147246, "loss": 9.4777, "step": 85100 }, { "epoch": 13.632, "grad_norm": 0.15559718012809753, "learning_rate": 0.0002979678867154686, "loss": 9.7772, "step": 85200 }, { "epoch": 13.648, "grad_norm": 0.16849826276302338, "learning_rate": 0.00029796548661946475, "loss": 9.0954, "step": 85300 }, { "epoch": 13.664, "grad_norm": 0.15798023343086243, "learning_rate": 0.0002979630865234609, "loss": 9.7756, "step": 85400 }, { "epoch": 13.68, "grad_norm": 0.0940115824341774, "learning_rate": 0.0002979606864274571, "loss": 9.9294, "step": 85500 }, { "epoch": 13.696, "grad_norm": 0.18608032166957855, "learning_rate": 0.00029795828633145325, "loss": 9.4524, "step": 85600 }, { "epoch": 13.712, "grad_norm": 0.16172797977924347, "learning_rate": 0.00029795588623544937, "loss": 9.6146, "step": 85700 }, { "epoch": 13.728, "grad_norm": 0.1493913233280182, "learning_rate": 0.00029795348613944554, "loss": 8.8783, "step": 85800 }, { "epoch": 13.744, "grad_norm": 0.1365765631198883, "learning_rate": 0.0002979510860434417, "loss": 9.4707, "step": 85900 }, { "epoch": 13.76, "grad_norm": 0.17814397811889648, "learning_rate": 0.0002979486859474379, "loss": 9.4121, "step": 86000 }, { "epoch": 13.776, "grad_norm": 0.16484831273555756, "learning_rate": 0.00029794628585143405, "loss": 9.0902, "step": 86100 }, { "epoch": 13.792, "grad_norm": 0.1435382217168808, "learning_rate": 0.0002979438857554302, "loss": 9.4565, "step": 86200 }, { "epoch": 13.808, "grad_norm": 0.1451929211616516, "learning_rate": 0.00029794148565942633, "loss": 9.6377, "step": 86300 }, { "epoch": 13.824, "grad_norm": 0.1439056396484375, "learning_rate": 0.0002979390855634225, "loss": 9.2624, "step": 86400 }, { "epoch": 13.84, "grad_norm": 0.1712324023246765, "learning_rate": 0.00029793668546741867, "loss": 9.2021, "step": 86500 }, { "epoch": 13.856, "grad_norm": 0.15382009744644165, "learning_rate": 0.00029793428537141484, "loss": 8.8688, "step": 86600 }, { "epoch": 13.872, "grad_norm": 0.14327426254749298, "learning_rate": 0.000297931885275411, "loss": 9.2336, "step": 86700 }, { "epoch": 13.888, "grad_norm": 0.21682417392730713, "learning_rate": 0.0002979294851794071, "loss": 8.9508, "step": 86800 }, { "epoch": 13.904, "grad_norm": 0.18012550473213196, "learning_rate": 0.0002979270850834033, "loss": 8.8259, "step": 86900 }, { "epoch": 13.92, "grad_norm": 0.19224317371845245, "learning_rate": 0.00029792468498739946, "loss": 9.0594, "step": 87000 }, { "epoch": 13.936, "grad_norm": 0.14684438705444336, "learning_rate": 0.00029792228489139563, "loss": 8.6664, "step": 87100 }, { "epoch": 13.952, "grad_norm": 0.15808767080307007, "learning_rate": 0.0002979198847953918, "loss": 8.8133, "step": 87200 }, { "epoch": 13.968, "grad_norm": 0.1466471403837204, "learning_rate": 0.00029791748469938797, "loss": 9.2512, "step": 87300 }, { "epoch": 13.984, "grad_norm": 0.13929226994514465, "learning_rate": 0.0002979150846033841, "loss": 9.0263, "step": 87400 }, { "epoch": 14.0, "grad_norm": 0.1410779356956482, "learning_rate": 0.00029791268450738025, "loss": 9.0906, "step": 87500 }, { "epoch": 14.016, "grad_norm": 0.16633394360542297, "learning_rate": 0.0002979102844113764, "loss": 8.8764, "step": 87600 }, { "epoch": 14.032, "grad_norm": 0.19240239262580872, "learning_rate": 0.0002979078843153726, "loss": 8.6873, "step": 87700 }, { "epoch": 14.048, "grad_norm": 0.2285340428352356, "learning_rate": 0.00029790548421936876, "loss": 8.7636, "step": 87800 }, { "epoch": 14.064, "grad_norm": 0.16399361193180084, "learning_rate": 0.0002979030841233649, "loss": 9.3241, "step": 87900 }, { "epoch": 14.08, "grad_norm": 0.14966578781604767, "learning_rate": 0.00029790068402736104, "loss": 9.0301, "step": 88000 }, { "epoch": 14.096, "grad_norm": 0.17241202294826508, "learning_rate": 0.0002978982839313572, "loss": 8.9678, "step": 88100 }, { "epoch": 14.112, "grad_norm": 0.13520659506320953, "learning_rate": 0.0002978958838353534, "loss": 9.0678, "step": 88200 }, { "epoch": 14.128, "grad_norm": 0.15996631979942322, "learning_rate": 0.00029789348373934955, "loss": 8.7807, "step": 88300 }, { "epoch": 14.144, "grad_norm": 0.14483466744422913, "learning_rate": 0.0002978910836433457, "loss": 8.6088, "step": 88400 }, { "epoch": 14.16, "grad_norm": 0.15150679647922516, "learning_rate": 0.00029788868354734183, "loss": 9.2128, "step": 88500 }, { "epoch": 14.176, "grad_norm": 0.1668185293674469, "learning_rate": 0.0002978863074522981, "loss": 9.518, "step": 88600 }, { "epoch": 14.192, "grad_norm": 0.17209367454051971, "learning_rate": 0.00029788393135725427, "loss": 8.5952, "step": 88700 }, { "epoch": 14.208, "grad_norm": 0.15907296538352966, "learning_rate": 0.00029788155526221045, "loss": 8.7632, "step": 88800 }, { "epoch": 14.224, "grad_norm": 0.18298570811748505, "learning_rate": 0.0002978791551662066, "loss": 8.8021, "step": 88900 }, { "epoch": 14.24, "grad_norm": 0.19813942909240723, "learning_rate": 0.0002978767550702028, "loss": 9.1381, "step": 89000 }, { "epoch": 14.256, "grad_norm": 0.1819518506526947, "learning_rate": 0.00029787435497419896, "loss": 9.3086, "step": 89100 }, { "epoch": 14.272, "grad_norm": 0.1506895273923874, "learning_rate": 0.0002978719548781951, "loss": 8.7471, "step": 89200 }, { "epoch": 14.288, "grad_norm": 0.1686287224292755, "learning_rate": 0.00029786955478219125, "loss": 8.8441, "step": 89300 }, { "epoch": 14.304, "grad_norm": 0.1486745923757553, "learning_rate": 0.0002978671546861874, "loss": 9.1216, "step": 89400 }, { "epoch": 14.32, "grad_norm": 0.18762429058551788, "learning_rate": 0.0002978647545901836, "loss": 9.402, "step": 89500 }, { "epoch": 14.336, "grad_norm": 0.13964596390724182, "learning_rate": 0.00029786235449417975, "loss": 9.2773, "step": 89600 }, { "epoch": 14.352, "grad_norm": 0.2629782557487488, "learning_rate": 0.00029785995439817587, "loss": 9.05, "step": 89700 }, { "epoch": 14.368, "grad_norm": 0.12668898701667786, "learning_rate": 0.00029785755430217204, "loss": 8.8949, "step": 89800 }, { "epoch": 14.384, "grad_norm": 0.14362965524196625, "learning_rate": 0.0002978551542061682, "loss": 8.6261, "step": 89900 }, { "epoch": 14.4, "grad_norm": 0.16700971126556396, "learning_rate": 0.0002978527541101644, "loss": 8.8621, "step": 90000 }, { "epoch": 14.416, "grad_norm": 0.1597680300474167, "learning_rate": 0.00029785035401416055, "loss": 9.1614, "step": 90100 }, { "epoch": 14.432, "grad_norm": 0.16268526017665863, "learning_rate": 0.0002978479539181567, "loss": 9.2429, "step": 90200 }, { "epoch": 14.448, "grad_norm": 0.19829140603542328, "learning_rate": 0.00029784555382215283, "loss": 8.6337, "step": 90300 }, { "epoch": 14.464, "grad_norm": 0.1362706571817398, "learning_rate": 0.000297843153726149, "loss": 8.5578, "step": 90400 }, { "epoch": 14.48, "grad_norm": 0.17475652694702148, "learning_rate": 0.00029784075363014517, "loss": 9.3407, "step": 90500 }, { "epoch": 14.496, "grad_norm": 0.139988973736763, "learning_rate": 0.00029783835353414134, "loss": 8.9509, "step": 90600 }, { "epoch": 14.512, "grad_norm": 0.15270425379276276, "learning_rate": 0.0002978359534381375, "loss": 8.6833, "step": 90700 }, { "epoch": 14.528, "grad_norm": 0.12172385305166245, "learning_rate": 0.0002978335533421336, "loss": 8.1913, "step": 90800 }, { "epoch": 14.544, "grad_norm": 0.18453091382980347, "learning_rate": 0.0002978311532461298, "loss": 9.0573, "step": 90900 }, { "epoch": 14.56, "grad_norm": 0.12650534510612488, "learning_rate": 0.00029782875315012596, "loss": 8.8951, "step": 91000 }, { "epoch": 14.576, "grad_norm": 0.19508056342601776, "learning_rate": 0.00029782635305412213, "loss": 8.8831, "step": 91100 }, { "epoch": 14.592, "grad_norm": 0.12826193869113922, "learning_rate": 0.0002978239529581183, "loss": 8.7428, "step": 91200 }, { "epoch": 14.608, "grad_norm": 0.16497032344341278, "learning_rate": 0.00029782155286211447, "loss": 9.226, "step": 91300 }, { "epoch": 14.624, "grad_norm": 0.1467789113521576, "learning_rate": 0.0002978191527661106, "loss": 8.56, "step": 91400 }, { "epoch": 14.64, "grad_norm": 0.13535846769809723, "learning_rate": 0.00029781675267010675, "loss": 9.2005, "step": 91500 }, { "epoch": 14.656, "grad_norm": 0.2261963039636612, "learning_rate": 0.000297814352574103, "loss": 8.9913, "step": 91600 }, { "epoch": 14.672, "grad_norm": 0.16329319775104523, "learning_rate": 0.0002978119524780991, "loss": 8.8455, "step": 91700 }, { "epoch": 14.688, "grad_norm": 0.14644941687583923, "learning_rate": 0.00029780955238209526, "loss": 8.8035, "step": 91800 }, { "epoch": 14.704, "grad_norm": 0.17719560861587524, "learning_rate": 0.00029780715228609143, "loss": 8.9548, "step": 91900 }, { "epoch": 14.72, "grad_norm": 0.17204242944717407, "learning_rate": 0.0002978047521900876, "loss": 8.9065, "step": 92000 }, { "epoch": 14.736, "grad_norm": 0.15323054790496826, "learning_rate": 0.00029780235209408377, "loss": 8.642, "step": 92100 }, { "epoch": 14.752, "grad_norm": 0.12264496088027954, "learning_rate": 0.00029779995199807994, "loss": 8.7372, "step": 92200 }, { "epoch": 14.768, "grad_norm": 0.13607698678970337, "learning_rate": 0.00029779755190207605, "loss": 8.649, "step": 92300 }, { "epoch": 14.784, "grad_norm": 0.1529749035835266, "learning_rate": 0.0002977951518060722, "loss": 8.6928, "step": 92400 }, { "epoch": 14.8, "grad_norm": 0.14829668402671814, "learning_rate": 0.0002977927517100684, "loss": 8.2178, "step": 92500 }, { "epoch": 14.816, "grad_norm": 0.15614420175552368, "learning_rate": 0.00029779035161406456, "loss": 8.4939, "step": 92600 }, { "epoch": 14.832, "grad_norm": 0.18708457052707672, "learning_rate": 0.00029778795151806073, "loss": 8.4044, "step": 92700 }, { "epoch": 14.848, "grad_norm": 0.1700950413942337, "learning_rate": 0.00029778555142205684, "loss": 9.142, "step": 92800 }, { "epoch": 14.864, "grad_norm": 0.17176997661590576, "learning_rate": 0.000297783151326053, "loss": 8.3459, "step": 92900 }, { "epoch": 14.88, "grad_norm": 0.17668530344963074, "learning_rate": 0.0002977807512300492, "loss": 8.4129, "step": 93000 }, { "epoch": 14.896, "grad_norm": 0.13708771765232086, "learning_rate": 0.00029777835113404535, "loss": 8.6625, "step": 93100 }, { "epoch": 14.912, "grad_norm": 0.2073189914226532, "learning_rate": 0.00029777597503900154, "loss": 8.8295, "step": 93200 }, { "epoch": 14.928, "grad_norm": 0.1584160029888153, "learning_rate": 0.0002977735749429977, "loss": 8.2892, "step": 93300 }, { "epoch": 14.943999999999999, "grad_norm": 0.13419002294540405, "learning_rate": 0.0002977711748469938, "loss": 8.6564, "step": 93400 }, { "epoch": 14.96, "grad_norm": 0.12294425070285797, "learning_rate": 0.00029776877475099, "loss": 8.6937, "step": 93500 }, { "epoch": 14.975999999999999, "grad_norm": 0.12022320926189423, "learning_rate": 0.00029776637465498616, "loss": 8.6577, "step": 93600 }, { "epoch": 14.992, "grad_norm": 0.1635560393333435, "learning_rate": 0.00029776397455898233, "loss": 8.4075, "step": 93700 }, { "epoch": 15.008, "grad_norm": 0.12280473113059998, "learning_rate": 0.0002977615744629785, "loss": 8.3065, "step": 93800 }, { "epoch": 15.024, "grad_norm": 0.14091894030570984, "learning_rate": 0.0002977591743669746, "loss": 8.3845, "step": 93900 }, { "epoch": 15.04, "grad_norm": 0.16942408680915833, "learning_rate": 0.0002977567742709708, "loss": 8.2751, "step": 94000 }, { "epoch": 15.056, "grad_norm": 0.1858222782611847, "learning_rate": 0.00029775437417496695, "loss": 8.5152, "step": 94100 }, { "epoch": 15.072, "grad_norm": 0.15426284074783325, "learning_rate": 0.0002977519740789631, "loss": 8.2321, "step": 94200 }, { "epoch": 15.088, "grad_norm": 0.13960111141204834, "learning_rate": 0.0002977495739829593, "loss": 8.4343, "step": 94300 }, { "epoch": 15.104, "grad_norm": 0.1927483230829239, "learning_rate": 0.00029774717388695546, "loss": 8.26, "step": 94400 }, { "epoch": 15.12, "grad_norm": 0.15174433588981628, "learning_rate": 0.0002977447737909516, "loss": 8.665, "step": 94500 }, { "epoch": 15.136, "grad_norm": 0.14686360955238342, "learning_rate": 0.00029774237369494774, "loss": 8.0608, "step": 94600 }, { "epoch": 15.152, "grad_norm": 0.15865716338157654, "learning_rate": 0.00029773997359894397, "loss": 8.4204, "step": 94700 }, { "epoch": 15.168, "grad_norm": 0.14913444221019745, "learning_rate": 0.0002977375735029401, "loss": 8.5544, "step": 94800 }, { "epoch": 15.184, "grad_norm": 0.12727545201778412, "learning_rate": 0.00029773517340693625, "loss": 7.9671, "step": 94900 }, { "epoch": 15.2, "grad_norm": 0.18612131476402283, "learning_rate": 0.0002977327733109324, "loss": 8.5797, "step": 95000 }, { "epoch": 15.216, "grad_norm": 0.1876545250415802, "learning_rate": 0.0002977303732149286, "loss": 8.3126, "step": 95100 }, { "epoch": 15.232, "grad_norm": 0.45961084961891174, "learning_rate": 0.00029772797311892476, "loss": 8.772, "step": 95200 }, { "epoch": 15.248, "grad_norm": 0.16763293743133545, "learning_rate": 0.00029772557302292093, "loss": 8.6089, "step": 95300 }, { "epoch": 15.264, "grad_norm": 0.17058174312114716, "learning_rate": 0.00029772317292691704, "loss": 8.5425, "step": 95400 }, { "epoch": 15.28, "grad_norm": 0.17006829380989075, "learning_rate": 0.0002977207728309132, "loss": 8.8057, "step": 95500 }, { "epoch": 15.296, "grad_norm": 0.09077399969100952, "learning_rate": 0.0002977183727349094, "loss": 8.343, "step": 95600 }, { "epoch": 15.312, "grad_norm": 0.0950964093208313, "learning_rate": 0.00029771599663986557, "loss": 8.3518, "step": 95700 }, { "epoch": 15.328, "grad_norm": 0.14622962474822998, "learning_rate": 0.00029771359654386174, "loss": 8.1654, "step": 95800 }, { "epoch": 15.344, "grad_norm": 0.16222132742404938, "learning_rate": 0.00029771119644785785, "loss": 8.6123, "step": 95900 }, { "epoch": 15.36, "grad_norm": 0.13185660541057587, "learning_rate": 0.000297708796351854, "loss": 8.6665, "step": 96000 }, { "epoch": 15.376, "grad_norm": 0.1910812258720398, "learning_rate": 0.0002977063962558502, "loss": 8.2323, "step": 96100 }, { "epoch": 15.392, "grad_norm": 0.18493321537971497, "learning_rate": 0.00029770399615984636, "loss": 8.2076, "step": 96200 }, { "epoch": 15.408, "grad_norm": 0.15737323462963104, "learning_rate": 0.00029770159606384253, "loss": 8.4031, "step": 96300 }, { "epoch": 15.424, "grad_norm": 0.1808168590068817, "learning_rate": 0.0002976991959678387, "loss": 8.0816, "step": 96400 }, { "epoch": 15.44, "grad_norm": 0.12530648708343506, "learning_rate": 0.0002976967958718348, "loss": 8.0609, "step": 96500 }, { "epoch": 15.456, "grad_norm": 0.12963543832302094, "learning_rate": 0.000297694395775831, "loss": 8.092, "step": 96600 }, { "epoch": 15.472, "grad_norm": 0.1329260617494583, "learning_rate": 0.00029769199567982715, "loss": 8.4219, "step": 96700 }, { "epoch": 15.488, "grad_norm": 0.1603865921497345, "learning_rate": 0.0002976895955838233, "loss": 7.8878, "step": 96800 }, { "epoch": 15.504, "grad_norm": 0.16902674734592438, "learning_rate": 0.0002976871954878195, "loss": 8.2197, "step": 96900 }, { "epoch": 15.52, "grad_norm": 0.15807543694972992, "learning_rate": 0.0002976847953918156, "loss": 7.937, "step": 97000 }, { "epoch": 15.536, "grad_norm": 0.15132875740528107, "learning_rate": 0.0002976823952958118, "loss": 8.6177, "step": 97100 }, { "epoch": 15.552, "grad_norm": 0.1347590982913971, "learning_rate": 0.00029767999519980795, "loss": 8.7107, "step": 97200 }, { "epoch": 15.568, "grad_norm": 0.16151072084903717, "learning_rate": 0.0002976775951038041, "loss": 8.4782, "step": 97300 }, { "epoch": 15.584, "grad_norm": 0.194889098405838, "learning_rate": 0.0002976751950078003, "loss": 8.128, "step": 97400 }, { "epoch": 15.6, "grad_norm": 0.18148979544639587, "learning_rate": 0.00029767279491179645, "loss": 8.3591, "step": 97500 }, { "epoch": 15.616, "grad_norm": 0.1610337197780609, "learning_rate": 0.00029767039481579257, "loss": 8.8492, "step": 97600 }, { "epoch": 15.632, "grad_norm": 0.15079425275325775, "learning_rate": 0.00029766799471978874, "loss": 8.2512, "step": 97700 }, { "epoch": 15.648, "grad_norm": 0.1274147629737854, "learning_rate": 0.0002976655946237849, "loss": 8.2239, "step": 97800 }, { "epoch": 15.664, "grad_norm": 0.14330662786960602, "learning_rate": 0.0002976631945277811, "loss": 8.3046, "step": 97900 }, { "epoch": 15.68, "grad_norm": 0.17394746840000153, "learning_rate": 0.00029766079443177725, "loss": 8.2542, "step": 98000 }, { "epoch": 15.696, "grad_norm": 0.15639960765838623, "learning_rate": 0.0002976583943357734, "loss": 8.3993, "step": 98100 }, { "epoch": 15.712, "grad_norm": 0.12845559418201447, "learning_rate": 0.0002976559942397696, "loss": 8.2055, "step": 98200 }, { "epoch": 15.728, "grad_norm": 0.1673252284526825, "learning_rate": 0.00029765359414376575, "loss": 8.2969, "step": 98300 }, { "epoch": 15.744, "grad_norm": 0.12345835566520691, "learning_rate": 0.0002976511940477619, "loss": 8.4381, "step": 98400 }, { "epoch": 15.76, "grad_norm": 0.19648896157741547, "learning_rate": 0.00029764879395175804, "loss": 8.0932, "step": 98500 }, { "epoch": 15.776, "grad_norm": 0.14960013329982758, "learning_rate": 0.0002976463938557542, "loss": 8.4303, "step": 98600 }, { "epoch": 15.792, "grad_norm": 0.19554351270198822, "learning_rate": 0.0002976439937597504, "loss": 8.0159, "step": 98700 }, { "epoch": 15.808, "grad_norm": 0.1545807123184204, "learning_rate": 0.00029764159366374654, "loss": 8.0277, "step": 98800 }, { "epoch": 15.824, "grad_norm": 0.11705837398767471, "learning_rate": 0.0002976391935677427, "loss": 8.2474, "step": 98900 }, { "epoch": 15.84, "grad_norm": 0.16222915053367615, "learning_rate": 0.00029763679347173883, "loss": 7.8129, "step": 99000 }, { "epoch": 15.856, "grad_norm": 0.18901053071022034, "learning_rate": 0.000297634393375735, "loss": 8.3068, "step": 99100 }, { "epoch": 15.872, "grad_norm": 0.13031688332557678, "learning_rate": 0.00029763199327973117, "loss": 8.1526, "step": 99200 }, { "epoch": 15.888, "grad_norm": 0.17539045214653015, "learning_rate": 0.00029762959318372734, "loss": 7.7545, "step": 99300 }, { "epoch": 15.904, "grad_norm": NaN, "learning_rate": 0.0002976271930877235, "loss": 7.8745, "step": 99400 }, { "epoch": 15.92, "grad_norm": 0.17992717027664185, "learning_rate": 0.0002976248169926797, "loss": 7.9663, "step": 99500 }, { "epoch": 15.936, "grad_norm": 0.40667879581451416, "learning_rate": 0.0002976224168966758, "loss": 8.1505, "step": 99600 }, { "epoch": 15.952, "grad_norm": 0.15805494785308838, "learning_rate": 0.000297620016800672, "loss": 8.4417, "step": 99700 }, { "epoch": 15.968, "grad_norm": 0.16626039147377014, "learning_rate": 0.00029761761670466815, "loss": 8.2951, "step": 99800 }, { "epoch": 15.984, "grad_norm": 0.14239948987960815, "learning_rate": 0.0002976152166086643, "loss": 8.3205, "step": 99900 }, { "epoch": 16.0, "grad_norm": 0.24553033709526062, "learning_rate": 0.0002976128165126605, "loss": 8.2056, "step": 100000 }, { "epoch": 16.016, "grad_norm": 0.18159309029579163, "learning_rate": 0.0002976104164166566, "loss": 7.9151, "step": 100100 }, { "epoch": 16.032, "grad_norm": 0.16968666017055511, "learning_rate": 0.00029760801632065277, "loss": 7.8903, "step": 100200 }, { "epoch": 16.048, "grad_norm": 0.1661410927772522, "learning_rate": 0.00029760561622464894, "loss": 8.3051, "step": 100300 }, { "epoch": 16.064, "grad_norm": 0.1526879370212555, "learning_rate": 0.0002976032161286451, "loss": 7.8435, "step": 100400 }, { "epoch": 16.08, "grad_norm": 0.14917099475860596, "learning_rate": 0.0002976008160326413, "loss": 8.0571, "step": 100500 }, { "epoch": 16.096, "grad_norm": 0.15157845616340637, "learning_rate": 0.00029759841593663745, "loss": 8.0002, "step": 100600 }, { "epoch": 16.112, "grad_norm": 0.1487221121788025, "learning_rate": 0.00029759601584063356, "loss": 7.864, "step": 100700 }, { "epoch": 16.128, "grad_norm": 0.1397908627986908, "learning_rate": 0.00029759361574462973, "loss": 8.0639, "step": 100800 }, { "epoch": 16.144, "grad_norm": 0.1495772898197174, "learning_rate": 0.0002975912156486259, "loss": 7.8346, "step": 100900 }, { "epoch": 16.16, "grad_norm": 0.17440412938594818, "learning_rate": 0.00029758881555262207, "loss": 8.1732, "step": 101000 }, { "epoch": 16.176, "grad_norm": 0.15802791714668274, "learning_rate": 0.00029758641545661824, "loss": 7.9528, "step": 101100 }, { "epoch": 16.192, "grad_norm": 0.15488143265247345, "learning_rate": 0.0002975840153606144, "loss": 7.8414, "step": 101200 }, { "epoch": 16.208, "grad_norm": 0.1365291178226471, "learning_rate": 0.0002975816152646106, "loss": 7.9363, "step": 101300 }, { "epoch": 16.224, "grad_norm": 0.13933680951595306, "learning_rate": 0.00029757921516860675, "loss": 7.5429, "step": 101400 }, { "epoch": 16.24, "grad_norm": 0.19280196726322174, "learning_rate": 0.0002975768150726029, "loss": 7.913, "step": 101500 }, { "epoch": 16.256, "grad_norm": 0.11700501292943954, "learning_rate": 0.00029757441497659903, "loss": 8.0237, "step": 101600 }, { "epoch": 16.272, "grad_norm": 0.16518530249595642, "learning_rate": 0.0002975720388815552, "loss": 7.8771, "step": 101700 }, { "epoch": 16.288, "grad_norm": 0.14215916395187378, "learning_rate": 0.0002975696387855514, "loss": 8.2513, "step": 101800 }, { "epoch": 16.304, "grad_norm": 0.15119720995426178, "learning_rate": 0.00029756723868954756, "loss": 8.0416, "step": 101900 }, { "epoch": 16.32, "grad_norm": 0.17267923057079315, "learning_rate": 0.0002975648385935437, "loss": 7.7183, "step": 102000 }, { "epoch": 16.336, "grad_norm": 0.13659106194972992, "learning_rate": 0.00029756243849753984, "loss": 7.6539, "step": 102100 }, { "epoch": 16.352, "grad_norm": 0.13859499990940094, "learning_rate": 0.000297560038401536, "loss": 7.9309, "step": 102200 }, { "epoch": 16.368, "grad_norm": 0.16713272035121918, "learning_rate": 0.0002975576383055322, "loss": 7.7884, "step": 102300 }, { "epoch": 16.384, "grad_norm": 0.19469381868839264, "learning_rate": 0.00029755523820952835, "loss": 7.6944, "step": 102400 }, { "epoch": 16.4, "grad_norm": 0.14082291722297668, "learning_rate": 0.0002975528381135245, "loss": 7.5828, "step": 102500 }, { "epoch": 16.416, "grad_norm": 0.12121783196926117, "learning_rate": 0.0002975504380175207, "loss": 7.813, "step": 102600 }, { "epoch": 16.432, "grad_norm": 0.22072196006774902, "learning_rate": 0.0002975480379215168, "loss": 8.2315, "step": 102700 }, { "epoch": 16.448, "grad_norm": 0.1469603329896927, "learning_rate": 0.00029754563782551297, "loss": 8.0137, "step": 102800 }, { "epoch": 16.464, "grad_norm": 0.11437113583087921, "learning_rate": 0.00029754323772950914, "loss": 7.3291, "step": 102900 }, { "epoch": 16.48, "grad_norm": 0.17373935878276825, "learning_rate": 0.0002975408376335053, "loss": 8.0078, "step": 103000 }, { "epoch": 16.496, "grad_norm": 0.12379905581474304, "learning_rate": 0.0002975384375375015, "loss": 8.0724, "step": 103100 }, { "epoch": 16.512, "grad_norm": 0.1540013700723648, "learning_rate": 0.00029753603744149765, "loss": 7.6953, "step": 103200 }, { "epoch": 16.528, "grad_norm": 0.21880146861076355, "learning_rate": 0.00029753363734549376, "loss": 8.0522, "step": 103300 }, { "epoch": 16.544, "grad_norm": 0.14410023391246796, "learning_rate": 0.00029753123724948993, "loss": 8.191, "step": 103400 }, { "epoch": 16.56, "grad_norm": 0.13037148118019104, "learning_rate": 0.0002975288371534861, "loss": 7.6117, "step": 103500 }, { "epoch": 16.576, "grad_norm": 0.16236849129199982, "learning_rate": 0.00029752643705748227, "loss": 7.9894, "step": 103600 }, { "epoch": 16.592, "grad_norm": 0.1502009928226471, "learning_rate": 0.00029752403696147844, "loss": 7.7302, "step": 103700 }, { "epoch": 16.608, "grad_norm": 0.18485447764396667, "learning_rate": 0.00029752163686547455, "loss": 7.8743, "step": 103800 }, { "epoch": 16.624, "grad_norm": 0.12873640656471252, "learning_rate": 0.0002975192367694707, "loss": 7.6197, "step": 103900 }, { "epoch": 16.64, "grad_norm": 0.11517874896526337, "learning_rate": 0.0002975168366734669, "loss": 7.4887, "step": 104000 }, { "epoch": 16.656, "grad_norm": 0.11515144258737564, "learning_rate": 0.00029751443657746306, "loss": 7.706, "step": 104100 }, { "epoch": 16.672, "grad_norm": 0.15465959906578064, "learning_rate": 0.00029751203648145923, "loss": 7.3052, "step": 104200 }, { "epoch": 16.688, "grad_norm": 0.12962587177753448, "learning_rate": 0.0002975096603864154, "loss": 7.8117, "step": 104300 }, { "epoch": 16.704, "grad_norm": 0.18321260809898376, "learning_rate": 0.0002975072602904116, "loss": 7.4464, "step": 104400 }, { "epoch": 16.72, "grad_norm": 0.1769808679819107, "learning_rate": 0.00029750486019440776, "loss": 7.8639, "step": 104500 }, { "epoch": 16.736, "grad_norm": 0.15869227051734924, "learning_rate": 0.00029750246009840393, "loss": 7.7956, "step": 104600 }, { "epoch": 16.752, "grad_norm": 0.12134505808353424, "learning_rate": 0.00029750006000240004, "loss": 7.5809, "step": 104700 }, { "epoch": 16.768, "grad_norm": 0.13986830413341522, "learning_rate": 0.0002974976599063962, "loss": 7.4372, "step": 104800 }, { "epoch": 16.784, "grad_norm": 0.1761140078306198, "learning_rate": 0.0002974952598103924, "loss": 7.7486, "step": 104900 }, { "epoch": 16.8, "grad_norm": 0.13163812458515167, "learning_rate": 0.00029749285971438855, "loss": 7.834, "step": 105000 }, { "epoch": 16.816, "grad_norm": 0.1813841462135315, "learning_rate": 0.0002974904596183847, "loss": 7.5974, "step": 105100 }, { "epoch": 16.832, "grad_norm": 0.15655750036239624, "learning_rate": 0.0002974880595223809, "loss": 7.4437, "step": 105200 }, { "epoch": 16.848, "grad_norm": 0.16123917698860168, "learning_rate": 0.000297485659426377, "loss": 7.347, "step": 105300 }, { "epoch": 16.864, "grad_norm": 0.18692290782928467, "learning_rate": 0.00029748325933037317, "loss": 7.8658, "step": 105400 }, { "epoch": 16.88, "grad_norm": 0.15913629531860352, "learning_rate": 0.00029748085923436934, "loss": 7.9134, "step": 105500 }, { "epoch": 16.896, "grad_norm": 0.1343807876110077, "learning_rate": 0.0002974784591383655, "loss": 7.5983, "step": 105600 }, { "epoch": 16.912, "grad_norm": 0.2009182572364807, "learning_rate": 0.0002974760590423617, "loss": 7.3442, "step": 105700 }, { "epoch": 16.928, "grad_norm": 0.1569000780582428, "learning_rate": 0.0002974736589463578, "loss": 7.5953, "step": 105800 }, { "epoch": 16.944, "grad_norm": 0.1601628214120865, "learning_rate": 0.00029747125885035396, "loss": 7.5624, "step": 105900 }, { "epoch": 16.96, "grad_norm": 0.14143775403499603, "learning_rate": 0.00029746885875435013, "loss": 7.579, "step": 106000 }, { "epoch": 16.976, "grad_norm": 0.2106146216392517, "learning_rate": 0.0002974664586583463, "loss": 7.5958, "step": 106100 }, { "epoch": 16.992, "grad_norm": 0.17329080402851105, "learning_rate": 0.00029746405856234247, "loss": 8.0935, "step": 106200 }, { "epoch": 17.008, "grad_norm": 0.19225256145000458, "learning_rate": 0.00029746165846633864, "loss": 6.8958, "step": 106300 }, { "epoch": 17.024, "grad_norm": 0.17550058662891388, "learning_rate": 0.00029745925837033476, "loss": 7.4002, "step": 106400 }, { "epoch": 17.04, "grad_norm": 0.16778625547885895, "learning_rate": 0.0002974568582743309, "loss": 7.698, "step": 106500 }, { "epoch": 17.056, "grad_norm": 0.14647962152957916, "learning_rate": 0.0002974544581783271, "loss": 7.5615, "step": 106600 }, { "epoch": 17.072, "grad_norm": 0.15024389326572418, "learning_rate": 0.00029745205808232326, "loss": 7.6671, "step": 106700 }, { "epoch": 17.088, "grad_norm": 0.11949127167463303, "learning_rate": 0.00029744965798631943, "loss": 7.6843, "step": 106800 }, { "epoch": 17.104, "grad_norm": 0.15480674803256989, "learning_rate": 0.00029744725789031555, "loss": 7.9465, "step": 106900 }, { "epoch": 17.12, "grad_norm": 0.14191922545433044, "learning_rate": 0.0002974448577943117, "loss": 7.7372, "step": 107000 }, { "epoch": 17.136, "grad_norm": 0.19336700439453125, "learning_rate": 0.0002974424576983079, "loss": 7.6904, "step": 107100 }, { "epoch": 17.152, "grad_norm": 0.17240415513515472, "learning_rate": 0.0002974400576023041, "loss": 7.4487, "step": 107200 }, { "epoch": 17.168, "grad_norm": 0.135718435049057, "learning_rate": 0.0002974376575063002, "loss": 7.5844, "step": 107300 }, { "epoch": 17.184, "grad_norm": 0.13594204187393188, "learning_rate": 0.0002974352574102964, "loss": 7.1186, "step": 107400 }, { "epoch": 17.2, "grad_norm": 0.14997251331806183, "learning_rate": 0.00029743285731429256, "loss": 7.3525, "step": 107500 }, { "epoch": 17.216, "grad_norm": 0.1264813244342804, "learning_rate": 0.00029743045721828873, "loss": 7.8519, "step": 107600 }, { "epoch": 17.232, "grad_norm": 0.16751745343208313, "learning_rate": 0.0002974280571222849, "loss": 7.346, "step": 107700 }, { "epoch": 17.248, "grad_norm": 0.196015402674675, "learning_rate": 0.000297425657026281, "loss": 7.5401, "step": 107800 }, { "epoch": 17.264, "grad_norm": 0.14854785799980164, "learning_rate": 0.0002974232569302772, "loss": 7.3802, "step": 107900 }, { "epoch": 17.28, "grad_norm": 0.1462150365114212, "learning_rate": 0.00029742085683427335, "loss": 7.56, "step": 108000 }, { "epoch": 17.296, "grad_norm": 0.18656545877456665, "learning_rate": 0.0002974184567382695, "loss": 7.4044, "step": 108100 }, { "epoch": 17.312, "grad_norm": 0.15170492231845856, "learning_rate": 0.0002974160566422657, "loss": 7.1246, "step": 108200 }, { "epoch": 17.328, "grad_norm": 0.13659091293811798, "learning_rate": 0.00029741365654626186, "loss": 7.5455, "step": 108300 }, { "epoch": 17.344, "grad_norm": 0.1527138650417328, "learning_rate": 0.000297411256450258, "loss": 7.5807, "step": 108400 }, { "epoch": 17.36, "grad_norm": 0.15352298319339752, "learning_rate": 0.00029740885635425415, "loss": 7.3586, "step": 108500 }, { "epoch": 17.376, "grad_norm": 0.16372795403003693, "learning_rate": 0.0002974065042601704, "loss": 7.5309, "step": 108600 }, { "epoch": 17.392, "grad_norm": 0.14718171954154968, "learning_rate": 0.0002974041041641665, "loss": 7.7871, "step": 108700 }, { "epoch": 17.408, "grad_norm": 0.13745012879371643, "learning_rate": 0.0002974017040681627, "loss": 7.4228, "step": 108800 }, { "epoch": 17.424, "grad_norm": 0.1310426890850067, "learning_rate": 0.00029739930397215886, "loss": 6.914, "step": 108900 }, { "epoch": 17.44, "grad_norm": 0.1291857808828354, "learning_rate": 0.00029739690387615503, "loss": 7.5163, "step": 109000 }, { "epoch": 17.456, "grad_norm": 0.1615869104862213, "learning_rate": 0.0002973945037801512, "loss": 6.9051, "step": 109100 }, { "epoch": 17.472, "grad_norm": 0.11409099400043488, "learning_rate": 0.00029739210368414737, "loss": 7.4919, "step": 109200 }, { "epoch": 17.488, "grad_norm": 0.12527474761009216, "learning_rate": 0.0002973897035881435, "loss": 7.5104, "step": 109300 }, { "epoch": 17.504, "grad_norm": 0.1936863362789154, "learning_rate": 0.00029738730349213965, "loss": 7.1046, "step": 109400 }, { "epoch": 17.52, "grad_norm": 0.12854978442192078, "learning_rate": 0.0002973849033961358, "loss": 7.4067, "step": 109500 }, { "epoch": 17.536, "grad_norm": 0.13116727769374847, "learning_rate": 0.000297382503300132, "loss": 7.2106, "step": 109600 }, { "epoch": 17.552, "grad_norm": 0.16138528287410736, "learning_rate": 0.00029738010320412816, "loss": 7.263, "step": 109700 }, { "epoch": 17.568, "grad_norm": 0.14999186992645264, "learning_rate": 0.0002973777031081243, "loss": 7.428, "step": 109800 }, { "epoch": 17.584, "grad_norm": 0.13564202189445496, "learning_rate": 0.00029737530301212045, "loss": 7.6592, "step": 109900 }, { "epoch": 17.6, "grad_norm": 0.14535826444625854, "learning_rate": 0.0002973729029161166, "loss": 7.2886, "step": 110000 }, { "epoch": 17.616, "grad_norm": 0.13466519117355347, "learning_rate": 0.0002973705028201128, "loss": 7.4852, "step": 110100 }, { "epoch": 17.632, "grad_norm": 0.1622999757528305, "learning_rate": 0.00029736810272410895, "loss": 7.6437, "step": 110200 }, { "epoch": 17.648, "grad_norm": 0.15417474508285522, "learning_rate": 0.0002973657026281051, "loss": 7.4305, "step": 110300 }, { "epoch": 17.664, "grad_norm": 0.1484052836894989, "learning_rate": 0.00029736330253210124, "loss": 7.5558, "step": 110400 }, { "epoch": 17.68, "grad_norm": 0.15688396990299225, "learning_rate": 0.0002973609024360974, "loss": 7.4349, "step": 110500 }, { "epoch": 17.696, "grad_norm": 0.15338055789470673, "learning_rate": 0.0002973585023400936, "loss": 7.2818, "step": 110600 }, { "epoch": 17.712, "grad_norm": 0.1761266142129898, "learning_rate": 0.00029735610224408974, "loss": 7.2618, "step": 110700 }, { "epoch": 17.728, "grad_norm": 0.17337530851364136, "learning_rate": 0.0002973537021480859, "loss": 7.0263, "step": 110800 }, { "epoch": 17.744, "grad_norm": 0.14693669974803925, "learning_rate": 0.00029735130205208203, "loss": 6.9075, "step": 110900 }, { "epoch": 17.76, "grad_norm": 0.14184145629405975, "learning_rate": 0.00029734892595703827, "loss": 7.1306, "step": 111000 }, { "epoch": 17.776, "grad_norm": 0.15281623601913452, "learning_rate": 0.00029734652586103444, "loss": 6.9965, "step": 111100 }, { "epoch": 17.792, "grad_norm": 0.30168259143829346, "learning_rate": 0.0002973441257650306, "loss": 7.3388, "step": 111200 }, { "epoch": 17.808, "grad_norm": 0.15365231037139893, "learning_rate": 0.0002973417256690267, "loss": 7.2799, "step": 111300 }, { "epoch": 17.824, "grad_norm": 0.1704150289297104, "learning_rate": 0.0002973393255730229, "loss": 7.3031, "step": 111400 }, { "epoch": 17.84, "grad_norm": 0.16025039553642273, "learning_rate": 0.00029733692547701906, "loss": 6.9446, "step": 111500 }, { "epoch": 17.856, "grad_norm": 0.14661014080047607, "learning_rate": 0.00029733452538101523, "loss": 7.4911, "step": 111600 }, { "epoch": 17.872, "grad_norm": 0.18997499346733093, "learning_rate": 0.0002973321252850114, "loss": 7.2489, "step": 111700 }, { "epoch": 17.888, "grad_norm": 0.16025018692016602, "learning_rate": 0.0002973297251890075, "loss": 7.4835, "step": 111800 }, { "epoch": 17.904, "grad_norm": 0.19556750357151031, "learning_rate": 0.0002973273250930037, "loss": 7.5087, "step": 111900 }, { "epoch": 17.92, "grad_norm": 0.14444762468338013, "learning_rate": 0.00029732492499699986, "loss": 7.3942, "step": 112000 }, { "epoch": 17.936, "grad_norm": 0.12939786911010742, "learning_rate": 0.000297322524900996, "loss": 7.0694, "step": 112100 }, { "epoch": 17.951999999999998, "grad_norm": 0.1845860481262207, "learning_rate": 0.0002973201248049922, "loss": 7.3517, "step": 112200 }, { "epoch": 17.968, "grad_norm": 0.1611936390399933, "learning_rate": 0.00029731772470898836, "loss": 7.3119, "step": 112300 }, { "epoch": 17.984, "grad_norm": 0.1410474330186844, "learning_rate": 0.0002973153246129845, "loss": 7.1857, "step": 112400 }, { "epoch": 18.0, "grad_norm": 0.14935807883739471, "learning_rate": 0.00029731292451698065, "loss": 7.2314, "step": 112500 }, { "epoch": 18.016, "grad_norm": 0.11792614310979843, "learning_rate": 0.0002973105244209768, "loss": 7.0182, "step": 112600 }, { "epoch": 18.032, "grad_norm": 0.19907847046852112, "learning_rate": 0.000297308124324973, "loss": 7.036, "step": 112700 }, { "epoch": 18.048, "grad_norm": 0.11814866214990616, "learning_rate": 0.00029730572422896915, "loss": 7.2484, "step": 112800 }, { "epoch": 18.064, "grad_norm": 0.16914184391498566, "learning_rate": 0.00029730332413296527, "loss": 7.0729, "step": 112900 }, { "epoch": 18.08, "grad_norm": 0.11930215358734131, "learning_rate": 0.00029730092403696144, "loss": 6.9642, "step": 113000 }, { "epoch": 18.096, "grad_norm": 0.14744411408901215, "learning_rate": 0.0002972985239409576, "loss": 7.1132, "step": 113100 }, { "epoch": 18.112, "grad_norm": 0.1400415003299713, "learning_rate": 0.0002972961238449538, "loss": 7.1415, "step": 113200 }, { "epoch": 18.128, "grad_norm": 0.1671387106180191, "learning_rate": 0.00029729374774990997, "loss": 7.2558, "step": 113300 }, { "epoch": 18.144, "grad_norm": 0.16554495692253113, "learning_rate": 0.00029729134765390613, "loss": 6.9987, "step": 113400 }, { "epoch": 18.16, "grad_norm": 0.1383550763130188, "learning_rate": 0.0002972889475579023, "loss": 7.0975, "step": 113500 }, { "epoch": 18.176, "grad_norm": 0.1566449999809265, "learning_rate": 0.0002972865474618985, "loss": 7.0562, "step": 113600 }, { "epoch": 18.192, "grad_norm": 0.19498635828495026, "learning_rate": 0.00029728414736589464, "loss": 6.6165, "step": 113700 }, { "epoch": 18.208, "grad_norm": 0.1640356481075287, "learning_rate": 0.00029728174726989076, "loss": 7.1794, "step": 113800 }, { "epoch": 18.224, "grad_norm": 0.11614058166742325, "learning_rate": 0.0002972793471738869, "loss": 7.285, "step": 113900 }, { "epoch": 18.24, "grad_norm": 0.15918317437171936, "learning_rate": 0.0002972769470778831, "loss": 7.163, "step": 114000 }, { "epoch": 18.256, "grad_norm": 0.1565544754266739, "learning_rate": 0.00029727454698187926, "loss": 7.225, "step": 114100 }, { "epoch": 18.272, "grad_norm": 0.17850929498672485, "learning_rate": 0.00029727214688587543, "loss": 6.801, "step": 114200 }, { "epoch": 18.288, "grad_norm": 0.11589377373456955, "learning_rate": 0.0002972697467898716, "loss": 6.8754, "step": 114300 }, { "epoch": 18.304, "grad_norm": 0.13528980314731598, "learning_rate": 0.0002972673466938677, "loss": 7.1785, "step": 114400 }, { "epoch": 18.32, "grad_norm": 0.14462067186832428, "learning_rate": 0.0002972649465978639, "loss": 6.7743, "step": 114500 }, { "epoch": 18.336, "grad_norm": 0.11352884024381638, "learning_rate": 0.0002972625705028201, "loss": 7.195, "step": 114600 }, { "epoch": 18.352, "grad_norm": 0.15487293899059296, "learning_rate": 0.00029726017040681624, "loss": 6.9974, "step": 114700 }, { "epoch": 18.368, "grad_norm": 0.18302305042743683, "learning_rate": 0.0002972577703108124, "loss": 7.3688, "step": 114800 }, { "epoch": 18.384, "grad_norm": 0.13732467591762543, "learning_rate": 0.00029725537021480853, "loss": 7.1072, "step": 114900 }, { "epoch": 18.4, "grad_norm": 0.16661597788333893, "learning_rate": 0.0002972529701188047, "loss": 6.9747, "step": 115000 }, { "epoch": 18.416, "grad_norm": 0.13797527551651, "learning_rate": 0.00029725057002280087, "loss": 6.9419, "step": 115100 }, { "epoch": 18.432, "grad_norm": 0.12859782576560974, "learning_rate": 0.00029724816992679704, "loss": 6.7853, "step": 115200 }, { "epoch": 18.448, "grad_norm": 0.14815713465213776, "learning_rate": 0.0002972457698307932, "loss": 7.2451, "step": 115300 }, { "epoch": 18.464, "grad_norm": 0.17937737703323364, "learning_rate": 0.0002972433697347894, "loss": 6.9378, "step": 115400 }, { "epoch": 18.48, "grad_norm": 0.1678260713815689, "learning_rate": 0.0002972409696387855, "loss": 7.324, "step": 115500 }, { "epoch": 18.496, "grad_norm": 0.1482672095298767, "learning_rate": 0.0002972385695427817, "loss": 6.7464, "step": 115600 }, { "epoch": 18.512, "grad_norm": 0.13717281818389893, "learning_rate": 0.0002972361694467779, "loss": 6.9728, "step": 115700 }, { "epoch": 18.528, "grad_norm": 0.16356568038463593, "learning_rate": 0.000297233769350774, "loss": 6.4269, "step": 115800 }, { "epoch": 18.544, "grad_norm": 0.11255384981632233, "learning_rate": 0.00029723136925477017, "loss": 6.8938, "step": 115900 }, { "epoch": 18.56, "grad_norm": 0.18403998017311096, "learning_rate": 0.00029722896915876634, "loss": 7.5852, "step": 116000 }, { "epoch": 18.576, "grad_norm": 0.16399045288562775, "learning_rate": 0.0002972265690627625, "loss": 6.8499, "step": 116100 }, { "epoch": 18.592, "grad_norm": 0.1565336287021637, "learning_rate": 0.0002972241689667587, "loss": 6.7727, "step": 116200 }, { "epoch": 18.608, "grad_norm": 0.19689014554023743, "learning_rate": 0.00029722176887075484, "loss": 7.1385, "step": 116300 }, { "epoch": 18.624, "grad_norm": 0.13252195715904236, "learning_rate": 0.00029721936877475096, "loss": 6.6291, "step": 116400 }, { "epoch": 18.64, "grad_norm": 0.12019433081150055, "learning_rate": 0.00029721696867874713, "loss": 6.8913, "step": 116500 }, { "epoch": 18.656, "grad_norm": 0.16386528313159943, "learning_rate": 0.0002972145685827433, "loss": 6.7989, "step": 116600 }, { "epoch": 18.672, "grad_norm": 0.13716477155685425, "learning_rate": 0.00029721216848673947, "loss": 6.6763, "step": 116700 }, { "epoch": 18.688, "grad_norm": 0.13785770535469055, "learning_rate": 0.00029720976839073564, "loss": 6.6476, "step": 116800 }, { "epoch": 18.704, "grad_norm": 0.1605842560529709, "learning_rate": 0.00029720736829473175, "loss": 6.6566, "step": 116900 }, { "epoch": 18.72, "grad_norm": 0.19339755177497864, "learning_rate": 0.0002972049681987279, "loss": 6.9454, "step": 117000 }, { "epoch": 18.736, "grad_norm": 0.14963068068027496, "learning_rate": 0.0002972025681027241, "loss": 7.0718, "step": 117100 }, { "epoch": 18.752, "grad_norm": 0.1378934234380722, "learning_rate": 0.00029720016800672026, "loss": 6.7582, "step": 117200 }, { "epoch": 18.768, "grad_norm": 0.1546606719493866, "learning_rate": 0.0002971977679107164, "loss": 6.9278, "step": 117300 }, { "epoch": 18.784, "grad_norm": 0.13777601718902588, "learning_rate": 0.0002971953678147126, "loss": 6.821, "step": 117400 }, { "epoch": 18.8, "grad_norm": 0.1833031326532364, "learning_rate": 0.0002971929677187087, "loss": 7.345, "step": 117500 }, { "epoch": 18.816, "grad_norm": 0.13752517104148865, "learning_rate": 0.0002971905676227049, "loss": 7.0435, "step": 117600 }, { "epoch": 18.832, "grad_norm": 0.14740273356437683, "learning_rate": 0.00029718816752670105, "loss": 7.0617, "step": 117700 }, { "epoch": 18.848, "grad_norm": 0.13207408785820007, "learning_rate": 0.0002971857674306972, "loss": 6.9374, "step": 117800 }, { "epoch": 18.864, "grad_norm": 0.14092418551445007, "learning_rate": 0.0002971833673346934, "loss": 6.5626, "step": 117900 }, { "epoch": 18.88, "grad_norm": 0.19631852209568024, "learning_rate": 0.0002971809672386895, "loss": 7.162, "step": 118000 }, { "epoch": 18.896, "grad_norm": 0.12741628289222717, "learning_rate": 0.00029717856714268567, "loss": 6.8316, "step": 118100 }, { "epoch": 18.912, "grad_norm": 0.17144246399402618, "learning_rate": 0.00029717616704668184, "loss": 6.5714, "step": 118200 }, { "epoch": 18.928, "grad_norm": 0.1456017643213272, "learning_rate": 0.000297173766950678, "loss": 7.1563, "step": 118300 }, { "epoch": 18.944, "grad_norm": 0.17816682159900665, "learning_rate": 0.0002971713668546742, "loss": 7.1767, "step": 118400 }, { "epoch": 18.96, "grad_norm": 0.274588942527771, "learning_rate": 0.00029716896675867035, "loss": 6.9244, "step": 118500 }, { "epoch": 18.976, "grad_norm": 0.14686717092990875, "learning_rate": 0.00029716656666266646, "loss": 6.9108, "step": 118600 }, { "epoch": 18.992, "grad_norm": 0.1549716740846634, "learning_rate": 0.00029716416656666263, "loss": 7.1166, "step": 118700 }, { "epoch": 19.008, "grad_norm": 0.24241045117378235, "learning_rate": 0.0002971617664706588, "loss": 6.7128, "step": 118800 }, { "epoch": 19.024, "grad_norm": 0.14365893602371216, "learning_rate": 0.00029715936637465497, "loss": 6.5973, "step": 118900 }, { "epoch": 19.04, "grad_norm": 0.1771174818277359, "learning_rate": 0.00029715696627865114, "loss": 6.8558, "step": 119000 }, { "epoch": 19.056, "grad_norm": 0.1703067272901535, "learning_rate": 0.00029715456618264726, "loss": 6.748, "step": 119100 }, { "epoch": 19.072, "grad_norm": 0.1466696858406067, "learning_rate": 0.0002971521660866434, "loss": 6.6093, "step": 119200 }, { "epoch": 19.088, "grad_norm": 0.16070063412189484, "learning_rate": 0.0002971497659906396, "loss": 6.7417, "step": 119300 }, { "epoch": 19.104, "grad_norm": 0.2056402564048767, "learning_rate": 0.00029714738989559584, "loss": 6.4175, "step": 119400 }, { "epoch": 19.12, "grad_norm": 0.207046240568161, "learning_rate": 0.00029714498979959195, "loss": 6.9465, "step": 119500 }, { "epoch": 19.136, "grad_norm": 0.12638603150844574, "learning_rate": 0.0002971425897035881, "loss": 6.882, "step": 119600 }, { "epoch": 19.152, "grad_norm": 0.17709197103977203, "learning_rate": 0.0002971401896075843, "loss": 6.5151, "step": 119700 }, { "epoch": 19.168, "grad_norm": 0.14313985407352448, "learning_rate": 0.00029713778951158046, "loss": 6.6897, "step": 119800 }, { "epoch": 19.184, "grad_norm": 0.14212185144424438, "learning_rate": 0.00029713538941557663, "loss": 7.0293, "step": 119900 }, { "epoch": 19.2, "grad_norm": 0.14830344915390015, "learning_rate": 0.00029713298931957274, "loss": 6.8398, "step": 120000 }, { "epoch": 19.216, "grad_norm": 0.24165965616703033, "learning_rate": 0.0002971305892235689, "loss": 6.715, "step": 120100 }, { "epoch": 19.232, "grad_norm": 0.13292773067951202, "learning_rate": 0.0002971281891275651, "loss": 6.8165, "step": 120200 }, { "epoch": 19.248, "grad_norm": 0.1639406383037567, "learning_rate": 0.00029712578903156125, "loss": 6.9099, "step": 120300 }, { "epoch": 19.264, "grad_norm": 0.18321408331394196, "learning_rate": 0.0002971233889355574, "loss": 6.4805, "step": 120400 }, { "epoch": 19.28, "grad_norm": 0.18382756412029266, "learning_rate": 0.0002971209888395536, "loss": 6.8172, "step": 120500 }, { "epoch": 19.296, "grad_norm": 0.15303823351860046, "learning_rate": 0.0002971185887435497, "loss": 6.2661, "step": 120600 }, { "epoch": 19.312, "grad_norm": 0.1740507036447525, "learning_rate": 0.0002971161886475459, "loss": 6.6127, "step": 120700 }, { "epoch": 19.328, "grad_norm": 0.14414259791374207, "learning_rate": 0.00029711378855154204, "loss": 6.4442, "step": 120800 }, { "epoch": 19.344, "grad_norm": 0.14647360146045685, "learning_rate": 0.0002971113884555382, "loss": 6.6076, "step": 120900 }, { "epoch": 19.36, "grad_norm": 0.15991808474063873, "learning_rate": 0.0002971089883595344, "loss": 6.787, "step": 121000 }, { "epoch": 19.376, "grad_norm": 0.1332535594701767, "learning_rate": 0.0002971065882635305, "loss": 6.7092, "step": 121100 }, { "epoch": 19.392, "grad_norm": 0.14746126532554626, "learning_rate": 0.00029710418816752667, "loss": 6.7574, "step": 121200 }, { "epoch": 19.408, "grad_norm": 0.13268060982227325, "learning_rate": 0.00029710178807152283, "loss": 6.4729, "step": 121300 }, { "epoch": 19.424, "grad_norm": 0.18852052092552185, "learning_rate": 0.000297099387975519, "loss": 6.7246, "step": 121400 }, { "epoch": 19.44, "grad_norm": 0.20590665936470032, "learning_rate": 0.00029709698787951517, "loss": 6.7032, "step": 121500 }, { "epoch": 19.456, "grad_norm": 0.18409046530723572, "learning_rate": 0.00029709458778351134, "loss": 6.9088, "step": 121600 }, { "epoch": 19.472, "grad_norm": 0.1330518126487732, "learning_rate": 0.00029709218768750746, "loss": 6.7912, "step": 121700 }, { "epoch": 19.488, "grad_norm": 0.17881762981414795, "learning_rate": 0.0002970897875915036, "loss": 6.6976, "step": 121800 }, { "epoch": 19.504, "grad_norm": 0.1952984780073166, "learning_rate": 0.0002970873874954998, "loss": 6.6684, "step": 121900 }, { "epoch": 19.52, "grad_norm": 0.10283193737268448, "learning_rate": 0.00029708498739949596, "loss": 6.8239, "step": 122000 }, { "epoch": 19.536, "grad_norm": 0.14318746328353882, "learning_rate": 0.00029708258730349213, "loss": 6.3829, "step": 122100 }, { "epoch": 19.552, "grad_norm": 0.27563196420669556, "learning_rate": 0.00029708018720748825, "loss": 6.5011, "step": 122200 }, { "epoch": 19.568, "grad_norm": 0.22338111698627472, "learning_rate": 0.0002970777871114844, "loss": 6.5485, "step": 122300 }, { "epoch": 19.584, "grad_norm": 0.12649616599082947, "learning_rate": 0.0002970753870154806, "loss": 6.7374, "step": 122400 }, { "epoch": 19.6, "grad_norm": 0.15860269963741302, "learning_rate": 0.00029707298691947676, "loss": 6.3596, "step": 122500 }, { "epoch": 19.616, "grad_norm": 0.12358345836400986, "learning_rate": 0.00029707061082443294, "loss": 6.3242, "step": 122600 }, { "epoch": 19.632, "grad_norm": 0.16506068408489227, "learning_rate": 0.0002970682107284291, "loss": 6.5935, "step": 122700 }, { "epoch": 19.648, "grad_norm": 0.19951657950878143, "learning_rate": 0.0002970658106324253, "loss": 6.4781, "step": 122800 }, { "epoch": 19.664, "grad_norm": 0.16879688203334808, "learning_rate": 0.00029706341053642145, "loss": 6.4468, "step": 122900 }, { "epoch": 19.68, "grad_norm": 0.14565648138523102, "learning_rate": 0.0002970610104404176, "loss": 6.635, "step": 123000 }, { "epoch": 19.696, "grad_norm": 0.12739145755767822, "learning_rate": 0.00029705861034441374, "loss": 6.7823, "step": 123100 }, { "epoch": 19.712, "grad_norm": 0.1428256332874298, "learning_rate": 0.0002970562102484099, "loss": 6.3011, "step": 123200 }, { "epoch": 19.728, "grad_norm": 0.1541672646999359, "learning_rate": 0.0002970538101524061, "loss": 6.93, "step": 123300 }, { "epoch": 19.744, "grad_norm": 0.14009244740009308, "learning_rate": 0.00029705141005640224, "loss": 6.4553, "step": 123400 }, { "epoch": 19.76, "grad_norm": 0.1925840973854065, "learning_rate": 0.0002970490099603984, "loss": 6.812, "step": 123500 }, { "epoch": 19.776, "grad_norm": 0.1624009907245636, "learning_rate": 0.0002970466098643946, "loss": 6.644, "step": 123600 }, { "epoch": 19.792, "grad_norm": 0.12902632355690002, "learning_rate": 0.0002970442097683907, "loss": 6.8444, "step": 123700 }, { "epoch": 19.808, "grad_norm": 0.1572074443101883, "learning_rate": 0.00029704180967238687, "loss": 6.8285, "step": 123800 }, { "epoch": 19.824, "grad_norm": 0.17196834087371826, "learning_rate": 0.00029703940957638304, "loss": 6.318, "step": 123900 }, { "epoch": 19.84, "grad_norm": 0.14329147338867188, "learning_rate": 0.0002970370094803792, "loss": 6.5197, "step": 124000 }, { "epoch": 19.856, "grad_norm": 0.12039805948734283, "learning_rate": 0.0002970346093843754, "loss": 6.3033, "step": 124100 }, { "epoch": 19.872, "grad_norm": 0.1786791980266571, "learning_rate": 0.0002970322092883715, "loss": 6.669, "step": 124200 }, { "epoch": 19.888, "grad_norm": 0.12987840175628662, "learning_rate": 0.00029702980919236766, "loss": 6.2543, "step": 124300 }, { "epoch": 19.904, "grad_norm": 0.12259730696678162, "learning_rate": 0.00029702740909636383, "loss": 6.4946, "step": 124400 }, { "epoch": 19.92, "grad_norm": 0.10069935768842697, "learning_rate": 0.00029702500900036, "loss": 6.7976, "step": 124500 }, { "epoch": 19.936, "grad_norm": 0.14555324614048004, "learning_rate": 0.00029702260890435617, "loss": 6.3994, "step": 124600 }, { "epoch": 19.951999999999998, "grad_norm": 0.15070566534996033, "learning_rate": 0.00029702020880835234, "loss": 6.3558, "step": 124700 }, { "epoch": 19.968, "grad_norm": 0.13936389982700348, "learning_rate": 0.00029701780871234845, "loss": 6.369, "step": 124800 }, { "epoch": 19.984, "grad_norm": 0.20414897799491882, "learning_rate": 0.0002970154086163446, "loss": 6.4591, "step": 124900 }, { "epoch": 20.0, "grad_norm": 0.17090056836605072, "learning_rate": 0.0002970130085203408, "loss": 6.6428, "step": 125000 }, { "epoch": 20.016, "grad_norm": 0.13628321886062622, "learning_rate": 0.00029701060842433696, "loss": 6.6142, "step": 125100 }, { "epoch": 20.032, "grad_norm": 0.1602114588022232, "learning_rate": 0.0002970082083283331, "loss": 6.2906, "step": 125200 }, { "epoch": 20.048, "grad_norm": 0.16529148817062378, "learning_rate": 0.00029700580823232924, "loss": 6.32, "step": 125300 }, { "epoch": 20.064, "grad_norm": 0.09591558575630188, "learning_rate": 0.0002970034081363254, "loss": 6.5236, "step": 125400 }, { "epoch": 20.08, "grad_norm": 0.16209086775779724, "learning_rate": 0.0002970010080403216, "loss": 6.0982, "step": 125500 }, { "epoch": 20.096, "grad_norm": 0.14823907613754272, "learning_rate": 0.00029699860794431775, "loss": 6.5177, "step": 125600 }, { "epoch": 20.112, "grad_norm": 0.14667312800884247, "learning_rate": 0.0002969962078483139, "loss": 6.2496, "step": 125700 }, { "epoch": 20.128, "grad_norm": 0.14101973176002502, "learning_rate": 0.0002969938077523101, "loss": 6.4982, "step": 125800 }, { "epoch": 20.144, "grad_norm": 0.15947328507900238, "learning_rate": 0.0002969914076563062, "loss": 6.2799, "step": 125900 }, { "epoch": 20.16, "grad_norm": 0.1501172035932541, "learning_rate": 0.00029698900756030237, "loss": 6.3317, "step": 126000 }, { "epoch": 20.176, "grad_norm": 0.15825922787189484, "learning_rate": 0.00029698660746429854, "loss": 6.2838, "step": 126100 }, { "epoch": 20.192, "grad_norm": 0.14270856976509094, "learning_rate": 0.00029698423136925473, "loss": 6.2077, "step": 126200 }, { "epoch": 20.208, "grad_norm": 0.1994931846857071, "learning_rate": 0.0002969818312732509, "loss": 6.3276, "step": 126300 }, { "epoch": 20.224, "grad_norm": 0.2308851182460785, "learning_rate": 0.00029697943117724707, "loss": 6.3211, "step": 126400 }, { "epoch": 20.24, "grad_norm": 0.21615839004516602, "learning_rate": 0.00029697703108124324, "loss": 6.2481, "step": 126500 }, { "epoch": 20.256, "grad_norm": 0.14972296357154846, "learning_rate": 0.0002969746309852394, "loss": 6.3543, "step": 126600 }, { "epoch": 20.272, "grad_norm": 0.164517343044281, "learning_rate": 0.0002969722308892356, "loss": 6.3991, "step": 126700 }, { "epoch": 20.288, "grad_norm": 0.15623216331005096, "learning_rate": 0.0002969698307932317, "loss": 6.6786, "step": 126800 }, { "epoch": 20.304, "grad_norm": 0.1451660692691803, "learning_rate": 0.00029696743069722786, "loss": 6.2966, "step": 126900 }, { "epoch": 20.32, "grad_norm": 0.17200326919555664, "learning_rate": 0.00029696503060122403, "loss": 6.4685, "step": 127000 }, { "epoch": 20.336, "grad_norm": 0.15096783638000488, "learning_rate": 0.0002969626305052202, "loss": 6.2486, "step": 127100 }, { "epoch": 20.352, "grad_norm": 0.14257729053497314, "learning_rate": 0.00029696023040921637, "loss": 6.2078, "step": 127200 }, { "epoch": 20.368, "grad_norm": 0.21399612724781036, "learning_rate": 0.0002969578303132125, "loss": 6.0766, "step": 127300 }, { "epoch": 20.384, "grad_norm": 0.11737848818302155, "learning_rate": 0.00029695543021720865, "loss": 6.3663, "step": 127400 }, { "epoch": 20.4, "grad_norm": 0.13575823605060577, "learning_rate": 0.0002969530301212048, "loss": 6.202, "step": 127500 }, { "epoch": 20.416, "grad_norm": 0.15899422764778137, "learning_rate": 0.000296950630025201, "loss": 6.0727, "step": 127600 }, { "epoch": 20.432, "grad_norm": 0.18363483250141144, "learning_rate": 0.00029694822992919716, "loss": 6.594, "step": 127700 }, { "epoch": 20.448, "grad_norm": 0.1325751096010208, "learning_rate": 0.00029694582983319333, "loss": 6.532, "step": 127800 }, { "epoch": 20.464, "grad_norm": 0.13950107991695404, "learning_rate": 0.00029694342973718944, "loss": 5.9695, "step": 127900 }, { "epoch": 20.48, "grad_norm": 0.09819541126489639, "learning_rate": 0.0002969410296411856, "loss": 6.3775, "step": 128000 }, { "epoch": 20.496, "grad_norm": 0.15788622200489044, "learning_rate": 0.0002969386295451818, "loss": 6.5626, "step": 128100 }, { "epoch": 20.512, "grad_norm": 0.1338583081960678, "learning_rate": 0.00029693622944917795, "loss": 6.3808, "step": 128200 }, { "epoch": 20.528, "grad_norm": 0.1711709052324295, "learning_rate": 0.0002969338293531741, "loss": 6.3297, "step": 128300 }, { "epoch": 20.544, "grad_norm": 0.10356644541025162, "learning_rate": 0.00029693142925717023, "loss": 6.2275, "step": 128400 }, { "epoch": 20.56, "grad_norm": 0.17266201972961426, "learning_rate": 0.0002969290291611664, "loss": 6.399, "step": 128500 }, { "epoch": 20.576, "grad_norm": 0.1582164466381073, "learning_rate": 0.0002969266290651626, "loss": 6.186, "step": 128600 }, { "epoch": 20.592, "grad_norm": 0.15661326050758362, "learning_rate": 0.00029692422896915874, "loss": 6.3988, "step": 128700 }, { "epoch": 20.608, "grad_norm": 0.12148367613554001, "learning_rate": 0.00029692185287411493, "loss": 6.4026, "step": 128800 }, { "epoch": 20.624, "grad_norm": 0.15861108899116516, "learning_rate": 0.0002969194527781111, "loss": 6.1632, "step": 128900 }, { "epoch": 20.64, "grad_norm": 0.21511606872081757, "learning_rate": 0.00029691705268210727, "loss": 6.1254, "step": 129000 }, { "epoch": 20.656, "grad_norm": 0.17380183935165405, "learning_rate": 0.00029691465258610344, "loss": 5.8979, "step": 129100 }, { "epoch": 20.672, "grad_norm": 0.15295742452144623, "learning_rate": 0.0002969122524900996, "loss": 6.1504, "step": 129200 }, { "epoch": 20.688, "grad_norm": 0.14123979210853577, "learning_rate": 0.0002969098523940957, "loss": 6.3968, "step": 129300 }, { "epoch": 20.704, "grad_norm": 0.11941767483949661, "learning_rate": 0.0002969074522980919, "loss": 6.2761, "step": 129400 }, { "epoch": 20.72, "grad_norm": 0.1716291755437851, "learning_rate": 0.00029690505220208806, "loss": 6.1725, "step": 129500 }, { "epoch": 20.736, "grad_norm": 0.10485927015542984, "learning_rate": 0.00029690265210608423, "loss": 6.3992, "step": 129600 }, { "epoch": 20.752, "grad_norm": 0.14606288075447083, "learning_rate": 0.0002969002520100804, "loss": 6.3221, "step": 129700 }, { "epoch": 20.768, "grad_norm": 0.1599857658147812, "learning_rate": 0.00029689785191407657, "loss": 6.4159, "step": 129800 }, { "epoch": 20.784, "grad_norm": 0.1607884019613266, "learning_rate": 0.0002968954518180727, "loss": 6.2899, "step": 129900 }, { "epoch": 20.8, "grad_norm": 0.17046970129013062, "learning_rate": 0.00029689305172206885, "loss": 6.195, "step": 130000 }, { "epoch": 20.816, "grad_norm": 0.17893536388874054, "learning_rate": 0.000296890651626065, "loss": 6.3987, "step": 130100 }, { "epoch": 20.832, "grad_norm": 0.15878397226333618, "learning_rate": 0.0002968882515300612, "loss": 6.8826, "step": 130200 }, { "epoch": 20.848, "grad_norm": 0.17702220380306244, "learning_rate": 0.00029688585143405736, "loss": 6.4912, "step": 130300 }, { "epoch": 20.864, "grad_norm": 0.1281166672706604, "learning_rate": 0.0002968834513380535, "loss": 6.5531, "step": 130400 }, { "epoch": 20.88, "grad_norm": 0.16799704730510712, "learning_rate": 0.00029688105124204964, "loss": 5.9929, "step": 130500 }, { "epoch": 20.896, "grad_norm": 0.1236133724451065, "learning_rate": 0.0002968786511460458, "loss": 6.0232, "step": 130600 }, { "epoch": 20.912, "grad_norm": 0.1369544267654419, "learning_rate": 0.000296876251050042, "loss": 6.5761, "step": 130700 }, { "epoch": 20.928, "grad_norm": 0.13266846537590027, "learning_rate": 0.00029687385095403815, "loss": 6.1677, "step": 130800 }, { "epoch": 20.944, "grad_norm": 0.11849372833967209, "learning_rate": 0.0002968714508580343, "loss": 6.0787, "step": 130900 }, { "epoch": 20.96, "grad_norm": 0.11395172029733658, "learning_rate": 0.00029686905076203044, "loss": 6.2634, "step": 131000 }, { "epoch": 20.976, "grad_norm": 0.11821906268596649, "learning_rate": 0.0002968666746669866, "loss": 6.388, "step": 131100 }, { "epoch": 20.992, "grad_norm": 0.12622199952602386, "learning_rate": 0.00029686427457098285, "loss": 6.0103, "step": 131200 }, { "epoch": 21.008, "grad_norm": 0.16676801443099976, "learning_rate": 0.00029686187447497896, "loss": 5.865, "step": 131300 }, { "epoch": 21.024, "grad_norm": 0.15502384305000305, "learning_rate": 0.00029685947437897513, "loss": 6.165, "step": 131400 }, { "epoch": 21.04, "grad_norm": 0.24440471827983856, "learning_rate": 0.0002968570742829713, "loss": 5.9314, "step": 131500 }, { "epoch": 21.056, "grad_norm": 0.1315223127603531, "learning_rate": 0.00029685467418696747, "loss": 6.0678, "step": 131600 }, { "epoch": 21.072, "grad_norm": 0.1865660399198532, "learning_rate": 0.00029685227409096364, "loss": 5.9805, "step": 131700 }, { "epoch": 21.088, "grad_norm": 0.2066924124956131, "learning_rate": 0.0002968498739949598, "loss": 6.1499, "step": 131800 }, { "epoch": 21.104, "grad_norm": 0.14284636080265045, "learning_rate": 0.0002968474738989559, "loss": 5.7731, "step": 131900 }, { "epoch": 21.12, "grad_norm": 0.15058225393295288, "learning_rate": 0.0002968450738029521, "loss": 6.1113, "step": 132000 }, { "epoch": 21.136, "grad_norm": 0.12619538605213165, "learning_rate": 0.00029684267370694826, "loss": 5.9437, "step": 132100 }, { "epoch": 21.152, "grad_norm": 0.15766064822673798, "learning_rate": 0.00029684027361094443, "loss": 6.2503, "step": 132200 }, { "epoch": 21.168, "grad_norm": 0.14563268423080444, "learning_rate": 0.0002968378735149406, "loss": 5.96, "step": 132300 }, { "epoch": 21.184, "grad_norm": 0.14157824218273163, "learning_rate": 0.0002968354734189367, "loss": 6.1794, "step": 132400 }, { "epoch": 21.2, "grad_norm": 0.18574143946170807, "learning_rate": 0.0002968330733229329, "loss": 6.3155, "step": 132500 }, { "epoch": 21.216, "grad_norm": 0.11855421960353851, "learning_rate": 0.00029683067322692905, "loss": 6.4108, "step": 132600 }, { "epoch": 21.232, "grad_norm": 0.12140708416700363, "learning_rate": 0.0002968282731309252, "loss": 6.0888, "step": 132700 }, { "epoch": 21.248, "grad_norm": 0.17192867398262024, "learning_rate": 0.0002968258730349214, "loss": 6.2884, "step": 132800 }, { "epoch": 21.264, "grad_norm": 0.13360394537448883, "learning_rate": 0.00029682347293891756, "loss": 6.1993, "step": 132900 }, { "epoch": 21.28, "grad_norm": 0.16163136065006256, "learning_rate": 0.0002968210968438737, "loss": 6.2262, "step": 133000 }, { "epoch": 21.296, "grad_norm": 0.12919676303863525, "learning_rate": 0.00029681869674786987, "loss": 5.8, "step": 133100 }, { "epoch": 21.312, "grad_norm": 0.1594499945640564, "learning_rate": 0.00029681629665186603, "loss": 5.8055, "step": 133200 }, { "epoch": 21.328, "grad_norm": 0.12262352555990219, "learning_rate": 0.0002968138965558622, "loss": 5.6412, "step": 133300 }, { "epoch": 21.344, "grad_norm": 0.16952601075172424, "learning_rate": 0.0002968114964598584, "loss": 6.0173, "step": 133400 }, { "epoch": 21.36, "grad_norm": 0.17378447949886322, "learning_rate": 0.0002968090963638545, "loss": 5.5105, "step": 133500 }, { "epoch": 21.376, "grad_norm": 0.12117540836334229, "learning_rate": 0.00029680669626785066, "loss": 6.5432, "step": 133600 }, { "epoch": 21.392, "grad_norm": 0.15760718286037445, "learning_rate": 0.0002968042961718468, "loss": 5.6998, "step": 133700 }, { "epoch": 21.408, "grad_norm": 0.20163291692733765, "learning_rate": 0.000296801896075843, "loss": 5.9457, "step": 133800 }, { "epoch": 21.424, "grad_norm": 0.1601804941892624, "learning_rate": 0.00029679949597983916, "loss": 5.7331, "step": 133900 }, { "epoch": 21.44, "grad_norm": 0.147283673286438, "learning_rate": 0.00029679709588383533, "loss": 6.034, "step": 134000 }, { "epoch": 21.456, "grad_norm": 0.1677253395318985, "learning_rate": 0.00029679469578783145, "loss": 6.4454, "step": 134100 }, { "epoch": 21.472, "grad_norm": 0.1402285099029541, "learning_rate": 0.0002967922956918276, "loss": 5.9842, "step": 134200 }, { "epoch": 21.488, "grad_norm": 0.185127392411232, "learning_rate": 0.00029678989559582384, "loss": 6.0976, "step": 134300 }, { "epoch": 21.504, "grad_norm": 0.17136482894420624, "learning_rate": 0.00029678749549981996, "loss": 6.3848, "step": 134400 }, { "epoch": 21.52, "grad_norm": 0.14343611896038055, "learning_rate": 0.0002967850954038161, "loss": 6.1087, "step": 134500 }, { "epoch": 21.536, "grad_norm": 0.13721515238285065, "learning_rate": 0.0002967826953078123, "loss": 6.0383, "step": 134600 }, { "epoch": 21.552, "grad_norm": 0.13419759273529053, "learning_rate": 0.00029678029521180846, "loss": 5.8767, "step": 134700 }, { "epoch": 21.568, "grad_norm": 0.18504373729228973, "learning_rate": 0.00029677789511580463, "loss": 6.0607, "step": 134800 }, { "epoch": 21.584, "grad_norm": 0.14880910515785217, "learning_rate": 0.0002967754950198008, "loss": 5.9108, "step": 134900 }, { "epoch": 21.6, "grad_norm": 0.13054971396923065, "learning_rate": 0.0002967730949237969, "loss": 6.0197, "step": 135000 }, { "epoch": 21.616, "grad_norm": 0.16096660494804382, "learning_rate": 0.0002967706948277931, "loss": 5.8114, "step": 135100 }, { "epoch": 21.632, "grad_norm": 0.16552191972732544, "learning_rate": 0.00029676829473178926, "loss": 6.2389, "step": 135200 }, { "epoch": 21.648, "grad_norm": 0.13705958425998688, "learning_rate": 0.0002967658946357854, "loss": 6.2474, "step": 135300 }, { "epoch": 21.664, "grad_norm": 0.17535176873207092, "learning_rate": 0.0002967634945397816, "loss": 6.0806, "step": 135400 }, { "epoch": 21.68, "grad_norm": 0.15185397863388062, "learning_rate": 0.0002967610944437777, "loss": 6.2673, "step": 135500 }, { "epoch": 21.696, "grad_norm": 0.1459989696741104, "learning_rate": 0.0002967586943477739, "loss": 6.1566, "step": 135600 }, { "epoch": 21.712, "grad_norm": 0.1216706857085228, "learning_rate": 0.00029675629425177005, "loss": 5.9801, "step": 135700 }, { "epoch": 21.728, "grad_norm": 0.1349131315946579, "learning_rate": 0.0002967538941557662, "loss": 5.8902, "step": 135800 }, { "epoch": 21.744, "grad_norm": 0.14793895184993744, "learning_rate": 0.0002967514940597624, "loss": 5.7143, "step": 135900 }, { "epoch": 21.76, "grad_norm": 0.171220600605011, "learning_rate": 0.00029674909396375855, "loss": 5.7715, "step": 136000 }, { "epoch": 21.776, "grad_norm": 0.18677209317684174, "learning_rate": 0.00029674669386775467, "loss": 5.9996, "step": 136100 }, { "epoch": 21.792, "grad_norm": 0.153004989027977, "learning_rate": 0.00029674429377175084, "loss": 6.1678, "step": 136200 }, { "epoch": 21.808, "grad_norm": 0.12716227769851685, "learning_rate": 0.000296741893675747, "loss": 5.8525, "step": 136300 }, { "epoch": 21.824, "grad_norm": 0.15531957149505615, "learning_rate": 0.0002967394935797432, "loss": 5.703, "step": 136400 }, { "epoch": 21.84, "grad_norm": 0.16813132166862488, "learning_rate": 0.00029673709348373935, "loss": 5.7367, "step": 136500 }, { "epoch": 21.856, "grad_norm": 0.1366407722234726, "learning_rate": 0.0002967346933877355, "loss": 6.4011, "step": 136600 }, { "epoch": 21.872, "grad_norm": 0.1486620455980301, "learning_rate": 0.00029673229329173163, "loss": 6.0592, "step": 136700 }, { "epoch": 21.888, "grad_norm": 0.1474551409482956, "learning_rate": 0.0002967298931957278, "loss": 6.1269, "step": 136800 }, { "epoch": 21.904, "grad_norm": 0.1317261904478073, "learning_rate": 0.00029672749309972397, "loss": 6.2704, "step": 136900 }, { "epoch": 21.92, "grad_norm": 0.12736591696739197, "learning_rate": 0.00029672511700468016, "loss": 5.9018, "step": 137000 }, { "epoch": 21.936, "grad_norm": 0.17512458562850952, "learning_rate": 0.0002967227169086763, "loss": 6.1423, "step": 137100 }, { "epoch": 21.951999999999998, "grad_norm": 0.2035478949546814, "learning_rate": 0.0002967203408136325, "loss": 5.8421, "step": 137200 }, { "epoch": 21.968, "grad_norm": 0.15790584683418274, "learning_rate": 0.0002967179407176287, "loss": 5.6449, "step": 137300 }, { "epoch": 21.984, "grad_norm": 0.13050822913646698, "learning_rate": 0.00029671554062162485, "loss": 6.0866, "step": 137400 }, { "epoch": 22.0, "grad_norm": 0.1332990825176239, "learning_rate": 0.00029671314052562097, "loss": 5.8362, "step": 137500 }, { "epoch": 22.016, "grad_norm": 0.14409734308719635, "learning_rate": 0.00029671074042961714, "loss": 5.7401, "step": 137600 }, { "epoch": 22.032, "grad_norm": 0.1513838768005371, "learning_rate": 0.0002967083403336133, "loss": 5.8022, "step": 137700 }, { "epoch": 22.048, "grad_norm": 0.14416912198066711, "learning_rate": 0.0002967059402376095, "loss": 5.7687, "step": 137800 }, { "epoch": 22.064, "grad_norm": 0.13069897890090942, "learning_rate": 0.00029670354014160565, "loss": 5.7314, "step": 137900 }, { "epoch": 22.08, "grad_norm": 0.15089532732963562, "learning_rate": 0.0002967011400456018, "loss": 5.6511, "step": 138000 }, { "epoch": 22.096, "grad_norm": 0.1493406444787979, "learning_rate": 0.00029669873994959793, "loss": 5.7553, "step": 138100 }, { "epoch": 22.112, "grad_norm": 0.11403771489858627, "learning_rate": 0.0002966963398535941, "loss": 5.8785, "step": 138200 }, { "epoch": 22.128, "grad_norm": 0.1418454647064209, "learning_rate": 0.00029669393975759027, "loss": 5.906, "step": 138300 }, { "epoch": 22.144, "grad_norm": 0.14632883667945862, "learning_rate": 0.00029669153966158644, "loss": 5.7911, "step": 138400 }, { "epoch": 22.16, "grad_norm": 0.18317896127700806, "learning_rate": 0.0002966891395655826, "loss": 5.6022, "step": 138500 }, { "epoch": 22.176, "grad_norm": 0.14640462398529053, "learning_rate": 0.0002966867394695788, "loss": 5.6879, "step": 138600 }, { "epoch": 22.192, "grad_norm": 0.11322261393070221, "learning_rate": 0.0002966843393735749, "loss": 5.679, "step": 138700 }, { "epoch": 22.208, "grad_norm": 0.14412596821784973, "learning_rate": 0.00029668193927757106, "loss": 5.6202, "step": 138800 }, { "epoch": 22.224, "grad_norm": 0.14023444056510925, "learning_rate": 0.00029667953918156723, "loss": 6.0133, "step": 138900 }, { "epoch": 22.24, "grad_norm": 0.18092051148414612, "learning_rate": 0.0002966771390855634, "loss": 5.6881, "step": 139000 }, { "epoch": 22.256, "grad_norm": 0.13267236948013306, "learning_rate": 0.00029667473898955957, "loss": 5.742, "step": 139100 }, { "epoch": 22.272, "grad_norm": 0.1066688597202301, "learning_rate": 0.0002966723388935557, "loss": 5.9524, "step": 139200 }, { "epoch": 22.288, "grad_norm": 0.17234094440937042, "learning_rate": 0.00029666993879755185, "loss": 6.0385, "step": 139300 }, { "epoch": 22.304, "grad_norm": 0.1593136042356491, "learning_rate": 0.000296667538701548, "loss": 5.7894, "step": 139400 }, { "epoch": 22.32, "grad_norm": 0.1161966621875763, "learning_rate": 0.0002966651386055442, "loss": 5.6333, "step": 139500 }, { "epoch": 22.336, "grad_norm": 0.16088221967220306, "learning_rate": 0.00029666273850954036, "loss": 5.3016, "step": 139600 }, { "epoch": 22.352, "grad_norm": 0.195027694106102, "learning_rate": 0.00029666033841353653, "loss": 5.8886, "step": 139700 }, { "epoch": 22.368, "grad_norm": 0.17010509967803955, "learning_rate": 0.00029665793831753264, "loss": 5.7462, "step": 139800 }, { "epoch": 22.384, "grad_norm": 0.15900500118732452, "learning_rate": 0.0002966555382215288, "loss": 6.1951, "step": 139900 }, { "epoch": 22.4, "grad_norm": 0.20321440696716309, "learning_rate": 0.000296653138125525, "loss": 5.8264, "step": 140000 }, { "epoch": 22.416, "grad_norm": 0.21823586523532867, "learning_rate": 0.00029665073802952115, "loss": 5.7779, "step": 140100 }, { "epoch": 22.432, "grad_norm": 0.12739881873130798, "learning_rate": 0.0002966483379335173, "loss": 5.6477, "step": 140200 }, { "epoch": 22.448, "grad_norm": 0.1288122534751892, "learning_rate": 0.00029664593783751344, "loss": 5.5937, "step": 140300 }, { "epoch": 22.464, "grad_norm": 0.12690824270248413, "learning_rate": 0.0002966435377415096, "loss": 6.0249, "step": 140400 }, { "epoch": 22.48, "grad_norm": 0.16361913084983826, "learning_rate": 0.00029664113764550583, "loss": 5.8957, "step": 140500 }, { "epoch": 22.496, "grad_norm": 0.13729694485664368, "learning_rate": 0.000296638737549502, "loss": 5.8405, "step": 140600 }, { "epoch": 22.512, "grad_norm": 0.19917264580726624, "learning_rate": 0.0002966363374534981, "loss": 5.9084, "step": 140700 }, { "epoch": 22.528, "grad_norm": 0.15145164728164673, "learning_rate": 0.0002966339373574943, "loss": 5.4631, "step": 140800 }, { "epoch": 22.544, "grad_norm": 0.11967241019010544, "learning_rate": 0.00029663153726149045, "loss": 5.9098, "step": 140900 }, { "epoch": 22.56, "grad_norm": 0.15000027418136597, "learning_rate": 0.0002966291371654866, "loss": 5.7238, "step": 141000 }, { "epoch": 22.576, "grad_norm": 0.16883157193660736, "learning_rate": 0.0002966267370694828, "loss": 5.738, "step": 141100 }, { "epoch": 22.592, "grad_norm": 0.13367842137813568, "learning_rate": 0.0002966243369734789, "loss": 5.5043, "step": 141200 }, { "epoch": 22.608, "grad_norm": 0.15113677084445953, "learning_rate": 0.00029662193687747507, "loss": 5.6651, "step": 141300 }, { "epoch": 22.624, "grad_norm": 0.13519582152366638, "learning_rate": 0.00029661953678147124, "loss": 5.9082, "step": 141400 }, { "epoch": 22.64, "grad_norm": 0.15879906713962555, "learning_rate": 0.0002966171366854674, "loss": 6.094, "step": 141500 }, { "epoch": 22.656, "grad_norm": 0.16288715600967407, "learning_rate": 0.0002966147365894636, "loss": 5.5707, "step": 141600 }, { "epoch": 22.672, "grad_norm": 0.14412395656108856, "learning_rate": 0.00029661233649345975, "loss": 5.6827, "step": 141700 }, { "epoch": 22.688, "grad_norm": 0.14847436547279358, "learning_rate": 0.00029660993639745586, "loss": 5.4179, "step": 141800 }, { "epoch": 22.704, "grad_norm": 0.13256803154945374, "learning_rate": 0.00029660753630145203, "loss": 5.6927, "step": 141900 }, { "epoch": 22.72, "grad_norm": 0.13526926934719086, "learning_rate": 0.0002966051362054482, "loss": 5.7505, "step": 142000 }, { "epoch": 22.736, "grad_norm": 0.2226150929927826, "learning_rate": 0.00029660273610944437, "loss": 5.6683, "step": 142100 }, { "epoch": 22.752, "grad_norm": 0.12251828610897064, "learning_rate": 0.00029660033601344054, "loss": 5.4908, "step": 142200 }, { "epoch": 22.768, "grad_norm": 0.15432491898536682, "learning_rate": 0.00029659793591743666, "loss": 5.5662, "step": 142300 }, { "epoch": 22.784, "grad_norm": 0.13890361785888672, "learning_rate": 0.0002965955358214328, "loss": 5.6202, "step": 142400 }, { "epoch": 22.8, "grad_norm": 0.10568337142467499, "learning_rate": 0.000296593135725429, "loss": 5.7232, "step": 142500 }, { "epoch": 22.816, "grad_norm": 0.14877153933048248, "learning_rate": 0.00029659073562942516, "loss": 5.4585, "step": 142600 }, { "epoch": 22.832, "grad_norm": 0.1703936904668808, "learning_rate": 0.00029658833553342133, "loss": 5.8294, "step": 142700 }, { "epoch": 22.848, "grad_norm": 0.12574242055416107, "learning_rate": 0.0002965859594383775, "loss": 6.0963, "step": 142800 }, { "epoch": 22.864, "grad_norm": 0.1556757390499115, "learning_rate": 0.00029658355934237364, "loss": 5.6681, "step": 142900 }, { "epoch": 22.88, "grad_norm": 0.14058822393417358, "learning_rate": 0.0002965811592463698, "loss": 5.6148, "step": 143000 }, { "epoch": 22.896, "grad_norm": 0.1746063232421875, "learning_rate": 0.000296578759150366, "loss": 5.698, "step": 143100 }, { "epoch": 22.912, "grad_norm": 0.14458870887756348, "learning_rate": 0.00029657635905436214, "loss": 5.439, "step": 143200 }, { "epoch": 22.928, "grad_norm": 0.1708308756351471, "learning_rate": 0.0002965739589583583, "loss": 5.8077, "step": 143300 }, { "epoch": 22.944, "grad_norm": 0.1382734328508377, "learning_rate": 0.00029657155886235443, "loss": 5.603, "step": 143400 }, { "epoch": 22.96, "grad_norm": 0.15728691220283508, "learning_rate": 0.0002965691587663506, "loss": 5.8985, "step": 143500 }, { "epoch": 22.976, "grad_norm": 0.12880076467990875, "learning_rate": 0.00029656675867034677, "loss": 5.7958, "step": 143600 }, { "epoch": 22.992, "grad_norm": 0.130670964717865, "learning_rate": 0.000296564358574343, "loss": 5.6226, "step": 143700 }, { "epoch": 23.008, "grad_norm": 0.1519329994916916, "learning_rate": 0.0002965619584783391, "loss": 5.5619, "step": 143800 }, { "epoch": 23.024, "grad_norm": 0.11900737136602402, "learning_rate": 0.0002965595583823353, "loss": 5.5148, "step": 143900 }, { "epoch": 23.04, "grad_norm": 0.13805437088012695, "learning_rate": 0.00029655715828633144, "loss": 5.1992, "step": 144000 }, { "epoch": 23.056, "grad_norm": 0.15381775796413422, "learning_rate": 0.0002965547581903276, "loss": 5.6994, "step": 144100 }, { "epoch": 23.072, "grad_norm": 0.17571000754833221, "learning_rate": 0.0002965523580943238, "loss": 5.4076, "step": 144200 }, { "epoch": 23.088, "grad_norm": 0.1299617439508438, "learning_rate": 0.0002965499579983199, "loss": 5.5817, "step": 144300 }, { "epoch": 23.104, "grad_norm": 0.1709066480398178, "learning_rate": 0.00029654755790231607, "loss": 5.6442, "step": 144400 }, { "epoch": 23.12, "grad_norm": 0.11673315614461899, "learning_rate": 0.00029654515780631224, "loss": 5.4461, "step": 144500 }, { "epoch": 23.136, "grad_norm": 0.17694547772407532, "learning_rate": 0.0002965427577103084, "loss": 5.4203, "step": 144600 }, { "epoch": 23.152, "grad_norm": 0.1397058516740799, "learning_rate": 0.0002965403576143046, "loss": 5.6535, "step": 144700 }, { "epoch": 23.168, "grad_norm": 0.14913706481456757, "learning_rate": 0.00029653795751830074, "loss": 5.327, "step": 144800 }, { "epoch": 23.184, "grad_norm": 0.0980440080165863, "learning_rate": 0.0002965355814232569, "loss": 5.6265, "step": 144900 }, { "epoch": 23.2, "grad_norm": 0.14519555866718292, "learning_rate": 0.00029653318132725305, "loss": 5.5968, "step": 145000 }, { "epoch": 23.216, "grad_norm": 0.14121969044208527, "learning_rate": 0.0002965307812312492, "loss": 5.3419, "step": 145100 }, { "epoch": 23.232, "grad_norm": 0.14867204427719116, "learning_rate": 0.0002965283811352454, "loss": 5.5432, "step": 145200 }, { "epoch": 23.248, "grad_norm": 0.14526410400867462, "learning_rate": 0.00029652598103924155, "loss": 5.4119, "step": 145300 }, { "epoch": 23.264, "grad_norm": 0.16068951785564423, "learning_rate": 0.00029652358094323767, "loss": 5.6084, "step": 145400 }, { "epoch": 23.28, "grad_norm": 0.1540200263261795, "learning_rate": 0.00029652118084723384, "loss": 5.3346, "step": 145500 }, { "epoch": 23.296, "grad_norm": 0.1306939572095871, "learning_rate": 0.00029651878075123, "loss": 5.4401, "step": 145600 }, { "epoch": 23.312, "grad_norm": 0.19503143429756165, "learning_rate": 0.0002965163806552262, "loss": 5.5145, "step": 145700 }, { "epoch": 23.328, "grad_norm": 0.16698400676250458, "learning_rate": 0.00029651398055922235, "loss": 5.4459, "step": 145800 }, { "epoch": 23.344, "grad_norm": 0.14990036189556122, "learning_rate": 0.0002965115804632185, "loss": 5.9844, "step": 145900 }, { "epoch": 23.36, "grad_norm": 0.12152257561683655, "learning_rate": 0.00029650918036721463, "loss": 5.4034, "step": 146000 }, { "epoch": 23.376, "grad_norm": 0.12588883936405182, "learning_rate": 0.0002965067802712108, "loss": 5.6587, "step": 146100 }, { "epoch": 23.392, "grad_norm": 0.13769680261611938, "learning_rate": 0.00029650438017520697, "loss": 5.6661, "step": 146200 }, { "epoch": 23.408, "grad_norm": 0.18270593881607056, "learning_rate": 0.00029650198007920314, "loss": 5.4772, "step": 146300 }, { "epoch": 23.424, "grad_norm": 0.16988155245780945, "learning_rate": 0.0002964995799831993, "loss": 5.861, "step": 146400 }, { "epoch": 23.44, "grad_norm": 0.15813444554805756, "learning_rate": 0.0002964971798871954, "loss": 5.5742, "step": 146500 }, { "epoch": 23.456, "grad_norm": 0.20319218933582306, "learning_rate": 0.0002964947797911916, "loss": 5.5046, "step": 146600 }, { "epoch": 23.472, "grad_norm": 0.1794954091310501, "learning_rate": 0.00029649237969518776, "loss": 5.4266, "step": 146700 }, { "epoch": 23.488, "grad_norm": 0.18233439326286316, "learning_rate": 0.000296489979599184, "loss": 5.7988, "step": 146800 }, { "epoch": 23.504, "grad_norm": 0.24476204812526703, "learning_rate": 0.0002964875795031801, "loss": 5.5573, "step": 146900 }, { "epoch": 23.52, "grad_norm": 0.12210160493850708, "learning_rate": 0.00029648517940717627, "loss": 5.3991, "step": 147000 }, { "epoch": 23.536, "grad_norm": 0.18380597233772278, "learning_rate": 0.00029648277931117244, "loss": 5.7061, "step": 147100 }, { "epoch": 23.552, "grad_norm": 0.14776001870632172, "learning_rate": 0.0002964803792151686, "loss": 5.6827, "step": 147200 }, { "epoch": 23.568, "grad_norm": 0.13290056586265564, "learning_rate": 0.0002964779791191648, "loss": 5.6598, "step": 147300 }, { "epoch": 23.584, "grad_norm": 0.12878666818141937, "learning_rate": 0.0002964755790231609, "loss": 5.4732, "step": 147400 }, { "epoch": 23.6, "grad_norm": 0.11875222623348236, "learning_rate": 0.00029647317892715706, "loss": 5.9345, "step": 147500 }, { "epoch": 23.616, "grad_norm": 0.1489972323179245, "learning_rate": 0.00029647077883115323, "loss": 5.5631, "step": 147600 }, { "epoch": 23.632, "grad_norm": 0.22594046592712402, "learning_rate": 0.0002964683787351494, "loss": 5.2854, "step": 147700 }, { "epoch": 23.648, "grad_norm": 0.14621250331401825, "learning_rate": 0.00029646597863914557, "loss": 5.2938, "step": 147800 }, { "epoch": 23.664, "grad_norm": 0.14641734957695007, "learning_rate": 0.00029646357854314174, "loss": 5.7265, "step": 147900 }, { "epoch": 23.68, "grad_norm": 0.14452804625034332, "learning_rate": 0.00029646117844713785, "loss": 5.3081, "step": 148000 }, { "epoch": 23.696, "grad_norm": 0.1696479767560959, "learning_rate": 0.000296458778351134, "loss": 5.7359, "step": 148100 }, { "epoch": 23.712, "grad_norm": 0.1629931777715683, "learning_rate": 0.0002964563782551302, "loss": 5.8091, "step": 148200 }, { "epoch": 23.728, "grad_norm": 0.1588413119316101, "learning_rate": 0.00029645397815912636, "loss": 5.8185, "step": 148300 }, { "epoch": 23.744, "grad_norm": 0.1528206616640091, "learning_rate": 0.00029645157806312253, "loss": 5.6945, "step": 148400 }, { "epoch": 23.76, "grad_norm": 0.16446250677108765, "learning_rate": 0.00029644917796711864, "loss": 5.1739, "step": 148500 }, { "epoch": 23.776, "grad_norm": 0.14487922191619873, "learning_rate": 0.00029644680187207483, "loss": 5.5836, "step": 148600 }, { "epoch": 23.792, "grad_norm": 0.297879159450531, "learning_rate": 0.0002964444257770311, "loss": 5.5247, "step": 148700 }, { "epoch": 23.808, "grad_norm": 0.1171737089753151, "learning_rate": 0.00029644202568102724, "loss": 5.3085, "step": 148800 }, { "epoch": 23.824, "grad_norm": 0.1464715600013733, "learning_rate": 0.00029643962558502336, "loss": 5.3029, "step": 148900 }, { "epoch": 23.84, "grad_norm": 0.16126649081707, "learning_rate": 0.0002964372254890195, "loss": 5.7273, "step": 149000 }, { "epoch": 23.856, "grad_norm": 0.10824692994356155, "learning_rate": 0.0002964348253930157, "loss": 5.3296, "step": 149100 }, { "epoch": 23.872, "grad_norm": 0.14661309123039246, "learning_rate": 0.00029643242529701187, "loss": 5.828, "step": 149200 }, { "epoch": 23.888, "grad_norm": 0.16918961703777313, "learning_rate": 0.00029643002520100803, "loss": 5.359, "step": 149300 }, { "epoch": 23.904, "grad_norm": 0.14028948545455933, "learning_rate": 0.00029642762510500415, "loss": 5.5027, "step": 149400 }, { "epoch": 23.92, "grad_norm": 0.15497733652591705, "learning_rate": 0.0002964252250090003, "loss": 5.7539, "step": 149500 }, { "epoch": 23.936, "grad_norm": 0.12349986284971237, "learning_rate": 0.0002964228249129965, "loss": 5.1582, "step": 149600 }, { "epoch": 23.951999999999998, "grad_norm": 0.1359599381685257, "learning_rate": 0.00029642042481699266, "loss": 5.4394, "step": 149700 }, { "epoch": 23.968, "grad_norm": 0.18629401922225952, "learning_rate": 0.0002964180247209888, "loss": 5.4743, "step": 149800 }, { "epoch": 23.984, "grad_norm": 0.1438770890235901, "learning_rate": 0.000296415624624985, "loss": 5.4707, "step": 149900 }, { "epoch": 24.0, "grad_norm": 0.11876608431339264, "learning_rate": 0.0002964132245289811, "loss": 5.2108, "step": 150000 }, { "epoch": 24.016, "grad_norm": 0.1379069685935974, "learning_rate": 0.0002964108244329773, "loss": 5.5858, "step": 150100 }, { "epoch": 24.032, "grad_norm": 0.15197959542274475, "learning_rate": 0.00029640842433697345, "loss": 5.3452, "step": 150200 }, { "epoch": 24.048, "grad_norm": 0.16093584895133972, "learning_rate": 0.0002964060242409696, "loss": 5.1725, "step": 150300 }, { "epoch": 24.064, "grad_norm": 0.14459937810897827, "learning_rate": 0.0002964036241449658, "loss": 5.529, "step": 150400 }, { "epoch": 24.08, "grad_norm": 0.15908825397491455, "learning_rate": 0.0002964012240489619, "loss": 5.0667, "step": 150500 }, { "epoch": 24.096, "grad_norm": 0.14320479333400726, "learning_rate": 0.00029639882395295807, "loss": 5.4541, "step": 150600 }, { "epoch": 24.112, "grad_norm": 0.1382274329662323, "learning_rate": 0.00029639642385695424, "loss": 5.4337, "step": 150700 }, { "epoch": 24.128, "grad_norm": 0.09485090523958206, "learning_rate": 0.0002963940237609504, "loss": 5.5169, "step": 150800 }, { "epoch": 24.144, "grad_norm": 0.1434488147497177, "learning_rate": 0.0002963916236649466, "loss": 5.1838, "step": 150900 }, { "epoch": 24.16, "grad_norm": 0.172550767660141, "learning_rate": 0.00029638922356894275, "loss": 5.4995, "step": 151000 }, { "epoch": 24.176, "grad_norm": 0.17296665906906128, "learning_rate": 0.00029638682347293886, "loss": 5.3814, "step": 151100 }, { "epoch": 24.192, "grad_norm": 0.13183431327342987, "learning_rate": 0.00029638442337693503, "loss": 5.4961, "step": 151200 }, { "epoch": 24.208, "grad_norm": 0.11805009096860886, "learning_rate": 0.0002963820472818913, "loss": 5.3575, "step": 151300 }, { "epoch": 24.224, "grad_norm": 0.1694483608007431, "learning_rate": 0.0002963796471858874, "loss": 5.4198, "step": 151400 }, { "epoch": 24.24, "grad_norm": 0.14694049954414368, "learning_rate": 0.00029637724708988356, "loss": 5.2369, "step": 151500 }, { "epoch": 24.256, "grad_norm": 0.14818693697452545, "learning_rate": 0.00029637484699387973, "loss": 5.5989, "step": 151600 }, { "epoch": 24.272, "grad_norm": 0.12142101675271988, "learning_rate": 0.0002963724468978759, "loss": 5.5808, "step": 151700 }, { "epoch": 24.288, "grad_norm": 0.1072693020105362, "learning_rate": 0.00029637004680187207, "loss": 5.2257, "step": 151800 }, { "epoch": 24.304, "grad_norm": 0.20452247560024261, "learning_rate": 0.00029636764670586824, "loss": 4.9512, "step": 151900 }, { "epoch": 24.32, "grad_norm": 0.13785667717456818, "learning_rate": 0.00029636524660986435, "loss": 5.3486, "step": 152000 }, { "epoch": 24.336, "grad_norm": 0.16348302364349365, "learning_rate": 0.0002963628465138605, "loss": 5.416, "step": 152100 }, { "epoch": 24.352, "grad_norm": 0.12873555719852448, "learning_rate": 0.0002963604464178567, "loss": 5.4854, "step": 152200 }, { "epoch": 24.368, "grad_norm": 0.14430370926856995, "learning_rate": 0.00029635804632185286, "loss": 5.083, "step": 152300 }, { "epoch": 24.384, "grad_norm": 0.14247077703475952, "learning_rate": 0.00029635564622584903, "loss": 5.2926, "step": 152400 }, { "epoch": 24.4, "grad_norm": 0.12942449748516083, "learning_rate": 0.00029635324612984514, "loss": 5.2287, "step": 152500 }, { "epoch": 24.416, "grad_norm": 0.1290571689605713, "learning_rate": 0.0002963508460338413, "loss": 5.1295, "step": 152600 }, { "epoch": 24.432, "grad_norm": 0.14392858743667603, "learning_rate": 0.0002963484459378375, "loss": 5.2795, "step": 152700 }, { "epoch": 24.448, "grad_norm": 0.10403969883918762, "learning_rate": 0.00029634604584183365, "loss": 5.4616, "step": 152800 }, { "epoch": 24.464, "grad_norm": 0.1357210874557495, "learning_rate": 0.0002963436457458298, "loss": 5.0671, "step": 152900 }, { "epoch": 24.48, "grad_norm": 0.162188321352005, "learning_rate": 0.000296341245649826, "loss": 5.1244, "step": 153000 }, { "epoch": 24.496, "grad_norm": 0.1423524171113968, "learning_rate": 0.0002963388455538221, "loss": 5.2658, "step": 153100 }, { "epoch": 24.512, "grad_norm": 0.15725597739219666, "learning_rate": 0.00029633644545781827, "loss": 5.4486, "step": 153200 }, { "epoch": 24.528, "grad_norm": 0.10184895247220993, "learning_rate": 0.00029633404536181444, "loss": 5.1975, "step": 153300 }, { "epoch": 24.544, "grad_norm": 0.11968593299388885, "learning_rate": 0.0002963316452658106, "loss": 5.0282, "step": 153400 }, { "epoch": 24.56, "grad_norm": 0.15125450491905212, "learning_rate": 0.0002963292451698068, "loss": 5.0548, "step": 153500 }, { "epoch": 24.576, "grad_norm": 0.1498018205165863, "learning_rate": 0.0002963268450738029, "loss": 5.2235, "step": 153600 }, { "epoch": 24.592, "grad_norm": 0.14961381256580353, "learning_rate": 0.00029632444497779906, "loss": 5.282, "step": 153700 }, { "epoch": 24.608, "grad_norm": 0.10805343836545944, "learning_rate": 0.00029632204488179523, "loss": 5.2164, "step": 153800 }, { "epoch": 24.624, "grad_norm": 0.1407497674226761, "learning_rate": 0.0002963196447857914, "loss": 5.8793, "step": 153900 }, { "epoch": 24.64, "grad_norm": 0.15589803457260132, "learning_rate": 0.00029631724468978757, "loss": 5.2803, "step": 154000 }, { "epoch": 24.656, "grad_norm": 0.15549539029598236, "learning_rate": 0.00029631484459378374, "loss": 5.5255, "step": 154100 }, { "epoch": 24.672, "grad_norm": 0.14697429537773132, "learning_rate": 0.00029631244449777986, "loss": 5.2088, "step": 154200 }, { "epoch": 24.688, "grad_norm": 0.14445632696151733, "learning_rate": 0.000296310044401776, "loss": 5.314, "step": 154300 }, { "epoch": 24.704, "grad_norm": 0.13264203071594238, "learning_rate": 0.0002963076443057722, "loss": 5.1363, "step": 154400 }, { "epoch": 24.72, "grad_norm": 0.14595112204551697, "learning_rate": 0.00029630524420976836, "loss": 5.1834, "step": 154500 }, { "epoch": 24.736, "grad_norm": 0.15063650906085968, "learning_rate": 0.00029630284411376453, "loss": 5.2409, "step": 154600 }, { "epoch": 24.752, "grad_norm": 0.1531144678592682, "learning_rate": 0.00029630044401776065, "loss": 5.3414, "step": 154700 }, { "epoch": 24.768, "grad_norm": 0.15418265759944916, "learning_rate": 0.0002962980439217568, "loss": 5.3579, "step": 154800 }, { "epoch": 24.784, "grad_norm": 0.13664741814136505, "learning_rate": 0.000296295643825753, "loss": 5.4855, "step": 154900 }, { "epoch": 24.8, "grad_norm": 0.15261198580265045, "learning_rate": 0.00029629324372974916, "loss": 5.5078, "step": 155000 }, { "epoch": 24.816, "grad_norm": 0.1436208039522171, "learning_rate": 0.0002962908436337453, "loss": 5.2359, "step": 155100 }, { "epoch": 24.832, "grad_norm": 0.1557721495628357, "learning_rate": 0.0002962884435377415, "loss": 5.1472, "step": 155200 }, { "epoch": 24.848, "grad_norm": 0.1639142483472824, "learning_rate": 0.0002962860434417376, "loss": 5.1701, "step": 155300 }, { "epoch": 24.864, "grad_norm": 0.1857120245695114, "learning_rate": 0.0002962836433457338, "loss": 5.3149, "step": 155400 }, { "epoch": 24.88, "grad_norm": 0.1384589672088623, "learning_rate": 0.00029628124324972995, "loss": 5.1655, "step": 155500 }, { "epoch": 24.896, "grad_norm": 0.16934780776500702, "learning_rate": 0.0002962788431537261, "loss": 5.0212, "step": 155600 }, { "epoch": 24.912, "grad_norm": 0.14011263847351074, "learning_rate": 0.0002962764430577223, "loss": 5.3506, "step": 155700 }, { "epoch": 24.928, "grad_norm": 0.12232084572315216, "learning_rate": 0.0002962740429617184, "loss": 4.9836, "step": 155800 }, { "epoch": 24.944, "grad_norm": 0.1219339519739151, "learning_rate": 0.00029627164286571457, "loss": 5.337, "step": 155900 }, { "epoch": 24.96, "grad_norm": 0.13951101899147034, "learning_rate": 0.0002962692667706708, "loss": 5.6947, "step": 156000 }, { "epoch": 24.976, "grad_norm": 0.15717874467372894, "learning_rate": 0.000296266866674667, "loss": 5.0598, "step": 156100 }, { "epoch": 24.992, "grad_norm": 0.16753438115119934, "learning_rate": 0.0002962644665786631, "loss": 5.1918, "step": 156200 }, { "epoch": 25.008, "grad_norm": 0.11955256760120392, "learning_rate": 0.00029626206648265927, "loss": 5.1705, "step": 156300 }, { "epoch": 25.024, "grad_norm": 0.11964499950408936, "learning_rate": 0.00029625966638665544, "loss": 5.3443, "step": 156400 }, { "epoch": 25.04, "grad_norm": 0.123370461165905, "learning_rate": 0.0002962572662906516, "loss": 4.9845, "step": 156500 }, { "epoch": 25.056, "grad_norm": 0.12556427717208862, "learning_rate": 0.0002962548661946478, "loss": 4.9369, "step": 156600 }, { "epoch": 25.072, "grad_norm": 0.15033285319805145, "learning_rate": 0.0002962524660986439, "loss": 5.1891, "step": 156700 }, { "epoch": 25.088, "grad_norm": 0.157626673579216, "learning_rate": 0.00029625006600264006, "loss": 5.0871, "step": 156800 }, { "epoch": 25.104, "grad_norm": 0.12489177286624908, "learning_rate": 0.0002962476659066362, "loss": 4.9887, "step": 156900 }, { "epoch": 25.12, "grad_norm": 0.17784586548805237, "learning_rate": 0.0002962452658106324, "loss": 4.9263, "step": 157000 }, { "epoch": 25.136, "grad_norm": 0.26584434509277344, "learning_rate": 0.00029624286571462857, "loss": 5.1268, "step": 157100 }, { "epoch": 25.152, "grad_norm": 0.14168865978717804, "learning_rate": 0.00029624046561862473, "loss": 5.4578, "step": 157200 }, { "epoch": 25.168, "grad_norm": 0.1289631426334381, "learning_rate": 0.00029623806552262085, "loss": 5.2466, "step": 157300 }, { "epoch": 25.184, "grad_norm": 0.12273957580327988, "learning_rate": 0.000296235665426617, "loss": 4.7845, "step": 157400 }, { "epoch": 25.2, "grad_norm": 0.24651670455932617, "learning_rate": 0.0002962332653306132, "loss": 5.0988, "step": 157500 }, { "epoch": 25.216, "grad_norm": 0.1415649801492691, "learning_rate": 0.00029623086523460936, "loss": 5.0998, "step": 157600 }, { "epoch": 25.232, "grad_norm": 0.1132798045873642, "learning_rate": 0.0002962284651386055, "loss": 5.2229, "step": 157700 }, { "epoch": 25.248, "grad_norm": 0.10961470752954483, "learning_rate": 0.00029622606504260164, "loss": 4.9959, "step": 157800 }, { "epoch": 25.264, "grad_norm": 0.16054928302764893, "learning_rate": 0.0002962236649465978, "loss": 4.989, "step": 157900 }, { "epoch": 25.28, "grad_norm": 0.16918180882930756, "learning_rate": 0.000296221264850594, "loss": 5.0824, "step": 158000 }, { "epoch": 25.296, "grad_norm": 0.12880262732505798, "learning_rate": 0.00029621886475459015, "loss": 4.6069, "step": 158100 }, { "epoch": 25.312, "grad_norm": 0.16930246353149414, "learning_rate": 0.0002962164646585863, "loss": 5.0421, "step": 158200 }, { "epoch": 25.328, "grad_norm": 0.15791450440883636, "learning_rate": 0.0002962140645625825, "loss": 5.1324, "step": 158300 }, { "epoch": 25.344, "grad_norm": 0.12896622717380524, "learning_rate": 0.0002962116644665786, "loss": 4.8697, "step": 158400 }, { "epoch": 25.36, "grad_norm": 0.15522588789463043, "learning_rate": 0.00029620926437057477, "loss": 5.112, "step": 158500 }, { "epoch": 25.376, "grad_norm": 0.15994909405708313, "learning_rate": 0.00029620686427457094, "loss": 5.1186, "step": 158600 }, { "epoch": 25.392, "grad_norm": 0.16203735768795013, "learning_rate": 0.0002962044641785671, "loss": 5.2136, "step": 158700 }, { "epoch": 25.408, "grad_norm": 0.14830628037452698, "learning_rate": 0.0002962020640825633, "loss": 4.8028, "step": 158800 }, { "epoch": 25.424, "grad_norm": 0.17855019867420197, "learning_rate": 0.00029619966398655945, "loss": 5.2293, "step": 158900 }, { "epoch": 25.44, "grad_norm": 0.13485394418239594, "learning_rate": 0.00029619728789151564, "loss": 5.1688, "step": 159000 }, { "epoch": 25.456, "grad_norm": 0.15001603960990906, "learning_rate": 0.0002961948877955118, "loss": 5.2429, "step": 159100 }, { "epoch": 25.472, "grad_norm": 0.15747343003749847, "learning_rate": 0.000296192487699508, "loss": 5.0648, "step": 159200 }, { "epoch": 25.488, "grad_norm": 0.11709601432085037, "learning_rate": 0.0002961900876035041, "loss": 4.9424, "step": 159300 }, { "epoch": 25.504, "grad_norm": 0.14115624129772186, "learning_rate": 0.00029618768750750026, "loss": 5.2824, "step": 159400 }, { "epoch": 25.52, "grad_norm": 0.13271014392375946, "learning_rate": 0.00029618528741149643, "loss": 5.2082, "step": 159500 }, { "epoch": 25.536, "grad_norm": 0.13927429914474487, "learning_rate": 0.0002961828873154926, "loss": 5.0302, "step": 159600 }, { "epoch": 25.552, "grad_norm": 0.1625901609659195, "learning_rate": 0.00029618048721948877, "loss": 5.2649, "step": 159700 }, { "epoch": 25.568, "grad_norm": 0.1242537572979927, "learning_rate": 0.0002961780871234849, "loss": 5.3638, "step": 159800 }, { "epoch": 25.584, "grad_norm": 0.22442211210727692, "learning_rate": 0.00029617568702748105, "loss": 4.7374, "step": 159900 }, { "epoch": 25.6, "grad_norm": 0.1424286961555481, "learning_rate": 0.0002961732869314772, "loss": 5.0878, "step": 160000 }, { "epoch": 25.616, "grad_norm": 0.16174399852752686, "learning_rate": 0.0002961708868354734, "loss": 5.4059, "step": 160100 }, { "epoch": 25.632, "grad_norm": 0.12529495358467102, "learning_rate": 0.00029616848673946956, "loss": 5.1528, "step": 160200 }, { "epoch": 25.648, "grad_norm": 0.14766289293766022, "learning_rate": 0.00029616608664346573, "loss": 5.2453, "step": 160300 }, { "epoch": 25.664, "grad_norm": 0.12722782790660858, "learning_rate": 0.00029616368654746184, "loss": 5.1237, "step": 160400 }, { "epoch": 25.68, "grad_norm": 0.1653498262166977, "learning_rate": 0.000296161286451458, "loss": 5.2606, "step": 160500 }, { "epoch": 25.696, "grad_norm": 0.15743720531463623, "learning_rate": 0.0002961588863554542, "loss": 5.3842, "step": 160600 }, { "epoch": 25.712, "grad_norm": 0.11641506105661392, "learning_rate": 0.00029615648625945035, "loss": 5.0112, "step": 160700 }, { "epoch": 25.728, "grad_norm": 0.1600313037633896, "learning_rate": 0.0002961540861634465, "loss": 5.1207, "step": 160800 }, { "epoch": 25.744, "grad_norm": 0.1792784333229065, "learning_rate": 0.0002961516860674427, "loss": 5.1801, "step": 160900 }, { "epoch": 25.76, "grad_norm": 0.12263203412294388, "learning_rate": 0.0002961492859714388, "loss": 5.1875, "step": 161000 }, { "epoch": 25.776, "grad_norm": 0.1638142168521881, "learning_rate": 0.00029614690987639505, "loss": 5.5503, "step": 161100 }, { "epoch": 25.792, "grad_norm": 0.12107832729816437, "learning_rate": 0.0002961445097803912, "loss": 5.312, "step": 161200 }, { "epoch": 25.808, "grad_norm": 0.1593557745218277, "learning_rate": 0.00029614210968438733, "loss": 5.0444, "step": 161300 }, { "epoch": 25.824, "grad_norm": 0.14629554748535156, "learning_rate": 0.0002961397095883835, "loss": 5.2007, "step": 161400 }, { "epoch": 25.84, "grad_norm": 0.14022816717624664, "learning_rate": 0.00029613730949237967, "loss": 5.1234, "step": 161500 }, { "epoch": 25.856, "grad_norm": 0.15026092529296875, "learning_rate": 0.00029613490939637584, "loss": 5.1459, "step": 161600 }, { "epoch": 25.872, "grad_norm": 0.16642487049102783, "learning_rate": 0.000296132509300372, "loss": 5.074, "step": 161700 }, { "epoch": 25.888, "grad_norm": 0.16100358963012695, "learning_rate": 0.0002961301092043681, "loss": 4.8445, "step": 161800 }, { "epoch": 25.904, "grad_norm": 0.14411258697509766, "learning_rate": 0.0002961277091083643, "loss": 4.7157, "step": 161900 }, { "epoch": 25.92, "grad_norm": 0.10813727974891663, "learning_rate": 0.00029612530901236046, "loss": 5.0682, "step": 162000 }, { "epoch": 25.936, "grad_norm": 0.14450779557228088, "learning_rate": 0.00029612290891635663, "loss": 5.241, "step": 162100 }, { "epoch": 25.951999999999998, "grad_norm": 0.16171583533287048, "learning_rate": 0.0002961205088203528, "loss": 5.1133, "step": 162200 }, { "epoch": 25.968, "grad_norm": 0.12712721526622772, "learning_rate": 0.00029611810872434897, "loss": 5.0328, "step": 162300 }, { "epoch": 25.984, "grad_norm": 0.12672489881515503, "learning_rate": 0.0002961157086283451, "loss": 4.8169, "step": 162400 }, { "epoch": 26.0, "grad_norm": 0.15172095596790314, "learning_rate": 0.00029611330853234125, "loss": 5.092, "step": 162500 }, { "epoch": 26.016, "grad_norm": 0.18036304414272308, "learning_rate": 0.0002961109084363374, "loss": 4.7511, "step": 162600 }, { "epoch": 26.032, "grad_norm": 0.16676302254199982, "learning_rate": 0.0002961085083403336, "loss": 4.9628, "step": 162700 }, { "epoch": 26.048, "grad_norm": 0.1724889576435089, "learning_rate": 0.00029610610824432976, "loss": 4.8742, "step": 162800 }, { "epoch": 26.064, "grad_norm": 0.1280188113451004, "learning_rate": 0.00029610370814832593, "loss": 5.3059, "step": 162900 }, { "epoch": 26.08, "grad_norm": 0.15785780549049377, "learning_rate": 0.00029610130805232204, "loss": 4.8671, "step": 163000 }, { "epoch": 26.096, "grad_norm": 0.14080898463726044, "learning_rate": 0.0002960989079563182, "loss": 5.1418, "step": 163100 }, { "epoch": 26.112, "grad_norm": 0.13095679879188538, "learning_rate": 0.0002960965078603144, "loss": 4.7194, "step": 163200 }, { "epoch": 26.128, "grad_norm": 0.1574213057756424, "learning_rate": 0.00029609410776431055, "loss": 4.9184, "step": 163300 }, { "epoch": 26.144, "grad_norm": 0.13669663667678833, "learning_rate": 0.0002960917076683067, "loss": 5.0563, "step": 163400 }, { "epoch": 26.16, "grad_norm": 0.15946930646896362, "learning_rate": 0.00029608930757230284, "loss": 4.7656, "step": 163500 }, { "epoch": 26.176, "grad_norm": 0.1457744687795639, "learning_rate": 0.000296086907476299, "loss": 4.894, "step": 163600 }, { "epoch": 26.192, "grad_norm": 0.10747674852609634, "learning_rate": 0.0002960845073802952, "loss": 5.1462, "step": 163700 }, { "epoch": 26.208, "grad_norm": 0.22094644606113434, "learning_rate": 0.00029608210728429134, "loss": 5.3243, "step": 163800 }, { "epoch": 26.224, "grad_norm": 0.12370151281356812, "learning_rate": 0.0002960797071882875, "loss": 4.8294, "step": 163900 }, { "epoch": 26.24, "grad_norm": 0.1479647010564804, "learning_rate": 0.0002960773070922837, "loss": 5.0416, "step": 164000 }, { "epoch": 26.256, "grad_norm": 0.15605013072490692, "learning_rate": 0.0002960749069962798, "loss": 5.2773, "step": 164100 }, { "epoch": 26.272, "grad_norm": 0.1911146342754364, "learning_rate": 0.00029607250690027597, "loss": 4.939, "step": 164200 }, { "epoch": 26.288, "grad_norm": 0.12012562155723572, "learning_rate": 0.0002960701308052322, "loss": 4.8719, "step": 164300 }, { "epoch": 26.304, "grad_norm": 0.12493129819631577, "learning_rate": 0.0002960677307092283, "loss": 4.7802, "step": 164400 }, { "epoch": 26.32, "grad_norm": 0.12632489204406738, "learning_rate": 0.0002960653306132245, "loss": 4.8725, "step": 164500 }, { "epoch": 26.336, "grad_norm": 0.15591692924499512, "learning_rate": 0.00029606293051722066, "loss": 5.2183, "step": 164600 }, { "epoch": 26.352, "grad_norm": 0.12113320082426071, "learning_rate": 0.00029606053042121683, "loss": 4.981, "step": 164700 }, { "epoch": 26.368, "grad_norm": 0.12973067164421082, "learning_rate": 0.000296058130325213, "loss": 5.1433, "step": 164800 }, { "epoch": 26.384, "grad_norm": 0.15297859907150269, "learning_rate": 0.00029605573022920917, "loss": 4.9628, "step": 164900 }, { "epoch": 26.4, "grad_norm": 0.13537169992923737, "learning_rate": 0.0002960533301332053, "loss": 4.6621, "step": 165000 }, { "epoch": 26.416, "grad_norm": 0.12161804735660553, "learning_rate": 0.00029605093003720145, "loss": 4.9027, "step": 165100 }, { "epoch": 26.432, "grad_norm": 0.14561276137828827, "learning_rate": 0.0002960485299411976, "loss": 4.7497, "step": 165200 }, { "epoch": 26.448, "grad_norm": 0.1523263305425644, "learning_rate": 0.0002960461298451938, "loss": 4.7575, "step": 165300 }, { "epoch": 26.464, "grad_norm": 0.13894937932491302, "learning_rate": 0.00029604372974918996, "loss": 5.1487, "step": 165400 }, { "epoch": 26.48, "grad_norm": 0.1122347041964531, "learning_rate": 0.0002960413296531861, "loss": 4.8517, "step": 165500 }, { "epoch": 26.496, "grad_norm": 0.12737123668193817, "learning_rate": 0.00029603892955718225, "loss": 4.8187, "step": 165600 }, { "epoch": 26.512, "grad_norm": 0.1302328109741211, "learning_rate": 0.0002960365294611784, "loss": 4.6812, "step": 165700 }, { "epoch": 26.528, "grad_norm": 0.14844807982444763, "learning_rate": 0.0002960341293651746, "loss": 4.9271, "step": 165800 }, { "epoch": 26.544, "grad_norm": 0.17675945162773132, "learning_rate": 0.00029603172926917075, "loss": 4.7797, "step": 165900 }, { "epoch": 26.56, "grad_norm": 0.18416370451450348, "learning_rate": 0.0002960293291731669, "loss": 5.1626, "step": 166000 }, { "epoch": 26.576, "grad_norm": 0.12005133926868439, "learning_rate": 0.00029602692907716304, "loss": 4.7074, "step": 166100 }, { "epoch": 26.592, "grad_norm": 0.185636967420578, "learning_rate": 0.0002960245289811592, "loss": 5.175, "step": 166200 }, { "epoch": 26.608, "grad_norm": 0.11722932010889053, "learning_rate": 0.0002960221288851554, "loss": 4.9977, "step": 166300 }, { "epoch": 26.624, "grad_norm": 0.13763803243637085, "learning_rate": 0.00029601972878915154, "loss": 4.732, "step": 166400 }, { "epoch": 26.64, "grad_norm": 0.13912682235240936, "learning_rate": 0.0002960173286931477, "loss": 4.877, "step": 166500 }, { "epoch": 26.656, "grad_norm": 0.10087449103593826, "learning_rate": 0.00029601492859714383, "loss": 4.7994, "step": 166600 }, { "epoch": 26.672, "grad_norm": 0.1845891773700714, "learning_rate": 0.00029601252850114, "loss": 5.4515, "step": 166700 }, { "epoch": 26.688, "grad_norm": 0.14900504052639008, "learning_rate": 0.00029601012840513617, "loss": 5.0709, "step": 166800 }, { "epoch": 26.704, "grad_norm": 0.19447046518325806, "learning_rate": 0.00029600772830913234, "loss": 4.8345, "step": 166900 }, { "epoch": 26.72, "grad_norm": 0.15507912635803223, "learning_rate": 0.0002960053282131285, "loss": 4.909, "step": 167000 }, { "epoch": 26.736, "grad_norm": 0.12142092734575272, "learning_rate": 0.0002960029281171247, "loss": 4.8017, "step": 167100 }, { "epoch": 26.752, "grad_norm": 0.12530605494976044, "learning_rate": 0.0002960005280211208, "loss": 5.1347, "step": 167200 }, { "epoch": 26.768, "grad_norm": 0.14327798783779144, "learning_rate": 0.00029599812792511696, "loss": 4.7235, "step": 167300 }, { "epoch": 26.784, "grad_norm": 0.14647874236106873, "learning_rate": 0.00029599572782911313, "loss": 4.9018, "step": 167400 }, { "epoch": 26.8, "grad_norm": 0.13197900354862213, "learning_rate": 0.0002959933277331093, "loss": 5.1885, "step": 167500 }, { "epoch": 26.816, "grad_norm": 0.13953787088394165, "learning_rate": 0.00029599092763710547, "loss": 4.8121, "step": 167600 }, { "epoch": 26.832, "grad_norm": 0.16823934018611908, "learning_rate": 0.0002959885275411016, "loss": 4.7129, "step": 167700 }, { "epoch": 26.848, "grad_norm": 0.1557362824678421, "learning_rate": 0.00029598612744509775, "loss": 5.2257, "step": 167800 }, { "epoch": 26.864, "grad_norm": 0.16123229265213013, "learning_rate": 0.000295983751350054, "loss": 4.8921, "step": 167900 }, { "epoch": 26.88, "grad_norm": 0.1613980084657669, "learning_rate": 0.00029598135125405016, "loss": 5.0361, "step": 168000 }, { "epoch": 26.896, "grad_norm": 0.1302555948495865, "learning_rate": 0.0002959789511580463, "loss": 5.0077, "step": 168100 }, { "epoch": 26.912, "grad_norm": 0.15182837843894958, "learning_rate": 0.00029597655106204245, "loss": 5.0202, "step": 168200 }, { "epoch": 26.928, "grad_norm": 0.13955193758010864, "learning_rate": 0.0002959741509660386, "loss": 4.9305, "step": 168300 }, { "epoch": 26.944, "grad_norm": 0.1417885273694992, "learning_rate": 0.0002959717508700348, "loss": 5.0889, "step": 168400 }, { "epoch": 26.96, "grad_norm": 0.14792856574058533, "learning_rate": 0.00029596935077403095, "loss": 4.8685, "step": 168500 }, { "epoch": 26.976, "grad_norm": 0.14266085624694824, "learning_rate": 0.00029596695067802707, "loss": 5.1578, "step": 168600 }, { "epoch": 26.992, "grad_norm": 0.11925966292619705, "learning_rate": 0.00029596455058202324, "loss": 4.6746, "step": 168700 }, { "epoch": 27.008, "grad_norm": 0.13332228362560272, "learning_rate": 0.0002959621504860194, "loss": 5.1295, "step": 168800 }, { "epoch": 27.024, "grad_norm": 0.13257551193237305, "learning_rate": 0.0002959597503900156, "loss": 5.0958, "step": 168900 }, { "epoch": 27.04, "grad_norm": 0.11077175289392471, "learning_rate": 0.00029595735029401175, "loss": 4.6509, "step": 169000 }, { "epoch": 27.056, "grad_norm": 0.1581268608570099, "learning_rate": 0.0002959549501980079, "loss": 4.7619, "step": 169100 }, { "epoch": 27.072, "grad_norm": 0.15108828246593475, "learning_rate": 0.00029595255010200403, "loss": 4.7792, "step": 169200 }, { "epoch": 27.088, "grad_norm": 0.15362246334552765, "learning_rate": 0.0002959501500060002, "loss": 5.189, "step": 169300 }, { "epoch": 27.104, "grad_norm": 0.1353999823331833, "learning_rate": 0.00029594774990999637, "loss": 4.7698, "step": 169400 }, { "epoch": 27.12, "grad_norm": 0.15684208273887634, "learning_rate": 0.00029594534981399254, "loss": 4.8111, "step": 169500 }, { "epoch": 27.136, "grad_norm": 0.17176128923892975, "learning_rate": 0.0002959429497179887, "loss": 4.8735, "step": 169600 }, { "epoch": 27.152, "grad_norm": 0.12857766449451447, "learning_rate": 0.0002959405496219848, "loss": 4.5602, "step": 169700 }, { "epoch": 27.168, "grad_norm": 0.2216508835554123, "learning_rate": 0.000295938149525981, "loss": 4.6848, "step": 169800 }, { "epoch": 27.184, "grad_norm": 0.18342281877994537, "learning_rate": 0.00029593577343093723, "loss": 4.9973, "step": 169900 }, { "epoch": 27.2, "grad_norm": 0.2726237177848816, "learning_rate": 0.0002959333733349334, "loss": 4.8341, "step": 170000 }, { "epoch": 27.216, "grad_norm": 0.1373586356639862, "learning_rate": 0.0002959309732389295, "loss": 4.914, "step": 170100 }, { "epoch": 27.232, "grad_norm": 0.13454484939575195, "learning_rate": 0.0002959285731429257, "loss": 5.0239, "step": 170200 }, { "epoch": 27.248, "grad_norm": 0.146050363779068, "learning_rate": 0.00029592617304692186, "loss": 4.7314, "step": 170300 }, { "epoch": 27.264, "grad_norm": 0.14222508668899536, "learning_rate": 0.000295923772950918, "loss": 4.6159, "step": 170400 }, { "epoch": 27.28, "grad_norm": 0.14632238447666168, "learning_rate": 0.0002959213728549142, "loss": 4.4062, "step": 170500 }, { "epoch": 27.296, "grad_norm": 0.16428226232528687, "learning_rate": 0.0002959189727589103, "loss": 5.1747, "step": 170600 }, { "epoch": 27.312, "grad_norm": 0.1323370337486267, "learning_rate": 0.0002959165726629065, "loss": 4.5199, "step": 170700 }, { "epoch": 27.328, "grad_norm": 0.14235830307006836, "learning_rate": 0.00029591417256690265, "loss": 4.9103, "step": 170800 }, { "epoch": 27.344, "grad_norm": 0.13216975331306458, "learning_rate": 0.0002959117724708988, "loss": 4.8293, "step": 170900 }, { "epoch": 27.36, "grad_norm": 0.15071095526218414, "learning_rate": 0.000295909372374895, "loss": 4.9801, "step": 171000 }, { "epoch": 27.376, "grad_norm": 0.1272030919790268, "learning_rate": 0.00029590697227889116, "loss": 4.9456, "step": 171100 }, { "epoch": 27.392, "grad_norm": 0.13579507172107697, "learning_rate": 0.00029590457218288727, "loss": 4.8712, "step": 171200 }, { "epoch": 27.408, "grad_norm": 0.12844951450824738, "learning_rate": 0.00029590217208688344, "loss": 4.679, "step": 171300 }, { "epoch": 27.424, "grad_norm": 0.10488644242286682, "learning_rate": 0.0002958997719908796, "loss": 4.8333, "step": 171400 }, { "epoch": 27.44, "grad_norm": 0.1397544890642166, "learning_rate": 0.0002958973718948758, "loss": 4.9637, "step": 171500 }, { "epoch": 27.456, "grad_norm": 0.17122800648212433, "learning_rate": 0.00029589497179887195, "loss": 4.5042, "step": 171600 }, { "epoch": 27.472, "grad_norm": 0.1432805061340332, "learning_rate": 0.00029589257170286806, "loss": 4.9236, "step": 171700 }, { "epoch": 27.488, "grad_norm": 0.2430882304906845, "learning_rate": 0.00029589017160686423, "loss": 4.6134, "step": 171800 }, { "epoch": 27.504, "grad_norm": 0.12965236604213715, "learning_rate": 0.0002958877715108604, "loss": 4.8867, "step": 171900 }, { "epoch": 27.52, "grad_norm": 0.13079382479190826, "learning_rate": 0.00029588537141485657, "loss": 4.7196, "step": 172000 }, { "epoch": 27.536, "grad_norm": 0.16515448689460754, "learning_rate": 0.00029588299531981276, "loss": 4.6995, "step": 172100 }, { "epoch": 27.552, "grad_norm": 0.12594960629940033, "learning_rate": 0.00029588059522380893, "loss": 4.8708, "step": 172200 }, { "epoch": 27.568, "grad_norm": 0.1570487916469574, "learning_rate": 0.0002958781951278051, "loss": 4.8169, "step": 172300 }, { "epoch": 27.584, "grad_norm": 0.13092289865016937, "learning_rate": 0.00029587579503180127, "loss": 4.695, "step": 172400 }, { "epoch": 27.6, "grad_norm": 0.14942535758018494, "learning_rate": 0.00029587339493579744, "loss": 4.7415, "step": 172500 }, { "epoch": 27.616, "grad_norm": 0.12075886875391006, "learning_rate": 0.00029587099483979355, "loss": 4.4839, "step": 172600 }, { "epoch": 27.632, "grad_norm": 0.11725221574306488, "learning_rate": 0.0002958685947437897, "loss": 4.8162, "step": 172700 }, { "epoch": 27.648, "grad_norm": 0.20893152058124542, "learning_rate": 0.0002958661946477859, "loss": 4.78, "step": 172800 }, { "epoch": 27.664, "grad_norm": 0.14231526851654053, "learning_rate": 0.00029586379455178206, "loss": 4.7212, "step": 172900 }, { "epoch": 27.68, "grad_norm": 0.1261710226535797, "learning_rate": 0.0002958613944557782, "loss": 4.96, "step": 173000 }, { "epoch": 27.696, "grad_norm": 0.1408015638589859, "learning_rate": 0.0002958589943597744, "loss": 4.7388, "step": 173100 }, { "epoch": 27.712, "grad_norm": 0.14422334730625153, "learning_rate": 0.0002958565942637705, "loss": 4.5018, "step": 173200 }, { "epoch": 27.728, "grad_norm": 0.17371025681495667, "learning_rate": 0.0002958541941677667, "loss": 4.792, "step": 173300 }, { "epoch": 27.744, "grad_norm": 0.21515819430351257, "learning_rate": 0.00029585179407176285, "loss": 4.8225, "step": 173400 }, { "epoch": 27.76, "grad_norm": 0.1557329297065735, "learning_rate": 0.000295849393975759, "loss": 4.6305, "step": 173500 }, { "epoch": 27.776, "grad_norm": 0.13870660960674286, "learning_rate": 0.0002958469938797552, "loss": 4.5486, "step": 173600 }, { "epoch": 27.792, "grad_norm": 0.13383133709430695, "learning_rate": 0.0002958445937837513, "loss": 4.6136, "step": 173700 }, { "epoch": 27.808, "grad_norm": 0.1399243175983429, "learning_rate": 0.00029584219368774747, "loss": 4.9352, "step": 173800 }, { "epoch": 27.824, "grad_norm": 0.11231095343828201, "learning_rate": 0.00029583979359174364, "loss": 4.9996, "step": 173900 }, { "epoch": 27.84, "grad_norm": 0.16128210723400116, "learning_rate": 0.0002958373934957398, "loss": 4.7546, "step": 174000 }, { "epoch": 27.856, "grad_norm": 0.15589210391044617, "learning_rate": 0.000295834993399736, "loss": 4.8234, "step": 174100 }, { "epoch": 27.872, "grad_norm": 0.22979894280433655, "learning_rate": 0.00029583259330373215, "loss": 4.8117, "step": 174200 }, { "epoch": 27.888, "grad_norm": 0.14024117588996887, "learning_rate": 0.00029583019320772826, "loss": 4.5712, "step": 174300 }, { "epoch": 27.904, "grad_norm": 0.16881561279296875, "learning_rate": 0.00029582779311172443, "loss": 4.8696, "step": 174400 }, { "epoch": 27.92, "grad_norm": 0.14194153249263763, "learning_rate": 0.0002958253930157206, "loss": 4.7792, "step": 174500 }, { "epoch": 27.936, "grad_norm": 0.16409501433372498, "learning_rate": 0.00029582299291971677, "loss": 4.862, "step": 174600 }, { "epoch": 27.951999999999998, "grad_norm": 0.21548931300640106, "learning_rate": 0.00029582059282371294, "loss": 4.6556, "step": 174700 }, { "epoch": 27.968, "grad_norm": 0.15370036661624908, "learning_rate": 0.00029581819272770906, "loss": 4.7855, "step": 174800 }, { "epoch": 27.984, "grad_norm": 0.1505698263645172, "learning_rate": 0.0002958157926317052, "loss": 4.5333, "step": 174900 }, { "epoch": 28.0, "grad_norm": 0.13952812552452087, "learning_rate": 0.0002958133925357014, "loss": 5.0827, "step": 175000 }, { "epoch": 28.016, "grad_norm": 0.14113423228263855, "learning_rate": 0.00029581099243969756, "loss": 4.4652, "step": 175100 }, { "epoch": 28.032, "grad_norm": 0.13563218712806702, "learning_rate": 0.00029580859234369373, "loss": 4.4769, "step": 175200 }, { "epoch": 28.048, "grad_norm": 0.16485312581062317, "learning_rate": 0.0002958061922476899, "loss": 4.7196, "step": 175300 }, { "epoch": 28.064, "grad_norm": 0.1928679645061493, "learning_rate": 0.000295803792151686, "loss": 4.5181, "step": 175400 }, { "epoch": 28.08, "grad_norm": 0.16406244039535522, "learning_rate": 0.00029580141605664226, "loss": 4.5547, "step": 175500 }, { "epoch": 28.096, "grad_norm": 0.12744209170341492, "learning_rate": 0.00029579901596063843, "loss": 4.6802, "step": 175600 }, { "epoch": 28.112, "grad_norm": 0.15242663025856018, "learning_rate": 0.00029579661586463454, "loss": 4.7076, "step": 175700 }, { "epoch": 28.128, "grad_norm": 0.1231980100274086, "learning_rate": 0.0002957942157686307, "loss": 4.7097, "step": 175800 }, { "epoch": 28.144, "grad_norm": 0.1742876172065735, "learning_rate": 0.0002957918156726269, "loss": 4.8166, "step": 175900 }, { "epoch": 28.16, "grad_norm": 0.15425816178321838, "learning_rate": 0.00029578941557662305, "loss": 4.6306, "step": 176000 }, { "epoch": 28.176, "grad_norm": 0.1423932909965515, "learning_rate": 0.0002957870154806192, "loss": 4.7671, "step": 176100 }, { "epoch": 28.192, "grad_norm": 0.13283143937587738, "learning_rate": 0.0002957846153846154, "loss": 4.5074, "step": 176200 }, { "epoch": 28.208, "grad_norm": 0.1560533046722412, "learning_rate": 0.0002957822152886115, "loss": 4.8514, "step": 176300 }, { "epoch": 28.224, "grad_norm": 0.12814775109291077, "learning_rate": 0.0002957798151926077, "loss": 4.7173, "step": 176400 }, { "epoch": 28.24, "grad_norm": 0.1441114842891693, "learning_rate": 0.00029577741509660384, "loss": 4.7003, "step": 176500 }, { "epoch": 28.256, "grad_norm": 0.13554996252059937, "learning_rate": 0.0002957750150006, "loss": 4.6206, "step": 176600 }, { "epoch": 28.272, "grad_norm": 0.21647945046424866, "learning_rate": 0.0002957726149045962, "loss": 4.9289, "step": 176700 }, { "epoch": 28.288, "grad_norm": 0.1216735765337944, "learning_rate": 0.0002957702148085923, "loss": 4.7441, "step": 176800 }, { "epoch": 28.304, "grad_norm": 0.12911395728588104, "learning_rate": 0.00029576781471258847, "loss": 4.6493, "step": 176900 }, { "epoch": 28.32, "grad_norm": 0.12240692973136902, "learning_rate": 0.00029576541461658463, "loss": 4.7305, "step": 177000 }, { "epoch": 28.336, "grad_norm": 0.17344659566879272, "learning_rate": 0.0002957630145205808, "loss": 4.5246, "step": 177100 }, { "epoch": 28.352, "grad_norm": 0.12759949266910553, "learning_rate": 0.00029576061442457697, "loss": 4.6852, "step": 177200 }, { "epoch": 28.368, "grad_norm": 0.12402662634849548, "learning_rate": 0.00029575821432857314, "loss": 4.5194, "step": 177300 }, { "epoch": 28.384, "grad_norm": 0.19976910948753357, "learning_rate": 0.00029575581423256926, "loss": 4.5166, "step": 177400 }, { "epoch": 28.4, "grad_norm": 0.14362084865570068, "learning_rate": 0.0002957534141365654, "loss": 4.5147, "step": 177500 }, { "epoch": 28.416, "grad_norm": 0.13851560652256012, "learning_rate": 0.0002957510140405616, "loss": 4.5473, "step": 177600 }, { "epoch": 28.432, "grad_norm": 0.13696688413619995, "learning_rate": 0.00029574861394455776, "loss": 4.7163, "step": 177700 }, { "epoch": 28.448, "grad_norm": 0.1331932544708252, "learning_rate": 0.00029574621384855393, "loss": 5.0066, "step": 177800 }, { "epoch": 28.464, "grad_norm": 0.13118359446525574, "learning_rate": 0.00029574381375255005, "loss": 4.7009, "step": 177900 }, { "epoch": 28.48, "grad_norm": 0.11460904031991959, "learning_rate": 0.0002957414136565462, "loss": 4.5525, "step": 178000 }, { "epoch": 28.496, "grad_norm": 0.11112211644649506, "learning_rate": 0.0002957390135605424, "loss": 4.8012, "step": 178100 }, { "epoch": 28.512, "grad_norm": 0.1618378460407257, "learning_rate": 0.00029573661346453856, "loss": 4.8419, "step": 178200 }, { "epoch": 28.528, "grad_norm": 0.13665986061096191, "learning_rate": 0.0002957342133685347, "loss": 4.6129, "step": 178300 }, { "epoch": 28.544, "grad_norm": 0.10059978067874908, "learning_rate": 0.0002957318132725309, "loss": 4.7326, "step": 178400 }, { "epoch": 28.56, "grad_norm": 0.1575680524110794, "learning_rate": 0.000295729413176527, "loss": 5.0102, "step": 178500 }, { "epoch": 28.576, "grad_norm": 0.10887812077999115, "learning_rate": 0.0002957270130805232, "loss": 4.7228, "step": 178600 }, { "epoch": 28.592, "grad_norm": 0.08943487703800201, "learning_rate": 0.0002957246369854794, "loss": 4.4294, "step": 178700 }, { "epoch": 28.608, "grad_norm": 0.14149336516857147, "learning_rate": 0.00029572223688947554, "loss": 4.6056, "step": 178800 }, { "epoch": 28.624, "grad_norm": 0.12872636318206787, "learning_rate": 0.0002957198367934717, "loss": 4.8457, "step": 178900 }, { "epoch": 28.64, "grad_norm": 0.15382656455039978, "learning_rate": 0.0002957174366974679, "loss": 4.7641, "step": 179000 }, { "epoch": 28.656, "grad_norm": 0.15484744310379028, "learning_rate": 0.00029571503660146404, "loss": 4.7261, "step": 179100 }, { "epoch": 28.672, "grad_norm": 0.1385447382926941, "learning_rate": 0.0002957126365054602, "loss": 4.8178, "step": 179200 }, { "epoch": 28.688, "grad_norm": 0.09416704624891281, "learning_rate": 0.0002957102364094564, "loss": 4.462, "step": 179300 }, { "epoch": 28.704, "grad_norm": 0.11756269633769989, "learning_rate": 0.0002957078363134525, "loss": 4.9817, "step": 179400 }, { "epoch": 28.72, "grad_norm": 0.16298645734786987, "learning_rate": 0.00029570543621744867, "loss": 4.7884, "step": 179500 }, { "epoch": 28.736, "grad_norm": 0.1666107177734375, "learning_rate": 0.00029570303612144484, "loss": 4.5478, "step": 179600 }, { "epoch": 28.752, "grad_norm": 0.14432166516780853, "learning_rate": 0.000295700636025441, "loss": 4.5671, "step": 179700 }, { "epoch": 28.768, "grad_norm": 0.14455050230026245, "learning_rate": 0.0002956982359294372, "loss": 4.4565, "step": 179800 }, { "epoch": 28.784, "grad_norm": 0.11911621689796448, "learning_rate": 0.0002956958358334333, "loss": 4.8298, "step": 179900 }, { "epoch": 28.8, "grad_norm": 0.11492261290550232, "learning_rate": 0.00029569343573742946, "loss": 4.8744, "step": 180000 }, { "epoch": 28.816, "grad_norm": 0.11532367020845413, "learning_rate": 0.00029569103564142563, "loss": 4.9461, "step": 180100 }, { "epoch": 28.832, "grad_norm": 0.11335845291614532, "learning_rate": 0.0002956886355454218, "loss": 4.6438, "step": 180200 }, { "epoch": 28.848, "grad_norm": 0.13290923833847046, "learning_rate": 0.00029568623544941797, "loss": 4.5029, "step": 180300 }, { "epoch": 28.864, "grad_norm": 0.12123245000839233, "learning_rate": 0.00029568383535341414, "loss": 5.002, "step": 180400 }, { "epoch": 28.88, "grad_norm": 0.1688774973154068, "learning_rate": 0.00029568143525741025, "loss": 4.5888, "step": 180500 }, { "epoch": 28.896, "grad_norm": 0.12593814730644226, "learning_rate": 0.0002956790351614064, "loss": 4.5949, "step": 180600 }, { "epoch": 28.912, "grad_norm": 0.13134326040744781, "learning_rate": 0.0002956766350654026, "loss": 4.3431, "step": 180700 }, { "epoch": 28.928, "grad_norm": 0.14252367615699768, "learning_rate": 0.00029567423496939876, "loss": 4.1599, "step": 180800 }, { "epoch": 28.944, "grad_norm": 0.13371191918849945, "learning_rate": 0.0002956718348733949, "loss": 4.4618, "step": 180900 }, { "epoch": 28.96, "grad_norm": 0.2305118888616562, "learning_rate": 0.00029566943477739104, "loss": 4.7324, "step": 181000 }, { "epoch": 28.976, "grad_norm": 0.17778520286083221, "learning_rate": 0.0002956670346813872, "loss": 4.5895, "step": 181100 }, { "epoch": 28.992, "grad_norm": 0.16209328174591064, "learning_rate": 0.0002956646345853834, "loss": 4.5924, "step": 181200 }, { "epoch": 29.008, "grad_norm": 0.13874457776546478, "learning_rate": 0.0002956622584903396, "loss": 4.5032, "step": 181300 }, { "epoch": 29.024, "grad_norm": 0.13318394124507904, "learning_rate": 0.00029565985839433574, "loss": 4.3979, "step": 181400 }, { "epoch": 29.04, "grad_norm": 0.1424497812986374, "learning_rate": 0.0002956574582983319, "loss": 4.6121, "step": 181500 }, { "epoch": 29.056, "grad_norm": 0.1274562031030655, "learning_rate": 0.0002956550582023281, "loss": 4.6716, "step": 181600 }, { "epoch": 29.072, "grad_norm": 0.15418770909309387, "learning_rate": 0.00029565265810632425, "loss": 4.4586, "step": 181700 }, { "epoch": 29.088, "grad_norm": 0.1679641753435135, "learning_rate": 0.0002956502580103204, "loss": 4.4676, "step": 181800 }, { "epoch": 29.104, "grad_norm": 0.10988187789916992, "learning_rate": 0.00029564788191527655, "loss": 4.4074, "step": 181900 }, { "epoch": 29.12, "grad_norm": 0.13705100119113922, "learning_rate": 0.0002956454818192727, "loss": 4.5681, "step": 182000 } ], "logging_steps": 100, "max_steps": 12500000, "num_input_tokens_seen": 0, "num_train_epochs": 2000, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.600452986732544e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }