{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 62980, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015878056525881232, "grad_norm": 69456.203125, "learning_rate": 4.992219752302319e-05, "loss": 3.3489, "step": 50 }, { "epoch": 0.031756113051762465, "grad_norm": 28315.216796875, "learning_rate": 4.984280724039378e-05, "loss": 2.1825, "step": 100 }, { "epoch": 0.047634169577643694, "grad_norm": 22939.28515625, "learning_rate": 4.976341695776437e-05, "loss": 1.8984, "step": 150 }, { "epoch": 0.06351222610352493, "grad_norm": 23446.751953125, "learning_rate": 4.968402667513497e-05, "loss": 1.8356, "step": 200 }, { "epoch": 0.07939028262940616, "grad_norm": 15620.2763671875, "learning_rate": 4.960463639250556e-05, "loss": 1.8063, "step": 250 }, { "epoch": 0.09526833915528739, "grad_norm": 17849.13671875, "learning_rate": 4.9525246109876156e-05, "loss": 1.7904, "step": 300 }, { "epoch": 0.11114639568116863, "grad_norm": 16811.81640625, "learning_rate": 4.944585582724674e-05, "loss": 1.7763, "step": 350 }, { "epoch": 0.12702445220704986, "grad_norm": 15963.05859375, "learning_rate": 4.936646554461734e-05, "loss": 1.7694, "step": 400 }, { "epoch": 0.1429025087329311, "grad_norm": 13125.708984375, "learning_rate": 4.928707526198794e-05, "loss": 1.7613, "step": 450 }, { "epoch": 0.15878056525881232, "grad_norm": 13808.8876953125, "learning_rate": 4.920768497935853e-05, "loss": 1.7565, "step": 500 }, { "epoch": 0.17465862178469355, "grad_norm": 15005.7353515625, "learning_rate": 4.912829469672912e-05, "loss": 1.7512, "step": 550 }, { "epoch": 0.19053667831057478, "grad_norm": 15644.3916015625, "learning_rate": 4.904890441409972e-05, "loss": 1.7457, "step": 600 }, { "epoch": 0.206414734836456, "grad_norm": 13973.876953125, "learning_rate": 4.896951413147031e-05, "loss": 1.7395, "step": 650 }, { "epoch": 0.22229279136233726, "grad_norm": 16523.75390625, "learning_rate": 4.88901238488409e-05, "loss": 1.7285, "step": 700 }, { "epoch": 0.2381708478882185, "grad_norm": 25229.689453125, "learning_rate": 4.88107335662115e-05, "loss": 1.7046, "step": 750 }, { "epoch": 0.2540489044140997, "grad_norm": 48292.5078125, "learning_rate": 4.8731343283582094e-05, "loss": 1.664, "step": 800 }, { "epoch": 0.2699269609399809, "grad_norm": 38503.4296875, "learning_rate": 4.865195300095269e-05, "loss": 1.6121, "step": 850 }, { "epoch": 0.2858050174658622, "grad_norm": 48101.1484375, "learning_rate": 4.8572562718323275e-05, "loss": 1.5622, "step": 900 }, { "epoch": 0.30168307399174343, "grad_norm": 63038.48046875, "learning_rate": 4.8493172435693875e-05, "loss": 1.5095, "step": 950 }, { "epoch": 0.31756113051762463, "grad_norm": 80867.75, "learning_rate": 4.841378215306447e-05, "loss": 1.4587, "step": 1000 }, { "epoch": 0.3334391870435059, "grad_norm": 42233.703125, "learning_rate": 4.8334391870435056e-05, "loss": 1.4149, "step": 1050 }, { "epoch": 0.3493172435693871, "grad_norm": 53614.875, "learning_rate": 4.825500158780566e-05, "loss": 1.375, "step": 1100 }, { "epoch": 0.36519530009526835, "grad_norm": 43879.546875, "learning_rate": 4.817561130517625e-05, "loss": 1.3372, "step": 1150 }, { "epoch": 0.38107335662114955, "grad_norm": 43690.40625, "learning_rate": 4.8096221022546844e-05, "loss": 1.3026, "step": 1200 }, { "epoch": 0.3969514131470308, "grad_norm": 46655.68359375, "learning_rate": 4.801683073991743e-05, "loss": 1.2798, "step": 1250 }, { "epoch": 0.412829469672912, "grad_norm": 39245.97265625, "learning_rate": 4.793744045728803e-05, "loss": 1.2521, "step": 1300 }, { "epoch": 0.42870752619879327, "grad_norm": 52106.55859375, "learning_rate": 4.7858050174658626e-05, "loss": 1.2299, "step": 1350 }, { "epoch": 0.4445855827246745, "grad_norm": 41391.16796875, "learning_rate": 4.777865989202922e-05, "loss": 1.2141, "step": 1400 }, { "epoch": 0.4604636392505557, "grad_norm": 41273.65625, "learning_rate": 4.769926960939981e-05, "loss": 1.1954, "step": 1450 }, { "epoch": 0.476341695776437, "grad_norm": 48099.3203125, "learning_rate": 4.761987932677041e-05, "loss": 1.1785, "step": 1500 }, { "epoch": 0.4922197523023182, "grad_norm": 41167.43359375, "learning_rate": 4.7540489044141e-05, "loss": 1.1642, "step": 1550 }, { "epoch": 0.5080978088281994, "grad_norm": 40066.16796875, "learning_rate": 4.746109876151159e-05, "loss": 1.15, "step": 1600 }, { "epoch": 0.5239758653540807, "grad_norm": 57272.24609375, "learning_rate": 4.738170847888219e-05, "loss": 1.1426, "step": 1650 }, { "epoch": 0.5398539218799618, "grad_norm": 38700.19921875, "learning_rate": 4.730231819625278e-05, "loss": 1.1286, "step": 1700 }, { "epoch": 0.5557319784058431, "grad_norm": 39080.23828125, "learning_rate": 4.7222927913623376e-05, "loss": 1.116, "step": 1750 }, { "epoch": 0.5716100349317244, "grad_norm": 40904.78515625, "learning_rate": 4.714353763099397e-05, "loss": 1.1037, "step": 1800 }, { "epoch": 0.5874880914576056, "grad_norm": 46008.6328125, "learning_rate": 4.7064147348364564e-05, "loss": 1.1006, "step": 1850 }, { "epoch": 0.6033661479834869, "grad_norm": 51325.14453125, "learning_rate": 4.698475706573516e-05, "loss": 1.0899, "step": 1900 }, { "epoch": 0.619244204509368, "grad_norm": 38173.07421875, "learning_rate": 4.6905366783105745e-05, "loss": 1.0767, "step": 1950 }, { "epoch": 0.6351222610352493, "grad_norm": 36432.33203125, "learning_rate": 4.6825976500476345e-05, "loss": 1.0737, "step": 2000 }, { "epoch": 0.6510003175611305, "grad_norm": 38897.46484375, "learning_rate": 4.674658621784694e-05, "loss": 1.0631, "step": 2050 }, { "epoch": 0.6668783740870118, "grad_norm": 36565.90234375, "learning_rate": 4.666719593521753e-05, "loss": 1.0611, "step": 2100 }, { "epoch": 0.6827564306128929, "grad_norm": 38040.09765625, "learning_rate": 4.658780565258813e-05, "loss": 1.049, "step": 2150 }, { "epoch": 0.6986344871387742, "grad_norm": 40119.8984375, "learning_rate": 4.650841536995872e-05, "loss": 1.0401, "step": 2200 }, { "epoch": 0.7145125436646554, "grad_norm": 38403.37890625, "learning_rate": 4.6429025087329314e-05, "loss": 1.0373, "step": 2250 }, { "epoch": 0.7303906001905367, "grad_norm": 35558.62109375, "learning_rate": 4.634963480469991e-05, "loss": 1.0327, "step": 2300 }, { "epoch": 0.746268656716418, "grad_norm": 41901.265625, "learning_rate": 4.62702445220705e-05, "loss": 1.0258, "step": 2350 }, { "epoch": 0.7621467132422991, "grad_norm": 35863.7109375, "learning_rate": 4.6190854239441096e-05, "loss": 1.0181, "step": 2400 }, { "epoch": 0.7780247697681804, "grad_norm": 34477.07421875, "learning_rate": 4.611146395681169e-05, "loss": 1.0156, "step": 2450 }, { "epoch": 0.7939028262940616, "grad_norm": 36488.30078125, "learning_rate": 4.603207367418228e-05, "loss": 1.0109, "step": 2500 }, { "epoch": 0.8097808828199429, "grad_norm": 37530.74609375, "learning_rate": 4.595268339155288e-05, "loss": 1.0063, "step": 2550 }, { "epoch": 0.825658939345824, "grad_norm": 38817.25390625, "learning_rate": 4.587329310892347e-05, "loss": 1.0007, "step": 2600 }, { "epoch": 0.8415369958717053, "grad_norm": 36987.03515625, "learning_rate": 4.5793902826294065e-05, "loss": 0.9925, "step": 2650 }, { "epoch": 0.8574150523975865, "grad_norm": 36874.86328125, "learning_rate": 4.571451254366466e-05, "loss": 0.9931, "step": 2700 }, { "epoch": 0.8732931089234678, "grad_norm": 38325.61328125, "learning_rate": 4.563512226103525e-05, "loss": 0.984, "step": 2750 }, { "epoch": 0.889171165449349, "grad_norm": 35897.0859375, "learning_rate": 4.5555731978405846e-05, "loss": 0.982, "step": 2800 }, { "epoch": 0.9050492219752302, "grad_norm": 38165.0, "learning_rate": 4.547634169577643e-05, "loss": 0.9786, "step": 2850 }, { "epoch": 0.9209272785011114, "grad_norm": 36260.8671875, "learning_rate": 4.5396951413147034e-05, "loss": 0.9736, "step": 2900 }, { "epoch": 0.9368053350269927, "grad_norm": 35681.60546875, "learning_rate": 4.531756113051763e-05, "loss": 0.9657, "step": 2950 }, { "epoch": 0.952683391552874, "grad_norm": 33949.65234375, "learning_rate": 4.523817084788822e-05, "loss": 0.9691, "step": 3000 }, { "epoch": 0.9685614480787551, "grad_norm": 34695.6796875, "learning_rate": 4.5158780565258815e-05, "loss": 0.9621, "step": 3050 }, { "epoch": 0.9844395046046364, "grad_norm": 42460.8359375, "learning_rate": 4.507939028262941e-05, "loss": 0.9627, "step": 3100 }, { "epoch": 1.0003175611305177, "grad_norm": 37083.6796875, "learning_rate": 4.5e-05, "loss": 0.9538, "step": 3150 }, { "epoch": 1.0161956176563989, "grad_norm": 35333.8359375, "learning_rate": 4.49206097173706e-05, "loss": 0.951, "step": 3200 }, { "epoch": 1.03207367418228, "grad_norm": 34177.54296875, "learning_rate": 4.484121943474119e-05, "loss": 0.9496, "step": 3250 }, { "epoch": 1.0479517307081614, "grad_norm": 33729.51953125, "learning_rate": 4.4761829152111784e-05, "loss": 0.9431, "step": 3300 }, { "epoch": 1.0638297872340425, "grad_norm": 35274.5625, "learning_rate": 4.468243886948238e-05, "loss": 0.9405, "step": 3350 }, { "epoch": 1.0797078437599237, "grad_norm": 35014.6796875, "learning_rate": 4.460304858685297e-05, "loss": 0.9404, "step": 3400 }, { "epoch": 1.095585900285805, "grad_norm": 34381.96484375, "learning_rate": 4.4523658304223566e-05, "loss": 0.9367, "step": 3450 }, { "epoch": 1.1114639568116862, "grad_norm": 36695.0625, "learning_rate": 4.444426802159416e-05, "loss": 0.9309, "step": 3500 }, { "epoch": 1.1273420133375676, "grad_norm": 37935.7734375, "learning_rate": 4.436487773896475e-05, "loss": 0.9331, "step": 3550 }, { "epoch": 1.1432200698634487, "grad_norm": 34156.03125, "learning_rate": 4.428548745633535e-05, "loss": 0.9251, "step": 3600 }, { "epoch": 1.1590981263893299, "grad_norm": 34130.3046875, "learning_rate": 4.420609717370594e-05, "loss": 0.9263, "step": 3650 }, { "epoch": 1.1749761829152112, "grad_norm": 35052.70703125, "learning_rate": 4.4126706891076535e-05, "loss": 0.9217, "step": 3700 }, { "epoch": 1.1908542394410924, "grad_norm": 36061.64453125, "learning_rate": 4.404731660844713e-05, "loss": 0.9197, "step": 3750 }, { "epoch": 1.2067322959669737, "grad_norm": 32517.8984375, "learning_rate": 4.396792632581772e-05, "loss": 0.9208, "step": 3800 }, { "epoch": 1.2226103524928549, "grad_norm": 35002.6328125, "learning_rate": 4.3888536043188316e-05, "loss": 0.9197, "step": 3850 }, { "epoch": 1.238488409018736, "grad_norm": 32863.40234375, "learning_rate": 4.380914576055891e-05, "loss": 0.9089, "step": 3900 }, { "epoch": 1.2543664655446174, "grad_norm": 34469.52734375, "learning_rate": 4.3729755477929504e-05, "loss": 0.9127, "step": 3950 }, { "epoch": 1.2702445220704985, "grad_norm": 32288.01171875, "learning_rate": 4.36503651953001e-05, "loss": 0.9062, "step": 4000 }, { "epoch": 1.28612257859638, "grad_norm": 34935.56640625, "learning_rate": 4.357097491267069e-05, "loss": 0.9076, "step": 4050 }, { "epoch": 1.302000635122261, "grad_norm": 32266.908203125, "learning_rate": 4.3491584630041285e-05, "loss": 0.9079, "step": 4100 }, { "epoch": 1.3178786916481422, "grad_norm": 33226.34375, "learning_rate": 4.341219434741188e-05, "loss": 0.9044, "step": 4150 }, { "epoch": 1.3337567481740236, "grad_norm": 34449.453125, "learning_rate": 4.333280406478247e-05, "loss": 0.9038, "step": 4200 }, { "epoch": 1.3496348046999047, "grad_norm": 32593.271484375, "learning_rate": 4.325341378215307e-05, "loss": 0.8969, "step": 4250 }, { "epoch": 1.365512861225786, "grad_norm": 33997.08984375, "learning_rate": 4.317402349952366e-05, "loss": 0.8955, "step": 4300 }, { "epoch": 1.3813909177516672, "grad_norm": 34988.44921875, "learning_rate": 4.3094633216894254e-05, "loss": 0.8925, "step": 4350 }, { "epoch": 1.3972689742775484, "grad_norm": 31535.099609375, "learning_rate": 4.301524293426485e-05, "loss": 0.8931, "step": 4400 }, { "epoch": 1.4131470308034297, "grad_norm": 32298.515625, "learning_rate": 4.293585265163544e-05, "loss": 0.8914, "step": 4450 }, { "epoch": 1.4290250873293109, "grad_norm": 33798.9140625, "learning_rate": 4.2856462369006036e-05, "loss": 0.889, "step": 4500 }, { "epoch": 1.4449031438551923, "grad_norm": 33727.1484375, "learning_rate": 4.277707208637663e-05, "loss": 0.8891, "step": 4550 }, { "epoch": 1.4607812003810734, "grad_norm": 39095.109375, "learning_rate": 4.269768180374722e-05, "loss": 0.8932, "step": 4600 }, { "epoch": 1.4766592569069545, "grad_norm": 33266.66796875, "learning_rate": 4.261829152111782e-05, "loss": 0.8843, "step": 4650 }, { "epoch": 1.4925373134328357, "grad_norm": 33800.22265625, "learning_rate": 4.253890123848841e-05, "loss": 0.882, "step": 4700 }, { "epoch": 1.508415369958717, "grad_norm": 31127.240234375, "learning_rate": 4.2459510955859005e-05, "loss": 0.8828, "step": 4750 }, { "epoch": 1.5242934264845984, "grad_norm": 33765.43359375, "learning_rate": 4.23801206732296e-05, "loss": 0.8802, "step": 4800 }, { "epoch": 1.5401714830104796, "grad_norm": 32892.484375, "learning_rate": 4.230073039060019e-05, "loss": 0.882, "step": 4850 }, { "epoch": 1.5560495395363607, "grad_norm": 32505.76953125, "learning_rate": 4.2221340107970786e-05, "loss": 0.8772, "step": 4900 }, { "epoch": 1.5719275960622419, "grad_norm": 32439.31640625, "learning_rate": 4.214194982534138e-05, "loss": 0.8742, "step": 4950 }, { "epoch": 1.5878056525881232, "grad_norm": 32518.017578125, "learning_rate": 4.2062559542711974e-05, "loss": 0.873, "step": 5000 }, { "epoch": 1.6036837091140046, "grad_norm": 34613.1640625, "learning_rate": 4.198316926008257e-05, "loss": 0.873, "step": 5050 }, { "epoch": 1.6195617656398857, "grad_norm": 32058.451171875, "learning_rate": 4.190377897745316e-05, "loss": 0.873, "step": 5100 }, { "epoch": 1.6354398221657669, "grad_norm": 31566.736328125, "learning_rate": 4.1824388694823755e-05, "loss": 0.8675, "step": 5150 }, { "epoch": 1.651317878691648, "grad_norm": 33988.2734375, "learning_rate": 4.174499841219435e-05, "loss": 0.8667, "step": 5200 }, { "epoch": 1.6671959352175294, "grad_norm": 33580.97265625, "learning_rate": 4.166560812956494e-05, "loss": 0.8638, "step": 5250 }, { "epoch": 1.6830739917434108, "grad_norm": 33622.66015625, "learning_rate": 4.158621784693554e-05, "loss": 0.8668, "step": 5300 }, { "epoch": 1.698952048269292, "grad_norm": 32790.08203125, "learning_rate": 4.150682756430613e-05, "loss": 0.8616, "step": 5350 }, { "epoch": 1.714830104795173, "grad_norm": 31265.09765625, "learning_rate": 4.1427437281676724e-05, "loss": 0.8578, "step": 5400 }, { "epoch": 1.7307081613210542, "grad_norm": 31559.154296875, "learning_rate": 4.134804699904732e-05, "loss": 0.86, "step": 5450 }, { "epoch": 1.7465862178469356, "grad_norm": 31868.90234375, "learning_rate": 4.126865671641791e-05, "loss": 0.8612, "step": 5500 }, { "epoch": 1.7624642743728167, "grad_norm": 32017.955078125, "learning_rate": 4.1189266433788506e-05, "loss": 0.8599, "step": 5550 }, { "epoch": 1.778342330898698, "grad_norm": 32002.77734375, "learning_rate": 4.11098761511591e-05, "loss": 0.8592, "step": 5600 }, { "epoch": 1.7942203874245792, "grad_norm": 30244.373046875, "learning_rate": 4.103048586852969e-05, "loss": 0.855, "step": 5650 }, { "epoch": 1.8100984439504604, "grad_norm": 32472.248046875, "learning_rate": 4.095109558590029e-05, "loss": 0.857, "step": 5700 }, { "epoch": 1.8259765004763417, "grad_norm": 32474.484375, "learning_rate": 4.087170530327088e-05, "loss": 0.8526, "step": 5750 }, { "epoch": 1.841854557002223, "grad_norm": 31490.275390625, "learning_rate": 4.0792315020641475e-05, "loss": 0.8559, "step": 5800 }, { "epoch": 1.8577326135281043, "grad_norm": 31051.58203125, "learning_rate": 4.071292473801207e-05, "loss": 0.8522, "step": 5850 }, { "epoch": 1.8736106700539854, "grad_norm": 31409.7578125, "learning_rate": 4.063353445538267e-05, "loss": 0.8486, "step": 5900 }, { "epoch": 1.8894887265798666, "grad_norm": 32382.998046875, "learning_rate": 4.0554144172753256e-05, "loss": 0.8515, "step": 5950 }, { "epoch": 1.9053667831057477, "grad_norm": 31483.380859375, "learning_rate": 4.047475389012385e-05, "loss": 0.8477, "step": 6000 }, { "epoch": 1.921244839631629, "grad_norm": 32504.404296875, "learning_rate": 4.0395363607494444e-05, "loss": 0.8506, "step": 6050 }, { "epoch": 1.9371228961575104, "grad_norm": 31748.40234375, "learning_rate": 4.031597332486504e-05, "loss": 0.8461, "step": 6100 }, { "epoch": 1.9530009526833916, "grad_norm": 31397.05078125, "learning_rate": 4.023658304223563e-05, "loss": 0.8459, "step": 6150 }, { "epoch": 1.9688790092092727, "grad_norm": 32932.44140625, "learning_rate": 4.0157192759606225e-05, "loss": 0.8431, "step": 6200 }, { "epoch": 1.9847570657351539, "grad_norm": 32290.40234375, "learning_rate": 4.007780247697682e-05, "loss": 0.8435, "step": 6250 }, { "epoch": 2.0006351222610355, "grad_norm": 31642.935546875, "learning_rate": 3.999841219434741e-05, "loss": 0.8403, "step": 6300 }, { "epoch": 2.0165131787869166, "grad_norm": 31722.009765625, "learning_rate": 3.991902191171801e-05, "loss": 0.8393, "step": 6350 }, { "epoch": 2.0323912353127978, "grad_norm": 30855.62890625, "learning_rate": 3.98396316290886e-05, "loss": 0.8356, "step": 6400 }, { "epoch": 2.048269291838679, "grad_norm": 31825.283203125, "learning_rate": 3.9760241346459194e-05, "loss": 0.8342, "step": 6450 }, { "epoch": 2.06414734836456, "grad_norm": 32278.908203125, "learning_rate": 3.968085106382979e-05, "loss": 0.8337, "step": 6500 }, { "epoch": 2.080025404890441, "grad_norm": 31714.283203125, "learning_rate": 3.960146078120038e-05, "loss": 0.8348, "step": 6550 }, { "epoch": 2.0959034614163228, "grad_norm": 31567.390625, "learning_rate": 3.9522070498570976e-05, "loss": 0.8281, "step": 6600 }, { "epoch": 2.111781517942204, "grad_norm": 32440.802734375, "learning_rate": 3.944268021594157e-05, "loss": 0.831, "step": 6650 }, { "epoch": 2.127659574468085, "grad_norm": 32060.837890625, "learning_rate": 3.936328993331216e-05, "loss": 0.831, "step": 6700 }, { "epoch": 2.143537630993966, "grad_norm": 31656.82421875, "learning_rate": 3.928389965068276e-05, "loss": 0.829, "step": 6750 }, { "epoch": 2.1594156875198474, "grad_norm": 31442.107421875, "learning_rate": 3.920450936805336e-05, "loss": 0.8266, "step": 6800 }, { "epoch": 2.175293744045729, "grad_norm": 31876.103515625, "learning_rate": 3.9125119085423945e-05, "loss": 0.8256, "step": 6850 }, { "epoch": 2.19117180057161, "grad_norm": 31034.79296875, "learning_rate": 3.904572880279454e-05, "loss": 0.8217, "step": 6900 }, { "epoch": 2.2070498570974912, "grad_norm": 32682.744140625, "learning_rate": 3.896633852016513e-05, "loss": 0.8301, "step": 6950 }, { "epoch": 2.2229279136233724, "grad_norm": 32618.359375, "learning_rate": 3.8886948237535726e-05, "loss": 0.8211, "step": 7000 }, { "epoch": 2.2388059701492535, "grad_norm": 30968.298828125, "learning_rate": 3.880755795490632e-05, "loss": 0.825, "step": 7050 }, { "epoch": 2.254684026675135, "grad_norm": 30861.9921875, "learning_rate": 3.8728167672276914e-05, "loss": 0.8201, "step": 7100 }, { "epoch": 2.2705620832010163, "grad_norm": 30823.109375, "learning_rate": 3.8648777389647514e-05, "loss": 0.8197, "step": 7150 }, { "epoch": 2.2864401397268974, "grad_norm": 30141.990234375, "learning_rate": 3.85693871070181e-05, "loss": 0.8224, "step": 7200 }, { "epoch": 2.3023181962527786, "grad_norm": 34504.34375, "learning_rate": 3.8489996824388695e-05, "loss": 0.821, "step": 7250 }, { "epoch": 2.3181962527786597, "grad_norm": 30797.966796875, "learning_rate": 3.841060654175929e-05, "loss": 0.8203, "step": 7300 }, { "epoch": 2.3340743093045413, "grad_norm": 30129.06640625, "learning_rate": 3.833121625912988e-05, "loss": 0.8178, "step": 7350 }, { "epoch": 2.3499523658304224, "grad_norm": 30087.42578125, "learning_rate": 3.825182597650048e-05, "loss": 0.8219, "step": 7400 }, { "epoch": 2.3658304223563036, "grad_norm": 30917.18359375, "learning_rate": 3.817243569387107e-05, "loss": 0.8163, "step": 7450 }, { "epoch": 2.3817084788821847, "grad_norm": 33235.78515625, "learning_rate": 3.809304541124167e-05, "loss": 0.8149, "step": 7500 }, { "epoch": 2.397586535408066, "grad_norm": 30538.533203125, "learning_rate": 3.801365512861226e-05, "loss": 0.818, "step": 7550 }, { "epoch": 2.4134645919339475, "grad_norm": 31500.251953125, "learning_rate": 3.793426484598285e-05, "loss": 0.8171, "step": 7600 }, { "epoch": 2.4293426484598286, "grad_norm": 30612.111328125, "learning_rate": 3.7854874563353446e-05, "loss": 0.8132, "step": 7650 }, { "epoch": 2.4452207049857098, "grad_norm": 30479.9140625, "learning_rate": 3.7775484280724046e-05, "loss": 0.8147, "step": 7700 }, { "epoch": 2.461098761511591, "grad_norm": 30102.48046875, "learning_rate": 3.769609399809463e-05, "loss": 0.8112, "step": 7750 }, { "epoch": 2.476976818037472, "grad_norm": 30969.11328125, "learning_rate": 3.761670371546523e-05, "loss": 0.8161, "step": 7800 }, { "epoch": 2.4928548745633536, "grad_norm": 30215.08203125, "learning_rate": 3.753731343283583e-05, "loss": 0.8118, "step": 7850 }, { "epoch": 2.508732931089235, "grad_norm": 30152.841796875, "learning_rate": 3.7457923150206415e-05, "loss": 0.813, "step": 7900 }, { "epoch": 2.524610987615116, "grad_norm": 30816.564453125, "learning_rate": 3.737853286757701e-05, "loss": 0.8124, "step": 7950 }, { "epoch": 2.540489044140997, "grad_norm": 31137.095703125, "learning_rate": 3.72991425849476e-05, "loss": 0.8105, "step": 8000 }, { "epoch": 2.5563671006668782, "grad_norm": 30815.490234375, "learning_rate": 3.72197523023182e-05, "loss": 0.8117, "step": 8050 }, { "epoch": 2.57224515719276, "grad_norm": 31679.986328125, "learning_rate": 3.714036201968879e-05, "loss": 0.808, "step": 8100 }, { "epoch": 2.588123213718641, "grad_norm": 30918.755859375, "learning_rate": 3.7060971737059384e-05, "loss": 0.807, "step": 8150 }, { "epoch": 2.604001270244522, "grad_norm": 30721.857421875, "learning_rate": 3.6981581454429984e-05, "loss": 0.8077, "step": 8200 }, { "epoch": 2.6198793267704032, "grad_norm": 30369.298828125, "learning_rate": 3.690219117180057e-05, "loss": 0.8067, "step": 8250 }, { "epoch": 2.6357573832962844, "grad_norm": 30493.7421875, "learning_rate": 3.6822800889171165e-05, "loss": 0.8063, "step": 8300 }, { "epoch": 2.651635439822166, "grad_norm": 30101.5859375, "learning_rate": 3.674341060654176e-05, "loss": 0.8065, "step": 8350 }, { "epoch": 2.667513496348047, "grad_norm": 30403.134765625, "learning_rate": 3.666402032391236e-05, "loss": 0.8023, "step": 8400 }, { "epoch": 2.6833915528739283, "grad_norm": 30230.22265625, "learning_rate": 3.658463004128295e-05, "loss": 0.8025, "step": 8450 }, { "epoch": 2.6992696093998094, "grad_norm": 30146.58203125, "learning_rate": 3.650523975865354e-05, "loss": 0.8035, "step": 8500 }, { "epoch": 2.7151476659256906, "grad_norm": 30205.19921875, "learning_rate": 3.6425849476024134e-05, "loss": 0.8023, "step": 8550 }, { "epoch": 2.731025722451572, "grad_norm": 29393.84375, "learning_rate": 3.6346459193394735e-05, "loss": 0.7985, "step": 8600 }, { "epoch": 2.7469037789774533, "grad_norm": 30105.896484375, "learning_rate": 3.626706891076532e-05, "loss": 0.798, "step": 8650 }, { "epoch": 2.7627818355033344, "grad_norm": 31707.62109375, "learning_rate": 3.6187678628135916e-05, "loss": 0.8019, "step": 8700 }, { "epoch": 2.7786598920292156, "grad_norm": 30057.185546875, "learning_rate": 3.6108288345506516e-05, "loss": 0.8019, "step": 8750 }, { "epoch": 2.7945379485550967, "grad_norm": 31893.216796875, "learning_rate": 3.60288980628771e-05, "loss": 0.8004, "step": 8800 }, { "epoch": 2.8104160050809783, "grad_norm": 29928.91015625, "learning_rate": 3.59495077802477e-05, "loss": 0.7989, "step": 8850 }, { "epoch": 2.8262940616068595, "grad_norm": 29859.1015625, "learning_rate": 3.587011749761829e-05, "loss": 0.7987, "step": 8900 }, { "epoch": 2.8421721181327406, "grad_norm": 29658.259765625, "learning_rate": 3.579072721498889e-05, "loss": 0.8001, "step": 8950 }, { "epoch": 2.8580501746586218, "grad_norm": 29646.7421875, "learning_rate": 3.571133693235948e-05, "loss": 0.7946, "step": 9000 }, { "epoch": 2.873928231184503, "grad_norm": 29755.7890625, "learning_rate": 3.563194664973007e-05, "loss": 0.7983, "step": 9050 }, { "epoch": 2.8898062877103845, "grad_norm": 29201.306640625, "learning_rate": 3.555255636710067e-05, "loss": 0.7966, "step": 9100 }, { "epoch": 2.905684344236265, "grad_norm": 30059.353515625, "learning_rate": 3.547316608447126e-05, "loss": 0.7968, "step": 9150 }, { "epoch": 2.921562400762147, "grad_norm": 29836.736328125, "learning_rate": 3.5393775801841854e-05, "loss": 0.7915, "step": 9200 }, { "epoch": 2.937440457288028, "grad_norm": 29474.76171875, "learning_rate": 3.531438551921245e-05, "loss": 0.7951, "step": 9250 }, { "epoch": 2.953318513813909, "grad_norm": 29720.64453125, "learning_rate": 3.523499523658305e-05, "loss": 0.7973, "step": 9300 }, { "epoch": 2.9691965703397907, "grad_norm": 30905.662109375, "learning_rate": 3.5155604953953635e-05, "loss": 0.796, "step": 9350 }, { "epoch": 2.9850746268656714, "grad_norm": 29326.87109375, "learning_rate": 3.507621467132423e-05, "loss": 0.7943, "step": 9400 }, { "epoch": 3.000952683391553, "grad_norm": 29983.439453125, "learning_rate": 3.499682438869483e-05, "loss": 0.7913, "step": 9450 }, { "epoch": 3.016830739917434, "grad_norm": 31416.537109375, "learning_rate": 3.4917434106065423e-05, "loss": 0.7875, "step": 9500 }, { "epoch": 3.0327087964433153, "grad_norm": 29326.58984375, "learning_rate": 3.483804382343601e-05, "loss": 0.788, "step": 9550 }, { "epoch": 3.0485868529691964, "grad_norm": 30804.06640625, "learning_rate": 3.4758653540806604e-05, "loss": 0.7874, "step": 9600 }, { "epoch": 3.064464909495078, "grad_norm": 30785.3125, "learning_rate": 3.4679263258177205e-05, "loss": 0.7889, "step": 9650 }, { "epoch": 3.080342966020959, "grad_norm": 29545.892578125, "learning_rate": 3.459987297554779e-05, "loss": 0.786, "step": 9700 }, { "epoch": 3.0962210225468403, "grad_norm": 29333.875, "learning_rate": 3.4520482692918386e-05, "loss": 0.7869, "step": 9750 }, { "epoch": 3.1120990790727214, "grad_norm": 29140.724609375, "learning_rate": 3.4441092410288986e-05, "loss": 0.7848, "step": 9800 }, { "epoch": 3.1279771355986026, "grad_norm": 29844.708984375, "learning_rate": 3.436170212765958e-05, "loss": 0.7833, "step": 9850 }, { "epoch": 3.143855192124484, "grad_norm": 29902.115234375, "learning_rate": 3.428231184503017e-05, "loss": 0.7868, "step": 9900 }, { "epoch": 3.1597332486503653, "grad_norm": 29875.98828125, "learning_rate": 3.420292156240076e-05, "loss": 0.7824, "step": 9950 }, { "epoch": 3.1756113051762465, "grad_norm": 29536.19921875, "learning_rate": 3.412353127977136e-05, "loss": 0.7819, "step": 10000 }, { "epoch": 3.1914893617021276, "grad_norm": 29816.875, "learning_rate": 3.404414099714195e-05, "loss": 0.7804, "step": 10050 }, { "epoch": 3.2073674182280087, "grad_norm": 30143.560546875, "learning_rate": 3.396475071451254e-05, "loss": 0.7819, "step": 10100 }, { "epoch": 3.22324547475389, "grad_norm": 29888.271484375, "learning_rate": 3.388536043188314e-05, "loss": 0.7817, "step": 10150 }, { "epoch": 3.2391235312797715, "grad_norm": 29966.201171875, "learning_rate": 3.380597014925374e-05, "loss": 0.7809, "step": 10200 }, { "epoch": 3.2550015878056526, "grad_norm": 30786.470703125, "learning_rate": 3.3726579866624324e-05, "loss": 0.7765, "step": 10250 }, { "epoch": 3.2708796443315338, "grad_norm": 30272.826171875, "learning_rate": 3.364718958399492e-05, "loss": 0.7806, "step": 10300 }, { "epoch": 3.286757700857415, "grad_norm": 29083.400390625, "learning_rate": 3.356779930136552e-05, "loss": 0.7799, "step": 10350 }, { "epoch": 3.302635757383296, "grad_norm": 29236.140625, "learning_rate": 3.348840901873611e-05, "loss": 0.7801, "step": 10400 }, { "epoch": 3.3185138139091777, "grad_norm": 29460.9296875, "learning_rate": 3.34090187361067e-05, "loss": 0.7779, "step": 10450 }, { "epoch": 3.334391870435059, "grad_norm": 29783.908203125, "learning_rate": 3.332962845347729e-05, "loss": 0.7782, "step": 10500 }, { "epoch": 3.35026992696094, "grad_norm": 30124.951171875, "learning_rate": 3.3250238170847893e-05, "loss": 0.7778, "step": 10550 }, { "epoch": 3.366147983486821, "grad_norm": 29516.72265625, "learning_rate": 3.317084788821848e-05, "loss": 0.7771, "step": 10600 }, { "epoch": 3.3820260400127022, "grad_norm": 29329.431640625, "learning_rate": 3.3091457605589074e-05, "loss": 0.7777, "step": 10650 }, { "epoch": 3.397904096538584, "grad_norm": 29930.46875, "learning_rate": 3.3012067322959675e-05, "loss": 0.7776, "step": 10700 }, { "epoch": 3.413782153064465, "grad_norm": 30200.29296875, "learning_rate": 3.293267704033027e-05, "loss": 0.7774, "step": 10750 }, { "epoch": 3.429660209590346, "grad_norm": 29434.279296875, "learning_rate": 3.2853286757700856e-05, "loss": 0.7794, "step": 10800 }, { "epoch": 3.4455382661162273, "grad_norm": 29533.7265625, "learning_rate": 3.277389647507145e-05, "loss": 0.78, "step": 10850 }, { "epoch": 3.4614163226421084, "grad_norm": 30062.75390625, "learning_rate": 3.269450619244205e-05, "loss": 0.7715, "step": 10900 }, { "epoch": 3.47729437916799, "grad_norm": 30203.49609375, "learning_rate": 3.261511590981264e-05, "loss": 0.7754, "step": 10950 }, { "epoch": 3.493172435693871, "grad_norm": 29971.337890625, "learning_rate": 3.253572562718323e-05, "loss": 0.7743, "step": 11000 }, { "epoch": 3.5090504922197523, "grad_norm": 30626.96875, "learning_rate": 3.245633534455383e-05, "loss": 0.7722, "step": 11050 }, { "epoch": 3.5249285487456334, "grad_norm": 29813.41796875, "learning_rate": 3.2376945061924425e-05, "loss": 0.7762, "step": 11100 }, { "epoch": 3.5408066052715146, "grad_norm": 30018.158203125, "learning_rate": 3.229755477929501e-05, "loss": 0.7708, "step": 11150 }, { "epoch": 3.556684661797396, "grad_norm": 28745.65625, "learning_rate": 3.2218164496665606e-05, "loss": 0.7703, "step": 11200 }, { "epoch": 3.5725627183232773, "grad_norm": 29827.396484375, "learning_rate": 3.213877421403621e-05, "loss": 0.7731, "step": 11250 }, { "epoch": 3.5884407748491585, "grad_norm": 29104.228515625, "learning_rate": 3.20593839314068e-05, "loss": 0.7697, "step": 11300 }, { "epoch": 3.6043188313750396, "grad_norm": 29951.318359375, "learning_rate": 3.197999364877739e-05, "loss": 0.7719, "step": 11350 }, { "epoch": 3.6201968879009208, "grad_norm": 29854.1171875, "learning_rate": 3.190060336614799e-05, "loss": 0.7707, "step": 11400 }, { "epoch": 3.6360749444268023, "grad_norm": 29064.115234375, "learning_rate": 3.182121308351858e-05, "loss": 0.7679, "step": 11450 }, { "epoch": 3.6519530009526835, "grad_norm": 29561.064453125, "learning_rate": 3.174182280088917e-05, "loss": 0.773, "step": 11500 }, { "epoch": 3.6678310574785646, "grad_norm": 30519.935546875, "learning_rate": 3.166243251825976e-05, "loss": 0.7715, "step": 11550 }, { "epoch": 3.683709114004446, "grad_norm": 29498.1796875, "learning_rate": 3.1583042235630363e-05, "loss": 0.7696, "step": 11600 }, { "epoch": 3.699587170530327, "grad_norm": 29892.5, "learning_rate": 3.150365195300096e-05, "loss": 0.7722, "step": 11650 }, { "epoch": 3.7154652270562085, "grad_norm": 29588.62890625, "learning_rate": 3.1424261670371544e-05, "loss": 0.7695, "step": 11700 }, { "epoch": 3.7313432835820897, "grad_norm": 28297.962890625, "learning_rate": 3.1344871387742145e-05, "loss": 0.767, "step": 11750 }, { "epoch": 3.747221340107971, "grad_norm": 29926.146484375, "learning_rate": 3.126548110511274e-05, "loss": 0.7686, "step": 11800 }, { "epoch": 3.763099396633852, "grad_norm": 29238.9375, "learning_rate": 3.1186090822483326e-05, "loss": 0.7657, "step": 11850 }, { "epoch": 3.778977453159733, "grad_norm": 29130.10546875, "learning_rate": 3.110670053985392e-05, "loss": 0.7664, "step": 11900 }, { "epoch": 3.7948555096856147, "grad_norm": 28701.7890625, "learning_rate": 3.102731025722452e-05, "loss": 0.7658, "step": 11950 }, { "epoch": 3.810733566211496, "grad_norm": 29051.44140625, "learning_rate": 3.0947919974595114e-05, "loss": 0.7693, "step": 12000 }, { "epoch": 3.826611622737377, "grad_norm": 29298.595703125, "learning_rate": 3.08685296919657e-05, "loss": 0.767, "step": 12050 }, { "epoch": 3.842489679263258, "grad_norm": 28832.44921875, "learning_rate": 3.07891394093363e-05, "loss": 0.7655, "step": 12100 }, { "epoch": 3.8583677357891393, "grad_norm": 28990.67578125, "learning_rate": 3.0709749126706895e-05, "loss": 0.7618, "step": 12150 }, { "epoch": 3.874245792315021, "grad_norm": 28622.34765625, "learning_rate": 3.063035884407749e-05, "loss": 0.7664, "step": 12200 }, { "epoch": 3.890123848840902, "grad_norm": 29283.064453125, "learning_rate": 3.0550968561448076e-05, "loss": 0.7678, "step": 12250 }, { "epoch": 3.906001905366783, "grad_norm": 29989.224609375, "learning_rate": 3.0471578278818673e-05, "loss": 0.7631, "step": 12300 }, { "epoch": 3.9218799618926643, "grad_norm": 30515.478515625, "learning_rate": 3.039218799618927e-05, "loss": 0.7634, "step": 12350 }, { "epoch": 3.9377580184185454, "grad_norm": 29057.341796875, "learning_rate": 3.031279771355986e-05, "loss": 0.7649, "step": 12400 }, { "epoch": 3.953636074944427, "grad_norm": 29256.357421875, "learning_rate": 3.0233407430930455e-05, "loss": 0.7617, "step": 12450 }, { "epoch": 3.969514131470308, "grad_norm": 29832.517578125, "learning_rate": 3.015401714830105e-05, "loss": 0.762, "step": 12500 }, { "epoch": 3.9853921879961893, "grad_norm": 30056.580078125, "learning_rate": 3.0074626865671646e-05, "loss": 0.7635, "step": 12550 }, { "epoch": 4.001270244522071, "grad_norm": 30290.673828125, "learning_rate": 2.9995236583042236e-05, "loss": 0.761, "step": 12600 }, { "epoch": 4.017148301047952, "grad_norm": 30393.2421875, "learning_rate": 2.991584630041283e-05, "loss": 0.7576, "step": 12650 }, { "epoch": 4.033026357573833, "grad_norm": 29898.7734375, "learning_rate": 2.9836456017783427e-05, "loss": 0.7592, "step": 12700 }, { "epoch": 4.048904414099714, "grad_norm": 29665.693359375, "learning_rate": 2.9757065735154018e-05, "loss": 0.7564, "step": 12750 }, { "epoch": 4.0647824706255955, "grad_norm": 29677.63671875, "learning_rate": 2.967767545252461e-05, "loss": 0.7564, "step": 12800 }, { "epoch": 4.080660527151477, "grad_norm": 29343.1328125, "learning_rate": 2.9598285169895205e-05, "loss": 0.7589, "step": 12850 }, { "epoch": 4.096538583677358, "grad_norm": 28720.55078125, "learning_rate": 2.9518894887265802e-05, "loss": 0.7543, "step": 12900 }, { "epoch": 4.112416640203239, "grad_norm": 28604.630859375, "learning_rate": 2.9439504604636393e-05, "loss": 0.7554, "step": 12950 }, { "epoch": 4.12829469672912, "grad_norm": 29508.154296875, "learning_rate": 2.9360114322006987e-05, "loss": 0.7555, "step": 13000 }, { "epoch": 4.144172753255002, "grad_norm": 28996.201171875, "learning_rate": 2.9280724039377584e-05, "loss": 0.7552, "step": 13050 }, { "epoch": 4.160050809780882, "grad_norm": 31208.623046875, "learning_rate": 2.9201333756748178e-05, "loss": 0.7584, "step": 13100 }, { "epoch": 4.175928866306764, "grad_norm": 29778.853515625, "learning_rate": 2.9121943474118768e-05, "loss": 0.7549, "step": 13150 }, { "epoch": 4.1918069228326456, "grad_norm": 29337.587890625, "learning_rate": 2.9042553191489362e-05, "loss": 0.7546, "step": 13200 }, { "epoch": 4.207684979358526, "grad_norm": 28682.619140625, "learning_rate": 2.896316290885996e-05, "loss": 0.749, "step": 13250 }, { "epoch": 4.223563035884408, "grad_norm": 29200.912109375, "learning_rate": 2.888377262623055e-05, "loss": 0.7545, "step": 13300 }, { "epoch": 4.2394410924102885, "grad_norm": 30967.3515625, "learning_rate": 2.8804382343601143e-05, "loss": 0.7558, "step": 13350 }, { "epoch": 4.25531914893617, "grad_norm": 30665.638671875, "learning_rate": 2.872499206097174e-05, "loss": 0.7532, "step": 13400 }, { "epoch": 4.271197205462052, "grad_norm": 28959.0625, "learning_rate": 2.8645601778342334e-05, "loss": 0.7553, "step": 13450 }, { "epoch": 4.287075261987932, "grad_norm": 29186.98046875, "learning_rate": 2.8566211495712925e-05, "loss": 0.7535, "step": 13500 }, { "epoch": 4.302953318513814, "grad_norm": 29713.626953125, "learning_rate": 2.848682121308352e-05, "loss": 0.7526, "step": 13550 }, { "epoch": 4.318831375039695, "grad_norm": 30507.126953125, "learning_rate": 2.8407430930454116e-05, "loss": 0.7557, "step": 13600 }, { "epoch": 4.334709431565576, "grad_norm": 28860.220703125, "learning_rate": 2.8328040647824706e-05, "loss": 0.7497, "step": 13650 }, { "epoch": 4.350587488091458, "grad_norm": 28531.17578125, "learning_rate": 2.82486503651953e-05, "loss": 0.7536, "step": 13700 }, { "epoch": 4.366465544617339, "grad_norm": 29637.4296875, "learning_rate": 2.8169260082565897e-05, "loss": 0.7536, "step": 13750 }, { "epoch": 4.38234360114322, "grad_norm": 29468.94140625, "learning_rate": 2.808986979993649e-05, "loss": 0.7517, "step": 13800 }, { "epoch": 4.398221657669101, "grad_norm": 29931.30859375, "learning_rate": 2.801047951730708e-05, "loss": 0.7514, "step": 13850 }, { "epoch": 4.4140997141949825, "grad_norm": 29154.953125, "learning_rate": 2.7931089234677675e-05, "loss": 0.7545, "step": 13900 }, { "epoch": 4.429977770720864, "grad_norm": 29288.595703125, "learning_rate": 2.7851698952048273e-05, "loss": 0.7535, "step": 13950 }, { "epoch": 4.445855827246745, "grad_norm": 29386.255859375, "learning_rate": 2.7772308669418866e-05, "loss": 0.7517, "step": 14000 }, { "epoch": 4.461733883772626, "grad_norm": 28722.88671875, "learning_rate": 2.7692918386789457e-05, "loss": 0.7482, "step": 14050 }, { "epoch": 4.477611940298507, "grad_norm": 29279.75, "learning_rate": 2.7613528104160054e-05, "loss": 0.7481, "step": 14100 }, { "epoch": 4.493489996824389, "grad_norm": 29140.587890625, "learning_rate": 2.7534137821530648e-05, "loss": 0.7462, "step": 14150 }, { "epoch": 4.50936805335027, "grad_norm": 28993.6484375, "learning_rate": 2.7454747538901238e-05, "loss": 0.7473, "step": 14200 }, { "epoch": 4.525246109876151, "grad_norm": 30390.90234375, "learning_rate": 2.7375357256271832e-05, "loss": 0.7515, "step": 14250 }, { "epoch": 4.5411241664020325, "grad_norm": 29218.51953125, "learning_rate": 2.729596697364243e-05, "loss": 0.7471, "step": 14300 }, { "epoch": 4.557002222927913, "grad_norm": 30701.79296875, "learning_rate": 2.7216576691013023e-05, "loss": 0.7487, "step": 14350 }, { "epoch": 4.572880279453795, "grad_norm": 29310.14453125, "learning_rate": 2.7137186408383613e-05, "loss": 0.7492, "step": 14400 }, { "epoch": 4.588758335979676, "grad_norm": 29325.59375, "learning_rate": 2.705779612575421e-05, "loss": 0.7479, "step": 14450 }, { "epoch": 4.604636392505557, "grad_norm": 29217.953125, "learning_rate": 2.6978405843124804e-05, "loss": 0.7512, "step": 14500 }, { "epoch": 4.620514449031439, "grad_norm": 29466.189453125, "learning_rate": 2.6899015560495395e-05, "loss": 0.7477, "step": 14550 }, { "epoch": 4.636392505557319, "grad_norm": 28055.72265625, "learning_rate": 2.681962527786599e-05, "loss": 0.749, "step": 14600 }, { "epoch": 4.652270562083201, "grad_norm": 28245.34375, "learning_rate": 2.6740234995236586e-05, "loss": 0.7479, "step": 14650 }, { "epoch": 4.668148618609083, "grad_norm": 29280.736328125, "learning_rate": 2.666084471260718e-05, "loss": 0.7484, "step": 14700 }, { "epoch": 4.684026675134963, "grad_norm": 29603.513671875, "learning_rate": 2.658145442997777e-05, "loss": 0.7499, "step": 14750 }, { "epoch": 4.699904731660845, "grad_norm": 28774.55078125, "learning_rate": 2.6502064147348364e-05, "loss": 0.7463, "step": 14800 }, { "epoch": 4.715782788186726, "grad_norm": 29580.22265625, "learning_rate": 2.642267386471896e-05, "loss": 0.7424, "step": 14850 }, { "epoch": 4.731660844712607, "grad_norm": 28998.916015625, "learning_rate": 2.6343283582089555e-05, "loss": 0.7457, "step": 14900 }, { "epoch": 4.747538901238489, "grad_norm": 28983.86328125, "learning_rate": 2.6263893299460145e-05, "loss": 0.7412, "step": 14950 }, { "epoch": 4.7634169577643695, "grad_norm": 30523.78125, "learning_rate": 2.6184503016830743e-05, "loss": 0.7468, "step": 15000 }, { "epoch": 4.779295014290251, "grad_norm": 29167.177734375, "learning_rate": 2.6105112734201336e-05, "loss": 0.7435, "step": 15050 }, { "epoch": 4.795173070816132, "grad_norm": 29224.529296875, "learning_rate": 2.6025722451571927e-05, "loss": 0.7409, "step": 15100 }, { "epoch": 4.811051127342013, "grad_norm": 28679.3046875, "learning_rate": 2.594633216894252e-05, "loss": 0.7394, "step": 15150 }, { "epoch": 4.826929183867895, "grad_norm": 29085.837890625, "learning_rate": 2.5866941886313118e-05, "loss": 0.745, "step": 15200 }, { "epoch": 4.842807240393776, "grad_norm": 28241.09765625, "learning_rate": 2.578755160368371e-05, "loss": 0.7456, "step": 15250 }, { "epoch": 4.858685296919657, "grad_norm": 29162.150390625, "learning_rate": 2.5708161321054302e-05, "loss": 0.7428, "step": 15300 }, { "epoch": 4.874563353445538, "grad_norm": 29517.7578125, "learning_rate": 2.56287710384249e-05, "loss": 0.7418, "step": 15350 }, { "epoch": 4.8904414099714195, "grad_norm": 28757.634765625, "learning_rate": 2.5549380755795493e-05, "loss": 0.7475, "step": 15400 }, { "epoch": 4.906319466497301, "grad_norm": 29112.775390625, "learning_rate": 2.5469990473166083e-05, "loss": 0.742, "step": 15450 }, { "epoch": 4.922197523023182, "grad_norm": 29536.763671875, "learning_rate": 2.5390600190536677e-05, "loss": 0.7423, "step": 15500 }, { "epoch": 4.938075579549063, "grad_norm": 29418.62890625, "learning_rate": 2.5311209907907274e-05, "loss": 0.7445, "step": 15550 }, { "epoch": 4.953953636074944, "grad_norm": 29107.703125, "learning_rate": 2.5231819625277868e-05, "loss": 0.7421, "step": 15600 }, { "epoch": 4.969831692600826, "grad_norm": 28584.513671875, "learning_rate": 2.515242934264846e-05, "loss": 0.7416, "step": 15650 }, { "epoch": 4.985709749126707, "grad_norm": 29202.330078125, "learning_rate": 2.5073039060019056e-05, "loss": 0.7404, "step": 15700 }, { "epoch": 5.001587805652588, "grad_norm": 28508.923828125, "learning_rate": 2.499364877738965e-05, "loss": 0.7401, "step": 15750 }, { "epoch": 5.01746586217847, "grad_norm": 29855.150390625, "learning_rate": 2.4914258494760243e-05, "loss": 0.7377, "step": 15800 }, { "epoch": 5.03334391870435, "grad_norm": 28641.822265625, "learning_rate": 2.4834868212130834e-05, "loss": 0.7373, "step": 15850 }, { "epoch": 5.049221975230232, "grad_norm": 29080.0234375, "learning_rate": 2.475547792950143e-05, "loss": 0.7387, "step": 15900 }, { "epoch": 5.0651000317561135, "grad_norm": 29752.501953125, "learning_rate": 2.467608764687202e-05, "loss": 0.7364, "step": 15950 }, { "epoch": 5.080978088281994, "grad_norm": 29244.943359375, "learning_rate": 2.459669736424262e-05, "loss": 0.7386, "step": 16000 }, { "epoch": 5.096856144807876, "grad_norm": 30756.474609375, "learning_rate": 2.4517307081613213e-05, "loss": 0.7365, "step": 16050 }, { "epoch": 5.1127342013337564, "grad_norm": 29783.80078125, "learning_rate": 2.4437916798983806e-05, "loss": 0.7345, "step": 16100 }, { "epoch": 5.128612257859638, "grad_norm": 28871.20703125, "learning_rate": 2.43585265163544e-05, "loss": 0.7385, "step": 16150 }, { "epoch": 5.14449031438552, "grad_norm": 29666.92578125, "learning_rate": 2.4279136233724994e-05, "loss": 0.7364, "step": 16200 }, { "epoch": 5.1603683709114, "grad_norm": 27965.49609375, "learning_rate": 2.4199745951095588e-05, "loss": 0.7349, "step": 16250 }, { "epoch": 5.176246427437282, "grad_norm": 29663.21484375, "learning_rate": 2.4120355668466178e-05, "loss": 0.7336, "step": 16300 }, { "epoch": 5.192124483963163, "grad_norm": 29286.236328125, "learning_rate": 2.4040965385836775e-05, "loss": 0.7353, "step": 16350 }, { "epoch": 5.208002540489044, "grad_norm": 28999.544921875, "learning_rate": 2.396157510320737e-05, "loss": 0.7364, "step": 16400 }, { "epoch": 5.223880597014926, "grad_norm": 29259.37890625, "learning_rate": 2.3882184820577963e-05, "loss": 0.7386, "step": 16450 }, { "epoch": 5.2397586535408065, "grad_norm": 29506.087890625, "learning_rate": 2.3802794537948557e-05, "loss": 0.734, "step": 16500 }, { "epoch": 5.255636710066688, "grad_norm": 29443.67578125, "learning_rate": 2.372340425531915e-05, "loss": 0.7408, "step": 16550 }, { "epoch": 5.271514766592569, "grad_norm": 29926.109375, "learning_rate": 2.3644013972689744e-05, "loss": 0.7368, "step": 16600 }, { "epoch": 5.28739282311845, "grad_norm": 30687.26953125, "learning_rate": 2.3564623690060338e-05, "loss": 0.7345, "step": 16650 }, { "epoch": 5.303270879644332, "grad_norm": 28573.580078125, "learning_rate": 2.3485233407430932e-05, "loss": 0.7348, "step": 16700 }, { "epoch": 5.319148936170213, "grad_norm": 30217.728515625, "learning_rate": 2.3405843124801526e-05, "loss": 0.7348, "step": 16750 }, { "epoch": 5.335026992696094, "grad_norm": 28830.787109375, "learning_rate": 2.332645284217212e-05, "loss": 0.7389, "step": 16800 }, { "epoch": 5.350905049221975, "grad_norm": 29589.51953125, "learning_rate": 2.3247062559542713e-05, "loss": 0.7364, "step": 16850 }, { "epoch": 5.3667831057478566, "grad_norm": 28568.32421875, "learning_rate": 2.3167672276913307e-05, "loss": 0.7348, "step": 16900 }, { "epoch": 5.382661162273738, "grad_norm": 30656.90234375, "learning_rate": 2.30882819942839e-05, "loss": 0.7346, "step": 16950 }, { "epoch": 5.398539218799619, "grad_norm": 29044.07421875, "learning_rate": 2.3008891711654495e-05, "loss": 0.7363, "step": 17000 }, { "epoch": 5.4144172753255, "grad_norm": 29246.9765625, "learning_rate": 2.292950142902509e-05, "loss": 0.7376, "step": 17050 }, { "epoch": 5.430295331851381, "grad_norm": 29861.26171875, "learning_rate": 2.2850111146395683e-05, "loss": 0.7322, "step": 17100 }, { "epoch": 5.446173388377263, "grad_norm": 29695.34765625, "learning_rate": 2.2770720863766276e-05, "loss": 0.7324, "step": 17150 }, { "epoch": 5.462051444903144, "grad_norm": 29459.607421875, "learning_rate": 2.269133058113687e-05, "loss": 0.733, "step": 17200 }, { "epoch": 5.477929501429025, "grad_norm": 28512.21484375, "learning_rate": 2.2611940298507464e-05, "loss": 0.7274, "step": 17250 }, { "epoch": 5.493807557954907, "grad_norm": 29732.572265625, "learning_rate": 2.2532550015878058e-05, "loss": 0.7333, "step": 17300 }, { "epoch": 5.509685614480787, "grad_norm": 29110.30859375, "learning_rate": 2.245315973324865e-05, "loss": 0.7344, "step": 17350 }, { "epoch": 5.525563671006669, "grad_norm": 29294.5859375, "learning_rate": 2.2373769450619245e-05, "loss": 0.7292, "step": 17400 }, { "epoch": 5.5414417275325505, "grad_norm": 28999.29296875, "learning_rate": 2.229437916798984e-05, "loss": 0.7322, "step": 17450 }, { "epoch": 5.557319784058431, "grad_norm": 28789.3359375, "learning_rate": 2.2214988885360433e-05, "loss": 0.7334, "step": 17500 }, { "epoch": 5.573197840584313, "grad_norm": 29767.94140625, "learning_rate": 2.2135598602731027e-05, "loss": 0.7324, "step": 17550 }, { "epoch": 5.5890758971101935, "grad_norm": 29029.384765625, "learning_rate": 2.205620832010162e-05, "loss": 0.7318, "step": 17600 }, { "epoch": 5.604953953636075, "grad_norm": 29995.126953125, "learning_rate": 2.1976818037472214e-05, "loss": 0.7332, "step": 17650 }, { "epoch": 5.620832010161957, "grad_norm": 29895.734375, "learning_rate": 2.1897427754842808e-05, "loss": 0.7323, "step": 17700 }, { "epoch": 5.636710066687837, "grad_norm": 28998.80078125, "learning_rate": 2.1818037472213402e-05, "loss": 0.7308, "step": 17750 }, { "epoch": 5.652588123213719, "grad_norm": 29375.140625, "learning_rate": 2.1738647189583996e-05, "loss": 0.7307, "step": 17800 }, { "epoch": 5.6684661797396, "grad_norm": 30084.34375, "learning_rate": 2.165925690695459e-05, "loss": 0.7292, "step": 17850 }, { "epoch": 5.684344236265481, "grad_norm": 29009.86328125, "learning_rate": 2.1579866624325183e-05, "loss": 0.7311, "step": 17900 }, { "epoch": 5.700222292791363, "grad_norm": 28996.576171875, "learning_rate": 2.1500476341695777e-05, "loss": 0.731, "step": 17950 }, { "epoch": 5.7161003493172435, "grad_norm": 28874.802734375, "learning_rate": 2.142108605906637e-05, "loss": 0.7301, "step": 18000 }, { "epoch": 5.731978405843125, "grad_norm": 29940.734375, "learning_rate": 2.1341695776436965e-05, "loss": 0.73, "step": 18050 }, { "epoch": 5.747856462369006, "grad_norm": 29607.59765625, "learning_rate": 2.126230549380756e-05, "loss": 0.7311, "step": 18100 }, { "epoch": 5.763734518894887, "grad_norm": 29006.7890625, "learning_rate": 2.1182915211178153e-05, "loss": 0.7301, "step": 18150 }, { "epoch": 5.779612575420768, "grad_norm": 30157.11328125, "learning_rate": 2.1103524928548746e-05, "loss": 0.7281, "step": 18200 }, { "epoch": 5.79549063194665, "grad_norm": 29396.783203125, "learning_rate": 2.102413464591934e-05, "loss": 0.7315, "step": 18250 }, { "epoch": 5.811368688472531, "grad_norm": 29335.1015625, "learning_rate": 2.0944744363289934e-05, "loss": 0.7279, "step": 18300 }, { "epoch": 5.827246744998412, "grad_norm": 28650.416015625, "learning_rate": 2.0865354080660528e-05, "loss": 0.7347, "step": 18350 }, { "epoch": 5.843124801524294, "grad_norm": 29561.095703125, "learning_rate": 2.078596379803112e-05, "loss": 0.7263, "step": 18400 }, { "epoch": 5.859002858050174, "grad_norm": 28661.658203125, "learning_rate": 2.070657351540172e-05, "loss": 0.7314, "step": 18450 }, { "epoch": 5.874880914576056, "grad_norm": 28372.755859375, "learning_rate": 2.062718323277231e-05, "loss": 0.7265, "step": 18500 }, { "epoch": 5.8907589711019375, "grad_norm": 28641.00390625, "learning_rate": 2.0547792950142903e-05, "loss": 0.7296, "step": 18550 }, { "epoch": 5.906637027627818, "grad_norm": 29402.455078125, "learning_rate": 2.0468402667513497e-05, "loss": 0.7257, "step": 18600 }, { "epoch": 5.9225150841537, "grad_norm": 29028.78515625, "learning_rate": 2.038901238488409e-05, "loss": 0.7262, "step": 18650 }, { "epoch": 5.9383931406795805, "grad_norm": 29547.03125, "learning_rate": 2.0309622102254684e-05, "loss": 0.728, "step": 18700 }, { "epoch": 5.954271197205462, "grad_norm": 29396.15234375, "learning_rate": 2.0230231819625278e-05, "loss": 0.725, "step": 18750 }, { "epoch": 5.970149253731344, "grad_norm": 29648.634765625, "learning_rate": 2.0150841536995872e-05, "loss": 0.7264, "step": 18800 }, { "epoch": 5.986027310257224, "grad_norm": 29708.830078125, "learning_rate": 2.0071451254366466e-05, "loss": 0.7263, "step": 18850 }, { "epoch": 6.001905366783106, "grad_norm": 29569.521484375, "learning_rate": 1.9992060971737063e-05, "loss": 0.7249, "step": 18900 }, { "epoch": 6.017783423308987, "grad_norm": 29247.328125, "learning_rate": 1.9912670689107653e-05, "loss": 0.7248, "step": 18950 }, { "epoch": 6.033661479834868, "grad_norm": 29513.408203125, "learning_rate": 1.9833280406478247e-05, "loss": 0.7243, "step": 19000 }, { "epoch": 6.04953953636075, "grad_norm": 29599.560546875, "learning_rate": 1.975389012384884e-05, "loss": 0.7228, "step": 19050 }, { "epoch": 6.0654175928866305, "grad_norm": 28947.3515625, "learning_rate": 1.9674499841219435e-05, "loss": 0.7261, "step": 19100 }, { "epoch": 6.081295649412512, "grad_norm": 28986.390625, "learning_rate": 1.959510955859003e-05, "loss": 0.721, "step": 19150 }, { "epoch": 6.097173705938393, "grad_norm": 29116.353515625, "learning_rate": 1.9515719275960623e-05, "loss": 0.7193, "step": 19200 }, { "epoch": 6.113051762464274, "grad_norm": 29632.84375, "learning_rate": 1.943632899333122e-05, "loss": 0.7228, "step": 19250 }, { "epoch": 6.128929818990156, "grad_norm": 29931.40234375, "learning_rate": 1.935693871070181e-05, "loss": 0.7233, "step": 19300 }, { "epoch": 6.144807875516037, "grad_norm": 29751.517578125, "learning_rate": 1.9277548428072407e-05, "loss": 0.7249, "step": 19350 }, { "epoch": 6.160685932041918, "grad_norm": 30318.1796875, "learning_rate": 1.9198158145442998e-05, "loss": 0.7232, "step": 19400 }, { "epoch": 6.176563988567799, "grad_norm": 28459.033203125, "learning_rate": 1.911876786281359e-05, "loss": 0.7219, "step": 19450 }, { "epoch": 6.192442045093681, "grad_norm": 29604.791015625, "learning_rate": 1.9039377580184185e-05, "loss": 0.7232, "step": 19500 }, { "epoch": 6.208320101619562, "grad_norm": 29528.1484375, "learning_rate": 1.895998729755478e-05, "loss": 0.7192, "step": 19550 }, { "epoch": 6.224198158145443, "grad_norm": 28977.87109375, "learning_rate": 1.8880597014925376e-05, "loss": 0.7198, "step": 19600 }, { "epoch": 6.2400762146713245, "grad_norm": 29568.908203125, "learning_rate": 1.8801206732295967e-05, "loss": 0.7231, "step": 19650 }, { "epoch": 6.255954271197205, "grad_norm": 28843.431640625, "learning_rate": 1.8721816449666564e-05, "loss": 0.7188, "step": 19700 }, { "epoch": 6.271832327723087, "grad_norm": 29127.72265625, "learning_rate": 1.8642426167037154e-05, "loss": 0.7218, "step": 19750 }, { "epoch": 6.287710384248968, "grad_norm": 29642.048828125, "learning_rate": 1.856303588440775e-05, "loss": 0.7182, "step": 19800 }, { "epoch": 6.303588440774849, "grad_norm": 29874.93359375, "learning_rate": 1.8483645601778342e-05, "loss": 0.719, "step": 19850 }, { "epoch": 6.319466497300731, "grad_norm": 29941.87890625, "learning_rate": 1.8404255319148936e-05, "loss": 0.7198, "step": 19900 }, { "epoch": 6.335344553826611, "grad_norm": 29865.37890625, "learning_rate": 1.832486503651953e-05, "loss": 0.7249, "step": 19950 }, { "epoch": 6.351222610352493, "grad_norm": 30395.484375, "learning_rate": 1.8245474753890123e-05, "loss": 0.7195, "step": 20000 }, { "epoch": 6.367100666878374, "grad_norm": 30510.9765625, "learning_rate": 1.816608447126072e-05, "loss": 0.7206, "step": 20050 }, { "epoch": 6.382978723404255, "grad_norm": 30179.208984375, "learning_rate": 1.808669418863131e-05, "loss": 0.7208, "step": 20100 }, { "epoch": 6.398856779930137, "grad_norm": 30193.3125, "learning_rate": 1.8007303906001908e-05, "loss": 0.7227, "step": 20150 }, { "epoch": 6.4147348364560175, "grad_norm": 30033.546875, "learning_rate": 1.79279136233725e-05, "loss": 0.7172, "step": 20200 }, { "epoch": 6.430612892981899, "grad_norm": 30468.65625, "learning_rate": 1.7848523340743096e-05, "loss": 0.7215, "step": 20250 }, { "epoch": 6.44649094950778, "grad_norm": 29339.623046875, "learning_rate": 1.7769133058113686e-05, "loss": 0.7174, "step": 20300 }, { "epoch": 6.462369006033661, "grad_norm": 29816.017578125, "learning_rate": 1.768974277548428e-05, "loss": 0.7194, "step": 20350 }, { "epoch": 6.478247062559543, "grad_norm": 29318.12109375, "learning_rate": 1.7610352492854877e-05, "loss": 0.7196, "step": 20400 }, { "epoch": 6.494125119085424, "grad_norm": 29063.029296875, "learning_rate": 1.7530962210225468e-05, "loss": 0.7185, "step": 20450 }, { "epoch": 6.510003175611305, "grad_norm": 30219.3359375, "learning_rate": 1.7451571927596065e-05, "loss": 0.7168, "step": 20500 }, { "epoch": 6.525881232137186, "grad_norm": 29047.94140625, "learning_rate": 1.7372181644966655e-05, "loss": 0.721, "step": 20550 }, { "epoch": 6.5417592886630676, "grad_norm": 28919.94140625, "learning_rate": 1.7292791362337253e-05, "loss": 0.7172, "step": 20600 }, { "epoch": 6.557637345188949, "grad_norm": 30273.14453125, "learning_rate": 1.7213401079707843e-05, "loss": 0.7188, "step": 20650 }, { "epoch": 6.57351540171483, "grad_norm": 30020.40625, "learning_rate": 1.713401079707844e-05, "loss": 0.7184, "step": 20700 }, { "epoch": 6.589393458240711, "grad_norm": 29766.990234375, "learning_rate": 1.705462051444903e-05, "loss": 0.7162, "step": 20750 }, { "epoch": 6.605271514766592, "grad_norm": 29873.83203125, "learning_rate": 1.6975230231819624e-05, "loss": 0.7176, "step": 20800 }, { "epoch": 6.621149571292474, "grad_norm": 29591.2578125, "learning_rate": 1.689583994919022e-05, "loss": 0.715, "step": 20850 }, { "epoch": 6.637027627818355, "grad_norm": 30105.244140625, "learning_rate": 1.6816449666560812e-05, "loss": 0.716, "step": 20900 }, { "epoch": 6.652905684344236, "grad_norm": 29515.470703125, "learning_rate": 1.673705938393141e-05, "loss": 0.7165, "step": 20950 }, { "epoch": 6.668783740870118, "grad_norm": 29495.97265625, "learning_rate": 1.6657669101302e-05, "loss": 0.7157, "step": 21000 }, { "epoch": 6.684661797395998, "grad_norm": 29483.634765625, "learning_rate": 1.6578278818672597e-05, "loss": 0.7207, "step": 21050 }, { "epoch": 6.70053985392188, "grad_norm": 29576.375, "learning_rate": 1.6498888536043187e-05, "loss": 0.7158, "step": 21100 }, { "epoch": 6.7164179104477615, "grad_norm": 29991.36328125, "learning_rate": 1.6419498253413784e-05, "loss": 0.7142, "step": 21150 }, { "epoch": 6.732295966973642, "grad_norm": 30472.951171875, "learning_rate": 1.6340107970784378e-05, "loss": 0.7157, "step": 21200 }, { "epoch": 6.748174023499524, "grad_norm": 29432.845703125, "learning_rate": 1.626071768815497e-05, "loss": 0.7164, "step": 21250 }, { "epoch": 6.7640520800254045, "grad_norm": 30286.595703125, "learning_rate": 1.6181327405525566e-05, "loss": 0.7215, "step": 21300 }, { "epoch": 6.779930136551286, "grad_norm": 29090.42578125, "learning_rate": 1.6101937122896156e-05, "loss": 0.7189, "step": 21350 }, { "epoch": 6.795808193077168, "grad_norm": 29895.060546875, "learning_rate": 1.6022546840266754e-05, "loss": 0.7172, "step": 21400 }, { "epoch": 6.811686249603048, "grad_norm": 29853.85546875, "learning_rate": 1.5943156557637344e-05, "loss": 0.713, "step": 21450 }, { "epoch": 6.82756430612893, "grad_norm": 30071.662109375, "learning_rate": 1.586376627500794e-05, "loss": 0.712, "step": 21500 }, { "epoch": 6.843442362654811, "grad_norm": 28977.619140625, "learning_rate": 1.5784375992378535e-05, "loss": 0.7134, "step": 21550 }, { "epoch": 6.859320419180692, "grad_norm": 29966.154296875, "learning_rate": 1.570498570974913e-05, "loss": 0.7147, "step": 21600 }, { "epoch": 6.875198475706574, "grad_norm": 29842.240234375, "learning_rate": 1.5625595427119723e-05, "loss": 0.7201, "step": 21650 }, { "epoch": 6.8910765322324545, "grad_norm": 29326.939453125, "learning_rate": 1.5546205144490313e-05, "loss": 0.7159, "step": 21700 }, { "epoch": 6.906954588758336, "grad_norm": 30401.865234375, "learning_rate": 1.546681486186091e-05, "loss": 0.7154, "step": 21750 }, { "epoch": 6.922832645284217, "grad_norm": 30828.583984375, "learning_rate": 1.53874245792315e-05, "loss": 0.7167, "step": 21800 }, { "epoch": 6.938710701810098, "grad_norm": 30084.78125, "learning_rate": 1.5308034296602098e-05, "loss": 0.7147, "step": 21850 }, { "epoch": 6.95458875833598, "grad_norm": 29893.818359375, "learning_rate": 1.522864401397269e-05, "loss": 0.7174, "step": 21900 }, { "epoch": 6.970466814861861, "grad_norm": 29464.3125, "learning_rate": 1.5149253731343285e-05, "loss": 0.7158, "step": 21950 }, { "epoch": 6.986344871387742, "grad_norm": 28943.29296875, "learning_rate": 1.5069863448713878e-05, "loss": 0.7111, "step": 22000 }, { "epoch": 7.002222927913623, "grad_norm": 30447.416015625, "learning_rate": 1.4990473166084473e-05, "loss": 0.7167, "step": 22050 }, { "epoch": 7.018100984439505, "grad_norm": 30325.32421875, "learning_rate": 1.4911082883455065e-05, "loss": 0.7102, "step": 22100 }, { "epoch": 7.033979040965386, "grad_norm": 30525.169921875, "learning_rate": 1.4831692600825659e-05, "loss": 0.7102, "step": 22150 }, { "epoch": 7.049857097491267, "grad_norm": 30165.91015625, "learning_rate": 1.4752302318196254e-05, "loss": 0.7124, "step": 22200 }, { "epoch": 7.0657351540171485, "grad_norm": 29645.66796875, "learning_rate": 1.4672912035566847e-05, "loss": 0.709, "step": 22250 }, { "epoch": 7.081613210543029, "grad_norm": 29299.05859375, "learning_rate": 1.4593521752937442e-05, "loss": 0.7105, "step": 22300 }, { "epoch": 7.097491267068911, "grad_norm": 29655.083984375, "learning_rate": 1.4514131470308034e-05, "loss": 0.7106, "step": 22350 }, { "epoch": 7.113369323594792, "grad_norm": 30337.484375, "learning_rate": 1.443474118767863e-05, "loss": 0.7153, "step": 22400 }, { "epoch": 7.129247380120673, "grad_norm": 29374.36328125, "learning_rate": 1.4355350905049222e-05, "loss": 0.7101, "step": 22450 }, { "epoch": 7.145125436646555, "grad_norm": 29468.953125, "learning_rate": 1.4275960622419817e-05, "loss": 0.7074, "step": 22500 }, { "epoch": 7.161003493172435, "grad_norm": 30660.234375, "learning_rate": 1.4196570339790411e-05, "loss": 0.7107, "step": 22550 }, { "epoch": 7.176881549698317, "grad_norm": 29092.36328125, "learning_rate": 1.4117180057161003e-05, "loss": 0.7102, "step": 22600 }, { "epoch": 7.1927596062241985, "grad_norm": 29264.359375, "learning_rate": 1.4037789774531599e-05, "loss": 0.7075, "step": 22650 }, { "epoch": 7.208637662750079, "grad_norm": 30579.66015625, "learning_rate": 1.3958399491902191e-05, "loss": 0.7097, "step": 22700 }, { "epoch": 7.224515719275961, "grad_norm": 29726.57421875, "learning_rate": 1.3879009209272786e-05, "loss": 0.7095, "step": 22750 }, { "epoch": 7.2403937758018415, "grad_norm": 30103.4921875, "learning_rate": 1.3799618926643379e-05, "loss": 0.7108, "step": 22800 }, { "epoch": 7.256271832327723, "grad_norm": 29993.138671875, "learning_rate": 1.3720228644013974e-05, "loss": 0.709, "step": 22850 }, { "epoch": 7.272149888853605, "grad_norm": 30139.607421875, "learning_rate": 1.3640838361384566e-05, "loss": 0.7087, "step": 22900 }, { "epoch": 7.288027945379485, "grad_norm": 29735.54296875, "learning_rate": 1.3561448078755162e-05, "loss": 0.7094, "step": 22950 }, { "epoch": 7.303906001905367, "grad_norm": 29745.779296875, "learning_rate": 1.3482057796125755e-05, "loss": 0.7102, "step": 23000 }, { "epoch": 7.319784058431248, "grad_norm": 30882.021484375, "learning_rate": 1.3402667513496348e-05, "loss": 0.7108, "step": 23050 }, { "epoch": 7.335662114957129, "grad_norm": 29850.26171875, "learning_rate": 1.3323277230866943e-05, "loss": 0.7105, "step": 23100 }, { "epoch": 7.351540171483011, "grad_norm": 30374.646484375, "learning_rate": 1.3243886948237535e-05, "loss": 0.7119, "step": 23150 }, { "epoch": 7.367418228008892, "grad_norm": 29295.6640625, "learning_rate": 1.316449666560813e-05, "loss": 0.7082, "step": 23200 }, { "epoch": 7.383296284534773, "grad_norm": 30273.5703125, "learning_rate": 1.3085106382978723e-05, "loss": 0.7087, "step": 23250 }, { "epoch": 7.399174341060654, "grad_norm": 30317.859375, "learning_rate": 1.3005716100349318e-05, "loss": 0.7079, "step": 23300 }, { "epoch": 7.4150523975865354, "grad_norm": 29483.0, "learning_rate": 1.2926325817719912e-05, "loss": 0.7065, "step": 23350 }, { "epoch": 7.430930454112417, "grad_norm": 29725.1875, "learning_rate": 1.2846935535090508e-05, "loss": 0.7103, "step": 23400 }, { "epoch": 7.446808510638298, "grad_norm": 29491.5234375, "learning_rate": 1.27675452524611e-05, "loss": 0.7102, "step": 23450 }, { "epoch": 7.462686567164179, "grad_norm": 30156.08984375, "learning_rate": 1.2688154969831692e-05, "loss": 0.7093, "step": 23500 }, { "epoch": 7.47856462369006, "grad_norm": 29961.205078125, "learning_rate": 1.2608764687202287e-05, "loss": 0.7082, "step": 23550 }, { "epoch": 7.494442680215942, "grad_norm": 29865.1640625, "learning_rate": 1.252937440457288e-05, "loss": 0.7043, "step": 23600 }, { "epoch": 7.510320736741823, "grad_norm": 30199.93359375, "learning_rate": 1.2449984121943473e-05, "loss": 0.7076, "step": 23650 }, { "epoch": 7.526198793267704, "grad_norm": 30157.5, "learning_rate": 1.2370593839314069e-05, "loss": 0.7079, "step": 23700 }, { "epoch": 7.5420768497935855, "grad_norm": 29760.25390625, "learning_rate": 1.2291203556684663e-05, "loss": 0.7048, "step": 23750 }, { "epoch": 7.557954906319466, "grad_norm": 30412.80078125, "learning_rate": 1.2211813274055256e-05, "loss": 0.7067, "step": 23800 }, { "epoch": 7.573832962845348, "grad_norm": 29912.32421875, "learning_rate": 1.213242299142585e-05, "loss": 0.7092, "step": 23850 }, { "epoch": 7.589711019371229, "grad_norm": 30394.833984375, "learning_rate": 1.2053032708796444e-05, "loss": 0.7085, "step": 23900 }, { "epoch": 7.60558907589711, "grad_norm": 30330.65625, "learning_rate": 1.1973642426167038e-05, "loss": 0.7055, "step": 23950 }, { "epoch": 7.621467132422992, "grad_norm": 30122.162109375, "learning_rate": 1.1894252143537632e-05, "loss": 0.7078, "step": 24000 }, { "epoch": 7.637345188948872, "grad_norm": 29642.91796875, "learning_rate": 1.1814861860908225e-05, "loss": 0.7062, "step": 24050 }, { "epoch": 7.653223245474754, "grad_norm": 30002.765625, "learning_rate": 1.173547157827882e-05, "loss": 0.7054, "step": 24100 }, { "epoch": 7.669101302000636, "grad_norm": 29660.470703125, "learning_rate": 1.1656081295649413e-05, "loss": 0.7048, "step": 24150 }, { "epoch": 7.684979358526516, "grad_norm": 30191.8828125, "learning_rate": 1.1576691013020007e-05, "loss": 0.7085, "step": 24200 }, { "epoch": 7.700857415052398, "grad_norm": 29762.623046875, "learning_rate": 1.14973007303906e-05, "loss": 0.7102, "step": 24250 }, { "epoch": 7.7167354715782785, "grad_norm": 29867.5625, "learning_rate": 1.1417910447761194e-05, "loss": 0.704, "step": 24300 }, { "epoch": 7.73261352810416, "grad_norm": 30145.470703125, "learning_rate": 1.1338520165131788e-05, "loss": 0.7043, "step": 24350 }, { "epoch": 7.748491584630042, "grad_norm": 29339.95703125, "learning_rate": 1.1259129882502382e-05, "loss": 0.7092, "step": 24400 }, { "epoch": 7.764369641155922, "grad_norm": 30186.40234375, "learning_rate": 1.1179739599872976e-05, "loss": 0.7063, "step": 24450 }, { "epoch": 7.780247697681804, "grad_norm": 30043.974609375, "learning_rate": 1.110034931724357e-05, "loss": 0.7053, "step": 24500 }, { "epoch": 7.796125754207685, "grad_norm": 30453.666015625, "learning_rate": 1.1020959034614164e-05, "loss": 0.7066, "step": 24550 }, { "epoch": 7.812003810733566, "grad_norm": 30049.93359375, "learning_rate": 1.0941568751984757e-05, "loss": 0.71, "step": 24600 }, { "epoch": 7.827881867259448, "grad_norm": 30561.3203125, "learning_rate": 1.0862178469355351e-05, "loss": 0.7065, "step": 24650 }, { "epoch": 7.843759923785329, "grad_norm": 30853.423828125, "learning_rate": 1.0782788186725945e-05, "loss": 0.706, "step": 24700 }, { "epoch": 7.85963798031121, "grad_norm": 29456.958984375, "learning_rate": 1.0703397904096539e-05, "loss": 0.7093, "step": 24750 }, { "epoch": 7.875516036837091, "grad_norm": 31488.560546875, "learning_rate": 1.0624007621467133e-05, "loss": 0.7063, "step": 24800 }, { "epoch": 7.8913940933629725, "grad_norm": 30248.27734375, "learning_rate": 1.0544617338837726e-05, "loss": 0.7062, "step": 24850 }, { "epoch": 7.907272149888854, "grad_norm": 29547.48828125, "learning_rate": 1.046522705620832e-05, "loss": 0.7069, "step": 24900 }, { "epoch": 7.923150206414735, "grad_norm": 29567.212890625, "learning_rate": 1.0385836773578916e-05, "loss": 0.7059, "step": 24950 }, { "epoch": 7.939028262940616, "grad_norm": 30756.359375, "learning_rate": 1.0306446490949508e-05, "loss": 0.7056, "step": 25000 }, { "epoch": 7.954906319466497, "grad_norm": 29843.1953125, "learning_rate": 1.0227056208320102e-05, "loss": 0.7058, "step": 25050 }, { "epoch": 7.970784375992379, "grad_norm": 30317.7734375, "learning_rate": 1.0147665925690695e-05, "loss": 0.7062, "step": 25100 }, { "epoch": 7.98666243251826, "grad_norm": 29659.53125, "learning_rate": 1.006827564306129e-05, "loss": 0.703, "step": 25150 }, { "epoch": 8.002540489044142, "grad_norm": 30429.38671875, "learning_rate": 9.988885360431883e-06, "loss": 0.7054, "step": 25200 }, { "epoch": 8.018418545570022, "grad_norm": 31551.69140625, "learning_rate": 9.909495077802477e-06, "loss": 0.7015, "step": 25250 }, { "epoch": 8.034296602095903, "grad_norm": 30067.037109375, "learning_rate": 9.83010479517307e-06, "loss": 0.7018, "step": 25300 }, { "epoch": 8.050174658621785, "grad_norm": 30266.275390625, "learning_rate": 9.750714512543666e-06, "loss": 0.704, "step": 25350 }, { "epoch": 8.066052715147666, "grad_norm": 30958.685546875, "learning_rate": 9.67132422991426e-06, "loss": 0.7018, "step": 25400 }, { "epoch": 8.081930771673548, "grad_norm": 30467.9921875, "learning_rate": 9.591933947284852e-06, "loss": 0.7016, "step": 25450 }, { "epoch": 8.097808828199428, "grad_norm": 30163.07421875, "learning_rate": 9.512543664655446e-06, "loss": 0.7042, "step": 25500 }, { "epoch": 8.11368688472531, "grad_norm": 31141.931640625, "learning_rate": 9.43315338202604e-06, "loss": 0.6998, "step": 25550 }, { "epoch": 8.129564941251191, "grad_norm": 30357.98828125, "learning_rate": 9.353763099396634e-06, "loss": 0.7008, "step": 25600 }, { "epoch": 8.145442997777073, "grad_norm": 30259.6171875, "learning_rate": 9.274372816767227e-06, "loss": 0.7012, "step": 25650 }, { "epoch": 8.161321054302954, "grad_norm": 29641.35546875, "learning_rate": 9.194982534137823e-06, "loss": 0.6998, "step": 25700 }, { "epoch": 8.177199110828834, "grad_norm": 30559.46875, "learning_rate": 9.115592251508417e-06, "loss": 0.706, "step": 25750 }, { "epoch": 8.193077167354716, "grad_norm": 30127.62890625, "learning_rate": 9.03620196887901e-06, "loss": 0.7063, "step": 25800 }, { "epoch": 8.208955223880597, "grad_norm": 30027.09375, "learning_rate": 8.956811686249604e-06, "loss": 0.7025, "step": 25850 }, { "epoch": 8.224833280406479, "grad_norm": 30069.939453125, "learning_rate": 8.877421403620196e-06, "loss": 0.7002, "step": 25900 }, { "epoch": 8.24071133693236, "grad_norm": 30299.6171875, "learning_rate": 8.79803112099079e-06, "loss": 0.6996, "step": 25950 }, { "epoch": 8.25658939345824, "grad_norm": 30868.3671875, "learning_rate": 8.718640838361384e-06, "loss": 0.7012, "step": 26000 }, { "epoch": 8.272467449984122, "grad_norm": 30317.115234375, "learning_rate": 8.639250555731978e-06, "loss": 0.7003, "step": 26050 }, { "epoch": 8.288345506510003, "grad_norm": 30630.19921875, "learning_rate": 8.559860273102573e-06, "loss": 0.7023, "step": 26100 }, { "epoch": 8.304223563035885, "grad_norm": 31124.72265625, "learning_rate": 8.480469990473167e-06, "loss": 0.6989, "step": 26150 }, { "epoch": 8.320101619561765, "grad_norm": 30239.1875, "learning_rate": 8.401079707843761e-06, "loss": 0.7013, "step": 26200 }, { "epoch": 8.335979676087646, "grad_norm": 30741.263671875, "learning_rate": 8.321689425214355e-06, "loss": 0.7029, "step": 26250 }, { "epoch": 8.351857732613528, "grad_norm": 29795.580078125, "learning_rate": 8.242299142584949e-06, "loss": 0.6995, "step": 26300 }, { "epoch": 8.36773578913941, "grad_norm": 30709.451171875, "learning_rate": 8.16290885995554e-06, "loss": 0.7009, "step": 26350 }, { "epoch": 8.383613845665291, "grad_norm": 30076.46875, "learning_rate": 8.083518577326135e-06, "loss": 0.6988, "step": 26400 }, { "epoch": 8.399491902191173, "grad_norm": 29763.3828125, "learning_rate": 8.004128294696728e-06, "loss": 0.7028, "step": 26450 }, { "epoch": 8.415369958717053, "grad_norm": 30590.60546875, "learning_rate": 7.924738012067324e-06, "loss": 0.7067, "step": 26500 }, { "epoch": 8.431248015242934, "grad_norm": 30087.3671875, "learning_rate": 7.845347729437918e-06, "loss": 0.702, "step": 26550 }, { "epoch": 8.447126071768816, "grad_norm": 30778.0859375, "learning_rate": 7.765957446808511e-06, "loss": 0.7028, "step": 26600 }, { "epoch": 8.463004128294697, "grad_norm": 30542.35546875, "learning_rate": 7.686567164179105e-06, "loss": 0.6996, "step": 26650 }, { "epoch": 8.478882184820577, "grad_norm": 30533.51171875, "learning_rate": 7.607176881549699e-06, "loss": 0.7023, "step": 26700 }, { "epoch": 8.494760241346459, "grad_norm": 30710.169921875, "learning_rate": 7.527786598920293e-06, "loss": 0.6998, "step": 26750 }, { "epoch": 8.51063829787234, "grad_norm": 30740.416015625, "learning_rate": 7.448396316290886e-06, "loss": 0.7036, "step": 26800 }, { "epoch": 8.526516354398222, "grad_norm": 29898.5390625, "learning_rate": 7.36900603366148e-06, "loss": 0.7007, "step": 26850 }, { "epoch": 8.542394410924103, "grad_norm": 30001.548828125, "learning_rate": 7.2896157510320734e-06, "loss": 0.6958, "step": 26900 }, { "epoch": 8.558272467449985, "grad_norm": 30819.171875, "learning_rate": 7.210225468402667e-06, "loss": 0.6976, "step": 26950 }, { "epoch": 8.574150523975865, "grad_norm": 30720.3125, "learning_rate": 7.130835185773262e-06, "loss": 0.6985, "step": 27000 }, { "epoch": 8.590028580501746, "grad_norm": 31220.265625, "learning_rate": 7.051444903143856e-06, "loss": 0.7008, "step": 27050 }, { "epoch": 8.605906637027628, "grad_norm": 31095.111328125, "learning_rate": 6.9720546205144495e-06, "loss": 0.7011, "step": 27100 }, { "epoch": 8.62178469355351, "grad_norm": 30474.576171875, "learning_rate": 6.892664337885043e-06, "loss": 0.6994, "step": 27150 }, { "epoch": 8.63766275007939, "grad_norm": 30038.474609375, "learning_rate": 6.813274055255638e-06, "loss": 0.7002, "step": 27200 }, { "epoch": 8.653540806605271, "grad_norm": 30336.1953125, "learning_rate": 6.73388377262623e-06, "loss": 0.6994, "step": 27250 }, { "epoch": 8.669418863131153, "grad_norm": 30122.53125, "learning_rate": 6.654493489996824e-06, "loss": 0.7004, "step": 27300 }, { "epoch": 8.685296919657034, "grad_norm": 30615.431640625, "learning_rate": 6.575103207367419e-06, "loss": 0.7021, "step": 27350 }, { "epoch": 8.701174976182916, "grad_norm": 30339.583984375, "learning_rate": 6.495712924738012e-06, "loss": 0.7007, "step": 27400 }, { "epoch": 8.717053032708796, "grad_norm": 30189.884765625, "learning_rate": 6.416322642108606e-06, "loss": 0.6986, "step": 27450 }, { "epoch": 8.732931089234677, "grad_norm": 30549.412109375, "learning_rate": 6.3369323594792e-06, "loss": 0.6996, "step": 27500 }, { "epoch": 8.748809145760559, "grad_norm": 30457.353515625, "learning_rate": 6.257542076849794e-06, "loss": 0.6991, "step": 27550 }, { "epoch": 8.76468720228644, "grad_norm": 31163.732421875, "learning_rate": 6.178151794220388e-06, "loss": 0.702, "step": 27600 }, { "epoch": 8.780565258812322, "grad_norm": 29878.865234375, "learning_rate": 6.0987615115909814e-06, "loss": 0.6999, "step": 27650 }, { "epoch": 8.796443315338202, "grad_norm": 30267.07421875, "learning_rate": 6.019371228961575e-06, "loss": 0.7014, "step": 27700 }, { "epoch": 8.812321371864083, "grad_norm": 30354.353515625, "learning_rate": 5.939980946332169e-06, "loss": 0.6982, "step": 27750 }, { "epoch": 8.828199428389965, "grad_norm": 30582.365234375, "learning_rate": 5.860590663702763e-06, "loss": 0.7002, "step": 27800 }, { "epoch": 8.844077484915847, "grad_norm": 30702.970703125, "learning_rate": 5.781200381073357e-06, "loss": 0.6975, "step": 27850 }, { "epoch": 8.859955541441728, "grad_norm": 31331.51953125, "learning_rate": 5.7018100984439505e-06, "loss": 0.6966, "step": 27900 }, { "epoch": 8.875833597967608, "grad_norm": 31188.05859375, "learning_rate": 5.622419815814544e-06, "loss": 0.6976, "step": 27950 }, { "epoch": 8.89171165449349, "grad_norm": 30835.2578125, "learning_rate": 5.543029533185139e-06, "loss": 0.6985, "step": 28000 }, { "epoch": 8.907589711019371, "grad_norm": 30251.89453125, "learning_rate": 5.463639250555732e-06, "loss": 0.7002, "step": 28050 }, { "epoch": 8.923467767545253, "grad_norm": 30508.73828125, "learning_rate": 5.384248967926326e-06, "loss": 0.6986, "step": 28100 }, { "epoch": 8.939345824071134, "grad_norm": 30640.4375, "learning_rate": 5.3048586852969195e-06, "loss": 0.7019, "step": 28150 }, { "epoch": 8.955223880597014, "grad_norm": 31159.490234375, "learning_rate": 5.225468402667514e-06, "loss": 0.6987, "step": 28200 }, { "epoch": 8.971101937122896, "grad_norm": 30732.70703125, "learning_rate": 5.146078120038107e-06, "loss": 0.6996, "step": 28250 }, { "epoch": 8.986979993648777, "grad_norm": 30562.67578125, "learning_rate": 5.066687837408701e-06, "loss": 0.7028, "step": 28300 }, { "epoch": 9.002858050174659, "grad_norm": 31258.55078125, "learning_rate": 4.987297554779296e-06, "loss": 0.6952, "step": 28350 }, { "epoch": 9.01873610670054, "grad_norm": 30664.671875, "learning_rate": 4.9079072721498894e-06, "loss": 0.6945, "step": 28400 }, { "epoch": 9.03461416322642, "grad_norm": 31010.13671875, "learning_rate": 4.828516989520483e-06, "loss": 0.6963, "step": 28450 }, { "epoch": 9.050492219752302, "grad_norm": 31770.1484375, "learning_rate": 4.749126706891076e-06, "loss": 0.6951, "step": 28500 }, { "epoch": 9.066370276278183, "grad_norm": 30270.96484375, "learning_rate": 4.669736424261671e-06, "loss": 0.6962, "step": 28550 }, { "epoch": 9.082248332804065, "grad_norm": 31095.724609375, "learning_rate": 4.590346141632265e-06, "loss": 0.6951, "step": 28600 }, { "epoch": 9.098126389329947, "grad_norm": 30564.689453125, "learning_rate": 4.5109558590028585e-06, "loss": 0.6958, "step": 28650 }, { "epoch": 9.114004445855826, "grad_norm": 30729.578125, "learning_rate": 4.4315655763734514e-06, "loss": 0.6965, "step": 28700 }, { "epoch": 9.129882502381708, "grad_norm": 30804.77734375, "learning_rate": 4.352175293744046e-06, "loss": 0.6987, "step": 28750 }, { "epoch": 9.14576055890759, "grad_norm": 30098.283203125, "learning_rate": 4.27278501111464e-06, "loss": 0.693, "step": 28800 }, { "epoch": 9.161638615433471, "grad_norm": 31217.767578125, "learning_rate": 4.193394728485234e-06, "loss": 0.6952, "step": 28850 }, { "epoch": 9.177516671959353, "grad_norm": 30664.6953125, "learning_rate": 4.1140044458558275e-06, "loss": 0.6961, "step": 28900 }, { "epoch": 9.193394728485233, "grad_norm": 30483.142578125, "learning_rate": 4.034614163226421e-06, "loss": 0.6929, "step": 28950 }, { "epoch": 9.209272785011114, "grad_norm": 30538.34765625, "learning_rate": 3.955223880597015e-06, "loss": 0.6964, "step": 29000 }, { "epoch": 9.225150841536996, "grad_norm": 31039.91796875, "learning_rate": 3.875833597967609e-06, "loss": 0.6946, "step": 29050 }, { "epoch": 9.241028898062877, "grad_norm": 30256.009765625, "learning_rate": 3.796443315338203e-06, "loss": 0.6946, "step": 29100 }, { "epoch": 9.256906954588759, "grad_norm": 31257.78125, "learning_rate": 3.717053032708796e-06, "loss": 0.6942, "step": 29150 }, { "epoch": 9.272785011114639, "grad_norm": 30951.634765625, "learning_rate": 3.6376627500793904e-06, "loss": 0.6944, "step": 29200 }, { "epoch": 9.28866306764052, "grad_norm": 30461.291015625, "learning_rate": 3.558272467449984e-06, "loss": 0.6942, "step": 29250 }, { "epoch": 9.304541124166402, "grad_norm": 30459.7109375, "learning_rate": 3.4788821848205784e-06, "loss": 0.699, "step": 29300 }, { "epoch": 9.320419180692284, "grad_norm": 30297.685546875, "learning_rate": 3.3994919021911722e-06, "loss": 0.6909, "step": 29350 }, { "epoch": 9.336297237218165, "grad_norm": 30345.24609375, "learning_rate": 3.3201016195617656e-06, "loss": 0.6962, "step": 29400 }, { "epoch": 9.352175293744045, "grad_norm": 30085.6484375, "learning_rate": 3.2407113369323594e-06, "loss": 0.6957, "step": 29450 }, { "epoch": 9.368053350269927, "grad_norm": 31095.689453125, "learning_rate": 3.1613210543029537e-06, "loss": 0.698, "step": 29500 }, { "epoch": 9.383931406795808, "grad_norm": 30798.33203125, "learning_rate": 3.081930771673547e-06, "loss": 0.6938, "step": 29550 }, { "epoch": 9.39980946332169, "grad_norm": 30953.390625, "learning_rate": 3.0025404890441413e-06, "loss": 0.6934, "step": 29600 }, { "epoch": 9.415687519847571, "grad_norm": 30614.728515625, "learning_rate": 2.9231502064147347e-06, "loss": 0.6936, "step": 29650 }, { "epoch": 9.431565576373451, "grad_norm": 30555.818359375, "learning_rate": 2.843759923785329e-06, "loss": 0.6925, "step": 29700 }, { "epoch": 9.447443632899333, "grad_norm": 30385.14453125, "learning_rate": 2.7643696411559227e-06, "loss": 0.6989, "step": 29750 }, { "epoch": 9.463321689425214, "grad_norm": 30924.46875, "learning_rate": 2.6849793585265165e-06, "loss": 0.697, "step": 29800 }, { "epoch": 9.479199745951096, "grad_norm": 31371.728515625, "learning_rate": 2.6055890758971103e-06, "loss": 0.6955, "step": 29850 }, { "epoch": 9.495077802476978, "grad_norm": 30738.732421875, "learning_rate": 2.526198793267704e-06, "loss": 0.6945, "step": 29900 }, { "epoch": 9.510955859002857, "grad_norm": 30800.96484375, "learning_rate": 2.446808510638298e-06, "loss": 0.6951, "step": 29950 }, { "epoch": 9.526833915528739, "grad_norm": 30367.607421875, "learning_rate": 2.3674182280088918e-06, "loss": 0.6975, "step": 30000 }, { "epoch": 9.54271197205462, "grad_norm": 30162.599609375, "learning_rate": 2.2880279453794856e-06, "loss": 0.6949, "step": 30050 }, { "epoch": 9.558590028580502, "grad_norm": 30217.197265625, "learning_rate": 2.2086376627500794e-06, "loss": 0.6921, "step": 30100 }, { "epoch": 9.574468085106384, "grad_norm": 30000.412109375, "learning_rate": 2.129247380120673e-06, "loss": 0.6967, "step": 30150 }, { "epoch": 9.590346141632264, "grad_norm": 30676.177734375, "learning_rate": 2.0498570974912674e-06, "loss": 0.6957, "step": 30200 }, { "epoch": 9.606224198158145, "grad_norm": 30816.064453125, "learning_rate": 1.970466814861861e-06, "loss": 0.6944, "step": 30250 }, { "epoch": 9.622102254684027, "grad_norm": 30859.091796875, "learning_rate": 1.891076532232455e-06, "loss": 0.695, "step": 30300 }, { "epoch": 9.637980311209908, "grad_norm": 30824.66015625, "learning_rate": 1.8116862496030486e-06, "loss": 0.6955, "step": 30350 }, { "epoch": 9.65385836773579, "grad_norm": 31179.345703125, "learning_rate": 1.7322959669736427e-06, "loss": 0.692, "step": 30400 }, { "epoch": 9.66973642426167, "grad_norm": 29985.033203125, "learning_rate": 1.6529056843442363e-06, "loss": 0.6961, "step": 30450 }, { "epoch": 9.685614480787551, "grad_norm": 29948.798828125, "learning_rate": 1.5735154017148303e-06, "loss": 0.6948, "step": 30500 }, { "epoch": 9.701492537313433, "grad_norm": 30504.94140625, "learning_rate": 1.494125119085424e-06, "loss": 0.6955, "step": 30550 }, { "epoch": 9.717370593839314, "grad_norm": 30339.3515625, "learning_rate": 1.414734836456018e-06, "loss": 0.6962, "step": 30600 }, { "epoch": 9.733248650365196, "grad_norm": 30103.78125, "learning_rate": 1.3353445538266117e-06, "loss": 0.6953, "step": 30650 }, { "epoch": 9.749126706891076, "grad_norm": 30390.306640625, "learning_rate": 1.2559542711972055e-06, "loss": 0.6962, "step": 30700 }, { "epoch": 9.765004763416957, "grad_norm": 30574.888671875, "learning_rate": 1.1765639885677993e-06, "loss": 0.6963, "step": 30750 }, { "epoch": 9.780882819942839, "grad_norm": 30706.4765625, "learning_rate": 1.0971737059383931e-06, "loss": 0.6928, "step": 30800 }, { "epoch": 9.79676087646872, "grad_norm": 29788.431640625, "learning_rate": 1.0177834233089872e-06, "loss": 0.6918, "step": 30850 }, { "epoch": 9.812638932994602, "grad_norm": 30614.6875, "learning_rate": 9.383931406795809e-07, "loss": 0.6931, "step": 30900 }, { "epoch": 9.828516989520482, "grad_norm": 30468.765625, "learning_rate": 8.590028580501748e-07, "loss": 0.6976, "step": 30950 }, { "epoch": 9.844395046046364, "grad_norm": 29652.828125, "learning_rate": 7.796125754207686e-07, "loss": 0.6938, "step": 31000 }, { "epoch": 9.860273102572245, "grad_norm": 31199.251953125, "learning_rate": 7.002222927913624e-07, "loss": 0.6965, "step": 31050 }, { "epoch": 9.876151159098127, "grad_norm": 30488.6796875, "learning_rate": 6.208320101619562e-07, "loss": 0.6938, "step": 31100 }, { "epoch": 9.892029215624008, "grad_norm": 30915.556640625, "learning_rate": 5.4144172753255e-07, "loss": 0.6951, "step": 31150 }, { "epoch": 9.907907272149888, "grad_norm": 30898.82421875, "learning_rate": 4.620514449031439e-07, "loss": 0.6929, "step": 31200 }, { "epoch": 9.92378532867577, "grad_norm": 31000.388671875, "learning_rate": 3.826611622737377e-07, "loss": 0.6931, "step": 31250 }, { "epoch": 9.939663385201651, "grad_norm": 30619.94140625, "learning_rate": 3.0327087964433156e-07, "loss": 0.6963, "step": 31300 }, { "epoch": 9.955541441727533, "grad_norm": 30345.3515625, "learning_rate": 2.238805970149254e-07, "loss": 0.6932, "step": 31350 }, { "epoch": 9.971419498253415, "grad_norm": 31431.78515625, "learning_rate": 1.444903143855192e-07, "loss": 0.6936, "step": 31400 }, { "epoch": 9.987297554779294, "grad_norm": 30420.556640625, "learning_rate": 6.510003175611305e-08, "loss": 0.6958, "step": 31450 }, { "epoch": 10.0, "step": 31490, "total_flos": 1.05298598559744e+18, "train_loss": 0.8101965235240424, "train_runtime": 86746.076, "train_samples_per_second": 23.228, "train_steps_per_second": 0.363 }, { "epoch": 10.003175611305176, "grad_norm": 34598.17578125, "learning_rate": 4.9992854874563356e-05, "loss": 0.7254, "step": 31500 }, { "epoch": 10.019053667831058, "grad_norm": 32862.16015625, "learning_rate": 4.995315973324865e-05, "loss": 0.7148, "step": 31550 }, { "epoch": 10.03493172435694, "grad_norm": 34942.2890625, "learning_rate": 4.991346459193395e-05, "loss": 0.7129, "step": 31600 }, { "epoch": 10.05080978088282, "grad_norm": 32553.541015625, "learning_rate": 4.987376945061924e-05, "loss": 0.7192, "step": 31650 }, { "epoch": 10.0666878374087, "grad_norm": 32205.275390625, "learning_rate": 4.983407430930454e-05, "loss": 0.719, "step": 31700 }, { "epoch": 10.082565893934582, "grad_norm": 32618.0078125, "learning_rate": 4.9794379167989844e-05, "loss": 0.7201, "step": 31750 }, { "epoch": 10.098443950460464, "grad_norm": 31251.701171875, "learning_rate": 4.975468402667514e-05, "loss": 0.7209, "step": 31800 }, { "epoch": 10.114322006986345, "grad_norm": 30804.701171875, "learning_rate": 4.971498888536044e-05, "loss": 0.7199, "step": 31850 }, { "epoch": 10.130200063512227, "grad_norm": 31691.65234375, "learning_rate": 4.967529374404573e-05, "loss": 0.7242, "step": 31900 }, { "epoch": 10.146078120038107, "grad_norm": 31379.8046875, "learning_rate": 4.9635598602731024e-05, "loss": 0.7244, "step": 31950 }, { "epoch": 10.161956176563988, "grad_norm": 31711.45703125, "learning_rate": 4.9595903461416325e-05, "loss": 0.7231, "step": 32000 }, { "epoch": 10.17783423308987, "grad_norm": 30201.376953125, "learning_rate": 4.955620832010162e-05, "loss": 0.7212, "step": 32050 }, { "epoch": 10.193712289615751, "grad_norm": 30604.0234375, "learning_rate": 4.951651317878692e-05, "loss": 0.7217, "step": 32100 }, { "epoch": 10.209590346141633, "grad_norm": 30721.6171875, "learning_rate": 4.947681803747222e-05, "loss": 0.7186, "step": 32150 }, { "epoch": 10.225468402667513, "grad_norm": 30430.392578125, "learning_rate": 4.943712289615751e-05, "loss": 0.7247, "step": 32200 }, { "epoch": 10.241346459193394, "grad_norm": 29437.306640625, "learning_rate": 4.939742775484281e-05, "loss": 0.7216, "step": 32250 }, { "epoch": 10.257224515719276, "grad_norm": 31316.037109375, "learning_rate": 4.9357732613528106e-05, "loss": 0.7232, "step": 32300 }, { "epoch": 10.273102572245158, "grad_norm": 30285.005859375, "learning_rate": 4.93180374722134e-05, "loss": 0.7232, "step": 32350 }, { "epoch": 10.28898062877104, "grad_norm": 30771.517578125, "learning_rate": 4.92783423308987e-05, "loss": 0.7211, "step": 32400 }, { "epoch": 10.304858685296919, "grad_norm": 31089.01953125, "learning_rate": 4.9238647189584e-05, "loss": 0.719, "step": 32450 }, { "epoch": 10.3207367418228, "grad_norm": 29312.451171875, "learning_rate": 4.9198952048269294e-05, "loss": 0.7217, "step": 32500 }, { "epoch": 10.336614798348682, "grad_norm": 29717.74609375, "learning_rate": 4.9159256906954594e-05, "loss": 0.7204, "step": 32550 }, { "epoch": 10.352492854874564, "grad_norm": 29353.939453125, "learning_rate": 4.911956176563989e-05, "loss": 0.7209, "step": 32600 }, { "epoch": 10.368370911400445, "grad_norm": 30583.625, "learning_rate": 4.907986662432518e-05, "loss": 0.719, "step": 32650 }, { "epoch": 10.384248967926325, "grad_norm": 30315.1484375, "learning_rate": 4.904017148301048e-05, "loss": 0.7224, "step": 32700 }, { "epoch": 10.400127024452207, "grad_norm": 28785.408203125, "learning_rate": 4.9000476341695775e-05, "loss": 0.7227, "step": 32750 }, { "epoch": 10.416005080978088, "grad_norm": 29400.603515625, "learning_rate": 4.8960781200381075e-05, "loss": 0.7191, "step": 32800 }, { "epoch": 10.43188313750397, "grad_norm": 28836.03125, "learning_rate": 4.8921086059066375e-05, "loss": 0.7191, "step": 32850 }, { "epoch": 10.447761194029852, "grad_norm": 29112.181640625, "learning_rate": 4.888139091775167e-05, "loss": 0.7196, "step": 32900 }, { "epoch": 10.463639250555731, "grad_norm": 29085.232421875, "learning_rate": 4.884169577643697e-05, "loss": 0.7202, "step": 32950 }, { "epoch": 10.479517307081613, "grad_norm": 29157.4609375, "learning_rate": 4.880200063512226e-05, "loss": 0.7185, "step": 33000 }, { "epoch": 10.495395363607495, "grad_norm": 29475.744140625, "learning_rate": 4.8762305493807556e-05, "loss": 0.7221, "step": 33050 }, { "epoch": 10.511273420133376, "grad_norm": 29044.5703125, "learning_rate": 4.8722610352492857e-05, "loss": 0.7194, "step": 33100 }, { "epoch": 10.527151476659256, "grad_norm": 28844.75390625, "learning_rate": 4.868291521117816e-05, "loss": 0.7178, "step": 33150 }, { "epoch": 10.543029533185138, "grad_norm": 28828.00390625, "learning_rate": 4.864322006986345e-05, "loss": 0.7173, "step": 33200 }, { "epoch": 10.55890758971102, "grad_norm": 28988.736328125, "learning_rate": 4.860352492854875e-05, "loss": 0.7163, "step": 33250 }, { "epoch": 10.5747856462369, "grad_norm": 27665.376953125, "learning_rate": 4.8563829787234044e-05, "loss": 0.7177, "step": 33300 }, { "epoch": 10.590663702762782, "grad_norm": 28359.263671875, "learning_rate": 4.852413464591934e-05, "loss": 0.7215, "step": 33350 }, { "epoch": 10.606541759288664, "grad_norm": 28809.84375, "learning_rate": 4.848443950460464e-05, "loss": 0.7186, "step": 33400 }, { "epoch": 10.622419815814544, "grad_norm": 27395.1171875, "learning_rate": 4.844474436328993e-05, "loss": 0.7172, "step": 33450 }, { "epoch": 10.638297872340425, "grad_norm": 27581.38671875, "learning_rate": 4.840504922197523e-05, "loss": 0.721, "step": 33500 }, { "epoch": 10.654175928866307, "grad_norm": 27007.13671875, "learning_rate": 4.836535408066053e-05, "loss": 0.7198, "step": 33550 }, { "epoch": 10.670053985392189, "grad_norm": 28538.033203125, "learning_rate": 4.8325658939345826e-05, "loss": 0.717, "step": 33600 }, { "epoch": 10.685932041918068, "grad_norm": 28231.92578125, "learning_rate": 4.8285963798031126e-05, "loss": 0.7173, "step": 33650 }, { "epoch": 10.70181009844395, "grad_norm": 27481.81640625, "learning_rate": 4.824626865671642e-05, "loss": 0.7179, "step": 33700 }, { "epoch": 10.717688154969832, "grad_norm": 29380.140625, "learning_rate": 4.820657351540171e-05, "loss": 0.7185, "step": 33750 }, { "epoch": 10.733566211495713, "grad_norm": 27961.125, "learning_rate": 4.816687837408701e-05, "loss": 0.7158, "step": 33800 }, { "epoch": 10.749444268021595, "grad_norm": 28789.4453125, "learning_rate": 4.8127183232772314e-05, "loss": 0.7184, "step": 33850 }, { "epoch": 10.765322324547476, "grad_norm": 28526.916015625, "learning_rate": 4.808748809145761e-05, "loss": 0.719, "step": 33900 }, { "epoch": 10.781200381073356, "grad_norm": 28261.802734375, "learning_rate": 4.804779295014291e-05, "loss": 0.7155, "step": 33950 }, { "epoch": 10.797078437599238, "grad_norm": 28534.755859375, "learning_rate": 4.80080978088282e-05, "loss": 0.7169, "step": 34000 }, { "epoch": 10.81295649412512, "grad_norm": 28207.771484375, "learning_rate": 4.79684026675135e-05, "loss": 0.713, "step": 34050 }, { "epoch": 10.828834550651, "grad_norm": 27474.365234375, "learning_rate": 4.7928707526198795e-05, "loss": 0.7183, "step": 34100 }, { "epoch": 10.84471260717688, "grad_norm": 28205.083984375, "learning_rate": 4.788901238488409e-05, "loss": 0.7189, "step": 34150 }, { "epoch": 10.860590663702762, "grad_norm": 27502.6015625, "learning_rate": 4.784931724356939e-05, "loss": 0.7161, "step": 34200 }, { "epoch": 10.876468720228644, "grad_norm": 27558.001953125, "learning_rate": 4.780962210225469e-05, "loss": 0.7159, "step": 34250 }, { "epoch": 10.892346776754525, "grad_norm": 28145.595703125, "learning_rate": 4.776992696093998e-05, "loss": 0.7111, "step": 34300 }, { "epoch": 10.908224833280407, "grad_norm": 27374.37109375, "learning_rate": 4.773023181962528e-05, "loss": 0.718, "step": 34350 }, { "epoch": 10.924102889806289, "grad_norm": 27972.486328125, "learning_rate": 4.7690536678310576e-05, "loss": 0.7195, "step": 34400 }, { "epoch": 10.939980946332168, "grad_norm": 27874.939453125, "learning_rate": 4.765084153699587e-05, "loss": 0.717, "step": 34450 }, { "epoch": 10.95585900285805, "grad_norm": 27341.05859375, "learning_rate": 4.761114639568117e-05, "loss": 0.716, "step": 34500 }, { "epoch": 10.971737059383932, "grad_norm": 28336.388671875, "learning_rate": 4.757145125436647e-05, "loss": 0.7177, "step": 34550 }, { "epoch": 10.987615115909813, "grad_norm": 26653.798828125, "learning_rate": 4.753175611305177e-05, "loss": 0.7111, "step": 34600 }, { "epoch": 11.003493172435693, "grad_norm": 27900.7265625, "learning_rate": 4.7492060971737064e-05, "loss": 0.7124, "step": 34650 }, { "epoch": 11.019371228961575, "grad_norm": 27794.94921875, "learning_rate": 4.745236583042236e-05, "loss": 0.7103, "step": 34700 }, { "epoch": 11.035249285487456, "grad_norm": 27183.654296875, "learning_rate": 4.741267068910766e-05, "loss": 0.7052, "step": 34750 }, { "epoch": 11.051127342013338, "grad_norm": 27468.392578125, "learning_rate": 4.737297554779295e-05, "loss": 0.7084, "step": 34800 }, { "epoch": 11.06700539853922, "grad_norm": 26358.43359375, "learning_rate": 4.7333280406478245e-05, "loss": 0.7038, "step": 34850 }, { "epoch": 11.0828834550651, "grad_norm": 26982.095703125, "learning_rate": 4.7293585265163545e-05, "loss": 0.7023, "step": 34900 }, { "epoch": 11.09876151159098, "grad_norm": 27181.15625, "learning_rate": 4.7253890123848845e-05, "loss": 0.7062, "step": 34950 }, { "epoch": 11.114639568116862, "grad_norm": 27455.166015625, "learning_rate": 4.721419498253414e-05, "loss": 0.7088, "step": 35000 }, { "epoch": 11.130517624642744, "grad_norm": 27479.68359375, "learning_rate": 4.717449984121944e-05, "loss": 0.707, "step": 35050 }, { "epoch": 11.146395681168626, "grad_norm": 28365.30859375, "learning_rate": 4.713480469990473e-05, "loss": 0.7052, "step": 35100 }, { "epoch": 11.162273737694505, "grad_norm": 27870.494140625, "learning_rate": 4.7095109558590026e-05, "loss": 0.7089, "step": 35150 }, { "epoch": 11.178151794220387, "grad_norm": 27870.310546875, "learning_rate": 4.7055414417275327e-05, "loss": 0.7066, "step": 35200 }, { "epoch": 11.194029850746269, "grad_norm": 28179.03125, "learning_rate": 4.701571927596063e-05, "loss": 0.7069, "step": 35250 }, { "epoch": 11.20990790727215, "grad_norm": 27911.45703125, "learning_rate": 4.697602413464593e-05, "loss": 0.707, "step": 35300 }, { "epoch": 11.225785963798032, "grad_norm": 26871.20703125, "learning_rate": 4.693632899333122e-05, "loss": 0.7073, "step": 35350 }, { "epoch": 11.241664020323912, "grad_norm": 27323.912109375, "learning_rate": 4.6896633852016514e-05, "loss": 0.7095, "step": 35400 }, { "epoch": 11.257542076849793, "grad_norm": 27162.650390625, "learning_rate": 4.6856938710701815e-05, "loss": 0.7047, "step": 35450 }, { "epoch": 11.273420133375675, "grad_norm": 26953.41796875, "learning_rate": 4.681724356938711e-05, "loss": 0.7061, "step": 35500 }, { "epoch": 11.289298189901556, "grad_norm": 26672.6953125, "learning_rate": 4.67775484280724e-05, "loss": 0.7059, "step": 35550 }, { "epoch": 11.305176246427438, "grad_norm": 27072.08203125, "learning_rate": 4.67378532867577e-05, "loss": 0.7047, "step": 35600 }, { "epoch": 11.321054302953318, "grad_norm": 26242.37109375, "learning_rate": 4.6698158145443e-05, "loss": 0.7042, "step": 35650 }, { "epoch": 11.3369323594792, "grad_norm": 26593.7890625, "learning_rate": 4.6658463004128296e-05, "loss": 0.7022, "step": 35700 }, { "epoch": 11.352810416005081, "grad_norm": 26419.05078125, "learning_rate": 4.6618767862813596e-05, "loss": 0.7077, "step": 35750 }, { "epoch": 11.368688472530962, "grad_norm": 26334.150390625, "learning_rate": 4.657907272149889e-05, "loss": 0.7084, "step": 35800 }, { "epoch": 11.384566529056844, "grad_norm": 26394.89453125, "learning_rate": 4.653937758018419e-05, "loss": 0.7064, "step": 35850 }, { "epoch": 11.400444585582724, "grad_norm": 26715.767578125, "learning_rate": 4.649968243886948e-05, "loss": 0.7055, "step": 35900 }, { "epoch": 11.416322642108605, "grad_norm": 27197.98828125, "learning_rate": 4.645998729755478e-05, "loss": 0.708, "step": 35950 }, { "epoch": 11.432200698634487, "grad_norm": 27616.12109375, "learning_rate": 4.6420292156240084e-05, "loss": 0.708, "step": 36000 }, { "epoch": 11.448078755160369, "grad_norm": 26575.294921875, "learning_rate": 4.638059701492538e-05, "loss": 0.7039, "step": 36050 }, { "epoch": 11.46395681168625, "grad_norm": 26070.689453125, "learning_rate": 4.634090187361067e-05, "loss": 0.7075, "step": 36100 }, { "epoch": 11.47983486821213, "grad_norm": 26675.865234375, "learning_rate": 4.630120673229597e-05, "loss": 0.7058, "step": 36150 }, { "epoch": 11.495712924738012, "grad_norm": 26788.8671875, "learning_rate": 4.6261511590981265e-05, "loss": 0.7056, "step": 36200 }, { "epoch": 11.511590981263893, "grad_norm": 26736.013671875, "learning_rate": 4.622181644966656e-05, "loss": 0.7049, "step": 36250 }, { "epoch": 11.527469037789775, "grad_norm": 27259.62109375, "learning_rate": 4.618212130835186e-05, "loss": 0.7033, "step": 36300 }, { "epoch": 11.543347094315656, "grad_norm": 26500.08984375, "learning_rate": 4.614242616703716e-05, "loss": 0.7055, "step": 36350 }, { "epoch": 11.559225150841536, "grad_norm": 26374.705078125, "learning_rate": 4.610273102572246e-05, "loss": 0.7017, "step": 36400 }, { "epoch": 11.575103207367418, "grad_norm": 27614.83984375, "learning_rate": 4.606303588440775e-05, "loss": 0.7059, "step": 36450 }, { "epoch": 11.5909812638933, "grad_norm": 25694.1015625, "learning_rate": 4.6023340743093046e-05, "loss": 0.7048, "step": 36500 }, { "epoch": 11.606859320419181, "grad_norm": 26899.56640625, "learning_rate": 4.5983645601778346e-05, "loss": 0.7045, "step": 36550 }, { "epoch": 11.622737376945063, "grad_norm": 25786.490234375, "learning_rate": 4.594395046046364e-05, "loss": 0.7038, "step": 36600 }, { "epoch": 11.638615433470942, "grad_norm": 26447.548828125, "learning_rate": 4.5904255319148933e-05, "loss": 0.7031, "step": 36650 }, { "epoch": 11.654493489996824, "grad_norm": 25961.685546875, "learning_rate": 4.5864560177834234e-05, "loss": 0.7059, "step": 36700 }, { "epoch": 11.670371546522706, "grad_norm": 26539.162109375, "learning_rate": 4.5824865036519534e-05, "loss": 0.7033, "step": 36750 }, { "epoch": 11.686249603048587, "grad_norm": 26035.525390625, "learning_rate": 4.578516989520483e-05, "loss": 0.7057, "step": 36800 }, { "epoch": 11.702127659574469, "grad_norm": 26614.791015625, "learning_rate": 4.574547475389013e-05, "loss": 0.7048, "step": 36850 }, { "epoch": 11.718005716100349, "grad_norm": 26289.15234375, "learning_rate": 4.570577961257542e-05, "loss": 0.6975, "step": 36900 }, { "epoch": 11.73388377262623, "grad_norm": 26606.12109375, "learning_rate": 4.5666084471260715e-05, "loss": 0.705, "step": 36950 }, { "epoch": 11.749761829152112, "grad_norm": 25736.138671875, "learning_rate": 4.5626389329946015e-05, "loss": 0.7004, "step": 37000 }, { "epoch": 11.765639885677993, "grad_norm": 26059.619140625, "learning_rate": 4.5586694188631315e-05, "loss": 0.705, "step": 37050 }, { "epoch": 11.781517942203875, "grad_norm": 25117.2734375, "learning_rate": 4.5546999047316616e-05, "loss": 0.7088, "step": 37100 }, { "epoch": 11.797395998729755, "grad_norm": 27240.82421875, "learning_rate": 4.550730390600191e-05, "loss": 0.7003, "step": 37150 }, { "epoch": 11.813274055255636, "grad_norm": 26413.03125, "learning_rate": 4.54676087646872e-05, "loss": 0.7029, "step": 37200 }, { "epoch": 11.829152111781518, "grad_norm": 25459.921875, "learning_rate": 4.54279136233725e-05, "loss": 0.6975, "step": 37250 }, { "epoch": 11.8450301683074, "grad_norm": 26057.275390625, "learning_rate": 4.5388218482057797e-05, "loss": 0.6994, "step": 37300 }, { "epoch": 11.860908224833281, "grad_norm": 25720.384765625, "learning_rate": 4.534852334074309e-05, "loss": 0.7072, "step": 37350 }, { "epoch": 11.876786281359161, "grad_norm": 26672.6015625, "learning_rate": 4.530882819942839e-05, "loss": 0.7017, "step": 37400 }, { "epoch": 11.892664337885043, "grad_norm": 25892.302734375, "learning_rate": 4.526913305811369e-05, "loss": 0.7031, "step": 37450 }, { "epoch": 11.908542394410924, "grad_norm": 26435.544921875, "learning_rate": 4.5229437916798984e-05, "loss": 0.7011, "step": 37500 }, { "epoch": 11.924420450936806, "grad_norm": 25747.921875, "learning_rate": 4.5189742775484285e-05, "loss": 0.6979, "step": 37550 }, { "epoch": 11.940298507462687, "grad_norm": 25117.19140625, "learning_rate": 4.515004763416958e-05, "loss": 0.6996, "step": 37600 }, { "epoch": 11.956176563988567, "grad_norm": 26094.021484375, "learning_rate": 4.511035249285488e-05, "loss": 0.703, "step": 37650 }, { "epoch": 11.972054620514449, "grad_norm": 25621.94140625, "learning_rate": 4.507065735154017e-05, "loss": 0.7034, "step": 37700 }, { "epoch": 11.98793267704033, "grad_norm": 25541.908203125, "learning_rate": 4.503096221022547e-05, "loss": 0.7004, "step": 37750 }, { "epoch": 12.003810733566212, "grad_norm": 25723.466796875, "learning_rate": 4.499126706891077e-05, "loss": 0.6991, "step": 37800 }, { "epoch": 12.019688790092093, "grad_norm": 26813.052734375, "learning_rate": 4.4951571927596066e-05, "loss": 0.696, "step": 37850 }, { "epoch": 12.035566846617973, "grad_norm": 25913.0390625, "learning_rate": 4.491187678628136e-05, "loss": 0.6931, "step": 37900 }, { "epoch": 12.051444903143855, "grad_norm": 25332.591796875, "learning_rate": 4.487218164496666e-05, "loss": 0.6939, "step": 37950 }, { "epoch": 12.067322959669736, "grad_norm": 25602.212890625, "learning_rate": 4.483248650365195e-05, "loss": 0.6935, "step": 38000 }, { "epoch": 12.083201016195618, "grad_norm": 26077.806640625, "learning_rate": 4.479279136233725e-05, "loss": 0.6946, "step": 38050 }, { "epoch": 12.0990790727215, "grad_norm": 26038.025390625, "learning_rate": 4.475309622102255e-05, "loss": 0.6927, "step": 38100 }, { "epoch": 12.11495712924738, "grad_norm": 25922.17578125, "learning_rate": 4.471340107970785e-05, "loss": 0.691, "step": 38150 }, { "epoch": 12.130835185773261, "grad_norm": 25839.078125, "learning_rate": 4.467370593839315e-05, "loss": 0.6924, "step": 38200 }, { "epoch": 12.146713242299143, "grad_norm": 25702.123046875, "learning_rate": 4.463401079707844e-05, "loss": 0.6918, "step": 38250 }, { "epoch": 12.162591298825024, "grad_norm": 26183.7890625, "learning_rate": 4.4594315655763735e-05, "loss": 0.6922, "step": 38300 }, { "epoch": 12.178469355350906, "grad_norm": 26349.01953125, "learning_rate": 4.4554620514449035e-05, "loss": 0.6919, "step": 38350 }, { "epoch": 12.194347411876786, "grad_norm": 25578.62890625, "learning_rate": 4.451492537313433e-05, "loss": 0.6911, "step": 38400 }, { "epoch": 12.210225468402667, "grad_norm": 25342.06640625, "learning_rate": 4.447523023181963e-05, "loss": 0.6948, "step": 38450 }, { "epoch": 12.226103524928549, "grad_norm": 25582.328125, "learning_rate": 4.443553509050493e-05, "loss": 0.6965, "step": 38500 }, { "epoch": 12.24198158145443, "grad_norm": 26281.44921875, "learning_rate": 4.439583994919022e-05, "loss": 0.6947, "step": 38550 }, { "epoch": 12.257859637980312, "grad_norm": 25423.970703125, "learning_rate": 4.4356144807875516e-05, "loss": 0.6949, "step": 38600 }, { "epoch": 12.273737694506192, "grad_norm": 25645.67578125, "learning_rate": 4.4316449666560816e-05, "loss": 0.692, "step": 38650 }, { "epoch": 12.289615751032073, "grad_norm": 25957.416015625, "learning_rate": 4.427675452524611e-05, "loss": 0.6914, "step": 38700 }, { "epoch": 12.305493807557955, "grad_norm": 25020.30859375, "learning_rate": 4.4237059383931403e-05, "loss": 0.6956, "step": 38750 }, { "epoch": 12.321371864083837, "grad_norm": 25936.24609375, "learning_rate": 4.4197364242616704e-05, "loss": 0.6909, "step": 38800 }, { "epoch": 12.337249920609718, "grad_norm": 25477.5, "learning_rate": 4.4157669101302004e-05, "loss": 0.6915, "step": 38850 }, { "epoch": 12.353127977135598, "grad_norm": 24679.041015625, "learning_rate": 4.4117973959987304e-05, "loss": 0.6959, "step": 38900 }, { "epoch": 12.36900603366148, "grad_norm": 25305.9140625, "learning_rate": 4.40782788186726e-05, "loss": 0.6958, "step": 38950 }, { "epoch": 12.384884090187361, "grad_norm": 25186.720703125, "learning_rate": 4.403858367735789e-05, "loss": 0.6899, "step": 39000 }, { "epoch": 12.400762146713243, "grad_norm": 25848.09375, "learning_rate": 4.399888853604319e-05, "loss": 0.6954, "step": 39050 }, { "epoch": 12.416640203239124, "grad_norm": 25214.7265625, "learning_rate": 4.3959193394728485e-05, "loss": 0.6918, "step": 39100 }, { "epoch": 12.432518259765004, "grad_norm": 25850.787109375, "learning_rate": 4.3919498253413785e-05, "loss": 0.6889, "step": 39150 }, { "epoch": 12.448396316290886, "grad_norm": 25535.595703125, "learning_rate": 4.3879803112099086e-05, "loss": 0.6913, "step": 39200 }, { "epoch": 12.464274372816767, "grad_norm": 25293.98828125, "learning_rate": 4.384010797078438e-05, "loss": 0.6909, "step": 39250 }, { "epoch": 12.480152429342649, "grad_norm": 25519.966796875, "learning_rate": 4.380041282946967e-05, "loss": 0.693, "step": 39300 }, { "epoch": 12.49603048586853, "grad_norm": 25590.5390625, "learning_rate": 4.376071768815497e-05, "loss": 0.6901, "step": 39350 }, { "epoch": 12.51190854239441, "grad_norm": 25909.888671875, "learning_rate": 4.3721022546840267e-05, "loss": 0.6908, "step": 39400 }, { "epoch": 12.527786598920292, "grad_norm": 25248.21484375, "learning_rate": 4.368132740552557e-05, "loss": 0.6932, "step": 39450 }, { "epoch": 12.543664655446173, "grad_norm": 24925.970703125, "learning_rate": 4.364163226421086e-05, "loss": 0.6914, "step": 39500 }, { "epoch": 12.559542711972055, "grad_norm": 25199.6171875, "learning_rate": 4.360193712289616e-05, "loss": 0.6937, "step": 39550 }, { "epoch": 12.575420768497937, "grad_norm": 25989.927734375, "learning_rate": 4.356224198158146e-05, "loss": 0.6912, "step": 39600 }, { "epoch": 12.591298825023816, "grad_norm": 24462.072265625, "learning_rate": 4.3522546840266755e-05, "loss": 0.6919, "step": 39650 }, { "epoch": 12.607176881549698, "grad_norm": 25439.693359375, "learning_rate": 4.348285169895205e-05, "loss": 0.6903, "step": 39700 }, { "epoch": 12.62305493807558, "grad_norm": 25596.04296875, "learning_rate": 4.344315655763735e-05, "loss": 0.6912, "step": 39750 }, { "epoch": 12.638932994601461, "grad_norm": 24989.56640625, "learning_rate": 4.340346141632264e-05, "loss": 0.6935, "step": 39800 }, { "epoch": 12.654811051127343, "grad_norm": 24844.279296875, "learning_rate": 4.336376627500794e-05, "loss": 0.6895, "step": 39850 }, { "epoch": 12.670689107653223, "grad_norm": 25176.060546875, "learning_rate": 4.332407113369324e-05, "loss": 0.6924, "step": 39900 }, { "epoch": 12.686567164179104, "grad_norm": 25582.01953125, "learning_rate": 4.3284375992378536e-05, "loss": 0.6927, "step": 39950 }, { "epoch": 12.702445220704986, "grad_norm": 25024.33203125, "learning_rate": 4.3244680851063836e-05, "loss": 0.6904, "step": 40000 }, { "epoch": 12.718323277230867, "grad_norm": 25088.216796875, "learning_rate": 4.320498570974913e-05, "loss": 0.6904, "step": 40050 }, { "epoch": 12.734201333756747, "grad_norm": 24689.61328125, "learning_rate": 4.316529056843442e-05, "loss": 0.687, "step": 40100 }, { "epoch": 12.750079390282629, "grad_norm": 25595.205078125, "learning_rate": 4.3125595427119724e-05, "loss": 0.6911, "step": 40150 }, { "epoch": 12.76595744680851, "grad_norm": 25500.544921875, "learning_rate": 4.308590028580502e-05, "loss": 0.6894, "step": 40200 }, { "epoch": 12.781835503334392, "grad_norm": 24525.78125, "learning_rate": 4.304620514449032e-05, "loss": 0.6904, "step": 40250 }, { "epoch": 12.797713559860274, "grad_norm": 24815.94921875, "learning_rate": 4.300651000317562e-05, "loss": 0.6879, "step": 40300 }, { "epoch": 12.813591616386155, "grad_norm": 24612.96875, "learning_rate": 4.296681486186091e-05, "loss": 0.693, "step": 40350 }, { "epoch": 12.829469672912035, "grad_norm": 25878.453125, "learning_rate": 4.2927119720546205e-05, "loss": 0.6877, "step": 40400 }, { "epoch": 12.845347729437917, "grad_norm": 25819.634765625, "learning_rate": 4.2887424579231505e-05, "loss": 0.6889, "step": 40450 }, { "epoch": 12.861225785963798, "grad_norm": 24711.671875, "learning_rate": 4.28477294379168e-05, "loss": 0.6897, "step": 40500 }, { "epoch": 12.87710384248968, "grad_norm": 25272.193359375, "learning_rate": 4.280803429660209e-05, "loss": 0.6916, "step": 40550 }, { "epoch": 12.89298189901556, "grad_norm": 24950.6953125, "learning_rate": 4.27683391552874e-05, "loss": 0.6874, "step": 40600 }, { "epoch": 12.908859955541441, "grad_norm": 25394.751953125, "learning_rate": 4.272864401397269e-05, "loss": 0.6909, "step": 40650 }, { "epoch": 12.924738012067323, "grad_norm": 25527.599609375, "learning_rate": 4.268894887265799e-05, "loss": 0.6888, "step": 40700 }, { "epoch": 12.940616068593204, "grad_norm": 25220.8125, "learning_rate": 4.2649253731343286e-05, "loss": 0.6902, "step": 40750 }, { "epoch": 12.956494125119086, "grad_norm": 24796.541015625, "learning_rate": 4.260955859002858e-05, "loss": 0.6894, "step": 40800 }, { "epoch": 12.972372181644968, "grad_norm": 25115.955078125, "learning_rate": 4.256986344871388e-05, "loss": 0.6908, "step": 40850 }, { "epoch": 12.988250238170847, "grad_norm": 24197.728515625, "learning_rate": 4.2530168307399174e-05, "loss": 0.6864, "step": 40900 }, { "epoch": 13.004128294696729, "grad_norm": 24724.552734375, "learning_rate": 4.2490473166084474e-05, "loss": 0.6864, "step": 40950 }, { "epoch": 13.02000635122261, "grad_norm": 25012.1640625, "learning_rate": 4.2450778024769774e-05, "loss": 0.6799, "step": 41000 }, { "epoch": 13.035884407748492, "grad_norm": 24815.64453125, "learning_rate": 4.241108288345507e-05, "loss": 0.6837, "step": 41050 }, { "epoch": 13.051762464274374, "grad_norm": 25473.265625, "learning_rate": 4.237138774214036e-05, "loss": 0.6821, "step": 41100 }, { "epoch": 13.067640520800254, "grad_norm": 24979.189453125, "learning_rate": 4.233169260082566e-05, "loss": 0.6825, "step": 41150 }, { "epoch": 13.083518577326135, "grad_norm": 24369.861328125, "learning_rate": 4.2291997459510955e-05, "loss": 0.6802, "step": 41200 }, { "epoch": 13.099396633852017, "grad_norm": 24936.068359375, "learning_rate": 4.2252302318196255e-05, "loss": 0.6817, "step": 41250 }, { "epoch": 13.115274690377898, "grad_norm": 25067.654296875, "learning_rate": 4.221260717688155e-05, "loss": 0.68, "step": 41300 }, { "epoch": 13.13115274690378, "grad_norm": 24868.97265625, "learning_rate": 4.217291203556685e-05, "loss": 0.6811, "step": 41350 }, { "epoch": 13.14703080342966, "grad_norm": 24265.5546875, "learning_rate": 4.213321689425215e-05, "loss": 0.6813, "step": 41400 }, { "epoch": 13.162908859955541, "grad_norm": 24921.30078125, "learning_rate": 4.209352175293744e-05, "loss": 0.6796, "step": 41450 }, { "epoch": 13.178786916481423, "grad_norm": 24588.80078125, "learning_rate": 4.2053826611622737e-05, "loss": 0.6828, "step": 41500 }, { "epoch": 13.194664973007304, "grad_norm": 24551.5, "learning_rate": 4.201413147030804e-05, "loss": 0.6805, "step": 41550 }, { "epoch": 13.210543029533184, "grad_norm": 25206.583984375, "learning_rate": 4.197443632899333e-05, "loss": 0.6852, "step": 41600 }, { "epoch": 13.226421086059066, "grad_norm": 24448.646484375, "learning_rate": 4.193474118767863e-05, "loss": 0.6832, "step": 41650 }, { "epoch": 13.242299142584947, "grad_norm": 24609.46484375, "learning_rate": 4.189504604636393e-05, "loss": 0.6842, "step": 41700 }, { "epoch": 13.258177199110829, "grad_norm": 24898.201171875, "learning_rate": 4.1855350905049225e-05, "loss": 0.6812, "step": 41750 }, { "epoch": 13.27405525563671, "grad_norm": 24523.78515625, "learning_rate": 4.1815655763734525e-05, "loss": 0.6842, "step": 41800 }, { "epoch": 13.28993331216259, "grad_norm": 24928.837890625, "learning_rate": 4.177596062241982e-05, "loss": 0.6813, "step": 41850 }, { "epoch": 13.305811368688472, "grad_norm": 24684.125, "learning_rate": 4.173626548110511e-05, "loss": 0.6772, "step": 41900 }, { "epoch": 13.321689425214354, "grad_norm": 25091.099609375, "learning_rate": 4.169657033979041e-05, "loss": 0.6804, "step": 41950 }, { "epoch": 13.337567481740235, "grad_norm": 24772.115234375, "learning_rate": 4.1656875198475706e-05, "loss": 0.6788, "step": 42000 }, { "epoch": 13.353445538266117, "grad_norm": 25392.5, "learning_rate": 4.1617180057161006e-05, "loss": 0.6814, "step": 42050 }, { "epoch": 13.369323594791997, "grad_norm": 24344.09375, "learning_rate": 4.1577484915846306e-05, "loss": 0.683, "step": 42100 }, { "epoch": 13.385201651317878, "grad_norm": 24251.580078125, "learning_rate": 4.15377897745316e-05, "loss": 0.6826, "step": 42150 }, { "epoch": 13.40107970784376, "grad_norm": 25104.28125, "learning_rate": 4.149809463321689e-05, "loss": 0.6812, "step": 42200 }, { "epoch": 13.416957764369641, "grad_norm": 24021.865234375, "learning_rate": 4.1458399491902194e-05, "loss": 0.6802, "step": 42250 }, { "epoch": 13.432835820895523, "grad_norm": 24694.013671875, "learning_rate": 4.141870435058749e-05, "loss": 0.683, "step": 42300 }, { "epoch": 13.448713877421403, "grad_norm": 24235.306640625, "learning_rate": 4.137900920927279e-05, "loss": 0.6818, "step": 42350 }, { "epoch": 13.464591933947284, "grad_norm": 24905.02734375, "learning_rate": 4.133931406795809e-05, "loss": 0.68, "step": 42400 }, { "epoch": 13.480469990473166, "grad_norm": 24339.927734375, "learning_rate": 4.129961892664338e-05, "loss": 0.6799, "step": 42450 }, { "epoch": 13.496348046999048, "grad_norm": 24592.8828125, "learning_rate": 4.125992378532868e-05, "loss": 0.6776, "step": 42500 }, { "epoch": 13.51222610352493, "grad_norm": 24951.8125, "learning_rate": 4.1220228644013975e-05, "loss": 0.6841, "step": 42550 }, { "epoch": 13.528104160050809, "grad_norm": 24222.96875, "learning_rate": 4.118053350269927e-05, "loss": 0.6817, "step": 42600 }, { "epoch": 13.54398221657669, "grad_norm": 24539.65234375, "learning_rate": 4.114083836138457e-05, "loss": 0.6812, "step": 42650 }, { "epoch": 13.559860273102572, "grad_norm": 24310.98828125, "learning_rate": 4.110114322006986e-05, "loss": 0.6847, "step": 42700 }, { "epoch": 13.575738329628454, "grad_norm": 24087.740234375, "learning_rate": 4.106144807875516e-05, "loss": 0.6795, "step": 42750 }, { "epoch": 13.591616386154335, "grad_norm": 24772.966796875, "learning_rate": 4.102175293744046e-05, "loss": 0.68, "step": 42800 }, { "epoch": 13.607494442680215, "grad_norm": 24550.08203125, "learning_rate": 4.0982057796125756e-05, "loss": 0.6789, "step": 42850 }, { "epoch": 13.623372499206097, "grad_norm": 24223.595703125, "learning_rate": 4.094236265481105e-05, "loss": 0.6837, "step": 42900 }, { "epoch": 13.639250555731978, "grad_norm": 24113.8984375, "learning_rate": 4.090266751349635e-05, "loss": 0.681, "step": 42950 }, { "epoch": 13.65512861225786, "grad_norm": 24669.611328125, "learning_rate": 4.0862972372181644e-05, "loss": 0.6832, "step": 43000 }, { "epoch": 13.671006668783741, "grad_norm": 23798.759765625, "learning_rate": 4.0823277230866944e-05, "loss": 0.677, "step": 43050 }, { "epoch": 13.686884725309621, "grad_norm": 24341.833984375, "learning_rate": 4.0783582089552244e-05, "loss": 0.6814, "step": 43100 }, { "epoch": 13.702762781835503, "grad_norm": 24048.4609375, "learning_rate": 4.074388694823754e-05, "loss": 0.6784, "step": 43150 }, { "epoch": 13.718640838361384, "grad_norm": 24171.9921875, "learning_rate": 4.070419180692284e-05, "loss": 0.6803, "step": 43200 }, { "epoch": 13.734518894887266, "grad_norm": 24474.33203125, "learning_rate": 4.066449666560813e-05, "loss": 0.6785, "step": 43250 }, { "epoch": 13.750396951413148, "grad_norm": 23632.65625, "learning_rate": 4.0624801524293425e-05, "loss": 0.6811, "step": 43300 }, { "epoch": 13.766275007939027, "grad_norm": 23942.4140625, "learning_rate": 4.0585106382978725e-05, "loss": 0.6802, "step": 43350 }, { "epoch": 13.782153064464909, "grad_norm": 24818.43359375, "learning_rate": 4.054541124166402e-05, "loss": 0.6817, "step": 43400 }, { "epoch": 13.79803112099079, "grad_norm": 23369.2265625, "learning_rate": 4.050571610034932e-05, "loss": 0.6811, "step": 43450 }, { "epoch": 13.813909177516672, "grad_norm": 24389.009765625, "learning_rate": 4.046602095903462e-05, "loss": 0.6805, "step": 43500 }, { "epoch": 13.829787234042554, "grad_norm": 24440.2421875, "learning_rate": 4.042632581771991e-05, "loss": 0.681, "step": 43550 }, { "epoch": 13.845665290568434, "grad_norm": 24627.8203125, "learning_rate": 4.038663067640521e-05, "loss": 0.6775, "step": 43600 }, { "epoch": 13.861543347094315, "grad_norm": 23863.171875, "learning_rate": 4.034693553509051e-05, "loss": 0.6803, "step": 43650 }, { "epoch": 13.877421403620197, "grad_norm": 24950.375, "learning_rate": 4.03072403937758e-05, "loss": 0.6825, "step": 43700 }, { "epoch": 13.893299460146078, "grad_norm": 23713.75390625, "learning_rate": 4.02675452524611e-05, "loss": 0.6774, "step": 43750 }, { "epoch": 13.90917751667196, "grad_norm": 24111.923828125, "learning_rate": 4.02278501111464e-05, "loss": 0.6769, "step": 43800 }, { "epoch": 13.92505557319784, "grad_norm": 24068.44140625, "learning_rate": 4.0188154969831695e-05, "loss": 0.6792, "step": 43850 }, { "epoch": 13.940933629723721, "grad_norm": 24496.923828125, "learning_rate": 4.0148459828516995e-05, "loss": 0.6786, "step": 43900 }, { "epoch": 13.956811686249603, "grad_norm": 24324.5234375, "learning_rate": 4.010876468720229e-05, "loss": 0.6818, "step": 43950 }, { "epoch": 13.972689742775485, "grad_norm": 24043.82421875, "learning_rate": 4.006906954588758e-05, "loss": 0.6756, "step": 44000 }, { "epoch": 13.988567799301366, "grad_norm": 23718.05859375, "learning_rate": 4.002937440457288e-05, "loss": 0.6742, "step": 44050 }, { "epoch": 14.004445855827246, "grad_norm": 24011.693359375, "learning_rate": 3.9989679263258176e-05, "loss": 0.6766, "step": 44100 }, { "epoch": 14.020323912353128, "grad_norm": 23874.05859375, "learning_rate": 3.9949984121943476e-05, "loss": 0.6721, "step": 44150 }, { "epoch": 14.03620196887901, "grad_norm": 24170.900390625, "learning_rate": 3.9910288980628776e-05, "loss": 0.6746, "step": 44200 }, { "epoch": 14.05208002540489, "grad_norm": 23806.9921875, "learning_rate": 3.987059383931407e-05, "loss": 0.6702, "step": 44250 }, { "epoch": 14.067958081930772, "grad_norm": 24097.408203125, "learning_rate": 3.983089869799937e-05, "loss": 0.6726, "step": 44300 }, { "epoch": 14.083836138456652, "grad_norm": 24183.365234375, "learning_rate": 3.9791203556684664e-05, "loss": 0.6688, "step": 44350 }, { "epoch": 14.099714194982534, "grad_norm": 25200.125, "learning_rate": 3.975150841536996e-05, "loss": 0.6743, "step": 44400 }, { "epoch": 14.115592251508415, "grad_norm": 24078.8359375, "learning_rate": 3.971181327405526e-05, "loss": 0.6707, "step": 44450 }, { "epoch": 14.131470308034297, "grad_norm": 24092.71875, "learning_rate": 3.967211813274056e-05, "loss": 0.6727, "step": 44500 }, { "epoch": 14.147348364560179, "grad_norm": 24398.52734375, "learning_rate": 3.963242299142585e-05, "loss": 0.6726, "step": 44550 }, { "epoch": 14.163226421086058, "grad_norm": 25464.837890625, "learning_rate": 3.959272785011115e-05, "loss": 0.6692, "step": 44600 }, { "epoch": 14.17910447761194, "grad_norm": 24340.91796875, "learning_rate": 3.9553032708796445e-05, "loss": 0.6726, "step": 44650 }, { "epoch": 14.194982534137822, "grad_norm": 24626.837890625, "learning_rate": 3.951333756748174e-05, "loss": 0.6719, "step": 44700 }, { "epoch": 14.210860590663703, "grad_norm": 24861.15234375, "learning_rate": 3.947364242616704e-05, "loss": 0.6724, "step": 44750 }, { "epoch": 14.226738647189585, "grad_norm": 23964.291015625, "learning_rate": 3.943394728485233e-05, "loss": 0.674, "step": 44800 }, { "epoch": 14.242616703715465, "grad_norm": 23936.125, "learning_rate": 3.939425214353763e-05, "loss": 0.6743, "step": 44850 }, { "epoch": 14.258494760241346, "grad_norm": 24653.296875, "learning_rate": 3.935455700222293e-05, "loss": 0.6723, "step": 44900 }, { "epoch": 14.274372816767228, "grad_norm": 24398.095703125, "learning_rate": 3.9314861860908226e-05, "loss": 0.6774, "step": 44950 }, { "epoch": 14.29025087329311, "grad_norm": 24063.818359375, "learning_rate": 3.927516671959353e-05, "loss": 0.6701, "step": 45000 }, { "epoch": 14.30612892981899, "grad_norm": 24251.955078125, "learning_rate": 3.923547157827882e-05, "loss": 0.6749, "step": 45050 }, { "epoch": 14.32200698634487, "grad_norm": 24572.671875, "learning_rate": 3.9195776436964114e-05, "loss": 0.6754, "step": 45100 }, { "epoch": 14.337885042870752, "grad_norm": 24464.720703125, "learning_rate": 3.9156081295649414e-05, "loss": 0.6718, "step": 45150 }, { "epoch": 14.353763099396634, "grad_norm": 24511.29296875, "learning_rate": 3.9116386154334714e-05, "loss": 0.6752, "step": 45200 }, { "epoch": 14.369641155922515, "grad_norm": 24030.92578125, "learning_rate": 3.907669101302001e-05, "loss": 0.6737, "step": 45250 }, { "epoch": 14.385519212448397, "grad_norm": 23963.9453125, "learning_rate": 3.903699587170531e-05, "loss": 0.6726, "step": 45300 }, { "epoch": 14.401397268974277, "grad_norm": 24408.703125, "learning_rate": 3.89973007303906e-05, "loss": 0.6755, "step": 45350 }, { "epoch": 14.417275325500158, "grad_norm": 24175.90234375, "learning_rate": 3.89576055890759e-05, "loss": 0.6733, "step": 45400 }, { "epoch": 14.43315338202604, "grad_norm": 24181.03515625, "learning_rate": 3.8917910447761195e-05, "loss": 0.6715, "step": 45450 }, { "epoch": 14.449031438551922, "grad_norm": 24088.134765625, "learning_rate": 3.887821530644649e-05, "loss": 0.6734, "step": 45500 }, { "epoch": 14.464909495077803, "grad_norm": 24348.998046875, "learning_rate": 3.883852016513179e-05, "loss": 0.673, "step": 45550 }, { "epoch": 14.480787551603683, "grad_norm": 23610.765625, "learning_rate": 3.879882502381709e-05, "loss": 0.6733, "step": 45600 }, { "epoch": 14.496665608129565, "grad_norm": 23721.083984375, "learning_rate": 3.875912988250238e-05, "loss": 0.6731, "step": 45650 }, { "epoch": 14.512543664655446, "grad_norm": 24457.9375, "learning_rate": 3.871943474118768e-05, "loss": 0.6733, "step": 45700 }, { "epoch": 14.528421721181328, "grad_norm": 24163.458984375, "learning_rate": 3.867973959987298e-05, "loss": 0.6721, "step": 45750 }, { "epoch": 14.54429977770721, "grad_norm": 23760.05859375, "learning_rate": 3.864004445855827e-05, "loss": 0.6715, "step": 45800 }, { "epoch": 14.56017783423309, "grad_norm": 23812.14453125, "learning_rate": 3.860034931724357e-05, "loss": 0.6727, "step": 45850 }, { "epoch": 14.57605589075897, "grad_norm": 23580.501953125, "learning_rate": 3.8560654175928864e-05, "loss": 0.6704, "step": 45900 }, { "epoch": 14.591933947284852, "grad_norm": 24045.09765625, "learning_rate": 3.8520959034614165e-05, "loss": 0.6718, "step": 45950 }, { "epoch": 14.607812003810734, "grad_norm": 24375.74609375, "learning_rate": 3.8481263893299465e-05, "loss": 0.6709, "step": 46000 }, { "epoch": 14.623690060336616, "grad_norm": 24000.484375, "learning_rate": 3.844156875198476e-05, "loss": 0.674, "step": 46050 }, { "epoch": 14.639568116862495, "grad_norm": 23421.57421875, "learning_rate": 3.840187361067006e-05, "loss": 0.6735, "step": 46100 }, { "epoch": 14.655446173388377, "grad_norm": 24476.041015625, "learning_rate": 3.836217846935535e-05, "loss": 0.6737, "step": 46150 }, { "epoch": 14.671324229914259, "grad_norm": 23659.337890625, "learning_rate": 3.8322483328040646e-05, "loss": 0.6752, "step": 46200 }, { "epoch": 14.68720228644014, "grad_norm": 24077.185546875, "learning_rate": 3.8282788186725946e-05, "loss": 0.6684, "step": 46250 }, { "epoch": 14.703080342966022, "grad_norm": 24052.515625, "learning_rate": 3.8243093045411246e-05, "loss": 0.673, "step": 46300 }, { "epoch": 14.718958399491902, "grad_norm": 23948.08203125, "learning_rate": 3.820339790409654e-05, "loss": 0.6745, "step": 46350 }, { "epoch": 14.734836456017783, "grad_norm": 23770.400390625, "learning_rate": 3.816370276278184e-05, "loss": 0.6699, "step": 46400 }, { "epoch": 14.750714512543665, "grad_norm": 23651.755859375, "learning_rate": 3.8124007621467134e-05, "loss": 0.6722, "step": 46450 }, { "epoch": 14.766592569069546, "grad_norm": 24653.626953125, "learning_rate": 3.808431248015243e-05, "loss": 0.6698, "step": 46500 }, { "epoch": 14.782470625595428, "grad_norm": 24360.767578125, "learning_rate": 3.804461733883773e-05, "loss": 0.6701, "step": 46550 }, { "epoch": 14.798348682121308, "grad_norm": 23147.37109375, "learning_rate": 3.800492219752302e-05, "loss": 0.6687, "step": 46600 }, { "epoch": 14.81422673864719, "grad_norm": 24035.583984375, "learning_rate": 3.796522705620832e-05, "loss": 0.6703, "step": 46650 }, { "epoch": 14.830104795173071, "grad_norm": 23670.650390625, "learning_rate": 3.792553191489362e-05, "loss": 0.67, "step": 46700 }, { "epoch": 14.845982851698952, "grad_norm": 23496.2265625, "learning_rate": 3.7885836773578915e-05, "loss": 0.6741, "step": 46750 }, { "epoch": 14.861860908224834, "grad_norm": 24511.904296875, "learning_rate": 3.7846141632264215e-05, "loss": 0.673, "step": 46800 }, { "epoch": 14.877738964750714, "grad_norm": 23906.30859375, "learning_rate": 3.780644649094951e-05, "loss": 0.6745, "step": 46850 }, { "epoch": 14.893617021276595, "grad_norm": 23655.87109375, "learning_rate": 3.77667513496348e-05, "loss": 0.668, "step": 46900 }, { "epoch": 14.909495077802477, "grad_norm": 23849.982421875, "learning_rate": 3.77270562083201e-05, "loss": 0.6671, "step": 46950 }, { "epoch": 14.925373134328359, "grad_norm": 24548.85546875, "learning_rate": 3.76873610670054e-05, "loss": 0.6698, "step": 47000 }, { "epoch": 14.94125119085424, "grad_norm": 24144.28515625, "learning_rate": 3.7647665925690696e-05, "loss": 0.668, "step": 47050 }, { "epoch": 14.95712924738012, "grad_norm": 23972.701171875, "learning_rate": 3.7607970784376e-05, "loss": 0.6677, "step": 47100 }, { "epoch": 14.973007303906002, "grad_norm": 23551.353515625, "learning_rate": 3.756827564306129e-05, "loss": 0.6699, "step": 47150 }, { "epoch": 14.988885360431883, "grad_norm": 23527.978515625, "learning_rate": 3.752858050174659e-05, "loss": 0.6702, "step": 47200 }, { "epoch": 15.004763416957765, "grad_norm": 23273.08984375, "learning_rate": 3.7488885360431884e-05, "loss": 0.6702, "step": 47250 }, { "epoch": 15.020641473483646, "grad_norm": 23773.09765625, "learning_rate": 3.744919021911718e-05, "loss": 0.6671, "step": 47300 }, { "epoch": 15.036519530009526, "grad_norm": 24133.150390625, "learning_rate": 3.740949507780248e-05, "loss": 0.6607, "step": 47350 }, { "epoch": 15.052397586535408, "grad_norm": 23611.69140625, "learning_rate": 3.736979993648778e-05, "loss": 0.6608, "step": 47400 }, { "epoch": 15.06827564306129, "grad_norm": 24616.044921875, "learning_rate": 3.733010479517307e-05, "loss": 0.6678, "step": 47450 }, { "epoch": 15.084153699587171, "grad_norm": 24132.029296875, "learning_rate": 3.729040965385837e-05, "loss": 0.6634, "step": 47500 }, { "epoch": 15.100031756113053, "grad_norm": 24181.455078125, "learning_rate": 3.7250714512543665e-05, "loss": 0.6655, "step": 47550 }, { "epoch": 15.115909812638932, "grad_norm": 23974.443359375, "learning_rate": 3.721101937122896e-05, "loss": 0.662, "step": 47600 }, { "epoch": 15.131787869164814, "grad_norm": 24213.7890625, "learning_rate": 3.717132422991426e-05, "loss": 0.6608, "step": 47650 }, { "epoch": 15.147665925690696, "grad_norm": 24070.62890625, "learning_rate": 3.713162908859956e-05, "loss": 0.6658, "step": 47700 }, { "epoch": 15.163543982216577, "grad_norm": 23968.4296875, "learning_rate": 3.709193394728485e-05, "loss": 0.6609, "step": 47750 }, { "epoch": 15.179422038742459, "grad_norm": 23617.162109375, "learning_rate": 3.7052238805970153e-05, "loss": 0.6613, "step": 47800 }, { "epoch": 15.195300095268339, "grad_norm": 24029.837890625, "learning_rate": 3.701254366465545e-05, "loss": 0.664, "step": 47850 }, { "epoch": 15.21117815179422, "grad_norm": 23731.349609375, "learning_rate": 3.697284852334075e-05, "loss": 0.6636, "step": 47900 }, { "epoch": 15.227056208320102, "grad_norm": 24603.748046875, "learning_rate": 3.693315338202604e-05, "loss": 0.6629, "step": 47950 }, { "epoch": 15.242934264845983, "grad_norm": 23958.181640625, "learning_rate": 3.6893458240711334e-05, "loss": 0.6648, "step": 48000 }, { "epoch": 15.258812321371865, "grad_norm": 23479.33984375, "learning_rate": 3.6853763099396635e-05, "loss": 0.6653, "step": 48050 }, { "epoch": 15.274690377897745, "grad_norm": 24116.53515625, "learning_rate": 3.6814067958081935e-05, "loss": 0.6622, "step": 48100 }, { "epoch": 15.290568434423626, "grad_norm": 23710.822265625, "learning_rate": 3.677437281676723e-05, "loss": 0.6632, "step": 48150 }, { "epoch": 15.306446490949508, "grad_norm": 24976.740234375, "learning_rate": 3.673467767545253e-05, "loss": 0.6648, "step": 48200 }, { "epoch": 15.32232454747539, "grad_norm": 24450.333984375, "learning_rate": 3.669498253413782e-05, "loss": 0.6665, "step": 48250 }, { "epoch": 15.338202604001271, "grad_norm": 24068.060546875, "learning_rate": 3.6655287392823116e-05, "loss": 0.6653, "step": 48300 }, { "epoch": 15.354080660527151, "grad_norm": 23732.517578125, "learning_rate": 3.6615592251508416e-05, "loss": 0.6649, "step": 48350 }, { "epoch": 15.369958717053033, "grad_norm": 23714.439453125, "learning_rate": 3.6575897110193716e-05, "loss": 0.6644, "step": 48400 }, { "epoch": 15.385836773578914, "grad_norm": 24033.36328125, "learning_rate": 3.6536201968879017e-05, "loss": 0.6648, "step": 48450 }, { "epoch": 15.401714830104796, "grad_norm": 23467.201171875, "learning_rate": 3.649650682756431e-05, "loss": 0.6632, "step": 48500 }, { "epoch": 15.417592886630676, "grad_norm": 23929.56640625, "learning_rate": 3.6456811686249604e-05, "loss": 0.6621, "step": 48550 }, { "epoch": 15.433470943156557, "grad_norm": 23940.3125, "learning_rate": 3.6417116544934904e-05, "loss": 0.6666, "step": 48600 }, { "epoch": 15.449348999682439, "grad_norm": 23760.220703125, "learning_rate": 3.63774214036202e-05, "loss": 0.6657, "step": 48650 }, { "epoch": 15.46522705620832, "grad_norm": 24150.181640625, "learning_rate": 3.633772626230549e-05, "loss": 0.6635, "step": 48700 }, { "epoch": 15.481105112734202, "grad_norm": 23952.111328125, "learning_rate": 3.629803112099079e-05, "loss": 0.6659, "step": 48750 }, { "epoch": 15.496983169260083, "grad_norm": 23915.330078125, "learning_rate": 3.625833597967609e-05, "loss": 0.6649, "step": 48800 }, { "epoch": 15.512861225785963, "grad_norm": 24483.07421875, "learning_rate": 3.6218640838361385e-05, "loss": 0.6633, "step": 48850 }, { "epoch": 15.528739282311845, "grad_norm": 24314.33203125, "learning_rate": 3.6178945697046685e-05, "loss": 0.6649, "step": 48900 }, { "epoch": 15.544617338837726, "grad_norm": 24762.5625, "learning_rate": 3.613925055573198e-05, "loss": 0.6652, "step": 48950 }, { "epoch": 15.560495395363608, "grad_norm": 24496.2109375, "learning_rate": 3.609955541441727e-05, "loss": 0.6636, "step": 49000 }, { "epoch": 15.576373451889488, "grad_norm": 24142.921875, "learning_rate": 3.605986027310257e-05, "loss": 0.665, "step": 49050 }, { "epoch": 15.59225150841537, "grad_norm": 23896.888671875, "learning_rate": 3.602016513178787e-05, "loss": 0.6636, "step": 49100 }, { "epoch": 15.608129564941251, "grad_norm": 23628.111328125, "learning_rate": 3.598046999047317e-05, "loss": 0.6615, "step": 49150 }, { "epoch": 15.624007621467133, "grad_norm": 23748.744140625, "learning_rate": 3.594077484915847e-05, "loss": 0.6648, "step": 49200 }, { "epoch": 15.639885677993014, "grad_norm": 23825.630859375, "learning_rate": 3.590107970784376e-05, "loss": 0.6652, "step": 49250 }, { "epoch": 15.655763734518896, "grad_norm": 23878.716796875, "learning_rate": 3.586138456652906e-05, "loss": 0.6615, "step": 49300 }, { "epoch": 15.671641791044776, "grad_norm": 23718.876953125, "learning_rate": 3.5821689425214354e-05, "loss": 0.6622, "step": 49350 }, { "epoch": 15.687519847570657, "grad_norm": 23546.572265625, "learning_rate": 3.578199428389965e-05, "loss": 0.6641, "step": 49400 }, { "epoch": 15.703397904096539, "grad_norm": 23565.736328125, "learning_rate": 3.574229914258495e-05, "loss": 0.6656, "step": 49450 }, { "epoch": 15.71927596062242, "grad_norm": 23384.71484375, "learning_rate": 3.570260400127025e-05, "loss": 0.6618, "step": 49500 }, { "epoch": 15.7351540171483, "grad_norm": 23340.80078125, "learning_rate": 3.566290885995554e-05, "loss": 0.6653, "step": 49550 }, { "epoch": 15.751032073674182, "grad_norm": 24208.947265625, "learning_rate": 3.562321371864084e-05, "loss": 0.6661, "step": 49600 }, { "epoch": 15.766910130200063, "grad_norm": 23574.521484375, "learning_rate": 3.5583518577326135e-05, "loss": 0.6666, "step": 49650 }, { "epoch": 15.782788186725945, "grad_norm": 23982.78125, "learning_rate": 3.5543823436011436e-05, "loss": 0.663, "step": 49700 }, { "epoch": 15.798666243251827, "grad_norm": 23564.3359375, "learning_rate": 3.550412829469673e-05, "loss": 0.6606, "step": 49750 }, { "epoch": 15.814544299777708, "grad_norm": 23933.189453125, "learning_rate": 3.546443315338203e-05, "loss": 0.6647, "step": 49800 }, { "epoch": 15.830422356303588, "grad_norm": 23510.328125, "learning_rate": 3.542473801206733e-05, "loss": 0.6635, "step": 49850 }, { "epoch": 15.84630041282947, "grad_norm": 23416.091796875, "learning_rate": 3.5385042870752623e-05, "loss": 0.6646, "step": 49900 }, { "epoch": 15.862178469355351, "grad_norm": 23560.341796875, "learning_rate": 3.534534772943792e-05, "loss": 0.663, "step": 49950 }, { "epoch": 15.878056525881233, "grad_norm": 23726.447265625, "learning_rate": 3.530565258812322e-05, "loss": 0.6649, "step": 50000 }, { "epoch": 15.893934582407113, "grad_norm": 23844.755859375, "learning_rate": 3.526595744680851e-05, "loss": 0.6622, "step": 50050 }, { "epoch": 15.909812638932994, "grad_norm": 23132.53125, "learning_rate": 3.5226262305493804e-05, "loss": 0.6612, "step": 50100 }, { "epoch": 15.925690695458876, "grad_norm": 24354.9453125, "learning_rate": 3.5186567164179105e-05, "loss": 0.6633, "step": 50150 }, { "epoch": 15.941568751984757, "grad_norm": 23439.099609375, "learning_rate": 3.5146872022864405e-05, "loss": 0.6619, "step": 50200 }, { "epoch": 15.957446808510639, "grad_norm": 24292.439453125, "learning_rate": 3.5107176881549705e-05, "loss": 0.6629, "step": 50250 }, { "epoch": 15.973324865036519, "grad_norm": 23685.8515625, "learning_rate": 3.5067481740235e-05, "loss": 0.6622, "step": 50300 }, { "epoch": 15.9892029215624, "grad_norm": 23704.876953125, "learning_rate": 3.502778659892029e-05, "loss": 0.663, "step": 50350 }, { "epoch": 16.005080978088284, "grad_norm": 23387.416015625, "learning_rate": 3.498809145760559e-05, "loss": 0.659, "step": 50400 }, { "epoch": 16.02095903461416, "grad_norm": 23603.845703125, "learning_rate": 3.4948396316290886e-05, "loss": 0.6574, "step": 50450 }, { "epoch": 16.036837091140043, "grad_norm": 24016.529296875, "learning_rate": 3.490870117497618e-05, "loss": 0.657, "step": 50500 }, { "epoch": 16.052715147665925, "grad_norm": 24235.45703125, "learning_rate": 3.4869006033661487e-05, "loss": 0.6557, "step": 50550 }, { "epoch": 16.068593204191806, "grad_norm": 23704.05859375, "learning_rate": 3.482931089234678e-05, "loss": 0.6547, "step": 50600 }, { "epoch": 16.084471260717688, "grad_norm": 23739.4921875, "learning_rate": 3.4789615751032074e-05, "loss": 0.6541, "step": 50650 }, { "epoch": 16.10034931724357, "grad_norm": 24145.8046875, "learning_rate": 3.4749920609717374e-05, "loss": 0.6594, "step": 50700 }, { "epoch": 16.11622737376945, "grad_norm": 24285.50390625, "learning_rate": 3.471022546840267e-05, "loss": 0.6569, "step": 50750 }, { "epoch": 16.132105430295333, "grad_norm": 23721.884765625, "learning_rate": 3.467053032708796e-05, "loss": 0.6585, "step": 50800 }, { "epoch": 16.147983486821214, "grad_norm": 23684.302734375, "learning_rate": 3.463083518577326e-05, "loss": 0.6563, "step": 50850 }, { "epoch": 16.163861543347096, "grad_norm": 24330.8046875, "learning_rate": 3.459114004445856e-05, "loss": 0.6547, "step": 50900 }, { "epoch": 16.179739599872974, "grad_norm": 23832.5703125, "learning_rate": 3.455144490314386e-05, "loss": 0.6578, "step": 50950 }, { "epoch": 16.195617656398856, "grad_norm": 23988.150390625, "learning_rate": 3.4511749761829155e-05, "loss": 0.6558, "step": 51000 }, { "epoch": 16.211495712924737, "grad_norm": 23709.25, "learning_rate": 3.447205462051445e-05, "loss": 0.655, "step": 51050 }, { "epoch": 16.22737376945062, "grad_norm": 23120.591796875, "learning_rate": 3.443235947919975e-05, "loss": 0.6565, "step": 51100 }, { "epoch": 16.2432518259765, "grad_norm": 23713.447265625, "learning_rate": 3.439266433788504e-05, "loss": 0.6583, "step": 51150 }, { "epoch": 16.259129882502382, "grad_norm": 23596.0625, "learning_rate": 3.4352969196570336e-05, "loss": 0.6585, "step": 51200 }, { "epoch": 16.275007939028264, "grad_norm": 24150.869140625, "learning_rate": 3.4313274055255636e-05, "loss": 0.6575, "step": 51250 }, { "epoch": 16.290885995554145, "grad_norm": 24331.630859375, "learning_rate": 3.427357891394094e-05, "loss": 0.6594, "step": 51300 }, { "epoch": 16.306764052080027, "grad_norm": 23451.88671875, "learning_rate": 3.423388377262623e-05, "loss": 0.6596, "step": 51350 }, { "epoch": 16.32264210860591, "grad_norm": 23804.421875, "learning_rate": 3.419418863131153e-05, "loss": 0.6565, "step": 51400 }, { "epoch": 16.338520165131786, "grad_norm": 24613.421875, "learning_rate": 3.4154493489996824e-05, "loss": 0.6608, "step": 51450 }, { "epoch": 16.354398221657668, "grad_norm": 23770.623046875, "learning_rate": 3.4114798348682124e-05, "loss": 0.6558, "step": 51500 }, { "epoch": 16.37027627818355, "grad_norm": 23497.1171875, "learning_rate": 3.407510320736742e-05, "loss": 0.6559, "step": 51550 }, { "epoch": 16.38615433470943, "grad_norm": 23999.451171875, "learning_rate": 3.403540806605272e-05, "loss": 0.66, "step": 51600 }, { "epoch": 16.402032391235313, "grad_norm": 23710.712890625, "learning_rate": 3.399571292473802e-05, "loss": 0.6605, "step": 51650 }, { "epoch": 16.417910447761194, "grad_norm": 23660.166015625, "learning_rate": 3.395601778342331e-05, "loss": 0.6549, "step": 51700 }, { "epoch": 16.433788504287076, "grad_norm": 23169.587890625, "learning_rate": 3.3916322642108605e-05, "loss": 0.6604, "step": 51750 }, { "epoch": 16.449666560812958, "grad_norm": 24434.888671875, "learning_rate": 3.3876627500793906e-05, "loss": 0.6586, "step": 51800 }, { "epoch": 16.46554461733884, "grad_norm": 23537.533203125, "learning_rate": 3.38369323594792e-05, "loss": 0.6588, "step": 51850 }, { "epoch": 16.48142267386472, "grad_norm": 23304.681640625, "learning_rate": 3.379723721816449e-05, "loss": 0.6571, "step": 51900 }, { "epoch": 16.4973007303906, "grad_norm": 23815.26171875, "learning_rate": 3.375754207684979e-05, "loss": 0.6593, "step": 51950 }, { "epoch": 16.51317878691648, "grad_norm": 24492.578125, "learning_rate": 3.3717846935535093e-05, "loss": 0.6583, "step": 52000 }, { "epoch": 16.529056843442362, "grad_norm": 23740.220703125, "learning_rate": 3.3678151794220394e-05, "loss": 0.6548, "step": 52050 }, { "epoch": 16.544934899968244, "grad_norm": 23588.91015625, "learning_rate": 3.363845665290569e-05, "loss": 0.6556, "step": 52100 }, { "epoch": 16.560812956494125, "grad_norm": 23579.50390625, "learning_rate": 3.359876151159098e-05, "loss": 0.6572, "step": 52150 }, { "epoch": 16.576691013020007, "grad_norm": 23243.826171875, "learning_rate": 3.355906637027628e-05, "loss": 0.658, "step": 52200 }, { "epoch": 16.59256906954589, "grad_norm": 23816.73828125, "learning_rate": 3.3519371228961575e-05, "loss": 0.6573, "step": 52250 }, { "epoch": 16.60844712607177, "grad_norm": 23419.2265625, "learning_rate": 3.3479676087646875e-05, "loss": 0.6569, "step": 52300 }, { "epoch": 16.62432518259765, "grad_norm": 24114.97265625, "learning_rate": 3.3439980946332175e-05, "loss": 0.6556, "step": 52350 }, { "epoch": 16.64020323912353, "grad_norm": 23384.103515625, "learning_rate": 3.340028580501747e-05, "loss": 0.6595, "step": 52400 }, { "epoch": 16.65608129564941, "grad_norm": 23055.017578125, "learning_rate": 3.336059066370276e-05, "loss": 0.6539, "step": 52450 }, { "epoch": 16.671959352175293, "grad_norm": 23221.4609375, "learning_rate": 3.332089552238806e-05, "loss": 0.6587, "step": 52500 }, { "epoch": 16.687837408701174, "grad_norm": 23152.85546875, "learning_rate": 3.3281200381073356e-05, "loss": 0.6583, "step": 52550 }, { "epoch": 16.703715465227056, "grad_norm": 23718.134765625, "learning_rate": 3.324150523975865e-05, "loss": 0.6575, "step": 52600 }, { "epoch": 16.719593521752937, "grad_norm": 23835.32421875, "learning_rate": 3.320181009844395e-05, "loss": 0.652, "step": 52650 }, { "epoch": 16.73547157827882, "grad_norm": 24064.998046875, "learning_rate": 3.316211495712925e-05, "loss": 0.6544, "step": 52700 }, { "epoch": 16.7513496348047, "grad_norm": 23508.853515625, "learning_rate": 3.312241981581455e-05, "loss": 0.6558, "step": 52750 }, { "epoch": 16.767227691330582, "grad_norm": 23061.650390625, "learning_rate": 3.3082724674499844e-05, "loss": 0.6591, "step": 52800 }, { "epoch": 16.783105747856464, "grad_norm": 23995.44140625, "learning_rate": 3.304302953318514e-05, "loss": 0.6559, "step": 52850 }, { "epoch": 16.798983804382345, "grad_norm": 23488.259765625, "learning_rate": 3.300333439187044e-05, "loss": 0.6571, "step": 52900 }, { "epoch": 16.814861860908223, "grad_norm": 24552.5625, "learning_rate": 3.296363925055573e-05, "loss": 0.6547, "step": 52950 }, { "epoch": 16.830739917434105, "grad_norm": 23736.818359375, "learning_rate": 3.292394410924103e-05, "loss": 0.6578, "step": 53000 }, { "epoch": 16.846617973959987, "grad_norm": 24375.546875, "learning_rate": 3.288424896792633e-05, "loss": 0.655, "step": 53050 }, { "epoch": 16.862496030485868, "grad_norm": 23373.5, "learning_rate": 3.2844553826611625e-05, "loss": 0.6555, "step": 53100 }, { "epoch": 16.87837408701175, "grad_norm": 24301.333984375, "learning_rate": 3.280485868529692e-05, "loss": 0.6574, "step": 53150 }, { "epoch": 16.89425214353763, "grad_norm": 23277.462890625, "learning_rate": 3.276516354398222e-05, "loss": 0.6555, "step": 53200 }, { "epoch": 16.910130200063513, "grad_norm": 23834.724609375, "learning_rate": 3.272546840266751e-05, "loss": 0.6582, "step": 53250 }, { "epoch": 16.926008256589395, "grad_norm": 23567.853515625, "learning_rate": 3.268577326135281e-05, "loss": 0.6547, "step": 53300 }, { "epoch": 16.941886313115276, "grad_norm": 23326.3671875, "learning_rate": 3.2646078120038106e-05, "loss": 0.6567, "step": 53350 }, { "epoch": 16.957764369641154, "grad_norm": 23291.1953125, "learning_rate": 3.260638297872341e-05, "loss": 0.6552, "step": 53400 }, { "epoch": 16.973642426167036, "grad_norm": 23493.248046875, "learning_rate": 3.256668783740871e-05, "loss": 0.6562, "step": 53450 }, { "epoch": 16.989520482692917, "grad_norm": 23239.197265625, "learning_rate": 3.2526992696094e-05, "loss": 0.6562, "step": 53500 }, { "epoch": 17.0053985392188, "grad_norm": 23646.5, "learning_rate": 3.2487297554779294e-05, "loss": 0.6553, "step": 53550 }, { "epoch": 17.02127659574468, "grad_norm": 23330.103515625, "learning_rate": 3.2447602413464594e-05, "loss": 0.6497, "step": 53600 }, { "epoch": 17.037154652270562, "grad_norm": 24091.298828125, "learning_rate": 3.240790727214989e-05, "loss": 0.6515, "step": 53650 }, { "epoch": 17.053032708796444, "grad_norm": 24229.26171875, "learning_rate": 3.236821213083519e-05, "loss": 0.6518, "step": 53700 }, { "epoch": 17.068910765322325, "grad_norm": 23416.974609375, "learning_rate": 3.232851698952049e-05, "loss": 0.6496, "step": 53750 }, { "epoch": 17.084788821848207, "grad_norm": 23611.005859375, "learning_rate": 3.228882184820578e-05, "loss": 0.6491, "step": 53800 }, { "epoch": 17.10066687837409, "grad_norm": 23050.642578125, "learning_rate": 3.224912670689108e-05, "loss": 0.6507, "step": 53850 }, { "epoch": 17.116544934899967, "grad_norm": 23595.267578125, "learning_rate": 3.2209431565576376e-05, "loss": 0.6475, "step": 53900 }, { "epoch": 17.132422991425848, "grad_norm": 23561.833984375, "learning_rate": 3.216973642426167e-05, "loss": 0.6487, "step": 53950 }, { "epoch": 17.14830104795173, "grad_norm": 23513.59375, "learning_rate": 3.213004128294697e-05, "loss": 0.6474, "step": 54000 }, { "epoch": 17.16417910447761, "grad_norm": 23998.837890625, "learning_rate": 3.209034614163226e-05, "loss": 0.6507, "step": 54050 }, { "epoch": 17.180057161003493, "grad_norm": 23122.234375, "learning_rate": 3.2050651000317563e-05, "loss": 0.649, "step": 54100 }, { "epoch": 17.195935217529374, "grad_norm": 23222.94921875, "learning_rate": 3.2010955859002864e-05, "loss": 0.6503, "step": 54150 }, { "epoch": 17.211813274055256, "grad_norm": 23577.119140625, "learning_rate": 3.197126071768816e-05, "loss": 0.6511, "step": 54200 }, { "epoch": 17.227691330581138, "grad_norm": 23782.28125, "learning_rate": 3.193156557637345e-05, "loss": 0.6523, "step": 54250 }, { "epoch": 17.24356938710702, "grad_norm": 24278.03125, "learning_rate": 3.189187043505875e-05, "loss": 0.654, "step": 54300 }, { "epoch": 17.2594474436329, "grad_norm": 23616.5390625, "learning_rate": 3.1852175293744045e-05, "loss": 0.651, "step": 54350 }, { "epoch": 17.27532550015878, "grad_norm": 23676.12890625, "learning_rate": 3.181248015242934e-05, "loss": 0.6503, "step": 54400 }, { "epoch": 17.29120355668466, "grad_norm": 23172.5859375, "learning_rate": 3.1772785011114645e-05, "loss": 0.6541, "step": 54450 }, { "epoch": 17.307081613210542, "grad_norm": 23444.791015625, "learning_rate": 3.173308986979994e-05, "loss": 0.6491, "step": 54500 }, { "epoch": 17.322959669736424, "grad_norm": 23441.435546875, "learning_rate": 3.169339472848524e-05, "loss": 0.6495, "step": 54550 }, { "epoch": 17.338837726262305, "grad_norm": 23380.716796875, "learning_rate": 3.165369958717053e-05, "loss": 0.6504, "step": 54600 }, { "epoch": 17.354715782788187, "grad_norm": 23601.03125, "learning_rate": 3.1614004445855826e-05, "loss": 0.6543, "step": 54650 }, { "epoch": 17.37059383931407, "grad_norm": 23646.830078125, "learning_rate": 3.1574309304541126e-05, "loss": 0.6514, "step": 54700 }, { "epoch": 17.38647189583995, "grad_norm": 23490.85546875, "learning_rate": 3.153461416322642e-05, "loss": 0.6513, "step": 54750 }, { "epoch": 17.40234995236583, "grad_norm": 23827.6796875, "learning_rate": 3.149491902191172e-05, "loss": 0.6484, "step": 54800 }, { "epoch": 17.418228008891713, "grad_norm": 23096.40625, "learning_rate": 3.145522388059702e-05, "loss": 0.6483, "step": 54850 }, { "epoch": 17.43410606541759, "grad_norm": 23743.8359375, "learning_rate": 3.1415528739282314e-05, "loss": 0.652, "step": 54900 }, { "epoch": 17.449984121943473, "grad_norm": 23075.265625, "learning_rate": 3.137583359796761e-05, "loss": 0.6501, "step": 54950 }, { "epoch": 17.465862178469354, "grad_norm": 24035.71875, "learning_rate": 3.133613845665291e-05, "loss": 0.652, "step": 55000 }, { "epoch": 17.481740234995236, "grad_norm": 23361.478515625, "learning_rate": 3.12964433153382e-05, "loss": 0.6536, "step": 55050 }, { "epoch": 17.497618291521118, "grad_norm": 23745.171875, "learning_rate": 3.12567481740235e-05, "loss": 0.6504, "step": 55100 }, { "epoch": 17.513496348047, "grad_norm": 23789.912109375, "learning_rate": 3.12170530327088e-05, "loss": 0.6491, "step": 55150 }, { "epoch": 17.52937440457288, "grad_norm": 23335.9609375, "learning_rate": 3.1177357891394095e-05, "loss": 0.6503, "step": 55200 }, { "epoch": 17.545252461098762, "grad_norm": 23689.12890625, "learning_rate": 3.1137662750079396e-05, "loss": 0.6527, "step": 55250 }, { "epoch": 17.561130517624644, "grad_norm": 23400.572265625, "learning_rate": 3.109796760876469e-05, "loss": 0.6519, "step": 55300 }, { "epoch": 17.577008574150526, "grad_norm": 23527.71875, "learning_rate": 3.105827246744998e-05, "loss": 0.6494, "step": 55350 }, { "epoch": 17.592886630676404, "grad_norm": 23462.595703125, "learning_rate": 3.101857732613528e-05, "loss": 0.6507, "step": 55400 }, { "epoch": 17.608764687202285, "grad_norm": 23832.5625, "learning_rate": 3.0978882184820576e-05, "loss": 0.648, "step": 55450 }, { "epoch": 17.624642743728167, "grad_norm": 23171.41796875, "learning_rate": 3.093918704350588e-05, "loss": 0.6515, "step": 55500 }, { "epoch": 17.64052080025405, "grad_norm": 22859.158203125, "learning_rate": 3.089949190219118e-05, "loss": 0.65, "step": 55550 }, { "epoch": 17.65639885677993, "grad_norm": 23794.48828125, "learning_rate": 3.085979676087647e-05, "loss": 0.6494, "step": 55600 }, { "epoch": 17.67227691330581, "grad_norm": 23437.626953125, "learning_rate": 3.082010161956177e-05, "loss": 0.6507, "step": 55650 }, { "epoch": 17.688154969831693, "grad_norm": 23454.169921875, "learning_rate": 3.0780406478247064e-05, "loss": 0.6528, "step": 55700 }, { "epoch": 17.704033026357575, "grad_norm": 23537.875, "learning_rate": 3.074071133693236e-05, "loss": 0.6508, "step": 55750 }, { "epoch": 17.719911082883456, "grad_norm": 23073.7421875, "learning_rate": 3.070101619561766e-05, "loss": 0.6514, "step": 55800 }, { "epoch": 17.735789139409338, "grad_norm": 24199.0234375, "learning_rate": 3.066132105430295e-05, "loss": 0.652, "step": 55850 }, { "epoch": 17.751667195935216, "grad_norm": 24246.693359375, "learning_rate": 3.062162591298825e-05, "loss": 0.6506, "step": 55900 }, { "epoch": 17.767545252461098, "grad_norm": 23131.373046875, "learning_rate": 3.058193077167355e-05, "loss": 0.6473, "step": 55950 }, { "epoch": 17.78342330898698, "grad_norm": 23967.60546875, "learning_rate": 3.0542235630358846e-05, "loss": 0.649, "step": 56000 }, { "epoch": 17.79930136551286, "grad_norm": 23857.62890625, "learning_rate": 3.050254048904414e-05, "loss": 0.6498, "step": 56050 }, { "epoch": 17.815179422038742, "grad_norm": 23931.333984375, "learning_rate": 3.046284534772944e-05, "loss": 0.651, "step": 56100 }, { "epoch": 17.831057478564624, "grad_norm": 23383.25390625, "learning_rate": 3.0423150206414736e-05, "loss": 0.6526, "step": 56150 }, { "epoch": 17.846935535090505, "grad_norm": 23175.05078125, "learning_rate": 3.038345506510003e-05, "loss": 0.6494, "step": 56200 }, { "epoch": 17.862813591616387, "grad_norm": 23574.662109375, "learning_rate": 3.034375992378533e-05, "loss": 0.6479, "step": 56250 }, { "epoch": 17.87869164814227, "grad_norm": 23581.66015625, "learning_rate": 3.0304064782470627e-05, "loss": 0.6514, "step": 56300 }, { "epoch": 17.89456970466815, "grad_norm": 23191.83203125, "learning_rate": 3.0264369641155928e-05, "loss": 0.649, "step": 56350 }, { "epoch": 17.91044776119403, "grad_norm": 23138.642578125, "learning_rate": 3.022467449984122e-05, "loss": 0.6523, "step": 56400 }, { "epoch": 17.92632581771991, "grad_norm": 24247.412109375, "learning_rate": 3.0184979358526515e-05, "loss": 0.6508, "step": 56450 }, { "epoch": 17.94220387424579, "grad_norm": 23328.904296875, "learning_rate": 3.0145284217211818e-05, "loss": 0.6521, "step": 56500 }, { "epoch": 17.958081930771673, "grad_norm": 23195.806640625, "learning_rate": 3.0105589075897112e-05, "loss": 0.6519, "step": 56550 }, { "epoch": 17.973959987297555, "grad_norm": 24111.8359375, "learning_rate": 3.0065893934582405e-05, "loss": 0.6504, "step": 56600 }, { "epoch": 17.989838043823436, "grad_norm": 23394.666015625, "learning_rate": 3.0026198793267706e-05, "loss": 0.6499, "step": 56650 }, { "epoch": 18.005716100349318, "grad_norm": 24451.033203125, "learning_rate": 2.9986503651953002e-05, "loss": 0.6493, "step": 56700 }, { "epoch": 18.0215941568752, "grad_norm": 24162.71875, "learning_rate": 2.9946808510638296e-05, "loss": 0.6437, "step": 56750 }, { "epoch": 18.03747221340108, "grad_norm": 23387.994140625, "learning_rate": 2.9907113369323596e-05, "loss": 0.6487, "step": 56800 }, { "epoch": 18.053350269926963, "grad_norm": 23269.083984375, "learning_rate": 2.9867418228008893e-05, "loss": 0.6469, "step": 56850 }, { "epoch": 18.06922832645284, "grad_norm": 23347.12890625, "learning_rate": 2.9827723086694193e-05, "loss": 0.6428, "step": 56900 }, { "epoch": 18.085106382978722, "grad_norm": 23654.724609375, "learning_rate": 2.9788027945379487e-05, "loss": 0.6485, "step": 56950 }, { "epoch": 18.100984439504604, "grad_norm": 23451.236328125, "learning_rate": 2.9748332804064784e-05, "loss": 0.6458, "step": 57000 }, { "epoch": 18.116862496030485, "grad_norm": 24515.484375, "learning_rate": 2.9708637662750084e-05, "loss": 0.6451, "step": 57050 }, { "epoch": 18.132740552556367, "grad_norm": 23294.755859375, "learning_rate": 2.9668942521435378e-05, "loss": 0.6473, "step": 57100 }, { "epoch": 18.14861860908225, "grad_norm": 24213.658203125, "learning_rate": 2.962924738012067e-05, "loss": 0.6435, "step": 57150 }, { "epoch": 18.16449666560813, "grad_norm": 23097.380859375, "learning_rate": 2.958955223880597e-05, "loss": 0.6448, "step": 57200 }, { "epoch": 18.18037472213401, "grad_norm": 23379.626953125, "learning_rate": 2.954985709749127e-05, "loss": 0.6455, "step": 57250 }, { "epoch": 18.196252778659893, "grad_norm": 23119.544921875, "learning_rate": 2.9510161956176562e-05, "loss": 0.6435, "step": 57300 }, { "epoch": 18.212130835185775, "grad_norm": 23745.896484375, "learning_rate": 2.9470466814861862e-05, "loss": 0.6469, "step": 57350 }, { "epoch": 18.228008891711653, "grad_norm": 23996.830078125, "learning_rate": 2.943077167354716e-05, "loss": 0.6471, "step": 57400 }, { "epoch": 18.243886948237535, "grad_norm": 23478.310546875, "learning_rate": 2.939107653223246e-05, "loss": 0.6435, "step": 57450 }, { "epoch": 18.259765004763416, "grad_norm": 23451.203125, "learning_rate": 2.9351381390917753e-05, "loss": 0.6466, "step": 57500 }, { "epoch": 18.275643061289298, "grad_norm": 23465.240234375, "learning_rate": 2.931168624960305e-05, "loss": 0.6473, "step": 57550 }, { "epoch": 18.29152111781518, "grad_norm": 23575.08203125, "learning_rate": 2.927199110828835e-05, "loss": 0.6465, "step": 57600 }, { "epoch": 18.30739917434106, "grad_norm": 23251.138671875, "learning_rate": 2.9232295966973644e-05, "loss": 0.6434, "step": 57650 }, { "epoch": 18.323277230866942, "grad_norm": 23791.912109375, "learning_rate": 2.919260082565894e-05, "loss": 0.6462, "step": 57700 }, { "epoch": 18.339155287392824, "grad_norm": 23962.904296875, "learning_rate": 2.915290568434424e-05, "loss": 0.6464, "step": 57750 }, { "epoch": 18.355033343918706, "grad_norm": 23311.55078125, "learning_rate": 2.9113210543029534e-05, "loss": 0.6432, "step": 57800 }, { "epoch": 18.370911400444587, "grad_norm": 23348.361328125, "learning_rate": 2.9073515401714828e-05, "loss": 0.6439, "step": 57850 }, { "epoch": 18.386789456970465, "grad_norm": 23554.818359375, "learning_rate": 2.9033820260400128e-05, "loss": 0.6468, "step": 57900 }, { "epoch": 18.402667513496347, "grad_norm": 23781.22265625, "learning_rate": 2.8994125119085425e-05, "loss": 0.6444, "step": 57950 }, { "epoch": 18.41854557002223, "grad_norm": 22692.29296875, "learning_rate": 2.895442997777072e-05, "loss": 0.6442, "step": 58000 }, { "epoch": 18.43442362654811, "grad_norm": 23058.107421875, "learning_rate": 2.891473483645602e-05, "loss": 0.6439, "step": 58050 }, { "epoch": 18.45030168307399, "grad_norm": 22859.33203125, "learning_rate": 2.8875039695141316e-05, "loss": 0.645, "step": 58100 }, { "epoch": 18.466179739599873, "grad_norm": 23191.587890625, "learning_rate": 2.8835344553826616e-05, "loss": 0.6449, "step": 58150 }, { "epoch": 18.482057796125755, "grad_norm": 23647.953125, "learning_rate": 2.879564941251191e-05, "loss": 0.6466, "step": 58200 }, { "epoch": 18.497935852651636, "grad_norm": 23992.3828125, "learning_rate": 2.8755954271197207e-05, "loss": 0.6473, "step": 58250 }, { "epoch": 18.513813909177518, "grad_norm": 23166.861328125, "learning_rate": 2.8716259129882507e-05, "loss": 0.6451, "step": 58300 }, { "epoch": 18.5296919657034, "grad_norm": 23637.072265625, "learning_rate": 2.86765639885678e-05, "loss": 0.6481, "step": 58350 }, { "epoch": 18.545570022229278, "grad_norm": 23371.7265625, "learning_rate": 2.8636868847253094e-05, "loss": 0.6406, "step": 58400 }, { "epoch": 18.56144807875516, "grad_norm": 23572.712890625, "learning_rate": 2.8597173705938398e-05, "loss": 0.6437, "step": 58450 }, { "epoch": 18.57732613528104, "grad_norm": 23404.9765625, "learning_rate": 2.855747856462369e-05, "loss": 0.6467, "step": 58500 }, { "epoch": 18.593204191806922, "grad_norm": 23365.392578125, "learning_rate": 2.8517783423308985e-05, "loss": 0.6464, "step": 58550 }, { "epoch": 18.609082248332804, "grad_norm": 22837.333984375, "learning_rate": 2.8478088281994285e-05, "loss": 0.6476, "step": 58600 }, { "epoch": 18.624960304858686, "grad_norm": 23265.06640625, "learning_rate": 2.8438393140679582e-05, "loss": 0.6445, "step": 58650 }, { "epoch": 18.640838361384567, "grad_norm": 23657.7265625, "learning_rate": 2.8398697999364882e-05, "loss": 0.6487, "step": 58700 }, { "epoch": 18.65671641791045, "grad_norm": 23775.322265625, "learning_rate": 2.8359002858050176e-05, "loss": 0.6461, "step": 58750 }, { "epoch": 18.67259447443633, "grad_norm": 23216.8359375, "learning_rate": 2.8319307716735472e-05, "loss": 0.6445, "step": 58800 }, { "epoch": 18.68847253096221, "grad_norm": 23683.677734375, "learning_rate": 2.8279612575420773e-05, "loss": 0.6464, "step": 58850 }, { "epoch": 18.70435058748809, "grad_norm": 24212.357421875, "learning_rate": 2.8239917434106066e-05, "loss": 0.6458, "step": 58900 }, { "epoch": 18.72022864401397, "grad_norm": 23845.314453125, "learning_rate": 2.8200222292791363e-05, "loss": 0.6434, "step": 58950 }, { "epoch": 18.736106700539853, "grad_norm": 23429.07421875, "learning_rate": 2.8160527151476663e-05, "loss": 0.6454, "step": 59000 }, { "epoch": 18.751984757065735, "grad_norm": 23693.30078125, "learning_rate": 2.8120832010161957e-05, "loss": 0.6449, "step": 59050 }, { "epoch": 18.767862813591616, "grad_norm": 24399.693359375, "learning_rate": 2.808113686884725e-05, "loss": 0.6442, "step": 59100 }, { "epoch": 18.783740870117498, "grad_norm": 23742.171875, "learning_rate": 2.8041441727532554e-05, "loss": 0.645, "step": 59150 }, { "epoch": 18.79961892664338, "grad_norm": 23861.95703125, "learning_rate": 2.8001746586217848e-05, "loss": 0.6476, "step": 59200 }, { "epoch": 18.81549698316926, "grad_norm": 23694.40625, "learning_rate": 2.7962051444903148e-05, "loss": 0.6453, "step": 59250 }, { "epoch": 18.831375039695143, "grad_norm": 22883.59375, "learning_rate": 2.792235630358844e-05, "loss": 0.6441, "step": 59300 }, { "epoch": 18.847253096221024, "grad_norm": 24172.53125, "learning_rate": 2.788266116227374e-05, "loss": 0.643, "step": 59350 }, { "epoch": 18.863131152746902, "grad_norm": 23304.0859375, "learning_rate": 2.784296602095904e-05, "loss": 0.6468, "step": 59400 }, { "epoch": 18.879009209272784, "grad_norm": 23574.00390625, "learning_rate": 2.7803270879644332e-05, "loss": 0.6473, "step": 59450 }, { "epoch": 18.894887265798666, "grad_norm": 23528.5859375, "learning_rate": 2.776357573832963e-05, "loss": 0.6455, "step": 59500 }, { "epoch": 18.910765322324547, "grad_norm": 23860.66015625, "learning_rate": 2.772388059701493e-05, "loss": 0.647, "step": 59550 }, { "epoch": 18.92664337885043, "grad_norm": 23734.267578125, "learning_rate": 2.7684185455700223e-05, "loss": 0.6422, "step": 59600 }, { "epoch": 18.94252143537631, "grad_norm": 23822.1875, "learning_rate": 2.764449031438552e-05, "loss": 0.643, "step": 59650 }, { "epoch": 18.958399491902192, "grad_norm": 22949.701171875, "learning_rate": 2.760479517307082e-05, "loss": 0.6441, "step": 59700 }, { "epoch": 18.974277548428073, "grad_norm": 23238.943359375, "learning_rate": 2.7565100031756114e-05, "loss": 0.6449, "step": 59750 }, { "epoch": 18.990155604953955, "grad_norm": 23398.4140625, "learning_rate": 2.7525404890441407e-05, "loss": 0.6429, "step": 59800 }, { "epoch": 19.006033661479837, "grad_norm": 23431.22265625, "learning_rate": 2.7485709749126707e-05, "loss": 0.6427, "step": 59850 }, { "epoch": 19.021911718005715, "grad_norm": 23309.41796875, "learning_rate": 2.7446014607812004e-05, "loss": 0.6407, "step": 59900 }, { "epoch": 19.037789774531596, "grad_norm": 23369.134765625, "learning_rate": 2.7406319466497305e-05, "loss": 0.6337, "step": 59950 }, { "epoch": 19.053667831057478, "grad_norm": 23830.84375, "learning_rate": 2.7366624325182598e-05, "loss": 0.6405, "step": 60000 }, { "epoch": 19.06954588758336, "grad_norm": 23755.240234375, "learning_rate": 2.7326929183867895e-05, "loss": 0.6404, "step": 60050 }, { "epoch": 19.08542394410924, "grad_norm": 23902.658203125, "learning_rate": 2.7287234042553195e-05, "loss": 0.639, "step": 60100 }, { "epoch": 19.101302000635123, "grad_norm": 23243.98828125, "learning_rate": 2.724753890123849e-05, "loss": 0.6352, "step": 60150 }, { "epoch": 19.117180057161004, "grad_norm": 23601.951171875, "learning_rate": 2.7207843759923786e-05, "loss": 0.6374, "step": 60200 }, { "epoch": 19.133058113686886, "grad_norm": 23469.58203125, "learning_rate": 2.7168148618609086e-05, "loss": 0.6391, "step": 60250 }, { "epoch": 19.148936170212767, "grad_norm": 24173.52734375, "learning_rate": 2.712845347729438e-05, "loss": 0.6387, "step": 60300 }, { "epoch": 19.164814226738645, "grad_norm": 23875.79296875, "learning_rate": 2.7088758335979677e-05, "loss": 0.6399, "step": 60350 }, { "epoch": 19.180692283264527, "grad_norm": 23351.52734375, "learning_rate": 2.7049063194664977e-05, "loss": 0.6418, "step": 60400 }, { "epoch": 19.19657033979041, "grad_norm": 24030.84765625, "learning_rate": 2.700936805335027e-05, "loss": 0.6417, "step": 60450 }, { "epoch": 19.21244839631629, "grad_norm": 23502.681640625, "learning_rate": 2.696967291203557e-05, "loss": 0.6419, "step": 60500 }, { "epoch": 19.228326452842172, "grad_norm": 23972.9609375, "learning_rate": 2.6929977770720864e-05, "loss": 0.6385, "step": 60550 }, { "epoch": 19.244204509368053, "grad_norm": 23450.828125, "learning_rate": 2.689028262940616e-05, "loss": 0.6383, "step": 60600 }, { "epoch": 19.260082565893935, "grad_norm": 23348.533203125, "learning_rate": 2.685058748809146e-05, "loss": 0.6438, "step": 60650 }, { "epoch": 19.275960622419817, "grad_norm": 23530.572265625, "learning_rate": 2.6810892346776755e-05, "loss": 0.642, "step": 60700 }, { "epoch": 19.291838678945698, "grad_norm": 23539.4375, "learning_rate": 2.6771197205462052e-05, "loss": 0.6403, "step": 60750 }, { "epoch": 19.30771673547158, "grad_norm": 23497.1015625, "learning_rate": 2.6731502064147352e-05, "loss": 0.6384, "step": 60800 }, { "epoch": 19.323594791997458, "grad_norm": 23695.337890625, "learning_rate": 2.6691806922832646e-05, "loss": 0.6428, "step": 60850 }, { "epoch": 19.33947284852334, "grad_norm": 23628.791015625, "learning_rate": 2.6652111781517942e-05, "loss": 0.6371, "step": 60900 }, { "epoch": 19.35535090504922, "grad_norm": 23643.2421875, "learning_rate": 2.6612416640203243e-05, "loss": 0.643, "step": 60950 }, { "epoch": 19.371228961575103, "grad_norm": 23056.48828125, "learning_rate": 2.6572721498888536e-05, "loss": 0.6387, "step": 61000 }, { "epoch": 19.387107018100984, "grad_norm": 23687.0859375, "learning_rate": 2.6533026357573837e-05, "loss": 0.64, "step": 61050 }, { "epoch": 19.402985074626866, "grad_norm": 23476.853515625, "learning_rate": 2.6493331216259133e-05, "loss": 0.6406, "step": 61100 }, { "epoch": 19.418863131152747, "grad_norm": 24086.623046875, "learning_rate": 2.6453636074944427e-05, "loss": 0.6428, "step": 61150 }, { "epoch": 19.43474118767863, "grad_norm": 23353.08984375, "learning_rate": 2.6413940933629727e-05, "loss": 0.6435, "step": 61200 }, { "epoch": 19.45061924420451, "grad_norm": 23448.341796875, "learning_rate": 2.637424579231502e-05, "loss": 0.6423, "step": 61250 }, { "epoch": 19.466497300730392, "grad_norm": 23803.828125, "learning_rate": 2.6334550651000318e-05, "loss": 0.6419, "step": 61300 }, { "epoch": 19.48237535725627, "grad_norm": 24309.66796875, "learning_rate": 2.6294855509685618e-05, "loss": 0.6434, "step": 61350 }, { "epoch": 19.49825341378215, "grad_norm": 23195.96875, "learning_rate": 2.625516036837091e-05, "loss": 0.6378, "step": 61400 }, { "epoch": 19.514131470308033, "grad_norm": 23841.748046875, "learning_rate": 2.621546522705621e-05, "loss": 0.6397, "step": 61450 }, { "epoch": 19.530009526833915, "grad_norm": 23373.90625, "learning_rate": 2.617577008574151e-05, "loss": 0.6405, "step": 61500 }, { "epoch": 19.545887583359796, "grad_norm": 23817.623046875, "learning_rate": 2.6136074944426802e-05, "loss": 0.6402, "step": 61550 }, { "epoch": 19.561765639885678, "grad_norm": 23955.10546875, "learning_rate": 2.60963798031121e-05, "loss": 0.6394, "step": 61600 }, { "epoch": 19.57764369641156, "grad_norm": 23063.484375, "learning_rate": 2.60566846617974e-05, "loss": 0.6426, "step": 61650 }, { "epoch": 19.59352175293744, "grad_norm": 23891.98046875, "learning_rate": 2.6016989520482693e-05, "loss": 0.6411, "step": 61700 }, { "epoch": 19.609399809463323, "grad_norm": 23992.01171875, "learning_rate": 2.5977294379167993e-05, "loss": 0.6414, "step": 61750 }, { "epoch": 19.625277865989204, "grad_norm": 23503.697265625, "learning_rate": 2.5937599237853287e-05, "loss": 0.6375, "step": 61800 }, { "epoch": 19.641155922515082, "grad_norm": 24320.890625, "learning_rate": 2.5897904096538584e-05, "loss": 0.6395, "step": 61850 }, { "epoch": 19.657033979040964, "grad_norm": 23124.4375, "learning_rate": 2.5858208955223884e-05, "loss": 0.6425, "step": 61900 }, { "epoch": 19.672912035566846, "grad_norm": 24162.05078125, "learning_rate": 2.5818513813909177e-05, "loss": 0.6399, "step": 61950 }, { "epoch": 19.688790092092727, "grad_norm": 23305.732421875, "learning_rate": 2.5778818672594474e-05, "loss": 0.6375, "step": 62000 }, { "epoch": 19.70466814861861, "grad_norm": 23611.771484375, "learning_rate": 2.5739123531279775e-05, "loss": 0.6402, "step": 62050 }, { "epoch": 19.72054620514449, "grad_norm": 23453.798828125, "learning_rate": 2.5699428389965068e-05, "loss": 0.6365, "step": 62100 }, { "epoch": 19.736424261670372, "grad_norm": 23632.29296875, "learning_rate": 2.5659733248650365e-05, "loss": 0.6387, "step": 62150 }, { "epoch": 19.752302318196254, "grad_norm": 23645.380859375, "learning_rate": 2.5620038107335665e-05, "loss": 0.642, "step": 62200 }, { "epoch": 19.768180374722135, "grad_norm": 23833.201171875, "learning_rate": 2.558034296602096e-05, "loss": 0.6401, "step": 62250 }, { "epoch": 19.784058431248017, "grad_norm": 23208.59765625, "learning_rate": 2.554064782470626e-05, "loss": 0.6398, "step": 62300 }, { "epoch": 19.799936487773895, "grad_norm": 23972.072265625, "learning_rate": 2.5500952683391556e-05, "loss": 0.6371, "step": 62350 }, { "epoch": 19.815814544299776, "grad_norm": 23224.896484375, "learning_rate": 2.546125754207685e-05, "loss": 0.6391, "step": 62400 }, { "epoch": 19.831692600825658, "grad_norm": 24514.93359375, "learning_rate": 2.542156240076215e-05, "loss": 0.6403, "step": 62450 }, { "epoch": 19.84757065735154, "grad_norm": 23860.24609375, "learning_rate": 2.5381867259447443e-05, "loss": 0.6409, "step": 62500 }, { "epoch": 19.86344871387742, "grad_norm": 23272.37890625, "learning_rate": 2.534217211813274e-05, "loss": 0.6391, "step": 62550 }, { "epoch": 19.879326770403303, "grad_norm": 23465.796875, "learning_rate": 2.530247697681804e-05, "loss": 0.6403, "step": 62600 }, { "epoch": 19.895204826929184, "grad_norm": 23239.646484375, "learning_rate": 2.5262781835503334e-05, "loss": 0.641, "step": 62650 }, { "epoch": 19.911082883455066, "grad_norm": 23455.052734375, "learning_rate": 2.522308669418863e-05, "loss": 0.641, "step": 62700 }, { "epoch": 19.926960939980948, "grad_norm": 23780.283203125, "learning_rate": 2.518339155287393e-05, "loss": 0.6403, "step": 62750 }, { "epoch": 19.94283899650683, "grad_norm": 24348.8515625, "learning_rate": 2.5143696411559225e-05, "loss": 0.638, "step": 62800 }, { "epoch": 19.958717053032707, "grad_norm": 23591.212890625, "learning_rate": 2.5104001270244525e-05, "loss": 0.6367, "step": 62850 }, { "epoch": 19.97459510955859, "grad_norm": 23728.669921875, "learning_rate": 2.5064306128929822e-05, "loss": 0.6397, "step": 62900 }, { "epoch": 19.99047316608447, "grad_norm": 23320.609375, "learning_rate": 2.5024610987615116e-05, "loss": 0.6411, "step": 62950 }, { "epoch": 20.0, "step": 62980, "total_flos": 2.10597197119488e+18, "train_loss": 0.33622724912249125, "train_runtime": 72693.7372, "train_samples_per_second": 55.437, "train_steps_per_second": 0.866 } ], "logging_steps": 50, "max_steps": 62980, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.10597197119488e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }