diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8865 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.0, + "eval_steps": 500, + "global_step": 62980, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.015878056525881232, + "grad_norm": 69456.203125, + "learning_rate": 4.992219752302319e-05, + "loss": 3.3489, + "step": 50 + }, + { + "epoch": 0.031756113051762465, + "grad_norm": 28315.216796875, + "learning_rate": 4.984280724039378e-05, + "loss": 2.1825, + "step": 100 + }, + { + "epoch": 0.047634169577643694, + "grad_norm": 22939.28515625, + "learning_rate": 4.976341695776437e-05, + "loss": 1.8984, + "step": 150 + }, + { + "epoch": 0.06351222610352493, + "grad_norm": 23446.751953125, + "learning_rate": 4.968402667513497e-05, + "loss": 1.8356, + "step": 200 + }, + { + "epoch": 0.07939028262940616, + "grad_norm": 15620.2763671875, + "learning_rate": 4.960463639250556e-05, + "loss": 1.8063, + "step": 250 + }, + { + "epoch": 0.09526833915528739, + "grad_norm": 17849.13671875, + "learning_rate": 4.9525246109876156e-05, + "loss": 1.7904, + "step": 300 + }, + { + "epoch": 0.11114639568116863, + "grad_norm": 16811.81640625, + "learning_rate": 4.944585582724674e-05, + "loss": 1.7763, + "step": 350 + }, + { + "epoch": 0.12702445220704986, + "grad_norm": 15963.05859375, + "learning_rate": 4.936646554461734e-05, + "loss": 1.7694, + "step": 400 + }, + { + "epoch": 0.1429025087329311, + "grad_norm": 13125.708984375, + "learning_rate": 4.928707526198794e-05, + "loss": 1.7613, + "step": 450 + }, + { + "epoch": 0.15878056525881232, + "grad_norm": 13808.8876953125, + "learning_rate": 4.920768497935853e-05, + "loss": 1.7565, + "step": 500 + }, + { + "epoch": 0.17465862178469355, + "grad_norm": 15005.7353515625, + "learning_rate": 4.912829469672912e-05, + "loss": 1.7512, + "step": 550 + }, + { + "epoch": 0.19053667831057478, + "grad_norm": 15644.3916015625, + "learning_rate": 4.904890441409972e-05, + "loss": 1.7457, + "step": 600 + }, + { + "epoch": 0.206414734836456, + "grad_norm": 13973.876953125, + "learning_rate": 4.896951413147031e-05, + "loss": 1.7395, + "step": 650 + }, + { + "epoch": 0.22229279136233726, + "grad_norm": 16523.75390625, + "learning_rate": 4.88901238488409e-05, + "loss": 1.7285, + "step": 700 + }, + { + "epoch": 0.2381708478882185, + "grad_norm": 25229.689453125, + "learning_rate": 4.88107335662115e-05, + "loss": 1.7046, + "step": 750 + }, + { + "epoch": 0.2540489044140997, + "grad_norm": 48292.5078125, + "learning_rate": 4.8731343283582094e-05, + "loss": 1.664, + "step": 800 + }, + { + "epoch": 0.2699269609399809, + "grad_norm": 38503.4296875, + "learning_rate": 4.865195300095269e-05, + "loss": 1.6121, + "step": 850 + }, + { + "epoch": 0.2858050174658622, + "grad_norm": 48101.1484375, + "learning_rate": 4.8572562718323275e-05, + "loss": 1.5622, + "step": 900 + }, + { + "epoch": 0.30168307399174343, + "grad_norm": 63038.48046875, + "learning_rate": 4.8493172435693875e-05, + "loss": 1.5095, + "step": 950 + }, + { + "epoch": 0.31756113051762463, + "grad_norm": 80867.75, + "learning_rate": 4.841378215306447e-05, + "loss": 1.4587, + "step": 1000 + }, + { + "epoch": 0.3334391870435059, + "grad_norm": 42233.703125, + "learning_rate": 4.8334391870435056e-05, + "loss": 1.4149, + "step": 1050 + }, + { + "epoch": 0.3493172435693871, + "grad_norm": 53614.875, + "learning_rate": 4.825500158780566e-05, + "loss": 1.375, + "step": 1100 + }, + { + "epoch": 0.36519530009526835, + "grad_norm": 43879.546875, + "learning_rate": 4.817561130517625e-05, + "loss": 1.3372, + "step": 1150 + }, + { + "epoch": 0.38107335662114955, + "grad_norm": 43690.40625, + "learning_rate": 4.8096221022546844e-05, + "loss": 1.3026, + "step": 1200 + }, + { + "epoch": 0.3969514131470308, + "grad_norm": 46655.68359375, + "learning_rate": 4.801683073991743e-05, + "loss": 1.2798, + "step": 1250 + }, + { + "epoch": 0.412829469672912, + "grad_norm": 39245.97265625, + "learning_rate": 4.793744045728803e-05, + "loss": 1.2521, + "step": 1300 + }, + { + "epoch": 0.42870752619879327, + "grad_norm": 52106.55859375, + "learning_rate": 4.7858050174658626e-05, + "loss": 1.2299, + "step": 1350 + }, + { + "epoch": 0.4445855827246745, + "grad_norm": 41391.16796875, + "learning_rate": 4.777865989202922e-05, + "loss": 1.2141, + "step": 1400 + }, + { + "epoch": 0.4604636392505557, + "grad_norm": 41273.65625, + "learning_rate": 4.769926960939981e-05, + "loss": 1.1954, + "step": 1450 + }, + { + "epoch": 0.476341695776437, + "grad_norm": 48099.3203125, + "learning_rate": 4.761987932677041e-05, + "loss": 1.1785, + "step": 1500 + }, + { + "epoch": 0.4922197523023182, + "grad_norm": 41167.43359375, + "learning_rate": 4.7540489044141e-05, + "loss": 1.1642, + "step": 1550 + }, + { + "epoch": 0.5080978088281994, + "grad_norm": 40066.16796875, + "learning_rate": 4.746109876151159e-05, + "loss": 1.15, + "step": 1600 + }, + { + "epoch": 0.5239758653540807, + "grad_norm": 57272.24609375, + "learning_rate": 4.738170847888219e-05, + "loss": 1.1426, + "step": 1650 + }, + { + "epoch": 0.5398539218799618, + "grad_norm": 38700.19921875, + "learning_rate": 4.730231819625278e-05, + "loss": 1.1286, + "step": 1700 + }, + { + "epoch": 0.5557319784058431, + "grad_norm": 39080.23828125, + "learning_rate": 4.7222927913623376e-05, + "loss": 1.116, + "step": 1750 + }, + { + "epoch": 0.5716100349317244, + "grad_norm": 40904.78515625, + "learning_rate": 4.714353763099397e-05, + "loss": 1.1037, + "step": 1800 + }, + { + "epoch": 0.5874880914576056, + "grad_norm": 46008.6328125, + "learning_rate": 4.7064147348364564e-05, + "loss": 1.1006, + "step": 1850 + }, + { + "epoch": 0.6033661479834869, + "grad_norm": 51325.14453125, + "learning_rate": 4.698475706573516e-05, + "loss": 1.0899, + "step": 1900 + }, + { + "epoch": 0.619244204509368, + "grad_norm": 38173.07421875, + "learning_rate": 4.6905366783105745e-05, + "loss": 1.0767, + "step": 1950 + }, + { + "epoch": 0.6351222610352493, + "grad_norm": 36432.33203125, + "learning_rate": 4.6825976500476345e-05, + "loss": 1.0737, + "step": 2000 + }, + { + "epoch": 0.6510003175611305, + "grad_norm": 38897.46484375, + "learning_rate": 4.674658621784694e-05, + "loss": 1.0631, + "step": 2050 + }, + { + "epoch": 0.6668783740870118, + "grad_norm": 36565.90234375, + "learning_rate": 4.666719593521753e-05, + "loss": 1.0611, + "step": 2100 + }, + { + "epoch": 0.6827564306128929, + "grad_norm": 38040.09765625, + "learning_rate": 4.658780565258813e-05, + "loss": 1.049, + "step": 2150 + }, + { + "epoch": 0.6986344871387742, + "grad_norm": 40119.8984375, + "learning_rate": 4.650841536995872e-05, + "loss": 1.0401, + "step": 2200 + }, + { + "epoch": 0.7145125436646554, + "grad_norm": 38403.37890625, + "learning_rate": 4.6429025087329314e-05, + "loss": 1.0373, + "step": 2250 + }, + { + "epoch": 0.7303906001905367, + "grad_norm": 35558.62109375, + "learning_rate": 4.634963480469991e-05, + "loss": 1.0327, + "step": 2300 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 41901.265625, + "learning_rate": 4.62702445220705e-05, + "loss": 1.0258, + "step": 2350 + }, + { + "epoch": 0.7621467132422991, + "grad_norm": 35863.7109375, + "learning_rate": 4.6190854239441096e-05, + "loss": 1.0181, + "step": 2400 + }, + { + "epoch": 0.7780247697681804, + "grad_norm": 34477.07421875, + "learning_rate": 4.611146395681169e-05, + "loss": 1.0156, + "step": 2450 + }, + { + "epoch": 0.7939028262940616, + "grad_norm": 36488.30078125, + "learning_rate": 4.603207367418228e-05, + "loss": 1.0109, + "step": 2500 + }, + { + "epoch": 0.8097808828199429, + "grad_norm": 37530.74609375, + "learning_rate": 4.595268339155288e-05, + "loss": 1.0063, + "step": 2550 + }, + { + "epoch": 0.825658939345824, + "grad_norm": 38817.25390625, + "learning_rate": 4.587329310892347e-05, + "loss": 1.0007, + "step": 2600 + }, + { + "epoch": 0.8415369958717053, + "grad_norm": 36987.03515625, + "learning_rate": 4.5793902826294065e-05, + "loss": 0.9925, + "step": 2650 + }, + { + "epoch": 0.8574150523975865, + "grad_norm": 36874.86328125, + "learning_rate": 4.571451254366466e-05, + "loss": 0.9931, + "step": 2700 + }, + { + "epoch": 0.8732931089234678, + "grad_norm": 38325.61328125, + "learning_rate": 4.563512226103525e-05, + "loss": 0.984, + "step": 2750 + }, + { + "epoch": 0.889171165449349, + "grad_norm": 35897.0859375, + "learning_rate": 4.5555731978405846e-05, + "loss": 0.982, + "step": 2800 + }, + { + "epoch": 0.9050492219752302, + "grad_norm": 38165.0, + "learning_rate": 4.547634169577643e-05, + "loss": 0.9786, + "step": 2850 + }, + { + "epoch": 0.9209272785011114, + "grad_norm": 36260.8671875, + "learning_rate": 4.5396951413147034e-05, + "loss": 0.9736, + "step": 2900 + }, + { + "epoch": 0.9368053350269927, + "grad_norm": 35681.60546875, + "learning_rate": 4.531756113051763e-05, + "loss": 0.9657, + "step": 2950 + }, + { + "epoch": 0.952683391552874, + "grad_norm": 33949.65234375, + "learning_rate": 4.523817084788822e-05, + "loss": 0.9691, + "step": 3000 + }, + { + "epoch": 0.9685614480787551, + "grad_norm": 34695.6796875, + "learning_rate": 4.5158780565258815e-05, + "loss": 0.9621, + "step": 3050 + }, + { + "epoch": 0.9844395046046364, + "grad_norm": 42460.8359375, + "learning_rate": 4.507939028262941e-05, + "loss": 0.9627, + "step": 3100 + }, + { + "epoch": 1.0003175611305177, + "grad_norm": 37083.6796875, + "learning_rate": 4.5e-05, + "loss": 0.9538, + "step": 3150 + }, + { + "epoch": 1.0161956176563989, + "grad_norm": 35333.8359375, + "learning_rate": 4.49206097173706e-05, + "loss": 0.951, + "step": 3200 + }, + { + "epoch": 1.03207367418228, + "grad_norm": 34177.54296875, + "learning_rate": 4.484121943474119e-05, + "loss": 0.9496, + "step": 3250 + }, + { + "epoch": 1.0479517307081614, + "grad_norm": 33729.51953125, + "learning_rate": 4.4761829152111784e-05, + "loss": 0.9431, + "step": 3300 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 35274.5625, + "learning_rate": 4.468243886948238e-05, + "loss": 0.9405, + "step": 3350 + }, + { + "epoch": 1.0797078437599237, + "grad_norm": 35014.6796875, + "learning_rate": 4.460304858685297e-05, + "loss": 0.9404, + "step": 3400 + }, + { + "epoch": 1.095585900285805, + "grad_norm": 34381.96484375, + "learning_rate": 4.4523658304223566e-05, + "loss": 0.9367, + "step": 3450 + }, + { + "epoch": 1.1114639568116862, + "grad_norm": 36695.0625, + "learning_rate": 4.444426802159416e-05, + "loss": 0.9309, + "step": 3500 + }, + { + "epoch": 1.1273420133375676, + "grad_norm": 37935.7734375, + "learning_rate": 4.436487773896475e-05, + "loss": 0.9331, + "step": 3550 + }, + { + "epoch": 1.1432200698634487, + "grad_norm": 34156.03125, + "learning_rate": 4.428548745633535e-05, + "loss": 0.9251, + "step": 3600 + }, + { + "epoch": 1.1590981263893299, + "grad_norm": 34130.3046875, + "learning_rate": 4.420609717370594e-05, + "loss": 0.9263, + "step": 3650 + }, + { + "epoch": 1.1749761829152112, + "grad_norm": 35052.70703125, + "learning_rate": 4.4126706891076535e-05, + "loss": 0.9217, + "step": 3700 + }, + { + "epoch": 1.1908542394410924, + "grad_norm": 36061.64453125, + "learning_rate": 4.404731660844713e-05, + "loss": 0.9197, + "step": 3750 + }, + { + "epoch": 1.2067322959669737, + "grad_norm": 32517.8984375, + "learning_rate": 4.396792632581772e-05, + "loss": 0.9208, + "step": 3800 + }, + { + "epoch": 1.2226103524928549, + "grad_norm": 35002.6328125, + "learning_rate": 4.3888536043188316e-05, + "loss": 0.9197, + "step": 3850 + }, + { + "epoch": 1.238488409018736, + "grad_norm": 32863.40234375, + "learning_rate": 4.380914576055891e-05, + "loss": 0.9089, + "step": 3900 + }, + { + "epoch": 1.2543664655446174, + "grad_norm": 34469.52734375, + "learning_rate": 4.3729755477929504e-05, + "loss": 0.9127, + "step": 3950 + }, + { + "epoch": 1.2702445220704985, + "grad_norm": 32288.01171875, + "learning_rate": 4.36503651953001e-05, + "loss": 0.9062, + "step": 4000 + }, + { + "epoch": 1.28612257859638, + "grad_norm": 34935.56640625, + "learning_rate": 4.357097491267069e-05, + "loss": 0.9076, + "step": 4050 + }, + { + "epoch": 1.302000635122261, + "grad_norm": 32266.908203125, + "learning_rate": 4.3491584630041285e-05, + "loss": 0.9079, + "step": 4100 + }, + { + "epoch": 1.3178786916481422, + "grad_norm": 33226.34375, + "learning_rate": 4.341219434741188e-05, + "loss": 0.9044, + "step": 4150 + }, + { + "epoch": 1.3337567481740236, + "grad_norm": 34449.453125, + "learning_rate": 4.333280406478247e-05, + "loss": 0.9038, + "step": 4200 + }, + { + "epoch": 1.3496348046999047, + "grad_norm": 32593.271484375, + "learning_rate": 4.325341378215307e-05, + "loss": 0.8969, + "step": 4250 + }, + { + "epoch": 1.365512861225786, + "grad_norm": 33997.08984375, + "learning_rate": 4.317402349952366e-05, + "loss": 0.8955, + "step": 4300 + }, + { + "epoch": 1.3813909177516672, + "grad_norm": 34988.44921875, + "learning_rate": 4.3094633216894254e-05, + "loss": 0.8925, + "step": 4350 + }, + { + "epoch": 1.3972689742775484, + "grad_norm": 31535.099609375, + "learning_rate": 4.301524293426485e-05, + "loss": 0.8931, + "step": 4400 + }, + { + "epoch": 1.4131470308034297, + "grad_norm": 32298.515625, + "learning_rate": 4.293585265163544e-05, + "loss": 0.8914, + "step": 4450 + }, + { + "epoch": 1.4290250873293109, + "grad_norm": 33798.9140625, + "learning_rate": 4.2856462369006036e-05, + "loss": 0.889, + "step": 4500 + }, + { + "epoch": 1.4449031438551923, + "grad_norm": 33727.1484375, + "learning_rate": 4.277707208637663e-05, + "loss": 0.8891, + "step": 4550 + }, + { + "epoch": 1.4607812003810734, + "grad_norm": 39095.109375, + "learning_rate": 4.269768180374722e-05, + "loss": 0.8932, + "step": 4600 + }, + { + "epoch": 1.4766592569069545, + "grad_norm": 33266.66796875, + "learning_rate": 4.261829152111782e-05, + "loss": 0.8843, + "step": 4650 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 33800.22265625, + "learning_rate": 4.253890123848841e-05, + "loss": 0.882, + "step": 4700 + }, + { + "epoch": 1.508415369958717, + "grad_norm": 31127.240234375, + "learning_rate": 4.2459510955859005e-05, + "loss": 0.8828, + "step": 4750 + }, + { + "epoch": 1.5242934264845984, + "grad_norm": 33765.43359375, + "learning_rate": 4.23801206732296e-05, + "loss": 0.8802, + "step": 4800 + }, + { + "epoch": 1.5401714830104796, + "grad_norm": 32892.484375, + "learning_rate": 4.230073039060019e-05, + "loss": 0.882, + "step": 4850 + }, + { + "epoch": 1.5560495395363607, + "grad_norm": 32505.76953125, + "learning_rate": 4.2221340107970786e-05, + "loss": 0.8772, + "step": 4900 + }, + { + "epoch": 1.5719275960622419, + "grad_norm": 32439.31640625, + "learning_rate": 4.214194982534138e-05, + "loss": 0.8742, + "step": 4950 + }, + { + "epoch": 1.5878056525881232, + "grad_norm": 32518.017578125, + "learning_rate": 4.2062559542711974e-05, + "loss": 0.873, + "step": 5000 + }, + { + "epoch": 1.6036837091140046, + "grad_norm": 34613.1640625, + "learning_rate": 4.198316926008257e-05, + "loss": 0.873, + "step": 5050 + }, + { + "epoch": 1.6195617656398857, + "grad_norm": 32058.451171875, + "learning_rate": 4.190377897745316e-05, + "loss": 0.873, + "step": 5100 + }, + { + "epoch": 1.6354398221657669, + "grad_norm": 31566.736328125, + "learning_rate": 4.1824388694823755e-05, + "loss": 0.8675, + "step": 5150 + }, + { + "epoch": 1.651317878691648, + "grad_norm": 33988.2734375, + "learning_rate": 4.174499841219435e-05, + "loss": 0.8667, + "step": 5200 + }, + { + "epoch": 1.6671959352175294, + "grad_norm": 33580.97265625, + "learning_rate": 4.166560812956494e-05, + "loss": 0.8638, + "step": 5250 + }, + { + "epoch": 1.6830739917434108, + "grad_norm": 33622.66015625, + "learning_rate": 4.158621784693554e-05, + "loss": 0.8668, + "step": 5300 + }, + { + "epoch": 1.698952048269292, + "grad_norm": 32790.08203125, + "learning_rate": 4.150682756430613e-05, + "loss": 0.8616, + "step": 5350 + }, + { + "epoch": 1.714830104795173, + "grad_norm": 31265.09765625, + "learning_rate": 4.1427437281676724e-05, + "loss": 0.8578, + "step": 5400 + }, + { + "epoch": 1.7307081613210542, + "grad_norm": 31559.154296875, + "learning_rate": 4.134804699904732e-05, + "loss": 0.86, + "step": 5450 + }, + { + "epoch": 1.7465862178469356, + "grad_norm": 31868.90234375, + "learning_rate": 4.126865671641791e-05, + "loss": 0.8612, + "step": 5500 + }, + { + "epoch": 1.7624642743728167, + "grad_norm": 32017.955078125, + "learning_rate": 4.1189266433788506e-05, + "loss": 0.8599, + "step": 5550 + }, + { + "epoch": 1.778342330898698, + "grad_norm": 32002.77734375, + "learning_rate": 4.11098761511591e-05, + "loss": 0.8592, + "step": 5600 + }, + { + "epoch": 1.7942203874245792, + "grad_norm": 30244.373046875, + "learning_rate": 4.103048586852969e-05, + "loss": 0.855, + "step": 5650 + }, + { + "epoch": 1.8100984439504604, + "grad_norm": 32472.248046875, + "learning_rate": 4.095109558590029e-05, + "loss": 0.857, + "step": 5700 + }, + { + "epoch": 1.8259765004763417, + "grad_norm": 32474.484375, + "learning_rate": 4.087170530327088e-05, + "loss": 0.8526, + "step": 5750 + }, + { + "epoch": 1.841854557002223, + "grad_norm": 31490.275390625, + "learning_rate": 4.0792315020641475e-05, + "loss": 0.8559, + "step": 5800 + }, + { + "epoch": 1.8577326135281043, + "grad_norm": 31051.58203125, + "learning_rate": 4.071292473801207e-05, + "loss": 0.8522, + "step": 5850 + }, + { + "epoch": 1.8736106700539854, + "grad_norm": 31409.7578125, + "learning_rate": 4.063353445538267e-05, + "loss": 0.8486, + "step": 5900 + }, + { + "epoch": 1.8894887265798666, + "grad_norm": 32382.998046875, + "learning_rate": 4.0554144172753256e-05, + "loss": 0.8515, + "step": 5950 + }, + { + "epoch": 1.9053667831057477, + "grad_norm": 31483.380859375, + "learning_rate": 4.047475389012385e-05, + "loss": 0.8477, + "step": 6000 + }, + { + "epoch": 1.921244839631629, + "grad_norm": 32504.404296875, + "learning_rate": 4.0395363607494444e-05, + "loss": 0.8506, + "step": 6050 + }, + { + "epoch": 1.9371228961575104, + "grad_norm": 31748.40234375, + "learning_rate": 4.031597332486504e-05, + "loss": 0.8461, + "step": 6100 + }, + { + "epoch": 1.9530009526833916, + "grad_norm": 31397.05078125, + "learning_rate": 4.023658304223563e-05, + "loss": 0.8459, + "step": 6150 + }, + { + "epoch": 1.9688790092092727, + "grad_norm": 32932.44140625, + "learning_rate": 4.0157192759606225e-05, + "loss": 0.8431, + "step": 6200 + }, + { + "epoch": 1.9847570657351539, + "grad_norm": 32290.40234375, + "learning_rate": 4.007780247697682e-05, + "loss": 0.8435, + "step": 6250 + }, + { + "epoch": 2.0006351222610355, + "grad_norm": 31642.935546875, + "learning_rate": 3.999841219434741e-05, + "loss": 0.8403, + "step": 6300 + }, + { + "epoch": 2.0165131787869166, + "grad_norm": 31722.009765625, + "learning_rate": 3.991902191171801e-05, + "loss": 0.8393, + "step": 6350 + }, + { + "epoch": 2.0323912353127978, + "grad_norm": 30855.62890625, + "learning_rate": 3.98396316290886e-05, + "loss": 0.8356, + "step": 6400 + }, + { + "epoch": 2.048269291838679, + "grad_norm": 31825.283203125, + "learning_rate": 3.9760241346459194e-05, + "loss": 0.8342, + "step": 6450 + }, + { + "epoch": 2.06414734836456, + "grad_norm": 32278.908203125, + "learning_rate": 3.968085106382979e-05, + "loss": 0.8337, + "step": 6500 + }, + { + "epoch": 2.080025404890441, + "grad_norm": 31714.283203125, + "learning_rate": 3.960146078120038e-05, + "loss": 0.8348, + "step": 6550 + }, + { + "epoch": 2.0959034614163228, + "grad_norm": 31567.390625, + "learning_rate": 3.9522070498570976e-05, + "loss": 0.8281, + "step": 6600 + }, + { + "epoch": 2.111781517942204, + "grad_norm": 32440.802734375, + "learning_rate": 3.944268021594157e-05, + "loss": 0.831, + "step": 6650 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 32060.837890625, + "learning_rate": 3.936328993331216e-05, + "loss": 0.831, + "step": 6700 + }, + { + "epoch": 2.143537630993966, + "grad_norm": 31656.82421875, + "learning_rate": 3.928389965068276e-05, + "loss": 0.829, + "step": 6750 + }, + { + "epoch": 2.1594156875198474, + "grad_norm": 31442.107421875, + "learning_rate": 3.920450936805336e-05, + "loss": 0.8266, + "step": 6800 + }, + { + "epoch": 2.175293744045729, + "grad_norm": 31876.103515625, + "learning_rate": 3.9125119085423945e-05, + "loss": 0.8256, + "step": 6850 + }, + { + "epoch": 2.19117180057161, + "grad_norm": 31034.79296875, + "learning_rate": 3.904572880279454e-05, + "loss": 0.8217, + "step": 6900 + }, + { + "epoch": 2.2070498570974912, + "grad_norm": 32682.744140625, + "learning_rate": 3.896633852016513e-05, + "loss": 0.8301, + "step": 6950 + }, + { + "epoch": 2.2229279136233724, + "grad_norm": 32618.359375, + "learning_rate": 3.8886948237535726e-05, + "loss": 0.8211, + "step": 7000 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 30968.298828125, + "learning_rate": 3.880755795490632e-05, + "loss": 0.825, + "step": 7050 + }, + { + "epoch": 2.254684026675135, + "grad_norm": 30861.9921875, + "learning_rate": 3.8728167672276914e-05, + "loss": 0.8201, + "step": 7100 + }, + { + "epoch": 2.2705620832010163, + "grad_norm": 30823.109375, + "learning_rate": 3.8648777389647514e-05, + "loss": 0.8197, + "step": 7150 + }, + { + "epoch": 2.2864401397268974, + "grad_norm": 30141.990234375, + "learning_rate": 3.85693871070181e-05, + "loss": 0.8224, + "step": 7200 + }, + { + "epoch": 2.3023181962527786, + "grad_norm": 34504.34375, + "learning_rate": 3.8489996824388695e-05, + "loss": 0.821, + "step": 7250 + }, + { + "epoch": 2.3181962527786597, + "grad_norm": 30797.966796875, + "learning_rate": 3.841060654175929e-05, + "loss": 0.8203, + "step": 7300 + }, + { + "epoch": 2.3340743093045413, + "grad_norm": 30129.06640625, + "learning_rate": 3.833121625912988e-05, + "loss": 0.8178, + "step": 7350 + }, + { + "epoch": 2.3499523658304224, + "grad_norm": 30087.42578125, + "learning_rate": 3.825182597650048e-05, + "loss": 0.8219, + "step": 7400 + }, + { + "epoch": 2.3658304223563036, + "grad_norm": 30917.18359375, + "learning_rate": 3.817243569387107e-05, + "loss": 0.8163, + "step": 7450 + }, + { + "epoch": 2.3817084788821847, + "grad_norm": 33235.78515625, + "learning_rate": 3.809304541124167e-05, + "loss": 0.8149, + "step": 7500 + }, + { + "epoch": 2.397586535408066, + "grad_norm": 30538.533203125, + "learning_rate": 3.801365512861226e-05, + "loss": 0.818, + "step": 7550 + }, + { + "epoch": 2.4134645919339475, + "grad_norm": 31500.251953125, + "learning_rate": 3.793426484598285e-05, + "loss": 0.8171, + "step": 7600 + }, + { + "epoch": 2.4293426484598286, + "grad_norm": 30612.111328125, + "learning_rate": 3.7854874563353446e-05, + "loss": 0.8132, + "step": 7650 + }, + { + "epoch": 2.4452207049857098, + "grad_norm": 30479.9140625, + "learning_rate": 3.7775484280724046e-05, + "loss": 0.8147, + "step": 7700 + }, + { + "epoch": 2.461098761511591, + "grad_norm": 30102.48046875, + "learning_rate": 3.769609399809463e-05, + "loss": 0.8112, + "step": 7750 + }, + { + "epoch": 2.476976818037472, + "grad_norm": 30969.11328125, + "learning_rate": 3.761670371546523e-05, + "loss": 0.8161, + "step": 7800 + }, + { + "epoch": 2.4928548745633536, + "grad_norm": 30215.08203125, + "learning_rate": 3.753731343283583e-05, + "loss": 0.8118, + "step": 7850 + }, + { + "epoch": 2.508732931089235, + "grad_norm": 30152.841796875, + "learning_rate": 3.7457923150206415e-05, + "loss": 0.813, + "step": 7900 + }, + { + "epoch": 2.524610987615116, + "grad_norm": 30816.564453125, + "learning_rate": 3.737853286757701e-05, + "loss": 0.8124, + "step": 7950 + }, + { + "epoch": 2.540489044140997, + "grad_norm": 31137.095703125, + "learning_rate": 3.72991425849476e-05, + "loss": 0.8105, + "step": 8000 + }, + { + "epoch": 2.5563671006668782, + "grad_norm": 30815.490234375, + "learning_rate": 3.72197523023182e-05, + "loss": 0.8117, + "step": 8050 + }, + { + "epoch": 2.57224515719276, + "grad_norm": 31679.986328125, + "learning_rate": 3.714036201968879e-05, + "loss": 0.808, + "step": 8100 + }, + { + "epoch": 2.588123213718641, + "grad_norm": 30918.755859375, + "learning_rate": 3.7060971737059384e-05, + "loss": 0.807, + "step": 8150 + }, + { + "epoch": 2.604001270244522, + "grad_norm": 30721.857421875, + "learning_rate": 3.6981581454429984e-05, + "loss": 0.8077, + "step": 8200 + }, + { + "epoch": 2.6198793267704032, + "grad_norm": 30369.298828125, + "learning_rate": 3.690219117180057e-05, + "loss": 0.8067, + "step": 8250 + }, + { + "epoch": 2.6357573832962844, + "grad_norm": 30493.7421875, + "learning_rate": 3.6822800889171165e-05, + "loss": 0.8063, + "step": 8300 + }, + { + "epoch": 2.651635439822166, + "grad_norm": 30101.5859375, + "learning_rate": 3.674341060654176e-05, + "loss": 0.8065, + "step": 8350 + }, + { + "epoch": 2.667513496348047, + "grad_norm": 30403.134765625, + "learning_rate": 3.666402032391236e-05, + "loss": 0.8023, + "step": 8400 + }, + { + "epoch": 2.6833915528739283, + "grad_norm": 30230.22265625, + "learning_rate": 3.658463004128295e-05, + "loss": 0.8025, + "step": 8450 + }, + { + "epoch": 2.6992696093998094, + "grad_norm": 30146.58203125, + "learning_rate": 3.650523975865354e-05, + "loss": 0.8035, + "step": 8500 + }, + { + "epoch": 2.7151476659256906, + "grad_norm": 30205.19921875, + "learning_rate": 3.6425849476024134e-05, + "loss": 0.8023, + "step": 8550 + }, + { + "epoch": 2.731025722451572, + "grad_norm": 29393.84375, + "learning_rate": 3.6346459193394735e-05, + "loss": 0.7985, + "step": 8600 + }, + { + "epoch": 2.7469037789774533, + "grad_norm": 30105.896484375, + "learning_rate": 3.626706891076532e-05, + "loss": 0.798, + "step": 8650 + }, + { + "epoch": 2.7627818355033344, + "grad_norm": 31707.62109375, + "learning_rate": 3.6187678628135916e-05, + "loss": 0.8019, + "step": 8700 + }, + { + "epoch": 2.7786598920292156, + "grad_norm": 30057.185546875, + "learning_rate": 3.6108288345506516e-05, + "loss": 0.8019, + "step": 8750 + }, + { + "epoch": 2.7945379485550967, + "grad_norm": 31893.216796875, + "learning_rate": 3.60288980628771e-05, + "loss": 0.8004, + "step": 8800 + }, + { + "epoch": 2.8104160050809783, + "grad_norm": 29928.91015625, + "learning_rate": 3.59495077802477e-05, + "loss": 0.7989, + "step": 8850 + }, + { + "epoch": 2.8262940616068595, + "grad_norm": 29859.1015625, + "learning_rate": 3.587011749761829e-05, + "loss": 0.7987, + "step": 8900 + }, + { + "epoch": 2.8421721181327406, + "grad_norm": 29658.259765625, + "learning_rate": 3.579072721498889e-05, + "loss": 0.8001, + "step": 8950 + }, + { + "epoch": 2.8580501746586218, + "grad_norm": 29646.7421875, + "learning_rate": 3.571133693235948e-05, + "loss": 0.7946, + "step": 9000 + }, + { + "epoch": 2.873928231184503, + "grad_norm": 29755.7890625, + "learning_rate": 3.563194664973007e-05, + "loss": 0.7983, + "step": 9050 + }, + { + "epoch": 2.8898062877103845, + "grad_norm": 29201.306640625, + "learning_rate": 3.555255636710067e-05, + "loss": 0.7966, + "step": 9100 + }, + { + "epoch": 2.905684344236265, + "grad_norm": 30059.353515625, + "learning_rate": 3.547316608447126e-05, + "loss": 0.7968, + "step": 9150 + }, + { + "epoch": 2.921562400762147, + "grad_norm": 29836.736328125, + "learning_rate": 3.5393775801841854e-05, + "loss": 0.7915, + "step": 9200 + }, + { + "epoch": 2.937440457288028, + "grad_norm": 29474.76171875, + "learning_rate": 3.531438551921245e-05, + "loss": 0.7951, + "step": 9250 + }, + { + "epoch": 2.953318513813909, + "grad_norm": 29720.64453125, + "learning_rate": 3.523499523658305e-05, + "loss": 0.7973, + "step": 9300 + }, + { + "epoch": 2.9691965703397907, + "grad_norm": 30905.662109375, + "learning_rate": 3.5155604953953635e-05, + "loss": 0.796, + "step": 9350 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 29326.87109375, + "learning_rate": 3.507621467132423e-05, + "loss": 0.7943, + "step": 9400 + }, + { + "epoch": 3.000952683391553, + "grad_norm": 29983.439453125, + "learning_rate": 3.499682438869483e-05, + "loss": 0.7913, + "step": 9450 + }, + { + "epoch": 3.016830739917434, + "grad_norm": 31416.537109375, + "learning_rate": 3.4917434106065423e-05, + "loss": 0.7875, + "step": 9500 + }, + { + "epoch": 3.0327087964433153, + "grad_norm": 29326.58984375, + "learning_rate": 3.483804382343601e-05, + "loss": 0.788, + "step": 9550 + }, + { + "epoch": 3.0485868529691964, + "grad_norm": 30804.06640625, + "learning_rate": 3.4758653540806604e-05, + "loss": 0.7874, + "step": 9600 + }, + { + "epoch": 3.064464909495078, + "grad_norm": 30785.3125, + "learning_rate": 3.4679263258177205e-05, + "loss": 0.7889, + "step": 9650 + }, + { + "epoch": 3.080342966020959, + "grad_norm": 29545.892578125, + "learning_rate": 3.459987297554779e-05, + "loss": 0.786, + "step": 9700 + }, + { + "epoch": 3.0962210225468403, + "grad_norm": 29333.875, + "learning_rate": 3.4520482692918386e-05, + "loss": 0.7869, + "step": 9750 + }, + { + "epoch": 3.1120990790727214, + "grad_norm": 29140.724609375, + "learning_rate": 3.4441092410288986e-05, + "loss": 0.7848, + "step": 9800 + }, + { + "epoch": 3.1279771355986026, + "grad_norm": 29844.708984375, + "learning_rate": 3.436170212765958e-05, + "loss": 0.7833, + "step": 9850 + }, + { + "epoch": 3.143855192124484, + "grad_norm": 29902.115234375, + "learning_rate": 3.428231184503017e-05, + "loss": 0.7868, + "step": 9900 + }, + { + "epoch": 3.1597332486503653, + "grad_norm": 29875.98828125, + "learning_rate": 3.420292156240076e-05, + "loss": 0.7824, + "step": 9950 + }, + { + "epoch": 3.1756113051762465, + "grad_norm": 29536.19921875, + "learning_rate": 3.412353127977136e-05, + "loss": 0.7819, + "step": 10000 + }, + { + "epoch": 3.1914893617021276, + "grad_norm": 29816.875, + "learning_rate": 3.404414099714195e-05, + "loss": 0.7804, + "step": 10050 + }, + { + "epoch": 3.2073674182280087, + "grad_norm": 30143.560546875, + "learning_rate": 3.396475071451254e-05, + "loss": 0.7819, + "step": 10100 + }, + { + "epoch": 3.22324547475389, + "grad_norm": 29888.271484375, + "learning_rate": 3.388536043188314e-05, + "loss": 0.7817, + "step": 10150 + }, + { + "epoch": 3.2391235312797715, + "grad_norm": 29966.201171875, + "learning_rate": 3.380597014925374e-05, + "loss": 0.7809, + "step": 10200 + }, + { + "epoch": 3.2550015878056526, + "grad_norm": 30786.470703125, + "learning_rate": 3.3726579866624324e-05, + "loss": 0.7765, + "step": 10250 + }, + { + "epoch": 3.2708796443315338, + "grad_norm": 30272.826171875, + "learning_rate": 3.364718958399492e-05, + "loss": 0.7806, + "step": 10300 + }, + { + "epoch": 3.286757700857415, + "grad_norm": 29083.400390625, + "learning_rate": 3.356779930136552e-05, + "loss": 0.7799, + "step": 10350 + }, + { + "epoch": 3.302635757383296, + "grad_norm": 29236.140625, + "learning_rate": 3.348840901873611e-05, + "loss": 0.7801, + "step": 10400 + }, + { + "epoch": 3.3185138139091777, + "grad_norm": 29460.9296875, + "learning_rate": 3.34090187361067e-05, + "loss": 0.7779, + "step": 10450 + }, + { + "epoch": 3.334391870435059, + "grad_norm": 29783.908203125, + "learning_rate": 3.332962845347729e-05, + "loss": 0.7782, + "step": 10500 + }, + { + "epoch": 3.35026992696094, + "grad_norm": 30124.951171875, + "learning_rate": 3.3250238170847893e-05, + "loss": 0.7778, + "step": 10550 + }, + { + "epoch": 3.366147983486821, + "grad_norm": 29516.72265625, + "learning_rate": 3.317084788821848e-05, + "loss": 0.7771, + "step": 10600 + }, + { + "epoch": 3.3820260400127022, + "grad_norm": 29329.431640625, + "learning_rate": 3.3091457605589074e-05, + "loss": 0.7777, + "step": 10650 + }, + { + "epoch": 3.397904096538584, + "grad_norm": 29930.46875, + "learning_rate": 3.3012067322959675e-05, + "loss": 0.7776, + "step": 10700 + }, + { + "epoch": 3.413782153064465, + "grad_norm": 30200.29296875, + "learning_rate": 3.293267704033027e-05, + "loss": 0.7774, + "step": 10750 + }, + { + "epoch": 3.429660209590346, + "grad_norm": 29434.279296875, + "learning_rate": 3.2853286757700856e-05, + "loss": 0.7794, + "step": 10800 + }, + { + "epoch": 3.4455382661162273, + "grad_norm": 29533.7265625, + "learning_rate": 3.277389647507145e-05, + "loss": 0.78, + "step": 10850 + }, + { + "epoch": 3.4614163226421084, + "grad_norm": 30062.75390625, + "learning_rate": 3.269450619244205e-05, + "loss": 0.7715, + "step": 10900 + }, + { + "epoch": 3.47729437916799, + "grad_norm": 30203.49609375, + "learning_rate": 3.261511590981264e-05, + "loss": 0.7754, + "step": 10950 + }, + { + "epoch": 3.493172435693871, + "grad_norm": 29971.337890625, + "learning_rate": 3.253572562718323e-05, + "loss": 0.7743, + "step": 11000 + }, + { + "epoch": 3.5090504922197523, + "grad_norm": 30626.96875, + "learning_rate": 3.245633534455383e-05, + "loss": 0.7722, + "step": 11050 + }, + { + "epoch": 3.5249285487456334, + "grad_norm": 29813.41796875, + "learning_rate": 3.2376945061924425e-05, + "loss": 0.7762, + "step": 11100 + }, + { + "epoch": 3.5408066052715146, + "grad_norm": 30018.158203125, + "learning_rate": 3.229755477929501e-05, + "loss": 0.7708, + "step": 11150 + }, + { + "epoch": 3.556684661797396, + "grad_norm": 28745.65625, + "learning_rate": 3.2218164496665606e-05, + "loss": 0.7703, + "step": 11200 + }, + { + "epoch": 3.5725627183232773, + "grad_norm": 29827.396484375, + "learning_rate": 3.213877421403621e-05, + "loss": 0.7731, + "step": 11250 + }, + { + "epoch": 3.5884407748491585, + "grad_norm": 29104.228515625, + "learning_rate": 3.20593839314068e-05, + "loss": 0.7697, + "step": 11300 + }, + { + "epoch": 3.6043188313750396, + "grad_norm": 29951.318359375, + "learning_rate": 3.197999364877739e-05, + "loss": 0.7719, + "step": 11350 + }, + { + "epoch": 3.6201968879009208, + "grad_norm": 29854.1171875, + "learning_rate": 3.190060336614799e-05, + "loss": 0.7707, + "step": 11400 + }, + { + "epoch": 3.6360749444268023, + "grad_norm": 29064.115234375, + "learning_rate": 3.182121308351858e-05, + "loss": 0.7679, + "step": 11450 + }, + { + "epoch": 3.6519530009526835, + "grad_norm": 29561.064453125, + "learning_rate": 3.174182280088917e-05, + "loss": 0.773, + "step": 11500 + }, + { + "epoch": 3.6678310574785646, + "grad_norm": 30519.935546875, + "learning_rate": 3.166243251825976e-05, + "loss": 0.7715, + "step": 11550 + }, + { + "epoch": 3.683709114004446, + "grad_norm": 29498.1796875, + "learning_rate": 3.1583042235630363e-05, + "loss": 0.7696, + "step": 11600 + }, + { + "epoch": 3.699587170530327, + "grad_norm": 29892.5, + "learning_rate": 3.150365195300096e-05, + "loss": 0.7722, + "step": 11650 + }, + { + "epoch": 3.7154652270562085, + "grad_norm": 29588.62890625, + "learning_rate": 3.1424261670371544e-05, + "loss": 0.7695, + "step": 11700 + }, + { + "epoch": 3.7313432835820897, + "grad_norm": 28297.962890625, + "learning_rate": 3.1344871387742145e-05, + "loss": 0.767, + "step": 11750 + }, + { + "epoch": 3.747221340107971, + "grad_norm": 29926.146484375, + "learning_rate": 3.126548110511274e-05, + "loss": 0.7686, + "step": 11800 + }, + { + "epoch": 3.763099396633852, + "grad_norm": 29238.9375, + "learning_rate": 3.1186090822483326e-05, + "loss": 0.7657, + "step": 11850 + }, + { + "epoch": 3.778977453159733, + "grad_norm": 29130.10546875, + "learning_rate": 3.110670053985392e-05, + "loss": 0.7664, + "step": 11900 + }, + { + "epoch": 3.7948555096856147, + "grad_norm": 28701.7890625, + "learning_rate": 3.102731025722452e-05, + "loss": 0.7658, + "step": 11950 + }, + { + "epoch": 3.810733566211496, + "grad_norm": 29051.44140625, + "learning_rate": 3.0947919974595114e-05, + "loss": 0.7693, + "step": 12000 + }, + { + "epoch": 3.826611622737377, + "grad_norm": 29298.595703125, + "learning_rate": 3.08685296919657e-05, + "loss": 0.767, + "step": 12050 + }, + { + "epoch": 3.842489679263258, + "grad_norm": 28832.44921875, + "learning_rate": 3.07891394093363e-05, + "loss": 0.7655, + "step": 12100 + }, + { + "epoch": 3.8583677357891393, + "grad_norm": 28990.67578125, + "learning_rate": 3.0709749126706895e-05, + "loss": 0.7618, + "step": 12150 + }, + { + "epoch": 3.874245792315021, + "grad_norm": 28622.34765625, + "learning_rate": 3.063035884407749e-05, + "loss": 0.7664, + "step": 12200 + }, + { + "epoch": 3.890123848840902, + "grad_norm": 29283.064453125, + "learning_rate": 3.0550968561448076e-05, + "loss": 0.7678, + "step": 12250 + }, + { + "epoch": 3.906001905366783, + "grad_norm": 29989.224609375, + "learning_rate": 3.0471578278818673e-05, + "loss": 0.7631, + "step": 12300 + }, + { + "epoch": 3.9218799618926643, + "grad_norm": 30515.478515625, + "learning_rate": 3.039218799618927e-05, + "loss": 0.7634, + "step": 12350 + }, + { + "epoch": 3.9377580184185454, + "grad_norm": 29057.341796875, + "learning_rate": 3.031279771355986e-05, + "loss": 0.7649, + "step": 12400 + }, + { + "epoch": 3.953636074944427, + "grad_norm": 29256.357421875, + "learning_rate": 3.0233407430930455e-05, + "loss": 0.7617, + "step": 12450 + }, + { + "epoch": 3.969514131470308, + "grad_norm": 29832.517578125, + "learning_rate": 3.015401714830105e-05, + "loss": 0.762, + "step": 12500 + }, + { + "epoch": 3.9853921879961893, + "grad_norm": 30056.580078125, + "learning_rate": 3.0074626865671646e-05, + "loss": 0.7635, + "step": 12550 + }, + { + "epoch": 4.001270244522071, + "grad_norm": 30290.673828125, + "learning_rate": 2.9995236583042236e-05, + "loss": 0.761, + "step": 12600 + }, + { + "epoch": 4.017148301047952, + "grad_norm": 30393.2421875, + "learning_rate": 2.991584630041283e-05, + "loss": 0.7576, + "step": 12650 + }, + { + "epoch": 4.033026357573833, + "grad_norm": 29898.7734375, + "learning_rate": 2.9836456017783427e-05, + "loss": 0.7592, + "step": 12700 + }, + { + "epoch": 4.048904414099714, + "grad_norm": 29665.693359375, + "learning_rate": 2.9757065735154018e-05, + "loss": 0.7564, + "step": 12750 + }, + { + "epoch": 4.0647824706255955, + "grad_norm": 29677.63671875, + "learning_rate": 2.967767545252461e-05, + "loss": 0.7564, + "step": 12800 + }, + { + "epoch": 4.080660527151477, + "grad_norm": 29343.1328125, + "learning_rate": 2.9598285169895205e-05, + "loss": 0.7589, + "step": 12850 + }, + { + "epoch": 4.096538583677358, + "grad_norm": 28720.55078125, + "learning_rate": 2.9518894887265802e-05, + "loss": 0.7543, + "step": 12900 + }, + { + "epoch": 4.112416640203239, + "grad_norm": 28604.630859375, + "learning_rate": 2.9439504604636393e-05, + "loss": 0.7554, + "step": 12950 + }, + { + "epoch": 4.12829469672912, + "grad_norm": 29508.154296875, + "learning_rate": 2.9360114322006987e-05, + "loss": 0.7555, + "step": 13000 + }, + { + "epoch": 4.144172753255002, + "grad_norm": 28996.201171875, + "learning_rate": 2.9280724039377584e-05, + "loss": 0.7552, + "step": 13050 + }, + { + "epoch": 4.160050809780882, + "grad_norm": 31208.623046875, + "learning_rate": 2.9201333756748178e-05, + "loss": 0.7584, + "step": 13100 + }, + { + "epoch": 4.175928866306764, + "grad_norm": 29778.853515625, + "learning_rate": 2.9121943474118768e-05, + "loss": 0.7549, + "step": 13150 + }, + { + "epoch": 4.1918069228326456, + "grad_norm": 29337.587890625, + "learning_rate": 2.9042553191489362e-05, + "loss": 0.7546, + "step": 13200 + }, + { + "epoch": 4.207684979358526, + "grad_norm": 28682.619140625, + "learning_rate": 2.896316290885996e-05, + "loss": 0.749, + "step": 13250 + }, + { + "epoch": 4.223563035884408, + "grad_norm": 29200.912109375, + "learning_rate": 2.888377262623055e-05, + "loss": 0.7545, + "step": 13300 + }, + { + "epoch": 4.2394410924102885, + "grad_norm": 30967.3515625, + "learning_rate": 2.8804382343601143e-05, + "loss": 0.7558, + "step": 13350 + }, + { + "epoch": 4.25531914893617, + "grad_norm": 30665.638671875, + "learning_rate": 2.872499206097174e-05, + "loss": 0.7532, + "step": 13400 + }, + { + "epoch": 4.271197205462052, + "grad_norm": 28959.0625, + "learning_rate": 2.8645601778342334e-05, + "loss": 0.7553, + "step": 13450 + }, + { + "epoch": 4.287075261987932, + "grad_norm": 29186.98046875, + "learning_rate": 2.8566211495712925e-05, + "loss": 0.7535, + "step": 13500 + }, + { + "epoch": 4.302953318513814, + "grad_norm": 29713.626953125, + "learning_rate": 2.848682121308352e-05, + "loss": 0.7526, + "step": 13550 + }, + { + "epoch": 4.318831375039695, + "grad_norm": 30507.126953125, + "learning_rate": 2.8407430930454116e-05, + "loss": 0.7557, + "step": 13600 + }, + { + "epoch": 4.334709431565576, + "grad_norm": 28860.220703125, + "learning_rate": 2.8328040647824706e-05, + "loss": 0.7497, + "step": 13650 + }, + { + "epoch": 4.350587488091458, + "grad_norm": 28531.17578125, + "learning_rate": 2.82486503651953e-05, + "loss": 0.7536, + "step": 13700 + }, + { + "epoch": 4.366465544617339, + "grad_norm": 29637.4296875, + "learning_rate": 2.8169260082565897e-05, + "loss": 0.7536, + "step": 13750 + }, + { + "epoch": 4.38234360114322, + "grad_norm": 29468.94140625, + "learning_rate": 2.808986979993649e-05, + "loss": 0.7517, + "step": 13800 + }, + { + "epoch": 4.398221657669101, + "grad_norm": 29931.30859375, + "learning_rate": 2.801047951730708e-05, + "loss": 0.7514, + "step": 13850 + }, + { + "epoch": 4.4140997141949825, + "grad_norm": 29154.953125, + "learning_rate": 2.7931089234677675e-05, + "loss": 0.7545, + "step": 13900 + }, + { + "epoch": 4.429977770720864, + "grad_norm": 29288.595703125, + "learning_rate": 2.7851698952048273e-05, + "loss": 0.7535, + "step": 13950 + }, + { + "epoch": 4.445855827246745, + "grad_norm": 29386.255859375, + "learning_rate": 2.7772308669418866e-05, + "loss": 0.7517, + "step": 14000 + }, + { + "epoch": 4.461733883772626, + "grad_norm": 28722.88671875, + "learning_rate": 2.7692918386789457e-05, + "loss": 0.7482, + "step": 14050 + }, + { + "epoch": 4.477611940298507, + "grad_norm": 29279.75, + "learning_rate": 2.7613528104160054e-05, + "loss": 0.7481, + "step": 14100 + }, + { + "epoch": 4.493489996824389, + "grad_norm": 29140.587890625, + "learning_rate": 2.7534137821530648e-05, + "loss": 0.7462, + "step": 14150 + }, + { + "epoch": 4.50936805335027, + "grad_norm": 28993.6484375, + "learning_rate": 2.7454747538901238e-05, + "loss": 0.7473, + "step": 14200 + }, + { + "epoch": 4.525246109876151, + "grad_norm": 30390.90234375, + "learning_rate": 2.7375357256271832e-05, + "loss": 0.7515, + "step": 14250 + }, + { + "epoch": 4.5411241664020325, + "grad_norm": 29218.51953125, + "learning_rate": 2.729596697364243e-05, + "loss": 0.7471, + "step": 14300 + }, + { + "epoch": 4.557002222927913, + "grad_norm": 30701.79296875, + "learning_rate": 2.7216576691013023e-05, + "loss": 0.7487, + "step": 14350 + }, + { + "epoch": 4.572880279453795, + "grad_norm": 29310.14453125, + "learning_rate": 2.7137186408383613e-05, + "loss": 0.7492, + "step": 14400 + }, + { + "epoch": 4.588758335979676, + "grad_norm": 29325.59375, + "learning_rate": 2.705779612575421e-05, + "loss": 0.7479, + "step": 14450 + }, + { + "epoch": 4.604636392505557, + "grad_norm": 29217.953125, + "learning_rate": 2.6978405843124804e-05, + "loss": 0.7512, + "step": 14500 + }, + { + "epoch": 4.620514449031439, + "grad_norm": 29466.189453125, + "learning_rate": 2.6899015560495395e-05, + "loss": 0.7477, + "step": 14550 + }, + { + "epoch": 4.636392505557319, + "grad_norm": 28055.72265625, + "learning_rate": 2.681962527786599e-05, + "loss": 0.749, + "step": 14600 + }, + { + "epoch": 4.652270562083201, + "grad_norm": 28245.34375, + "learning_rate": 2.6740234995236586e-05, + "loss": 0.7479, + "step": 14650 + }, + { + "epoch": 4.668148618609083, + "grad_norm": 29280.736328125, + "learning_rate": 2.666084471260718e-05, + "loss": 0.7484, + "step": 14700 + }, + { + "epoch": 4.684026675134963, + "grad_norm": 29603.513671875, + "learning_rate": 2.658145442997777e-05, + "loss": 0.7499, + "step": 14750 + }, + { + "epoch": 4.699904731660845, + "grad_norm": 28774.55078125, + "learning_rate": 2.6502064147348364e-05, + "loss": 0.7463, + "step": 14800 + }, + { + "epoch": 4.715782788186726, + "grad_norm": 29580.22265625, + "learning_rate": 2.642267386471896e-05, + "loss": 0.7424, + "step": 14850 + }, + { + "epoch": 4.731660844712607, + "grad_norm": 28998.916015625, + "learning_rate": 2.6343283582089555e-05, + "loss": 0.7457, + "step": 14900 + }, + { + "epoch": 4.747538901238489, + "grad_norm": 28983.86328125, + "learning_rate": 2.6263893299460145e-05, + "loss": 0.7412, + "step": 14950 + }, + { + "epoch": 4.7634169577643695, + "grad_norm": 30523.78125, + "learning_rate": 2.6184503016830743e-05, + "loss": 0.7468, + "step": 15000 + }, + { + "epoch": 4.779295014290251, + "grad_norm": 29167.177734375, + "learning_rate": 2.6105112734201336e-05, + "loss": 0.7435, + "step": 15050 + }, + { + "epoch": 4.795173070816132, + "grad_norm": 29224.529296875, + "learning_rate": 2.6025722451571927e-05, + "loss": 0.7409, + "step": 15100 + }, + { + "epoch": 4.811051127342013, + "grad_norm": 28679.3046875, + "learning_rate": 2.594633216894252e-05, + "loss": 0.7394, + "step": 15150 + }, + { + "epoch": 4.826929183867895, + "grad_norm": 29085.837890625, + "learning_rate": 2.5866941886313118e-05, + "loss": 0.745, + "step": 15200 + }, + { + "epoch": 4.842807240393776, + "grad_norm": 28241.09765625, + "learning_rate": 2.578755160368371e-05, + "loss": 0.7456, + "step": 15250 + }, + { + "epoch": 4.858685296919657, + "grad_norm": 29162.150390625, + "learning_rate": 2.5708161321054302e-05, + "loss": 0.7428, + "step": 15300 + }, + { + "epoch": 4.874563353445538, + "grad_norm": 29517.7578125, + "learning_rate": 2.56287710384249e-05, + "loss": 0.7418, + "step": 15350 + }, + { + "epoch": 4.8904414099714195, + "grad_norm": 28757.634765625, + "learning_rate": 2.5549380755795493e-05, + "loss": 0.7475, + "step": 15400 + }, + { + "epoch": 4.906319466497301, + "grad_norm": 29112.775390625, + "learning_rate": 2.5469990473166083e-05, + "loss": 0.742, + "step": 15450 + }, + { + "epoch": 4.922197523023182, + "grad_norm": 29536.763671875, + "learning_rate": 2.5390600190536677e-05, + "loss": 0.7423, + "step": 15500 + }, + { + "epoch": 4.938075579549063, + "grad_norm": 29418.62890625, + "learning_rate": 2.5311209907907274e-05, + "loss": 0.7445, + "step": 15550 + }, + { + "epoch": 4.953953636074944, + "grad_norm": 29107.703125, + "learning_rate": 2.5231819625277868e-05, + "loss": 0.7421, + "step": 15600 + }, + { + "epoch": 4.969831692600826, + "grad_norm": 28584.513671875, + "learning_rate": 2.515242934264846e-05, + "loss": 0.7416, + "step": 15650 + }, + { + "epoch": 4.985709749126707, + "grad_norm": 29202.330078125, + "learning_rate": 2.5073039060019056e-05, + "loss": 0.7404, + "step": 15700 + }, + { + "epoch": 5.001587805652588, + "grad_norm": 28508.923828125, + "learning_rate": 2.499364877738965e-05, + "loss": 0.7401, + "step": 15750 + }, + { + "epoch": 5.01746586217847, + "grad_norm": 29855.150390625, + "learning_rate": 2.4914258494760243e-05, + "loss": 0.7377, + "step": 15800 + }, + { + "epoch": 5.03334391870435, + "grad_norm": 28641.822265625, + "learning_rate": 2.4834868212130834e-05, + "loss": 0.7373, + "step": 15850 + }, + { + "epoch": 5.049221975230232, + "grad_norm": 29080.0234375, + "learning_rate": 2.475547792950143e-05, + "loss": 0.7387, + "step": 15900 + }, + { + "epoch": 5.0651000317561135, + "grad_norm": 29752.501953125, + "learning_rate": 2.467608764687202e-05, + "loss": 0.7364, + "step": 15950 + }, + { + "epoch": 5.080978088281994, + "grad_norm": 29244.943359375, + "learning_rate": 2.459669736424262e-05, + "loss": 0.7386, + "step": 16000 + }, + { + "epoch": 5.096856144807876, + "grad_norm": 30756.474609375, + "learning_rate": 2.4517307081613213e-05, + "loss": 0.7365, + "step": 16050 + }, + { + "epoch": 5.1127342013337564, + "grad_norm": 29783.80078125, + "learning_rate": 2.4437916798983806e-05, + "loss": 0.7345, + "step": 16100 + }, + { + "epoch": 5.128612257859638, + "grad_norm": 28871.20703125, + "learning_rate": 2.43585265163544e-05, + "loss": 0.7385, + "step": 16150 + }, + { + "epoch": 5.14449031438552, + "grad_norm": 29666.92578125, + "learning_rate": 2.4279136233724994e-05, + "loss": 0.7364, + "step": 16200 + }, + { + "epoch": 5.1603683709114, + "grad_norm": 27965.49609375, + "learning_rate": 2.4199745951095588e-05, + "loss": 0.7349, + "step": 16250 + }, + { + "epoch": 5.176246427437282, + "grad_norm": 29663.21484375, + "learning_rate": 2.4120355668466178e-05, + "loss": 0.7336, + "step": 16300 + }, + { + "epoch": 5.192124483963163, + "grad_norm": 29286.236328125, + "learning_rate": 2.4040965385836775e-05, + "loss": 0.7353, + "step": 16350 + }, + { + "epoch": 5.208002540489044, + "grad_norm": 28999.544921875, + "learning_rate": 2.396157510320737e-05, + "loss": 0.7364, + "step": 16400 + }, + { + "epoch": 5.223880597014926, + "grad_norm": 29259.37890625, + "learning_rate": 2.3882184820577963e-05, + "loss": 0.7386, + "step": 16450 + }, + { + "epoch": 5.2397586535408065, + "grad_norm": 29506.087890625, + "learning_rate": 2.3802794537948557e-05, + "loss": 0.734, + "step": 16500 + }, + { + "epoch": 5.255636710066688, + "grad_norm": 29443.67578125, + "learning_rate": 2.372340425531915e-05, + "loss": 0.7408, + "step": 16550 + }, + { + "epoch": 5.271514766592569, + "grad_norm": 29926.109375, + "learning_rate": 2.3644013972689744e-05, + "loss": 0.7368, + "step": 16600 + }, + { + "epoch": 5.28739282311845, + "grad_norm": 30687.26953125, + "learning_rate": 2.3564623690060338e-05, + "loss": 0.7345, + "step": 16650 + }, + { + "epoch": 5.303270879644332, + "grad_norm": 28573.580078125, + "learning_rate": 2.3485233407430932e-05, + "loss": 0.7348, + "step": 16700 + }, + { + "epoch": 5.319148936170213, + "grad_norm": 30217.728515625, + "learning_rate": 2.3405843124801526e-05, + "loss": 0.7348, + "step": 16750 + }, + { + "epoch": 5.335026992696094, + "grad_norm": 28830.787109375, + "learning_rate": 2.332645284217212e-05, + "loss": 0.7389, + "step": 16800 + }, + { + "epoch": 5.350905049221975, + "grad_norm": 29589.51953125, + "learning_rate": 2.3247062559542713e-05, + "loss": 0.7364, + "step": 16850 + }, + { + "epoch": 5.3667831057478566, + "grad_norm": 28568.32421875, + "learning_rate": 2.3167672276913307e-05, + "loss": 0.7348, + "step": 16900 + }, + { + "epoch": 5.382661162273738, + "grad_norm": 30656.90234375, + "learning_rate": 2.30882819942839e-05, + "loss": 0.7346, + "step": 16950 + }, + { + "epoch": 5.398539218799619, + "grad_norm": 29044.07421875, + "learning_rate": 2.3008891711654495e-05, + "loss": 0.7363, + "step": 17000 + }, + { + "epoch": 5.4144172753255, + "grad_norm": 29246.9765625, + "learning_rate": 2.292950142902509e-05, + "loss": 0.7376, + "step": 17050 + }, + { + "epoch": 5.430295331851381, + "grad_norm": 29861.26171875, + "learning_rate": 2.2850111146395683e-05, + "loss": 0.7322, + "step": 17100 + }, + { + "epoch": 5.446173388377263, + "grad_norm": 29695.34765625, + "learning_rate": 2.2770720863766276e-05, + "loss": 0.7324, + "step": 17150 + }, + { + "epoch": 5.462051444903144, + "grad_norm": 29459.607421875, + "learning_rate": 2.269133058113687e-05, + "loss": 0.733, + "step": 17200 + }, + { + "epoch": 5.477929501429025, + "grad_norm": 28512.21484375, + "learning_rate": 2.2611940298507464e-05, + "loss": 0.7274, + "step": 17250 + }, + { + "epoch": 5.493807557954907, + "grad_norm": 29732.572265625, + "learning_rate": 2.2532550015878058e-05, + "loss": 0.7333, + "step": 17300 + }, + { + "epoch": 5.509685614480787, + "grad_norm": 29110.30859375, + "learning_rate": 2.245315973324865e-05, + "loss": 0.7344, + "step": 17350 + }, + { + "epoch": 5.525563671006669, + "grad_norm": 29294.5859375, + "learning_rate": 2.2373769450619245e-05, + "loss": 0.7292, + "step": 17400 + }, + { + "epoch": 5.5414417275325505, + "grad_norm": 28999.29296875, + "learning_rate": 2.229437916798984e-05, + "loss": 0.7322, + "step": 17450 + }, + { + "epoch": 5.557319784058431, + "grad_norm": 28789.3359375, + "learning_rate": 2.2214988885360433e-05, + "loss": 0.7334, + "step": 17500 + }, + { + "epoch": 5.573197840584313, + "grad_norm": 29767.94140625, + "learning_rate": 2.2135598602731027e-05, + "loss": 0.7324, + "step": 17550 + }, + { + "epoch": 5.5890758971101935, + "grad_norm": 29029.384765625, + "learning_rate": 2.205620832010162e-05, + "loss": 0.7318, + "step": 17600 + }, + { + "epoch": 5.604953953636075, + "grad_norm": 29995.126953125, + "learning_rate": 2.1976818037472214e-05, + "loss": 0.7332, + "step": 17650 + }, + { + "epoch": 5.620832010161957, + "grad_norm": 29895.734375, + "learning_rate": 2.1897427754842808e-05, + "loss": 0.7323, + "step": 17700 + }, + { + "epoch": 5.636710066687837, + "grad_norm": 28998.80078125, + "learning_rate": 2.1818037472213402e-05, + "loss": 0.7308, + "step": 17750 + }, + { + "epoch": 5.652588123213719, + "grad_norm": 29375.140625, + "learning_rate": 2.1738647189583996e-05, + "loss": 0.7307, + "step": 17800 + }, + { + "epoch": 5.6684661797396, + "grad_norm": 30084.34375, + "learning_rate": 2.165925690695459e-05, + "loss": 0.7292, + "step": 17850 + }, + { + "epoch": 5.684344236265481, + "grad_norm": 29009.86328125, + "learning_rate": 2.1579866624325183e-05, + "loss": 0.7311, + "step": 17900 + }, + { + "epoch": 5.700222292791363, + "grad_norm": 28996.576171875, + "learning_rate": 2.1500476341695777e-05, + "loss": 0.731, + "step": 17950 + }, + { + "epoch": 5.7161003493172435, + "grad_norm": 28874.802734375, + "learning_rate": 2.142108605906637e-05, + "loss": 0.7301, + "step": 18000 + }, + { + "epoch": 5.731978405843125, + "grad_norm": 29940.734375, + "learning_rate": 2.1341695776436965e-05, + "loss": 0.73, + "step": 18050 + }, + { + "epoch": 5.747856462369006, + "grad_norm": 29607.59765625, + "learning_rate": 2.126230549380756e-05, + "loss": 0.7311, + "step": 18100 + }, + { + "epoch": 5.763734518894887, + "grad_norm": 29006.7890625, + "learning_rate": 2.1182915211178153e-05, + "loss": 0.7301, + "step": 18150 + }, + { + "epoch": 5.779612575420768, + "grad_norm": 30157.11328125, + "learning_rate": 2.1103524928548746e-05, + "loss": 0.7281, + "step": 18200 + }, + { + "epoch": 5.79549063194665, + "grad_norm": 29396.783203125, + "learning_rate": 2.102413464591934e-05, + "loss": 0.7315, + "step": 18250 + }, + { + "epoch": 5.811368688472531, + "grad_norm": 29335.1015625, + "learning_rate": 2.0944744363289934e-05, + "loss": 0.7279, + "step": 18300 + }, + { + "epoch": 5.827246744998412, + "grad_norm": 28650.416015625, + "learning_rate": 2.0865354080660528e-05, + "loss": 0.7347, + "step": 18350 + }, + { + "epoch": 5.843124801524294, + "grad_norm": 29561.095703125, + "learning_rate": 2.078596379803112e-05, + "loss": 0.7263, + "step": 18400 + }, + { + "epoch": 5.859002858050174, + "grad_norm": 28661.658203125, + "learning_rate": 2.070657351540172e-05, + "loss": 0.7314, + "step": 18450 + }, + { + "epoch": 5.874880914576056, + "grad_norm": 28372.755859375, + "learning_rate": 2.062718323277231e-05, + "loss": 0.7265, + "step": 18500 + }, + { + "epoch": 5.8907589711019375, + "grad_norm": 28641.00390625, + "learning_rate": 2.0547792950142903e-05, + "loss": 0.7296, + "step": 18550 + }, + { + "epoch": 5.906637027627818, + "grad_norm": 29402.455078125, + "learning_rate": 2.0468402667513497e-05, + "loss": 0.7257, + "step": 18600 + }, + { + "epoch": 5.9225150841537, + "grad_norm": 29028.78515625, + "learning_rate": 2.038901238488409e-05, + "loss": 0.7262, + "step": 18650 + }, + { + "epoch": 5.9383931406795805, + "grad_norm": 29547.03125, + "learning_rate": 2.0309622102254684e-05, + "loss": 0.728, + "step": 18700 + }, + { + "epoch": 5.954271197205462, + "grad_norm": 29396.15234375, + "learning_rate": 2.0230231819625278e-05, + "loss": 0.725, + "step": 18750 + }, + { + "epoch": 5.970149253731344, + "grad_norm": 29648.634765625, + "learning_rate": 2.0150841536995872e-05, + "loss": 0.7264, + "step": 18800 + }, + { + "epoch": 5.986027310257224, + "grad_norm": 29708.830078125, + "learning_rate": 2.0071451254366466e-05, + "loss": 0.7263, + "step": 18850 + }, + { + "epoch": 6.001905366783106, + "grad_norm": 29569.521484375, + "learning_rate": 1.9992060971737063e-05, + "loss": 0.7249, + "step": 18900 + }, + { + "epoch": 6.017783423308987, + "grad_norm": 29247.328125, + "learning_rate": 1.9912670689107653e-05, + "loss": 0.7248, + "step": 18950 + }, + { + "epoch": 6.033661479834868, + "grad_norm": 29513.408203125, + "learning_rate": 1.9833280406478247e-05, + "loss": 0.7243, + "step": 19000 + }, + { + "epoch": 6.04953953636075, + "grad_norm": 29599.560546875, + "learning_rate": 1.975389012384884e-05, + "loss": 0.7228, + "step": 19050 + }, + { + "epoch": 6.0654175928866305, + "grad_norm": 28947.3515625, + "learning_rate": 1.9674499841219435e-05, + "loss": 0.7261, + "step": 19100 + }, + { + "epoch": 6.081295649412512, + "grad_norm": 28986.390625, + "learning_rate": 1.959510955859003e-05, + "loss": 0.721, + "step": 19150 + }, + { + "epoch": 6.097173705938393, + "grad_norm": 29116.353515625, + "learning_rate": 1.9515719275960623e-05, + "loss": 0.7193, + "step": 19200 + }, + { + "epoch": 6.113051762464274, + "grad_norm": 29632.84375, + "learning_rate": 1.943632899333122e-05, + "loss": 0.7228, + "step": 19250 + }, + { + "epoch": 6.128929818990156, + "grad_norm": 29931.40234375, + "learning_rate": 1.935693871070181e-05, + "loss": 0.7233, + "step": 19300 + }, + { + "epoch": 6.144807875516037, + "grad_norm": 29751.517578125, + "learning_rate": 1.9277548428072407e-05, + "loss": 0.7249, + "step": 19350 + }, + { + "epoch": 6.160685932041918, + "grad_norm": 30318.1796875, + "learning_rate": 1.9198158145442998e-05, + "loss": 0.7232, + "step": 19400 + }, + { + "epoch": 6.176563988567799, + "grad_norm": 28459.033203125, + "learning_rate": 1.911876786281359e-05, + "loss": 0.7219, + "step": 19450 + }, + { + "epoch": 6.192442045093681, + "grad_norm": 29604.791015625, + "learning_rate": 1.9039377580184185e-05, + "loss": 0.7232, + "step": 19500 + }, + { + "epoch": 6.208320101619562, + "grad_norm": 29528.1484375, + "learning_rate": 1.895998729755478e-05, + "loss": 0.7192, + "step": 19550 + }, + { + "epoch": 6.224198158145443, + "grad_norm": 28977.87109375, + "learning_rate": 1.8880597014925376e-05, + "loss": 0.7198, + "step": 19600 + }, + { + "epoch": 6.2400762146713245, + "grad_norm": 29568.908203125, + "learning_rate": 1.8801206732295967e-05, + "loss": 0.7231, + "step": 19650 + }, + { + "epoch": 6.255954271197205, + "grad_norm": 28843.431640625, + "learning_rate": 1.8721816449666564e-05, + "loss": 0.7188, + "step": 19700 + }, + { + "epoch": 6.271832327723087, + "grad_norm": 29127.72265625, + "learning_rate": 1.8642426167037154e-05, + "loss": 0.7218, + "step": 19750 + }, + { + "epoch": 6.287710384248968, + "grad_norm": 29642.048828125, + "learning_rate": 1.856303588440775e-05, + "loss": 0.7182, + "step": 19800 + }, + { + "epoch": 6.303588440774849, + "grad_norm": 29874.93359375, + "learning_rate": 1.8483645601778342e-05, + "loss": 0.719, + "step": 19850 + }, + { + "epoch": 6.319466497300731, + "grad_norm": 29941.87890625, + "learning_rate": 1.8404255319148936e-05, + "loss": 0.7198, + "step": 19900 + }, + { + "epoch": 6.335344553826611, + "grad_norm": 29865.37890625, + "learning_rate": 1.832486503651953e-05, + "loss": 0.7249, + "step": 19950 + }, + { + "epoch": 6.351222610352493, + "grad_norm": 30395.484375, + "learning_rate": 1.8245474753890123e-05, + "loss": 0.7195, + "step": 20000 + }, + { + "epoch": 6.367100666878374, + "grad_norm": 30510.9765625, + "learning_rate": 1.816608447126072e-05, + "loss": 0.7206, + "step": 20050 + }, + { + "epoch": 6.382978723404255, + "grad_norm": 30179.208984375, + "learning_rate": 1.808669418863131e-05, + "loss": 0.7208, + "step": 20100 + }, + { + "epoch": 6.398856779930137, + "grad_norm": 30193.3125, + "learning_rate": 1.8007303906001908e-05, + "loss": 0.7227, + "step": 20150 + }, + { + "epoch": 6.4147348364560175, + "grad_norm": 30033.546875, + "learning_rate": 1.79279136233725e-05, + "loss": 0.7172, + "step": 20200 + }, + { + "epoch": 6.430612892981899, + "grad_norm": 30468.65625, + "learning_rate": 1.7848523340743096e-05, + "loss": 0.7215, + "step": 20250 + }, + { + "epoch": 6.44649094950778, + "grad_norm": 29339.623046875, + "learning_rate": 1.7769133058113686e-05, + "loss": 0.7174, + "step": 20300 + }, + { + "epoch": 6.462369006033661, + "grad_norm": 29816.017578125, + "learning_rate": 1.768974277548428e-05, + "loss": 0.7194, + "step": 20350 + }, + { + "epoch": 6.478247062559543, + "grad_norm": 29318.12109375, + "learning_rate": 1.7610352492854877e-05, + "loss": 0.7196, + "step": 20400 + }, + { + "epoch": 6.494125119085424, + "grad_norm": 29063.029296875, + "learning_rate": 1.7530962210225468e-05, + "loss": 0.7185, + "step": 20450 + }, + { + "epoch": 6.510003175611305, + "grad_norm": 30219.3359375, + "learning_rate": 1.7451571927596065e-05, + "loss": 0.7168, + "step": 20500 + }, + { + "epoch": 6.525881232137186, + "grad_norm": 29047.94140625, + "learning_rate": 1.7372181644966655e-05, + "loss": 0.721, + "step": 20550 + }, + { + "epoch": 6.5417592886630676, + "grad_norm": 28919.94140625, + "learning_rate": 1.7292791362337253e-05, + "loss": 0.7172, + "step": 20600 + }, + { + "epoch": 6.557637345188949, + "grad_norm": 30273.14453125, + "learning_rate": 1.7213401079707843e-05, + "loss": 0.7188, + "step": 20650 + }, + { + "epoch": 6.57351540171483, + "grad_norm": 30020.40625, + "learning_rate": 1.713401079707844e-05, + "loss": 0.7184, + "step": 20700 + }, + { + "epoch": 6.589393458240711, + "grad_norm": 29766.990234375, + "learning_rate": 1.705462051444903e-05, + "loss": 0.7162, + "step": 20750 + }, + { + "epoch": 6.605271514766592, + "grad_norm": 29873.83203125, + "learning_rate": 1.6975230231819624e-05, + "loss": 0.7176, + "step": 20800 + }, + { + "epoch": 6.621149571292474, + "grad_norm": 29591.2578125, + "learning_rate": 1.689583994919022e-05, + "loss": 0.715, + "step": 20850 + }, + { + "epoch": 6.637027627818355, + "grad_norm": 30105.244140625, + "learning_rate": 1.6816449666560812e-05, + "loss": 0.716, + "step": 20900 + }, + { + "epoch": 6.652905684344236, + "grad_norm": 29515.470703125, + "learning_rate": 1.673705938393141e-05, + "loss": 0.7165, + "step": 20950 + }, + { + "epoch": 6.668783740870118, + "grad_norm": 29495.97265625, + "learning_rate": 1.6657669101302e-05, + "loss": 0.7157, + "step": 21000 + }, + { + "epoch": 6.684661797395998, + "grad_norm": 29483.634765625, + "learning_rate": 1.6578278818672597e-05, + "loss": 0.7207, + "step": 21050 + }, + { + "epoch": 6.70053985392188, + "grad_norm": 29576.375, + "learning_rate": 1.6498888536043187e-05, + "loss": 0.7158, + "step": 21100 + }, + { + "epoch": 6.7164179104477615, + "grad_norm": 29991.36328125, + "learning_rate": 1.6419498253413784e-05, + "loss": 0.7142, + "step": 21150 + }, + { + "epoch": 6.732295966973642, + "grad_norm": 30472.951171875, + "learning_rate": 1.6340107970784378e-05, + "loss": 0.7157, + "step": 21200 + }, + { + "epoch": 6.748174023499524, + "grad_norm": 29432.845703125, + "learning_rate": 1.626071768815497e-05, + "loss": 0.7164, + "step": 21250 + }, + { + "epoch": 6.7640520800254045, + "grad_norm": 30286.595703125, + "learning_rate": 1.6181327405525566e-05, + "loss": 0.7215, + "step": 21300 + }, + { + "epoch": 6.779930136551286, + "grad_norm": 29090.42578125, + "learning_rate": 1.6101937122896156e-05, + "loss": 0.7189, + "step": 21350 + }, + { + "epoch": 6.795808193077168, + "grad_norm": 29895.060546875, + "learning_rate": 1.6022546840266754e-05, + "loss": 0.7172, + "step": 21400 + }, + { + "epoch": 6.811686249603048, + "grad_norm": 29853.85546875, + "learning_rate": 1.5943156557637344e-05, + "loss": 0.713, + "step": 21450 + }, + { + "epoch": 6.82756430612893, + "grad_norm": 30071.662109375, + "learning_rate": 1.586376627500794e-05, + "loss": 0.712, + "step": 21500 + }, + { + "epoch": 6.843442362654811, + "grad_norm": 28977.619140625, + "learning_rate": 1.5784375992378535e-05, + "loss": 0.7134, + "step": 21550 + }, + { + "epoch": 6.859320419180692, + "grad_norm": 29966.154296875, + "learning_rate": 1.570498570974913e-05, + "loss": 0.7147, + "step": 21600 + }, + { + "epoch": 6.875198475706574, + "grad_norm": 29842.240234375, + "learning_rate": 1.5625595427119723e-05, + "loss": 0.7201, + "step": 21650 + }, + { + "epoch": 6.8910765322324545, + "grad_norm": 29326.939453125, + "learning_rate": 1.5546205144490313e-05, + "loss": 0.7159, + "step": 21700 + }, + { + "epoch": 6.906954588758336, + "grad_norm": 30401.865234375, + "learning_rate": 1.546681486186091e-05, + "loss": 0.7154, + "step": 21750 + }, + { + "epoch": 6.922832645284217, + "grad_norm": 30828.583984375, + "learning_rate": 1.53874245792315e-05, + "loss": 0.7167, + "step": 21800 + }, + { + "epoch": 6.938710701810098, + "grad_norm": 30084.78125, + "learning_rate": 1.5308034296602098e-05, + "loss": 0.7147, + "step": 21850 + }, + { + "epoch": 6.95458875833598, + "grad_norm": 29893.818359375, + "learning_rate": 1.522864401397269e-05, + "loss": 0.7174, + "step": 21900 + }, + { + "epoch": 6.970466814861861, + "grad_norm": 29464.3125, + "learning_rate": 1.5149253731343285e-05, + "loss": 0.7158, + "step": 21950 + }, + { + "epoch": 6.986344871387742, + "grad_norm": 28943.29296875, + "learning_rate": 1.5069863448713878e-05, + "loss": 0.7111, + "step": 22000 + }, + { + "epoch": 7.002222927913623, + "grad_norm": 30447.416015625, + "learning_rate": 1.4990473166084473e-05, + "loss": 0.7167, + "step": 22050 + }, + { + "epoch": 7.018100984439505, + "grad_norm": 30325.32421875, + "learning_rate": 1.4911082883455065e-05, + "loss": 0.7102, + "step": 22100 + }, + { + "epoch": 7.033979040965386, + "grad_norm": 30525.169921875, + "learning_rate": 1.4831692600825659e-05, + "loss": 0.7102, + "step": 22150 + }, + { + "epoch": 7.049857097491267, + "grad_norm": 30165.91015625, + "learning_rate": 1.4752302318196254e-05, + "loss": 0.7124, + "step": 22200 + }, + { + "epoch": 7.0657351540171485, + "grad_norm": 29645.66796875, + "learning_rate": 1.4672912035566847e-05, + "loss": 0.709, + "step": 22250 + }, + { + "epoch": 7.081613210543029, + "grad_norm": 29299.05859375, + "learning_rate": 1.4593521752937442e-05, + "loss": 0.7105, + "step": 22300 + }, + { + "epoch": 7.097491267068911, + "grad_norm": 29655.083984375, + "learning_rate": 1.4514131470308034e-05, + "loss": 0.7106, + "step": 22350 + }, + { + "epoch": 7.113369323594792, + "grad_norm": 30337.484375, + "learning_rate": 1.443474118767863e-05, + "loss": 0.7153, + "step": 22400 + }, + { + "epoch": 7.129247380120673, + "grad_norm": 29374.36328125, + "learning_rate": 1.4355350905049222e-05, + "loss": 0.7101, + "step": 22450 + }, + { + "epoch": 7.145125436646555, + "grad_norm": 29468.953125, + "learning_rate": 1.4275960622419817e-05, + "loss": 0.7074, + "step": 22500 + }, + { + "epoch": 7.161003493172435, + "grad_norm": 30660.234375, + "learning_rate": 1.4196570339790411e-05, + "loss": 0.7107, + "step": 22550 + }, + { + "epoch": 7.176881549698317, + "grad_norm": 29092.36328125, + "learning_rate": 1.4117180057161003e-05, + "loss": 0.7102, + "step": 22600 + }, + { + "epoch": 7.1927596062241985, + "grad_norm": 29264.359375, + "learning_rate": 1.4037789774531599e-05, + "loss": 0.7075, + "step": 22650 + }, + { + "epoch": 7.208637662750079, + "grad_norm": 30579.66015625, + "learning_rate": 1.3958399491902191e-05, + "loss": 0.7097, + "step": 22700 + }, + { + "epoch": 7.224515719275961, + "grad_norm": 29726.57421875, + "learning_rate": 1.3879009209272786e-05, + "loss": 0.7095, + "step": 22750 + }, + { + "epoch": 7.2403937758018415, + "grad_norm": 30103.4921875, + "learning_rate": 1.3799618926643379e-05, + "loss": 0.7108, + "step": 22800 + }, + { + "epoch": 7.256271832327723, + "grad_norm": 29993.138671875, + "learning_rate": 1.3720228644013974e-05, + "loss": 0.709, + "step": 22850 + }, + { + "epoch": 7.272149888853605, + "grad_norm": 30139.607421875, + "learning_rate": 1.3640838361384566e-05, + "loss": 0.7087, + "step": 22900 + }, + { + "epoch": 7.288027945379485, + "grad_norm": 29735.54296875, + "learning_rate": 1.3561448078755162e-05, + "loss": 0.7094, + "step": 22950 + }, + { + "epoch": 7.303906001905367, + "grad_norm": 29745.779296875, + "learning_rate": 1.3482057796125755e-05, + "loss": 0.7102, + "step": 23000 + }, + { + "epoch": 7.319784058431248, + "grad_norm": 30882.021484375, + "learning_rate": 1.3402667513496348e-05, + "loss": 0.7108, + "step": 23050 + }, + { + "epoch": 7.335662114957129, + "grad_norm": 29850.26171875, + "learning_rate": 1.3323277230866943e-05, + "loss": 0.7105, + "step": 23100 + }, + { + "epoch": 7.351540171483011, + "grad_norm": 30374.646484375, + "learning_rate": 1.3243886948237535e-05, + "loss": 0.7119, + "step": 23150 + }, + { + "epoch": 7.367418228008892, + "grad_norm": 29295.6640625, + "learning_rate": 1.316449666560813e-05, + "loss": 0.7082, + "step": 23200 + }, + { + "epoch": 7.383296284534773, + "grad_norm": 30273.5703125, + "learning_rate": 1.3085106382978723e-05, + "loss": 0.7087, + "step": 23250 + }, + { + "epoch": 7.399174341060654, + "grad_norm": 30317.859375, + "learning_rate": 1.3005716100349318e-05, + "loss": 0.7079, + "step": 23300 + }, + { + "epoch": 7.4150523975865354, + "grad_norm": 29483.0, + "learning_rate": 1.2926325817719912e-05, + "loss": 0.7065, + "step": 23350 + }, + { + "epoch": 7.430930454112417, + "grad_norm": 29725.1875, + "learning_rate": 1.2846935535090508e-05, + "loss": 0.7103, + "step": 23400 + }, + { + "epoch": 7.446808510638298, + "grad_norm": 29491.5234375, + "learning_rate": 1.27675452524611e-05, + "loss": 0.7102, + "step": 23450 + }, + { + "epoch": 7.462686567164179, + "grad_norm": 30156.08984375, + "learning_rate": 1.2688154969831692e-05, + "loss": 0.7093, + "step": 23500 + }, + { + "epoch": 7.47856462369006, + "grad_norm": 29961.205078125, + "learning_rate": 1.2608764687202287e-05, + "loss": 0.7082, + "step": 23550 + }, + { + "epoch": 7.494442680215942, + "grad_norm": 29865.1640625, + "learning_rate": 1.252937440457288e-05, + "loss": 0.7043, + "step": 23600 + }, + { + "epoch": 7.510320736741823, + "grad_norm": 30199.93359375, + "learning_rate": 1.2449984121943473e-05, + "loss": 0.7076, + "step": 23650 + }, + { + "epoch": 7.526198793267704, + "grad_norm": 30157.5, + "learning_rate": 1.2370593839314069e-05, + "loss": 0.7079, + "step": 23700 + }, + { + "epoch": 7.5420768497935855, + "grad_norm": 29760.25390625, + "learning_rate": 1.2291203556684663e-05, + "loss": 0.7048, + "step": 23750 + }, + { + "epoch": 7.557954906319466, + "grad_norm": 30412.80078125, + "learning_rate": 1.2211813274055256e-05, + "loss": 0.7067, + "step": 23800 + }, + { + "epoch": 7.573832962845348, + "grad_norm": 29912.32421875, + "learning_rate": 1.213242299142585e-05, + "loss": 0.7092, + "step": 23850 + }, + { + "epoch": 7.589711019371229, + "grad_norm": 30394.833984375, + "learning_rate": 1.2053032708796444e-05, + "loss": 0.7085, + "step": 23900 + }, + { + "epoch": 7.60558907589711, + "grad_norm": 30330.65625, + "learning_rate": 1.1973642426167038e-05, + "loss": 0.7055, + "step": 23950 + }, + { + "epoch": 7.621467132422992, + "grad_norm": 30122.162109375, + "learning_rate": 1.1894252143537632e-05, + "loss": 0.7078, + "step": 24000 + }, + { + "epoch": 7.637345188948872, + "grad_norm": 29642.91796875, + "learning_rate": 1.1814861860908225e-05, + "loss": 0.7062, + "step": 24050 + }, + { + "epoch": 7.653223245474754, + "grad_norm": 30002.765625, + "learning_rate": 1.173547157827882e-05, + "loss": 0.7054, + "step": 24100 + }, + { + "epoch": 7.669101302000636, + "grad_norm": 29660.470703125, + "learning_rate": 1.1656081295649413e-05, + "loss": 0.7048, + "step": 24150 + }, + { + "epoch": 7.684979358526516, + "grad_norm": 30191.8828125, + "learning_rate": 1.1576691013020007e-05, + "loss": 0.7085, + "step": 24200 + }, + { + "epoch": 7.700857415052398, + "grad_norm": 29762.623046875, + "learning_rate": 1.14973007303906e-05, + "loss": 0.7102, + "step": 24250 + }, + { + "epoch": 7.7167354715782785, + "grad_norm": 29867.5625, + "learning_rate": 1.1417910447761194e-05, + "loss": 0.704, + "step": 24300 + }, + { + "epoch": 7.73261352810416, + "grad_norm": 30145.470703125, + "learning_rate": 1.1338520165131788e-05, + "loss": 0.7043, + "step": 24350 + }, + { + "epoch": 7.748491584630042, + "grad_norm": 29339.95703125, + "learning_rate": 1.1259129882502382e-05, + "loss": 0.7092, + "step": 24400 + }, + { + "epoch": 7.764369641155922, + "grad_norm": 30186.40234375, + "learning_rate": 1.1179739599872976e-05, + "loss": 0.7063, + "step": 24450 + }, + { + "epoch": 7.780247697681804, + "grad_norm": 30043.974609375, + "learning_rate": 1.110034931724357e-05, + "loss": 0.7053, + "step": 24500 + }, + { + "epoch": 7.796125754207685, + "grad_norm": 30453.666015625, + "learning_rate": 1.1020959034614164e-05, + "loss": 0.7066, + "step": 24550 + }, + { + "epoch": 7.812003810733566, + "grad_norm": 30049.93359375, + "learning_rate": 1.0941568751984757e-05, + "loss": 0.71, + "step": 24600 + }, + { + "epoch": 7.827881867259448, + "grad_norm": 30561.3203125, + "learning_rate": 1.0862178469355351e-05, + "loss": 0.7065, + "step": 24650 + }, + { + "epoch": 7.843759923785329, + "grad_norm": 30853.423828125, + "learning_rate": 1.0782788186725945e-05, + "loss": 0.706, + "step": 24700 + }, + { + "epoch": 7.85963798031121, + "grad_norm": 29456.958984375, + "learning_rate": 1.0703397904096539e-05, + "loss": 0.7093, + "step": 24750 + }, + { + "epoch": 7.875516036837091, + "grad_norm": 31488.560546875, + "learning_rate": 1.0624007621467133e-05, + "loss": 0.7063, + "step": 24800 + }, + { + "epoch": 7.8913940933629725, + "grad_norm": 30248.27734375, + "learning_rate": 1.0544617338837726e-05, + "loss": 0.7062, + "step": 24850 + }, + { + "epoch": 7.907272149888854, + "grad_norm": 29547.48828125, + "learning_rate": 1.046522705620832e-05, + "loss": 0.7069, + "step": 24900 + }, + { + "epoch": 7.923150206414735, + "grad_norm": 29567.212890625, + "learning_rate": 1.0385836773578916e-05, + "loss": 0.7059, + "step": 24950 + }, + { + "epoch": 7.939028262940616, + "grad_norm": 30756.359375, + "learning_rate": 1.0306446490949508e-05, + "loss": 0.7056, + "step": 25000 + }, + { + "epoch": 7.954906319466497, + "grad_norm": 29843.1953125, + "learning_rate": 1.0227056208320102e-05, + "loss": 0.7058, + "step": 25050 + }, + { + "epoch": 7.970784375992379, + "grad_norm": 30317.7734375, + "learning_rate": 1.0147665925690695e-05, + "loss": 0.7062, + "step": 25100 + }, + { + "epoch": 7.98666243251826, + "grad_norm": 29659.53125, + "learning_rate": 1.006827564306129e-05, + "loss": 0.703, + "step": 25150 + }, + { + "epoch": 8.002540489044142, + "grad_norm": 30429.38671875, + "learning_rate": 9.988885360431883e-06, + "loss": 0.7054, + "step": 25200 + }, + { + "epoch": 8.018418545570022, + "grad_norm": 31551.69140625, + "learning_rate": 9.909495077802477e-06, + "loss": 0.7015, + "step": 25250 + }, + { + "epoch": 8.034296602095903, + "grad_norm": 30067.037109375, + "learning_rate": 9.83010479517307e-06, + "loss": 0.7018, + "step": 25300 + }, + { + "epoch": 8.050174658621785, + "grad_norm": 30266.275390625, + "learning_rate": 9.750714512543666e-06, + "loss": 0.704, + "step": 25350 + }, + { + "epoch": 8.066052715147666, + "grad_norm": 30958.685546875, + "learning_rate": 9.67132422991426e-06, + "loss": 0.7018, + "step": 25400 + }, + { + "epoch": 8.081930771673548, + "grad_norm": 30467.9921875, + "learning_rate": 9.591933947284852e-06, + "loss": 0.7016, + "step": 25450 + }, + { + "epoch": 8.097808828199428, + "grad_norm": 30163.07421875, + "learning_rate": 9.512543664655446e-06, + "loss": 0.7042, + "step": 25500 + }, + { + "epoch": 8.11368688472531, + "grad_norm": 31141.931640625, + "learning_rate": 9.43315338202604e-06, + "loss": 0.6998, + "step": 25550 + }, + { + "epoch": 8.129564941251191, + "grad_norm": 30357.98828125, + "learning_rate": 9.353763099396634e-06, + "loss": 0.7008, + "step": 25600 + }, + { + "epoch": 8.145442997777073, + "grad_norm": 30259.6171875, + "learning_rate": 9.274372816767227e-06, + "loss": 0.7012, + "step": 25650 + }, + { + "epoch": 8.161321054302954, + "grad_norm": 29641.35546875, + "learning_rate": 9.194982534137823e-06, + "loss": 0.6998, + "step": 25700 + }, + { + "epoch": 8.177199110828834, + "grad_norm": 30559.46875, + "learning_rate": 9.115592251508417e-06, + "loss": 0.706, + "step": 25750 + }, + { + "epoch": 8.193077167354716, + "grad_norm": 30127.62890625, + "learning_rate": 9.03620196887901e-06, + "loss": 0.7063, + "step": 25800 + }, + { + "epoch": 8.208955223880597, + "grad_norm": 30027.09375, + "learning_rate": 8.956811686249604e-06, + "loss": 0.7025, + "step": 25850 + }, + { + "epoch": 8.224833280406479, + "grad_norm": 30069.939453125, + "learning_rate": 8.877421403620196e-06, + "loss": 0.7002, + "step": 25900 + }, + { + "epoch": 8.24071133693236, + "grad_norm": 30299.6171875, + "learning_rate": 8.79803112099079e-06, + "loss": 0.6996, + "step": 25950 + }, + { + "epoch": 8.25658939345824, + "grad_norm": 30868.3671875, + "learning_rate": 8.718640838361384e-06, + "loss": 0.7012, + "step": 26000 + }, + { + "epoch": 8.272467449984122, + "grad_norm": 30317.115234375, + "learning_rate": 8.639250555731978e-06, + "loss": 0.7003, + "step": 26050 + }, + { + "epoch": 8.288345506510003, + "grad_norm": 30630.19921875, + "learning_rate": 8.559860273102573e-06, + "loss": 0.7023, + "step": 26100 + }, + { + "epoch": 8.304223563035885, + "grad_norm": 31124.72265625, + "learning_rate": 8.480469990473167e-06, + "loss": 0.6989, + "step": 26150 + }, + { + "epoch": 8.320101619561765, + "grad_norm": 30239.1875, + "learning_rate": 8.401079707843761e-06, + "loss": 0.7013, + "step": 26200 + }, + { + "epoch": 8.335979676087646, + "grad_norm": 30741.263671875, + "learning_rate": 8.321689425214355e-06, + "loss": 0.7029, + "step": 26250 + }, + { + "epoch": 8.351857732613528, + "grad_norm": 29795.580078125, + "learning_rate": 8.242299142584949e-06, + "loss": 0.6995, + "step": 26300 + }, + { + "epoch": 8.36773578913941, + "grad_norm": 30709.451171875, + "learning_rate": 8.16290885995554e-06, + "loss": 0.7009, + "step": 26350 + }, + { + "epoch": 8.383613845665291, + "grad_norm": 30076.46875, + "learning_rate": 8.083518577326135e-06, + "loss": 0.6988, + "step": 26400 + }, + { + "epoch": 8.399491902191173, + "grad_norm": 29763.3828125, + "learning_rate": 8.004128294696728e-06, + "loss": 0.7028, + "step": 26450 + }, + { + "epoch": 8.415369958717053, + "grad_norm": 30590.60546875, + "learning_rate": 7.924738012067324e-06, + "loss": 0.7067, + "step": 26500 + }, + { + "epoch": 8.431248015242934, + "grad_norm": 30087.3671875, + "learning_rate": 7.845347729437918e-06, + "loss": 0.702, + "step": 26550 + }, + { + "epoch": 8.447126071768816, + "grad_norm": 30778.0859375, + "learning_rate": 7.765957446808511e-06, + "loss": 0.7028, + "step": 26600 + }, + { + "epoch": 8.463004128294697, + "grad_norm": 30542.35546875, + "learning_rate": 7.686567164179105e-06, + "loss": 0.6996, + "step": 26650 + }, + { + "epoch": 8.478882184820577, + "grad_norm": 30533.51171875, + "learning_rate": 7.607176881549699e-06, + "loss": 0.7023, + "step": 26700 + }, + { + "epoch": 8.494760241346459, + "grad_norm": 30710.169921875, + "learning_rate": 7.527786598920293e-06, + "loss": 0.6998, + "step": 26750 + }, + { + "epoch": 8.51063829787234, + "grad_norm": 30740.416015625, + "learning_rate": 7.448396316290886e-06, + "loss": 0.7036, + "step": 26800 + }, + { + "epoch": 8.526516354398222, + "grad_norm": 29898.5390625, + "learning_rate": 7.36900603366148e-06, + "loss": 0.7007, + "step": 26850 + }, + { + "epoch": 8.542394410924103, + "grad_norm": 30001.548828125, + "learning_rate": 7.2896157510320734e-06, + "loss": 0.6958, + "step": 26900 + }, + { + "epoch": 8.558272467449985, + "grad_norm": 30819.171875, + "learning_rate": 7.210225468402667e-06, + "loss": 0.6976, + "step": 26950 + }, + { + "epoch": 8.574150523975865, + "grad_norm": 30720.3125, + "learning_rate": 7.130835185773262e-06, + "loss": 0.6985, + "step": 27000 + }, + { + "epoch": 8.590028580501746, + "grad_norm": 31220.265625, + "learning_rate": 7.051444903143856e-06, + "loss": 0.7008, + "step": 27050 + }, + { + "epoch": 8.605906637027628, + "grad_norm": 31095.111328125, + "learning_rate": 6.9720546205144495e-06, + "loss": 0.7011, + "step": 27100 + }, + { + "epoch": 8.62178469355351, + "grad_norm": 30474.576171875, + "learning_rate": 6.892664337885043e-06, + "loss": 0.6994, + "step": 27150 + }, + { + "epoch": 8.63766275007939, + "grad_norm": 30038.474609375, + "learning_rate": 6.813274055255638e-06, + "loss": 0.7002, + "step": 27200 + }, + { + "epoch": 8.653540806605271, + "grad_norm": 30336.1953125, + "learning_rate": 6.73388377262623e-06, + "loss": 0.6994, + "step": 27250 + }, + { + "epoch": 8.669418863131153, + "grad_norm": 30122.53125, + "learning_rate": 6.654493489996824e-06, + "loss": 0.7004, + "step": 27300 + }, + { + "epoch": 8.685296919657034, + "grad_norm": 30615.431640625, + "learning_rate": 6.575103207367419e-06, + "loss": 0.7021, + "step": 27350 + }, + { + "epoch": 8.701174976182916, + "grad_norm": 30339.583984375, + "learning_rate": 6.495712924738012e-06, + "loss": 0.7007, + "step": 27400 + }, + { + "epoch": 8.717053032708796, + "grad_norm": 30189.884765625, + "learning_rate": 6.416322642108606e-06, + "loss": 0.6986, + "step": 27450 + }, + { + "epoch": 8.732931089234677, + "grad_norm": 30549.412109375, + "learning_rate": 6.3369323594792e-06, + "loss": 0.6996, + "step": 27500 + }, + { + "epoch": 8.748809145760559, + "grad_norm": 30457.353515625, + "learning_rate": 6.257542076849794e-06, + "loss": 0.6991, + "step": 27550 + }, + { + "epoch": 8.76468720228644, + "grad_norm": 31163.732421875, + "learning_rate": 6.178151794220388e-06, + "loss": 0.702, + "step": 27600 + }, + { + "epoch": 8.780565258812322, + "grad_norm": 29878.865234375, + "learning_rate": 6.0987615115909814e-06, + "loss": 0.6999, + "step": 27650 + }, + { + "epoch": 8.796443315338202, + "grad_norm": 30267.07421875, + "learning_rate": 6.019371228961575e-06, + "loss": 0.7014, + "step": 27700 + }, + { + "epoch": 8.812321371864083, + "grad_norm": 30354.353515625, + "learning_rate": 5.939980946332169e-06, + "loss": 0.6982, + "step": 27750 + }, + { + "epoch": 8.828199428389965, + "grad_norm": 30582.365234375, + "learning_rate": 5.860590663702763e-06, + "loss": 0.7002, + "step": 27800 + }, + { + "epoch": 8.844077484915847, + "grad_norm": 30702.970703125, + "learning_rate": 5.781200381073357e-06, + "loss": 0.6975, + "step": 27850 + }, + { + "epoch": 8.859955541441728, + "grad_norm": 31331.51953125, + "learning_rate": 5.7018100984439505e-06, + "loss": 0.6966, + "step": 27900 + }, + { + "epoch": 8.875833597967608, + "grad_norm": 31188.05859375, + "learning_rate": 5.622419815814544e-06, + "loss": 0.6976, + "step": 27950 + }, + { + "epoch": 8.89171165449349, + "grad_norm": 30835.2578125, + "learning_rate": 5.543029533185139e-06, + "loss": 0.6985, + "step": 28000 + }, + { + "epoch": 8.907589711019371, + "grad_norm": 30251.89453125, + "learning_rate": 5.463639250555732e-06, + "loss": 0.7002, + "step": 28050 + }, + { + "epoch": 8.923467767545253, + "grad_norm": 30508.73828125, + "learning_rate": 5.384248967926326e-06, + "loss": 0.6986, + "step": 28100 + }, + { + "epoch": 8.939345824071134, + "grad_norm": 30640.4375, + "learning_rate": 5.3048586852969195e-06, + "loss": 0.7019, + "step": 28150 + }, + { + "epoch": 8.955223880597014, + "grad_norm": 31159.490234375, + "learning_rate": 5.225468402667514e-06, + "loss": 0.6987, + "step": 28200 + }, + { + "epoch": 8.971101937122896, + "grad_norm": 30732.70703125, + "learning_rate": 5.146078120038107e-06, + "loss": 0.6996, + "step": 28250 + }, + { + "epoch": 8.986979993648777, + "grad_norm": 30562.67578125, + "learning_rate": 5.066687837408701e-06, + "loss": 0.7028, + "step": 28300 + }, + { + "epoch": 9.002858050174659, + "grad_norm": 31258.55078125, + "learning_rate": 4.987297554779296e-06, + "loss": 0.6952, + "step": 28350 + }, + { + "epoch": 9.01873610670054, + "grad_norm": 30664.671875, + "learning_rate": 4.9079072721498894e-06, + "loss": 0.6945, + "step": 28400 + }, + { + "epoch": 9.03461416322642, + "grad_norm": 31010.13671875, + "learning_rate": 4.828516989520483e-06, + "loss": 0.6963, + "step": 28450 + }, + { + "epoch": 9.050492219752302, + "grad_norm": 31770.1484375, + "learning_rate": 4.749126706891076e-06, + "loss": 0.6951, + "step": 28500 + }, + { + "epoch": 9.066370276278183, + "grad_norm": 30270.96484375, + "learning_rate": 4.669736424261671e-06, + "loss": 0.6962, + "step": 28550 + }, + { + "epoch": 9.082248332804065, + "grad_norm": 31095.724609375, + "learning_rate": 4.590346141632265e-06, + "loss": 0.6951, + "step": 28600 + }, + { + "epoch": 9.098126389329947, + "grad_norm": 30564.689453125, + "learning_rate": 4.5109558590028585e-06, + "loss": 0.6958, + "step": 28650 + }, + { + "epoch": 9.114004445855826, + "grad_norm": 30729.578125, + "learning_rate": 4.4315655763734514e-06, + "loss": 0.6965, + "step": 28700 + }, + { + "epoch": 9.129882502381708, + "grad_norm": 30804.77734375, + "learning_rate": 4.352175293744046e-06, + "loss": 0.6987, + "step": 28750 + }, + { + "epoch": 9.14576055890759, + "grad_norm": 30098.283203125, + "learning_rate": 4.27278501111464e-06, + "loss": 0.693, + "step": 28800 + }, + { + "epoch": 9.161638615433471, + "grad_norm": 31217.767578125, + "learning_rate": 4.193394728485234e-06, + "loss": 0.6952, + "step": 28850 + }, + { + "epoch": 9.177516671959353, + "grad_norm": 30664.6953125, + "learning_rate": 4.1140044458558275e-06, + "loss": 0.6961, + "step": 28900 + }, + { + "epoch": 9.193394728485233, + "grad_norm": 30483.142578125, + "learning_rate": 4.034614163226421e-06, + "loss": 0.6929, + "step": 28950 + }, + { + "epoch": 9.209272785011114, + "grad_norm": 30538.34765625, + "learning_rate": 3.955223880597015e-06, + "loss": 0.6964, + "step": 29000 + }, + { + "epoch": 9.225150841536996, + "grad_norm": 31039.91796875, + "learning_rate": 3.875833597967609e-06, + "loss": 0.6946, + "step": 29050 + }, + { + "epoch": 9.241028898062877, + "grad_norm": 30256.009765625, + "learning_rate": 3.796443315338203e-06, + "loss": 0.6946, + "step": 29100 + }, + { + "epoch": 9.256906954588759, + "grad_norm": 31257.78125, + "learning_rate": 3.717053032708796e-06, + "loss": 0.6942, + "step": 29150 + }, + { + "epoch": 9.272785011114639, + "grad_norm": 30951.634765625, + "learning_rate": 3.6376627500793904e-06, + "loss": 0.6944, + "step": 29200 + }, + { + "epoch": 9.28866306764052, + "grad_norm": 30461.291015625, + "learning_rate": 3.558272467449984e-06, + "loss": 0.6942, + "step": 29250 + }, + { + "epoch": 9.304541124166402, + "grad_norm": 30459.7109375, + "learning_rate": 3.4788821848205784e-06, + "loss": 0.699, + "step": 29300 + }, + { + "epoch": 9.320419180692284, + "grad_norm": 30297.685546875, + "learning_rate": 3.3994919021911722e-06, + "loss": 0.6909, + "step": 29350 + }, + { + "epoch": 9.336297237218165, + "grad_norm": 30345.24609375, + "learning_rate": 3.3201016195617656e-06, + "loss": 0.6962, + "step": 29400 + }, + { + "epoch": 9.352175293744045, + "grad_norm": 30085.6484375, + "learning_rate": 3.2407113369323594e-06, + "loss": 0.6957, + "step": 29450 + }, + { + "epoch": 9.368053350269927, + "grad_norm": 31095.689453125, + "learning_rate": 3.1613210543029537e-06, + "loss": 0.698, + "step": 29500 + }, + { + "epoch": 9.383931406795808, + "grad_norm": 30798.33203125, + "learning_rate": 3.081930771673547e-06, + "loss": 0.6938, + "step": 29550 + }, + { + "epoch": 9.39980946332169, + "grad_norm": 30953.390625, + "learning_rate": 3.0025404890441413e-06, + "loss": 0.6934, + "step": 29600 + }, + { + "epoch": 9.415687519847571, + "grad_norm": 30614.728515625, + "learning_rate": 2.9231502064147347e-06, + "loss": 0.6936, + "step": 29650 + }, + { + "epoch": 9.431565576373451, + "grad_norm": 30555.818359375, + "learning_rate": 2.843759923785329e-06, + "loss": 0.6925, + "step": 29700 + }, + { + "epoch": 9.447443632899333, + "grad_norm": 30385.14453125, + "learning_rate": 2.7643696411559227e-06, + "loss": 0.6989, + "step": 29750 + }, + { + "epoch": 9.463321689425214, + "grad_norm": 30924.46875, + "learning_rate": 2.6849793585265165e-06, + "loss": 0.697, + "step": 29800 + }, + { + "epoch": 9.479199745951096, + "grad_norm": 31371.728515625, + "learning_rate": 2.6055890758971103e-06, + "loss": 0.6955, + "step": 29850 + }, + { + "epoch": 9.495077802476978, + "grad_norm": 30738.732421875, + "learning_rate": 2.526198793267704e-06, + "loss": 0.6945, + "step": 29900 + }, + { + "epoch": 9.510955859002857, + "grad_norm": 30800.96484375, + "learning_rate": 2.446808510638298e-06, + "loss": 0.6951, + "step": 29950 + }, + { + "epoch": 9.526833915528739, + "grad_norm": 30367.607421875, + "learning_rate": 2.3674182280088918e-06, + "loss": 0.6975, + "step": 30000 + }, + { + "epoch": 9.54271197205462, + "grad_norm": 30162.599609375, + "learning_rate": 2.2880279453794856e-06, + "loss": 0.6949, + "step": 30050 + }, + { + "epoch": 9.558590028580502, + "grad_norm": 30217.197265625, + "learning_rate": 2.2086376627500794e-06, + "loss": 0.6921, + "step": 30100 + }, + { + "epoch": 9.574468085106384, + "grad_norm": 30000.412109375, + "learning_rate": 2.129247380120673e-06, + "loss": 0.6967, + "step": 30150 + }, + { + "epoch": 9.590346141632264, + "grad_norm": 30676.177734375, + "learning_rate": 2.0498570974912674e-06, + "loss": 0.6957, + "step": 30200 + }, + { + "epoch": 9.606224198158145, + "grad_norm": 30816.064453125, + "learning_rate": 1.970466814861861e-06, + "loss": 0.6944, + "step": 30250 + }, + { + "epoch": 9.622102254684027, + "grad_norm": 30859.091796875, + "learning_rate": 1.891076532232455e-06, + "loss": 0.695, + "step": 30300 + }, + { + "epoch": 9.637980311209908, + "grad_norm": 30824.66015625, + "learning_rate": 1.8116862496030486e-06, + "loss": 0.6955, + "step": 30350 + }, + { + "epoch": 9.65385836773579, + "grad_norm": 31179.345703125, + "learning_rate": 1.7322959669736427e-06, + "loss": 0.692, + "step": 30400 + }, + { + "epoch": 9.66973642426167, + "grad_norm": 29985.033203125, + "learning_rate": 1.6529056843442363e-06, + "loss": 0.6961, + "step": 30450 + }, + { + "epoch": 9.685614480787551, + "grad_norm": 29948.798828125, + "learning_rate": 1.5735154017148303e-06, + "loss": 0.6948, + "step": 30500 + }, + { + "epoch": 9.701492537313433, + "grad_norm": 30504.94140625, + "learning_rate": 1.494125119085424e-06, + "loss": 0.6955, + "step": 30550 + }, + { + "epoch": 9.717370593839314, + "grad_norm": 30339.3515625, + "learning_rate": 1.414734836456018e-06, + "loss": 0.6962, + "step": 30600 + }, + { + "epoch": 9.733248650365196, + "grad_norm": 30103.78125, + "learning_rate": 1.3353445538266117e-06, + "loss": 0.6953, + "step": 30650 + }, + { + "epoch": 9.749126706891076, + "grad_norm": 30390.306640625, + "learning_rate": 1.2559542711972055e-06, + "loss": 0.6962, + "step": 30700 + }, + { + "epoch": 9.765004763416957, + "grad_norm": 30574.888671875, + "learning_rate": 1.1765639885677993e-06, + "loss": 0.6963, + "step": 30750 + }, + { + "epoch": 9.780882819942839, + "grad_norm": 30706.4765625, + "learning_rate": 1.0971737059383931e-06, + "loss": 0.6928, + "step": 30800 + }, + { + "epoch": 9.79676087646872, + "grad_norm": 29788.431640625, + "learning_rate": 1.0177834233089872e-06, + "loss": 0.6918, + "step": 30850 + }, + { + "epoch": 9.812638932994602, + "grad_norm": 30614.6875, + "learning_rate": 9.383931406795809e-07, + "loss": 0.6931, + "step": 30900 + }, + { + "epoch": 9.828516989520482, + "grad_norm": 30468.765625, + "learning_rate": 8.590028580501748e-07, + "loss": 0.6976, + "step": 30950 + }, + { + "epoch": 9.844395046046364, + "grad_norm": 29652.828125, + "learning_rate": 7.796125754207686e-07, + "loss": 0.6938, + "step": 31000 + }, + { + "epoch": 9.860273102572245, + "grad_norm": 31199.251953125, + "learning_rate": 7.002222927913624e-07, + "loss": 0.6965, + "step": 31050 + }, + { + "epoch": 9.876151159098127, + "grad_norm": 30488.6796875, + "learning_rate": 6.208320101619562e-07, + "loss": 0.6938, + "step": 31100 + }, + { + "epoch": 9.892029215624008, + "grad_norm": 30915.556640625, + "learning_rate": 5.4144172753255e-07, + "loss": 0.6951, + "step": 31150 + }, + { + "epoch": 9.907907272149888, + "grad_norm": 30898.82421875, + "learning_rate": 4.620514449031439e-07, + "loss": 0.6929, + "step": 31200 + }, + { + "epoch": 9.92378532867577, + "grad_norm": 31000.388671875, + "learning_rate": 3.826611622737377e-07, + "loss": 0.6931, + "step": 31250 + }, + { + "epoch": 9.939663385201651, + "grad_norm": 30619.94140625, + "learning_rate": 3.0327087964433156e-07, + "loss": 0.6963, + "step": 31300 + }, + { + "epoch": 9.955541441727533, + "grad_norm": 30345.3515625, + "learning_rate": 2.238805970149254e-07, + "loss": 0.6932, + "step": 31350 + }, + { + "epoch": 9.971419498253415, + "grad_norm": 31431.78515625, + "learning_rate": 1.444903143855192e-07, + "loss": 0.6936, + "step": 31400 + }, + { + "epoch": 9.987297554779294, + "grad_norm": 30420.556640625, + "learning_rate": 6.510003175611305e-08, + "loss": 0.6958, + "step": 31450 + }, + { + "epoch": 10.0, + "step": 31490, + "total_flos": 1.05298598559744e+18, + "train_loss": 0.8101965235240424, + "train_runtime": 86746.076, + "train_samples_per_second": 23.228, + "train_steps_per_second": 0.363 + }, + { + "epoch": 10.003175611305176, + "grad_norm": 34598.17578125, + "learning_rate": 4.9992854874563356e-05, + "loss": 0.7254, + "step": 31500 + }, + { + "epoch": 10.019053667831058, + "grad_norm": 32862.16015625, + "learning_rate": 4.995315973324865e-05, + "loss": 0.7148, + "step": 31550 + }, + { + "epoch": 10.03493172435694, + "grad_norm": 34942.2890625, + "learning_rate": 4.991346459193395e-05, + "loss": 0.7129, + "step": 31600 + }, + { + "epoch": 10.05080978088282, + "grad_norm": 32553.541015625, + "learning_rate": 4.987376945061924e-05, + "loss": 0.7192, + "step": 31650 + }, + { + "epoch": 10.0666878374087, + "grad_norm": 32205.275390625, + "learning_rate": 4.983407430930454e-05, + "loss": 0.719, + "step": 31700 + }, + { + "epoch": 10.082565893934582, + "grad_norm": 32618.0078125, + "learning_rate": 4.9794379167989844e-05, + "loss": 0.7201, + "step": 31750 + }, + { + "epoch": 10.098443950460464, + "grad_norm": 31251.701171875, + "learning_rate": 4.975468402667514e-05, + "loss": 0.7209, + "step": 31800 + }, + { + "epoch": 10.114322006986345, + "grad_norm": 30804.701171875, + "learning_rate": 4.971498888536044e-05, + "loss": 0.7199, + "step": 31850 + }, + { + "epoch": 10.130200063512227, + "grad_norm": 31691.65234375, + "learning_rate": 4.967529374404573e-05, + "loss": 0.7242, + "step": 31900 + }, + { + "epoch": 10.146078120038107, + "grad_norm": 31379.8046875, + "learning_rate": 4.9635598602731024e-05, + "loss": 0.7244, + "step": 31950 + }, + { + "epoch": 10.161956176563988, + "grad_norm": 31711.45703125, + "learning_rate": 4.9595903461416325e-05, + "loss": 0.7231, + "step": 32000 + }, + { + "epoch": 10.17783423308987, + "grad_norm": 30201.376953125, + "learning_rate": 4.955620832010162e-05, + "loss": 0.7212, + "step": 32050 + }, + { + "epoch": 10.193712289615751, + "grad_norm": 30604.0234375, + "learning_rate": 4.951651317878692e-05, + "loss": 0.7217, + "step": 32100 + }, + { + "epoch": 10.209590346141633, + "grad_norm": 30721.6171875, + "learning_rate": 4.947681803747222e-05, + "loss": 0.7186, + "step": 32150 + }, + { + "epoch": 10.225468402667513, + "grad_norm": 30430.392578125, + "learning_rate": 4.943712289615751e-05, + "loss": 0.7247, + "step": 32200 + }, + { + "epoch": 10.241346459193394, + "grad_norm": 29437.306640625, + "learning_rate": 4.939742775484281e-05, + "loss": 0.7216, + "step": 32250 + }, + { + "epoch": 10.257224515719276, + "grad_norm": 31316.037109375, + "learning_rate": 4.9357732613528106e-05, + "loss": 0.7232, + "step": 32300 + }, + { + "epoch": 10.273102572245158, + "grad_norm": 30285.005859375, + "learning_rate": 4.93180374722134e-05, + "loss": 0.7232, + "step": 32350 + }, + { + "epoch": 10.28898062877104, + "grad_norm": 30771.517578125, + "learning_rate": 4.92783423308987e-05, + "loss": 0.7211, + "step": 32400 + }, + { + "epoch": 10.304858685296919, + "grad_norm": 31089.01953125, + "learning_rate": 4.9238647189584e-05, + "loss": 0.719, + "step": 32450 + }, + { + "epoch": 10.3207367418228, + "grad_norm": 29312.451171875, + "learning_rate": 4.9198952048269294e-05, + "loss": 0.7217, + "step": 32500 + }, + { + "epoch": 10.336614798348682, + "grad_norm": 29717.74609375, + "learning_rate": 4.9159256906954594e-05, + "loss": 0.7204, + "step": 32550 + }, + { + "epoch": 10.352492854874564, + "grad_norm": 29353.939453125, + "learning_rate": 4.911956176563989e-05, + "loss": 0.7209, + "step": 32600 + }, + { + "epoch": 10.368370911400445, + "grad_norm": 30583.625, + "learning_rate": 4.907986662432518e-05, + "loss": 0.719, + "step": 32650 + }, + { + "epoch": 10.384248967926325, + "grad_norm": 30315.1484375, + "learning_rate": 4.904017148301048e-05, + "loss": 0.7224, + "step": 32700 + }, + { + "epoch": 10.400127024452207, + "grad_norm": 28785.408203125, + "learning_rate": 4.9000476341695775e-05, + "loss": 0.7227, + "step": 32750 + }, + { + "epoch": 10.416005080978088, + "grad_norm": 29400.603515625, + "learning_rate": 4.8960781200381075e-05, + "loss": 0.7191, + "step": 32800 + }, + { + "epoch": 10.43188313750397, + "grad_norm": 28836.03125, + "learning_rate": 4.8921086059066375e-05, + "loss": 0.7191, + "step": 32850 + }, + { + "epoch": 10.447761194029852, + "grad_norm": 29112.181640625, + "learning_rate": 4.888139091775167e-05, + "loss": 0.7196, + "step": 32900 + }, + { + "epoch": 10.463639250555731, + "grad_norm": 29085.232421875, + "learning_rate": 4.884169577643697e-05, + "loss": 0.7202, + "step": 32950 + }, + { + "epoch": 10.479517307081613, + "grad_norm": 29157.4609375, + "learning_rate": 4.880200063512226e-05, + "loss": 0.7185, + "step": 33000 + }, + { + "epoch": 10.495395363607495, + "grad_norm": 29475.744140625, + "learning_rate": 4.8762305493807556e-05, + "loss": 0.7221, + "step": 33050 + }, + { + "epoch": 10.511273420133376, + "grad_norm": 29044.5703125, + "learning_rate": 4.8722610352492857e-05, + "loss": 0.7194, + "step": 33100 + }, + { + "epoch": 10.527151476659256, + "grad_norm": 28844.75390625, + "learning_rate": 4.868291521117816e-05, + "loss": 0.7178, + "step": 33150 + }, + { + "epoch": 10.543029533185138, + "grad_norm": 28828.00390625, + "learning_rate": 4.864322006986345e-05, + "loss": 0.7173, + "step": 33200 + }, + { + "epoch": 10.55890758971102, + "grad_norm": 28988.736328125, + "learning_rate": 4.860352492854875e-05, + "loss": 0.7163, + "step": 33250 + }, + { + "epoch": 10.5747856462369, + "grad_norm": 27665.376953125, + "learning_rate": 4.8563829787234044e-05, + "loss": 0.7177, + "step": 33300 + }, + { + "epoch": 10.590663702762782, + "grad_norm": 28359.263671875, + "learning_rate": 4.852413464591934e-05, + "loss": 0.7215, + "step": 33350 + }, + { + "epoch": 10.606541759288664, + "grad_norm": 28809.84375, + "learning_rate": 4.848443950460464e-05, + "loss": 0.7186, + "step": 33400 + }, + { + "epoch": 10.622419815814544, + "grad_norm": 27395.1171875, + "learning_rate": 4.844474436328993e-05, + "loss": 0.7172, + "step": 33450 + }, + { + "epoch": 10.638297872340425, + "grad_norm": 27581.38671875, + "learning_rate": 4.840504922197523e-05, + "loss": 0.721, + "step": 33500 + }, + { + "epoch": 10.654175928866307, + "grad_norm": 27007.13671875, + "learning_rate": 4.836535408066053e-05, + "loss": 0.7198, + "step": 33550 + }, + { + "epoch": 10.670053985392189, + "grad_norm": 28538.033203125, + "learning_rate": 4.8325658939345826e-05, + "loss": 0.717, + "step": 33600 + }, + { + "epoch": 10.685932041918068, + "grad_norm": 28231.92578125, + "learning_rate": 4.8285963798031126e-05, + "loss": 0.7173, + "step": 33650 + }, + { + "epoch": 10.70181009844395, + "grad_norm": 27481.81640625, + "learning_rate": 4.824626865671642e-05, + "loss": 0.7179, + "step": 33700 + }, + { + "epoch": 10.717688154969832, + "grad_norm": 29380.140625, + "learning_rate": 4.820657351540171e-05, + "loss": 0.7185, + "step": 33750 + }, + { + "epoch": 10.733566211495713, + "grad_norm": 27961.125, + "learning_rate": 4.816687837408701e-05, + "loss": 0.7158, + "step": 33800 + }, + { + "epoch": 10.749444268021595, + "grad_norm": 28789.4453125, + "learning_rate": 4.8127183232772314e-05, + "loss": 0.7184, + "step": 33850 + }, + { + "epoch": 10.765322324547476, + "grad_norm": 28526.916015625, + "learning_rate": 4.808748809145761e-05, + "loss": 0.719, + "step": 33900 + }, + { + "epoch": 10.781200381073356, + "grad_norm": 28261.802734375, + "learning_rate": 4.804779295014291e-05, + "loss": 0.7155, + "step": 33950 + }, + { + "epoch": 10.797078437599238, + "grad_norm": 28534.755859375, + "learning_rate": 4.80080978088282e-05, + "loss": 0.7169, + "step": 34000 + }, + { + "epoch": 10.81295649412512, + "grad_norm": 28207.771484375, + "learning_rate": 4.79684026675135e-05, + "loss": 0.713, + "step": 34050 + }, + { + "epoch": 10.828834550651, + "grad_norm": 27474.365234375, + "learning_rate": 4.7928707526198795e-05, + "loss": 0.7183, + "step": 34100 + }, + { + "epoch": 10.84471260717688, + "grad_norm": 28205.083984375, + "learning_rate": 4.788901238488409e-05, + "loss": 0.7189, + "step": 34150 + }, + { + "epoch": 10.860590663702762, + "grad_norm": 27502.6015625, + "learning_rate": 4.784931724356939e-05, + "loss": 0.7161, + "step": 34200 + }, + { + "epoch": 10.876468720228644, + "grad_norm": 27558.001953125, + "learning_rate": 4.780962210225469e-05, + "loss": 0.7159, + "step": 34250 + }, + { + "epoch": 10.892346776754525, + "grad_norm": 28145.595703125, + "learning_rate": 4.776992696093998e-05, + "loss": 0.7111, + "step": 34300 + }, + { + "epoch": 10.908224833280407, + "grad_norm": 27374.37109375, + "learning_rate": 4.773023181962528e-05, + "loss": 0.718, + "step": 34350 + }, + { + "epoch": 10.924102889806289, + "grad_norm": 27972.486328125, + "learning_rate": 4.7690536678310576e-05, + "loss": 0.7195, + "step": 34400 + }, + { + "epoch": 10.939980946332168, + "grad_norm": 27874.939453125, + "learning_rate": 4.765084153699587e-05, + "loss": 0.717, + "step": 34450 + }, + { + "epoch": 10.95585900285805, + "grad_norm": 27341.05859375, + "learning_rate": 4.761114639568117e-05, + "loss": 0.716, + "step": 34500 + }, + { + "epoch": 10.971737059383932, + "grad_norm": 28336.388671875, + "learning_rate": 4.757145125436647e-05, + "loss": 0.7177, + "step": 34550 + }, + { + "epoch": 10.987615115909813, + "grad_norm": 26653.798828125, + "learning_rate": 4.753175611305177e-05, + "loss": 0.7111, + "step": 34600 + }, + { + "epoch": 11.003493172435693, + "grad_norm": 27900.7265625, + "learning_rate": 4.7492060971737064e-05, + "loss": 0.7124, + "step": 34650 + }, + { + "epoch": 11.019371228961575, + "grad_norm": 27794.94921875, + "learning_rate": 4.745236583042236e-05, + "loss": 0.7103, + "step": 34700 + }, + { + "epoch": 11.035249285487456, + "grad_norm": 27183.654296875, + "learning_rate": 4.741267068910766e-05, + "loss": 0.7052, + "step": 34750 + }, + { + "epoch": 11.051127342013338, + "grad_norm": 27468.392578125, + "learning_rate": 4.737297554779295e-05, + "loss": 0.7084, + "step": 34800 + }, + { + "epoch": 11.06700539853922, + "grad_norm": 26358.43359375, + "learning_rate": 4.7333280406478245e-05, + "loss": 0.7038, + "step": 34850 + }, + { + "epoch": 11.0828834550651, + "grad_norm": 26982.095703125, + "learning_rate": 4.7293585265163545e-05, + "loss": 0.7023, + "step": 34900 + }, + { + "epoch": 11.09876151159098, + "grad_norm": 27181.15625, + "learning_rate": 4.7253890123848845e-05, + "loss": 0.7062, + "step": 34950 + }, + { + "epoch": 11.114639568116862, + "grad_norm": 27455.166015625, + "learning_rate": 4.721419498253414e-05, + "loss": 0.7088, + "step": 35000 + }, + { + "epoch": 11.130517624642744, + "grad_norm": 27479.68359375, + "learning_rate": 4.717449984121944e-05, + "loss": 0.707, + "step": 35050 + }, + { + "epoch": 11.146395681168626, + "grad_norm": 28365.30859375, + "learning_rate": 4.713480469990473e-05, + "loss": 0.7052, + "step": 35100 + }, + { + "epoch": 11.162273737694505, + "grad_norm": 27870.494140625, + "learning_rate": 4.7095109558590026e-05, + "loss": 0.7089, + "step": 35150 + }, + { + "epoch": 11.178151794220387, + "grad_norm": 27870.310546875, + "learning_rate": 4.7055414417275327e-05, + "loss": 0.7066, + "step": 35200 + }, + { + "epoch": 11.194029850746269, + "grad_norm": 28179.03125, + "learning_rate": 4.701571927596063e-05, + "loss": 0.7069, + "step": 35250 + }, + { + "epoch": 11.20990790727215, + "grad_norm": 27911.45703125, + "learning_rate": 4.697602413464593e-05, + "loss": 0.707, + "step": 35300 + }, + { + "epoch": 11.225785963798032, + "grad_norm": 26871.20703125, + "learning_rate": 4.693632899333122e-05, + "loss": 0.7073, + "step": 35350 + }, + { + "epoch": 11.241664020323912, + "grad_norm": 27323.912109375, + "learning_rate": 4.6896633852016514e-05, + "loss": 0.7095, + "step": 35400 + }, + { + "epoch": 11.257542076849793, + "grad_norm": 27162.650390625, + "learning_rate": 4.6856938710701815e-05, + "loss": 0.7047, + "step": 35450 + }, + { + "epoch": 11.273420133375675, + "grad_norm": 26953.41796875, + "learning_rate": 4.681724356938711e-05, + "loss": 0.7061, + "step": 35500 + }, + { + "epoch": 11.289298189901556, + "grad_norm": 26672.6953125, + "learning_rate": 4.67775484280724e-05, + "loss": 0.7059, + "step": 35550 + }, + { + "epoch": 11.305176246427438, + "grad_norm": 27072.08203125, + "learning_rate": 4.67378532867577e-05, + "loss": 0.7047, + "step": 35600 + }, + { + "epoch": 11.321054302953318, + "grad_norm": 26242.37109375, + "learning_rate": 4.6698158145443e-05, + "loss": 0.7042, + "step": 35650 + }, + { + "epoch": 11.3369323594792, + "grad_norm": 26593.7890625, + "learning_rate": 4.6658463004128296e-05, + "loss": 0.7022, + "step": 35700 + }, + { + "epoch": 11.352810416005081, + "grad_norm": 26419.05078125, + "learning_rate": 4.6618767862813596e-05, + "loss": 0.7077, + "step": 35750 + }, + { + "epoch": 11.368688472530962, + "grad_norm": 26334.150390625, + "learning_rate": 4.657907272149889e-05, + "loss": 0.7084, + "step": 35800 + }, + { + "epoch": 11.384566529056844, + "grad_norm": 26394.89453125, + "learning_rate": 4.653937758018419e-05, + "loss": 0.7064, + "step": 35850 + }, + { + "epoch": 11.400444585582724, + "grad_norm": 26715.767578125, + "learning_rate": 4.649968243886948e-05, + "loss": 0.7055, + "step": 35900 + }, + { + "epoch": 11.416322642108605, + "grad_norm": 27197.98828125, + "learning_rate": 4.645998729755478e-05, + "loss": 0.708, + "step": 35950 + }, + { + "epoch": 11.432200698634487, + "grad_norm": 27616.12109375, + "learning_rate": 4.6420292156240084e-05, + "loss": 0.708, + "step": 36000 + }, + { + "epoch": 11.448078755160369, + "grad_norm": 26575.294921875, + "learning_rate": 4.638059701492538e-05, + "loss": 0.7039, + "step": 36050 + }, + { + "epoch": 11.46395681168625, + "grad_norm": 26070.689453125, + "learning_rate": 4.634090187361067e-05, + "loss": 0.7075, + "step": 36100 + }, + { + "epoch": 11.47983486821213, + "grad_norm": 26675.865234375, + "learning_rate": 4.630120673229597e-05, + "loss": 0.7058, + "step": 36150 + }, + { + "epoch": 11.495712924738012, + "grad_norm": 26788.8671875, + "learning_rate": 4.6261511590981265e-05, + "loss": 0.7056, + "step": 36200 + }, + { + "epoch": 11.511590981263893, + "grad_norm": 26736.013671875, + "learning_rate": 4.622181644966656e-05, + "loss": 0.7049, + "step": 36250 + }, + { + "epoch": 11.527469037789775, + "grad_norm": 27259.62109375, + "learning_rate": 4.618212130835186e-05, + "loss": 0.7033, + "step": 36300 + }, + { + "epoch": 11.543347094315656, + "grad_norm": 26500.08984375, + "learning_rate": 4.614242616703716e-05, + "loss": 0.7055, + "step": 36350 + }, + { + "epoch": 11.559225150841536, + "grad_norm": 26374.705078125, + "learning_rate": 4.610273102572246e-05, + "loss": 0.7017, + "step": 36400 + }, + { + "epoch": 11.575103207367418, + "grad_norm": 27614.83984375, + "learning_rate": 4.606303588440775e-05, + "loss": 0.7059, + "step": 36450 + }, + { + "epoch": 11.5909812638933, + "grad_norm": 25694.1015625, + "learning_rate": 4.6023340743093046e-05, + "loss": 0.7048, + "step": 36500 + }, + { + "epoch": 11.606859320419181, + "grad_norm": 26899.56640625, + "learning_rate": 4.5983645601778346e-05, + "loss": 0.7045, + "step": 36550 + }, + { + "epoch": 11.622737376945063, + "grad_norm": 25786.490234375, + "learning_rate": 4.594395046046364e-05, + "loss": 0.7038, + "step": 36600 + }, + { + "epoch": 11.638615433470942, + "grad_norm": 26447.548828125, + "learning_rate": 4.5904255319148933e-05, + "loss": 0.7031, + "step": 36650 + }, + { + "epoch": 11.654493489996824, + "grad_norm": 25961.685546875, + "learning_rate": 4.5864560177834234e-05, + "loss": 0.7059, + "step": 36700 + }, + { + "epoch": 11.670371546522706, + "grad_norm": 26539.162109375, + "learning_rate": 4.5824865036519534e-05, + "loss": 0.7033, + "step": 36750 + }, + { + "epoch": 11.686249603048587, + "grad_norm": 26035.525390625, + "learning_rate": 4.578516989520483e-05, + "loss": 0.7057, + "step": 36800 + }, + { + "epoch": 11.702127659574469, + "grad_norm": 26614.791015625, + "learning_rate": 4.574547475389013e-05, + "loss": 0.7048, + "step": 36850 + }, + { + "epoch": 11.718005716100349, + "grad_norm": 26289.15234375, + "learning_rate": 4.570577961257542e-05, + "loss": 0.6975, + "step": 36900 + }, + { + "epoch": 11.73388377262623, + "grad_norm": 26606.12109375, + "learning_rate": 4.5666084471260715e-05, + "loss": 0.705, + "step": 36950 + }, + { + "epoch": 11.749761829152112, + "grad_norm": 25736.138671875, + "learning_rate": 4.5626389329946015e-05, + "loss": 0.7004, + "step": 37000 + }, + { + "epoch": 11.765639885677993, + "grad_norm": 26059.619140625, + "learning_rate": 4.5586694188631315e-05, + "loss": 0.705, + "step": 37050 + }, + { + "epoch": 11.781517942203875, + "grad_norm": 25117.2734375, + "learning_rate": 4.5546999047316616e-05, + "loss": 0.7088, + "step": 37100 + }, + { + "epoch": 11.797395998729755, + "grad_norm": 27240.82421875, + "learning_rate": 4.550730390600191e-05, + "loss": 0.7003, + "step": 37150 + }, + { + "epoch": 11.813274055255636, + "grad_norm": 26413.03125, + "learning_rate": 4.54676087646872e-05, + "loss": 0.7029, + "step": 37200 + }, + { + "epoch": 11.829152111781518, + "grad_norm": 25459.921875, + "learning_rate": 4.54279136233725e-05, + "loss": 0.6975, + "step": 37250 + }, + { + "epoch": 11.8450301683074, + "grad_norm": 26057.275390625, + "learning_rate": 4.5388218482057797e-05, + "loss": 0.6994, + "step": 37300 + }, + { + "epoch": 11.860908224833281, + "grad_norm": 25720.384765625, + "learning_rate": 4.534852334074309e-05, + "loss": 0.7072, + "step": 37350 + }, + { + "epoch": 11.876786281359161, + "grad_norm": 26672.6015625, + "learning_rate": 4.530882819942839e-05, + "loss": 0.7017, + "step": 37400 + }, + { + "epoch": 11.892664337885043, + "grad_norm": 25892.302734375, + "learning_rate": 4.526913305811369e-05, + "loss": 0.7031, + "step": 37450 + }, + { + "epoch": 11.908542394410924, + "grad_norm": 26435.544921875, + "learning_rate": 4.5229437916798984e-05, + "loss": 0.7011, + "step": 37500 + }, + { + "epoch": 11.924420450936806, + "grad_norm": 25747.921875, + "learning_rate": 4.5189742775484285e-05, + "loss": 0.6979, + "step": 37550 + }, + { + "epoch": 11.940298507462687, + "grad_norm": 25117.19140625, + "learning_rate": 4.515004763416958e-05, + "loss": 0.6996, + "step": 37600 + }, + { + "epoch": 11.956176563988567, + "grad_norm": 26094.021484375, + "learning_rate": 4.511035249285488e-05, + "loss": 0.703, + "step": 37650 + }, + { + "epoch": 11.972054620514449, + "grad_norm": 25621.94140625, + "learning_rate": 4.507065735154017e-05, + "loss": 0.7034, + "step": 37700 + }, + { + "epoch": 11.98793267704033, + "grad_norm": 25541.908203125, + "learning_rate": 4.503096221022547e-05, + "loss": 0.7004, + "step": 37750 + }, + { + "epoch": 12.003810733566212, + "grad_norm": 25723.466796875, + "learning_rate": 4.499126706891077e-05, + "loss": 0.6991, + "step": 37800 + }, + { + "epoch": 12.019688790092093, + "grad_norm": 26813.052734375, + "learning_rate": 4.4951571927596066e-05, + "loss": 0.696, + "step": 37850 + }, + { + "epoch": 12.035566846617973, + "grad_norm": 25913.0390625, + "learning_rate": 4.491187678628136e-05, + "loss": 0.6931, + "step": 37900 + }, + { + "epoch": 12.051444903143855, + "grad_norm": 25332.591796875, + "learning_rate": 4.487218164496666e-05, + "loss": 0.6939, + "step": 37950 + }, + { + "epoch": 12.067322959669736, + "grad_norm": 25602.212890625, + "learning_rate": 4.483248650365195e-05, + "loss": 0.6935, + "step": 38000 + }, + { + "epoch": 12.083201016195618, + "grad_norm": 26077.806640625, + "learning_rate": 4.479279136233725e-05, + "loss": 0.6946, + "step": 38050 + }, + { + "epoch": 12.0990790727215, + "grad_norm": 26038.025390625, + "learning_rate": 4.475309622102255e-05, + "loss": 0.6927, + "step": 38100 + }, + { + "epoch": 12.11495712924738, + "grad_norm": 25922.17578125, + "learning_rate": 4.471340107970785e-05, + "loss": 0.691, + "step": 38150 + }, + { + "epoch": 12.130835185773261, + "grad_norm": 25839.078125, + "learning_rate": 4.467370593839315e-05, + "loss": 0.6924, + "step": 38200 + }, + { + "epoch": 12.146713242299143, + "grad_norm": 25702.123046875, + "learning_rate": 4.463401079707844e-05, + "loss": 0.6918, + "step": 38250 + }, + { + "epoch": 12.162591298825024, + "grad_norm": 26183.7890625, + "learning_rate": 4.4594315655763735e-05, + "loss": 0.6922, + "step": 38300 + }, + { + "epoch": 12.178469355350906, + "grad_norm": 26349.01953125, + "learning_rate": 4.4554620514449035e-05, + "loss": 0.6919, + "step": 38350 + }, + { + "epoch": 12.194347411876786, + "grad_norm": 25578.62890625, + "learning_rate": 4.451492537313433e-05, + "loss": 0.6911, + "step": 38400 + }, + { + "epoch": 12.210225468402667, + "grad_norm": 25342.06640625, + "learning_rate": 4.447523023181963e-05, + "loss": 0.6948, + "step": 38450 + }, + { + "epoch": 12.226103524928549, + "grad_norm": 25582.328125, + "learning_rate": 4.443553509050493e-05, + "loss": 0.6965, + "step": 38500 + }, + { + "epoch": 12.24198158145443, + "grad_norm": 26281.44921875, + "learning_rate": 4.439583994919022e-05, + "loss": 0.6947, + "step": 38550 + }, + { + "epoch": 12.257859637980312, + "grad_norm": 25423.970703125, + "learning_rate": 4.4356144807875516e-05, + "loss": 0.6949, + "step": 38600 + }, + { + "epoch": 12.273737694506192, + "grad_norm": 25645.67578125, + "learning_rate": 4.4316449666560816e-05, + "loss": 0.692, + "step": 38650 + }, + { + "epoch": 12.289615751032073, + "grad_norm": 25957.416015625, + "learning_rate": 4.427675452524611e-05, + "loss": 0.6914, + "step": 38700 + }, + { + "epoch": 12.305493807557955, + "grad_norm": 25020.30859375, + "learning_rate": 4.4237059383931403e-05, + "loss": 0.6956, + "step": 38750 + }, + { + "epoch": 12.321371864083837, + "grad_norm": 25936.24609375, + "learning_rate": 4.4197364242616704e-05, + "loss": 0.6909, + "step": 38800 + }, + { + "epoch": 12.337249920609718, + "grad_norm": 25477.5, + "learning_rate": 4.4157669101302004e-05, + "loss": 0.6915, + "step": 38850 + }, + { + "epoch": 12.353127977135598, + "grad_norm": 24679.041015625, + "learning_rate": 4.4117973959987304e-05, + "loss": 0.6959, + "step": 38900 + }, + { + "epoch": 12.36900603366148, + "grad_norm": 25305.9140625, + "learning_rate": 4.40782788186726e-05, + "loss": 0.6958, + "step": 38950 + }, + { + "epoch": 12.384884090187361, + "grad_norm": 25186.720703125, + "learning_rate": 4.403858367735789e-05, + "loss": 0.6899, + "step": 39000 + }, + { + "epoch": 12.400762146713243, + "grad_norm": 25848.09375, + "learning_rate": 4.399888853604319e-05, + "loss": 0.6954, + "step": 39050 + }, + { + "epoch": 12.416640203239124, + "grad_norm": 25214.7265625, + "learning_rate": 4.3959193394728485e-05, + "loss": 0.6918, + "step": 39100 + }, + { + "epoch": 12.432518259765004, + "grad_norm": 25850.787109375, + "learning_rate": 4.3919498253413785e-05, + "loss": 0.6889, + "step": 39150 + }, + { + "epoch": 12.448396316290886, + "grad_norm": 25535.595703125, + "learning_rate": 4.3879803112099086e-05, + "loss": 0.6913, + "step": 39200 + }, + { + "epoch": 12.464274372816767, + "grad_norm": 25293.98828125, + "learning_rate": 4.384010797078438e-05, + "loss": 0.6909, + "step": 39250 + }, + { + "epoch": 12.480152429342649, + "grad_norm": 25519.966796875, + "learning_rate": 4.380041282946967e-05, + "loss": 0.693, + "step": 39300 + }, + { + "epoch": 12.49603048586853, + "grad_norm": 25590.5390625, + "learning_rate": 4.376071768815497e-05, + "loss": 0.6901, + "step": 39350 + }, + { + "epoch": 12.51190854239441, + "grad_norm": 25909.888671875, + "learning_rate": 4.3721022546840267e-05, + "loss": 0.6908, + "step": 39400 + }, + { + "epoch": 12.527786598920292, + "grad_norm": 25248.21484375, + "learning_rate": 4.368132740552557e-05, + "loss": 0.6932, + "step": 39450 + }, + { + "epoch": 12.543664655446173, + "grad_norm": 24925.970703125, + "learning_rate": 4.364163226421086e-05, + "loss": 0.6914, + "step": 39500 + }, + { + "epoch": 12.559542711972055, + "grad_norm": 25199.6171875, + "learning_rate": 4.360193712289616e-05, + "loss": 0.6937, + "step": 39550 + }, + { + "epoch": 12.575420768497937, + "grad_norm": 25989.927734375, + "learning_rate": 4.356224198158146e-05, + "loss": 0.6912, + "step": 39600 + }, + { + "epoch": 12.591298825023816, + "grad_norm": 24462.072265625, + "learning_rate": 4.3522546840266755e-05, + "loss": 0.6919, + "step": 39650 + }, + { + "epoch": 12.607176881549698, + "grad_norm": 25439.693359375, + "learning_rate": 4.348285169895205e-05, + "loss": 0.6903, + "step": 39700 + }, + { + "epoch": 12.62305493807558, + "grad_norm": 25596.04296875, + "learning_rate": 4.344315655763735e-05, + "loss": 0.6912, + "step": 39750 + }, + { + "epoch": 12.638932994601461, + "grad_norm": 24989.56640625, + "learning_rate": 4.340346141632264e-05, + "loss": 0.6935, + "step": 39800 + }, + { + "epoch": 12.654811051127343, + "grad_norm": 24844.279296875, + "learning_rate": 4.336376627500794e-05, + "loss": 0.6895, + "step": 39850 + }, + { + "epoch": 12.670689107653223, + "grad_norm": 25176.060546875, + "learning_rate": 4.332407113369324e-05, + "loss": 0.6924, + "step": 39900 + }, + { + "epoch": 12.686567164179104, + "grad_norm": 25582.01953125, + "learning_rate": 4.3284375992378536e-05, + "loss": 0.6927, + "step": 39950 + }, + { + "epoch": 12.702445220704986, + "grad_norm": 25024.33203125, + "learning_rate": 4.3244680851063836e-05, + "loss": 0.6904, + "step": 40000 + }, + { + "epoch": 12.718323277230867, + "grad_norm": 25088.216796875, + "learning_rate": 4.320498570974913e-05, + "loss": 0.6904, + "step": 40050 + }, + { + "epoch": 12.734201333756747, + "grad_norm": 24689.61328125, + "learning_rate": 4.316529056843442e-05, + "loss": 0.687, + "step": 40100 + }, + { + "epoch": 12.750079390282629, + "grad_norm": 25595.205078125, + "learning_rate": 4.3125595427119724e-05, + "loss": 0.6911, + "step": 40150 + }, + { + "epoch": 12.76595744680851, + "grad_norm": 25500.544921875, + "learning_rate": 4.308590028580502e-05, + "loss": 0.6894, + "step": 40200 + }, + { + "epoch": 12.781835503334392, + "grad_norm": 24525.78125, + "learning_rate": 4.304620514449032e-05, + "loss": 0.6904, + "step": 40250 + }, + { + "epoch": 12.797713559860274, + "grad_norm": 24815.94921875, + "learning_rate": 4.300651000317562e-05, + "loss": 0.6879, + "step": 40300 + }, + { + "epoch": 12.813591616386155, + "grad_norm": 24612.96875, + "learning_rate": 4.296681486186091e-05, + "loss": 0.693, + "step": 40350 + }, + { + "epoch": 12.829469672912035, + "grad_norm": 25878.453125, + "learning_rate": 4.2927119720546205e-05, + "loss": 0.6877, + "step": 40400 + }, + { + "epoch": 12.845347729437917, + "grad_norm": 25819.634765625, + "learning_rate": 4.2887424579231505e-05, + "loss": 0.6889, + "step": 40450 + }, + { + "epoch": 12.861225785963798, + "grad_norm": 24711.671875, + "learning_rate": 4.28477294379168e-05, + "loss": 0.6897, + "step": 40500 + }, + { + "epoch": 12.87710384248968, + "grad_norm": 25272.193359375, + "learning_rate": 4.280803429660209e-05, + "loss": 0.6916, + "step": 40550 + }, + { + "epoch": 12.89298189901556, + "grad_norm": 24950.6953125, + "learning_rate": 4.27683391552874e-05, + "loss": 0.6874, + "step": 40600 + }, + { + "epoch": 12.908859955541441, + "grad_norm": 25394.751953125, + "learning_rate": 4.272864401397269e-05, + "loss": 0.6909, + "step": 40650 + }, + { + "epoch": 12.924738012067323, + "grad_norm": 25527.599609375, + "learning_rate": 4.268894887265799e-05, + "loss": 0.6888, + "step": 40700 + }, + { + "epoch": 12.940616068593204, + "grad_norm": 25220.8125, + "learning_rate": 4.2649253731343286e-05, + "loss": 0.6902, + "step": 40750 + }, + { + "epoch": 12.956494125119086, + "grad_norm": 24796.541015625, + "learning_rate": 4.260955859002858e-05, + "loss": 0.6894, + "step": 40800 + }, + { + "epoch": 12.972372181644968, + "grad_norm": 25115.955078125, + "learning_rate": 4.256986344871388e-05, + "loss": 0.6908, + "step": 40850 + }, + { + "epoch": 12.988250238170847, + "grad_norm": 24197.728515625, + "learning_rate": 4.2530168307399174e-05, + "loss": 0.6864, + "step": 40900 + }, + { + "epoch": 13.004128294696729, + "grad_norm": 24724.552734375, + "learning_rate": 4.2490473166084474e-05, + "loss": 0.6864, + "step": 40950 + }, + { + "epoch": 13.02000635122261, + "grad_norm": 25012.1640625, + "learning_rate": 4.2450778024769774e-05, + "loss": 0.6799, + "step": 41000 + }, + { + "epoch": 13.035884407748492, + "grad_norm": 24815.64453125, + "learning_rate": 4.241108288345507e-05, + "loss": 0.6837, + "step": 41050 + }, + { + "epoch": 13.051762464274374, + "grad_norm": 25473.265625, + "learning_rate": 4.237138774214036e-05, + "loss": 0.6821, + "step": 41100 + }, + { + "epoch": 13.067640520800254, + "grad_norm": 24979.189453125, + "learning_rate": 4.233169260082566e-05, + "loss": 0.6825, + "step": 41150 + }, + { + "epoch": 13.083518577326135, + "grad_norm": 24369.861328125, + "learning_rate": 4.2291997459510955e-05, + "loss": 0.6802, + "step": 41200 + }, + { + "epoch": 13.099396633852017, + "grad_norm": 24936.068359375, + "learning_rate": 4.2252302318196255e-05, + "loss": 0.6817, + "step": 41250 + }, + { + "epoch": 13.115274690377898, + "grad_norm": 25067.654296875, + "learning_rate": 4.221260717688155e-05, + "loss": 0.68, + "step": 41300 + }, + { + "epoch": 13.13115274690378, + "grad_norm": 24868.97265625, + "learning_rate": 4.217291203556685e-05, + "loss": 0.6811, + "step": 41350 + }, + { + "epoch": 13.14703080342966, + "grad_norm": 24265.5546875, + "learning_rate": 4.213321689425215e-05, + "loss": 0.6813, + "step": 41400 + }, + { + "epoch": 13.162908859955541, + "grad_norm": 24921.30078125, + "learning_rate": 4.209352175293744e-05, + "loss": 0.6796, + "step": 41450 + }, + { + "epoch": 13.178786916481423, + "grad_norm": 24588.80078125, + "learning_rate": 4.2053826611622737e-05, + "loss": 0.6828, + "step": 41500 + }, + { + "epoch": 13.194664973007304, + "grad_norm": 24551.5, + "learning_rate": 4.201413147030804e-05, + "loss": 0.6805, + "step": 41550 + }, + { + "epoch": 13.210543029533184, + "grad_norm": 25206.583984375, + "learning_rate": 4.197443632899333e-05, + "loss": 0.6852, + "step": 41600 + }, + { + "epoch": 13.226421086059066, + "grad_norm": 24448.646484375, + "learning_rate": 4.193474118767863e-05, + "loss": 0.6832, + "step": 41650 + }, + { + "epoch": 13.242299142584947, + "grad_norm": 24609.46484375, + "learning_rate": 4.189504604636393e-05, + "loss": 0.6842, + "step": 41700 + }, + { + "epoch": 13.258177199110829, + "grad_norm": 24898.201171875, + "learning_rate": 4.1855350905049225e-05, + "loss": 0.6812, + "step": 41750 + }, + { + "epoch": 13.27405525563671, + "grad_norm": 24523.78515625, + "learning_rate": 4.1815655763734525e-05, + "loss": 0.6842, + "step": 41800 + }, + { + "epoch": 13.28993331216259, + "grad_norm": 24928.837890625, + "learning_rate": 4.177596062241982e-05, + "loss": 0.6813, + "step": 41850 + }, + { + "epoch": 13.305811368688472, + "grad_norm": 24684.125, + "learning_rate": 4.173626548110511e-05, + "loss": 0.6772, + "step": 41900 + }, + { + "epoch": 13.321689425214354, + "grad_norm": 25091.099609375, + "learning_rate": 4.169657033979041e-05, + "loss": 0.6804, + "step": 41950 + }, + { + "epoch": 13.337567481740235, + "grad_norm": 24772.115234375, + "learning_rate": 4.1656875198475706e-05, + "loss": 0.6788, + "step": 42000 + }, + { + "epoch": 13.353445538266117, + "grad_norm": 25392.5, + "learning_rate": 4.1617180057161006e-05, + "loss": 0.6814, + "step": 42050 + }, + { + "epoch": 13.369323594791997, + "grad_norm": 24344.09375, + "learning_rate": 4.1577484915846306e-05, + "loss": 0.683, + "step": 42100 + }, + { + "epoch": 13.385201651317878, + "grad_norm": 24251.580078125, + "learning_rate": 4.15377897745316e-05, + "loss": 0.6826, + "step": 42150 + }, + { + "epoch": 13.40107970784376, + "grad_norm": 25104.28125, + "learning_rate": 4.149809463321689e-05, + "loss": 0.6812, + "step": 42200 + }, + { + "epoch": 13.416957764369641, + "grad_norm": 24021.865234375, + "learning_rate": 4.1458399491902194e-05, + "loss": 0.6802, + "step": 42250 + }, + { + "epoch": 13.432835820895523, + "grad_norm": 24694.013671875, + "learning_rate": 4.141870435058749e-05, + "loss": 0.683, + "step": 42300 + }, + { + "epoch": 13.448713877421403, + "grad_norm": 24235.306640625, + "learning_rate": 4.137900920927279e-05, + "loss": 0.6818, + "step": 42350 + }, + { + "epoch": 13.464591933947284, + "grad_norm": 24905.02734375, + "learning_rate": 4.133931406795809e-05, + "loss": 0.68, + "step": 42400 + }, + { + "epoch": 13.480469990473166, + "grad_norm": 24339.927734375, + "learning_rate": 4.129961892664338e-05, + "loss": 0.6799, + "step": 42450 + }, + { + "epoch": 13.496348046999048, + "grad_norm": 24592.8828125, + "learning_rate": 4.125992378532868e-05, + "loss": 0.6776, + "step": 42500 + }, + { + "epoch": 13.51222610352493, + "grad_norm": 24951.8125, + "learning_rate": 4.1220228644013975e-05, + "loss": 0.6841, + "step": 42550 + }, + { + "epoch": 13.528104160050809, + "grad_norm": 24222.96875, + "learning_rate": 4.118053350269927e-05, + "loss": 0.6817, + "step": 42600 + }, + { + "epoch": 13.54398221657669, + "grad_norm": 24539.65234375, + "learning_rate": 4.114083836138457e-05, + "loss": 0.6812, + "step": 42650 + }, + { + "epoch": 13.559860273102572, + "grad_norm": 24310.98828125, + "learning_rate": 4.110114322006986e-05, + "loss": 0.6847, + "step": 42700 + }, + { + "epoch": 13.575738329628454, + "grad_norm": 24087.740234375, + "learning_rate": 4.106144807875516e-05, + "loss": 0.6795, + "step": 42750 + }, + { + "epoch": 13.591616386154335, + "grad_norm": 24772.966796875, + "learning_rate": 4.102175293744046e-05, + "loss": 0.68, + "step": 42800 + }, + { + "epoch": 13.607494442680215, + "grad_norm": 24550.08203125, + "learning_rate": 4.0982057796125756e-05, + "loss": 0.6789, + "step": 42850 + }, + { + "epoch": 13.623372499206097, + "grad_norm": 24223.595703125, + "learning_rate": 4.094236265481105e-05, + "loss": 0.6837, + "step": 42900 + }, + { + "epoch": 13.639250555731978, + "grad_norm": 24113.8984375, + "learning_rate": 4.090266751349635e-05, + "loss": 0.681, + "step": 42950 + }, + { + "epoch": 13.65512861225786, + "grad_norm": 24669.611328125, + "learning_rate": 4.0862972372181644e-05, + "loss": 0.6832, + "step": 43000 + }, + { + "epoch": 13.671006668783741, + "grad_norm": 23798.759765625, + "learning_rate": 4.0823277230866944e-05, + "loss": 0.677, + "step": 43050 + }, + { + "epoch": 13.686884725309621, + "grad_norm": 24341.833984375, + "learning_rate": 4.0783582089552244e-05, + "loss": 0.6814, + "step": 43100 + }, + { + "epoch": 13.702762781835503, + "grad_norm": 24048.4609375, + "learning_rate": 4.074388694823754e-05, + "loss": 0.6784, + "step": 43150 + }, + { + "epoch": 13.718640838361384, + "grad_norm": 24171.9921875, + "learning_rate": 4.070419180692284e-05, + "loss": 0.6803, + "step": 43200 + }, + { + "epoch": 13.734518894887266, + "grad_norm": 24474.33203125, + "learning_rate": 4.066449666560813e-05, + "loss": 0.6785, + "step": 43250 + }, + { + "epoch": 13.750396951413148, + "grad_norm": 23632.65625, + "learning_rate": 4.0624801524293425e-05, + "loss": 0.6811, + "step": 43300 + }, + { + "epoch": 13.766275007939027, + "grad_norm": 23942.4140625, + "learning_rate": 4.0585106382978725e-05, + "loss": 0.6802, + "step": 43350 + }, + { + "epoch": 13.782153064464909, + "grad_norm": 24818.43359375, + "learning_rate": 4.054541124166402e-05, + "loss": 0.6817, + "step": 43400 + }, + { + "epoch": 13.79803112099079, + "grad_norm": 23369.2265625, + "learning_rate": 4.050571610034932e-05, + "loss": 0.6811, + "step": 43450 + }, + { + "epoch": 13.813909177516672, + "grad_norm": 24389.009765625, + "learning_rate": 4.046602095903462e-05, + "loss": 0.6805, + "step": 43500 + }, + { + "epoch": 13.829787234042554, + "grad_norm": 24440.2421875, + "learning_rate": 4.042632581771991e-05, + "loss": 0.681, + "step": 43550 + }, + { + "epoch": 13.845665290568434, + "grad_norm": 24627.8203125, + "learning_rate": 4.038663067640521e-05, + "loss": 0.6775, + "step": 43600 + }, + { + "epoch": 13.861543347094315, + "grad_norm": 23863.171875, + "learning_rate": 4.034693553509051e-05, + "loss": 0.6803, + "step": 43650 + }, + { + "epoch": 13.877421403620197, + "grad_norm": 24950.375, + "learning_rate": 4.03072403937758e-05, + "loss": 0.6825, + "step": 43700 + }, + { + "epoch": 13.893299460146078, + "grad_norm": 23713.75390625, + "learning_rate": 4.02675452524611e-05, + "loss": 0.6774, + "step": 43750 + }, + { + "epoch": 13.90917751667196, + "grad_norm": 24111.923828125, + "learning_rate": 4.02278501111464e-05, + "loss": 0.6769, + "step": 43800 + }, + { + "epoch": 13.92505557319784, + "grad_norm": 24068.44140625, + "learning_rate": 4.0188154969831695e-05, + "loss": 0.6792, + "step": 43850 + }, + { + "epoch": 13.940933629723721, + "grad_norm": 24496.923828125, + "learning_rate": 4.0148459828516995e-05, + "loss": 0.6786, + "step": 43900 + }, + { + "epoch": 13.956811686249603, + "grad_norm": 24324.5234375, + "learning_rate": 4.010876468720229e-05, + "loss": 0.6818, + "step": 43950 + }, + { + "epoch": 13.972689742775485, + "grad_norm": 24043.82421875, + "learning_rate": 4.006906954588758e-05, + "loss": 0.6756, + "step": 44000 + }, + { + "epoch": 13.988567799301366, + "grad_norm": 23718.05859375, + "learning_rate": 4.002937440457288e-05, + "loss": 0.6742, + "step": 44050 + }, + { + "epoch": 14.004445855827246, + "grad_norm": 24011.693359375, + "learning_rate": 3.9989679263258176e-05, + "loss": 0.6766, + "step": 44100 + }, + { + "epoch": 14.020323912353128, + "grad_norm": 23874.05859375, + "learning_rate": 3.9949984121943476e-05, + "loss": 0.6721, + "step": 44150 + }, + { + "epoch": 14.03620196887901, + "grad_norm": 24170.900390625, + "learning_rate": 3.9910288980628776e-05, + "loss": 0.6746, + "step": 44200 + }, + { + "epoch": 14.05208002540489, + "grad_norm": 23806.9921875, + "learning_rate": 3.987059383931407e-05, + "loss": 0.6702, + "step": 44250 + }, + { + "epoch": 14.067958081930772, + "grad_norm": 24097.408203125, + "learning_rate": 3.983089869799937e-05, + "loss": 0.6726, + "step": 44300 + }, + { + "epoch": 14.083836138456652, + "grad_norm": 24183.365234375, + "learning_rate": 3.9791203556684664e-05, + "loss": 0.6688, + "step": 44350 + }, + { + "epoch": 14.099714194982534, + "grad_norm": 25200.125, + "learning_rate": 3.975150841536996e-05, + "loss": 0.6743, + "step": 44400 + }, + { + "epoch": 14.115592251508415, + "grad_norm": 24078.8359375, + "learning_rate": 3.971181327405526e-05, + "loss": 0.6707, + "step": 44450 + }, + { + "epoch": 14.131470308034297, + "grad_norm": 24092.71875, + "learning_rate": 3.967211813274056e-05, + "loss": 0.6727, + "step": 44500 + }, + { + "epoch": 14.147348364560179, + "grad_norm": 24398.52734375, + "learning_rate": 3.963242299142585e-05, + "loss": 0.6726, + "step": 44550 + }, + { + "epoch": 14.163226421086058, + "grad_norm": 25464.837890625, + "learning_rate": 3.959272785011115e-05, + "loss": 0.6692, + "step": 44600 + }, + { + "epoch": 14.17910447761194, + "grad_norm": 24340.91796875, + "learning_rate": 3.9553032708796445e-05, + "loss": 0.6726, + "step": 44650 + }, + { + "epoch": 14.194982534137822, + "grad_norm": 24626.837890625, + "learning_rate": 3.951333756748174e-05, + "loss": 0.6719, + "step": 44700 + }, + { + "epoch": 14.210860590663703, + "grad_norm": 24861.15234375, + "learning_rate": 3.947364242616704e-05, + "loss": 0.6724, + "step": 44750 + }, + { + "epoch": 14.226738647189585, + "grad_norm": 23964.291015625, + "learning_rate": 3.943394728485233e-05, + "loss": 0.674, + "step": 44800 + }, + { + "epoch": 14.242616703715465, + "grad_norm": 23936.125, + "learning_rate": 3.939425214353763e-05, + "loss": 0.6743, + "step": 44850 + }, + { + "epoch": 14.258494760241346, + "grad_norm": 24653.296875, + "learning_rate": 3.935455700222293e-05, + "loss": 0.6723, + "step": 44900 + }, + { + "epoch": 14.274372816767228, + "grad_norm": 24398.095703125, + "learning_rate": 3.9314861860908226e-05, + "loss": 0.6774, + "step": 44950 + }, + { + "epoch": 14.29025087329311, + "grad_norm": 24063.818359375, + "learning_rate": 3.927516671959353e-05, + "loss": 0.6701, + "step": 45000 + }, + { + "epoch": 14.30612892981899, + "grad_norm": 24251.955078125, + "learning_rate": 3.923547157827882e-05, + "loss": 0.6749, + "step": 45050 + }, + { + "epoch": 14.32200698634487, + "grad_norm": 24572.671875, + "learning_rate": 3.9195776436964114e-05, + "loss": 0.6754, + "step": 45100 + }, + { + "epoch": 14.337885042870752, + "grad_norm": 24464.720703125, + "learning_rate": 3.9156081295649414e-05, + "loss": 0.6718, + "step": 45150 + }, + { + "epoch": 14.353763099396634, + "grad_norm": 24511.29296875, + "learning_rate": 3.9116386154334714e-05, + "loss": 0.6752, + "step": 45200 + }, + { + "epoch": 14.369641155922515, + "grad_norm": 24030.92578125, + "learning_rate": 3.907669101302001e-05, + "loss": 0.6737, + "step": 45250 + }, + { + "epoch": 14.385519212448397, + "grad_norm": 23963.9453125, + "learning_rate": 3.903699587170531e-05, + "loss": 0.6726, + "step": 45300 + }, + { + "epoch": 14.401397268974277, + "grad_norm": 24408.703125, + "learning_rate": 3.89973007303906e-05, + "loss": 0.6755, + "step": 45350 + }, + { + "epoch": 14.417275325500158, + "grad_norm": 24175.90234375, + "learning_rate": 3.89576055890759e-05, + "loss": 0.6733, + "step": 45400 + }, + { + "epoch": 14.43315338202604, + "grad_norm": 24181.03515625, + "learning_rate": 3.8917910447761195e-05, + "loss": 0.6715, + "step": 45450 + }, + { + "epoch": 14.449031438551922, + "grad_norm": 24088.134765625, + "learning_rate": 3.887821530644649e-05, + "loss": 0.6734, + "step": 45500 + }, + { + "epoch": 14.464909495077803, + "grad_norm": 24348.998046875, + "learning_rate": 3.883852016513179e-05, + "loss": 0.673, + "step": 45550 + }, + { + "epoch": 14.480787551603683, + "grad_norm": 23610.765625, + "learning_rate": 3.879882502381709e-05, + "loss": 0.6733, + "step": 45600 + }, + { + "epoch": 14.496665608129565, + "grad_norm": 23721.083984375, + "learning_rate": 3.875912988250238e-05, + "loss": 0.6731, + "step": 45650 + }, + { + "epoch": 14.512543664655446, + "grad_norm": 24457.9375, + "learning_rate": 3.871943474118768e-05, + "loss": 0.6733, + "step": 45700 + }, + { + "epoch": 14.528421721181328, + "grad_norm": 24163.458984375, + "learning_rate": 3.867973959987298e-05, + "loss": 0.6721, + "step": 45750 + }, + { + "epoch": 14.54429977770721, + "grad_norm": 23760.05859375, + "learning_rate": 3.864004445855827e-05, + "loss": 0.6715, + "step": 45800 + }, + { + "epoch": 14.56017783423309, + "grad_norm": 23812.14453125, + "learning_rate": 3.860034931724357e-05, + "loss": 0.6727, + "step": 45850 + }, + { + "epoch": 14.57605589075897, + "grad_norm": 23580.501953125, + "learning_rate": 3.8560654175928864e-05, + "loss": 0.6704, + "step": 45900 + }, + { + "epoch": 14.591933947284852, + "grad_norm": 24045.09765625, + "learning_rate": 3.8520959034614165e-05, + "loss": 0.6718, + "step": 45950 + }, + { + "epoch": 14.607812003810734, + "grad_norm": 24375.74609375, + "learning_rate": 3.8481263893299465e-05, + "loss": 0.6709, + "step": 46000 + }, + { + "epoch": 14.623690060336616, + "grad_norm": 24000.484375, + "learning_rate": 3.844156875198476e-05, + "loss": 0.674, + "step": 46050 + }, + { + "epoch": 14.639568116862495, + "grad_norm": 23421.57421875, + "learning_rate": 3.840187361067006e-05, + "loss": 0.6735, + "step": 46100 + }, + { + "epoch": 14.655446173388377, + "grad_norm": 24476.041015625, + "learning_rate": 3.836217846935535e-05, + "loss": 0.6737, + "step": 46150 + }, + { + "epoch": 14.671324229914259, + "grad_norm": 23659.337890625, + "learning_rate": 3.8322483328040646e-05, + "loss": 0.6752, + "step": 46200 + }, + { + "epoch": 14.68720228644014, + "grad_norm": 24077.185546875, + "learning_rate": 3.8282788186725946e-05, + "loss": 0.6684, + "step": 46250 + }, + { + "epoch": 14.703080342966022, + "grad_norm": 24052.515625, + "learning_rate": 3.8243093045411246e-05, + "loss": 0.673, + "step": 46300 + }, + { + "epoch": 14.718958399491902, + "grad_norm": 23948.08203125, + "learning_rate": 3.820339790409654e-05, + "loss": 0.6745, + "step": 46350 + }, + { + "epoch": 14.734836456017783, + "grad_norm": 23770.400390625, + "learning_rate": 3.816370276278184e-05, + "loss": 0.6699, + "step": 46400 + }, + { + "epoch": 14.750714512543665, + "grad_norm": 23651.755859375, + "learning_rate": 3.8124007621467134e-05, + "loss": 0.6722, + "step": 46450 + }, + { + "epoch": 14.766592569069546, + "grad_norm": 24653.626953125, + "learning_rate": 3.808431248015243e-05, + "loss": 0.6698, + "step": 46500 + }, + { + "epoch": 14.782470625595428, + "grad_norm": 24360.767578125, + "learning_rate": 3.804461733883773e-05, + "loss": 0.6701, + "step": 46550 + }, + { + "epoch": 14.798348682121308, + "grad_norm": 23147.37109375, + "learning_rate": 3.800492219752302e-05, + "loss": 0.6687, + "step": 46600 + }, + { + "epoch": 14.81422673864719, + "grad_norm": 24035.583984375, + "learning_rate": 3.796522705620832e-05, + "loss": 0.6703, + "step": 46650 + }, + { + "epoch": 14.830104795173071, + "grad_norm": 23670.650390625, + "learning_rate": 3.792553191489362e-05, + "loss": 0.67, + "step": 46700 + }, + { + "epoch": 14.845982851698952, + "grad_norm": 23496.2265625, + "learning_rate": 3.7885836773578915e-05, + "loss": 0.6741, + "step": 46750 + }, + { + "epoch": 14.861860908224834, + "grad_norm": 24511.904296875, + "learning_rate": 3.7846141632264215e-05, + "loss": 0.673, + "step": 46800 + }, + { + "epoch": 14.877738964750714, + "grad_norm": 23906.30859375, + "learning_rate": 3.780644649094951e-05, + "loss": 0.6745, + "step": 46850 + }, + { + "epoch": 14.893617021276595, + "grad_norm": 23655.87109375, + "learning_rate": 3.77667513496348e-05, + "loss": 0.668, + "step": 46900 + }, + { + "epoch": 14.909495077802477, + "grad_norm": 23849.982421875, + "learning_rate": 3.77270562083201e-05, + "loss": 0.6671, + "step": 46950 + }, + { + "epoch": 14.925373134328359, + "grad_norm": 24548.85546875, + "learning_rate": 3.76873610670054e-05, + "loss": 0.6698, + "step": 47000 + }, + { + "epoch": 14.94125119085424, + "grad_norm": 24144.28515625, + "learning_rate": 3.7647665925690696e-05, + "loss": 0.668, + "step": 47050 + }, + { + "epoch": 14.95712924738012, + "grad_norm": 23972.701171875, + "learning_rate": 3.7607970784376e-05, + "loss": 0.6677, + "step": 47100 + }, + { + "epoch": 14.973007303906002, + "grad_norm": 23551.353515625, + "learning_rate": 3.756827564306129e-05, + "loss": 0.6699, + "step": 47150 + }, + { + "epoch": 14.988885360431883, + "grad_norm": 23527.978515625, + "learning_rate": 3.752858050174659e-05, + "loss": 0.6702, + "step": 47200 + }, + { + "epoch": 15.004763416957765, + "grad_norm": 23273.08984375, + "learning_rate": 3.7488885360431884e-05, + "loss": 0.6702, + "step": 47250 + }, + { + "epoch": 15.020641473483646, + "grad_norm": 23773.09765625, + "learning_rate": 3.744919021911718e-05, + "loss": 0.6671, + "step": 47300 + }, + { + "epoch": 15.036519530009526, + "grad_norm": 24133.150390625, + "learning_rate": 3.740949507780248e-05, + "loss": 0.6607, + "step": 47350 + }, + { + "epoch": 15.052397586535408, + "grad_norm": 23611.69140625, + "learning_rate": 3.736979993648778e-05, + "loss": 0.6608, + "step": 47400 + }, + { + "epoch": 15.06827564306129, + "grad_norm": 24616.044921875, + "learning_rate": 3.733010479517307e-05, + "loss": 0.6678, + "step": 47450 + }, + { + "epoch": 15.084153699587171, + "grad_norm": 24132.029296875, + "learning_rate": 3.729040965385837e-05, + "loss": 0.6634, + "step": 47500 + }, + { + "epoch": 15.100031756113053, + "grad_norm": 24181.455078125, + "learning_rate": 3.7250714512543665e-05, + "loss": 0.6655, + "step": 47550 + }, + { + "epoch": 15.115909812638932, + "grad_norm": 23974.443359375, + "learning_rate": 3.721101937122896e-05, + "loss": 0.662, + "step": 47600 + }, + { + "epoch": 15.131787869164814, + "grad_norm": 24213.7890625, + "learning_rate": 3.717132422991426e-05, + "loss": 0.6608, + "step": 47650 + }, + { + "epoch": 15.147665925690696, + "grad_norm": 24070.62890625, + "learning_rate": 3.713162908859956e-05, + "loss": 0.6658, + "step": 47700 + }, + { + "epoch": 15.163543982216577, + "grad_norm": 23968.4296875, + "learning_rate": 3.709193394728485e-05, + "loss": 0.6609, + "step": 47750 + }, + { + "epoch": 15.179422038742459, + "grad_norm": 23617.162109375, + "learning_rate": 3.7052238805970153e-05, + "loss": 0.6613, + "step": 47800 + }, + { + "epoch": 15.195300095268339, + "grad_norm": 24029.837890625, + "learning_rate": 3.701254366465545e-05, + "loss": 0.664, + "step": 47850 + }, + { + "epoch": 15.21117815179422, + "grad_norm": 23731.349609375, + "learning_rate": 3.697284852334075e-05, + "loss": 0.6636, + "step": 47900 + }, + { + "epoch": 15.227056208320102, + "grad_norm": 24603.748046875, + "learning_rate": 3.693315338202604e-05, + "loss": 0.6629, + "step": 47950 + }, + { + "epoch": 15.242934264845983, + "grad_norm": 23958.181640625, + "learning_rate": 3.6893458240711334e-05, + "loss": 0.6648, + "step": 48000 + }, + { + "epoch": 15.258812321371865, + "grad_norm": 23479.33984375, + "learning_rate": 3.6853763099396635e-05, + "loss": 0.6653, + "step": 48050 + }, + { + "epoch": 15.274690377897745, + "grad_norm": 24116.53515625, + "learning_rate": 3.6814067958081935e-05, + "loss": 0.6622, + "step": 48100 + }, + { + "epoch": 15.290568434423626, + "grad_norm": 23710.822265625, + "learning_rate": 3.677437281676723e-05, + "loss": 0.6632, + "step": 48150 + }, + { + "epoch": 15.306446490949508, + "grad_norm": 24976.740234375, + "learning_rate": 3.673467767545253e-05, + "loss": 0.6648, + "step": 48200 + }, + { + "epoch": 15.32232454747539, + "grad_norm": 24450.333984375, + "learning_rate": 3.669498253413782e-05, + "loss": 0.6665, + "step": 48250 + }, + { + "epoch": 15.338202604001271, + "grad_norm": 24068.060546875, + "learning_rate": 3.6655287392823116e-05, + "loss": 0.6653, + "step": 48300 + }, + { + "epoch": 15.354080660527151, + "grad_norm": 23732.517578125, + "learning_rate": 3.6615592251508416e-05, + "loss": 0.6649, + "step": 48350 + }, + { + "epoch": 15.369958717053033, + "grad_norm": 23714.439453125, + "learning_rate": 3.6575897110193716e-05, + "loss": 0.6644, + "step": 48400 + }, + { + "epoch": 15.385836773578914, + "grad_norm": 24033.36328125, + "learning_rate": 3.6536201968879017e-05, + "loss": 0.6648, + "step": 48450 + }, + { + "epoch": 15.401714830104796, + "grad_norm": 23467.201171875, + "learning_rate": 3.649650682756431e-05, + "loss": 0.6632, + "step": 48500 + }, + { + "epoch": 15.417592886630676, + "grad_norm": 23929.56640625, + "learning_rate": 3.6456811686249604e-05, + "loss": 0.6621, + "step": 48550 + }, + { + "epoch": 15.433470943156557, + "grad_norm": 23940.3125, + "learning_rate": 3.6417116544934904e-05, + "loss": 0.6666, + "step": 48600 + }, + { + "epoch": 15.449348999682439, + "grad_norm": 23760.220703125, + "learning_rate": 3.63774214036202e-05, + "loss": 0.6657, + "step": 48650 + }, + { + "epoch": 15.46522705620832, + "grad_norm": 24150.181640625, + "learning_rate": 3.633772626230549e-05, + "loss": 0.6635, + "step": 48700 + }, + { + "epoch": 15.481105112734202, + "grad_norm": 23952.111328125, + "learning_rate": 3.629803112099079e-05, + "loss": 0.6659, + "step": 48750 + }, + { + "epoch": 15.496983169260083, + "grad_norm": 23915.330078125, + "learning_rate": 3.625833597967609e-05, + "loss": 0.6649, + "step": 48800 + }, + { + "epoch": 15.512861225785963, + "grad_norm": 24483.07421875, + "learning_rate": 3.6218640838361385e-05, + "loss": 0.6633, + "step": 48850 + }, + { + "epoch": 15.528739282311845, + "grad_norm": 24314.33203125, + "learning_rate": 3.6178945697046685e-05, + "loss": 0.6649, + "step": 48900 + }, + { + "epoch": 15.544617338837726, + "grad_norm": 24762.5625, + "learning_rate": 3.613925055573198e-05, + "loss": 0.6652, + "step": 48950 + }, + { + "epoch": 15.560495395363608, + "grad_norm": 24496.2109375, + "learning_rate": 3.609955541441727e-05, + "loss": 0.6636, + "step": 49000 + }, + { + "epoch": 15.576373451889488, + "grad_norm": 24142.921875, + "learning_rate": 3.605986027310257e-05, + "loss": 0.665, + "step": 49050 + }, + { + "epoch": 15.59225150841537, + "grad_norm": 23896.888671875, + "learning_rate": 3.602016513178787e-05, + "loss": 0.6636, + "step": 49100 + }, + { + "epoch": 15.608129564941251, + "grad_norm": 23628.111328125, + "learning_rate": 3.598046999047317e-05, + "loss": 0.6615, + "step": 49150 + }, + { + "epoch": 15.624007621467133, + "grad_norm": 23748.744140625, + "learning_rate": 3.594077484915847e-05, + "loss": 0.6648, + "step": 49200 + }, + { + "epoch": 15.639885677993014, + "grad_norm": 23825.630859375, + "learning_rate": 3.590107970784376e-05, + "loss": 0.6652, + "step": 49250 + }, + { + "epoch": 15.655763734518896, + "grad_norm": 23878.716796875, + "learning_rate": 3.586138456652906e-05, + "loss": 0.6615, + "step": 49300 + }, + { + "epoch": 15.671641791044776, + "grad_norm": 23718.876953125, + "learning_rate": 3.5821689425214354e-05, + "loss": 0.6622, + "step": 49350 + }, + { + "epoch": 15.687519847570657, + "grad_norm": 23546.572265625, + "learning_rate": 3.578199428389965e-05, + "loss": 0.6641, + "step": 49400 + }, + { + "epoch": 15.703397904096539, + "grad_norm": 23565.736328125, + "learning_rate": 3.574229914258495e-05, + "loss": 0.6656, + "step": 49450 + }, + { + "epoch": 15.71927596062242, + "grad_norm": 23384.71484375, + "learning_rate": 3.570260400127025e-05, + "loss": 0.6618, + "step": 49500 + }, + { + "epoch": 15.7351540171483, + "grad_norm": 23340.80078125, + "learning_rate": 3.566290885995554e-05, + "loss": 0.6653, + "step": 49550 + }, + { + "epoch": 15.751032073674182, + "grad_norm": 24208.947265625, + "learning_rate": 3.562321371864084e-05, + "loss": 0.6661, + "step": 49600 + }, + { + "epoch": 15.766910130200063, + "grad_norm": 23574.521484375, + "learning_rate": 3.5583518577326135e-05, + "loss": 0.6666, + "step": 49650 + }, + { + "epoch": 15.782788186725945, + "grad_norm": 23982.78125, + "learning_rate": 3.5543823436011436e-05, + "loss": 0.663, + "step": 49700 + }, + { + "epoch": 15.798666243251827, + "grad_norm": 23564.3359375, + "learning_rate": 3.550412829469673e-05, + "loss": 0.6606, + "step": 49750 + }, + { + "epoch": 15.814544299777708, + "grad_norm": 23933.189453125, + "learning_rate": 3.546443315338203e-05, + "loss": 0.6647, + "step": 49800 + }, + { + "epoch": 15.830422356303588, + "grad_norm": 23510.328125, + "learning_rate": 3.542473801206733e-05, + "loss": 0.6635, + "step": 49850 + }, + { + "epoch": 15.84630041282947, + "grad_norm": 23416.091796875, + "learning_rate": 3.5385042870752623e-05, + "loss": 0.6646, + "step": 49900 + }, + { + "epoch": 15.862178469355351, + "grad_norm": 23560.341796875, + "learning_rate": 3.534534772943792e-05, + "loss": 0.663, + "step": 49950 + }, + { + "epoch": 15.878056525881233, + "grad_norm": 23726.447265625, + "learning_rate": 3.530565258812322e-05, + "loss": 0.6649, + "step": 50000 + }, + { + "epoch": 15.893934582407113, + "grad_norm": 23844.755859375, + "learning_rate": 3.526595744680851e-05, + "loss": 0.6622, + "step": 50050 + }, + { + "epoch": 15.909812638932994, + "grad_norm": 23132.53125, + "learning_rate": 3.5226262305493804e-05, + "loss": 0.6612, + "step": 50100 + }, + { + "epoch": 15.925690695458876, + "grad_norm": 24354.9453125, + "learning_rate": 3.5186567164179105e-05, + "loss": 0.6633, + "step": 50150 + }, + { + "epoch": 15.941568751984757, + "grad_norm": 23439.099609375, + "learning_rate": 3.5146872022864405e-05, + "loss": 0.6619, + "step": 50200 + }, + { + "epoch": 15.957446808510639, + "grad_norm": 24292.439453125, + "learning_rate": 3.5107176881549705e-05, + "loss": 0.6629, + "step": 50250 + }, + { + "epoch": 15.973324865036519, + "grad_norm": 23685.8515625, + "learning_rate": 3.5067481740235e-05, + "loss": 0.6622, + "step": 50300 + }, + { + "epoch": 15.9892029215624, + "grad_norm": 23704.876953125, + "learning_rate": 3.502778659892029e-05, + "loss": 0.663, + "step": 50350 + }, + { + "epoch": 16.005080978088284, + "grad_norm": 23387.416015625, + "learning_rate": 3.498809145760559e-05, + "loss": 0.659, + "step": 50400 + }, + { + "epoch": 16.02095903461416, + "grad_norm": 23603.845703125, + "learning_rate": 3.4948396316290886e-05, + "loss": 0.6574, + "step": 50450 + }, + { + "epoch": 16.036837091140043, + "grad_norm": 24016.529296875, + "learning_rate": 3.490870117497618e-05, + "loss": 0.657, + "step": 50500 + }, + { + "epoch": 16.052715147665925, + "grad_norm": 24235.45703125, + "learning_rate": 3.4869006033661487e-05, + "loss": 0.6557, + "step": 50550 + }, + { + "epoch": 16.068593204191806, + "grad_norm": 23704.05859375, + "learning_rate": 3.482931089234678e-05, + "loss": 0.6547, + "step": 50600 + }, + { + "epoch": 16.084471260717688, + "grad_norm": 23739.4921875, + "learning_rate": 3.4789615751032074e-05, + "loss": 0.6541, + "step": 50650 + }, + { + "epoch": 16.10034931724357, + "grad_norm": 24145.8046875, + "learning_rate": 3.4749920609717374e-05, + "loss": 0.6594, + "step": 50700 + }, + { + "epoch": 16.11622737376945, + "grad_norm": 24285.50390625, + "learning_rate": 3.471022546840267e-05, + "loss": 0.6569, + "step": 50750 + }, + { + "epoch": 16.132105430295333, + "grad_norm": 23721.884765625, + "learning_rate": 3.467053032708796e-05, + "loss": 0.6585, + "step": 50800 + }, + { + "epoch": 16.147983486821214, + "grad_norm": 23684.302734375, + "learning_rate": 3.463083518577326e-05, + "loss": 0.6563, + "step": 50850 + }, + { + "epoch": 16.163861543347096, + "grad_norm": 24330.8046875, + "learning_rate": 3.459114004445856e-05, + "loss": 0.6547, + "step": 50900 + }, + { + "epoch": 16.179739599872974, + "grad_norm": 23832.5703125, + "learning_rate": 3.455144490314386e-05, + "loss": 0.6578, + "step": 50950 + }, + { + "epoch": 16.195617656398856, + "grad_norm": 23988.150390625, + "learning_rate": 3.4511749761829155e-05, + "loss": 0.6558, + "step": 51000 + }, + { + "epoch": 16.211495712924737, + "grad_norm": 23709.25, + "learning_rate": 3.447205462051445e-05, + "loss": 0.655, + "step": 51050 + }, + { + "epoch": 16.22737376945062, + "grad_norm": 23120.591796875, + "learning_rate": 3.443235947919975e-05, + "loss": 0.6565, + "step": 51100 + }, + { + "epoch": 16.2432518259765, + "grad_norm": 23713.447265625, + "learning_rate": 3.439266433788504e-05, + "loss": 0.6583, + "step": 51150 + }, + { + "epoch": 16.259129882502382, + "grad_norm": 23596.0625, + "learning_rate": 3.4352969196570336e-05, + "loss": 0.6585, + "step": 51200 + }, + { + "epoch": 16.275007939028264, + "grad_norm": 24150.869140625, + "learning_rate": 3.4313274055255636e-05, + "loss": 0.6575, + "step": 51250 + }, + { + "epoch": 16.290885995554145, + "grad_norm": 24331.630859375, + "learning_rate": 3.427357891394094e-05, + "loss": 0.6594, + "step": 51300 + }, + { + "epoch": 16.306764052080027, + "grad_norm": 23451.88671875, + "learning_rate": 3.423388377262623e-05, + "loss": 0.6596, + "step": 51350 + }, + { + "epoch": 16.32264210860591, + "grad_norm": 23804.421875, + "learning_rate": 3.419418863131153e-05, + "loss": 0.6565, + "step": 51400 + }, + { + "epoch": 16.338520165131786, + "grad_norm": 24613.421875, + "learning_rate": 3.4154493489996824e-05, + "loss": 0.6608, + "step": 51450 + }, + { + "epoch": 16.354398221657668, + "grad_norm": 23770.623046875, + "learning_rate": 3.4114798348682124e-05, + "loss": 0.6558, + "step": 51500 + }, + { + "epoch": 16.37027627818355, + "grad_norm": 23497.1171875, + "learning_rate": 3.407510320736742e-05, + "loss": 0.6559, + "step": 51550 + }, + { + "epoch": 16.38615433470943, + "grad_norm": 23999.451171875, + "learning_rate": 3.403540806605272e-05, + "loss": 0.66, + "step": 51600 + }, + { + "epoch": 16.402032391235313, + "grad_norm": 23710.712890625, + "learning_rate": 3.399571292473802e-05, + "loss": 0.6605, + "step": 51650 + }, + { + "epoch": 16.417910447761194, + "grad_norm": 23660.166015625, + "learning_rate": 3.395601778342331e-05, + "loss": 0.6549, + "step": 51700 + }, + { + "epoch": 16.433788504287076, + "grad_norm": 23169.587890625, + "learning_rate": 3.3916322642108605e-05, + "loss": 0.6604, + "step": 51750 + }, + { + "epoch": 16.449666560812958, + "grad_norm": 24434.888671875, + "learning_rate": 3.3876627500793906e-05, + "loss": 0.6586, + "step": 51800 + }, + { + "epoch": 16.46554461733884, + "grad_norm": 23537.533203125, + "learning_rate": 3.38369323594792e-05, + "loss": 0.6588, + "step": 51850 + }, + { + "epoch": 16.48142267386472, + "grad_norm": 23304.681640625, + "learning_rate": 3.379723721816449e-05, + "loss": 0.6571, + "step": 51900 + }, + { + "epoch": 16.4973007303906, + "grad_norm": 23815.26171875, + "learning_rate": 3.375754207684979e-05, + "loss": 0.6593, + "step": 51950 + }, + { + "epoch": 16.51317878691648, + "grad_norm": 24492.578125, + "learning_rate": 3.3717846935535093e-05, + "loss": 0.6583, + "step": 52000 + }, + { + "epoch": 16.529056843442362, + "grad_norm": 23740.220703125, + "learning_rate": 3.3678151794220394e-05, + "loss": 0.6548, + "step": 52050 + }, + { + "epoch": 16.544934899968244, + "grad_norm": 23588.91015625, + "learning_rate": 3.363845665290569e-05, + "loss": 0.6556, + "step": 52100 + }, + { + "epoch": 16.560812956494125, + "grad_norm": 23579.50390625, + "learning_rate": 3.359876151159098e-05, + "loss": 0.6572, + "step": 52150 + }, + { + "epoch": 16.576691013020007, + "grad_norm": 23243.826171875, + "learning_rate": 3.355906637027628e-05, + "loss": 0.658, + "step": 52200 + }, + { + "epoch": 16.59256906954589, + "grad_norm": 23816.73828125, + "learning_rate": 3.3519371228961575e-05, + "loss": 0.6573, + "step": 52250 + }, + { + "epoch": 16.60844712607177, + "grad_norm": 23419.2265625, + "learning_rate": 3.3479676087646875e-05, + "loss": 0.6569, + "step": 52300 + }, + { + "epoch": 16.62432518259765, + "grad_norm": 24114.97265625, + "learning_rate": 3.3439980946332175e-05, + "loss": 0.6556, + "step": 52350 + }, + { + "epoch": 16.64020323912353, + "grad_norm": 23384.103515625, + "learning_rate": 3.340028580501747e-05, + "loss": 0.6595, + "step": 52400 + }, + { + "epoch": 16.65608129564941, + "grad_norm": 23055.017578125, + "learning_rate": 3.336059066370276e-05, + "loss": 0.6539, + "step": 52450 + }, + { + "epoch": 16.671959352175293, + "grad_norm": 23221.4609375, + "learning_rate": 3.332089552238806e-05, + "loss": 0.6587, + "step": 52500 + }, + { + "epoch": 16.687837408701174, + "grad_norm": 23152.85546875, + "learning_rate": 3.3281200381073356e-05, + "loss": 0.6583, + "step": 52550 + }, + { + "epoch": 16.703715465227056, + "grad_norm": 23718.134765625, + "learning_rate": 3.324150523975865e-05, + "loss": 0.6575, + "step": 52600 + }, + { + "epoch": 16.719593521752937, + "grad_norm": 23835.32421875, + "learning_rate": 3.320181009844395e-05, + "loss": 0.652, + "step": 52650 + }, + { + "epoch": 16.73547157827882, + "grad_norm": 24064.998046875, + "learning_rate": 3.316211495712925e-05, + "loss": 0.6544, + "step": 52700 + }, + { + "epoch": 16.7513496348047, + "grad_norm": 23508.853515625, + "learning_rate": 3.312241981581455e-05, + "loss": 0.6558, + "step": 52750 + }, + { + "epoch": 16.767227691330582, + "grad_norm": 23061.650390625, + "learning_rate": 3.3082724674499844e-05, + "loss": 0.6591, + "step": 52800 + }, + { + "epoch": 16.783105747856464, + "grad_norm": 23995.44140625, + "learning_rate": 3.304302953318514e-05, + "loss": 0.6559, + "step": 52850 + }, + { + "epoch": 16.798983804382345, + "grad_norm": 23488.259765625, + "learning_rate": 3.300333439187044e-05, + "loss": 0.6571, + "step": 52900 + }, + { + "epoch": 16.814861860908223, + "grad_norm": 24552.5625, + "learning_rate": 3.296363925055573e-05, + "loss": 0.6547, + "step": 52950 + }, + { + "epoch": 16.830739917434105, + "grad_norm": 23736.818359375, + "learning_rate": 3.292394410924103e-05, + "loss": 0.6578, + "step": 53000 + }, + { + "epoch": 16.846617973959987, + "grad_norm": 24375.546875, + "learning_rate": 3.288424896792633e-05, + "loss": 0.655, + "step": 53050 + }, + { + "epoch": 16.862496030485868, + "grad_norm": 23373.5, + "learning_rate": 3.2844553826611625e-05, + "loss": 0.6555, + "step": 53100 + }, + { + "epoch": 16.87837408701175, + "grad_norm": 24301.333984375, + "learning_rate": 3.280485868529692e-05, + "loss": 0.6574, + "step": 53150 + }, + { + "epoch": 16.89425214353763, + "grad_norm": 23277.462890625, + "learning_rate": 3.276516354398222e-05, + "loss": 0.6555, + "step": 53200 + }, + { + "epoch": 16.910130200063513, + "grad_norm": 23834.724609375, + "learning_rate": 3.272546840266751e-05, + "loss": 0.6582, + "step": 53250 + }, + { + "epoch": 16.926008256589395, + "grad_norm": 23567.853515625, + "learning_rate": 3.268577326135281e-05, + "loss": 0.6547, + "step": 53300 + }, + { + "epoch": 16.941886313115276, + "grad_norm": 23326.3671875, + "learning_rate": 3.2646078120038106e-05, + "loss": 0.6567, + "step": 53350 + }, + { + "epoch": 16.957764369641154, + "grad_norm": 23291.1953125, + "learning_rate": 3.260638297872341e-05, + "loss": 0.6552, + "step": 53400 + }, + { + "epoch": 16.973642426167036, + "grad_norm": 23493.248046875, + "learning_rate": 3.256668783740871e-05, + "loss": 0.6562, + "step": 53450 + }, + { + "epoch": 16.989520482692917, + "grad_norm": 23239.197265625, + "learning_rate": 3.2526992696094e-05, + "loss": 0.6562, + "step": 53500 + }, + { + "epoch": 17.0053985392188, + "grad_norm": 23646.5, + "learning_rate": 3.2487297554779294e-05, + "loss": 0.6553, + "step": 53550 + }, + { + "epoch": 17.02127659574468, + "grad_norm": 23330.103515625, + "learning_rate": 3.2447602413464594e-05, + "loss": 0.6497, + "step": 53600 + }, + { + "epoch": 17.037154652270562, + "grad_norm": 24091.298828125, + "learning_rate": 3.240790727214989e-05, + "loss": 0.6515, + "step": 53650 + }, + { + "epoch": 17.053032708796444, + "grad_norm": 24229.26171875, + "learning_rate": 3.236821213083519e-05, + "loss": 0.6518, + "step": 53700 + }, + { + "epoch": 17.068910765322325, + "grad_norm": 23416.974609375, + "learning_rate": 3.232851698952049e-05, + "loss": 0.6496, + "step": 53750 + }, + { + "epoch": 17.084788821848207, + "grad_norm": 23611.005859375, + "learning_rate": 3.228882184820578e-05, + "loss": 0.6491, + "step": 53800 + }, + { + "epoch": 17.10066687837409, + "grad_norm": 23050.642578125, + "learning_rate": 3.224912670689108e-05, + "loss": 0.6507, + "step": 53850 + }, + { + "epoch": 17.116544934899967, + "grad_norm": 23595.267578125, + "learning_rate": 3.2209431565576376e-05, + "loss": 0.6475, + "step": 53900 + }, + { + "epoch": 17.132422991425848, + "grad_norm": 23561.833984375, + "learning_rate": 3.216973642426167e-05, + "loss": 0.6487, + "step": 53950 + }, + { + "epoch": 17.14830104795173, + "grad_norm": 23513.59375, + "learning_rate": 3.213004128294697e-05, + "loss": 0.6474, + "step": 54000 + }, + { + "epoch": 17.16417910447761, + "grad_norm": 23998.837890625, + "learning_rate": 3.209034614163226e-05, + "loss": 0.6507, + "step": 54050 + }, + { + "epoch": 17.180057161003493, + "grad_norm": 23122.234375, + "learning_rate": 3.2050651000317563e-05, + "loss": 0.649, + "step": 54100 + }, + { + "epoch": 17.195935217529374, + "grad_norm": 23222.94921875, + "learning_rate": 3.2010955859002864e-05, + "loss": 0.6503, + "step": 54150 + }, + { + "epoch": 17.211813274055256, + "grad_norm": 23577.119140625, + "learning_rate": 3.197126071768816e-05, + "loss": 0.6511, + "step": 54200 + }, + { + "epoch": 17.227691330581138, + "grad_norm": 23782.28125, + "learning_rate": 3.193156557637345e-05, + "loss": 0.6523, + "step": 54250 + }, + { + "epoch": 17.24356938710702, + "grad_norm": 24278.03125, + "learning_rate": 3.189187043505875e-05, + "loss": 0.654, + "step": 54300 + }, + { + "epoch": 17.2594474436329, + "grad_norm": 23616.5390625, + "learning_rate": 3.1852175293744045e-05, + "loss": 0.651, + "step": 54350 + }, + { + "epoch": 17.27532550015878, + "grad_norm": 23676.12890625, + "learning_rate": 3.181248015242934e-05, + "loss": 0.6503, + "step": 54400 + }, + { + "epoch": 17.29120355668466, + "grad_norm": 23172.5859375, + "learning_rate": 3.1772785011114645e-05, + "loss": 0.6541, + "step": 54450 + }, + { + "epoch": 17.307081613210542, + "grad_norm": 23444.791015625, + "learning_rate": 3.173308986979994e-05, + "loss": 0.6491, + "step": 54500 + }, + { + "epoch": 17.322959669736424, + "grad_norm": 23441.435546875, + "learning_rate": 3.169339472848524e-05, + "loss": 0.6495, + "step": 54550 + }, + { + "epoch": 17.338837726262305, + "grad_norm": 23380.716796875, + "learning_rate": 3.165369958717053e-05, + "loss": 0.6504, + "step": 54600 + }, + { + "epoch": 17.354715782788187, + "grad_norm": 23601.03125, + "learning_rate": 3.1614004445855826e-05, + "loss": 0.6543, + "step": 54650 + }, + { + "epoch": 17.37059383931407, + "grad_norm": 23646.830078125, + "learning_rate": 3.1574309304541126e-05, + "loss": 0.6514, + "step": 54700 + }, + { + "epoch": 17.38647189583995, + "grad_norm": 23490.85546875, + "learning_rate": 3.153461416322642e-05, + "loss": 0.6513, + "step": 54750 + }, + { + "epoch": 17.40234995236583, + "grad_norm": 23827.6796875, + "learning_rate": 3.149491902191172e-05, + "loss": 0.6484, + "step": 54800 + }, + { + "epoch": 17.418228008891713, + "grad_norm": 23096.40625, + "learning_rate": 3.145522388059702e-05, + "loss": 0.6483, + "step": 54850 + }, + { + "epoch": 17.43410606541759, + "grad_norm": 23743.8359375, + "learning_rate": 3.1415528739282314e-05, + "loss": 0.652, + "step": 54900 + }, + { + "epoch": 17.449984121943473, + "grad_norm": 23075.265625, + "learning_rate": 3.137583359796761e-05, + "loss": 0.6501, + "step": 54950 + }, + { + "epoch": 17.465862178469354, + "grad_norm": 24035.71875, + "learning_rate": 3.133613845665291e-05, + "loss": 0.652, + "step": 55000 + }, + { + "epoch": 17.481740234995236, + "grad_norm": 23361.478515625, + "learning_rate": 3.12964433153382e-05, + "loss": 0.6536, + "step": 55050 + }, + { + "epoch": 17.497618291521118, + "grad_norm": 23745.171875, + "learning_rate": 3.12567481740235e-05, + "loss": 0.6504, + "step": 55100 + }, + { + "epoch": 17.513496348047, + "grad_norm": 23789.912109375, + "learning_rate": 3.12170530327088e-05, + "loss": 0.6491, + "step": 55150 + }, + { + "epoch": 17.52937440457288, + "grad_norm": 23335.9609375, + "learning_rate": 3.1177357891394095e-05, + "loss": 0.6503, + "step": 55200 + }, + { + "epoch": 17.545252461098762, + "grad_norm": 23689.12890625, + "learning_rate": 3.1137662750079396e-05, + "loss": 0.6527, + "step": 55250 + }, + { + "epoch": 17.561130517624644, + "grad_norm": 23400.572265625, + "learning_rate": 3.109796760876469e-05, + "loss": 0.6519, + "step": 55300 + }, + { + "epoch": 17.577008574150526, + "grad_norm": 23527.71875, + "learning_rate": 3.105827246744998e-05, + "loss": 0.6494, + "step": 55350 + }, + { + "epoch": 17.592886630676404, + "grad_norm": 23462.595703125, + "learning_rate": 3.101857732613528e-05, + "loss": 0.6507, + "step": 55400 + }, + { + "epoch": 17.608764687202285, + "grad_norm": 23832.5625, + "learning_rate": 3.0978882184820576e-05, + "loss": 0.648, + "step": 55450 + }, + { + "epoch": 17.624642743728167, + "grad_norm": 23171.41796875, + "learning_rate": 3.093918704350588e-05, + "loss": 0.6515, + "step": 55500 + }, + { + "epoch": 17.64052080025405, + "grad_norm": 22859.158203125, + "learning_rate": 3.089949190219118e-05, + "loss": 0.65, + "step": 55550 + }, + { + "epoch": 17.65639885677993, + "grad_norm": 23794.48828125, + "learning_rate": 3.085979676087647e-05, + "loss": 0.6494, + "step": 55600 + }, + { + "epoch": 17.67227691330581, + "grad_norm": 23437.626953125, + "learning_rate": 3.082010161956177e-05, + "loss": 0.6507, + "step": 55650 + }, + { + "epoch": 17.688154969831693, + "grad_norm": 23454.169921875, + "learning_rate": 3.0780406478247064e-05, + "loss": 0.6528, + "step": 55700 + }, + { + "epoch": 17.704033026357575, + "grad_norm": 23537.875, + "learning_rate": 3.074071133693236e-05, + "loss": 0.6508, + "step": 55750 + }, + { + "epoch": 17.719911082883456, + "grad_norm": 23073.7421875, + "learning_rate": 3.070101619561766e-05, + "loss": 0.6514, + "step": 55800 + }, + { + "epoch": 17.735789139409338, + "grad_norm": 24199.0234375, + "learning_rate": 3.066132105430295e-05, + "loss": 0.652, + "step": 55850 + }, + { + "epoch": 17.751667195935216, + "grad_norm": 24246.693359375, + "learning_rate": 3.062162591298825e-05, + "loss": 0.6506, + "step": 55900 + }, + { + "epoch": 17.767545252461098, + "grad_norm": 23131.373046875, + "learning_rate": 3.058193077167355e-05, + "loss": 0.6473, + "step": 55950 + }, + { + "epoch": 17.78342330898698, + "grad_norm": 23967.60546875, + "learning_rate": 3.0542235630358846e-05, + "loss": 0.649, + "step": 56000 + }, + { + "epoch": 17.79930136551286, + "grad_norm": 23857.62890625, + "learning_rate": 3.050254048904414e-05, + "loss": 0.6498, + "step": 56050 + }, + { + "epoch": 17.815179422038742, + "grad_norm": 23931.333984375, + "learning_rate": 3.046284534772944e-05, + "loss": 0.651, + "step": 56100 + }, + { + "epoch": 17.831057478564624, + "grad_norm": 23383.25390625, + "learning_rate": 3.0423150206414736e-05, + "loss": 0.6526, + "step": 56150 + }, + { + "epoch": 17.846935535090505, + "grad_norm": 23175.05078125, + "learning_rate": 3.038345506510003e-05, + "loss": 0.6494, + "step": 56200 + }, + { + "epoch": 17.862813591616387, + "grad_norm": 23574.662109375, + "learning_rate": 3.034375992378533e-05, + "loss": 0.6479, + "step": 56250 + }, + { + "epoch": 17.87869164814227, + "grad_norm": 23581.66015625, + "learning_rate": 3.0304064782470627e-05, + "loss": 0.6514, + "step": 56300 + }, + { + "epoch": 17.89456970466815, + "grad_norm": 23191.83203125, + "learning_rate": 3.0264369641155928e-05, + "loss": 0.649, + "step": 56350 + }, + { + "epoch": 17.91044776119403, + "grad_norm": 23138.642578125, + "learning_rate": 3.022467449984122e-05, + "loss": 0.6523, + "step": 56400 + }, + { + "epoch": 17.92632581771991, + "grad_norm": 24247.412109375, + "learning_rate": 3.0184979358526515e-05, + "loss": 0.6508, + "step": 56450 + }, + { + "epoch": 17.94220387424579, + "grad_norm": 23328.904296875, + "learning_rate": 3.0145284217211818e-05, + "loss": 0.6521, + "step": 56500 + }, + { + "epoch": 17.958081930771673, + "grad_norm": 23195.806640625, + "learning_rate": 3.0105589075897112e-05, + "loss": 0.6519, + "step": 56550 + }, + { + "epoch": 17.973959987297555, + "grad_norm": 24111.8359375, + "learning_rate": 3.0065893934582405e-05, + "loss": 0.6504, + "step": 56600 + }, + { + "epoch": 17.989838043823436, + "grad_norm": 23394.666015625, + "learning_rate": 3.0026198793267706e-05, + "loss": 0.6499, + "step": 56650 + }, + { + "epoch": 18.005716100349318, + "grad_norm": 24451.033203125, + "learning_rate": 2.9986503651953002e-05, + "loss": 0.6493, + "step": 56700 + }, + { + "epoch": 18.0215941568752, + "grad_norm": 24162.71875, + "learning_rate": 2.9946808510638296e-05, + "loss": 0.6437, + "step": 56750 + }, + { + "epoch": 18.03747221340108, + "grad_norm": 23387.994140625, + "learning_rate": 2.9907113369323596e-05, + "loss": 0.6487, + "step": 56800 + }, + { + "epoch": 18.053350269926963, + "grad_norm": 23269.083984375, + "learning_rate": 2.9867418228008893e-05, + "loss": 0.6469, + "step": 56850 + }, + { + "epoch": 18.06922832645284, + "grad_norm": 23347.12890625, + "learning_rate": 2.9827723086694193e-05, + "loss": 0.6428, + "step": 56900 + }, + { + "epoch": 18.085106382978722, + "grad_norm": 23654.724609375, + "learning_rate": 2.9788027945379487e-05, + "loss": 0.6485, + "step": 56950 + }, + { + "epoch": 18.100984439504604, + "grad_norm": 23451.236328125, + "learning_rate": 2.9748332804064784e-05, + "loss": 0.6458, + "step": 57000 + }, + { + "epoch": 18.116862496030485, + "grad_norm": 24515.484375, + "learning_rate": 2.9708637662750084e-05, + "loss": 0.6451, + "step": 57050 + }, + { + "epoch": 18.132740552556367, + "grad_norm": 23294.755859375, + "learning_rate": 2.9668942521435378e-05, + "loss": 0.6473, + "step": 57100 + }, + { + "epoch": 18.14861860908225, + "grad_norm": 24213.658203125, + "learning_rate": 2.962924738012067e-05, + "loss": 0.6435, + "step": 57150 + }, + { + "epoch": 18.16449666560813, + "grad_norm": 23097.380859375, + "learning_rate": 2.958955223880597e-05, + "loss": 0.6448, + "step": 57200 + }, + { + "epoch": 18.18037472213401, + "grad_norm": 23379.626953125, + "learning_rate": 2.954985709749127e-05, + "loss": 0.6455, + "step": 57250 + }, + { + "epoch": 18.196252778659893, + "grad_norm": 23119.544921875, + "learning_rate": 2.9510161956176562e-05, + "loss": 0.6435, + "step": 57300 + }, + { + "epoch": 18.212130835185775, + "grad_norm": 23745.896484375, + "learning_rate": 2.9470466814861862e-05, + "loss": 0.6469, + "step": 57350 + }, + { + "epoch": 18.228008891711653, + "grad_norm": 23996.830078125, + "learning_rate": 2.943077167354716e-05, + "loss": 0.6471, + "step": 57400 + }, + { + "epoch": 18.243886948237535, + "grad_norm": 23478.310546875, + "learning_rate": 2.939107653223246e-05, + "loss": 0.6435, + "step": 57450 + }, + { + "epoch": 18.259765004763416, + "grad_norm": 23451.203125, + "learning_rate": 2.9351381390917753e-05, + "loss": 0.6466, + "step": 57500 + }, + { + "epoch": 18.275643061289298, + "grad_norm": 23465.240234375, + "learning_rate": 2.931168624960305e-05, + "loss": 0.6473, + "step": 57550 + }, + { + "epoch": 18.29152111781518, + "grad_norm": 23575.08203125, + "learning_rate": 2.927199110828835e-05, + "loss": 0.6465, + "step": 57600 + }, + { + "epoch": 18.30739917434106, + "grad_norm": 23251.138671875, + "learning_rate": 2.9232295966973644e-05, + "loss": 0.6434, + "step": 57650 + }, + { + "epoch": 18.323277230866942, + "grad_norm": 23791.912109375, + "learning_rate": 2.919260082565894e-05, + "loss": 0.6462, + "step": 57700 + }, + { + "epoch": 18.339155287392824, + "grad_norm": 23962.904296875, + "learning_rate": 2.915290568434424e-05, + "loss": 0.6464, + "step": 57750 + }, + { + "epoch": 18.355033343918706, + "grad_norm": 23311.55078125, + "learning_rate": 2.9113210543029534e-05, + "loss": 0.6432, + "step": 57800 + }, + { + "epoch": 18.370911400444587, + "grad_norm": 23348.361328125, + "learning_rate": 2.9073515401714828e-05, + "loss": 0.6439, + "step": 57850 + }, + { + "epoch": 18.386789456970465, + "grad_norm": 23554.818359375, + "learning_rate": 2.9033820260400128e-05, + "loss": 0.6468, + "step": 57900 + }, + { + "epoch": 18.402667513496347, + "grad_norm": 23781.22265625, + "learning_rate": 2.8994125119085425e-05, + "loss": 0.6444, + "step": 57950 + }, + { + "epoch": 18.41854557002223, + "grad_norm": 22692.29296875, + "learning_rate": 2.895442997777072e-05, + "loss": 0.6442, + "step": 58000 + }, + { + "epoch": 18.43442362654811, + "grad_norm": 23058.107421875, + "learning_rate": 2.891473483645602e-05, + "loss": 0.6439, + "step": 58050 + }, + { + "epoch": 18.45030168307399, + "grad_norm": 22859.33203125, + "learning_rate": 2.8875039695141316e-05, + "loss": 0.645, + "step": 58100 + }, + { + "epoch": 18.466179739599873, + "grad_norm": 23191.587890625, + "learning_rate": 2.8835344553826616e-05, + "loss": 0.6449, + "step": 58150 + }, + { + "epoch": 18.482057796125755, + "grad_norm": 23647.953125, + "learning_rate": 2.879564941251191e-05, + "loss": 0.6466, + "step": 58200 + }, + { + "epoch": 18.497935852651636, + "grad_norm": 23992.3828125, + "learning_rate": 2.8755954271197207e-05, + "loss": 0.6473, + "step": 58250 + }, + { + "epoch": 18.513813909177518, + "grad_norm": 23166.861328125, + "learning_rate": 2.8716259129882507e-05, + "loss": 0.6451, + "step": 58300 + }, + { + "epoch": 18.5296919657034, + "grad_norm": 23637.072265625, + "learning_rate": 2.86765639885678e-05, + "loss": 0.6481, + "step": 58350 + }, + { + "epoch": 18.545570022229278, + "grad_norm": 23371.7265625, + "learning_rate": 2.8636868847253094e-05, + "loss": 0.6406, + "step": 58400 + }, + { + "epoch": 18.56144807875516, + "grad_norm": 23572.712890625, + "learning_rate": 2.8597173705938398e-05, + "loss": 0.6437, + "step": 58450 + }, + { + "epoch": 18.57732613528104, + "grad_norm": 23404.9765625, + "learning_rate": 2.855747856462369e-05, + "loss": 0.6467, + "step": 58500 + }, + { + "epoch": 18.593204191806922, + "grad_norm": 23365.392578125, + "learning_rate": 2.8517783423308985e-05, + "loss": 0.6464, + "step": 58550 + }, + { + "epoch": 18.609082248332804, + "grad_norm": 22837.333984375, + "learning_rate": 2.8478088281994285e-05, + "loss": 0.6476, + "step": 58600 + }, + { + "epoch": 18.624960304858686, + "grad_norm": 23265.06640625, + "learning_rate": 2.8438393140679582e-05, + "loss": 0.6445, + "step": 58650 + }, + { + "epoch": 18.640838361384567, + "grad_norm": 23657.7265625, + "learning_rate": 2.8398697999364882e-05, + "loss": 0.6487, + "step": 58700 + }, + { + "epoch": 18.65671641791045, + "grad_norm": 23775.322265625, + "learning_rate": 2.8359002858050176e-05, + "loss": 0.6461, + "step": 58750 + }, + { + "epoch": 18.67259447443633, + "grad_norm": 23216.8359375, + "learning_rate": 2.8319307716735472e-05, + "loss": 0.6445, + "step": 58800 + }, + { + "epoch": 18.68847253096221, + "grad_norm": 23683.677734375, + "learning_rate": 2.8279612575420773e-05, + "loss": 0.6464, + "step": 58850 + }, + { + "epoch": 18.70435058748809, + "grad_norm": 24212.357421875, + "learning_rate": 2.8239917434106066e-05, + "loss": 0.6458, + "step": 58900 + }, + { + "epoch": 18.72022864401397, + "grad_norm": 23845.314453125, + "learning_rate": 2.8200222292791363e-05, + "loss": 0.6434, + "step": 58950 + }, + { + "epoch": 18.736106700539853, + "grad_norm": 23429.07421875, + "learning_rate": 2.8160527151476663e-05, + "loss": 0.6454, + "step": 59000 + }, + { + "epoch": 18.751984757065735, + "grad_norm": 23693.30078125, + "learning_rate": 2.8120832010161957e-05, + "loss": 0.6449, + "step": 59050 + }, + { + "epoch": 18.767862813591616, + "grad_norm": 24399.693359375, + "learning_rate": 2.808113686884725e-05, + "loss": 0.6442, + "step": 59100 + }, + { + "epoch": 18.783740870117498, + "grad_norm": 23742.171875, + "learning_rate": 2.8041441727532554e-05, + "loss": 0.645, + "step": 59150 + }, + { + "epoch": 18.79961892664338, + "grad_norm": 23861.95703125, + "learning_rate": 2.8001746586217848e-05, + "loss": 0.6476, + "step": 59200 + }, + { + "epoch": 18.81549698316926, + "grad_norm": 23694.40625, + "learning_rate": 2.7962051444903148e-05, + "loss": 0.6453, + "step": 59250 + }, + { + "epoch": 18.831375039695143, + "grad_norm": 22883.59375, + "learning_rate": 2.792235630358844e-05, + "loss": 0.6441, + "step": 59300 + }, + { + "epoch": 18.847253096221024, + "grad_norm": 24172.53125, + "learning_rate": 2.788266116227374e-05, + "loss": 0.643, + "step": 59350 + }, + { + "epoch": 18.863131152746902, + "grad_norm": 23304.0859375, + "learning_rate": 2.784296602095904e-05, + "loss": 0.6468, + "step": 59400 + }, + { + "epoch": 18.879009209272784, + "grad_norm": 23574.00390625, + "learning_rate": 2.7803270879644332e-05, + "loss": 0.6473, + "step": 59450 + }, + { + "epoch": 18.894887265798666, + "grad_norm": 23528.5859375, + "learning_rate": 2.776357573832963e-05, + "loss": 0.6455, + "step": 59500 + }, + { + "epoch": 18.910765322324547, + "grad_norm": 23860.66015625, + "learning_rate": 2.772388059701493e-05, + "loss": 0.647, + "step": 59550 + }, + { + "epoch": 18.92664337885043, + "grad_norm": 23734.267578125, + "learning_rate": 2.7684185455700223e-05, + "loss": 0.6422, + "step": 59600 + }, + { + "epoch": 18.94252143537631, + "grad_norm": 23822.1875, + "learning_rate": 2.764449031438552e-05, + "loss": 0.643, + "step": 59650 + }, + { + "epoch": 18.958399491902192, + "grad_norm": 22949.701171875, + "learning_rate": 2.760479517307082e-05, + "loss": 0.6441, + "step": 59700 + }, + { + "epoch": 18.974277548428073, + "grad_norm": 23238.943359375, + "learning_rate": 2.7565100031756114e-05, + "loss": 0.6449, + "step": 59750 + }, + { + "epoch": 18.990155604953955, + "grad_norm": 23398.4140625, + "learning_rate": 2.7525404890441407e-05, + "loss": 0.6429, + "step": 59800 + }, + { + "epoch": 19.006033661479837, + "grad_norm": 23431.22265625, + "learning_rate": 2.7485709749126707e-05, + "loss": 0.6427, + "step": 59850 + }, + { + "epoch": 19.021911718005715, + "grad_norm": 23309.41796875, + "learning_rate": 2.7446014607812004e-05, + "loss": 0.6407, + "step": 59900 + }, + { + "epoch": 19.037789774531596, + "grad_norm": 23369.134765625, + "learning_rate": 2.7406319466497305e-05, + "loss": 0.6337, + "step": 59950 + }, + { + "epoch": 19.053667831057478, + "grad_norm": 23830.84375, + "learning_rate": 2.7366624325182598e-05, + "loss": 0.6405, + "step": 60000 + }, + { + "epoch": 19.06954588758336, + "grad_norm": 23755.240234375, + "learning_rate": 2.7326929183867895e-05, + "loss": 0.6404, + "step": 60050 + }, + { + "epoch": 19.08542394410924, + "grad_norm": 23902.658203125, + "learning_rate": 2.7287234042553195e-05, + "loss": 0.639, + "step": 60100 + }, + { + "epoch": 19.101302000635123, + "grad_norm": 23243.98828125, + "learning_rate": 2.724753890123849e-05, + "loss": 0.6352, + "step": 60150 + }, + { + "epoch": 19.117180057161004, + "grad_norm": 23601.951171875, + "learning_rate": 2.7207843759923786e-05, + "loss": 0.6374, + "step": 60200 + }, + { + "epoch": 19.133058113686886, + "grad_norm": 23469.58203125, + "learning_rate": 2.7168148618609086e-05, + "loss": 0.6391, + "step": 60250 + }, + { + "epoch": 19.148936170212767, + "grad_norm": 24173.52734375, + "learning_rate": 2.712845347729438e-05, + "loss": 0.6387, + "step": 60300 + }, + { + "epoch": 19.164814226738645, + "grad_norm": 23875.79296875, + "learning_rate": 2.7088758335979677e-05, + "loss": 0.6399, + "step": 60350 + }, + { + "epoch": 19.180692283264527, + "grad_norm": 23351.52734375, + "learning_rate": 2.7049063194664977e-05, + "loss": 0.6418, + "step": 60400 + }, + { + "epoch": 19.19657033979041, + "grad_norm": 24030.84765625, + "learning_rate": 2.700936805335027e-05, + "loss": 0.6417, + "step": 60450 + }, + { + "epoch": 19.21244839631629, + "grad_norm": 23502.681640625, + "learning_rate": 2.696967291203557e-05, + "loss": 0.6419, + "step": 60500 + }, + { + "epoch": 19.228326452842172, + "grad_norm": 23972.9609375, + "learning_rate": 2.6929977770720864e-05, + "loss": 0.6385, + "step": 60550 + }, + { + "epoch": 19.244204509368053, + "grad_norm": 23450.828125, + "learning_rate": 2.689028262940616e-05, + "loss": 0.6383, + "step": 60600 + }, + { + "epoch": 19.260082565893935, + "grad_norm": 23348.533203125, + "learning_rate": 2.685058748809146e-05, + "loss": 0.6438, + "step": 60650 + }, + { + "epoch": 19.275960622419817, + "grad_norm": 23530.572265625, + "learning_rate": 2.6810892346776755e-05, + "loss": 0.642, + "step": 60700 + }, + { + "epoch": 19.291838678945698, + "grad_norm": 23539.4375, + "learning_rate": 2.6771197205462052e-05, + "loss": 0.6403, + "step": 60750 + }, + { + "epoch": 19.30771673547158, + "grad_norm": 23497.1015625, + "learning_rate": 2.6731502064147352e-05, + "loss": 0.6384, + "step": 60800 + }, + { + "epoch": 19.323594791997458, + "grad_norm": 23695.337890625, + "learning_rate": 2.6691806922832646e-05, + "loss": 0.6428, + "step": 60850 + }, + { + "epoch": 19.33947284852334, + "grad_norm": 23628.791015625, + "learning_rate": 2.6652111781517942e-05, + "loss": 0.6371, + "step": 60900 + }, + { + "epoch": 19.35535090504922, + "grad_norm": 23643.2421875, + "learning_rate": 2.6612416640203243e-05, + "loss": 0.643, + "step": 60950 + }, + { + "epoch": 19.371228961575103, + "grad_norm": 23056.48828125, + "learning_rate": 2.6572721498888536e-05, + "loss": 0.6387, + "step": 61000 + }, + { + "epoch": 19.387107018100984, + "grad_norm": 23687.0859375, + "learning_rate": 2.6533026357573837e-05, + "loss": 0.64, + "step": 61050 + }, + { + "epoch": 19.402985074626866, + "grad_norm": 23476.853515625, + "learning_rate": 2.6493331216259133e-05, + "loss": 0.6406, + "step": 61100 + }, + { + "epoch": 19.418863131152747, + "grad_norm": 24086.623046875, + "learning_rate": 2.6453636074944427e-05, + "loss": 0.6428, + "step": 61150 + }, + { + "epoch": 19.43474118767863, + "grad_norm": 23353.08984375, + "learning_rate": 2.6413940933629727e-05, + "loss": 0.6435, + "step": 61200 + }, + { + "epoch": 19.45061924420451, + "grad_norm": 23448.341796875, + "learning_rate": 2.637424579231502e-05, + "loss": 0.6423, + "step": 61250 + }, + { + "epoch": 19.466497300730392, + "grad_norm": 23803.828125, + "learning_rate": 2.6334550651000318e-05, + "loss": 0.6419, + "step": 61300 + }, + { + "epoch": 19.48237535725627, + "grad_norm": 24309.66796875, + "learning_rate": 2.6294855509685618e-05, + "loss": 0.6434, + "step": 61350 + }, + { + "epoch": 19.49825341378215, + "grad_norm": 23195.96875, + "learning_rate": 2.625516036837091e-05, + "loss": 0.6378, + "step": 61400 + }, + { + "epoch": 19.514131470308033, + "grad_norm": 23841.748046875, + "learning_rate": 2.621546522705621e-05, + "loss": 0.6397, + "step": 61450 + }, + { + "epoch": 19.530009526833915, + "grad_norm": 23373.90625, + "learning_rate": 2.617577008574151e-05, + "loss": 0.6405, + "step": 61500 + }, + { + "epoch": 19.545887583359796, + "grad_norm": 23817.623046875, + "learning_rate": 2.6136074944426802e-05, + "loss": 0.6402, + "step": 61550 + }, + { + "epoch": 19.561765639885678, + "grad_norm": 23955.10546875, + "learning_rate": 2.60963798031121e-05, + "loss": 0.6394, + "step": 61600 + }, + { + "epoch": 19.57764369641156, + "grad_norm": 23063.484375, + "learning_rate": 2.60566846617974e-05, + "loss": 0.6426, + "step": 61650 + }, + { + "epoch": 19.59352175293744, + "grad_norm": 23891.98046875, + "learning_rate": 2.6016989520482693e-05, + "loss": 0.6411, + "step": 61700 + }, + { + "epoch": 19.609399809463323, + "grad_norm": 23992.01171875, + "learning_rate": 2.5977294379167993e-05, + "loss": 0.6414, + "step": 61750 + }, + { + "epoch": 19.625277865989204, + "grad_norm": 23503.697265625, + "learning_rate": 2.5937599237853287e-05, + "loss": 0.6375, + "step": 61800 + }, + { + "epoch": 19.641155922515082, + "grad_norm": 24320.890625, + "learning_rate": 2.5897904096538584e-05, + "loss": 0.6395, + "step": 61850 + }, + { + "epoch": 19.657033979040964, + "grad_norm": 23124.4375, + "learning_rate": 2.5858208955223884e-05, + "loss": 0.6425, + "step": 61900 + }, + { + "epoch": 19.672912035566846, + "grad_norm": 24162.05078125, + "learning_rate": 2.5818513813909177e-05, + "loss": 0.6399, + "step": 61950 + }, + { + "epoch": 19.688790092092727, + "grad_norm": 23305.732421875, + "learning_rate": 2.5778818672594474e-05, + "loss": 0.6375, + "step": 62000 + }, + { + "epoch": 19.70466814861861, + "grad_norm": 23611.771484375, + "learning_rate": 2.5739123531279775e-05, + "loss": 0.6402, + "step": 62050 + }, + { + "epoch": 19.72054620514449, + "grad_norm": 23453.798828125, + "learning_rate": 2.5699428389965068e-05, + "loss": 0.6365, + "step": 62100 + }, + { + "epoch": 19.736424261670372, + "grad_norm": 23632.29296875, + "learning_rate": 2.5659733248650365e-05, + "loss": 0.6387, + "step": 62150 + }, + { + "epoch": 19.752302318196254, + "grad_norm": 23645.380859375, + "learning_rate": 2.5620038107335665e-05, + "loss": 0.642, + "step": 62200 + }, + { + "epoch": 19.768180374722135, + "grad_norm": 23833.201171875, + "learning_rate": 2.558034296602096e-05, + "loss": 0.6401, + "step": 62250 + }, + { + "epoch": 19.784058431248017, + "grad_norm": 23208.59765625, + "learning_rate": 2.554064782470626e-05, + "loss": 0.6398, + "step": 62300 + }, + { + "epoch": 19.799936487773895, + "grad_norm": 23972.072265625, + "learning_rate": 2.5500952683391556e-05, + "loss": 0.6371, + "step": 62350 + }, + { + "epoch": 19.815814544299776, + "grad_norm": 23224.896484375, + "learning_rate": 2.546125754207685e-05, + "loss": 0.6391, + "step": 62400 + }, + { + "epoch": 19.831692600825658, + "grad_norm": 24514.93359375, + "learning_rate": 2.542156240076215e-05, + "loss": 0.6403, + "step": 62450 + }, + { + "epoch": 19.84757065735154, + "grad_norm": 23860.24609375, + "learning_rate": 2.5381867259447443e-05, + "loss": 0.6409, + "step": 62500 + }, + { + "epoch": 19.86344871387742, + "grad_norm": 23272.37890625, + "learning_rate": 2.534217211813274e-05, + "loss": 0.6391, + "step": 62550 + }, + { + "epoch": 19.879326770403303, + "grad_norm": 23465.796875, + "learning_rate": 2.530247697681804e-05, + "loss": 0.6403, + "step": 62600 + }, + { + "epoch": 19.895204826929184, + "grad_norm": 23239.646484375, + "learning_rate": 2.5262781835503334e-05, + "loss": 0.641, + "step": 62650 + }, + { + "epoch": 19.911082883455066, + "grad_norm": 23455.052734375, + "learning_rate": 2.522308669418863e-05, + "loss": 0.641, + "step": 62700 + }, + { + "epoch": 19.926960939980948, + "grad_norm": 23780.283203125, + "learning_rate": 2.518339155287393e-05, + "loss": 0.6403, + "step": 62750 + }, + { + "epoch": 19.94283899650683, + "grad_norm": 24348.8515625, + "learning_rate": 2.5143696411559225e-05, + "loss": 0.638, + "step": 62800 + }, + { + "epoch": 19.958717053032707, + "grad_norm": 23591.212890625, + "learning_rate": 2.5104001270244525e-05, + "loss": 0.6367, + "step": 62850 + }, + { + "epoch": 19.97459510955859, + "grad_norm": 23728.669921875, + "learning_rate": 2.5064306128929822e-05, + "loss": 0.6397, + "step": 62900 + }, + { + "epoch": 19.99047316608447, + "grad_norm": 23320.609375, + "learning_rate": 2.5024610987615116e-05, + "loss": 0.6411, + "step": 62950 + }, + { + "epoch": 20.0, + "step": 62980, + "total_flos": 2.10597197119488e+18, + "train_loss": 0.33622724912249125, + "train_runtime": 72693.7372, + "train_samples_per_second": 55.437, + "train_steps_per_second": 0.866 + } + ], + "logging_steps": 50, + "max_steps": 62980, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.10597197119488e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}