[ { "loss": 4.6142, "grad_norm": 33.659332275390625, "learning_rate": 1.8e-05, "epoch": 0.006297229219143577, "step": 10 }, { "loss": 2.6482, "grad_norm": 14.412582397460938, "learning_rate": 3.8e-05, "epoch": 0.012594458438287154, "step": 20 }, { "loss": 2.2011, "grad_norm": 9.777717590332031, "learning_rate": 5.8e-05, "epoch": 0.018891687657430732, "step": 30 }, { "loss": 1.9662, "grad_norm": 17.46149253845215, "learning_rate": 7.800000000000001e-05, "epoch": 0.02518891687657431, "step": 40 }, { "loss": 1.8247, "grad_norm": 9.683545112609863, "learning_rate": 9.8e-05, "epoch": 0.031486146095717885, "step": 50 }, { "loss": 1.8919, "grad_norm": 10.557292938232422, "learning_rate": 0.000118, "epoch": 0.037783375314861464, "step": 60 }, { "loss": 1.9162, "grad_norm": 11.902077674865723, "learning_rate": 0.000138, "epoch": 0.04408060453400504, "step": 70 }, { "eval_loss": 1.7838011980056763, "eval_runtime": 33.8157, "eval_samples_per_second": 15.348, "eval_steps_per_second": 3.844, "epoch": 0.04974811083123426, "step": 79 }, { "loss": 1.8701, "grad_norm": 8.326041221618652, "learning_rate": 0.00015800000000000002, "epoch": 0.05037783375314862, "step": 80 }, { "loss": 1.7494, "grad_norm": 8.273054122924805, "learning_rate": 0.00017800000000000002, "epoch": 0.05667506297229219, "step": 90 }, { "loss": 1.7557, "grad_norm": 10.647675514221191, "learning_rate": 0.00019800000000000002, "epoch": 0.06297229219143577, "step": 100 }, { "loss": 1.7535, "grad_norm": 6.604005813598633, "learning_rate": 0.0001999819475629623, "epoch": 0.06926952141057935, "step": 110 }, { "loss": 1.7317, "grad_norm": 6.132753372192383, "learning_rate": 0.00019991955244474347, "epoch": 0.07556675062972293, "step": 120 }, { "loss": 1.6196, "grad_norm": 5.765186786651611, "learning_rate": 0.0001998126195808803, "epoch": 0.0818639798488665, "step": 130 }, { "loss": 1.5384, "grad_norm": 5.294787883758545, "learning_rate": 0.00019966119663520412, "epoch": 0.08816120906801007, "step": 140 }, { "loss": 1.7253, "grad_norm": 8.26528549194336, "learning_rate": 0.00019946535110237904, "epoch": 0.09445843828715365, "step": 150 }, { "eval_loss": 1.7441585063934326, "eval_runtime": 14.4715, "eval_samples_per_second": 35.864, "eval_steps_per_second": 8.983, "epoch": 0.09949622166246852, "step": 158 }, { "loss": 1.7126, "grad_norm": 5.565060138702393, "learning_rate": 0.00019922517027781746, "epoch": 0.10075566750629723, "step": 160 }, { "loss": 1.8884, "grad_norm": 6.4396867752075195, "learning_rate": 0.000198940761218769, "epoch": 0.1070528967254408, "step": 170 }, { "loss": 1.7091, "grad_norm": 5.561753273010254, "learning_rate": 0.0001986122506966014, "epoch": 0.11335012594458438, "step": 180 }, { "loss": 1.7571, "grad_norm": 4.833555221557617, "learning_rate": 0.000198239785140294, "epoch": 0.11964735516372796, "step": 190 }, { "loss": 1.6005, "grad_norm": 5.930907249450684, "learning_rate": 0.000197823530571169, "epoch": 0.12594458438287154, "step": 200 }, { "loss": 1.7721, "grad_norm": 5.551156520843506, "learning_rate": 0.00019736367252888974, "epoch": 0.13224181360201512, "step": 210 }, { "loss": 1.6479, "grad_norm": 5.802966117858887, "learning_rate": 0.000196860415988759, "epoch": 0.1385390428211587, "step": 220 }, { "loss": 1.6939, "grad_norm": 5.381985664367676, "learning_rate": 0.00019631398527035422, "epoch": 0.14483627204030228, "step": 230 }, { "eval_loss": 1.6404942274093628, "eval_runtime": 14.4847, "eval_samples_per_second": 35.831, "eval_steps_per_second": 8.975, "epoch": 0.14924433249370278, "step": 237 }, { "loss": 1.6613, "grad_norm": 5.851009368896484, "learning_rate": 0.00019572462393754005, "epoch": 0.15113350125944586, "step": 240 }, { "loss": 1.6204, "grad_norm": 5.324190139770508, "learning_rate": 0.00019509259468990323, "epoch": 0.1574307304785894, "step": 250 }, { "loss": 1.5024, "grad_norm": 6.643704891204834, "learning_rate": 0.00019441817924565786, "epoch": 0.163727959697733, "step": 260 }, { "loss": 1.7425, "grad_norm": 5.175152778625488, "learning_rate": 0.00019370167821607353, "epoch": 0.17002518891687657, "step": 270 }, { "loss": 1.6053, "grad_norm": 5.162317276000977, "learning_rate": 0.00019294341097148194, "epoch": 0.17632241813602015, "step": 280 }, { "loss": 1.6262, "grad_norm": 4.8840179443359375, "learning_rate": 0.0001921437154989221, "epoch": 0.18261964735516373, "step": 290 }, { "loss": 1.6046, "grad_norm": 5.020822048187256, "learning_rate": 0.0001913029482514873, "epoch": 0.1889168765743073, "step": 300 }, { "loss": 1.6283, "grad_norm": 4.799527645111084, "learning_rate": 0.00019042148398944116, "epoch": 0.1952141057934509, "step": 310 }, { "eval_loss": 1.6082123517990112, "eval_runtime": 14.6548, "eval_samples_per_second": 35.415, "eval_steps_per_second": 8.871, "epoch": 0.19899244332493704, "step": 316 }, { "loss": 1.6391, "grad_norm": 5.666315078735352, "learning_rate": 0.0001894997156131734, "epoch": 0.20151133501259447, "step": 320 }, { "loss": 1.6637, "grad_norm": 5.880155086517334, "learning_rate": 0.00018853805398807004, "epoch": 0.20780856423173805, "step": 330 }, { "loss": 1.444, "grad_norm": 5.846175193786621, "learning_rate": 0.00018753692776137584, "epoch": 0.2141057934508816, "step": 340 }, { "loss": 1.625, "grad_norm": 5.362205982208252, "learning_rate": 0.00018649678317113084, "epoch": 0.22040302267002518, "step": 350 }, { "loss": 1.6577, "grad_norm": 4.623821258544922, "learning_rate": 0.0001854180838472659, "epoch": 0.22670025188916876, "step": 360 }, { "loss": 1.5298, "grad_norm": 4.931164741516113, "learning_rate": 0.00018430131060494616, "epoch": 0.23299748110831234, "step": 370 }, { "loss": 1.5142, "grad_norm": 5.050358295440674, "learning_rate": 0.00018314696123025454, "epoch": 0.23929471032745592, "step": 380 }, { "loss": 1.5357, "grad_norm": 4.571812629699707, "learning_rate": 0.00018195555025831022, "epoch": 0.2455919395465995, "step": 390 }, { "eval_loss": 1.5576220750808716, "eval_runtime": 14.7082, "eval_samples_per_second": 35.286, "eval_steps_per_second": 8.839, "epoch": 0.24874055415617127, "step": 395 }, { "loss": 1.5179, "grad_norm": 4.708597183227539, "learning_rate": 0.00018072760874392215, "epoch": 0.2518891687657431, "step": 400 }, { "loss": 1.5781, "grad_norm": 4.262819766998291, "learning_rate": 0.00017946368402487845, "epoch": 0.25818639798488663, "step": 410 }, { "loss": 1.4886, "grad_norm": 5.044946193695068, "learning_rate": 0.00017816433947797858, "epoch": 0.26448362720403024, "step": 420 }, { "loss": 1.4781, "grad_norm": 5.743882656097412, "learning_rate": 0.00017683015426791566, "epoch": 0.2707808564231738, "step": 430 }, { "loss": 1.5219, "grad_norm": 5.393696308135986, "learning_rate": 0.00017546172308912213, "epoch": 0.2770780856423174, "step": 440 }, { "loss": 1.5132, "grad_norm": 7.644551753997803, "learning_rate": 0.0001740596559006929, "epoch": 0.28337531486146095, "step": 450 }, { "loss": 1.4883, "grad_norm": 5.462601661682129, "learning_rate": 0.00017262457765450457, "epoch": 0.28967254408060455, "step": 460 }, { "loss": 1.5252, "grad_norm": 4.247793197631836, "learning_rate": 0.000171157128016652, "epoch": 0.2959697732997481, "step": 470 }, { "eval_loss": 1.5793730020523071, "eval_runtime": 14.5182, "eval_samples_per_second": 35.748, "eval_steps_per_second": 8.954, "epoch": 0.29848866498740556, "step": 474 }, { "loss": 1.6033, "grad_norm": 6.310419082641602, "learning_rate": 0.00016965796108232598, "epoch": 0.3022670025188917, "step": 480 }, { "loss": 1.6718, "grad_norm": 5.100507736206055, "learning_rate": 0.00016812774508425933, "epoch": 0.30856423173803527, "step": 490 }, { "loss": 1.5078, "grad_norm": 4.1262946128845215, "learning_rate": 0.00016656716209487174, "epoch": 0.3148614609571788, "step": 500 }, { "loss": 1.4854, "grad_norm": 5.230551719665527, "learning_rate": 0.00016497690772224522, "epoch": 0.3211586901763224, "step": 510 }, { "loss": 1.5091, "grad_norm": 5.111631393432617, "learning_rate": 0.00016335769080006674, "epoch": 0.327455919395466, "step": 520 }, { "loss": 1.5427, "grad_norm": 4.551891803741455, "learning_rate": 0.00016171023307167545, "epoch": 0.3337531486146096, "step": 530 }, { "loss": 1.5839, "grad_norm": 4.708298683166504, "learning_rate": 0.00016003526886835553, "epoch": 0.34005037783375314, "step": 540 }, { "loss": 1.4655, "grad_norm": 4.417643070220947, "learning_rate": 0.00015833354478201844, "epoch": 0.34634760705289674, "step": 550 }, { "eval_loss": 1.5285385847091675, "eval_runtime": 14.6291, "eval_samples_per_second": 35.477, "eval_steps_per_second": 8.886, "epoch": 0.3482367758186398, "step": 553 }, { "loss": 1.5909, "grad_norm": 5.512932300567627, "learning_rate": 0.00015660581933241993, "epoch": 0.3526448362720403, "step": 560 }, { "loss": 1.5667, "grad_norm": 5.218059539794922, "learning_rate": 0.00015485286262906044, "epoch": 0.3589420654911839, "step": 570 }, { "loss": 1.584, "grad_norm": 5.250601291656494, "learning_rate": 0.00015307545602791955, "epoch": 0.36523929471032746, "step": 580 }, { "loss": 1.5065, "grad_norm": 5.601760387420654, "learning_rate": 0.00015127439178317745, "epoch": 0.371536523929471, "step": 590 }, { "loss": 1.4472, "grad_norm": 4.8602986335754395, "learning_rate": 0.00014945047269407876, "epoch": 0.3778337531486146, "step": 600 }, { "loss": 1.5242, "grad_norm": 3.2672348022460938, "learning_rate": 0.0001476045117470958, "epoch": 0.38413098236775817, "step": 610 }, { "loss": 1.4048, "grad_norm": 4.925168037414551, "learning_rate": 0.0001457373317535515, "epoch": 0.3904282115869018, "step": 620 }, { "loss": 1.4479, "grad_norm": 5.590626239776611, "learning_rate": 0.00014384976498286252, "epoch": 0.3967254408060453, "step": 630 }, { "eval_loss": 1.489644169807434, "eval_runtime": 14.4216, "eval_samples_per_second": 35.988, "eval_steps_per_second": 9.014, "epoch": 0.3979848866498741, "step": 632 }, { "loss": 1.4313, "grad_norm": 5.052401065826416, "learning_rate": 0.00014194265279156675, "epoch": 0.40302267002518893, "step": 640 }, { "loss": 1.5194, "grad_norm": 3.9724292755126953, "learning_rate": 0.00014001684524830057, "epoch": 0.4093198992443325, "step": 650 }, { "loss": 1.4166, "grad_norm": 4.177152156829834, "learning_rate": 0.0001380732007548925, "epoch": 0.4156171284634761, "step": 660 }, { "loss": 1.5784, "grad_norm": 4.706490516662598, "learning_rate": 0.00013611258566374257, "epoch": 0.42191435768261965, "step": 670 }, { "loss": 1.3634, "grad_norm": 4.3738017082214355, "learning_rate": 0.00013413587389165784, "epoch": 0.4282115869017632, "step": 680 }, { "loss": 1.5313, "grad_norm": 3.9545352458953857, "learning_rate": 0.00013214394653031616, "epoch": 0.4345088161209068, "step": 690 }, { "loss": 1.3985, "grad_norm": 4.201825141906738, "learning_rate": 0.000130137691453532, "epoch": 0.44080604534005036, "step": 700 }, { "loss": 1.5721, "grad_norm": 5.4546966552734375, "learning_rate": 0.0001281180029214988, "epoch": 0.44710327455919396, "step": 710 }, { "eval_loss": 1.4889687299728394, "eval_runtime": 14.577, "eval_samples_per_second": 35.604, "eval_steps_per_second": 8.918, "epoch": 0.4477329974811083, "step": 711 }, { "loss": 1.4477, "grad_norm": 5.406968116760254, "learning_rate": 0.00012608578118218535, "epoch": 0.4534005037783375, "step": 720 }, { "loss": 1.4616, "grad_norm": 4.538527965545654, "learning_rate": 0.00012404193207006246, "epoch": 0.4596977329974811, "step": 730 }, { "loss": 1.5133, "grad_norm": 4.29402494430542, "learning_rate": 0.00012198736660234009, "epoch": 0.4659949622166247, "step": 740 }, { "loss": 1.4501, "grad_norm": 3.8199307918548584, "learning_rate": 0.0001199230005728942, "epoch": 0.4722921914357683, "step": 750 }, { "loss": 1.5288, "grad_norm": 6.067654132843018, "learning_rate": 0.00011784975414406426, "epoch": 0.47858942065491183, "step": 760 }, { "loss": 1.3803, "grad_norm": 5.480033874511719, "learning_rate": 0.00011576855143650371, "epoch": 0.48488664987405544, "step": 770 }, { "loss": 1.5517, "grad_norm": 4.182514190673828, "learning_rate": 0.00011368032011726635, "epoch": 0.491183879093199, "step": 780 }, { "loss": 1.5284, "grad_norm": 4.433547496795654, "learning_rate": 0.00011158599098631137, "epoch": 0.49748110831234255, "step": 790 }, { "eval_loss": 1.4723758697509766, "eval_runtime": 14.538, "eval_samples_per_second": 35.7, "eval_steps_per_second": 8.942, "epoch": 0.49748110831234255, "step": 790 }, { "loss": 1.5369, "grad_norm": 4.8588433265686035, "learning_rate": 0.00010948649756161246, "epoch": 0.5037783375314862, "step": 800 }, { "loss": 1.4349, "grad_norm": 4.88569450378418, "learning_rate": 0.00010738277566305514, "epoch": 0.5100755667506297, "step": 810 }, { "loss": 1.6126, "grad_norm": 5.098734378814697, "learning_rate": 0.0001052757629953082, "epoch": 0.5163727959697733, "step": 820 }, { "loss": 1.5043, "grad_norm": 4.012162208557129, "learning_rate": 0.00010316639872985472, "epoch": 0.5226700251889169, "step": 830 }, { "loss": 1.4438, "grad_norm": 4.813101768493652, "learning_rate": 0.00010105562308636973, "epoch": 0.5289672544080605, "step": 840 }, { "loss": 1.434, "grad_norm": 3.8592841625213623, "learning_rate": 9.894437691363028e-05, "epoch": 0.535264483627204, "step": 850 }, { "loss": 1.4978, "grad_norm": 5.364985942840576, "learning_rate": 9.683360127014529e-05, "epoch": 0.5415617128463476, "step": 860 }, { "eval_loss": 1.4382336139678955, "eval_runtime": 14.714, "eval_samples_per_second": 35.272, "eval_steps_per_second": 8.835, "epoch": 0.5472292191435768, "step": 869 }, { "loss": 1.4885, "grad_norm": 5.363029479980469, "learning_rate": 9.47242370046918e-05, "epoch": 0.5478589420654912, "step": 870 }, { "loss": 1.5044, "grad_norm": 4.980364799499512, "learning_rate": 9.261722433694485e-05, "epoch": 0.5541561712846348, "step": 880 }, { "loss": 1.353, "grad_norm": 4.4146952629089355, "learning_rate": 9.051350243838756e-05, "epoch": 0.5604534005037783, "step": 890 }, { "loss": 1.3262, "grad_norm": 5.358567237854004, "learning_rate": 8.841400901368864e-05, "epoch": 0.5667506297229219, "step": 900 }, { "loss": 1.5514, "grad_norm": 4.715535640716553, "learning_rate": 8.631967988273366e-05, "epoch": 0.5730478589420654, "step": 910 }, { "loss": 1.4533, "grad_norm": 4.502087116241455, "learning_rate": 8.423144856349631e-05, "epoch": 0.5793450881612091, "step": 920 }, { "loss": 1.4378, "grad_norm": 4.85370397567749, "learning_rate": 8.215024585593577e-05, "epoch": 0.5856423173803527, "step": 930 }, { "loss": 1.4769, "grad_norm": 4.3678297996521, "learning_rate": 8.007699942710581e-05, "epoch": 0.5919395465994962, "step": 940 }, { "eval_loss": 1.4315046072006226, "eval_runtime": 14.4016, "eval_samples_per_second": 36.038, "eval_steps_per_second": 9.027, "epoch": 0.5969773299748111, "step": 948 }, { "loss": 1.3736, "grad_norm": 4.551608085632324, "learning_rate": 7.801263339765994e-05, "epoch": 0.5982367758186398, "step": 950 }, { "loss": 1.4156, "grad_norm": 3.7702224254608154, "learning_rate": 7.595806792993759e-05, "epoch": 0.6045340050377834, "step": 960 }, { "loss": 1.3623, "grad_norm": 4.440454483032227, "learning_rate": 7.391421881781468e-05, "epoch": 0.610831234256927, "step": 970 }, { "loss": 1.38, "grad_norm": 4.850752830505371, "learning_rate": 7.188199707850122e-05, "epoch": 0.6171284634760705, "step": 980 }, { "loss": 1.3101, "grad_norm": 3.9209938049316406, "learning_rate": 6.986230854646804e-05, "epoch": 0.6234256926952141, "step": 990 }, { "loss": 1.3677, "grad_norm": 4.586505889892578, "learning_rate": 6.785605346968386e-05, "epoch": 0.6297229219143576, "step": 1000 }, { "loss": 1.4159, "grad_norm": 4.723811149597168, "learning_rate": 6.586412610834221e-05, "epoch": 0.6360201511335013, "step": 1010 }, { "loss": 1.3878, "grad_norm": 4.9766154289245605, "learning_rate": 6.388741433625746e-05, "epoch": 0.6423173803526449, "step": 1020 }, { "eval_loss": 1.4128155708312988, "eval_runtime": 14.5331, "eval_samples_per_second": 35.712, "eval_steps_per_second": 8.945, "epoch": 0.6467254408060453, "step": 1027 }, { "loss": 1.3711, "grad_norm": 4.173240661621094, "learning_rate": 6.192679924510752e-05, "epoch": 0.6486146095717884, "step": 1030 }, { "loss": 1.408, "grad_norm": 5.662805557250977, "learning_rate": 5.998315475169942e-05, "epoch": 0.654911838790932, "step": 1040 }, { "loss": 1.4101, "grad_norm": 4.8551716804504395, "learning_rate": 5.8057347208433266e-05, "epoch": 0.6612090680100756, "step": 1050 }, { "loss": 1.2795, "grad_norm": 4.329833030700684, "learning_rate": 5.615023501713752e-05, "epoch": 0.6675062972292192, "step": 1060 }, { "loss": 1.4149, "grad_norm": 3.759012460708618, "learning_rate": 5.4262668246448475e-05, "epoch": 0.6738035264483627, "step": 1070 }, { "loss": 1.3728, "grad_norm": 4.901814937591553, "learning_rate": 5.239548825290419e-05, "epoch": 0.6801007556675063, "step": 1080 }, { "loss": 1.4085, "grad_norm": 4.735474109649658, "learning_rate": 5.0549527305921265e-05, "epoch": 0.6863979848866498, "step": 1090 }, { "loss": 1.3691, "grad_norm": 3.6761767864227295, "learning_rate": 4.872560821682256e-05, "epoch": 0.6926952141057935, "step": 1100 }, { "eval_loss": 1.4010512828826904, "eval_runtime": 14.5256, "eval_samples_per_second": 35.73, "eval_steps_per_second": 8.95, "epoch": 0.6964735516372796, "step": 1106 }, { "loss": 1.3057, "grad_norm": 4.493793964385986, "learning_rate": 4.692454397208048e-05, "epoch": 0.698992443324937, "step": 1110 }, { "loss": 1.3721, "grad_norm": 5.266867160797119, "learning_rate": 4.5147137370939595e-05, "epoch": 0.7052896725440806, "step": 1120 }, { "loss": 1.4906, "grad_norm": 4.867866039276123, "learning_rate": 4.339418066758008e-05, "epoch": 0.7115869017632241, "step": 1130 }, { "loss": 1.4071, "grad_norm": 4.608040809631348, "learning_rate": 4.166645521798157e-05, "epoch": 0.7178841309823678, "step": 1140 }, { "loss": 1.3585, "grad_norm": 7.612720012664795, "learning_rate": 3.99647311316445e-05, "epoch": 0.7241813602015114, "step": 1150 }, { "loss": 1.4131, "grad_norm": 5.223732948303223, "learning_rate": 3.828976692832458e-05, "epoch": 0.7304785894206549, "step": 1160 }, { "loss": 1.3566, "grad_norm": 4.113969802856445, "learning_rate": 3.6642309199933275e-05, "epoch": 0.7367758186397985, "step": 1170 }, { "loss": 1.2864, "grad_norm": 3.8852932453155518, "learning_rate": 3.502309227775482e-05, "epoch": 0.743073047858942, "step": 1180 }, { "eval_loss": 1.391667366027832, "eval_runtime": 14.5167, "eval_samples_per_second": 35.752, "eval_steps_per_second": 8.955, "epoch": 0.7462216624685138, "step": 1185 }, { "loss": 1.416, "grad_norm": 4.738650798797607, "learning_rate": 3.343283790512829e-05, "epoch": 0.7493702770780857, "step": 1190 }, { "loss": 1.4496, "grad_norm": 4.808592319488525, "learning_rate": 3.1872254915740686e-05, "epoch": 0.7556675062972292, "step": 1200 }, { "loss": 1.3199, "grad_norm": 5.310007095336914, "learning_rate": 3.0342038917674064e-05, "epoch": 0.7619647355163728, "step": 1210 }, { "loss": 1.4116, "grad_norm": 3.6367766857147217, "learning_rate": 2.8842871983347998e-05, "epoch": 0.7682619647355163, "step": 1220 }, { "loss": 1.4192, "grad_norm": 4.783074378967285, "learning_rate": 2.7375422345495448e-05, "epoch": 0.77455919395466, "step": 1230 }, { "loss": 1.294, "grad_norm": 5.352371692657471, "learning_rate": 2.5940344099307146e-05, "epoch": 0.7808564231738035, "step": 1240 }, { "loss": 1.493, "grad_norm": 4.887784957885742, "learning_rate": 2.45382769108779e-05, "epoch": 0.7871536523929471, "step": 1250 }, { "loss": 1.4063, "grad_norm": 5.015107154846191, "learning_rate": 2.3169845732084338e-05, "epoch": 0.7934508816120907, "step": 1260 }, { "eval_loss": 1.3790984153747559, "eval_runtime": 14.5192, "eval_samples_per_second": 35.746, "eval_steps_per_second": 8.954, "epoch": 0.7959697732997482, "step": 1264 }, { "loss": 1.4125, "grad_norm": 4.3769917488098145, "learning_rate": 2.1835660522021417e-05, "epoch": 0.7997481108312342, "step": 1270 }, { "loss": 1.3623, "grad_norm": 4.965677261352539, "learning_rate": 2.0536315975121544e-05, "epoch": 0.8060453400503779, "step": 1280 }, { "loss": 1.3702, "grad_norm": 5.39815616607666, "learning_rate": 1.927239125607788e-05, "epoch": 0.8123425692695214, "step": 1290 }, { "loss": 1.3878, "grad_norm": 4.974949836730957, "learning_rate": 1.8044449741689794e-05, "epoch": 0.818639798488665, "step": 1300 }, { "loss": 1.4957, "grad_norm": 4.583181381225586, "learning_rate": 1.6853038769745467e-05, "epoch": 0.8249370277078085, "step": 1310 }, { "loss": 1.4018, "grad_norm": 5.503086566925049, "learning_rate": 1.5698689395053835e-05, "epoch": 0.8312342569269522, "step": 1320 }, { "loss": 1.2619, "grad_norm": 4.569085121154785, "learning_rate": 1.4581916152734132e-05, "epoch": 0.8375314861460957, "step": 1330 }, { "loss": 1.4125, "grad_norm": 5.070247173309326, "learning_rate": 1.3503216828869192e-05, "epoch": 0.8438287153652393, "step": 1340 }, { "eval_loss": 1.3746401071548462, "eval_runtime": 14.702, "eval_samples_per_second": 35.301, "eval_steps_per_second": 8.842, "epoch": 0.8457178841309824, "step": 1343 }, { "loss": 1.405, "grad_norm": 4.505115032196045, "learning_rate": 1.2463072238624185e-05, "epoch": 0.8501259445843828, "step": 1350 }, { "loss": 1.2935, "grad_norm": 3.9875564575195312, "learning_rate": 1.146194601192998e-05, "epoch": 0.8564231738035264, "step": 1360 }, { "loss": 1.3752, "grad_norm": 4.175727844238281, "learning_rate": 1.0500284386826597e-05, "epoch": 0.8627204030226701, "step": 1370 }, { "loss": 1.4002, "grad_norm": 3.8710429668426514, "learning_rate": 9.578516010558847e-06, "epoch": 0.8690176322418136, "step": 1380 }, { "loss": 1.2572, "grad_norm": 39.77766418457031, "learning_rate": 8.697051748512708e-06, "epoch": 0.8753148614609572, "step": 1390 }, { "loss": 1.3341, "grad_norm": 5.350057601928711, "learning_rate": 7.856284501077926e-06, "epoch": 0.8816120906801007, "step": 1400 }, { "loss": 1.3181, "grad_norm": 4.475919246673584, "learning_rate": 7.056589028518068e-06, "epoch": 0.8879093198992444, "step": 1410 }, { "loss": 1.3217, "grad_norm": 4.369590759277344, "learning_rate": 6.2983217839264756e-06, "epoch": 0.8942065491183879, "step": 1420 }, { "eval_loss": 1.37122642993927, "eval_runtime": 14.501, "eval_samples_per_second": 35.791, "eval_steps_per_second": 8.965, "epoch": 0.8954659949622166, "step": 1422 }, { "loss": 1.3551, "grad_norm": 4.054989337921143, "learning_rate": 5.581820754342137e-06, "epoch": 0.9005037783375315, "step": 1430 }, { "loss": 1.2635, "grad_norm": 4.094259738922119, "learning_rate": 4.907405310096769e-06, "epoch": 0.906801007556675, "step": 1440 }, { "loss": 1.3872, "grad_norm": 4.884560585021973, "learning_rate": 4.275376062459946e-06, "epoch": 0.9130982367758187, "step": 1450 }, { "loss": 1.2567, "grad_norm": 4.7786641120910645, "learning_rate": 3.6860147296457816e-06, "epoch": 0.9193954659949622, "step": 1460 }, { "loss": 1.4019, "grad_norm": 4.289524078369141, "learning_rate": 3.1395840112410034e-06, "epoch": 0.9256926952141058, "step": 1470 }, { "loss": 1.3127, "grad_norm": 4.1973490715026855, "learning_rate": 2.636327471110289e-06, "epoch": 0.9319899244332494, "step": 1480 }, { "loss": 1.2502, "grad_norm": 4.9555206298828125, "learning_rate": 2.1764694288310184e-06, "epoch": 0.9382871536523929, "step": 1490 }, { "loss": 1.3839, "grad_norm": 5.079888343811035, "learning_rate": 1.7602148597060108e-06, "epoch": 0.9445843828715366, "step": 1500 }, { "eval_loss": 1.3689830303192139, "eval_runtime": 14.5025, "eval_samples_per_second": 35.787, "eval_steps_per_second": 8.964, "epoch": 0.9452141057934509, "step": 1501 }, { "loss": 1.4754, "grad_norm": 4.302427768707275, "learning_rate": 1.3877493033986223e-06, "epoch": 0.9508816120906801, "step": 1510 }, { "loss": 1.3091, "grad_norm": 4.7720866203308105, "learning_rate": 1.0592387812310311e-06, "epoch": 0.9571788413098237, "step": 1520 }, { "loss": 1.2316, "grad_norm": 4.849659442901611, "learning_rate": 7.748297221825573e-07, "epoch": 0.9634760705289672, "step": 1530 }, { "loss": 1.2809, "grad_norm": 4.5142621994018555, "learning_rate": 5.346488976209596e-07, "epoch": 0.9697732997481109, "step": 1540 }, { "loss": 1.2613, "grad_norm": 4.6826019287109375, "learning_rate": 3.3880336479590325e-07, "epoch": 0.9760705289672544, "step": 1550 }, { "loss": 1.3195, "grad_norm": 4.848607540130615, "learning_rate": 1.8738041911969818e-07, "epoch": 0.982367758186398, "step": 1560 }, { "loss": 1.3067, "grad_norm": 4.836756706237793, "learning_rate": 8.044755525652425e-08, "epoch": 0.9886649874055415, "step": 1570 }, { "loss": 1.3373, "grad_norm": 4.4072771072387695, "learning_rate": 1.8052437037707758e-08, "epoch": 0.9949622166246851, "step": 1580 }, { "eval_loss": 1.3691413402557373, "eval_runtime": 14.5693, "eval_samples_per_second": 35.623, "eval_steps_per_second": 8.923, "epoch": 0.9949622166246851, "step": 1580 }, { "train_runtime": 1573.3242, "train_samples_per_second": 16.149, "train_steps_per_second": 1.009, "total_flos": 1.159913454336e+16, "train_loss": 1.5156250816748784, "epoch": 1.0, "step": 1588 } ]