{ "best_global_step": 21795, "best_metric": 0.9681435371658733, "best_model_checkpoint": "malwi_models/results/checkpoint-21795", "epoch": 3.0, "eval_steps": 500, "global_step": 21795, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013764624913971094, "grad_norm": 4.472999572753906, "learning_rate": 9e-07, "loss": 0.7091, "step": 10 }, { "epoch": 0.0027529249827942187, "grad_norm": 3.841139316558838, "learning_rate": 1.9e-06, "loss": 0.6386, "step": 20 }, { "epoch": 0.0041293874741913286, "grad_norm": 3.104367971420288, "learning_rate": 2.9e-06, "loss": 0.4986, "step": 30 }, { "epoch": 0.0055058499655884375, "grad_norm": 2.127228260040283, "learning_rate": 3.9e-06, "loss": 0.302, "step": 40 }, { "epoch": 0.006882312456985547, "grad_norm": 1.8276289701461792, "learning_rate": 4.9000000000000005e-06, "loss": 0.2328, "step": 50 }, { "epoch": 0.008258774948382657, "grad_norm": 0.9121363759040833, "learning_rate": 5.9e-06, "loss": 0.1883, "step": 60 }, { "epoch": 0.009635237439779766, "grad_norm": 2.601842164993286, "learning_rate": 6.900000000000001e-06, "loss": 0.1924, "step": 70 }, { "epoch": 0.011011699931176875, "grad_norm": 1.700925350189209, "learning_rate": 7.9e-06, "loss": 0.1481, "step": 80 }, { "epoch": 0.012388162422573986, "grad_norm": 0.5809162855148315, "learning_rate": 8.9e-06, "loss": 0.139, "step": 90 }, { "epoch": 0.013764624913971095, "grad_norm": 1.9961662292480469, "learning_rate": 9.900000000000002e-06, "loss": 0.1277, "step": 100 }, { "epoch": 0.015141087405368204, "grad_norm": 7.916194438934326, "learning_rate": 1.09e-05, "loss": 0.1527, "step": 110 }, { "epoch": 0.016517549896765314, "grad_norm": 0.7270988821983337, "learning_rate": 1.19e-05, "loss": 0.0871, "step": 120 }, { "epoch": 0.017894012388162423, "grad_norm": 0.2748268246650696, "learning_rate": 1.29e-05, "loss": 0.0584, "step": 130 }, { "epoch": 0.019270474879559532, "grad_norm": 0.3067634105682373, "learning_rate": 1.3900000000000002e-05, "loss": 0.0515, "step": 140 }, { "epoch": 0.02064693737095664, "grad_norm": 0.22013206779956818, "learning_rate": 1.49e-05, "loss": 0.0732, "step": 150 }, { "epoch": 0.02202339986235375, "grad_norm": 0.22196079790592194, "learning_rate": 1.59e-05, "loss": 0.0954, "step": 160 }, { "epoch": 0.02339986235375086, "grad_norm": 5.057211399078369, "learning_rate": 1.69e-05, "loss": 0.0619, "step": 170 }, { "epoch": 0.02477632484514797, "grad_norm": 140.93508911132812, "learning_rate": 1.79e-05, "loss": 0.016, "step": 180 }, { "epoch": 0.02615278733654508, "grad_norm": 0.12292534112930298, "learning_rate": 1.8900000000000002e-05, "loss": 0.047, "step": 190 }, { "epoch": 0.02752924982794219, "grad_norm": 0.1022370308637619, "learning_rate": 1.9900000000000003e-05, "loss": 0.0626, "step": 200 }, { "epoch": 0.028905712319339298, "grad_norm": 0.10958222299814224, "learning_rate": 2.09e-05, "loss": 0.0232, "step": 210 }, { "epoch": 0.030282174810736407, "grad_norm": 7.240455627441406, "learning_rate": 2.19e-05, "loss": 0.0821, "step": 220 }, { "epoch": 0.031658637302133516, "grad_norm": 0.21588343381881714, "learning_rate": 2.29e-05, "loss": 0.0766, "step": 230 }, { "epoch": 0.03303509979353063, "grad_norm": 0.11172044277191162, "learning_rate": 2.39e-05, "loss": 0.0226, "step": 240 }, { "epoch": 0.034411562284927734, "grad_norm": 0.07041200995445251, "learning_rate": 2.4900000000000002e-05, "loss": 0.0449, "step": 250 }, { "epoch": 0.035788024776324846, "grad_norm": 5.989948272705078, "learning_rate": 2.5900000000000003e-05, "loss": 0.0978, "step": 260 }, { "epoch": 0.03716448726772195, "grad_norm": 0.09964669495820999, "learning_rate": 2.6900000000000003e-05, "loss": 0.0412, "step": 270 }, { "epoch": 0.038540949759119064, "grad_norm": 2.302783966064453, "learning_rate": 2.7900000000000004e-05, "loss": 0.0298, "step": 280 }, { "epoch": 0.03991741225051618, "grad_norm": 0.07127115875482559, "learning_rate": 2.8899999999999998e-05, "loss": 0.0053, "step": 290 }, { "epoch": 0.04129387474191328, "grad_norm": 0.03989144042134285, "learning_rate": 2.9900000000000002e-05, "loss": 0.0023, "step": 300 }, { "epoch": 0.042670337233310394, "grad_norm": 130.65762329101562, "learning_rate": 3.09e-05, "loss": 0.2129, "step": 310 }, { "epoch": 0.0440467997247075, "grad_norm": 0.0915679782629013, "learning_rate": 3.19e-05, "loss": 0.076, "step": 320 }, { "epoch": 0.04542326221610461, "grad_norm": 2.284477710723877, "learning_rate": 3.29e-05, "loss": 0.0463, "step": 330 }, { "epoch": 0.04679972470750172, "grad_norm": 0.09934666752815247, "learning_rate": 3.3900000000000004e-05, "loss": 0.0389, "step": 340 }, { "epoch": 0.04817618719889883, "grad_norm": 0.056454822421073914, "learning_rate": 3.49e-05, "loss": 0.1168, "step": 350 }, { "epoch": 0.04955264969029594, "grad_norm": 0.032067857682704926, "learning_rate": 3.59e-05, "loss": 0.0272, "step": 360 }, { "epoch": 0.05092911218169305, "grad_norm": 0.029113123193383217, "learning_rate": 3.69e-05, "loss": 0.0115, "step": 370 }, { "epoch": 0.05230557467309016, "grad_norm": 0.03807977959513664, "learning_rate": 3.79e-05, "loss": 0.003, "step": 380 }, { "epoch": 0.053682037164487266, "grad_norm": 0.020103249698877335, "learning_rate": 3.8900000000000004e-05, "loss": 0.0015, "step": 390 }, { "epoch": 0.05505849965588438, "grad_norm": 0.29497984051704407, "learning_rate": 3.99e-05, "loss": 0.0981, "step": 400 }, { "epoch": 0.056434962147281484, "grad_norm": 0.055862635374069214, "learning_rate": 4.09e-05, "loss": 0.0845, "step": 410 }, { "epoch": 0.057811424638678596, "grad_norm": 0.32200124859809875, "learning_rate": 4.19e-05, "loss": 0.1012, "step": 420 }, { "epoch": 0.05918788713007571, "grad_norm": 0.05350476875901222, "learning_rate": 4.29e-05, "loss": 0.0042, "step": 430 }, { "epoch": 0.060564349621472814, "grad_norm": 0.0278279110789299, "learning_rate": 4.39e-05, "loss": 0.0646, "step": 440 }, { "epoch": 0.06194081211286993, "grad_norm": 0.024784889072179794, "learning_rate": 4.49e-05, "loss": 0.0015, "step": 450 }, { "epoch": 0.06331727460426703, "grad_norm": 0.030709804967045784, "learning_rate": 4.5900000000000004e-05, "loss": 0.0359, "step": 460 }, { "epoch": 0.06469373709566414, "grad_norm": 0.18985888361930847, "learning_rate": 4.69e-05, "loss": 0.0456, "step": 470 }, { "epoch": 0.06607019958706126, "grad_norm": 4.588707447052002, "learning_rate": 4.79e-05, "loss": 0.0968, "step": 480 }, { "epoch": 0.06744666207845836, "grad_norm": 0.20252466201782227, "learning_rate": 4.89e-05, "loss": 0.0661, "step": 490 }, { "epoch": 0.06882312456985547, "grad_norm": 1.984708309173584, "learning_rate": 4.99e-05, "loss": 0.063, "step": 500 }, { "epoch": 0.07019958706125258, "grad_norm": 0.18678295612335205, "learning_rate": 4.997886827893872e-05, "loss": 0.0336, "step": 510 }, { "epoch": 0.07157604955264969, "grad_norm": 0.5696492791175842, "learning_rate": 4.995538858887063e-05, "loss": 0.078, "step": 520 }, { "epoch": 0.0729525120440468, "grad_norm": 0.047999169677495956, "learning_rate": 4.9931908898802534e-05, "loss": 0.0164, "step": 530 }, { "epoch": 0.0743289745354439, "grad_norm": 0.08052933216094971, "learning_rate": 4.990842920873445e-05, "loss": 0.0447, "step": 540 }, { "epoch": 0.07570543702684102, "grad_norm": 0.03917187824845314, "learning_rate": 4.9884949518666356e-05, "loss": 0.0023, "step": 550 }, { "epoch": 0.07708189951823813, "grad_norm": 0.066179059445858, "learning_rate": 4.986146982859827e-05, "loss": 0.0457, "step": 560 }, { "epoch": 0.07845836200963524, "grad_norm": 0.04286588728427887, "learning_rate": 4.983799013853017e-05, "loss": 0.07, "step": 570 }, { "epoch": 0.07983482450103235, "grad_norm": 0.2532537281513214, "learning_rate": 4.9814510448462085e-05, "loss": 0.0781, "step": 580 }, { "epoch": 0.08121128699242945, "grad_norm": 0.11782566457986832, "learning_rate": 4.979103075839399e-05, "loss": 0.0428, "step": 590 }, { "epoch": 0.08258774948382656, "grad_norm": 0.08183860778808594, "learning_rate": 4.97675510683259e-05, "loss": 0.0758, "step": 600 }, { "epoch": 0.08396421197522368, "grad_norm": 0.41337740421295166, "learning_rate": 4.9744071378257814e-05, "loss": 0.0713, "step": 610 }, { "epoch": 0.08534067446662079, "grad_norm": 0.055460382252931595, "learning_rate": 4.9720591688189714e-05, "loss": 0.0061, "step": 620 }, { "epoch": 0.08671713695801789, "grad_norm": 0.024634292349219322, "learning_rate": 4.969711199812163e-05, "loss": 0.0015, "step": 630 }, { "epoch": 0.088093599449415, "grad_norm": 0.03227999433875084, "learning_rate": 4.9673632308053536e-05, "loss": 0.0428, "step": 640 }, { "epoch": 0.08947006194081211, "grad_norm": 0.045679036527872086, "learning_rate": 4.965015261798544e-05, "loss": 0.028, "step": 650 }, { "epoch": 0.09084652443220922, "grad_norm": 0.3809230327606201, "learning_rate": 4.962667292791736e-05, "loss": 0.0097, "step": 660 }, { "epoch": 0.09222298692360634, "grad_norm": 0.018441978842020035, "learning_rate": 4.960319323784926e-05, "loss": 0.0013, "step": 670 }, { "epoch": 0.09359944941500344, "grad_norm": 0.05591975897550583, "learning_rate": 4.957971354778117e-05, "loss": 0.0381, "step": 680 }, { "epoch": 0.09497591190640055, "grad_norm": 3.629845142364502, "learning_rate": 4.955623385771308e-05, "loss": 0.0263, "step": 690 }, { "epoch": 0.09635237439779766, "grad_norm": 0.05440714582800865, "learning_rate": 4.953275416764499e-05, "loss": 0.0474, "step": 700 }, { "epoch": 0.09772883688919477, "grad_norm": 0.06909291446208954, "learning_rate": 4.95092744775769e-05, "loss": 0.0267, "step": 710 }, { "epoch": 0.09910529938059189, "grad_norm": 0.2693035304546356, "learning_rate": 4.94857947875088e-05, "loss": 0.0552, "step": 720 }, { "epoch": 0.10048176187198898, "grad_norm": 0.01661653071641922, "learning_rate": 4.9462315097440715e-05, "loss": 0.054, "step": 730 }, { "epoch": 0.1018582243633861, "grad_norm": 0.0165869127959013, "learning_rate": 4.943883540737262e-05, "loss": 0.004, "step": 740 }, { "epoch": 0.10323468685478321, "grad_norm": 0.023554543033242226, "learning_rate": 4.941535571730454e-05, "loss": 0.0292, "step": 750 }, { "epoch": 0.10461114934618032, "grad_norm": 0.107005774974823, "learning_rate": 4.9391876027236444e-05, "loss": 0.0178, "step": 760 }, { "epoch": 0.10598761183757742, "grad_norm": 0.01732088252902031, "learning_rate": 4.936839633716835e-05, "loss": 0.0098, "step": 770 }, { "epoch": 0.10736407432897453, "grad_norm": 0.0111131202429533, "learning_rate": 4.934491664710026e-05, "loss": 0.0218, "step": 780 }, { "epoch": 0.10874053682037164, "grad_norm": 2.034214496612549, "learning_rate": 4.9321436957032166e-05, "loss": 0.0229, "step": 790 }, { "epoch": 0.11011699931176876, "grad_norm": 0.02205265313386917, "learning_rate": 4.929795726696408e-05, "loss": 0.0331, "step": 800 }, { "epoch": 0.11149346180316587, "grad_norm": 0.1099054366350174, "learning_rate": 4.927447757689599e-05, "loss": 0.0638, "step": 810 }, { "epoch": 0.11286992429456297, "grad_norm": 0.02967405691742897, "learning_rate": 4.92509978868279e-05, "loss": 0.0398, "step": 820 }, { "epoch": 0.11424638678596008, "grad_norm": 0.01985442079603672, "learning_rate": 4.92275181967598e-05, "loss": 0.0103, "step": 830 }, { "epoch": 0.11562284927735719, "grad_norm": 0.1577325165271759, "learning_rate": 4.920403850669171e-05, "loss": 0.0396, "step": 840 }, { "epoch": 0.1169993117687543, "grad_norm": 0.05393123999238014, "learning_rate": 4.9180558816623624e-05, "loss": 0.0042, "step": 850 }, { "epoch": 0.11837577426015142, "grad_norm": 0.05982143059372902, "learning_rate": 4.915707912655553e-05, "loss": 0.0604, "step": 860 }, { "epoch": 0.11975223675154852, "grad_norm": 0.03662943094968796, "learning_rate": 4.9133599436487445e-05, "loss": 0.026, "step": 870 }, { "epoch": 0.12112869924294563, "grad_norm": 0.027767900377511978, "learning_rate": 4.9110119746419346e-05, "loss": 0.0209, "step": 880 }, { "epoch": 0.12250516173434274, "grad_norm": 0.010798404924571514, "learning_rate": 4.908664005635126e-05, "loss": 0.0127, "step": 890 }, { "epoch": 0.12388162422573985, "grad_norm": 0.3646028935909271, "learning_rate": 4.906316036628317e-05, "loss": 0.0271, "step": 900 }, { "epoch": 0.12525808671713695, "grad_norm": 0.29045411944389343, "learning_rate": 4.9039680676215075e-05, "loss": 0.0673, "step": 910 }, { "epoch": 0.12663454920853406, "grad_norm": 0.7097387313842773, "learning_rate": 4.901620098614699e-05, "loss": 0.0155, "step": 920 }, { "epoch": 0.12801101169993118, "grad_norm": 0.024437498301267624, "learning_rate": 4.899272129607889e-05, "loss": 0.003, "step": 930 }, { "epoch": 0.1293874741913283, "grad_norm": 0.0154951773583889, "learning_rate": 4.8969241606010804e-05, "loss": 0.002, "step": 940 }, { "epoch": 0.1307639366827254, "grad_norm": 0.019727494567632675, "learning_rate": 4.894576191594271e-05, "loss": 0.0527, "step": 950 }, { "epoch": 0.1321403991741225, "grad_norm": 0.2392633557319641, "learning_rate": 4.892228222587462e-05, "loss": 0.0888, "step": 960 }, { "epoch": 0.13351686166551963, "grad_norm": 0.16450823843479156, "learning_rate": 4.889880253580653e-05, "loss": 0.0417, "step": 970 }, { "epoch": 0.1348933241569167, "grad_norm": 0.04718532785773277, "learning_rate": 4.887532284573843e-05, "loss": 0.0036, "step": 980 }, { "epoch": 0.13626978664831382, "grad_norm": 0.06450389325618744, "learning_rate": 4.885184315567035e-05, "loss": 0.0447, "step": 990 }, { "epoch": 0.13764624913971094, "grad_norm": 0.03461958467960358, "learning_rate": 4.8828363465602255e-05, "loss": 0.0042, "step": 1000 }, { "epoch": 0.13902271163110805, "grad_norm": 0.04643983021378517, "learning_rate": 4.880488377553417e-05, "loss": 0.0866, "step": 1010 }, { "epoch": 0.14039917412250516, "grad_norm": 0.11438150703907013, "learning_rate": 4.8781404085466076e-05, "loss": 0.1191, "step": 1020 }, { "epoch": 0.14177563661390227, "grad_norm": 0.20119494199752808, "learning_rate": 4.8757924395397984e-05, "loss": 0.0391, "step": 1030 }, { "epoch": 0.14315209910529939, "grad_norm": 0.06289815902709961, "learning_rate": 4.873444470532989e-05, "loss": 0.0054, "step": 1040 }, { "epoch": 0.1445285615966965, "grad_norm": 0.029058853164315224, "learning_rate": 4.87109650152618e-05, "loss": 0.0023, "step": 1050 }, { "epoch": 0.1459050240880936, "grad_norm": 0.022046463564038277, "learning_rate": 4.868748532519371e-05, "loss": 0.0012, "step": 1060 }, { "epoch": 0.14728148657949072, "grad_norm": 0.028487863019108772, "learning_rate": 4.866400563512562e-05, "loss": 0.0816, "step": 1070 }, { "epoch": 0.1486579490708878, "grad_norm": 0.12508617341518402, "learning_rate": 4.864052594505753e-05, "loss": 0.0943, "step": 1080 }, { "epoch": 0.15003441156228492, "grad_norm": 0.06136130169034004, "learning_rate": 4.8617046254989434e-05, "loss": 0.0456, "step": 1090 }, { "epoch": 0.15141087405368203, "grad_norm": 0.09450109302997589, "learning_rate": 4.859356656492134e-05, "loss": 0.0542, "step": 1100 }, { "epoch": 0.15278733654507914, "grad_norm": 0.07500394433736801, "learning_rate": 4.8570086874853256e-05, "loss": 0.005, "step": 1110 }, { "epoch": 0.15416379903647626, "grad_norm": 0.04748724773526192, "learning_rate": 4.854660718478516e-05, "loss": 0.0034, "step": 1120 }, { "epoch": 0.15554026152787337, "grad_norm": 0.034272897988557816, "learning_rate": 4.852312749471708e-05, "loss": 0.0424, "step": 1130 }, { "epoch": 0.15691672401927048, "grad_norm": 0.07459408789873123, "learning_rate": 4.849964780464898e-05, "loss": 0.1159, "step": 1140 }, { "epoch": 0.1582931865106676, "grad_norm": 0.1717173457145691, "learning_rate": 4.847616811458089e-05, "loss": 0.0469, "step": 1150 }, { "epoch": 0.1596696490020647, "grad_norm": 0.32530370354652405, "learning_rate": 4.84526884245128e-05, "loss": 0.0301, "step": 1160 }, { "epoch": 0.1610461114934618, "grad_norm": 0.04841623455286026, "learning_rate": 4.842920873444471e-05, "loss": 0.0038, "step": 1170 }, { "epoch": 0.1624225739848589, "grad_norm": 0.030305538326501846, "learning_rate": 4.840572904437662e-05, "loss": 0.0041, "step": 1180 }, { "epoch": 0.16379903647625602, "grad_norm": 0.022904468700289726, "learning_rate": 4.838224935430852e-05, "loss": 0.0012, "step": 1190 }, { "epoch": 0.16517549896765313, "grad_norm": 0.022021977230906487, "learning_rate": 4.8358769664240436e-05, "loss": 0.0444, "step": 1200 }, { "epoch": 0.16655196145905024, "grad_norm": 0.02352530136704445, "learning_rate": 4.833528997417234e-05, "loss": 0.0013, "step": 1210 }, { "epoch": 0.16792842395044735, "grad_norm": 0.0990290492773056, "learning_rate": 4.831181028410425e-05, "loss": 0.1245, "step": 1220 }, { "epoch": 0.16930488644184447, "grad_norm": 1.9839175939559937, "learning_rate": 4.8288330594036165e-05, "loss": 0.0862, "step": 1230 }, { "epoch": 0.17068134893324158, "grad_norm": 0.21893317997455597, "learning_rate": 4.8264850903968065e-05, "loss": 0.0677, "step": 1240 }, { "epoch": 0.1720578114246387, "grad_norm": 0.10601121932268143, "learning_rate": 4.824137121389998e-05, "loss": 0.0087, "step": 1250 }, { "epoch": 0.17343427391603577, "grad_norm": 0.14532095193862915, "learning_rate": 4.8217891523831887e-05, "loss": 0.0323, "step": 1260 }, { "epoch": 0.1748107364074329, "grad_norm": 0.036326099187135696, "learning_rate": 4.8194411833763794e-05, "loss": 0.0023, "step": 1270 }, { "epoch": 0.17618719889883, "grad_norm": 0.0427783839404583, "learning_rate": 4.817093214369571e-05, "loss": 0.0488, "step": 1280 }, { "epoch": 0.1775636613902271, "grad_norm": 4.155572414398193, "learning_rate": 4.814745245362761e-05, "loss": 0.0728, "step": 1290 }, { "epoch": 0.17894012388162422, "grad_norm": 0.4664069712162018, "learning_rate": 4.812397276355952e-05, "loss": 0.0044, "step": 1300 }, { "epoch": 0.18031658637302134, "grad_norm": 0.25557464361190796, "learning_rate": 4.810049307349143e-05, "loss": 0.0019, "step": 1310 }, { "epoch": 0.18169304886441845, "grad_norm": 0.022418728098273277, "learning_rate": 4.8077013383423344e-05, "loss": 0.0232, "step": 1320 }, { "epoch": 0.18306951135581556, "grad_norm": 0.020807242020964622, "learning_rate": 4.805353369335525e-05, "loss": 0.0499, "step": 1330 }, { "epoch": 0.18444597384721267, "grad_norm": 0.0350848026573658, "learning_rate": 4.803005400328716e-05, "loss": 0.0014, "step": 1340 }, { "epoch": 0.1858224363386098, "grad_norm": 0.12035457044839859, "learning_rate": 4.8006574313219066e-05, "loss": 0.081, "step": 1350 }, { "epoch": 0.18719889883000687, "grad_norm": 0.045201949775218964, "learning_rate": 4.7983094623150974e-05, "loss": 0.0023, "step": 1360 }, { "epoch": 0.18857536132140398, "grad_norm": 0.07530751079320908, "learning_rate": 4.795961493308289e-05, "loss": 0.0738, "step": 1370 }, { "epoch": 0.1899518238128011, "grad_norm": 0.04519107565283775, "learning_rate": 4.7936135243014795e-05, "loss": 0.0046, "step": 1380 }, { "epoch": 0.1913282863041982, "grad_norm": 0.03507385030388832, "learning_rate": 4.79126555529467e-05, "loss": 0.0127, "step": 1390 }, { "epoch": 0.19270474879559532, "grad_norm": 0.028852898627519608, "learning_rate": 4.788917586287861e-05, "loss": 0.0203, "step": 1400 }, { "epoch": 0.19408121128699243, "grad_norm": 0.024331634864211082, "learning_rate": 4.786569617281052e-05, "loss": 0.0032, "step": 1410 }, { "epoch": 0.19545767377838955, "grad_norm": 0.02570798248052597, "learning_rate": 4.784221648274243e-05, "loss": 0.0459, "step": 1420 }, { "epoch": 0.19683413626978666, "grad_norm": 0.031589195132255554, "learning_rate": 4.781873679267434e-05, "loss": 0.043, "step": 1430 }, { "epoch": 0.19821059876118377, "grad_norm": 0.0531427264213562, "learning_rate": 4.779525710260625e-05, "loss": 0.0376, "step": 1440 }, { "epoch": 0.19958706125258086, "grad_norm": 0.25781574845314026, "learning_rate": 4.7771777412538153e-05, "loss": 0.0802, "step": 1450 }, { "epoch": 0.20096352374397797, "grad_norm": 0.8505409359931946, "learning_rate": 4.774829772247007e-05, "loss": 0.0752, "step": 1460 }, { "epoch": 0.20233998623537508, "grad_norm": 0.22456428408622742, "learning_rate": 4.7724818032401975e-05, "loss": 0.0254, "step": 1470 }, { "epoch": 0.2037164487267722, "grad_norm": 0.041952747851610184, "learning_rate": 4.770133834233388e-05, "loss": 0.0463, "step": 1480 }, { "epoch": 0.2050929112181693, "grad_norm": 0.1576058715581894, "learning_rate": 4.7677858652265796e-05, "loss": 0.0209, "step": 1490 }, { "epoch": 0.20646937370956642, "grad_norm": 0.02799474634230137, "learning_rate": 4.76543789621977e-05, "loss": 0.0022, "step": 1500 }, { "epoch": 0.20784583620096353, "grad_norm": 0.018294574692845345, "learning_rate": 4.763089927212961e-05, "loss": 0.0168, "step": 1510 }, { "epoch": 0.20922229869236064, "grad_norm": 0.015774909406900406, "learning_rate": 4.760741958206152e-05, "loss": 0.0018, "step": 1520 }, { "epoch": 0.21059876118375775, "grad_norm": 0.013713778927922249, "learning_rate": 4.7583939891993426e-05, "loss": 0.0065, "step": 1530 }, { "epoch": 0.21197522367515484, "grad_norm": 0.015924030914902687, "learning_rate": 4.756046020192534e-05, "loss": 0.0505, "step": 1540 }, { "epoch": 0.21335168616655195, "grad_norm": 1.7789572477340698, "learning_rate": 4.753698051185724e-05, "loss": 0.0637, "step": 1550 }, { "epoch": 0.21472814865794906, "grad_norm": 0.14179880917072296, "learning_rate": 4.7513500821789155e-05, "loss": 0.0062, "step": 1560 }, { "epoch": 0.21610461114934618, "grad_norm": 0.018526747822761536, "learning_rate": 4.749002113172106e-05, "loss": 0.0154, "step": 1570 }, { "epoch": 0.2174810736407433, "grad_norm": 0.13851454854011536, "learning_rate": 4.7466541441652976e-05, "loss": 0.0234, "step": 1580 }, { "epoch": 0.2188575361321404, "grad_norm": 0.014294624328613281, "learning_rate": 4.7443061751584884e-05, "loss": 0.0658, "step": 1590 }, { "epoch": 0.2202339986235375, "grad_norm": 0.02725391276180744, "learning_rate": 4.741958206151679e-05, "loss": 0.0177, "step": 1600 }, { "epoch": 0.22161046111493463, "grad_norm": 0.01899637281894684, "learning_rate": 4.73961023714487e-05, "loss": 0.0045, "step": 1610 }, { "epoch": 0.22298692360633174, "grad_norm": 0.019792377948760986, "learning_rate": 4.7372622681380606e-05, "loss": 0.0154, "step": 1620 }, { "epoch": 0.22436338609772882, "grad_norm": 0.014761330559849739, "learning_rate": 4.734914299131252e-05, "loss": 0.0016, "step": 1630 }, { "epoch": 0.22573984858912594, "grad_norm": 0.013016458600759506, "learning_rate": 4.732566330124443e-05, "loss": 0.0027, "step": 1640 }, { "epoch": 0.22711631108052305, "grad_norm": 0.010150584392249584, "learning_rate": 4.7302183611176334e-05, "loss": 0.0038, "step": 1650 }, { "epoch": 0.22849277357192016, "grad_norm": 0.07699078321456909, "learning_rate": 4.727870392110824e-05, "loss": 0.0039, "step": 1660 }, { "epoch": 0.22986923606331727, "grad_norm": 0.8624148368835449, "learning_rate": 4.725522423104015e-05, "loss": 0.0202, "step": 1670 }, { "epoch": 0.23124569855471439, "grad_norm": 0.008768048137426376, "learning_rate": 4.723174454097206e-05, "loss": 0.001, "step": 1680 }, { "epoch": 0.2326221610461115, "grad_norm": 0.008101776242256165, "learning_rate": 4.720826485090397e-05, "loss": 0.0022, "step": 1690 }, { "epoch": 0.2339986235375086, "grad_norm": 0.007918176241219044, "learning_rate": 4.7184785160835885e-05, "loss": 0.0015, "step": 1700 }, { "epoch": 0.23537508602890572, "grad_norm": 0.05520469322800636, "learning_rate": 4.7161305470767785e-05, "loss": 0.0016, "step": 1710 }, { "epoch": 0.23675154852030283, "grad_norm": 0.008863956667482853, "learning_rate": 4.713782578069969e-05, "loss": 0.0511, "step": 1720 }, { "epoch": 0.23812801101169992, "grad_norm": 0.09934405237436295, "learning_rate": 4.711434609063161e-05, "loss": 0.0483, "step": 1730 }, { "epoch": 0.23950447350309703, "grad_norm": 0.016283148899674416, "learning_rate": 4.7090866400563514e-05, "loss": 0.0017, "step": 1740 }, { "epoch": 0.24088093599449414, "grad_norm": 0.07908762991428375, "learning_rate": 4.706738671049543e-05, "loss": 0.0565, "step": 1750 }, { "epoch": 0.24225739848589126, "grad_norm": 0.014586333185434341, "learning_rate": 4.704390702042733e-05, "loss": 0.003, "step": 1760 }, { "epoch": 0.24363386097728837, "grad_norm": 0.12161644548177719, "learning_rate": 4.702042733035924e-05, "loss": 0.0198, "step": 1770 }, { "epoch": 0.24501032346868548, "grad_norm": 0.011741235852241516, "learning_rate": 4.699694764029115e-05, "loss": 0.0193, "step": 1780 }, { "epoch": 0.2463867859600826, "grad_norm": 0.027244795113801956, "learning_rate": 4.697346795022306e-05, "loss": 0.0772, "step": 1790 }, { "epoch": 0.2477632484514797, "grad_norm": 0.5753130316734314, "learning_rate": 4.694998826015497e-05, "loss": 0.0247, "step": 1800 }, { "epoch": 0.24913971094287682, "grad_norm": 0.140317901968956, "learning_rate": 4.692650857008687e-05, "loss": 0.0265, "step": 1810 }, { "epoch": 0.2505161734342739, "grad_norm": 0.017475415021181107, "learning_rate": 4.690302888001879e-05, "loss": 0.0156, "step": 1820 }, { "epoch": 0.251892635925671, "grad_norm": 0.013620712794363499, "learning_rate": 4.6879549189950694e-05, "loss": 0.0037, "step": 1830 }, { "epoch": 0.25326909841706813, "grad_norm": 0.020433681085705757, "learning_rate": 4.68560694998826e-05, "loss": 0.0578, "step": 1840 }, { "epoch": 0.25464556090846524, "grad_norm": 2.448824644088745, "learning_rate": 4.6832589809814515e-05, "loss": 0.0453, "step": 1850 }, { "epoch": 0.25602202339986235, "grad_norm": 0.040950000286102295, "learning_rate": 4.680911011974642e-05, "loss": 0.0042, "step": 1860 }, { "epoch": 0.25739848589125947, "grad_norm": 0.019773241132497787, "learning_rate": 4.678563042967833e-05, "loss": 0.0141, "step": 1870 }, { "epoch": 0.2587749483826566, "grad_norm": 0.16849978268146515, "learning_rate": 4.676215073961024e-05, "loss": 0.0057, "step": 1880 }, { "epoch": 0.2601514108740537, "grad_norm": 0.013958179391920567, "learning_rate": 4.673867104954215e-05, "loss": 0.0131, "step": 1890 }, { "epoch": 0.2615278733654508, "grad_norm": 0.013353968039155006, "learning_rate": 4.671519135947406e-05, "loss": 0.0017, "step": 1900 }, { "epoch": 0.2629043358568479, "grad_norm": 0.010541558265686035, "learning_rate": 4.6691711669405966e-05, "loss": 0.0005, "step": 1910 }, { "epoch": 0.264280798348245, "grad_norm": 0.010194603353738785, "learning_rate": 4.6668231979337874e-05, "loss": 0.0035, "step": 1920 }, { "epoch": 0.26565726083964214, "grad_norm": 1.8915557861328125, "learning_rate": 4.664475228926978e-05, "loss": 0.1063, "step": 1930 }, { "epoch": 0.26703372333103925, "grad_norm": 0.023809868842363358, "learning_rate": 4.6621272599201695e-05, "loss": 0.0111, "step": 1940 }, { "epoch": 0.26841018582243636, "grad_norm": 1.6938400268554688, "learning_rate": 4.65977929091336e-05, "loss": 0.0439, "step": 1950 }, { "epoch": 0.2697866483138334, "grad_norm": 0.050258684903383255, "learning_rate": 4.657431321906551e-05, "loss": 0.0208, "step": 1960 }, { "epoch": 0.27116311080523053, "grad_norm": 0.03813977167010307, "learning_rate": 4.655083352899742e-05, "loss": 0.0037, "step": 1970 }, { "epoch": 0.27253957329662765, "grad_norm": 0.4461005628108978, "learning_rate": 4.6527353838929325e-05, "loss": 0.0123, "step": 1980 }, { "epoch": 0.27391603578802476, "grad_norm": 0.11771722137928009, "learning_rate": 4.650387414886124e-05, "loss": 0.0018, "step": 1990 }, { "epoch": 0.27529249827942187, "grad_norm": 1.7108595371246338, "learning_rate": 4.6480394458793146e-05, "loss": 0.1124, "step": 2000 }, { "epoch": 0.276668960770819, "grad_norm": 0.05135238543152809, "learning_rate": 4.645691476872506e-05, "loss": 0.0072, "step": 2010 }, { "epoch": 0.2780454232622161, "grad_norm": 0.06425310671329498, "learning_rate": 4.643343507865696e-05, "loss": 0.0876, "step": 2020 }, { "epoch": 0.2794218857536132, "grad_norm": 0.052194226533174515, "learning_rate": 4.6409955388588875e-05, "loss": 0.003, "step": 2030 }, { "epoch": 0.2807983482450103, "grad_norm": 0.035610832273960114, "learning_rate": 4.638647569852078e-05, "loss": 0.0089, "step": 2040 }, { "epoch": 0.28217481073640743, "grad_norm": 0.04461894929409027, "learning_rate": 4.636299600845269e-05, "loss": 0.049, "step": 2050 }, { "epoch": 0.28355127322780455, "grad_norm": 0.06284139305353165, "learning_rate": 4.6339516318384604e-05, "loss": 0.0408, "step": 2060 }, { "epoch": 0.28492773571920166, "grad_norm": 0.7144775390625, "learning_rate": 4.6316036628316504e-05, "loss": 0.0487, "step": 2070 }, { "epoch": 0.28630419821059877, "grad_norm": 0.29227763414382935, "learning_rate": 4.629255693824842e-05, "loss": 0.0775, "step": 2080 }, { "epoch": 0.2876806607019959, "grad_norm": 0.2701393961906433, "learning_rate": 4.6269077248180326e-05, "loss": 0.012, "step": 2090 }, { "epoch": 0.289057123193393, "grad_norm": 0.2967076897621155, "learning_rate": 4.624559755811223e-05, "loss": 0.0076, "step": 2100 }, { "epoch": 0.2904335856847901, "grad_norm": 0.059514667838811874, "learning_rate": 4.622211786804415e-05, "loss": 0.0826, "step": 2110 }, { "epoch": 0.2918100481761872, "grad_norm": 1.4362976551055908, "learning_rate": 4.619863817797605e-05, "loss": 0.0441, "step": 2120 }, { "epoch": 0.29318651066758433, "grad_norm": 0.1371992826461792, "learning_rate": 4.617515848790796e-05, "loss": 0.0385, "step": 2130 }, { "epoch": 0.29456297315898144, "grad_norm": 0.1226867064833641, "learning_rate": 4.615167879783987e-05, "loss": 0.0704, "step": 2140 }, { "epoch": 0.2959394356503785, "grad_norm": 0.11872248351573944, "learning_rate": 4.612819910777178e-05, "loss": 0.0059, "step": 2150 }, { "epoch": 0.2973158981417756, "grad_norm": 0.1523495316505432, "learning_rate": 4.610471941770369e-05, "loss": 0.0714, "step": 2160 }, { "epoch": 0.2986923606331727, "grad_norm": 0.07271503657102585, "learning_rate": 4.60812397276356e-05, "loss": 0.0394, "step": 2170 }, { "epoch": 0.30006882312456984, "grad_norm": 0.3765457272529602, "learning_rate": 4.6057760037567506e-05, "loss": 0.0697, "step": 2180 }, { "epoch": 0.30144528561596695, "grad_norm": 0.056270550936460495, "learning_rate": 4.603428034749941e-05, "loss": 0.0055, "step": 2190 }, { "epoch": 0.30282174810736406, "grad_norm": 0.054549600929021835, "learning_rate": 4.601080065743133e-05, "loss": 0.0388, "step": 2200 }, { "epoch": 0.3041982105987612, "grad_norm": 0.06886384636163712, "learning_rate": 4.5987320967363234e-05, "loss": 0.0671, "step": 2210 }, { "epoch": 0.3055746730901583, "grad_norm": 0.07275781035423279, "learning_rate": 4.596384127729514e-05, "loss": 0.0036, "step": 2220 }, { "epoch": 0.3069511355815554, "grad_norm": 0.07663343101739883, "learning_rate": 4.594036158722705e-05, "loss": 0.0572, "step": 2230 }, { "epoch": 0.3083275980729525, "grad_norm": 1.1597808599472046, "learning_rate": 4.5916881897158957e-05, "loss": 0.0233, "step": 2240 }, { "epoch": 0.3097040605643496, "grad_norm": 1.600203514099121, "learning_rate": 4.589340220709087e-05, "loss": 0.021, "step": 2250 }, { "epoch": 0.31108052305574674, "grad_norm": 0.02915968932211399, "learning_rate": 4.586992251702278e-05, "loss": 0.0079, "step": 2260 }, { "epoch": 0.31245698554714385, "grad_norm": 0.5495104789733887, "learning_rate": 4.5846442826954685e-05, "loss": 0.017, "step": 2270 }, { "epoch": 0.31383344803854096, "grad_norm": 0.01875491254031658, "learning_rate": 4.582296313688659e-05, "loss": 0.0334, "step": 2280 }, { "epoch": 0.3152099105299381, "grad_norm": 0.01996096409857273, "learning_rate": 4.57994834468185e-05, "loss": 0.0204, "step": 2290 }, { "epoch": 0.3165863730213352, "grad_norm": 0.972200334072113, "learning_rate": 4.5776003756750414e-05, "loss": 0.0266, "step": 2300 }, { "epoch": 0.3179628355127323, "grad_norm": 0.015618067234754562, "learning_rate": 4.575252406668232e-05, "loss": 0.0114, "step": 2310 }, { "epoch": 0.3193392980041294, "grad_norm": 0.01295034121721983, "learning_rate": 4.5729044376614236e-05, "loss": 0.0181, "step": 2320 }, { "epoch": 0.32071576049552647, "grad_norm": 0.8012134432792664, "learning_rate": 4.5705564686546136e-05, "loss": 0.0336, "step": 2330 }, { "epoch": 0.3220922229869236, "grad_norm": 0.030825184658169746, "learning_rate": 4.568208499647805e-05, "loss": 0.0673, "step": 2340 }, { "epoch": 0.3234686854783207, "grad_norm": 0.02954012341797352, "learning_rate": 4.565860530640996e-05, "loss": 0.018, "step": 2350 }, { "epoch": 0.3248451479697178, "grad_norm": 0.023661140352487564, "learning_rate": 4.5635125616341865e-05, "loss": 0.0095, "step": 2360 }, { "epoch": 0.3262216104611149, "grad_norm": 0.25901320576667786, "learning_rate": 4.561164592627378e-05, "loss": 0.0375, "step": 2370 }, { "epoch": 0.32759807295251203, "grad_norm": 0.014901269227266312, "learning_rate": 4.558816623620568e-05, "loss": 0.0559, "step": 2380 }, { "epoch": 0.32897453544390914, "grad_norm": 0.014345617033541203, "learning_rate": 4.5564686546137594e-05, "loss": 0.0043, "step": 2390 }, { "epoch": 0.33035099793530626, "grad_norm": 0.13511377573013306, "learning_rate": 4.55412068560695e-05, "loss": 0.0192, "step": 2400 }, { "epoch": 0.33172746042670337, "grad_norm": 0.012763244099915028, "learning_rate": 4.551772716600141e-05, "loss": 0.0172, "step": 2410 }, { "epoch": 0.3331039229181005, "grad_norm": 1.128438949584961, "learning_rate": 4.549424747593332e-05, "loss": 0.0306, "step": 2420 }, { "epoch": 0.3344803854094976, "grad_norm": 1.0011532306671143, "learning_rate": 4.547076778586523e-05, "loss": 0.0258, "step": 2430 }, { "epoch": 0.3358568479008947, "grad_norm": 1.616672396659851, "learning_rate": 4.544728809579714e-05, "loss": 0.0676, "step": 2440 }, { "epoch": 0.3372333103922918, "grad_norm": 0.03561177849769592, "learning_rate": 4.5423808405729045e-05, "loss": 0.0138, "step": 2450 }, { "epoch": 0.33860977288368893, "grad_norm": 0.14849260449409485, "learning_rate": 4.540032871566096e-05, "loss": 0.0087, "step": 2460 }, { "epoch": 0.33998623537508604, "grad_norm": 0.018366873264312744, "learning_rate": 4.5376849025592866e-05, "loss": 0.0138, "step": 2470 }, { "epoch": 0.34136269786648316, "grad_norm": 0.02044675685465336, "learning_rate": 4.5353369335524774e-05, "loss": 0.0297, "step": 2480 }, { "epoch": 0.34273916035788027, "grad_norm": 0.24164333939552307, "learning_rate": 4.532988964545668e-05, "loss": 0.0209, "step": 2490 }, { "epoch": 0.3441156228492774, "grad_norm": 0.015189621597528458, "learning_rate": 4.530640995538859e-05, "loss": 0.016, "step": 2500 }, { "epoch": 0.3454920853406745, "grad_norm": 0.011951955035328865, "learning_rate": 4.52829302653205e-05, "loss": 0.0564, "step": 2510 }, { "epoch": 0.34686854783207155, "grad_norm": 0.018690839409828186, "learning_rate": 4.525945057525241e-05, "loss": 0.0017, "step": 2520 }, { "epoch": 0.34824501032346866, "grad_norm": 0.802375853061676, "learning_rate": 4.523597088518432e-05, "loss": 0.0196, "step": 2530 }, { "epoch": 0.3496214728148658, "grad_norm": 0.012011172249913216, "learning_rate": 4.5212491195116225e-05, "loss": 0.015, "step": 2540 }, { "epoch": 0.3509979353062629, "grad_norm": 0.18269647657871246, "learning_rate": 4.518901150504813e-05, "loss": 0.0043, "step": 2550 }, { "epoch": 0.35237439779766, "grad_norm": 0.012187006883323193, "learning_rate": 4.5165531814980046e-05, "loss": 0.0167, "step": 2560 }, { "epoch": 0.3537508602890571, "grad_norm": 2.0083608627319336, "learning_rate": 4.5142052124911954e-05, "loss": 0.1068, "step": 2570 }, { "epoch": 0.3551273227804542, "grad_norm": 0.025134120136499405, "learning_rate": 4.511857243484387e-05, "loss": 0.0239, "step": 2580 }, { "epoch": 0.35650378527185134, "grad_norm": 0.04279841482639313, "learning_rate": 4.509509274477577e-05, "loss": 0.0406, "step": 2590 }, { "epoch": 0.35788024776324845, "grad_norm": 0.03511851653456688, "learning_rate": 4.5071613054707676e-05, "loss": 0.0636, "step": 2600 }, { "epoch": 0.35925671025464556, "grad_norm": 0.021262675523757935, "learning_rate": 4.504813336463959e-05, "loss": 0.0141, "step": 2610 }, { "epoch": 0.3606331727460427, "grad_norm": 0.18972019851207733, "learning_rate": 4.50246536745715e-05, "loss": 0.0088, "step": 2620 }, { "epoch": 0.3620096352374398, "grad_norm": 0.23434261977672577, "learning_rate": 4.500117398450341e-05, "loss": 0.0046, "step": 2630 }, { "epoch": 0.3633860977288369, "grad_norm": 0.011170051991939545, "learning_rate": 4.497769429443531e-05, "loss": 0.0046, "step": 2640 }, { "epoch": 0.364762560220234, "grad_norm": 0.012128886766731739, "learning_rate": 4.4954214604367226e-05, "loss": 0.0006, "step": 2650 }, { "epoch": 0.3661390227116311, "grad_norm": 0.008523658849298954, "learning_rate": 4.493073491429913e-05, "loss": 0.0594, "step": 2660 }, { "epoch": 0.36751548520302824, "grad_norm": 0.23149463534355164, "learning_rate": 4.490725522423104e-05, "loss": 0.0952, "step": 2670 }, { "epoch": 0.36889194769442535, "grad_norm": 0.048753008246421814, "learning_rate": 4.4883775534162955e-05, "loss": 0.0062, "step": 2680 }, { "epoch": 0.37026841018582246, "grad_norm": 0.11497911065816879, "learning_rate": 4.486029584409486e-05, "loss": 0.0603, "step": 2690 }, { "epoch": 0.3716448726772196, "grad_norm": 0.03551657497882843, "learning_rate": 4.483681615402677e-05, "loss": 0.0573, "step": 2700 }, { "epoch": 0.37302133516861663, "grad_norm": 0.06909269094467163, "learning_rate": 4.481333646395868e-05, "loss": 0.0041, "step": 2710 }, { "epoch": 0.37439779766001374, "grad_norm": 1.3509830236434937, "learning_rate": 4.4789856773890584e-05, "loss": 0.0618, "step": 2720 }, { "epoch": 0.37577426015141085, "grad_norm": 0.0599672757089138, "learning_rate": 4.47663770838225e-05, "loss": 0.0901, "step": 2730 }, { "epoch": 0.37715072264280797, "grad_norm": 0.06728827208280563, "learning_rate": 4.4742897393754406e-05, "loss": 0.0372, "step": 2740 }, { "epoch": 0.3785271851342051, "grad_norm": 0.0516839325428009, "learning_rate": 4.471941770368631e-05, "loss": 0.0077, "step": 2750 }, { "epoch": 0.3799036476256022, "grad_norm": 0.02933851256966591, "learning_rate": 4.469593801361822e-05, "loss": 0.025, "step": 2760 }, { "epoch": 0.3812801101169993, "grad_norm": 0.17628280818462372, "learning_rate": 4.4672458323550135e-05, "loss": 0.0324, "step": 2770 }, { "epoch": 0.3826565726083964, "grad_norm": 0.02296600118279457, "learning_rate": 4.464897863348204e-05, "loss": 0.0073, "step": 2780 }, { "epoch": 0.38403303509979353, "grad_norm": 0.017956219613552094, "learning_rate": 4.462549894341395e-05, "loss": 0.0062, "step": 2790 }, { "epoch": 0.38540949759119064, "grad_norm": 0.013861626386642456, "learning_rate": 4.4602019253345857e-05, "loss": 0.0079, "step": 2800 }, { "epoch": 0.38678596008258775, "grad_norm": 0.015802577137947083, "learning_rate": 4.4578539563277764e-05, "loss": 0.0221, "step": 2810 }, { "epoch": 0.38816242257398487, "grad_norm": 0.4854987561702728, "learning_rate": 4.455505987320968e-05, "loss": 0.0262, "step": 2820 }, { "epoch": 0.389538885065382, "grad_norm": 1.6219817399978638, "learning_rate": 4.4531580183141585e-05, "loss": 0.0413, "step": 2830 }, { "epoch": 0.3909153475567791, "grad_norm": 0.07754676043987274, "learning_rate": 4.450810049307349e-05, "loss": 0.052, "step": 2840 }, { "epoch": 0.3922918100481762, "grad_norm": 0.009406493976712227, "learning_rate": 4.44846208030054e-05, "loss": 0.0328, "step": 2850 }, { "epoch": 0.3936682725395733, "grad_norm": 0.03465660661458969, "learning_rate": 4.446114111293731e-05, "loss": 0.0316, "step": 2860 }, { "epoch": 0.39504473503097043, "grad_norm": 0.023476269096136093, "learning_rate": 4.443766142286922e-05, "loss": 0.0287, "step": 2870 }, { "epoch": 0.39642119752236754, "grad_norm": 0.018258871510624886, "learning_rate": 4.441418173280113e-05, "loss": 0.0293, "step": 2880 }, { "epoch": 0.3977976600137646, "grad_norm": 0.017730968073010445, "learning_rate": 4.439070204273304e-05, "loss": 0.0205, "step": 2890 }, { "epoch": 0.3991741225051617, "grad_norm": 0.7179641723632812, "learning_rate": 4.4367222352664944e-05, "loss": 0.0203, "step": 2900 }, { "epoch": 0.4005505849965588, "grad_norm": 0.014021376147866249, "learning_rate": 4.434374266259685e-05, "loss": 0.0138, "step": 2910 }, { "epoch": 0.40192704748795594, "grad_norm": 0.010237162932753563, "learning_rate": 4.4320262972528765e-05, "loss": 0.0163, "step": 2920 }, { "epoch": 0.40330350997935305, "grad_norm": 0.01026883628219366, "learning_rate": 4.429678328246067e-05, "loss": 0.0101, "step": 2930 }, { "epoch": 0.40467997247075016, "grad_norm": 0.31801068782806396, "learning_rate": 4.427330359239259e-05, "loss": 0.103, "step": 2940 }, { "epoch": 0.4060564349621473, "grad_norm": 0.01809278130531311, "learning_rate": 4.424982390232449e-05, "loss": 0.0123, "step": 2950 }, { "epoch": 0.4074328974535444, "grad_norm": 0.19398677349090576, "learning_rate": 4.42263442122564e-05, "loss": 0.0323, "step": 2960 }, { "epoch": 0.4088093599449415, "grad_norm": 2.176738739013672, "learning_rate": 4.420286452218831e-05, "loss": 0.0658, "step": 2970 }, { "epoch": 0.4101858224363386, "grad_norm": 0.04093041270971298, "learning_rate": 4.4179384832120216e-05, "loss": 0.0123, "step": 2980 }, { "epoch": 0.4115622849277357, "grad_norm": 0.34939318895339966, "learning_rate": 4.415590514205213e-05, "loss": 0.0075, "step": 2990 }, { "epoch": 0.41293874741913283, "grad_norm": 0.19567587971687317, "learning_rate": 4.413242545198404e-05, "loss": 0.0182, "step": 3000 }, { "epoch": 0.41431520991052995, "grad_norm": 0.16543368995189667, "learning_rate": 4.4108945761915945e-05, "loss": 0.0473, "step": 3010 }, { "epoch": 0.41569167240192706, "grad_norm": 0.011524620465934277, "learning_rate": 4.408546607184785e-05, "loss": 0.0048, "step": 3020 }, { "epoch": 0.41706813489332417, "grad_norm": 1.6746530532836914, "learning_rate": 4.406198638177976e-05, "loss": 0.0453, "step": 3030 }, { "epoch": 0.4184445973847213, "grad_norm": 0.8192883133888245, "learning_rate": 4.4038506691711674e-05, "loss": 0.0683, "step": 3040 }, { "epoch": 0.4198210598761184, "grad_norm": 0.01069236546754837, "learning_rate": 4.401502700164358e-05, "loss": 0.0119, "step": 3050 }, { "epoch": 0.4211975223675155, "grad_norm": 0.03795655444264412, "learning_rate": 4.399154731157549e-05, "loss": 0.0206, "step": 3060 }, { "epoch": 0.4225739848589126, "grad_norm": 0.008810500614345074, "learning_rate": 4.3968067621507396e-05, "loss": 0.0439, "step": 3070 }, { "epoch": 0.4239504473503097, "grad_norm": 0.03897896409034729, "learning_rate": 4.394458793143931e-05, "loss": 0.0269, "step": 3080 }, { "epoch": 0.4253269098417068, "grad_norm": 0.06175368279218674, "learning_rate": 4.392110824137122e-05, "loss": 0.0602, "step": 3090 }, { "epoch": 0.4267033723331039, "grad_norm": 0.007408774457871914, "learning_rate": 4.3897628551303125e-05, "loss": 0.0386, "step": 3100 }, { "epoch": 0.428079834824501, "grad_norm": 0.0068915546871721745, "learning_rate": 4.387414886123503e-05, "loss": 0.0134, "step": 3110 }, { "epoch": 0.42945629731589813, "grad_norm": 0.005642635747790337, "learning_rate": 4.385066917116694e-05, "loss": 0.0693, "step": 3120 }, { "epoch": 0.43083275980729524, "grad_norm": 0.007590182591229677, "learning_rate": 4.3827189481098854e-05, "loss": 0.0752, "step": 3130 }, { "epoch": 0.43220922229869235, "grad_norm": 6.505997657775879, "learning_rate": 4.380370979103076e-05, "loss": 0.038, "step": 3140 }, { "epoch": 0.43358568479008947, "grad_norm": 0.021713029593229294, "learning_rate": 4.378023010096267e-05, "loss": 0.0552, "step": 3150 }, { "epoch": 0.4349621472814866, "grad_norm": 0.6760035157203674, "learning_rate": 4.3756750410894576e-05, "loss": 0.0477, "step": 3160 }, { "epoch": 0.4363386097728837, "grad_norm": 1.1871752738952637, "learning_rate": 4.373327072082648e-05, "loss": 0.0361, "step": 3170 }, { "epoch": 0.4377150722642808, "grad_norm": 1.3839854001998901, "learning_rate": 4.37097910307584e-05, "loss": 0.0466, "step": 3180 }, { "epoch": 0.4390915347556779, "grad_norm": 0.06708292663097382, "learning_rate": 4.3686311340690304e-05, "loss": 0.0247, "step": 3190 }, { "epoch": 0.440467997247075, "grad_norm": 0.22531658411026, "learning_rate": 4.366283165062222e-05, "loss": 0.0224, "step": 3200 }, { "epoch": 0.44184445973847214, "grad_norm": 0.005535891745239496, "learning_rate": 4.363935196055412e-05, "loss": 0.0317, "step": 3210 }, { "epoch": 0.44322092222986925, "grad_norm": 0.8192137479782104, "learning_rate": 4.361587227048603e-05, "loss": 0.0462, "step": 3220 }, { "epoch": 0.44459738472126636, "grad_norm": 0.1516508162021637, "learning_rate": 4.359239258041794e-05, "loss": 0.0076, "step": 3230 }, { "epoch": 0.4459738472126635, "grad_norm": 0.1447552889585495, "learning_rate": 4.356891289034985e-05, "loss": 0.0328, "step": 3240 }, { "epoch": 0.4473503097040606, "grad_norm": 0.03213917464017868, "learning_rate": 4.354543320028176e-05, "loss": 0.0143, "step": 3250 }, { "epoch": 0.44872677219545765, "grad_norm": 0.27943387627601624, "learning_rate": 4.352195351021367e-05, "loss": 0.005, "step": 3260 }, { "epoch": 0.45010323468685476, "grad_norm": 0.007313841953873634, "learning_rate": 4.349847382014558e-05, "loss": 0.0545, "step": 3270 }, { "epoch": 0.45147969717825187, "grad_norm": 0.043260473757982254, "learning_rate": 4.3474994130077484e-05, "loss": 0.0077, "step": 3280 }, { "epoch": 0.452856159669649, "grad_norm": 0.11500683426856995, "learning_rate": 4.345151444000939e-05, "loss": 0.003, "step": 3290 }, { "epoch": 0.4542326221610461, "grad_norm": 0.1244969591498375, "learning_rate": 4.3428034749941306e-05, "loss": 0.0181, "step": 3300 }, { "epoch": 0.4556090846524432, "grad_norm": 0.045015379786491394, "learning_rate": 4.340455505987321e-05, "loss": 0.0091, "step": 3310 }, { "epoch": 0.4569855471438403, "grad_norm": 0.01492405403405428, "learning_rate": 4.338107536980512e-05, "loss": 0.031, "step": 3320 }, { "epoch": 0.45836200963523743, "grad_norm": 0.007625560741871595, "learning_rate": 4.335759567973703e-05, "loss": 0.0273, "step": 3330 }, { "epoch": 0.45973847212663455, "grad_norm": 0.010230351239442825, "learning_rate": 4.333411598966894e-05, "loss": 0.0721, "step": 3340 }, { "epoch": 0.46111493461803166, "grad_norm": 0.01767081953585148, "learning_rate": 4.331063629960085e-05, "loss": 0.0011, "step": 3350 }, { "epoch": 0.46249139710942877, "grad_norm": 0.20645242929458618, "learning_rate": 4.3287156609532757e-05, "loss": 0.0385, "step": 3360 }, { "epoch": 0.4638678596008259, "grad_norm": 0.01104032527655363, "learning_rate": 4.3263676919464664e-05, "loss": 0.0133, "step": 3370 }, { "epoch": 0.465244322092223, "grad_norm": 0.15712879598140717, "learning_rate": 4.324019722939657e-05, "loss": 0.0587, "step": 3380 }, { "epoch": 0.4666207845836201, "grad_norm": 0.1303199976682663, "learning_rate": 4.3216717539328485e-05, "loss": 0.0064, "step": 3390 }, { "epoch": 0.4679972470750172, "grad_norm": 0.008279926143586636, "learning_rate": 4.319323784926039e-05, "loss": 0.0035, "step": 3400 }, { "epoch": 0.46937370956641433, "grad_norm": 0.01927364617586136, "learning_rate": 4.31697581591923e-05, "loss": 0.0099, "step": 3410 }, { "epoch": 0.47075017205781144, "grad_norm": 0.022532671689987183, "learning_rate": 4.314627846912421e-05, "loss": 0.0207, "step": 3420 }, { "epoch": 0.47212663454920856, "grad_norm": 0.09137268364429474, "learning_rate": 4.3122798779056115e-05, "loss": 0.0073, "step": 3430 }, { "epoch": 0.47350309704060567, "grad_norm": 0.025033535435795784, "learning_rate": 4.309931908898803e-05, "loss": 0.0033, "step": 3440 }, { "epoch": 0.4748795595320027, "grad_norm": 0.007203536108136177, "learning_rate": 4.3075839398919936e-05, "loss": 0.0155, "step": 3450 }, { "epoch": 0.47625602202339984, "grad_norm": 0.030102182179689407, "learning_rate": 4.305235970885185e-05, "loss": 0.0188, "step": 3460 }, { "epoch": 0.47763248451479695, "grad_norm": 0.09996209293603897, "learning_rate": 4.302888001878375e-05, "loss": 0.0049, "step": 3470 }, { "epoch": 0.47900894700619406, "grad_norm": 0.0058035957626998425, "learning_rate": 4.300540032871566e-05, "loss": 0.0614, "step": 3480 }, { "epoch": 0.4803854094975912, "grad_norm": 0.010744770057499409, "learning_rate": 4.298192063864757e-05, "loss": 0.0024, "step": 3490 }, { "epoch": 0.4817618719889883, "grad_norm": 0.09626354277133942, "learning_rate": 4.295844094857948e-05, "loss": 0.0169, "step": 3500 }, { "epoch": 0.4831383344803854, "grad_norm": 0.14397992193698883, "learning_rate": 4.2934961258511394e-05, "loss": 0.0385, "step": 3510 }, { "epoch": 0.4845147969717825, "grad_norm": 0.008807985112071037, "learning_rate": 4.29114815684433e-05, "loss": 0.0035, "step": 3520 }, { "epoch": 0.4858912594631796, "grad_norm": 0.12741419672966003, "learning_rate": 4.288800187837521e-05, "loss": 0.055, "step": 3530 }, { "epoch": 0.48726772195457674, "grad_norm": 0.24736307561397552, "learning_rate": 4.2864522188307116e-05, "loss": 0.0383, "step": 3540 }, { "epoch": 0.48864418444597385, "grad_norm": 0.02119280770421028, "learning_rate": 4.2841042498239023e-05, "loss": 0.0254, "step": 3550 }, { "epoch": 0.49002064693737096, "grad_norm": 0.015386384911835194, "learning_rate": 4.281756280817094e-05, "loss": 0.0074, "step": 3560 }, { "epoch": 0.4913971094287681, "grad_norm": 0.011632720939815044, "learning_rate": 4.2794083118102845e-05, "loss": 0.0379, "step": 3570 }, { "epoch": 0.4927735719201652, "grad_norm": 0.10558334738016129, "learning_rate": 4.277060342803475e-05, "loss": 0.03, "step": 3580 }, { "epoch": 0.4941500344115623, "grad_norm": 0.010115494020283222, "learning_rate": 4.274712373796666e-05, "loss": 0.0021, "step": 3590 }, { "epoch": 0.4955264969029594, "grad_norm": 0.00840273778885603, "learning_rate": 4.272364404789857e-05, "loss": 0.0183, "step": 3600 }, { "epoch": 0.4969029593943565, "grad_norm": 0.049460556358098984, "learning_rate": 4.270016435783048e-05, "loss": 0.0515, "step": 3610 }, { "epoch": 0.49827942188575364, "grad_norm": 0.06648355722427368, "learning_rate": 4.267668466776239e-05, "loss": 0.0059, "step": 3620 }, { "epoch": 0.49965588437715075, "grad_norm": 0.09940154105424881, "learning_rate": 4.2653204977694296e-05, "loss": 0.0184, "step": 3630 }, { "epoch": 0.5010323468685478, "grad_norm": 0.006182067561894655, "learning_rate": 4.26297252876262e-05, "loss": 0.0156, "step": 3640 }, { "epoch": 0.5024088093599449, "grad_norm": 0.006616074126213789, "learning_rate": 4.260624559755812e-05, "loss": 0.02, "step": 3650 }, { "epoch": 0.503785271851342, "grad_norm": 0.011640090495347977, "learning_rate": 4.2582765907490025e-05, "loss": 0.1463, "step": 3660 }, { "epoch": 0.5051617343427391, "grad_norm": 0.033209312707185745, "learning_rate": 4.255928621742193e-05, "loss": 0.0031, "step": 3670 }, { "epoch": 0.5065381968341363, "grad_norm": 0.020665930584073067, "learning_rate": 4.253580652735384e-05, "loss": 0.0016, "step": 3680 }, { "epoch": 0.5079146593255334, "grad_norm": 0.020520459860563278, "learning_rate": 4.251232683728575e-05, "loss": 0.048, "step": 3690 }, { "epoch": 0.5092911218169305, "grad_norm": 0.07317003607749939, "learning_rate": 4.248884714721766e-05, "loss": 0.0466, "step": 3700 }, { "epoch": 0.5106675843083276, "grad_norm": 1.525734305381775, "learning_rate": 4.246536745714957e-05, "loss": 0.0324, "step": 3710 }, { "epoch": 0.5120440467997247, "grad_norm": 0.6271283626556396, "learning_rate": 4.2441887767081476e-05, "loss": 0.0027, "step": 3720 }, { "epoch": 0.5134205092911218, "grad_norm": 0.020733555778861046, "learning_rate": 4.241840807701338e-05, "loss": 0.0013, "step": 3730 }, { "epoch": 0.5147969717825189, "grad_norm": 0.019311992451548576, "learning_rate": 4.239492838694529e-05, "loss": 0.0471, "step": 3740 }, { "epoch": 0.516173434273916, "grad_norm": 0.02362320013344288, "learning_rate": 4.2371448696877204e-05, "loss": 0.0014, "step": 3750 }, { "epoch": 0.5175498967653132, "grad_norm": 0.04842793568968773, "learning_rate": 4.234796900680911e-05, "loss": 0.0727, "step": 3760 }, { "epoch": 0.5189263592567103, "grad_norm": 0.07355684041976929, "learning_rate": 4.2324489316741026e-05, "loss": 0.0472, "step": 3770 }, { "epoch": 0.5203028217481074, "grad_norm": 0.04773195460438728, "learning_rate": 4.2301009626672927e-05, "loss": 0.0027, "step": 3780 }, { "epoch": 0.5216792842395045, "grad_norm": 0.10183379799127579, "learning_rate": 4.2277529936604834e-05, "loss": 0.1297, "step": 3790 }, { "epoch": 0.5230557467309016, "grad_norm": 0.10133621096611023, "learning_rate": 4.225405024653675e-05, "loss": 0.0065, "step": 3800 }, { "epoch": 0.5244322092222987, "grad_norm": 0.08439179509878159, "learning_rate": 4.2230570556468655e-05, "loss": 0.0765, "step": 3810 }, { "epoch": 0.5258086717136958, "grad_norm": 0.16360962390899658, "learning_rate": 4.220709086640057e-05, "loss": 0.0671, "step": 3820 }, { "epoch": 0.5271851342050929, "grad_norm": 0.15848655998706818, "learning_rate": 4.218361117633248e-05, "loss": 0.0407, "step": 3830 }, { "epoch": 0.52856159669649, "grad_norm": 0.09889918565750122, "learning_rate": 4.2160131486264384e-05, "loss": 0.0348, "step": 3840 }, { "epoch": 0.5299380591878872, "grad_norm": 0.06334863603115082, "learning_rate": 4.213665179619629e-05, "loss": 0.004, "step": 3850 }, { "epoch": 0.5313145216792843, "grad_norm": 0.05027531087398529, "learning_rate": 4.21131721061282e-05, "loss": 0.0397, "step": 3860 }, { "epoch": 0.5326909841706814, "grad_norm": 0.043990928679704666, "learning_rate": 4.208969241606011e-05, "loss": 0.0024, "step": 3870 }, { "epoch": 0.5340674466620785, "grad_norm": 0.036276888102293015, "learning_rate": 4.206621272599202e-05, "loss": 0.0021, "step": 3880 }, { "epoch": 0.5354439091534756, "grad_norm": 0.04074348136782646, "learning_rate": 4.204273303592393e-05, "loss": 0.0657, "step": 3890 }, { "epoch": 0.5368203716448727, "grad_norm": 0.05693095177412033, "learning_rate": 4.2019253345855835e-05, "loss": 0.043, "step": 3900 }, { "epoch": 0.5381968341362698, "grad_norm": 0.04247228801250458, "learning_rate": 4.199577365578774e-05, "loss": 0.0027, "step": 3910 }, { "epoch": 0.5395732966276668, "grad_norm": 0.037503890693187714, "learning_rate": 4.197229396571966e-05, "loss": 0.0019, "step": 3920 }, { "epoch": 0.540949759119064, "grad_norm": 0.04074763134121895, "learning_rate": 4.1948814275651564e-05, "loss": 0.0834, "step": 3930 }, { "epoch": 0.5423262216104611, "grad_norm": 0.1277616173028946, "learning_rate": 4.192533458558347e-05, "loss": 0.0835, "step": 3940 }, { "epoch": 0.5437026841018582, "grad_norm": 0.28529268503189087, "learning_rate": 4.190185489551538e-05, "loss": 0.099, "step": 3950 }, { "epoch": 0.5450791465932553, "grad_norm": 0.08919584006071091, "learning_rate": 4.187837520544729e-05, "loss": 0.0071, "step": 3960 }, { "epoch": 0.5464556090846524, "grad_norm": 0.06805167347192764, "learning_rate": 4.18548955153792e-05, "loss": 0.0772, "step": 3970 }, { "epoch": 0.5478320715760495, "grad_norm": 0.0676102489233017, "learning_rate": 4.183141582531111e-05, "loss": 0.0396, "step": 3980 }, { "epoch": 0.5492085340674466, "grad_norm": 1.6669994592666626, "learning_rate": 4.1807936135243015e-05, "loss": 0.0732, "step": 3990 }, { "epoch": 0.5505849965588437, "grad_norm": 0.07471257448196411, "learning_rate": 4.178445644517492e-05, "loss": 0.0047, "step": 4000 }, { "epoch": 0.5519614590502409, "grad_norm": 0.05116121470928192, "learning_rate": 4.1760976755106836e-05, "loss": 0.0031, "step": 4010 }, { "epoch": 0.553337921541638, "grad_norm": 0.060711927711963654, "learning_rate": 4.1737497065038744e-05, "loss": 0.0436, "step": 4020 }, { "epoch": 0.5547143840330351, "grad_norm": 0.04125746339559555, "learning_rate": 4.171401737497065e-05, "loss": 0.0024, "step": 4030 }, { "epoch": 0.5560908465244322, "grad_norm": 0.03683853521943092, "learning_rate": 4.169053768490256e-05, "loss": 0.0361, "step": 4040 }, { "epoch": 0.5574673090158293, "grad_norm": 0.04108884185552597, "learning_rate": 4.1667057994834466e-05, "loss": 0.0851, "step": 4050 }, { "epoch": 0.5588437715072264, "grad_norm": 0.04180365055799484, "learning_rate": 4.164357830476638e-05, "loss": 0.0322, "step": 4060 }, { "epoch": 0.5602202339986235, "grad_norm": 0.03238748386502266, "learning_rate": 4.162009861469829e-05, "loss": 0.002, "step": 4070 }, { "epoch": 0.5615966964900206, "grad_norm": 0.03795193135738373, "learning_rate": 4.15966189246302e-05, "loss": 0.0423, "step": 4080 }, { "epoch": 0.5629731589814178, "grad_norm": 0.030517656356096268, "learning_rate": 4.157313923456211e-05, "loss": 0.0401, "step": 4090 }, { "epoch": 0.5643496214728149, "grad_norm": 0.050783731043338776, "learning_rate": 4.1549659544494016e-05, "loss": 0.0415, "step": 4100 }, { "epoch": 0.565726083964212, "grad_norm": 0.08777064830064774, "learning_rate": 4.1526179854425923e-05, "loss": 0.0417, "step": 4110 }, { "epoch": 0.5671025464556091, "grad_norm": 0.05450589209794998, "learning_rate": 4.150270016435783e-05, "loss": 0.0033, "step": 4120 }, { "epoch": 0.5684790089470062, "grad_norm": 0.0608634315431118, "learning_rate": 4.1479220474289745e-05, "loss": 0.0653, "step": 4130 }, { "epoch": 0.5698554714384033, "grad_norm": 0.04287177324295044, "learning_rate": 4.145574078422165e-05, "loss": 0.0027, "step": 4140 }, { "epoch": 0.5712319339298004, "grad_norm": 0.03484787046909332, "learning_rate": 4.143226109415356e-05, "loss": 0.0221, "step": 4150 }, { "epoch": 0.5726083964211975, "grad_norm": 0.05382194370031357, "learning_rate": 4.140878140408547e-05, "loss": 0.0029, "step": 4160 }, { "epoch": 0.5739848589125947, "grad_norm": 0.024359339848160744, "learning_rate": 4.1385301714017374e-05, "loss": 0.0016, "step": 4170 }, { "epoch": 0.5753613214039918, "grad_norm": 2.301884889602661, "learning_rate": 4.136182202394929e-05, "loss": 0.1088, "step": 4180 }, { "epoch": 0.5767377838953889, "grad_norm": 0.06578797847032547, "learning_rate": 4.1338342333881196e-05, "loss": 0.0039, "step": 4190 }, { "epoch": 0.578114246386786, "grad_norm": 0.07366086542606354, "learning_rate": 4.13148626438131e-05, "loss": 0.0027, "step": 4200 }, { "epoch": 0.5794907088781831, "grad_norm": 0.03419088199734688, "learning_rate": 4.129138295374501e-05, "loss": 0.0348, "step": 4210 }, { "epoch": 0.5808671713695802, "grad_norm": 1.2202235460281372, "learning_rate": 4.1267903263676925e-05, "loss": 0.0449, "step": 4220 }, { "epoch": 0.5822436338609773, "grad_norm": 0.1904834508895874, "learning_rate": 4.124442357360883e-05, "loss": 0.0194, "step": 4230 }, { "epoch": 0.5836200963523744, "grad_norm": 0.027312304824590683, "learning_rate": 4.122094388354074e-05, "loss": 0.006, "step": 4240 }, { "epoch": 0.5849965588437716, "grad_norm": 0.11641061305999756, "learning_rate": 4.119746419347265e-05, "loss": 0.0296, "step": 4250 }, { "epoch": 0.5863730213351687, "grad_norm": 0.021699363365769386, "learning_rate": 4.1173984503404554e-05, "loss": 0.0404, "step": 4260 }, { "epoch": 0.5877494838265658, "grad_norm": 0.016810908913612366, "learning_rate": 4.115050481333647e-05, "loss": 0.0149, "step": 4270 }, { "epoch": 0.5891259463179629, "grad_norm": 0.05373326316475868, "learning_rate": 4.1127025123268376e-05, "loss": 0.0488, "step": 4280 }, { "epoch": 0.5905024088093599, "grad_norm": 0.016372468322515488, "learning_rate": 4.110354543320028e-05, "loss": 0.0145, "step": 4290 }, { "epoch": 0.591878871300757, "grad_norm": 0.04519746080040932, "learning_rate": 4.108006574313219e-05, "loss": 0.0101, "step": 4300 }, { "epoch": 0.5932553337921541, "grad_norm": 0.04009389877319336, "learning_rate": 4.10565860530641e-05, "loss": 0.0388, "step": 4310 }, { "epoch": 0.5946317962835512, "grad_norm": 0.03946505859494209, "learning_rate": 4.103310636299601e-05, "loss": 0.0026, "step": 4320 }, { "epoch": 0.5960082587749483, "grad_norm": 1.4367682933807373, "learning_rate": 4.100962667292792e-05, "loss": 0.0262, "step": 4330 }, { "epoch": 0.5973847212663455, "grad_norm": 0.01057188306003809, "learning_rate": 4.0986146982859827e-05, "loss": 0.0052, "step": 4340 }, { "epoch": 0.5987611837577426, "grad_norm": 0.010452769696712494, "learning_rate": 4.096266729279174e-05, "loss": 0.0037, "step": 4350 }, { "epoch": 0.6001376462491397, "grad_norm": 0.010675103403627872, "learning_rate": 4.093918760272364e-05, "loss": 0.0036, "step": 4360 }, { "epoch": 0.6015141087405368, "grad_norm": 0.06103414669632912, "learning_rate": 4.0915707912655555e-05, "loss": 0.0159, "step": 4370 }, { "epoch": 0.6028905712319339, "grad_norm": 0.212608203291893, "learning_rate": 4.089222822258746e-05, "loss": 0.0162, "step": 4380 }, { "epoch": 0.604267033723331, "grad_norm": 0.054902222007513046, "learning_rate": 4.086874853251938e-05, "loss": 0.0117, "step": 4390 }, { "epoch": 0.6056434962147281, "grad_norm": 0.04055459797382355, "learning_rate": 4.0845268842451284e-05, "loss": 0.0169, "step": 4400 }, { "epoch": 0.6070199587061252, "grad_norm": 0.8294593095779419, "learning_rate": 4.082178915238319e-05, "loss": 0.0385, "step": 4410 }, { "epoch": 0.6083964211975224, "grad_norm": 0.1836964637041092, "learning_rate": 4.07983094623151e-05, "loss": 0.0683, "step": 4420 }, { "epoch": 0.6097728836889195, "grad_norm": 0.024691011756658554, "learning_rate": 4.0774829772247006e-05, "loss": 0.001, "step": 4430 }, { "epoch": 0.6111493461803166, "grad_norm": 0.21845021843910217, "learning_rate": 4.075135008217892e-05, "loss": 0.0245, "step": 4440 }, { "epoch": 0.6125258086717137, "grad_norm": 0.025589926168322563, "learning_rate": 4.072787039211083e-05, "loss": 0.0306, "step": 4450 }, { "epoch": 0.6139022711631108, "grad_norm": 0.23382195830345154, "learning_rate": 4.0704390702042735e-05, "loss": 0.0167, "step": 4460 }, { "epoch": 0.6152787336545079, "grad_norm": 0.009115026332437992, "learning_rate": 4.068091101197464e-05, "loss": 0.0118, "step": 4470 }, { "epoch": 0.616655196145905, "grad_norm": 0.008166569285094738, "learning_rate": 4.065743132190655e-05, "loss": 0.0148, "step": 4480 }, { "epoch": 0.6180316586373021, "grad_norm": 1.8295501470565796, "learning_rate": 4.0633951631838464e-05, "loss": 0.0159, "step": 4490 }, { "epoch": 0.6194081211286993, "grad_norm": 0.18780843913555145, "learning_rate": 4.061047194177037e-05, "loss": 0.055, "step": 4500 }, { "epoch": 0.6207845836200964, "grad_norm": 0.007608760613948107, "learning_rate": 4.058699225170228e-05, "loss": 0.0337, "step": 4510 }, { "epoch": 0.6221610461114935, "grad_norm": 0.006360324565321207, "learning_rate": 4.0563512561634186e-05, "loss": 0.0134, "step": 4520 }, { "epoch": 0.6235375086028906, "grad_norm": 0.38930192589759827, "learning_rate": 4.05400328715661e-05, "loss": 0.0288, "step": 4530 }, { "epoch": 0.6249139710942877, "grad_norm": 0.009345685131847858, "learning_rate": 4.051655318149801e-05, "loss": 0.0915, "step": 4540 }, { "epoch": 0.6262904335856848, "grad_norm": 0.023494336754083633, "learning_rate": 4.0493073491429915e-05, "loss": 0.0205, "step": 4550 }, { "epoch": 0.6276668960770819, "grad_norm": 0.5628198981285095, "learning_rate": 4.046959380136182e-05, "loss": 0.0187, "step": 4560 }, { "epoch": 0.629043358568479, "grad_norm": 0.2486339509487152, "learning_rate": 4.044611411129373e-05, "loss": 0.0076, "step": 4570 }, { "epoch": 0.6304198210598762, "grad_norm": 0.01084219105541706, "learning_rate": 4.0422634421225644e-05, "loss": 0.0058, "step": 4580 }, { "epoch": 0.6317962835512733, "grad_norm": 0.010050120763480663, "learning_rate": 4.039915473115755e-05, "loss": 0.0124, "step": 4590 }, { "epoch": 0.6331727460426704, "grad_norm": 7.784799098968506, "learning_rate": 4.037567504108946e-05, "loss": 0.06, "step": 4600 }, { "epoch": 0.6345492085340675, "grad_norm": 0.016491269692778587, "learning_rate": 4.0352195351021366e-05, "loss": 0.0056, "step": 4610 }, { "epoch": 0.6359256710254646, "grad_norm": 0.014940978959202766, "learning_rate": 4.032871566095327e-05, "loss": 0.0077, "step": 4620 }, { "epoch": 0.6373021335168617, "grad_norm": 0.00919331330806017, "learning_rate": 4.030523597088519e-05, "loss": 0.0494, "step": 4630 }, { "epoch": 0.6386785960082588, "grad_norm": 0.013384257443249226, "learning_rate": 4.0281756280817095e-05, "loss": 0.0005, "step": 4640 }, { "epoch": 0.6400550584996559, "grad_norm": 0.007284614723175764, "learning_rate": 4.025827659074901e-05, "loss": 0.001, "step": 4650 }, { "epoch": 0.6414315209910529, "grad_norm": 0.061221931129693985, "learning_rate": 4.0234796900680916e-05, "loss": 0.0024, "step": 4660 }, { "epoch": 0.64280798348245, "grad_norm": 0.015073307789862156, "learning_rate": 4.021131721061282e-05, "loss": 0.0555, "step": 4670 }, { "epoch": 0.6441844459738472, "grad_norm": 0.019245782867074013, "learning_rate": 4.018783752054473e-05, "loss": 0.0049, "step": 4680 }, { "epoch": 0.6455609084652443, "grad_norm": 0.04255415126681328, "learning_rate": 4.016435783047664e-05, "loss": 0.0016, "step": 4690 }, { "epoch": 0.6469373709566414, "grad_norm": 0.018413281068205833, "learning_rate": 4.014087814040855e-05, "loss": 0.049, "step": 4700 }, { "epoch": 0.6483138334480385, "grad_norm": 0.08554636687040329, "learning_rate": 4.011739845034046e-05, "loss": 0.0262, "step": 4710 }, { "epoch": 0.6496902959394356, "grad_norm": 0.016794443130493164, "learning_rate": 4.009391876027237e-05, "loss": 0.0014, "step": 4720 }, { "epoch": 0.6510667584308327, "grad_norm": 0.015564914792776108, "learning_rate": 4.0070439070204274e-05, "loss": 0.0077, "step": 4730 }, { "epoch": 0.6524432209222298, "grad_norm": 0.008857227861881256, "learning_rate": 4.004695938013618e-05, "loss": 0.0005, "step": 4740 }, { "epoch": 0.653819683413627, "grad_norm": 0.0566171295940876, "learning_rate": 4.0023479690068096e-05, "loss": 0.0946, "step": 4750 }, { "epoch": 0.6551961459050241, "grad_norm": 0.09732162207365036, "learning_rate": 4e-05, "loss": 0.0343, "step": 4760 }, { "epoch": 0.6565726083964212, "grad_norm": 0.41853252053260803, "learning_rate": 3.997652030993191e-05, "loss": 0.1003, "step": 4770 }, { "epoch": 0.6579490708878183, "grad_norm": 0.04264218732714653, "learning_rate": 3.995304061986382e-05, "loss": 0.0064, "step": 4780 }, { "epoch": 0.6593255333792154, "grad_norm": 0.021636012941598892, "learning_rate": 3.9929560929795725e-05, "loss": 0.001, "step": 4790 }, { "epoch": 0.6607019958706125, "grad_norm": 0.016809310764074326, "learning_rate": 3.990608123972764e-05, "loss": 0.0255, "step": 4800 }, { "epoch": 0.6620784583620096, "grad_norm": 0.06638923287391663, "learning_rate": 3.988260154965955e-05, "loss": 0.0885, "step": 4810 }, { "epoch": 0.6634549208534067, "grad_norm": 0.038510192185640335, "learning_rate": 3.9859121859591454e-05, "loss": 0.0036, "step": 4820 }, { "epoch": 0.6648313833448039, "grad_norm": 0.03993036970496178, "learning_rate": 3.983564216952336e-05, "loss": 0.0454, "step": 4830 }, { "epoch": 0.666207845836201, "grad_norm": 0.05053792893886566, "learning_rate": 3.9812162479455276e-05, "loss": 0.0457, "step": 4840 }, { "epoch": 0.6675843083275981, "grad_norm": 0.06896813958883286, "learning_rate": 3.978868278938718e-05, "loss": 0.0767, "step": 4850 }, { "epoch": 0.6689607708189952, "grad_norm": 0.38988053798675537, "learning_rate": 3.976520309931909e-05, "loss": 0.0027, "step": 4860 }, { "epoch": 0.6703372333103923, "grad_norm": 0.03155899420380592, "learning_rate": 3.9741723409251e-05, "loss": 0.0046, "step": 4870 }, { "epoch": 0.6717136958017894, "grad_norm": 3.3701858520507812, "learning_rate": 3.9718243719182905e-05, "loss": 0.1269, "step": 4880 }, { "epoch": 0.6730901582931865, "grad_norm": 0.08917071670293808, "learning_rate": 3.969476402911482e-05, "loss": 0.079, "step": 4890 }, { "epoch": 0.6744666207845836, "grad_norm": 0.09433002024888992, "learning_rate": 3.9671284339046727e-05, "loss": 0.0119, "step": 4900 }, { "epoch": 0.6758430832759807, "grad_norm": 0.057283129543066025, "learning_rate": 3.9647804648978634e-05, "loss": 0.0385, "step": 4910 }, { "epoch": 0.6772195457673779, "grad_norm": 5.546417713165283, "learning_rate": 3.962432495891055e-05, "loss": 0.0975, "step": 4920 }, { "epoch": 0.678596008258775, "grad_norm": 0.07295430451631546, "learning_rate": 3.960084526884245e-05, "loss": 0.0041, "step": 4930 }, { "epoch": 0.6799724707501721, "grad_norm": 0.2660236954689026, "learning_rate": 3.957736557877436e-05, "loss": 0.0042, "step": 4940 }, { "epoch": 0.6813489332415692, "grad_norm": 0.04161861166357994, "learning_rate": 3.955388588870627e-05, "loss": 0.0464, "step": 4950 }, { "epoch": 0.6827253957329663, "grad_norm": 0.03979240730404854, "learning_rate": 3.9530406198638184e-05, "loss": 0.0019, "step": 4960 }, { "epoch": 0.6841018582243634, "grad_norm": 0.03659604489803314, "learning_rate": 3.950692650857009e-05, "loss": 0.0418, "step": 4970 }, { "epoch": 0.6854783207157605, "grad_norm": 0.028376974165439606, "learning_rate": 3.9483446818502e-05, "loss": 0.0017, "step": 4980 }, { "epoch": 0.6868547832071576, "grad_norm": 0.0217717457562685, "learning_rate": 3.9459967128433906e-05, "loss": 0.001, "step": 4990 }, { "epoch": 0.6882312456985548, "grad_norm": 0.025561854243278503, "learning_rate": 3.9436487438365814e-05, "loss": 0.0018, "step": 5000 }, { "epoch": 0.6896077081899519, "grad_norm": 0.021822791546583176, "learning_rate": 3.941300774829773e-05, "loss": 0.0581, "step": 5010 }, { "epoch": 0.690984170681349, "grad_norm": 0.025252507999539375, "learning_rate": 3.9389528058229635e-05, "loss": 0.0014, "step": 5020 }, { "epoch": 0.692360633172746, "grad_norm": 0.023208104074001312, "learning_rate": 3.936604836816154e-05, "loss": 0.0508, "step": 5030 }, { "epoch": 0.6937370956641431, "grad_norm": 0.03280678391456604, "learning_rate": 3.934256867809345e-05, "loss": 0.0065, "step": 5040 }, { "epoch": 0.6951135581555402, "grad_norm": 0.02938653714954853, "learning_rate": 3.931908898802536e-05, "loss": 0.0013, "step": 5050 }, { "epoch": 0.6964900206469373, "grad_norm": 0.023236721754074097, "learning_rate": 3.929560929795727e-05, "loss": 0.0011, "step": 5060 }, { "epoch": 0.6978664831383344, "grad_norm": 0.028620554134249687, "learning_rate": 3.927212960788918e-05, "loss": 0.0883, "step": 5070 }, { "epoch": 0.6992429456297315, "grad_norm": 0.04798030108213425, "learning_rate": 3.9248649917821086e-05, "loss": 0.0402, "step": 5080 }, { "epoch": 0.7006194081211287, "grad_norm": 0.10726285725831985, "learning_rate": 3.9225170227752993e-05, "loss": 0.0574, "step": 5090 }, { "epoch": 0.7019958706125258, "grad_norm": 0.13319772481918335, "learning_rate": 3.92016905376849e-05, "loss": 0.0404, "step": 5100 }, { "epoch": 0.7033723331039229, "grad_norm": 0.17747831344604492, "learning_rate": 3.9178210847616815e-05, "loss": 0.0775, "step": 5110 }, { "epoch": 0.70474879559532, "grad_norm": 0.17154037952423096, "learning_rate": 3.915473115754872e-05, "loss": 0.0522, "step": 5120 }, { "epoch": 0.7061252580867171, "grad_norm": 45.10865020751953, "learning_rate": 3.913125146748063e-05, "loss": 0.1242, "step": 5130 }, { "epoch": 0.7075017205781142, "grad_norm": 0.18649514019489288, "learning_rate": 3.910777177741254e-05, "loss": 0.0076, "step": 5140 }, { "epoch": 0.7088781830695113, "grad_norm": 0.11753788590431213, "learning_rate": 3.908429208734445e-05, "loss": 0.0408, "step": 5150 }, { "epoch": 0.7102546455609084, "grad_norm": 0.6258233189582825, "learning_rate": 3.906081239727636e-05, "loss": 0.0122, "step": 5160 }, { "epoch": 0.7116311080523056, "grad_norm": 0.06050730496644974, "learning_rate": 3.9037332707208266e-05, "loss": 0.0391, "step": 5170 }, { "epoch": 0.7130075705437027, "grad_norm": 0.050429802387952805, "learning_rate": 3.901385301714018e-05, "loss": 0.0095, "step": 5180 }, { "epoch": 0.7143840330350998, "grad_norm": 0.036430928856134415, "learning_rate": 3.899037332707208e-05, "loss": 0.008, "step": 5190 }, { "epoch": 0.7157604955264969, "grad_norm": 0.025729890912771225, "learning_rate": 3.8966893637003995e-05, "loss": 0.0051, "step": 5200 }, { "epoch": 0.717136958017894, "grad_norm": 0.03142096847295761, "learning_rate": 3.89434139469359e-05, "loss": 0.0575, "step": 5210 }, { "epoch": 0.7185134205092911, "grad_norm": 0.6522919535636902, "learning_rate": 3.891993425686781e-05, "loss": 0.1058, "step": 5220 }, { "epoch": 0.7198898830006882, "grad_norm": 0.05427301675081253, "learning_rate": 3.8896454566799724e-05, "loss": 0.0077, "step": 5230 }, { "epoch": 0.7212663454920853, "grad_norm": 0.04726822301745415, "learning_rate": 3.8872974876731624e-05, "loss": 0.0033, "step": 5240 }, { "epoch": 0.7226428079834825, "grad_norm": 0.0415482223033905, "learning_rate": 3.884949518666354e-05, "loss": 0.0042, "step": 5250 }, { "epoch": 0.7240192704748796, "grad_norm": 0.0323251374065876, "learning_rate": 3.8826015496595446e-05, "loss": 0.0103, "step": 5260 }, { "epoch": 0.7253957329662767, "grad_norm": 0.023249007761478424, "learning_rate": 3.880253580652736e-05, "loss": 0.0085, "step": 5270 }, { "epoch": 0.7267721954576738, "grad_norm": 0.7628380656242371, "learning_rate": 3.877905611645927e-05, "loss": 0.0048, "step": 5280 }, { "epoch": 0.7281486579490709, "grad_norm": 1.8686316013336182, "learning_rate": 3.8755576426391174e-05, "loss": 0.0493, "step": 5290 }, { "epoch": 0.729525120440468, "grad_norm": 0.02291961759328842, "learning_rate": 3.873209673632308e-05, "loss": 0.0043, "step": 5300 }, { "epoch": 0.7309015829318651, "grad_norm": 0.021915780380368233, "learning_rate": 3.870861704625499e-05, "loss": 0.006, "step": 5310 }, { "epoch": 0.7322780454232622, "grad_norm": 0.030924083665013313, "learning_rate": 3.86851373561869e-05, "loss": 0.085, "step": 5320 }, { "epoch": 0.7336545079146594, "grad_norm": 0.04096757248044014, "learning_rate": 3.866165766611881e-05, "loss": 0.0502, "step": 5330 }, { "epoch": 0.7350309704060565, "grad_norm": 2.500483751296997, "learning_rate": 3.863817797605072e-05, "loss": 0.0343, "step": 5340 }, { "epoch": 0.7364074328974536, "grad_norm": 0.07433193176984787, "learning_rate": 3.8614698285982625e-05, "loss": 0.042, "step": 5350 }, { "epoch": 0.7377838953888507, "grad_norm": 0.09542132914066315, "learning_rate": 3.859121859591453e-05, "loss": 0.0491, "step": 5360 }, { "epoch": 0.7391603578802478, "grad_norm": 0.06892585754394531, "learning_rate": 3.856773890584645e-05, "loss": 0.0034, "step": 5370 }, { "epoch": 0.7405368203716449, "grad_norm": 0.45573365688323975, "learning_rate": 3.8544259215778354e-05, "loss": 0.0165, "step": 5380 }, { "epoch": 0.741913282863042, "grad_norm": 0.032064322382211685, "learning_rate": 3.852077952571026e-05, "loss": 0.007, "step": 5390 }, { "epoch": 0.7432897453544391, "grad_norm": 0.02818894572556019, "learning_rate": 3.849729983564217e-05, "loss": 0.0059, "step": 5400 }, { "epoch": 0.7446662078458361, "grad_norm": 0.02032279781997204, "learning_rate": 3.847382014557408e-05, "loss": 0.001, "step": 5410 }, { "epoch": 0.7460426703372333, "grad_norm": 0.02900795452296734, "learning_rate": 3.845034045550599e-05, "loss": 0.0442, "step": 5420 }, { "epoch": 0.7474191328286304, "grad_norm": 13.882761001586914, "learning_rate": 3.84268607654379e-05, "loss": 0.0371, "step": 5430 }, { "epoch": 0.7487955953200275, "grad_norm": 0.036407940089702606, "learning_rate": 3.8403381075369805e-05, "loss": 0.0497, "step": 5440 }, { "epoch": 0.7501720578114246, "grad_norm": 0.2244081199169159, "learning_rate": 3.837990138530171e-05, "loss": 0.0173, "step": 5450 }, { "epoch": 0.7515485203028217, "grad_norm": 0.02951628901064396, "learning_rate": 3.8356421695233627e-05, "loss": 0.048, "step": 5460 }, { "epoch": 0.7529249827942188, "grad_norm": 0.03507159650325775, "learning_rate": 3.8332942005165534e-05, "loss": 0.0104, "step": 5470 }, { "epoch": 0.7543014452856159, "grad_norm": 0.029022637754678726, "learning_rate": 3.830946231509744e-05, "loss": 0.0045, "step": 5480 }, { "epoch": 0.755677907777013, "grad_norm": 0.026552027091383934, "learning_rate": 3.8285982625029355e-05, "loss": 0.0113, "step": 5490 }, { "epoch": 0.7570543702684102, "grad_norm": 15.544509887695312, "learning_rate": 3.8262502934961256e-05, "loss": 0.0522, "step": 5500 }, { "epoch": 0.7584308327598073, "grad_norm": 0.020716305822134018, "learning_rate": 3.823902324489317e-05, "loss": 0.0052, "step": 5510 }, { "epoch": 0.7598072952512044, "grad_norm": 0.5929539799690247, "learning_rate": 3.821554355482508e-05, "loss": 0.0918, "step": 5520 }, { "epoch": 0.7611837577426015, "grad_norm": 0.03996877744793892, "learning_rate": 3.819206386475699e-05, "loss": 0.0052, "step": 5530 }, { "epoch": 0.7625602202339986, "grad_norm": 0.035206060856580734, "learning_rate": 3.81685841746889e-05, "loss": 0.0062, "step": 5540 }, { "epoch": 0.7639366827253957, "grad_norm": 0.02610827051103115, "learning_rate": 3.81451044846208e-05, "loss": 0.006, "step": 5550 }, { "epoch": 0.7653131452167928, "grad_norm": 0.023904848843812943, "learning_rate": 3.8121624794552714e-05, "loss": 0.0249, "step": 5560 }, { "epoch": 0.76668960770819, "grad_norm": 0.02612803876399994, "learning_rate": 3.809814510448462e-05, "loss": 0.0459, "step": 5570 }, { "epoch": 0.7680660701995871, "grad_norm": 0.029840538278222084, "learning_rate": 3.8074665414416535e-05, "loss": 0.0059, "step": 5580 }, { "epoch": 0.7694425326909842, "grad_norm": 0.024743275716900826, "learning_rate": 3.805118572434844e-05, "loss": 0.0024, "step": 5590 }, { "epoch": 0.7708189951823813, "grad_norm": 0.027379589155316353, "learning_rate": 3.802770603428035e-05, "loss": 0.001, "step": 5600 }, { "epoch": 0.7721954576737784, "grad_norm": 1.857625126838684, "learning_rate": 3.800422634421226e-05, "loss": 0.0448, "step": 5610 }, { "epoch": 0.7735719201651755, "grad_norm": 0.02666308358311653, "learning_rate": 3.7980746654144165e-05, "loss": 0.0009, "step": 5620 }, { "epoch": 0.7749483826565726, "grad_norm": 0.02614920772612095, "learning_rate": 3.795726696407608e-05, "loss": 0.001, "step": 5630 }, { "epoch": 0.7763248451479697, "grad_norm": 0.021932609379291534, "learning_rate": 3.7933787274007986e-05, "loss": 0.0009, "step": 5640 }, { "epoch": 0.7777013076393668, "grad_norm": 0.03293583169579506, "learning_rate": 3.7910307583939893e-05, "loss": 0.0446, "step": 5650 }, { "epoch": 0.779077770130764, "grad_norm": 37.00752258300781, "learning_rate": 3.78868278938718e-05, "loss": 0.0249, "step": 5660 }, { "epoch": 0.7804542326221611, "grad_norm": 0.02619984932243824, "learning_rate": 3.786334820380371e-05, "loss": 0.0026, "step": 5670 }, { "epoch": 0.7818306951135582, "grad_norm": 1.8732949495315552, "learning_rate": 3.783986851373562e-05, "loss": 0.1154, "step": 5680 }, { "epoch": 0.7832071576049553, "grad_norm": 1.5856634378433228, "learning_rate": 3.781638882366753e-05, "loss": 0.0788, "step": 5690 }, { "epoch": 0.7845836200963524, "grad_norm": 0.09367092698812485, "learning_rate": 3.779290913359944e-05, "loss": 0.0165, "step": 5700 }, { "epoch": 0.7859600825877495, "grad_norm": 0.07298976927995682, "learning_rate": 3.7769429443531344e-05, "loss": 0.0407, "step": 5710 }, { "epoch": 0.7873365450791466, "grad_norm": 0.046042557805776596, "learning_rate": 3.774594975346326e-05, "loss": 0.0026, "step": 5720 }, { "epoch": 0.7887130075705437, "grad_norm": 0.03447524458169937, "learning_rate": 3.7722470063395166e-05, "loss": 0.0086, "step": 5730 }, { "epoch": 0.7900894700619409, "grad_norm": 0.5734868049621582, "learning_rate": 3.769899037332707e-05, "loss": 0.0109, "step": 5740 }, { "epoch": 0.791465932553338, "grad_norm": 0.35710757970809937, "learning_rate": 3.767551068325899e-05, "loss": 0.0088, "step": 5750 }, { "epoch": 0.7928423950447351, "grad_norm": 0.02040611393749714, "learning_rate": 3.765203099319089e-05, "loss": 0.0009, "step": 5760 }, { "epoch": 0.7942188575361322, "grad_norm": 0.016577407717704773, "learning_rate": 3.76285513031228e-05, "loss": 0.0007, "step": 5770 }, { "epoch": 0.7955953200275292, "grad_norm": 0.3687322735786438, "learning_rate": 3.760507161305471e-05, "loss": 0.0309, "step": 5780 }, { "epoch": 0.7969717825189263, "grad_norm": 0.01644628681242466, "learning_rate": 3.758159192298662e-05, "loss": 0.011, "step": 5790 }, { "epoch": 0.7983482450103234, "grad_norm": 0.015299319289624691, "learning_rate": 3.755811223291853e-05, "loss": 0.0067, "step": 5800 }, { "epoch": 0.7997247075017205, "grad_norm": 0.016319483518600464, "learning_rate": 3.753463254285043e-05, "loss": 0.0498, "step": 5810 }, { "epoch": 0.8011011699931176, "grad_norm": 0.019704503938555717, "learning_rate": 3.7511152852782346e-05, "loss": 0.0007, "step": 5820 }, { "epoch": 0.8024776324845148, "grad_norm": 0.37491586804389954, "learning_rate": 3.748767316271425e-05, "loss": 0.0012, "step": 5830 }, { "epoch": 0.8038540949759119, "grad_norm": 0.2969169318675995, "learning_rate": 3.746419347264617e-05, "loss": 0.0551, "step": 5840 }, { "epoch": 0.805230557467309, "grad_norm": 1.8501636981964111, "learning_rate": 3.7440713782578074e-05, "loss": 0.0472, "step": 5850 }, { "epoch": 0.8066070199587061, "grad_norm": 0.03491336852312088, "learning_rate": 3.741723409250998e-05, "loss": 0.0013, "step": 5860 }, { "epoch": 0.8079834824501032, "grad_norm": 0.5837016105651855, "learning_rate": 3.739375440244189e-05, "loss": 0.0039, "step": 5870 }, { "epoch": 0.8093599449415003, "grad_norm": 0.02787739224731922, "learning_rate": 3.7370274712373797e-05, "loss": 0.0012, "step": 5880 }, { "epoch": 0.8107364074328974, "grad_norm": 0.023075049743056297, "learning_rate": 3.734679502230571e-05, "loss": 0.0009, "step": 5890 }, { "epoch": 0.8121128699242945, "grad_norm": 1.8409175872802734, "learning_rate": 3.732331533223762e-05, "loss": 0.0898, "step": 5900 }, { "epoch": 0.8134893324156917, "grad_norm": 0.05840715020895004, "learning_rate": 3.7299835642169525e-05, "loss": 0.0695, "step": 5910 }, { "epoch": 0.8148657949070888, "grad_norm": 0.06378444284200668, "learning_rate": 3.727635595210143e-05, "loss": 0.0037, "step": 5920 }, { "epoch": 0.8162422573984859, "grad_norm": 0.043205466121435165, "learning_rate": 3.725287626203334e-05, "loss": 0.0025, "step": 5930 }, { "epoch": 0.817618719889883, "grad_norm": 0.0403926745057106, "learning_rate": 3.7229396571965254e-05, "loss": 0.0163, "step": 5940 }, { "epoch": 0.8189951823812801, "grad_norm": 0.026618458330631256, "learning_rate": 3.720591688189716e-05, "loss": 0.0122, "step": 5950 }, { "epoch": 0.8203716448726772, "grad_norm": 0.0272475965321064, "learning_rate": 3.718243719182907e-05, "loss": 0.0264, "step": 5960 }, { "epoch": 0.8217481073640743, "grad_norm": 0.3211561143398285, "learning_rate": 3.7158957501760976e-05, "loss": 0.0105, "step": 5970 }, { "epoch": 0.8231245698554714, "grad_norm": 0.01771422289311886, "learning_rate": 3.7135477811692884e-05, "loss": 0.0395, "step": 5980 }, { "epoch": 0.8245010323468686, "grad_norm": 0.017653895542025566, "learning_rate": 3.71119981216248e-05, "loss": 0.0069, "step": 5990 }, { "epoch": 0.8258774948382657, "grad_norm": 0.016729559749364853, "learning_rate": 3.7088518431556705e-05, "loss": 0.053, "step": 6000 }, { "epoch": 0.8272539573296628, "grad_norm": 0.016743090003728867, "learning_rate": 3.706503874148862e-05, "loss": 0.0041, "step": 6010 }, { "epoch": 0.8286304198210599, "grad_norm": 0.019531408324837685, "learning_rate": 3.704155905142052e-05, "loss": 0.0147, "step": 6020 }, { "epoch": 0.830006882312457, "grad_norm": 0.01832745596766472, "learning_rate": 3.7018079361352434e-05, "loss": 0.0007, "step": 6030 }, { "epoch": 0.8313833448038541, "grad_norm": 0.5321931838989258, "learning_rate": 3.699459967128434e-05, "loss": 0.0444, "step": 6040 }, { "epoch": 0.8327598072952512, "grad_norm": 0.0225497018545866, "learning_rate": 3.697111998121625e-05, "loss": 0.0009, "step": 6050 }, { "epoch": 0.8341362697866483, "grad_norm": 0.020134877413511276, "learning_rate": 3.694764029114816e-05, "loss": 0.019, "step": 6060 }, { "epoch": 0.8355127322780455, "grad_norm": 0.022453324869275093, "learning_rate": 3.6924160601080063e-05, "loss": 0.0537, "step": 6070 }, { "epoch": 0.8368891947694426, "grad_norm": 1.9429528713226318, "learning_rate": 3.690068091101198e-05, "loss": 0.1218, "step": 6080 }, { "epoch": 0.8382656572608397, "grad_norm": 0.16150428354740143, "learning_rate": 3.6877201220943885e-05, "loss": 0.0783, "step": 6090 }, { "epoch": 0.8396421197522368, "grad_norm": 0.11635234951972961, "learning_rate": 3.685372153087579e-05, "loss": 0.0348, "step": 6100 }, { "epoch": 0.8410185822436339, "grad_norm": 0.08331966400146484, "learning_rate": 3.6830241840807706e-05, "loss": 0.0354, "step": 6110 }, { "epoch": 0.842395044735031, "grad_norm": 0.06296124309301376, "learning_rate": 3.680676215073961e-05, "loss": 0.0873, "step": 6120 }, { "epoch": 0.8437715072264281, "grad_norm": 0.4554714858531952, "learning_rate": 3.678328246067152e-05, "loss": 0.0115, "step": 6130 }, { "epoch": 0.8451479697178252, "grad_norm": 0.04912176355719566, "learning_rate": 3.675980277060343e-05, "loss": 0.0191, "step": 6140 }, { "epoch": 0.8465244322092222, "grad_norm": 0.023414717987179756, "learning_rate": 3.673632308053534e-05, "loss": 0.0014, "step": 6150 }, { "epoch": 0.8479008947006194, "grad_norm": 0.7490262389183044, "learning_rate": 3.671284339046725e-05, "loss": 0.0116, "step": 6160 }, { "epoch": 0.8492773571920165, "grad_norm": 1.2616363763809204, "learning_rate": 3.668936370039916e-05, "loss": 0.0083, "step": 6170 }, { "epoch": 0.8506538196834136, "grad_norm": 0.0158780999481678, "learning_rate": 3.6665884010331065e-05, "loss": 0.031, "step": 6180 }, { "epoch": 0.8520302821748107, "grad_norm": 0.024353092536330223, "learning_rate": 3.664240432026297e-05, "loss": 0.0268, "step": 6190 }, { "epoch": 0.8534067446662078, "grad_norm": 0.13278056681156158, "learning_rate": 3.6618924630194886e-05, "loss": 0.0027, "step": 6200 }, { "epoch": 0.8547832071576049, "grad_norm": 0.11732760071754456, "learning_rate": 3.6595444940126794e-05, "loss": 0.0675, "step": 6210 }, { "epoch": 0.856159669649002, "grad_norm": 0.029110025614500046, "learning_rate": 3.65719652500587e-05, "loss": 0.0975, "step": 6220 }, { "epoch": 0.8575361321403991, "grad_norm": 0.26787710189819336, "learning_rate": 3.654848555999061e-05, "loss": 0.0673, "step": 6230 }, { "epoch": 0.8589125946317963, "grad_norm": 0.023557499051094055, "learning_rate": 3.6525005869922516e-05, "loss": 0.006, "step": 6240 }, { "epoch": 0.8602890571231934, "grad_norm": 0.11473897099494934, "learning_rate": 3.650152617985443e-05, "loss": 0.0045, "step": 6250 }, { "epoch": 0.8616655196145905, "grad_norm": 0.11777396500110626, "learning_rate": 3.647804648978634e-05, "loss": 0.0039, "step": 6260 }, { "epoch": 0.8630419821059876, "grad_norm": 0.027078041806817055, "learning_rate": 3.6454566799718244e-05, "loss": 0.0571, "step": 6270 }, { "epoch": 0.8644184445973847, "grad_norm": 0.021739939227700233, "learning_rate": 3.643108710965015e-05, "loss": 0.0171, "step": 6280 }, { "epoch": 0.8657949070887818, "grad_norm": 0.022900700569152832, "learning_rate": 3.6407607419582066e-05, "loss": 0.0099, "step": 6290 }, { "epoch": 0.8671713695801789, "grad_norm": 0.017219560220837593, "learning_rate": 3.638412772951397e-05, "loss": 0.0016, "step": 6300 }, { "epoch": 0.868547832071576, "grad_norm": 0.018023142591118813, "learning_rate": 3.636064803944588e-05, "loss": 0.0073, "step": 6310 }, { "epoch": 0.8699242945629732, "grad_norm": 0.014313475228846073, "learning_rate": 3.6337168349377795e-05, "loss": 0.0115, "step": 6320 }, { "epoch": 0.8713007570543703, "grad_norm": 0.013741008937358856, "learning_rate": 3.6313688659309695e-05, "loss": 0.0193, "step": 6330 }, { "epoch": 0.8726772195457674, "grad_norm": 0.07222304493188858, "learning_rate": 3.629020896924161e-05, "loss": 0.0017, "step": 6340 }, { "epoch": 0.8740536820371645, "grad_norm": 0.013471345417201519, "learning_rate": 3.626672927917352e-05, "loss": 0.0073, "step": 6350 }, { "epoch": 0.8754301445285616, "grad_norm": 0.06347732990980148, "learning_rate": 3.6243249589105424e-05, "loss": 0.0012, "step": 6360 }, { "epoch": 0.8768066070199587, "grad_norm": 0.04955543205142021, "learning_rate": 3.621976989903734e-05, "loss": 0.0013, "step": 6370 }, { "epoch": 0.8781830695113558, "grad_norm": 0.009371286258101463, "learning_rate": 3.619629020896924e-05, "loss": 0.0004, "step": 6380 }, { "epoch": 0.8795595320027529, "grad_norm": 0.013512141071259975, "learning_rate": 3.617281051890115e-05, "loss": 0.0008, "step": 6390 }, { "epoch": 0.88093599449415, "grad_norm": 1.1064565181732178, "learning_rate": 3.614933082883306e-05, "loss": 0.0433, "step": 6400 }, { "epoch": 0.8823124569855472, "grad_norm": 0.4722721576690674, "learning_rate": 3.6125851138764975e-05, "loss": 0.0338, "step": 6410 }, { "epoch": 0.8836889194769443, "grad_norm": 0.7248334288597107, "learning_rate": 3.610237144869688e-05, "loss": 0.0171, "step": 6420 }, { "epoch": 0.8850653819683414, "grad_norm": 0.008546540513634682, "learning_rate": 3.607889175862878e-05, "loss": 0.002, "step": 6430 }, { "epoch": 0.8864418444597385, "grad_norm": 0.008768325671553612, "learning_rate": 3.6055412068560697e-05, "loss": 0.049, "step": 6440 }, { "epoch": 0.8878183069511356, "grad_norm": 0.009989731945097446, "learning_rate": 3.6031932378492604e-05, "loss": 0.0708, "step": 6450 }, { "epoch": 0.8891947694425327, "grad_norm": 0.013383635319769382, "learning_rate": 3.600845268842452e-05, "loss": 0.0008, "step": 6460 }, { "epoch": 0.8905712319339298, "grad_norm": 0.01058033388108015, "learning_rate": 3.5984972998356425e-05, "loss": 0.0005, "step": 6470 }, { "epoch": 0.891947694425327, "grad_norm": 0.02269929088652134, "learning_rate": 3.596149330828833e-05, "loss": 0.0089, "step": 6480 }, { "epoch": 0.8933241569167241, "grad_norm": 0.011928311549127102, "learning_rate": 3.593801361822024e-05, "loss": 0.0091, "step": 6490 }, { "epoch": 0.8947006194081212, "grad_norm": 0.010069628246128559, "learning_rate": 3.591453392815215e-05, "loss": 0.0005, "step": 6500 }, { "epoch": 0.8960770818995183, "grad_norm": 0.008379790931940079, "learning_rate": 3.589105423808406e-05, "loss": 0.0041, "step": 6510 }, { "epoch": 0.8974535443909153, "grad_norm": 0.0084075927734375, "learning_rate": 3.586757454801597e-05, "loss": 0.0005, "step": 6520 }, { "epoch": 0.8988300068823124, "grad_norm": 0.010765559040009975, "learning_rate": 3.5844094857947876e-05, "loss": 0.1159, "step": 6530 }, { "epoch": 0.9002064693737095, "grad_norm": 0.011307496577501297, "learning_rate": 3.5820615167879784e-05, "loss": 0.0371, "step": 6540 }, { "epoch": 0.9015829318651066, "grad_norm": 0.41120797395706177, "learning_rate": 3.579713547781169e-05, "loss": 0.0613, "step": 6550 }, { "epoch": 0.9029593943565037, "grad_norm": 0.02174476534128189, "learning_rate": 3.5773655787743605e-05, "loss": 0.001, "step": 6560 }, { "epoch": 0.9043358568479009, "grad_norm": 0.022884322330355644, "learning_rate": 3.575017609767551e-05, "loss": 0.0056, "step": 6570 }, { "epoch": 0.905712319339298, "grad_norm": 0.019788052886724472, "learning_rate": 3.572669640760743e-05, "loss": 0.0094, "step": 6580 }, { "epoch": 0.9070887818306951, "grad_norm": 0.01494582649320364, "learning_rate": 3.570321671753933e-05, "loss": 0.0007, "step": 6590 }, { "epoch": 0.9084652443220922, "grad_norm": 0.021362876519560814, "learning_rate": 3.567973702747124e-05, "loss": 0.0925, "step": 6600 }, { "epoch": 0.9098417068134893, "grad_norm": 0.056275371462106705, "learning_rate": 3.565625733740315e-05, "loss": 0.0214, "step": 6610 }, { "epoch": 0.9112181693048864, "grad_norm": 0.03216521069407463, "learning_rate": 3.5632777647335056e-05, "loss": 0.0342, "step": 6620 }, { "epoch": 0.9125946317962835, "grad_norm": 0.17453809082508087, "learning_rate": 3.560929795726697e-05, "loss": 0.0527, "step": 6630 }, { "epoch": 0.9139710942876806, "grad_norm": 0.21362973749637604, "learning_rate": 3.558581826719887e-05, "loss": 0.0485, "step": 6640 }, { "epoch": 0.9153475567790778, "grad_norm": 0.02672101929783821, "learning_rate": 3.5562338577130785e-05, "loss": 0.019, "step": 6650 }, { "epoch": 0.9167240192704749, "grad_norm": 0.06827089935541153, "learning_rate": 3.553885888706269e-05, "loss": 0.0166, "step": 6660 }, { "epoch": 0.918100481761872, "grad_norm": 0.021122710779309273, "learning_rate": 3.55153791969946e-05, "loss": 0.0029, "step": 6670 }, { "epoch": 0.9194769442532691, "grad_norm": 0.014503482729196548, "learning_rate": 3.5491899506926514e-05, "loss": 0.0127, "step": 6680 }, { "epoch": 0.9208534067446662, "grad_norm": 0.2477692812681198, "learning_rate": 3.5468419816858414e-05, "loss": 0.0642, "step": 6690 }, { "epoch": 0.9222298692360633, "grad_norm": 0.5295566916465759, "learning_rate": 3.544494012679033e-05, "loss": 0.026, "step": 6700 }, { "epoch": 0.9236063317274604, "grad_norm": 0.011720262467861176, "learning_rate": 3.5421460436722236e-05, "loss": 0.008, "step": 6710 }, { "epoch": 0.9249827942188575, "grad_norm": 0.39994943141937256, "learning_rate": 3.539798074665415e-05, "loss": 0.0057, "step": 6720 }, { "epoch": 0.9263592567102547, "grad_norm": 0.460148423910141, "learning_rate": 3.537450105658606e-05, "loss": 0.0512, "step": 6730 }, { "epoch": 0.9277357192016518, "grad_norm": 0.015706421807408333, "learning_rate": 3.535102136651796e-05, "loss": 0.0006, "step": 6740 }, { "epoch": 0.9291121816930489, "grad_norm": 0.026508312672376633, "learning_rate": 3.532754167644987e-05, "loss": 0.0729, "step": 6750 }, { "epoch": 0.930488644184446, "grad_norm": 0.01836998201906681, "learning_rate": 3.530406198638178e-05, "loss": 0.0019, "step": 6760 }, { "epoch": 0.9318651066758431, "grad_norm": 0.01747228018939495, "learning_rate": 3.5280582296313694e-05, "loss": 0.0149, "step": 6770 }, { "epoch": 0.9332415691672402, "grad_norm": 0.018819166347384453, "learning_rate": 3.52571026062456e-05, "loss": 0.0007, "step": 6780 }, { "epoch": 0.9346180316586373, "grad_norm": 0.01888113282620907, "learning_rate": 3.523362291617751e-05, "loss": 0.0494, "step": 6790 }, { "epoch": 0.9359944941500344, "grad_norm": 0.024128947407007217, "learning_rate": 3.5210143226109416e-05, "loss": 0.0116, "step": 6800 }, { "epoch": 0.9373709566414316, "grad_norm": 0.017092648893594742, "learning_rate": 3.518666353604132e-05, "loss": 0.0085, "step": 6810 }, { "epoch": 0.9387474191328287, "grad_norm": 0.23530860245227814, "learning_rate": 3.516318384597324e-05, "loss": 0.0067, "step": 6820 }, { "epoch": 0.9401238816242258, "grad_norm": 0.01951962150633335, "learning_rate": 3.5139704155905144e-05, "loss": 0.0439, "step": 6830 }, { "epoch": 0.9415003441156229, "grad_norm": 0.018564322963356972, "learning_rate": 3.511622446583706e-05, "loss": 0.0069, "step": 6840 }, { "epoch": 0.94287680660702, "grad_norm": 0.016449956223368645, "learning_rate": 3.509274477576896e-05, "loss": 0.0085, "step": 6850 }, { "epoch": 0.9442532690984171, "grad_norm": 0.015442545525729656, "learning_rate": 3.5069265085700867e-05, "loss": 0.0028, "step": 6860 }, { "epoch": 0.9456297315898142, "grad_norm": 0.018565157428383827, "learning_rate": 3.504578539563278e-05, "loss": 0.0467, "step": 6870 }, { "epoch": 0.9470061940812113, "grad_norm": 1.8916300535202026, "learning_rate": 3.502230570556469e-05, "loss": 0.0347, "step": 6880 }, { "epoch": 0.9483826565726085, "grad_norm": 0.6812904477119446, "learning_rate": 3.49988260154966e-05, "loss": 0.0426, "step": 6890 }, { "epoch": 0.9497591190640055, "grad_norm": 0.19350042939186096, "learning_rate": 3.49753463254285e-05, "loss": 0.0077, "step": 6900 }, { "epoch": 0.9511355815554026, "grad_norm": 0.018957002088427544, "learning_rate": 3.495186663536042e-05, "loss": 0.0057, "step": 6910 }, { "epoch": 0.9525120440467997, "grad_norm": 0.015442267060279846, "learning_rate": 3.4928386945292324e-05, "loss": 0.0138, "step": 6920 }, { "epoch": 0.9538885065381968, "grad_norm": 0.02918126992881298, "learning_rate": 3.490490725522423e-05, "loss": 0.0256, "step": 6930 }, { "epoch": 0.9552649690295939, "grad_norm": 0.0316859669983387, "learning_rate": 3.4881427565156146e-05, "loss": 0.0269, "step": 6940 }, { "epoch": 0.956641431520991, "grad_norm": 0.06608280539512634, "learning_rate": 3.4857947875088046e-05, "loss": 0.0062, "step": 6950 }, { "epoch": 0.9580178940123881, "grad_norm": 1.164088487625122, "learning_rate": 3.483446818501996e-05, "loss": 0.0094, "step": 6960 }, { "epoch": 0.9593943565037852, "grad_norm": 0.01180696114897728, "learning_rate": 3.481098849495187e-05, "loss": 0.0129, "step": 6970 }, { "epoch": 0.9607708189951824, "grad_norm": 0.009360818192362785, "learning_rate": 3.4787508804883775e-05, "loss": 0.0154, "step": 6980 }, { "epoch": 0.9621472814865795, "grad_norm": 0.007021991536021233, "learning_rate": 3.476402911481569e-05, "loss": 0.0015, "step": 6990 }, { "epoch": 0.9635237439779766, "grad_norm": 0.00995588582009077, "learning_rate": 3.474054942474759e-05, "loss": 0.0095, "step": 7000 }, { "epoch": 0.9649002064693737, "grad_norm": 0.008730718865990639, "learning_rate": 3.4717069734679504e-05, "loss": 0.0052, "step": 7010 }, { "epoch": 0.9662766689607708, "grad_norm": 0.00850655697286129, "learning_rate": 3.469359004461141e-05, "loss": 0.0003, "step": 7020 }, { "epoch": 0.9676531314521679, "grad_norm": 0.006976262200623751, "learning_rate": 3.4670110354543325e-05, "loss": 0.0599, "step": 7030 }, { "epoch": 0.969029593943565, "grad_norm": 0.008669561706483364, "learning_rate": 3.464663066447523e-05, "loss": 0.0004, "step": 7040 }, { "epoch": 0.9704060564349621, "grad_norm": 0.01344081200659275, "learning_rate": 3.462315097440714e-05, "loss": 0.0347, "step": 7050 }, { "epoch": 0.9717825189263593, "grad_norm": 0.010220236144959927, "learning_rate": 3.459967128433905e-05, "loss": 0.0004, "step": 7060 }, { "epoch": 0.9731589814177564, "grad_norm": 0.009870830923318863, "learning_rate": 3.4576191594270955e-05, "loss": 0.0004, "step": 7070 }, { "epoch": 0.9745354439091535, "grad_norm": 0.009553988464176655, "learning_rate": 3.455271190420287e-05, "loss": 0.0004, "step": 7080 }, { "epoch": 0.9759119064005506, "grad_norm": 0.009609032422304153, "learning_rate": 3.4529232214134776e-05, "loss": 0.0053, "step": 7090 }, { "epoch": 0.9772883688919477, "grad_norm": 0.008326184004545212, "learning_rate": 3.4505752524066684e-05, "loss": 0.0003, "step": 7100 }, { "epoch": 0.9786648313833448, "grad_norm": 0.0078686298802495, "learning_rate": 3.448227283399859e-05, "loss": 0.0003, "step": 7110 }, { "epoch": 0.9800412938747419, "grad_norm": 0.00935379695147276, "learning_rate": 3.44587931439305e-05, "loss": 0.0932, "step": 7120 }, { "epoch": 0.981417756366139, "grad_norm": 1.652978777885437, "learning_rate": 3.443531345386241e-05, "loss": 0.0973, "step": 7130 }, { "epoch": 0.9827942188575362, "grad_norm": 0.09444475173950195, "learning_rate": 3.441183376379432e-05, "loss": 0.0434, "step": 7140 }, { "epoch": 0.9841706813489333, "grad_norm": 0.023771895095705986, "learning_rate": 3.4388354073726234e-05, "loss": 0.0056, "step": 7150 }, { "epoch": 0.9855471438403304, "grad_norm": 0.016039764508605003, "learning_rate": 3.4364874383658135e-05, "loss": 0.0257, "step": 7160 }, { "epoch": 0.9869236063317275, "grad_norm": 0.8533821702003479, "learning_rate": 3.434139469359005e-05, "loss": 0.0197, "step": 7170 }, { "epoch": 0.9883000688231246, "grad_norm": 0.01403704471886158, "learning_rate": 3.4317915003521956e-05, "loss": 0.0251, "step": 7180 }, { "epoch": 0.9896765313145217, "grad_norm": 0.012785268016159534, "learning_rate": 3.4294435313453863e-05, "loss": 0.0112, "step": 7190 }, { "epoch": 0.9910529938059188, "grad_norm": 0.6982820630073547, "learning_rate": 3.427095562338578e-05, "loss": 0.0105, "step": 7200 }, { "epoch": 0.9924294562973159, "grad_norm": 0.010448599234223366, "learning_rate": 3.424747593331768e-05, "loss": 0.0004, "step": 7210 }, { "epoch": 0.993805918788713, "grad_norm": 0.5973999500274658, "learning_rate": 3.422399624324959e-05, "loss": 0.0133, "step": 7220 }, { "epoch": 0.9951823812801102, "grad_norm": 2.401728868484497, "learning_rate": 3.42005165531815e-05, "loss": 0.0039, "step": 7230 }, { "epoch": 0.9965588437715073, "grad_norm": 0.009499720297753811, "learning_rate": 3.417703686311341e-05, "loss": 0.002, "step": 7240 }, { "epoch": 0.9979353062629044, "grad_norm": 0.010743265971541405, "learning_rate": 3.415355717304532e-05, "loss": 0.0555, "step": 7250 }, { "epoch": 0.9993117687543015, "grad_norm": 0.012025471776723862, "learning_rate": 3.413007748297722e-05, "loss": 0.0572, "step": 7260 }, { "epoch": 1.0, "eval_accuracy": 0.9950791465932554, "eval_f1": 0.9474071349760942, "eval_loss": 0.021067893132567406, "eval_precision": 0.9647940074906367, "eval_recall": 0.930635838150289, "eval_runtime": 52.9606, "eval_samples_per_second": 548.71, "eval_steps_per_second": 34.309, "step": 7265 }, { "epoch": 1.0006882312456986, "grad_norm": 0.021515965461730957, "learning_rate": 3.4106597792909136e-05, "loss": 0.0133, "step": 7270 }, { "epoch": 1.0020646937370956, "grad_norm": 0.010623365640640259, "learning_rate": 3.408311810284104e-05, "loss": 0.0035, "step": 7280 }, { "epoch": 1.0034411562284928, "grad_norm": 0.38930949568748474, "learning_rate": 3.405963841277296e-05, "loss": 0.0073, "step": 7290 }, { "epoch": 1.0048176187198898, "grad_norm": 0.009073280729353428, "learning_rate": 3.4036158722704865e-05, "loss": 0.0477, "step": 7300 }, { "epoch": 1.006194081211287, "grad_norm": 0.015406585298478603, "learning_rate": 3.4012679032636765e-05, "loss": 0.0005, "step": 7310 }, { "epoch": 1.007570543702684, "grad_norm": 0.015377533622086048, "learning_rate": 3.398919934256868e-05, "loss": 0.0006, "step": 7320 }, { "epoch": 1.0089470061940813, "grad_norm": 0.37189781665802, "learning_rate": 3.396571965250059e-05, "loss": 0.0047, "step": 7330 }, { "epoch": 1.0103234686854783, "grad_norm": 0.013250071555376053, "learning_rate": 3.39422399624325e-05, "loss": 0.0005, "step": 7340 }, { "epoch": 1.0116999311768755, "grad_norm": 2.9877495765686035, "learning_rate": 3.391876027236441e-05, "loss": 0.0479, "step": 7350 }, { "epoch": 1.0130763936682725, "grad_norm": 0.0216897614300251, "learning_rate": 3.3895280582296316e-05, "loss": 0.0467, "step": 7360 }, { "epoch": 1.0144528561596697, "grad_norm": 0.053148508071899414, "learning_rate": 3.387180089222822e-05, "loss": 0.0847, "step": 7370 }, { "epoch": 1.0158293186510667, "grad_norm": 0.09653236716985703, "learning_rate": 3.384832120216013e-05, "loss": 0.0425, "step": 7380 }, { "epoch": 1.017205781142464, "grad_norm": 0.06451202929019928, "learning_rate": 3.3824841512092044e-05, "loss": 0.0075, "step": 7390 }, { "epoch": 1.018582243633861, "grad_norm": 0.03150139003992081, "learning_rate": 3.380136182202395e-05, "loss": 0.0064, "step": 7400 }, { "epoch": 1.0199587061252582, "grad_norm": 0.02418307401239872, "learning_rate": 3.377788213195586e-05, "loss": 0.0025, "step": 7410 }, { "epoch": 1.0213351686166552, "grad_norm": 0.03348683938384056, "learning_rate": 3.3754402441887767e-05, "loss": 0.0881, "step": 7420 }, { "epoch": 1.0227116311080524, "grad_norm": 0.04249432310461998, "learning_rate": 3.3730922751819674e-05, "loss": 0.0287, "step": 7430 }, { "epoch": 1.0240880935994494, "grad_norm": 0.03965974599123001, "learning_rate": 3.370744306175159e-05, "loss": 0.0018, "step": 7440 }, { "epoch": 1.0254645560908466, "grad_norm": 0.03319505602121353, "learning_rate": 3.3683963371683495e-05, "loss": 0.0014, "step": 7450 }, { "epoch": 1.0268410185822436, "grad_norm": 0.02594437263906002, "learning_rate": 3.366048368161541e-05, "loss": 0.0059, "step": 7460 }, { "epoch": 1.0282174810736406, "grad_norm": 0.02441740781068802, "learning_rate": 3.363700399154731e-05, "loss": 0.0322, "step": 7470 }, { "epoch": 1.0295939435650379, "grad_norm": 0.016510264948010445, "learning_rate": 3.3613524301479224e-05, "loss": 0.0024, "step": 7480 }, { "epoch": 1.0309704060564349, "grad_norm": 0.016530070453882217, "learning_rate": 3.359004461141113e-05, "loss": 0.0013, "step": 7490 }, { "epoch": 1.032346868547832, "grad_norm": 0.013162774965167046, "learning_rate": 3.356656492134304e-05, "loss": 0.0045, "step": 7500 }, { "epoch": 1.033723331039229, "grad_norm": 0.01049899309873581, "learning_rate": 3.354308523127495e-05, "loss": 0.0026, "step": 7510 }, { "epoch": 1.0350997935306263, "grad_norm": 0.015327578410506248, "learning_rate": 3.3519605541206854e-05, "loss": 0.048, "step": 7520 }, { "epoch": 1.0364762560220233, "grad_norm": 0.017364943400025368, "learning_rate": 3.349612585113877e-05, "loss": 0.0033, "step": 7530 }, { "epoch": 1.0378527185134205, "grad_norm": 1.8788636922836304, "learning_rate": 3.3472646161070675e-05, "loss": 0.0436, "step": 7540 }, { "epoch": 1.0392291810048175, "grad_norm": 0.017028305679559708, "learning_rate": 3.344916647100258e-05, "loss": 0.0397, "step": 7550 }, { "epoch": 1.0406056434962148, "grad_norm": 0.029619060456752777, "learning_rate": 3.34256867809345e-05, "loss": 0.001, "step": 7560 }, { "epoch": 1.0419821059876118, "grad_norm": 0.09371836483478546, "learning_rate": 3.34022070908664e-05, "loss": 0.0887, "step": 7570 }, { "epoch": 1.043358568479009, "grad_norm": 0.07959452271461487, "learning_rate": 3.337872740079831e-05, "loss": 0.0056, "step": 7580 }, { "epoch": 1.044735030970406, "grad_norm": 0.03549978882074356, "learning_rate": 3.335524771073022e-05, "loss": 0.0023, "step": 7590 }, { "epoch": 1.0461114934618032, "grad_norm": 0.04110166057944298, "learning_rate": 3.333176802066213e-05, "loss": 0.0829, "step": 7600 }, { "epoch": 1.0474879559532002, "grad_norm": 0.07368134707212448, "learning_rate": 3.330828833059404e-05, "loss": 0.0837, "step": 7610 }, { "epoch": 1.0488644184445974, "grad_norm": 0.11884484440088272, "learning_rate": 3.328480864052594e-05, "loss": 0.0396, "step": 7620 }, { "epoch": 1.0502408809359944, "grad_norm": 0.055099476128816605, "learning_rate": 3.3261328950457855e-05, "loss": 0.0274, "step": 7630 }, { "epoch": 1.0516173434273917, "grad_norm": 1.7757881879806519, "learning_rate": 3.323784926038976e-05, "loss": 0.0422, "step": 7640 }, { "epoch": 1.0529938059187887, "grad_norm": 0.06088874116539955, "learning_rate": 3.3214369570321676e-05, "loss": 0.0026, "step": 7650 }, { "epoch": 1.0543702684101859, "grad_norm": 0.05047605186700821, "learning_rate": 3.3190889880253584e-05, "loss": 0.0022, "step": 7660 }, { "epoch": 1.0557467309015829, "grad_norm": 0.03019540384411812, "learning_rate": 3.316741019018549e-05, "loss": 0.0014, "step": 7670 }, { "epoch": 1.05712319339298, "grad_norm": 0.025413209572434425, "learning_rate": 3.31439305001174e-05, "loss": 0.0011, "step": 7680 }, { "epoch": 1.058499655884377, "grad_norm": 0.017985697835683823, "learning_rate": 3.3120450810049306e-05, "loss": 0.0009, "step": 7690 }, { "epoch": 1.0598761183757743, "grad_norm": 0.016330938786268234, "learning_rate": 3.309697111998122e-05, "loss": 0.0631, "step": 7700 }, { "epoch": 1.0612525808671713, "grad_norm": 0.024029932916164398, "learning_rate": 3.307349142991313e-05, "loss": 0.001, "step": 7710 }, { "epoch": 1.0626290433585686, "grad_norm": 0.01883810944855213, "learning_rate": 3.305001173984504e-05, "loss": 0.001, "step": 7720 }, { "epoch": 1.0640055058499656, "grad_norm": 1.8245927095413208, "learning_rate": 3.302653204977694e-05, "loss": 0.1175, "step": 7730 }, { "epoch": 1.0653819683413628, "grad_norm": 0.10825851559638977, "learning_rate": 3.300305235970885e-05, "loss": 0.0444, "step": 7740 }, { "epoch": 1.0667584308327598, "grad_norm": 0.06336242705583572, "learning_rate": 3.2979572669640763e-05, "loss": 0.0031, "step": 7750 }, { "epoch": 1.068134893324157, "grad_norm": 0.03223087266087532, "learning_rate": 3.295609297957267e-05, "loss": 0.0042, "step": 7760 }, { "epoch": 1.069511355815554, "grad_norm": 0.025799306109547615, "learning_rate": 3.2932613289504585e-05, "loss": 0.0012, "step": 7770 }, { "epoch": 1.0708878183069512, "grad_norm": 0.022181998938322067, "learning_rate": 3.2909133599436486e-05, "loss": 0.0032, "step": 7780 }, { "epoch": 1.0722642807983482, "grad_norm": 0.02970580942928791, "learning_rate": 3.28856539093684e-05, "loss": 0.046, "step": 7790 }, { "epoch": 1.0736407432897455, "grad_norm": 0.0225661713629961, "learning_rate": 3.286217421930031e-05, "loss": 0.0228, "step": 7800 }, { "epoch": 1.0750172057811425, "grad_norm": 0.018679741770029068, "learning_rate": 3.2838694529232214e-05, "loss": 0.0066, "step": 7810 }, { "epoch": 1.0763936682725395, "grad_norm": 0.018814492970705032, "learning_rate": 3.281521483916413e-05, "loss": 0.0026, "step": 7820 }, { "epoch": 1.0777701307639367, "grad_norm": 0.015044591389596462, "learning_rate": 3.279173514909603e-05, "loss": 0.0008, "step": 7830 }, { "epoch": 1.079146593255334, "grad_norm": 0.015530485659837723, "learning_rate": 3.276825545902794e-05, "loss": 0.0006, "step": 7840 }, { "epoch": 1.080523055746731, "grad_norm": 0.0171443410217762, "learning_rate": 3.274477576895985e-05, "loss": 0.0149, "step": 7850 }, { "epoch": 1.081899518238128, "grad_norm": 0.13316205143928528, "learning_rate": 3.272129607889176e-05, "loss": 0.0009, "step": 7860 }, { "epoch": 1.0832759807295251, "grad_norm": 0.015186971053481102, "learning_rate": 3.269781638882367e-05, "loss": 0.0007, "step": 7870 }, { "epoch": 1.0846524432209221, "grad_norm": 0.05500231310725212, "learning_rate": 3.267433669875557e-05, "loss": 0.0864, "step": 7880 }, { "epoch": 1.0860289057123194, "grad_norm": 0.07985639572143555, "learning_rate": 3.265085700868749e-05, "loss": 0.0269, "step": 7890 }, { "epoch": 1.0874053682037164, "grad_norm": 0.01555081456899643, "learning_rate": 3.2627377318619394e-05, "loss": 0.0183, "step": 7900 }, { "epoch": 1.0887818306951136, "grad_norm": 0.23896756768226624, "learning_rate": 3.260389762855131e-05, "loss": 0.0023, "step": 7910 }, { "epoch": 1.0901582931865106, "grad_norm": 1.067336082458496, "learning_rate": 3.2580417938483216e-05, "loss": 0.0103, "step": 7920 }, { "epoch": 1.0915347556779078, "grad_norm": 0.011696066707372665, "learning_rate": 3.255693824841512e-05, "loss": 0.0005, "step": 7930 }, { "epoch": 1.0929112181693048, "grad_norm": 0.008536611683666706, "learning_rate": 3.253345855834703e-05, "loss": 0.0417, "step": 7940 }, { "epoch": 1.094287680660702, "grad_norm": 0.009531383402645588, "learning_rate": 3.250997886827894e-05, "loss": 0.0164, "step": 7950 }, { "epoch": 1.095664143152099, "grad_norm": 0.012200838886201382, "learning_rate": 3.248649917821085e-05, "loss": 0.0009, "step": 7960 }, { "epoch": 1.0970406056434963, "grad_norm": 0.013682760298252106, "learning_rate": 3.246301948814276e-05, "loss": 0.0005, "step": 7970 }, { "epoch": 1.0984170681348933, "grad_norm": 0.06768498569726944, "learning_rate": 3.2439539798074667e-05, "loss": 0.0553, "step": 7980 }, { "epoch": 1.0997935306262905, "grad_norm": 0.02024843543767929, "learning_rate": 3.2416060108006574e-05, "loss": 0.0506, "step": 7990 }, { "epoch": 1.1011699931176875, "grad_norm": 0.02581336535513401, "learning_rate": 3.239258041793848e-05, "loss": 0.0016, "step": 8000 }, { "epoch": 1.1025464556090847, "grad_norm": 0.02221924439072609, "learning_rate": 3.2369100727870395e-05, "loss": 0.0431, "step": 8010 }, { "epoch": 1.1039229181004817, "grad_norm": 0.03398057818412781, "learning_rate": 3.23456210378023e-05, "loss": 0.0465, "step": 8020 }, { "epoch": 1.105299380591879, "grad_norm": 0.08442604541778564, "learning_rate": 3.232214134773422e-05, "loss": 0.0819, "step": 8030 }, { "epoch": 1.106675843083276, "grad_norm": 0.10404931753873825, "learning_rate": 3.229866165766612e-05, "loss": 0.0046, "step": 8040 }, { "epoch": 1.1080523055746732, "grad_norm": 0.46733078360557556, "learning_rate": 3.227518196759803e-05, "loss": 0.0431, "step": 8050 }, { "epoch": 1.1094287680660702, "grad_norm": 0.05850812420248985, "learning_rate": 3.225170227752994e-05, "loss": 0.0028, "step": 8060 }, { "epoch": 1.1108052305574674, "grad_norm": 2.542656421661377, "learning_rate": 3.2228222587461846e-05, "loss": 0.0486, "step": 8070 }, { "epoch": 1.1121816930488644, "grad_norm": 0.04030469432473183, "learning_rate": 3.220474289739376e-05, "loss": 0.0017, "step": 8080 }, { "epoch": 1.1135581555402616, "grad_norm": 0.23898345232009888, "learning_rate": 3.218126320732566e-05, "loss": 0.0376, "step": 8090 }, { "epoch": 1.1149346180316586, "grad_norm": 0.02344943955540657, "learning_rate": 3.2157783517257575e-05, "loss": 0.0012, "step": 8100 }, { "epoch": 1.1163110805230558, "grad_norm": 1.2166472673416138, "learning_rate": 3.213430382718948e-05, "loss": 0.0093, "step": 8110 }, { "epoch": 1.1176875430144528, "grad_norm": 0.0220763199031353, "learning_rate": 3.211082413712139e-05, "loss": 0.0853, "step": 8120 }, { "epoch": 1.11906400550585, "grad_norm": 0.05702443793416023, "learning_rate": 3.2087344447053304e-05, "loss": 0.0508, "step": 8130 }, { "epoch": 1.120440467997247, "grad_norm": 1.5215332508087158, "learning_rate": 3.2063864756985205e-05, "loss": 0.0248, "step": 8140 }, { "epoch": 1.1218169304886443, "grad_norm": 0.03612302616238594, "learning_rate": 3.204038506691712e-05, "loss": 0.0019, "step": 8150 }, { "epoch": 1.1231933929800413, "grad_norm": 0.02920125238597393, "learning_rate": 3.2016905376849026e-05, "loss": 0.0783, "step": 8160 }, { "epoch": 1.1245698554714385, "grad_norm": 0.03463609516620636, "learning_rate": 3.1993425686780933e-05, "loss": 0.0713, "step": 8170 }, { "epoch": 1.1259463179628355, "grad_norm": 0.03627566993236542, "learning_rate": 3.196994599671285e-05, "loss": 0.0651, "step": 8180 }, { "epoch": 1.1273227804542327, "grad_norm": 0.03510391712188721, "learning_rate": 3.194646630664475e-05, "loss": 0.0112, "step": 8190 }, { "epoch": 1.1286992429456297, "grad_norm": 0.4027557671070099, "learning_rate": 3.192298661657666e-05, "loss": 0.0163, "step": 8200 }, { "epoch": 1.1300757054370267, "grad_norm": 0.29657047986984253, "learning_rate": 3.189950692650857e-05, "loss": 0.0259, "step": 8210 }, { "epoch": 1.131452167928424, "grad_norm": 0.39591091871261597, "learning_rate": 3.1876027236440484e-05, "loss": 0.0559, "step": 8220 }, { "epoch": 1.1328286304198212, "grad_norm": 0.41990822553634644, "learning_rate": 3.185254754637239e-05, "loss": 0.0136, "step": 8230 }, { "epoch": 1.1342050929112182, "grad_norm": 0.02248874306678772, "learning_rate": 3.18290678563043e-05, "loss": 0.0114, "step": 8240 }, { "epoch": 1.1355815554026152, "grad_norm": 0.024457888677716255, "learning_rate": 3.1805588166236206e-05, "loss": 0.0346, "step": 8250 }, { "epoch": 1.1369580178940124, "grad_norm": 0.01849205233156681, "learning_rate": 3.178210847616811e-05, "loss": 0.006, "step": 8260 }, { "epoch": 1.1383344803854094, "grad_norm": 0.018232887610793114, "learning_rate": 3.175862878610003e-05, "loss": 0.0007, "step": 8270 }, { "epoch": 1.1397109428768066, "grad_norm": 0.017362311482429504, "learning_rate": 3.1735149096031935e-05, "loss": 0.0008, "step": 8280 }, { "epoch": 1.1410874053682036, "grad_norm": 1.894794225692749, "learning_rate": 3.171166940596384e-05, "loss": 0.0994, "step": 8290 }, { "epoch": 1.1424638678596009, "grad_norm": 0.030782019719481468, "learning_rate": 3.168818971589575e-05, "loss": 0.0468, "step": 8300 }, { "epoch": 1.1438403303509979, "grad_norm": 49.25630569458008, "learning_rate": 3.166471002582766e-05, "loss": 0.0093, "step": 8310 }, { "epoch": 1.145216792842395, "grad_norm": 0.04444660618901253, "learning_rate": 3.164123033575957e-05, "loss": 0.0019, "step": 8320 }, { "epoch": 1.146593255333792, "grad_norm": 0.03653968870639801, "learning_rate": 3.161775064569148e-05, "loss": 0.0459, "step": 8330 }, { "epoch": 1.1479697178251893, "grad_norm": 0.04526354745030403, "learning_rate": 3.159427095562339e-05, "loss": 0.0331, "step": 8340 }, { "epoch": 1.1493461803165863, "grad_norm": 0.04439878091216087, "learning_rate": 3.157079126555529e-05, "loss": 0.0058, "step": 8350 }, { "epoch": 1.1507226428079835, "grad_norm": 0.02629615180194378, "learning_rate": 3.154731157548721e-05, "loss": 0.0048, "step": 8360 }, { "epoch": 1.1520991052993805, "grad_norm": 0.05196559429168701, "learning_rate": 3.1523831885419114e-05, "loss": 0.1275, "step": 8370 }, { "epoch": 1.1534755677907778, "grad_norm": 3.203660488128662, "learning_rate": 3.150035219535102e-05, "loss": 0.1206, "step": 8380 }, { "epoch": 1.1548520302821748, "grad_norm": 0.13807561993598938, "learning_rate": 3.1476872505282936e-05, "loss": 0.0356, "step": 8390 }, { "epoch": 1.156228492773572, "grad_norm": 0.20143716037273407, "learning_rate": 3.1453392815214836e-05, "loss": 0.0715, "step": 8400 }, { "epoch": 1.157604955264969, "grad_norm": 0.12777476012706757, "learning_rate": 3.142991312514675e-05, "loss": 0.0124, "step": 8410 }, { "epoch": 1.1589814177563662, "grad_norm": 0.0673007220029831, "learning_rate": 3.140643343507866e-05, "loss": 0.0428, "step": 8420 }, { "epoch": 1.1603578802477632, "grad_norm": 0.06350237131118774, "learning_rate": 3.1382953745010565e-05, "loss": 0.0029, "step": 8430 }, { "epoch": 1.1617343427391604, "grad_norm": 0.045864857733249664, "learning_rate": 3.135947405494248e-05, "loss": 0.0019, "step": 8440 }, { "epoch": 1.1631108052305574, "grad_norm": 0.025866564363241196, "learning_rate": 3.133599436487438e-05, "loss": 0.0038, "step": 8450 }, { "epoch": 1.1644872677219547, "grad_norm": 0.028133923187851906, "learning_rate": 3.1312514674806294e-05, "loss": 0.0011, "step": 8460 }, { "epoch": 1.1658637302133517, "grad_norm": 0.0283712949603796, "learning_rate": 3.12890349847382e-05, "loss": 0.0009, "step": 8470 }, { "epoch": 1.1672401927047489, "grad_norm": 0.022812968119978905, "learning_rate": 3.1265555294670116e-05, "loss": 0.0417, "step": 8480 }, { "epoch": 1.1686166551961459, "grad_norm": 0.027264578267931938, "learning_rate": 3.124207560460202e-05, "loss": 0.0011, "step": 8490 }, { "epoch": 1.169993117687543, "grad_norm": 0.041887011379003525, "learning_rate": 3.121859591453393e-05, "loss": 0.0144, "step": 8500 }, { "epoch": 1.17136958017894, "grad_norm": 0.08359098434448242, "learning_rate": 3.119511622446584e-05, "loss": 0.0772, "step": 8510 }, { "epoch": 1.1727460426703373, "grad_norm": 0.03644919767975807, "learning_rate": 3.1171636534397745e-05, "loss": 0.0472, "step": 8520 }, { "epoch": 1.1741225051617343, "grad_norm": 0.6915259957313538, "learning_rate": 3.114815684432966e-05, "loss": 0.0774, "step": 8530 }, { "epoch": 1.1754989676531316, "grad_norm": 0.03120124340057373, "learning_rate": 3.1124677154261567e-05, "loss": 0.0021, "step": 8540 }, { "epoch": 1.1768754301445286, "grad_norm": 0.028726542368531227, "learning_rate": 3.1101197464193474e-05, "loss": 0.0039, "step": 8550 }, { "epoch": 1.1782518926359256, "grad_norm": 0.027665939182043076, "learning_rate": 3.107771777412538e-05, "loss": 0.0417, "step": 8560 }, { "epoch": 1.1796283551273228, "grad_norm": 0.027170080691576004, "learning_rate": 3.105423808405729e-05, "loss": 0.0011, "step": 8570 }, { "epoch": 1.18100481761872, "grad_norm": 0.025620920583605766, "learning_rate": 3.10307583939892e-05, "loss": 0.001, "step": 8580 }, { "epoch": 1.182381280110117, "grad_norm": 0.02977997437119484, "learning_rate": 3.100727870392111e-05, "loss": 0.0414, "step": 8590 }, { "epoch": 1.183757742601514, "grad_norm": 0.028498144820332527, "learning_rate": 3.0983799013853024e-05, "loss": 0.0221, "step": 8600 }, { "epoch": 1.1851342050929112, "grad_norm": 0.03645610064268112, "learning_rate": 3.0960319323784925e-05, "loss": 0.0483, "step": 8610 }, { "epoch": 1.1865106675843082, "grad_norm": 0.0346309170126915, "learning_rate": 3.093683963371683e-05, "loss": 0.0098, "step": 8620 }, { "epoch": 1.1878871300757055, "grad_norm": 0.034166499972343445, "learning_rate": 3.0913359943648746e-05, "loss": 0.0013, "step": 8630 }, { "epoch": 1.1892635925671025, "grad_norm": 0.026223216205835342, "learning_rate": 3.0889880253580654e-05, "loss": 0.0011, "step": 8640 }, { "epoch": 1.1906400550584997, "grad_norm": 0.020918376743793488, "learning_rate": 3.086640056351257e-05, "loss": 0.0009, "step": 8650 }, { "epoch": 1.1920165175498967, "grad_norm": 0.018002262338995934, "learning_rate": 3.084292087344447e-05, "loss": 0.0008, "step": 8660 }, { "epoch": 1.193392980041294, "grad_norm": 0.01898243837058544, "learning_rate": 3.081944118337638e-05, "loss": 0.0007, "step": 8670 }, { "epoch": 1.194769442532691, "grad_norm": 0.017855720594525337, "learning_rate": 3.079596149330829e-05, "loss": 0.0006, "step": 8680 }, { "epoch": 1.1961459050240881, "grad_norm": 2.064791202545166, "learning_rate": 3.07724818032402e-05, "loss": 0.1028, "step": 8690 }, { "epoch": 1.1975223675154851, "grad_norm": 0.024349046871066093, "learning_rate": 3.074900211317211e-05, "loss": 0.001, "step": 8700 }, { "epoch": 1.1988988300068824, "grad_norm": 0.033991649746894836, "learning_rate": 3.072552242310401e-05, "loss": 0.0114, "step": 8710 }, { "epoch": 1.2002752924982794, "grad_norm": 0.023988425731658936, "learning_rate": 3.0702042733035926e-05, "loss": 0.0065, "step": 8720 }, { "epoch": 1.2016517549896766, "grad_norm": 0.03604234755039215, "learning_rate": 3.0678563042967833e-05, "loss": 0.133, "step": 8730 }, { "epoch": 1.2030282174810736, "grad_norm": 0.08237233757972717, "learning_rate": 3.065508335289974e-05, "loss": 0.0052, "step": 8740 }, { "epoch": 1.2044046799724708, "grad_norm": 0.07353729754686356, "learning_rate": 3.0631603662831655e-05, "loss": 0.0385, "step": 8750 }, { "epoch": 1.2057811424638678, "grad_norm": 0.09792488068342209, "learning_rate": 3.0608123972763556e-05, "loss": 0.0421, "step": 8760 }, { "epoch": 1.207157604955265, "grad_norm": 0.13896237313747406, "learning_rate": 3.058464428269547e-05, "loss": 0.0617, "step": 8770 }, { "epoch": 1.208534067446662, "grad_norm": 0.17447154223918915, "learning_rate": 3.056116459262738e-05, "loss": 0.0105, "step": 8780 }, { "epoch": 1.2099105299380593, "grad_norm": 0.038310155272483826, "learning_rate": 3.053768490255929e-05, "loss": 0.0017, "step": 8790 }, { "epoch": 1.2112869924294563, "grad_norm": 0.03304479643702507, "learning_rate": 3.05142052124912e-05, "loss": 0.0411, "step": 8800 }, { "epoch": 1.2126634549208535, "grad_norm": 0.04138489067554474, "learning_rate": 3.049072552242311e-05, "loss": 0.0404, "step": 8810 }, { "epoch": 1.2140399174122505, "grad_norm": 3.3786427974700928, "learning_rate": 3.0467245832355013e-05, "loss": 0.0777, "step": 8820 }, { "epoch": 1.2154163799036477, "grad_norm": 0.03629371523857117, "learning_rate": 3.0443766142286924e-05, "loss": 0.0224, "step": 8830 }, { "epoch": 1.2167928423950447, "grad_norm": 0.03215630352497101, "learning_rate": 3.042028645221883e-05, "loss": 0.0205, "step": 8840 }, { "epoch": 1.218169304886442, "grad_norm": 0.03390325978398323, "learning_rate": 3.0396806762150742e-05, "loss": 0.0459, "step": 8850 }, { "epoch": 1.219545767377839, "grad_norm": 0.0317840538918972, "learning_rate": 3.0373327072082653e-05, "loss": 0.0015, "step": 8860 }, { "epoch": 1.2209222298692362, "grad_norm": 0.07705999910831451, "learning_rate": 3.0349847382014557e-05, "loss": 0.0011, "step": 8870 }, { "epoch": 1.2222986923606332, "grad_norm": 0.024175794795155525, "learning_rate": 3.0326367691946468e-05, "loss": 0.0017, "step": 8880 }, { "epoch": 1.2236751548520304, "grad_norm": 0.018058501183986664, "learning_rate": 3.0302888001878378e-05, "loss": 0.0009, "step": 8890 }, { "epoch": 1.2250516173434274, "grad_norm": 0.02315705642104149, "learning_rate": 3.0279408311810286e-05, "loss": 0.0117, "step": 8900 }, { "epoch": 1.2264280798348244, "grad_norm": 0.014247541315853596, "learning_rate": 3.0255928621742196e-05, "loss": 0.0109, "step": 8910 }, { "epoch": 1.2278045423262216, "grad_norm": 0.01358788087964058, "learning_rate": 3.02324489316741e-05, "loss": 0.0005, "step": 8920 }, { "epoch": 1.2291810048176188, "grad_norm": 0.011992747895419598, "learning_rate": 3.020896924160601e-05, "loss": 0.0042, "step": 8930 }, { "epoch": 1.2305574673090158, "grad_norm": 0.012968027032911777, "learning_rate": 3.0185489551537922e-05, "loss": 0.0007, "step": 8940 }, { "epoch": 1.2319339298004128, "grad_norm": 0.01127535104751587, "learning_rate": 3.0162009861469833e-05, "loss": 0.0004, "step": 8950 }, { "epoch": 1.23331039229181, "grad_norm": 0.010431253351271152, "learning_rate": 3.013853017140174e-05, "loss": 0.0004, "step": 8960 }, { "epoch": 1.2346868547832073, "grad_norm": 0.13627731800079346, "learning_rate": 3.0115050481333644e-05, "loss": 0.003, "step": 8970 }, { "epoch": 1.2360633172746043, "grad_norm": 0.016022950410842896, "learning_rate": 3.0091570791265555e-05, "loss": 0.0501, "step": 8980 }, { "epoch": 1.2374397797660013, "grad_norm": 0.021504592150449753, "learning_rate": 3.0068091101197465e-05, "loss": 0.0107, "step": 8990 }, { "epoch": 1.2388162422573985, "grad_norm": 0.016150908544659615, "learning_rate": 3.0044611411129376e-05, "loss": 0.0006, "step": 9000 }, { "epoch": 1.2401927047487955, "grad_norm": 0.009948354214429855, "learning_rate": 3.0021131721061287e-05, "loss": 0.0022, "step": 9010 }, { "epoch": 1.2415691672401927, "grad_norm": 0.009492177516222, "learning_rate": 2.999765203099319e-05, "loss": 0.0025, "step": 9020 }, { "epoch": 1.2429456297315897, "grad_norm": 0.009465621784329414, "learning_rate": 2.9974172340925098e-05, "loss": 0.003, "step": 9030 }, { "epoch": 1.244322092222987, "grad_norm": 0.7412835359573364, "learning_rate": 2.995069265085701e-05, "loss": 0.0127, "step": 9040 }, { "epoch": 1.245698554714384, "grad_norm": 4.683021068572998, "learning_rate": 2.992721296078892e-05, "loss": 0.0141, "step": 9050 }, { "epoch": 1.2470750172057812, "grad_norm": 0.008489931002259254, "learning_rate": 2.990373327072083e-05, "loss": 0.0003, "step": 9060 }, { "epoch": 1.2484514796971782, "grad_norm": 0.012501278892159462, "learning_rate": 2.988025358065274e-05, "loss": 0.0403, "step": 9070 }, { "epoch": 1.2498279421885754, "grad_norm": 0.010246503166854382, "learning_rate": 2.9856773890584645e-05, "loss": 0.0004, "step": 9080 }, { "epoch": 1.2512044046799724, "grad_norm": 0.6677660346031189, "learning_rate": 2.9833294200516552e-05, "loss": 0.0104, "step": 9090 }, { "epoch": 1.2525808671713696, "grad_norm": 0.08200287818908691, "learning_rate": 2.9809814510448463e-05, "loss": 0.0536, "step": 9100 }, { "epoch": 1.2539573296627666, "grad_norm": 0.012925310991704464, "learning_rate": 2.9786334820380374e-05, "loss": 0.0247, "step": 9110 }, { "epoch": 1.2553337921541639, "grad_norm": 0.009677432477474213, "learning_rate": 2.9762855130312285e-05, "loss": 0.0144, "step": 9120 }, { "epoch": 1.2567102546455609, "grad_norm": 0.053978025913238525, "learning_rate": 2.973937544024419e-05, "loss": 0.005, "step": 9130 }, { "epoch": 1.258086717136958, "grad_norm": 0.007046802435070276, "learning_rate": 2.97158957501761e-05, "loss": 0.0103, "step": 9140 }, { "epoch": 1.259463179628355, "grad_norm": 0.005789673887193203, "learning_rate": 2.9692416060108007e-05, "loss": 0.0147, "step": 9150 }, { "epoch": 1.2608396421197523, "grad_norm": 0.006138138473033905, "learning_rate": 2.9668936370039918e-05, "loss": 0.0288, "step": 9160 }, { "epoch": 1.2622161046111493, "grad_norm": 0.006734949070960283, "learning_rate": 2.9645456679971828e-05, "loss": 0.0025, "step": 9170 }, { "epoch": 1.2635925671025465, "grad_norm": 0.006863762624561787, "learning_rate": 2.9621976989903732e-05, "loss": 0.0031, "step": 9180 }, { "epoch": 1.2649690295939435, "grad_norm": 0.00471033900976181, "learning_rate": 2.9598497299835643e-05, "loss": 0.0033, "step": 9190 }, { "epoch": 1.2663454920853408, "grad_norm": 0.004977040458470583, "learning_rate": 2.9575017609767554e-05, "loss": 0.0003, "step": 9200 }, { "epoch": 1.2677219545767378, "grad_norm": 0.004502297844737768, "learning_rate": 2.955153791969946e-05, "loss": 0.0071, "step": 9210 }, { "epoch": 1.269098417068135, "grad_norm": 0.006263911724090576, "learning_rate": 2.9528058229631372e-05, "loss": 0.0458, "step": 9220 }, { "epoch": 1.270474879559532, "grad_norm": 0.0448748953640461, "learning_rate": 2.9504578539563276e-05, "loss": 0.0046, "step": 9230 }, { "epoch": 1.2718513420509292, "grad_norm": 0.02474851906299591, "learning_rate": 2.9481098849495187e-05, "loss": 0.0037, "step": 9240 }, { "epoch": 1.2732278045423262, "grad_norm": 12.501758575439453, "learning_rate": 2.9457619159427097e-05, "loss": 0.1089, "step": 9250 }, { "epoch": 1.2746042670337232, "grad_norm": 0.1046203076839447, "learning_rate": 2.9434139469359008e-05, "loss": 0.0037, "step": 9260 }, { "epoch": 1.2759807295251204, "grad_norm": 0.4267828166484833, "learning_rate": 2.9410659779290915e-05, "loss": 0.0569, "step": 9270 }, { "epoch": 1.2773571920165177, "grad_norm": 0.007830817252397537, "learning_rate": 2.9387180089222823e-05, "loss": 0.0004, "step": 9280 }, { "epoch": 1.2787336545079147, "grad_norm": 0.006392312236130238, "learning_rate": 2.936370039915473e-05, "loss": 0.0003, "step": 9290 }, { "epoch": 1.2801101169993117, "grad_norm": 0.00674103619530797, "learning_rate": 2.934022070908664e-05, "loss": 0.0003, "step": 9300 }, { "epoch": 1.2814865794907089, "grad_norm": 0.007592615205794573, "learning_rate": 2.931674101901855e-05, "loss": 0.0037, "step": 9310 }, { "epoch": 1.282863041982106, "grad_norm": 0.006516032852232456, "learning_rate": 2.9293261328950462e-05, "loss": 0.0002, "step": 9320 }, { "epoch": 1.284239504473503, "grad_norm": 0.005117457825690508, "learning_rate": 2.926978163888237e-05, "loss": 0.0067, "step": 9330 }, { "epoch": 1.2856159669649, "grad_norm": 0.00913609191775322, "learning_rate": 2.9246301948814274e-05, "loss": 0.1142, "step": 9340 }, { "epoch": 1.2869924294562973, "grad_norm": 0.008628927171230316, "learning_rate": 2.9222822258746184e-05, "loss": 0.0069, "step": 9350 }, { "epoch": 1.2883688919476946, "grad_norm": 0.008551936596632004, "learning_rate": 2.9199342568678095e-05, "loss": 0.0098, "step": 9360 }, { "epoch": 1.2897453544390916, "grad_norm": 0.007917960174381733, "learning_rate": 2.9175862878610006e-05, "loss": 0.0067, "step": 9370 }, { "epoch": 1.2911218169304886, "grad_norm": 0.00976267084479332, "learning_rate": 2.9152383188541917e-05, "loss": 0.0003, "step": 9380 }, { "epoch": 1.2924982794218858, "grad_norm": 0.01250447053462267, "learning_rate": 2.912890349847382e-05, "loss": 0.0506, "step": 9390 }, { "epoch": 1.293874741913283, "grad_norm": 0.018209166824817657, "learning_rate": 2.9105423808405728e-05, "loss": 0.0156, "step": 9400 }, { "epoch": 1.29525120440468, "grad_norm": 0.03205062821507454, "learning_rate": 2.908194411833764e-05, "loss": 0.0426, "step": 9410 }, { "epoch": 1.296627666896077, "grad_norm": 0.0931173786520958, "learning_rate": 2.905846442826955e-05, "loss": 0.0441, "step": 9420 }, { "epoch": 1.2980041293874742, "grad_norm": 0.05968797951936722, "learning_rate": 2.903498473820146e-05, "loss": 0.0054, "step": 9430 }, { "epoch": 1.2993805918788712, "grad_norm": 3.0881783962249756, "learning_rate": 2.9011505048133364e-05, "loss": 0.0289, "step": 9440 }, { "epoch": 1.3007570543702685, "grad_norm": 2.33501935005188, "learning_rate": 2.8988025358065275e-05, "loss": 0.0222, "step": 9450 }, { "epoch": 1.3021335168616655, "grad_norm": 1.2864433526992798, "learning_rate": 2.8964545667997182e-05, "loss": 0.0766, "step": 9460 }, { "epoch": 1.3035099793530627, "grad_norm": 0.03782026097178459, "learning_rate": 2.8941065977929093e-05, "loss": 0.0135, "step": 9470 }, { "epoch": 1.3048864418444597, "grad_norm": 0.017889073118567467, "learning_rate": 2.8917586287861004e-05, "loss": 0.0082, "step": 9480 }, { "epoch": 1.306262904335857, "grad_norm": 0.012841041199862957, "learning_rate": 2.8894106597792908e-05, "loss": 0.002, "step": 9490 }, { "epoch": 1.307639366827254, "grad_norm": 0.017291700467467308, "learning_rate": 2.887062690772482e-05, "loss": 0.0468, "step": 9500 }, { "epoch": 1.3090158293186511, "grad_norm": 0.021390002220869064, "learning_rate": 2.884714721765673e-05, "loss": 0.0032, "step": 9510 }, { "epoch": 1.3103922918100481, "grad_norm": 0.01667897216975689, "learning_rate": 2.8823667527588637e-05, "loss": 0.0007, "step": 9520 }, { "epoch": 1.3117687543014453, "grad_norm": 0.015712326392531395, "learning_rate": 2.8800187837520547e-05, "loss": 0.0069, "step": 9530 }, { "epoch": 1.3131452167928424, "grad_norm": 0.01881253719329834, "learning_rate": 2.877670814745245e-05, "loss": 0.0635, "step": 9540 }, { "epoch": 1.3145216792842396, "grad_norm": 0.026662955060601234, "learning_rate": 2.8753228457384362e-05, "loss": 0.0009, "step": 9550 }, { "epoch": 1.3158981417756366, "grad_norm": 0.019506927579641342, "learning_rate": 2.8729748767316273e-05, "loss": 0.0068, "step": 9560 }, { "epoch": 1.3172746042670338, "grad_norm": 0.017153818160295486, "learning_rate": 2.8706269077248183e-05, "loss": 0.0032, "step": 9570 }, { "epoch": 1.3186510667584308, "grad_norm": 0.015676824375987053, "learning_rate": 2.868278938718009e-05, "loss": 0.0501, "step": 9580 }, { "epoch": 1.320027529249828, "grad_norm": 2.07688045501709, "learning_rate": 2.8659309697111998e-05, "loss": 0.0496, "step": 9590 }, { "epoch": 1.321403991741225, "grad_norm": 0.01916983164846897, "learning_rate": 2.8635830007043906e-05, "loss": 0.0421, "step": 9600 }, { "epoch": 1.322780454232622, "grad_norm": 0.02361614629626274, "learning_rate": 2.8612350316975816e-05, "loss": 0.0015, "step": 9610 }, { "epoch": 1.3241569167240193, "grad_norm": 0.0356995165348053, "learning_rate": 2.8588870626907727e-05, "loss": 0.0473, "step": 9620 }, { "epoch": 1.3255333792154165, "grad_norm": 1.7962840795516968, "learning_rate": 2.8565390936839638e-05, "loss": 0.0504, "step": 9630 }, { "epoch": 1.3269098417068135, "grad_norm": 0.061549749225378036, "learning_rate": 2.8541911246771545e-05, "loss": 0.035, "step": 9640 }, { "epoch": 1.3282863041982105, "grad_norm": 0.2911525368690491, "learning_rate": 2.8518431556703452e-05, "loss": 0.0156, "step": 9650 }, { "epoch": 1.3296627666896077, "grad_norm": 0.06726056337356567, "learning_rate": 2.849495186663536e-05, "loss": 0.0056, "step": 9660 }, { "epoch": 1.331039229181005, "grad_norm": 0.029883068054914474, "learning_rate": 2.847147217656727e-05, "loss": 0.0021, "step": 9670 }, { "epoch": 1.332415691672402, "grad_norm": 9.963424682617188, "learning_rate": 2.844799248649918e-05, "loss": 0.0354, "step": 9680 }, { "epoch": 1.333792154163799, "grad_norm": 0.02046596072614193, "learning_rate": 2.8424512796431092e-05, "loss": 0.0506, "step": 9690 }, { "epoch": 1.3351686166551961, "grad_norm": 0.02014990523457527, "learning_rate": 2.8401033106362996e-05, "loss": 0.0049, "step": 9700 }, { "epoch": 1.3365450791465934, "grad_norm": 0.02060380019247532, "learning_rate": 2.8377553416294907e-05, "loss": 0.0021, "step": 9710 }, { "epoch": 1.3379215416379904, "grad_norm": 0.020940499380230904, "learning_rate": 2.8354073726226814e-05, "loss": 0.0008, "step": 9720 }, { "epoch": 1.3392980041293874, "grad_norm": 0.015663472935557365, "learning_rate": 2.8330594036158725e-05, "loss": 0.0064, "step": 9730 }, { "epoch": 1.3406744666207846, "grad_norm": 1.7442946434020996, "learning_rate": 2.8307114346090636e-05, "loss": 0.0366, "step": 9740 }, { "epoch": 1.3420509291121818, "grad_norm": 0.027374329045414925, "learning_rate": 2.828363465602254e-05, "loss": 0.0963, "step": 9750 }, { "epoch": 1.3434273916035788, "grad_norm": 0.03140200674533844, "learning_rate": 2.826015496595445e-05, "loss": 0.0066, "step": 9760 }, { "epoch": 1.3448038540949758, "grad_norm": 0.04669001325964928, "learning_rate": 2.823667527588636e-05, "loss": 0.0058, "step": 9770 }, { "epoch": 1.346180316586373, "grad_norm": 0.8155864477157593, "learning_rate": 2.821319558581827e-05, "loss": 0.0147, "step": 9780 }, { "epoch": 1.34755677907777, "grad_norm": 0.01442616805434227, "learning_rate": 2.818971589575018e-05, "loss": 0.0012, "step": 9790 }, { "epoch": 1.3489332415691673, "grad_norm": 0.24510769546031952, "learning_rate": 2.8166236205682083e-05, "loss": 0.0023, "step": 9800 }, { "epoch": 1.3503097040605643, "grad_norm": 0.013476569205522537, "learning_rate": 2.8142756515613994e-05, "loss": 0.0006, "step": 9810 }, { "epoch": 1.3516861665519615, "grad_norm": 0.019142646342515945, "learning_rate": 2.8119276825545905e-05, "loss": 0.018, "step": 9820 }, { "epoch": 1.3530626290433585, "grad_norm": 0.00993957556784153, "learning_rate": 2.8095797135477815e-05, "loss": 0.0023, "step": 9830 }, { "epoch": 1.3544390915347557, "grad_norm": 0.022571871057152748, "learning_rate": 2.8072317445409723e-05, "loss": 0.0033, "step": 9840 }, { "epoch": 1.3558155540261527, "grad_norm": 0.012195726856589317, "learning_rate": 2.8048837755341627e-05, "loss": 0.0009, "step": 9850 }, { "epoch": 1.35719201651755, "grad_norm": 0.01030392199754715, "learning_rate": 2.8025358065273537e-05, "loss": 0.0004, "step": 9860 }, { "epoch": 1.358568479008947, "grad_norm": 0.01108589582145214, "learning_rate": 2.8001878375205448e-05, "loss": 0.0043, "step": 9870 }, { "epoch": 1.3599449415003442, "grad_norm": 0.007381286472082138, "learning_rate": 2.797839868513736e-05, "loss": 0.0134, "step": 9880 }, { "epoch": 1.3613214039917412, "grad_norm": 0.009930983185768127, "learning_rate": 2.795491899506927e-05, "loss": 0.0013, "step": 9890 }, { "epoch": 1.3626978664831384, "grad_norm": 2.24116849899292, "learning_rate": 2.7931439305001177e-05, "loss": 0.0543, "step": 9900 }, { "epoch": 1.3640743289745354, "grad_norm": 0.008840580470860004, "learning_rate": 2.790795961493308e-05, "loss": 0.0004, "step": 9910 }, { "epoch": 1.3654507914659326, "grad_norm": 0.5563055276870728, "learning_rate": 2.7884479924864992e-05, "loss": 0.0024, "step": 9920 }, { "epoch": 1.3668272539573296, "grad_norm": 0.012726586312055588, "learning_rate": 2.7861000234796903e-05, "loss": 0.0512, "step": 9930 }, { "epoch": 1.3682037164487268, "grad_norm": 0.018311336636543274, "learning_rate": 2.7837520544728813e-05, "loss": 0.0018, "step": 9940 }, { "epoch": 1.3695801789401238, "grad_norm": 0.016148675233125687, "learning_rate": 2.781404085466072e-05, "loss": 0.0022, "step": 9950 }, { "epoch": 1.370956641431521, "grad_norm": 0.01348541397601366, "learning_rate": 2.7790561164592628e-05, "loss": 0.0016, "step": 9960 }, { "epoch": 1.372333103922918, "grad_norm": 0.015365678817033768, "learning_rate": 2.7767081474524535e-05, "loss": 0.079, "step": 9970 }, { "epoch": 1.3737095664143153, "grad_norm": 0.020452169701457024, "learning_rate": 2.7743601784456446e-05, "loss": 0.0027, "step": 9980 }, { "epoch": 1.3750860289057123, "grad_norm": 0.019936421886086464, "learning_rate": 2.7720122094388357e-05, "loss": 0.0273, "step": 9990 }, { "epoch": 1.3764624913971093, "grad_norm": 0.0206698477268219, "learning_rate": 2.7696642404320268e-05, "loss": 0.0008, "step": 10000 }, { "epoch": 1.3778389538885065, "grad_norm": 0.02864844724535942, "learning_rate": 2.767316271425217e-05, "loss": 0.1043, "step": 10010 }, { "epoch": 1.3792154163799037, "grad_norm": 0.037661053240299225, "learning_rate": 2.7649683024184082e-05, "loss": 0.0325, "step": 10020 }, { "epoch": 1.3805918788713007, "grad_norm": 0.030517948791384697, "learning_rate": 2.762620333411599e-05, "loss": 0.0154, "step": 10030 }, { "epoch": 1.3819683413626977, "grad_norm": 0.02358236163854599, "learning_rate": 2.76027236440479e-05, "loss": 0.0012, "step": 10040 }, { "epoch": 1.383344803854095, "grad_norm": 0.4532214403152466, "learning_rate": 2.757924395397981e-05, "loss": 0.0027, "step": 10050 }, { "epoch": 1.3847212663454922, "grad_norm": 0.014557798393070698, "learning_rate": 2.7555764263911715e-05, "loss": 0.0022, "step": 10060 }, { "epoch": 1.3860977288368892, "grad_norm": 0.01349212508648634, "learning_rate": 2.7532284573843626e-05, "loss": 0.0175, "step": 10070 }, { "epoch": 1.3874741913282862, "grad_norm": 0.011753273196518421, "learning_rate": 2.7508804883775537e-05, "loss": 0.0126, "step": 10080 }, { "epoch": 1.3888506538196834, "grad_norm": 0.018702656030654907, "learning_rate": 2.7485325193707444e-05, "loss": 0.0512, "step": 10090 }, { "epoch": 1.3902271163110806, "grad_norm": 0.018739372491836548, "learning_rate": 2.7461845503639355e-05, "loss": 0.0007, "step": 10100 }, { "epoch": 1.3916035788024776, "grad_norm": 0.016466490924358368, "learning_rate": 2.743836581357126e-05, "loss": 0.0008, "step": 10110 }, { "epoch": 1.3929800412938746, "grad_norm": 0.02920321375131607, "learning_rate": 2.741488612350317e-05, "loss": 0.0934, "step": 10120 }, { "epoch": 1.3943565037852719, "grad_norm": 0.03858359903097153, "learning_rate": 2.739140643343508e-05, "loss": 0.0054, "step": 10130 }, { "epoch": 1.395732966276669, "grad_norm": 3.9971954822540283, "learning_rate": 2.736792674336699e-05, "loss": 0.0308, "step": 10140 }, { "epoch": 1.397109428768066, "grad_norm": 0.04408375546336174, "learning_rate": 2.7344447053298898e-05, "loss": 0.0032, "step": 10150 }, { "epoch": 1.398485891259463, "grad_norm": 0.20868784189224243, "learning_rate": 2.732096736323081e-05, "loss": 0.0046, "step": 10160 }, { "epoch": 1.3998623537508603, "grad_norm": 0.018119895830750465, "learning_rate": 2.7297487673162713e-05, "loss": 0.0013, "step": 10170 }, { "epoch": 1.4012388162422573, "grad_norm": 0.01632274128496647, "learning_rate": 2.7274007983094624e-05, "loss": 0.0014, "step": 10180 }, { "epoch": 1.4026152787336545, "grad_norm": 0.014994965866208076, "learning_rate": 2.7250528293026534e-05, "loss": 0.0006, "step": 10190 }, { "epoch": 1.4039917412250515, "grad_norm": 0.012106852605938911, "learning_rate": 2.7227048602958445e-05, "loss": 0.0006, "step": 10200 }, { "epoch": 1.4053682037164488, "grad_norm": 0.021616360172629356, "learning_rate": 2.7203568912890353e-05, "loss": 0.0304, "step": 10210 }, { "epoch": 1.4067446662078458, "grad_norm": 0.01244658138602972, "learning_rate": 2.7180089222822256e-05, "loss": 0.0175, "step": 10220 }, { "epoch": 1.408121128699243, "grad_norm": 0.012573972344398499, "learning_rate": 2.7156609532754167e-05, "loss": 0.003, "step": 10230 }, { "epoch": 1.40949759119064, "grad_norm": 0.009852863848209381, "learning_rate": 2.7133129842686078e-05, "loss": 0.0011, "step": 10240 }, { "epoch": 1.4108740536820372, "grad_norm": 0.009701041504740715, "learning_rate": 2.710965015261799e-05, "loss": 0.0004, "step": 10250 }, { "epoch": 1.4122505161734342, "grad_norm": 0.0083130719140172, "learning_rate": 2.70861704625499e-05, "loss": 0.0004, "step": 10260 }, { "epoch": 1.4136269786648314, "grad_norm": 0.007407383993268013, "learning_rate": 2.7062690772481803e-05, "loss": 0.0017, "step": 10270 }, { "epoch": 1.4150034411562284, "grad_norm": 0.009641473181545734, "learning_rate": 2.703921108241371e-05, "loss": 0.0006, "step": 10280 }, { "epoch": 1.4163799036476257, "grad_norm": 0.00826194416731596, "learning_rate": 2.701573139234562e-05, "loss": 0.0145, "step": 10290 }, { "epoch": 1.4177563661390227, "grad_norm": 0.3388448655605316, "learning_rate": 2.6992251702277532e-05, "loss": 0.0137, "step": 10300 }, { "epoch": 1.41913282863042, "grad_norm": 0.007092120125889778, "learning_rate": 2.6968772012209443e-05, "loss": 0.0151, "step": 10310 }, { "epoch": 1.420509291121817, "grad_norm": 0.00848405808210373, "learning_rate": 2.6945292322141347e-05, "loss": 0.0709, "step": 10320 }, { "epoch": 1.4218857536132141, "grad_norm": 0.006756073795258999, "learning_rate": 2.6921812632073258e-05, "loss": 0.002, "step": 10330 }, { "epoch": 1.4232622161046111, "grad_norm": 0.007049960549920797, "learning_rate": 2.6898332942005165e-05, "loss": 0.0022, "step": 10340 }, { "epoch": 1.4246386785960081, "grad_norm": 0.014797673560678959, "learning_rate": 2.6874853251937076e-05, "loss": 0.034, "step": 10350 }, { "epoch": 1.4260151410874053, "grad_norm": 1.0711532831192017, "learning_rate": 2.6851373561868987e-05, "loss": 0.0395, "step": 10360 }, { "epoch": 1.4273916035788026, "grad_norm": 0.006496098358184099, "learning_rate": 2.682789387180089e-05, "loss": 0.0013, "step": 10370 }, { "epoch": 1.4287680660701996, "grad_norm": 0.005622785538434982, "learning_rate": 2.68044141817328e-05, "loss": 0.0034, "step": 10380 }, { "epoch": 1.4301445285615966, "grad_norm": 0.004898019600659609, "learning_rate": 2.6780934491664712e-05, "loss": 0.0034, "step": 10390 }, { "epoch": 1.4315209910529938, "grad_norm": 0.03493994474411011, "learning_rate": 2.675745480159662e-05, "loss": 0.0076, "step": 10400 }, { "epoch": 1.432897453544391, "grad_norm": 0.004973824135959148, "learning_rate": 2.673397511152853e-05, "loss": 0.019, "step": 10410 }, { "epoch": 1.434273916035788, "grad_norm": 0.027292657643556595, "learning_rate": 2.6710495421460434e-05, "loss": 0.0598, "step": 10420 }, { "epoch": 1.435650378527185, "grad_norm": 0.016689104959368706, "learning_rate": 2.6687015731392345e-05, "loss": 0.0026, "step": 10430 }, { "epoch": 1.4370268410185822, "grad_norm": 2.132362127304077, "learning_rate": 2.6663536041324256e-05, "loss": 0.0513, "step": 10440 }, { "epoch": 1.4384033035099795, "grad_norm": 0.039230260998010635, "learning_rate": 2.6640056351256166e-05, "loss": 0.0324, "step": 10450 }, { "epoch": 1.4397797660013765, "grad_norm": 0.03446982428431511, "learning_rate": 2.6616576661188074e-05, "loss": 0.002, "step": 10460 }, { "epoch": 1.4411562284927735, "grad_norm": 0.013069100677967072, "learning_rate": 2.6593096971119984e-05, "loss": 0.0026, "step": 10470 }, { "epoch": 1.4425326909841707, "grad_norm": 0.014296557754278183, "learning_rate": 2.656961728105189e-05, "loss": 0.001, "step": 10480 }, { "epoch": 1.443909153475568, "grad_norm": 0.010211389511823654, "learning_rate": 2.65461375909838e-05, "loss": 0.0012, "step": 10490 }, { "epoch": 1.445285615966965, "grad_norm": 0.009738571010529995, "learning_rate": 2.652265790091571e-05, "loss": 0.033, "step": 10500 }, { "epoch": 1.446662078458362, "grad_norm": 0.024979522451758385, "learning_rate": 2.649917821084762e-05, "loss": 0.0016, "step": 10510 }, { "epoch": 1.4480385409497591, "grad_norm": 0.02968720532953739, "learning_rate": 2.6475698520779528e-05, "loss": 0.0787, "step": 10520 }, { "epoch": 1.4494150034411561, "grad_norm": 0.0773964375257492, "learning_rate": 2.6452218830711435e-05, "loss": 0.0016, "step": 10530 }, { "epoch": 1.4507914659325534, "grad_norm": 0.06606357544660568, "learning_rate": 2.6428739140643343e-05, "loss": 0.0218, "step": 10540 }, { "epoch": 1.4521679284239504, "grad_norm": 0.04967644438147545, "learning_rate": 2.6405259450575253e-05, "loss": 0.0019, "step": 10550 }, { "epoch": 1.4535443909153476, "grad_norm": 0.010468605905771255, "learning_rate": 2.6381779760507164e-05, "loss": 0.0516, "step": 10560 }, { "epoch": 1.4549208534067446, "grad_norm": 0.05936382710933685, "learning_rate": 2.6358300070439075e-05, "loss": 0.0158, "step": 10570 }, { "epoch": 1.4562973158981418, "grad_norm": 0.017470311373472214, "learning_rate": 2.633482038037098e-05, "loss": 0.0017, "step": 10580 }, { "epoch": 1.4576737783895388, "grad_norm": 0.041099607944488525, "learning_rate": 2.631134069030289e-05, "loss": 0.002, "step": 10590 }, { "epoch": 1.459050240880936, "grad_norm": 0.013546425849199295, "learning_rate": 2.6287861000234797e-05, "loss": 0.0241, "step": 10600 }, { "epoch": 1.460426703372333, "grad_norm": 0.012219764292240143, "learning_rate": 2.6264381310166708e-05, "loss": 0.0008, "step": 10610 }, { "epoch": 1.4618031658637303, "grad_norm": 0.02229696698486805, "learning_rate": 2.624090162009862e-05, "loss": 0.0122, "step": 10620 }, { "epoch": 1.4631796283551273, "grad_norm": 0.026320239529013634, "learning_rate": 2.6217421930030522e-05, "loss": 0.0015, "step": 10630 }, { "epoch": 1.4645560908465245, "grad_norm": 0.012845059856772423, "learning_rate": 2.6193942239962433e-05, "loss": 0.0007, "step": 10640 }, { "epoch": 1.4659325533379215, "grad_norm": 2.9102137088775635, "learning_rate": 2.6170462549894344e-05, "loss": 0.0527, "step": 10650 }, { "epoch": 1.4673090158293187, "grad_norm": 0.018991604447364807, "learning_rate": 2.614698285982625e-05, "loss": 0.0246, "step": 10660 }, { "epoch": 1.4686854783207157, "grad_norm": 0.015752600505948067, "learning_rate": 2.6123503169758162e-05, "loss": 0.019, "step": 10670 }, { "epoch": 1.470061940812113, "grad_norm": 0.011750704608857632, "learning_rate": 2.6100023479690066e-05, "loss": 0.001, "step": 10680 }, { "epoch": 1.47143840330351, "grad_norm": 0.011764878407120705, "learning_rate": 2.6076543789621977e-05, "loss": 0.0005, "step": 10690 }, { "epoch": 1.4728148657949072, "grad_norm": 0.009964767843484879, "learning_rate": 2.6053064099553888e-05, "loss": 0.0009, "step": 10700 }, { "epoch": 1.4741913282863042, "grad_norm": 0.009428976103663445, "learning_rate": 2.6029584409485798e-05, "loss": 0.0004, "step": 10710 }, { "epoch": 1.4755677907777014, "grad_norm": 0.008854003623127937, "learning_rate": 2.6006104719417706e-05, "loss": 0.0003, "step": 10720 }, { "epoch": 1.4769442532690984, "grad_norm": 0.09832137823104858, "learning_rate": 2.5982625029349616e-05, "loss": 0.0006, "step": 10730 }, { "epoch": 1.4783207157604954, "grad_norm": 0.3997594714164734, "learning_rate": 2.595914533928152e-05, "loss": 0.0214, "step": 10740 }, { "epoch": 1.4796971782518926, "grad_norm": 1.8332618474960327, "learning_rate": 2.593566564921343e-05, "loss": 0.0474, "step": 10750 }, { "epoch": 1.4810736407432898, "grad_norm": 0.0065375277772545815, "learning_rate": 2.5912185959145342e-05, "loss": 0.0113, "step": 10760 }, { "epoch": 1.4824501032346868, "grad_norm": 0.027123719453811646, "learning_rate": 2.588870626907725e-05, "loss": 0.0097, "step": 10770 }, { "epoch": 1.4838265657260838, "grad_norm": 0.7287060022354126, "learning_rate": 2.586522657900916e-05, "loss": 0.0125, "step": 10780 }, { "epoch": 1.485203028217481, "grad_norm": 0.05322369188070297, "learning_rate": 2.5841746888941064e-05, "loss": 0.0557, "step": 10790 }, { "epoch": 1.4865794907088783, "grad_norm": 0.04496969282627106, "learning_rate": 2.5818267198872975e-05, "loss": 0.004, "step": 10800 }, { "epoch": 1.4879559532002753, "grad_norm": 0.008320951834321022, "learning_rate": 2.5794787508804885e-05, "loss": 0.0004, "step": 10810 }, { "epoch": 1.4893324156916723, "grad_norm": 0.00979155395179987, "learning_rate": 2.5771307818736796e-05, "loss": 0.0323, "step": 10820 }, { "epoch": 1.4907088781830695, "grad_norm": 0.011519234627485275, "learning_rate": 2.5747828128668703e-05, "loss": 0.0483, "step": 10830 }, { "epoch": 1.4920853406744667, "grad_norm": 0.21893887221813202, "learning_rate": 2.572434843860061e-05, "loss": 0.0355, "step": 10840 }, { "epoch": 1.4934618031658637, "grad_norm": 0.015970543026924133, "learning_rate": 2.5700868748532518e-05, "loss": 0.0023, "step": 10850 }, { "epoch": 1.4948382656572607, "grad_norm": 33.26145935058594, "learning_rate": 2.567738905846443e-05, "loss": 0.0232, "step": 10860 }, { "epoch": 1.496214728148658, "grad_norm": 0.007780445273965597, "learning_rate": 2.565390936839634e-05, "loss": 0.0014, "step": 10870 }, { "epoch": 1.4975911906400552, "grad_norm": 0.07392710447311401, "learning_rate": 2.563042967832825e-05, "loss": 0.096, "step": 10880 }, { "epoch": 1.4989676531314522, "grad_norm": 0.028467949479818344, "learning_rate": 2.5606949988260154e-05, "loss": 0.0009, "step": 10890 }, { "epoch": 1.5003441156228492, "grad_norm": 13.634817123413086, "learning_rate": 2.5583470298192065e-05, "loss": 0.0662, "step": 10900 }, { "epoch": 1.5017205781142464, "grad_norm": 0.3698103129863739, "learning_rate": 2.5559990608123972e-05, "loss": 0.0405, "step": 10910 }, { "epoch": 1.5030970406056436, "grad_norm": 0.06074180081486702, "learning_rate": 2.5536510918055883e-05, "loss": 0.0563, "step": 10920 }, { "epoch": 1.5044735030970406, "grad_norm": 0.014030024409294128, "learning_rate": 2.5513031227987794e-05, "loss": 0.0102, "step": 10930 }, { "epoch": 1.5058499655884376, "grad_norm": 0.012856217101216316, "learning_rate": 2.5489551537919698e-05, "loss": 0.0191, "step": 10940 }, { "epoch": 1.5072264280798349, "grad_norm": 0.14362230896949768, "learning_rate": 2.546607184785161e-05, "loss": 0.0144, "step": 10950 }, { "epoch": 1.508602890571232, "grad_norm": 2.667771816253662, "learning_rate": 2.544259215778352e-05, "loss": 0.0135, "step": 10960 }, { "epoch": 1.509979353062629, "grad_norm": 0.00872336607426405, "learning_rate": 2.5419112467715427e-05, "loss": 0.0135, "step": 10970 }, { "epoch": 1.511355815554026, "grad_norm": 0.006830106023699045, "learning_rate": 2.5395632777647338e-05, "loss": 0.0038, "step": 10980 }, { "epoch": 1.5127322780454233, "grad_norm": 0.006794198881834745, "learning_rate": 2.5372153087579248e-05, "loss": 0.0016, "step": 10990 }, { "epoch": 1.5141087405368203, "grad_norm": 0.7173166275024414, "learning_rate": 2.5348673397511152e-05, "loss": 0.0024, "step": 11000 }, { "epoch": 1.5154852030282173, "grad_norm": 0.006951649207621813, "learning_rate": 2.5325193707443063e-05, "loss": 0.0003, "step": 11010 }, { "epoch": 1.5168616655196145, "grad_norm": 0.006769006606191397, "learning_rate": 2.5301714017374974e-05, "loss": 0.0114, "step": 11020 }, { "epoch": 1.5182381280110118, "grad_norm": 0.0058417245745658875, "learning_rate": 2.527823432730688e-05, "loss": 0.0042, "step": 11030 }, { "epoch": 1.5196145905024088, "grad_norm": 0.004894690588116646, "learning_rate": 2.5254754637238792e-05, "loss": 0.0002, "step": 11040 }, { "epoch": 1.5209910529938058, "grad_norm": 0.005281677469611168, "learning_rate": 2.5231274947170696e-05, "loss": 0.0002, "step": 11050 }, { "epoch": 1.522367515485203, "grad_norm": 0.005624835845082998, "learning_rate": 2.5207795257102607e-05, "loss": 0.0542, "step": 11060 }, { "epoch": 1.5237439779766002, "grad_norm": 0.13457074761390686, "learning_rate": 2.5184315567034517e-05, "loss": 0.0031, "step": 11070 }, { "epoch": 1.5251204404679972, "grad_norm": 0.009839137084782124, "learning_rate": 2.5160835876966428e-05, "loss": 0.0523, "step": 11080 }, { "epoch": 1.5264969029593942, "grad_norm": 0.050115667283535004, "learning_rate": 2.5137356186898335e-05, "loss": 0.0113, "step": 11090 }, { "epoch": 1.5278733654507914, "grad_norm": 0.00906539335846901, "learning_rate": 2.511387649683024e-05, "loss": 0.0311, "step": 11100 }, { "epoch": 1.5292498279421887, "grad_norm": 0.008596427738666534, "learning_rate": 2.509039680676215e-05, "loss": 0.0013, "step": 11110 }, { "epoch": 1.5306262904335857, "grad_norm": 5.201973915100098, "learning_rate": 2.506691711669406e-05, "loss": 0.0179, "step": 11120 }, { "epoch": 1.5320027529249827, "grad_norm": 0.006756656337529421, "learning_rate": 2.504343742662597e-05, "loss": 0.0024, "step": 11130 }, { "epoch": 1.53337921541638, "grad_norm": 0.05719393864274025, "learning_rate": 2.5019957736557882e-05, "loss": 0.0016, "step": 11140 }, { "epoch": 1.5347556779077771, "grad_norm": 0.006463745143264532, "learning_rate": 2.4996478046489786e-05, "loss": 0.0273, "step": 11150 }, { "epoch": 1.5361321403991741, "grad_norm": 0.006732061039656401, "learning_rate": 2.4972998356421694e-05, "loss": 0.0016, "step": 11160 }, { "epoch": 1.5375086028905711, "grad_norm": 0.006338499020785093, "learning_rate": 2.4949518666353604e-05, "loss": 0.0026, "step": 11170 }, { "epoch": 1.5388850653819683, "grad_norm": 0.007990856654942036, "learning_rate": 2.4926038976285515e-05, "loss": 0.0555, "step": 11180 }, { "epoch": 1.5402615278733656, "grad_norm": 0.009589756838977337, "learning_rate": 2.4902559286217422e-05, "loss": 0.003, "step": 11190 }, { "epoch": 1.5416379903647626, "grad_norm": 0.00850045494735241, "learning_rate": 2.4879079596149333e-05, "loss": 0.0414, "step": 11200 }, { "epoch": 1.5430144528561596, "grad_norm": 0.015693694353103638, "learning_rate": 2.485559990608124e-05, "loss": 0.0005, "step": 11210 }, { "epoch": 1.5443909153475568, "grad_norm": 0.008239972405135632, "learning_rate": 2.4832120216013148e-05, "loss": 0.0372, "step": 11220 }, { "epoch": 1.545767377838954, "grad_norm": 0.02053440362215042, "learning_rate": 2.480864052594506e-05, "loss": 0.0035, "step": 11230 }, { "epoch": 1.547143840330351, "grad_norm": 0.008120126090943813, "learning_rate": 2.4785160835876966e-05, "loss": 0.0005, "step": 11240 }, { "epoch": 1.548520302821748, "grad_norm": 0.01017313078045845, "learning_rate": 2.4761681145808877e-05, "loss": 0.0541, "step": 11250 }, { "epoch": 1.5498967653131452, "grad_norm": 0.01090360339730978, "learning_rate": 2.4738201455740788e-05, "loss": 0.0017, "step": 11260 }, { "epoch": 1.5512732278045425, "grad_norm": 0.01929691806435585, "learning_rate": 2.4714721765672695e-05, "loss": 0.0037, "step": 11270 }, { "epoch": 1.5526496902959395, "grad_norm": 0.05219206586480141, "learning_rate": 2.4691242075604602e-05, "loss": 0.0006, "step": 11280 }, { "epoch": 1.5540261527873365, "grad_norm": 0.009356614202260971, "learning_rate": 2.466776238553651e-05, "loss": 0.0006, "step": 11290 }, { "epoch": 1.5554026152787337, "grad_norm": 0.010170584544539452, "learning_rate": 2.464428269546842e-05, "loss": 0.0161, "step": 11300 }, { "epoch": 1.556779077770131, "grad_norm": 0.008509612642228603, "learning_rate": 2.462080300540033e-05, "loss": 0.0102, "step": 11310 }, { "epoch": 1.558155540261528, "grad_norm": 0.008614452555775642, "learning_rate": 2.459732331533224e-05, "loss": 0.0104, "step": 11320 }, { "epoch": 1.559532002752925, "grad_norm": 0.008502032607793808, "learning_rate": 2.457384362526415e-05, "loss": 0.0512, "step": 11330 }, { "epoch": 1.5609084652443221, "grad_norm": 0.18153183162212372, "learning_rate": 2.4550363935196057e-05, "loss": 0.0041, "step": 11340 }, { "epoch": 1.5622849277357194, "grad_norm": 6.459352493286133, "learning_rate": 2.4526884245127964e-05, "loss": 0.0108, "step": 11350 }, { "epoch": 1.5636613902271164, "grad_norm": 0.012046468444168568, "learning_rate": 2.4503404555059875e-05, "loss": 0.0005, "step": 11360 }, { "epoch": 1.5650378527185134, "grad_norm": 0.011568199843168259, "learning_rate": 2.4479924864991782e-05, "loss": 0.0005, "step": 11370 }, { "epoch": 1.5664143152099106, "grad_norm": 0.009781219996511936, "learning_rate": 2.4456445174923693e-05, "loss": 0.0289, "step": 11380 }, { "epoch": 1.5677907777013076, "grad_norm": 0.14281205832958221, "learning_rate": 2.4432965484855603e-05, "loss": 0.0051, "step": 11390 }, { "epoch": 1.5691672401927046, "grad_norm": 0.00868828222155571, "learning_rate": 2.440948579478751e-05, "loss": 0.0004, "step": 11400 }, { "epoch": 1.5705437026841018, "grad_norm": 0.009113933891057968, "learning_rate": 2.4386006104719418e-05, "loss": 0.0131, "step": 11410 }, { "epoch": 1.571920165175499, "grad_norm": 0.008033568039536476, "learning_rate": 2.4362526414651326e-05, "loss": 0.01, "step": 11420 }, { "epoch": 1.573296627666896, "grad_norm": 0.007041790056973696, "learning_rate": 2.4339046724583236e-05, "loss": 0.0004, "step": 11430 }, { "epoch": 1.574673090158293, "grad_norm": 0.00998032558709383, "learning_rate": 2.4315567034515147e-05, "loss": 0.0514, "step": 11440 }, { "epoch": 1.5760495526496903, "grad_norm": 0.23412452638149261, "learning_rate": 2.4292087344447054e-05, "loss": 0.006, "step": 11450 }, { "epoch": 1.5774260151410875, "grad_norm": 1.9679007530212402, "learning_rate": 2.4268607654378965e-05, "loss": 0.0937, "step": 11460 }, { "epoch": 1.5788024776324845, "grad_norm": 0.05044076219201088, "learning_rate": 2.4245127964310872e-05, "loss": 0.05, "step": 11470 }, { "epoch": 1.5801789401238815, "grad_norm": 0.04937722161412239, "learning_rate": 2.422164827424278e-05, "loss": 0.0041, "step": 11480 }, { "epoch": 1.5815554026152787, "grad_norm": 2.7819716930389404, "learning_rate": 2.419816858417469e-05, "loss": 0.0454, "step": 11490 }, { "epoch": 1.582931865106676, "grad_norm": 2.121037006378174, "learning_rate": 2.4174688894106598e-05, "loss": 0.0145, "step": 11500 }, { "epoch": 1.584308327598073, "grad_norm": 0.0400252491235733, "learning_rate": 2.415120920403851e-05, "loss": 0.0018, "step": 11510 }, { "epoch": 1.58568479008947, "grad_norm": 0.02398155815899372, "learning_rate": 2.412772951397042e-05, "loss": 0.0103, "step": 11520 }, { "epoch": 1.5870612525808672, "grad_norm": 0.0236700139939785, "learning_rate": 2.4104249823902327e-05, "loss": 0.0012, "step": 11530 }, { "epoch": 1.5884377150722644, "grad_norm": 0.019024787470698357, "learning_rate": 2.4080770133834234e-05, "loss": 0.0008, "step": 11540 }, { "epoch": 1.5898141775636614, "grad_norm": 0.14116360247135162, "learning_rate": 2.405729044376614e-05, "loss": 0.0032, "step": 11550 }, { "epoch": 1.5911906400550584, "grad_norm": 0.013822670094668865, "learning_rate": 2.4033810753698052e-05, "loss": 0.0423, "step": 11560 }, { "epoch": 1.5925671025464556, "grad_norm": 0.015932025387883186, "learning_rate": 2.4010331063629963e-05, "loss": 0.014, "step": 11570 }, { "epoch": 1.5939435650378528, "grad_norm": 0.013905310072004795, "learning_rate": 2.398685137356187e-05, "loss": 0.0026, "step": 11580 }, { "epoch": 1.5953200275292498, "grad_norm": 0.011655794456601143, "learning_rate": 2.3963371683493778e-05, "loss": 0.0006, "step": 11590 }, { "epoch": 1.5966964900206468, "grad_norm": 0.01029499713331461, "learning_rate": 2.3939891993425685e-05, "loss": 0.008, "step": 11600 }, { "epoch": 1.598072952512044, "grad_norm": 0.010671551339328289, "learning_rate": 2.3916412303357596e-05, "loss": 0.0029, "step": 11610 }, { "epoch": 1.5994494150034413, "grad_norm": 0.013266420923173428, "learning_rate": 2.3892932613289507e-05, "loss": 0.0135, "step": 11620 }, { "epoch": 1.6008258774948383, "grad_norm": 0.011431637220084667, "learning_rate": 2.3869452923221414e-05, "loss": 0.0475, "step": 11630 }, { "epoch": 1.6022023399862353, "grad_norm": 0.013550806790590286, "learning_rate": 2.3845973233153325e-05, "loss": 0.0017, "step": 11640 }, { "epoch": 1.6035788024776325, "grad_norm": 0.012052079662680626, "learning_rate": 2.3822493543085232e-05, "loss": 0.0017, "step": 11650 }, { "epoch": 1.6049552649690297, "grad_norm": 0.01136066671460867, "learning_rate": 2.379901385301714e-05, "loss": 0.0027, "step": 11660 }, { "epoch": 1.6063317274604267, "grad_norm": 0.009207936003804207, "learning_rate": 2.377553416294905e-05, "loss": 0.0292, "step": 11670 }, { "epoch": 1.6077081899518237, "grad_norm": 0.009458073414862156, "learning_rate": 2.3752054472880957e-05, "loss": 0.0025, "step": 11680 }, { "epoch": 1.609084652443221, "grad_norm": 0.01119165774434805, "learning_rate": 2.3728574782812868e-05, "loss": 0.0013, "step": 11690 }, { "epoch": 1.6104611149346182, "grad_norm": 0.009882903657853603, "learning_rate": 2.370509509274478e-05, "loss": 0.0046, "step": 11700 }, { "epoch": 1.6118375774260152, "grad_norm": 0.008105527609586716, "learning_rate": 2.3681615402676686e-05, "loss": 0.003, "step": 11710 }, { "epoch": 1.6132140399174122, "grad_norm": 0.41764068603515625, "learning_rate": 2.3658135712608594e-05, "loss": 0.0017, "step": 11720 }, { "epoch": 1.6145905024088094, "grad_norm": 0.008639617823064327, "learning_rate": 2.36346560225405e-05, "loss": 0.0003, "step": 11730 }, { "epoch": 1.6159669649002064, "grad_norm": 0.009647867642343044, "learning_rate": 2.3611176332472412e-05, "loss": 0.0016, "step": 11740 }, { "epoch": 1.6173434273916034, "grad_norm": 0.006339425686746836, "learning_rate": 2.3587696642404323e-05, "loss": 0.0005, "step": 11750 }, { "epoch": 1.6187198898830006, "grad_norm": 0.01337434258311987, "learning_rate": 2.356421695233623e-05, "loss": 0.1062, "step": 11760 }, { "epoch": 1.6200963523743979, "grad_norm": 0.01474896352738142, "learning_rate": 2.354073726226814e-05, "loss": 0.0017, "step": 11770 }, { "epoch": 1.6214728148657949, "grad_norm": 0.014938612468540668, "learning_rate": 2.3517257572200048e-05, "loss": 0.0007, "step": 11780 }, { "epoch": 1.6228492773571919, "grad_norm": 0.017360704019665718, "learning_rate": 2.3493777882131955e-05, "loss": 0.0006, "step": 11790 }, { "epoch": 1.624225739848589, "grad_norm": 0.014478340744972229, "learning_rate": 2.3470298192063866e-05, "loss": 0.0007, "step": 11800 }, { "epoch": 1.6256022023399863, "grad_norm": 0.012019380927085876, "learning_rate": 2.3446818501995773e-05, "loss": 0.0016, "step": 11810 }, { "epoch": 1.6269786648313833, "grad_norm": 0.013227684423327446, "learning_rate": 2.3423338811927684e-05, "loss": 0.0153, "step": 11820 }, { "epoch": 1.6283551273227803, "grad_norm": 0.012500644661486149, "learning_rate": 2.3399859121859595e-05, "loss": 0.0004, "step": 11830 }, { "epoch": 1.6297315898141775, "grad_norm": 0.012191076762974262, "learning_rate": 2.3376379431791502e-05, "loss": 0.0004, "step": 11840 }, { "epoch": 1.6311080523055748, "grad_norm": 0.013842448592185974, "learning_rate": 2.335289974172341e-05, "loss": 0.049, "step": 11850 }, { "epoch": 1.6324845147969718, "grad_norm": 0.014162111096084118, "learning_rate": 2.3329420051655317e-05, "loss": 0.045, "step": 11860 }, { "epoch": 1.6338609772883688, "grad_norm": 0.020589498803019524, "learning_rate": 2.3305940361587228e-05, "loss": 0.0018, "step": 11870 }, { "epoch": 1.635237439779766, "grad_norm": 0.04854802414774895, "learning_rate": 2.328246067151914e-05, "loss": 0.0501, "step": 11880 }, { "epoch": 1.6366139022711632, "grad_norm": 0.03464007005095482, "learning_rate": 2.3258980981451046e-05, "loss": 0.0013, "step": 11890 }, { "epoch": 1.6379903647625602, "grad_norm": 0.026949292048811913, "learning_rate": 2.3235501291382957e-05, "loss": 0.0011, "step": 11900 }, { "epoch": 1.6393668272539572, "grad_norm": 0.028828062117099762, "learning_rate": 2.3212021601314864e-05, "loss": 0.0682, "step": 11910 }, { "epoch": 1.6407432897453544, "grad_norm": 0.04125591367483139, "learning_rate": 2.318854191124677e-05, "loss": 0.0443, "step": 11920 }, { "epoch": 1.6421197522367517, "grad_norm": 0.048131730407476425, "learning_rate": 2.3165062221178682e-05, "loss": 0.0056, "step": 11930 }, { "epoch": 1.6434962147281487, "grad_norm": 0.03268555924296379, "learning_rate": 2.314158253111059e-05, "loss": 0.0016, "step": 11940 }, { "epoch": 1.6448726772195457, "grad_norm": 0.028451042249798775, "learning_rate": 2.31181028410425e-05, "loss": 0.0134, "step": 11950 }, { "epoch": 1.6462491397109429, "grad_norm": 0.0340987928211689, "learning_rate": 2.309462315097441e-05, "loss": 0.0424, "step": 11960 }, { "epoch": 1.6476256022023401, "grad_norm": 0.03255735710263252, "learning_rate": 2.3071143460906315e-05, "loss": 0.0013, "step": 11970 }, { "epoch": 1.6490020646937371, "grad_norm": 0.02954251691699028, "learning_rate": 2.3047663770838226e-05, "loss": 0.0136, "step": 11980 }, { "epoch": 1.6503785271851341, "grad_norm": 0.021474245935678482, "learning_rate": 2.3024184080770133e-05, "loss": 0.0009, "step": 11990 }, { "epoch": 1.6517549896765313, "grad_norm": 0.02148095890879631, "learning_rate": 2.3000704390702044e-05, "loss": 0.0096, "step": 12000 }, { "epoch": 1.6531314521679286, "grad_norm": 0.04241275042295456, "learning_rate": 2.2977224700633954e-05, "loss": 0.0455, "step": 12010 }, { "epoch": 1.6545079146593256, "grad_norm": 0.013634774833917618, "learning_rate": 2.2953745010565862e-05, "loss": 0.0008, "step": 12020 }, { "epoch": 1.6558843771507226, "grad_norm": 0.012582059018313885, "learning_rate": 2.293026532049777e-05, "loss": 0.0381, "step": 12030 }, { "epoch": 1.6572608396421198, "grad_norm": 0.014319190755486488, "learning_rate": 2.290678563042968e-05, "loss": 0.0087, "step": 12040 }, { "epoch": 1.658637302133517, "grad_norm": 0.01379112433642149, "learning_rate": 2.2883305940361587e-05, "loss": 0.051, "step": 12050 }, { "epoch": 1.660013764624914, "grad_norm": 0.018615897744894028, "learning_rate": 2.2859826250293498e-05, "loss": 0.0289, "step": 12060 }, { "epoch": 1.661390227116311, "grad_norm": 0.019208010286092758, "learning_rate": 2.2836346560225405e-05, "loss": 0.0008, "step": 12070 }, { "epoch": 1.6627666896077082, "grad_norm": 0.015350312925875187, "learning_rate": 2.2812866870157316e-05, "loss": 0.0022, "step": 12080 }, { "epoch": 1.6641431520991055, "grad_norm": 0.01880769617855549, "learning_rate": 2.2789387180089223e-05, "loss": 0.0007, "step": 12090 }, { "epoch": 1.6655196145905025, "grad_norm": 0.48616766929626465, "learning_rate": 2.276590749002113e-05, "loss": 0.0047, "step": 12100 }, { "epoch": 1.6668960770818995, "grad_norm": 0.01342343632131815, "learning_rate": 2.274242779995304e-05, "loss": 0.0079, "step": 12110 }, { "epoch": 1.6682725395732967, "grad_norm": 0.013538386672735214, "learning_rate": 2.271894810988495e-05, "loss": 0.0464, "step": 12120 }, { "epoch": 1.6696490020646937, "grad_norm": 0.01821916550397873, "learning_rate": 2.269546841981686e-05, "loss": 0.0047, "step": 12130 }, { "epoch": 1.6710254645560907, "grad_norm": 0.017319485545158386, "learning_rate": 2.267198872974877e-05, "loss": 0.0008, "step": 12140 }, { "epoch": 1.672401927047488, "grad_norm": 0.014176364988088608, "learning_rate": 2.2648509039680678e-05, "loss": 0.0008, "step": 12150 }, { "epoch": 1.6737783895388851, "grad_norm": 0.020567666739225388, "learning_rate": 2.2625029349612585e-05, "loss": 0.0623, "step": 12160 }, { "epoch": 1.6751548520302821, "grad_norm": 0.021809574216604233, "learning_rate": 2.2601549659544496e-05, "loss": 0.0208, "step": 12170 }, { "epoch": 1.6765313145216791, "grad_norm": 0.024950852617621422, "learning_rate": 2.2578069969476403e-05, "loss": 0.0471, "step": 12180 }, { "epoch": 1.6779077770130764, "grad_norm": 0.02233227528631687, "learning_rate": 2.2554590279408314e-05, "loss": 0.0025, "step": 12190 }, { "epoch": 1.6792842395044736, "grad_norm": 0.025386884808540344, "learning_rate": 2.253111058934022e-05, "loss": 0.0015, "step": 12200 }, { "epoch": 1.6806607019958706, "grad_norm": 0.029864227399230003, "learning_rate": 2.2507630899272132e-05, "loss": 0.0896, "step": 12210 }, { "epoch": 1.6820371644872676, "grad_norm": 0.042778149247169495, "learning_rate": 2.248415120920404e-05, "loss": 0.0019, "step": 12220 }, { "epoch": 1.6834136269786648, "grad_norm": 0.04294196888804436, "learning_rate": 2.2460671519135947e-05, "loss": 0.0018, "step": 12230 }, { "epoch": 1.684790089470062, "grad_norm": 0.030220238491892815, "learning_rate": 2.2437191829067857e-05, "loss": 0.0209, "step": 12240 }, { "epoch": 1.686166551961459, "grad_norm": 0.02478274703025818, "learning_rate": 2.2413712138999765e-05, "loss": 0.0018, "step": 12250 }, { "epoch": 1.687543014452856, "grad_norm": 0.02032957598567009, "learning_rate": 2.2390232448931676e-05, "loss": 0.0017, "step": 12260 }, { "epoch": 1.6889194769442533, "grad_norm": 0.018265675753355026, "learning_rate": 2.2366752758863586e-05, "loss": 0.0029, "step": 12270 }, { "epoch": 1.6902959394356505, "grad_norm": 0.0171518437564373, "learning_rate": 2.2343273068795494e-05, "loss": 0.0007, "step": 12280 }, { "epoch": 1.6916724019270475, "grad_norm": 0.16263538599014282, "learning_rate": 2.23197933787274e-05, "loss": 0.0018, "step": 12290 }, { "epoch": 1.6930488644184445, "grad_norm": 0.01255828607827425, "learning_rate": 2.2296313688659312e-05, "loss": 0.001, "step": 12300 }, { "epoch": 1.6944253269098417, "grad_norm": 0.07742094248533249, "learning_rate": 2.227283399859122e-05, "loss": 0.0476, "step": 12310 }, { "epoch": 1.695801789401239, "grad_norm": 0.019308097660541534, "learning_rate": 2.224935430852313e-05, "loss": 0.0015, "step": 12320 }, { "epoch": 1.697178251892636, "grad_norm": 0.06314108520746231, "learning_rate": 2.2225874618455037e-05, "loss": 0.0022, "step": 12330 }, { "epoch": 1.698554714384033, "grad_norm": 0.01384037733078003, "learning_rate": 2.2202394928386948e-05, "loss": 0.0049, "step": 12340 }, { "epoch": 1.6999311768754302, "grad_norm": 0.012380830012261868, "learning_rate": 2.2178915238318855e-05, "loss": 0.0402, "step": 12350 }, { "epoch": 1.7013076393668274, "grad_norm": 0.014076135121285915, "learning_rate": 2.2155435548250763e-05, "loss": 0.0005, "step": 12360 }, { "epoch": 1.7026841018582244, "grad_norm": 0.020070498809218407, "learning_rate": 2.2131955858182673e-05, "loss": 0.0501, "step": 12370 }, { "epoch": 1.7040605643496214, "grad_norm": 0.02946837991476059, "learning_rate": 2.210847616811458e-05, "loss": 0.0445, "step": 12380 }, { "epoch": 1.7054370268410186, "grad_norm": 0.1673538088798523, "learning_rate": 2.208499647804649e-05, "loss": 0.0024, "step": 12390 }, { "epoch": 1.7068134893324158, "grad_norm": 0.020942257717251778, "learning_rate": 2.2061516787978402e-05, "loss": 0.0022, "step": 12400 }, { "epoch": 1.7081899518238128, "grad_norm": 0.02922196500003338, "learning_rate": 2.2038037097910306e-05, "loss": 0.0438, "step": 12410 }, { "epoch": 1.7095664143152098, "grad_norm": 0.033540818840265274, "learning_rate": 2.2014557407842217e-05, "loss": 0.0012, "step": 12420 }, { "epoch": 1.710942876806607, "grad_norm": 0.019946373999118805, "learning_rate": 2.1991077717774124e-05, "loss": 0.0377, "step": 12430 }, { "epoch": 1.7123193392980043, "grad_norm": 0.01750066876411438, "learning_rate": 2.1967598027706035e-05, "loss": 0.002, "step": 12440 }, { "epoch": 1.7136958017894013, "grad_norm": 0.01977379433810711, "learning_rate": 2.1944118337637946e-05, "loss": 0.0021, "step": 12450 }, { "epoch": 1.7150722642807983, "grad_norm": 0.01949247345328331, "learning_rate": 2.1920638647569853e-05, "loss": 0.0006, "step": 12460 }, { "epoch": 1.7164487267721955, "grad_norm": 0.014204725623130798, "learning_rate": 2.189715895750176e-05, "loss": 0.001, "step": 12470 }, { "epoch": 1.7178251892635927, "grad_norm": 0.01158700231462717, "learning_rate": 2.187367926743367e-05, "loss": 0.0008, "step": 12480 }, { "epoch": 1.7192016517549895, "grad_norm": 0.013839386403560638, "learning_rate": 2.185019957736558e-05, "loss": 0.002, "step": 12490 }, { "epoch": 1.7205781142463867, "grad_norm": 0.09548407047986984, "learning_rate": 2.182671988729749e-05, "loss": 0.0021, "step": 12500 }, { "epoch": 1.721954576737784, "grad_norm": 0.01072004996240139, "learning_rate": 2.1803240197229397e-05, "loss": 0.0061, "step": 12510 }, { "epoch": 1.723331039229181, "grad_norm": 0.009345059283077717, "learning_rate": 2.1779760507161308e-05, "loss": 0.0004, "step": 12520 }, { "epoch": 1.724707501720578, "grad_norm": 0.009427688084542751, "learning_rate": 2.1756280817093215e-05, "loss": 0.0061, "step": 12530 }, { "epoch": 1.7260839642119752, "grad_norm": 0.009184526279568672, "learning_rate": 2.1732801127025122e-05, "loss": 0.0008, "step": 12540 }, { "epoch": 1.7274604267033724, "grad_norm": 0.008315183222293854, "learning_rate": 2.1709321436957033e-05, "loss": 0.0006, "step": 12550 }, { "epoch": 1.7288368891947694, "grad_norm": 0.007684558164328337, "learning_rate": 2.168584174688894e-05, "loss": 0.0152, "step": 12560 }, { "epoch": 1.7302133516861664, "grad_norm": 0.008124252781271935, "learning_rate": 2.166236205682085e-05, "loss": 0.0003, "step": 12570 }, { "epoch": 1.7315898141775636, "grad_norm": 2.5148093700408936, "learning_rate": 2.1638882366752762e-05, "loss": 0.0524, "step": 12580 }, { "epoch": 1.7329662766689609, "grad_norm": 0.019515881314873695, "learning_rate": 2.161540267668467e-05, "loss": 0.001, "step": 12590 }, { "epoch": 1.7343427391603579, "grad_norm": 0.04805806279182434, "learning_rate": 2.1591922986616577e-05, "loss": 0.0062, "step": 12600 }, { "epoch": 1.7357192016517549, "grad_norm": 0.015712911263108253, "learning_rate": 2.1568443296548487e-05, "loss": 0.0057, "step": 12610 }, { "epoch": 1.737095664143152, "grad_norm": 0.00672736344859004, "learning_rate": 2.1544963606480395e-05, "loss": 0.0003, "step": 12620 }, { "epoch": 1.7384721266345493, "grad_norm": 0.02248559519648552, "learning_rate": 2.1521483916412305e-05, "loss": 0.0008, "step": 12630 }, { "epoch": 1.7398485891259463, "grad_norm": 0.006910775788128376, "learning_rate": 2.1498004226344213e-05, "loss": 0.014, "step": 12640 }, { "epoch": 1.7412250516173433, "grad_norm": 9.395773887634277, "learning_rate": 2.1474524536276123e-05, "loss": 0.07, "step": 12650 }, { "epoch": 1.7426015141087405, "grad_norm": 0.08001919090747833, "learning_rate": 2.145104484620803e-05, "loss": 0.0021, "step": 12660 }, { "epoch": 1.7439779766001378, "grad_norm": 0.009299058467149734, "learning_rate": 2.1427565156139938e-05, "loss": 0.0006, "step": 12670 }, { "epoch": 1.7453544390915348, "grad_norm": 0.014485635794699192, "learning_rate": 2.140408546607185e-05, "loss": 0.0005, "step": 12680 }, { "epoch": 1.7467309015829318, "grad_norm": 0.0073399413377046585, "learning_rate": 2.1380605776003756e-05, "loss": 0.0004, "step": 12690 }, { "epoch": 1.748107364074329, "grad_norm": 0.011932270601391792, "learning_rate": 2.1357126085935667e-05, "loss": 0.0007, "step": 12700 }, { "epoch": 1.7494838265657262, "grad_norm": 0.007301196455955505, "learning_rate": 2.1333646395867578e-05, "loss": 0.0322, "step": 12710 }, { "epoch": 1.7508602890571232, "grad_norm": 0.006611840799450874, "learning_rate": 2.1310166705799485e-05, "loss": 0.0006, "step": 12720 }, { "epoch": 1.7522367515485202, "grad_norm": 0.023443792015314102, "learning_rate": 2.1286687015731392e-05, "loss": 0.0003, "step": 12730 }, { "epoch": 1.7536132140399174, "grad_norm": 0.008864818140864372, "learning_rate": 2.1263207325663303e-05, "loss": 0.0006, "step": 12740 }, { "epoch": 1.7549896765313147, "grad_norm": 0.017270183190703392, "learning_rate": 2.123972763559521e-05, "loss": 0.037, "step": 12750 }, { "epoch": 1.7563661390227117, "grad_norm": 4.378342151641846, "learning_rate": 2.121624794552712e-05, "loss": 0.0758, "step": 12760 }, { "epoch": 1.7577426015141087, "grad_norm": 0.01663787290453911, "learning_rate": 2.119276825545903e-05, "loss": 0.0018, "step": 12770 }, { "epoch": 1.7591190640055059, "grad_norm": 0.009519710205495358, "learning_rate": 2.116928856539094e-05, "loss": 0.0051, "step": 12780 }, { "epoch": 1.760495526496903, "grad_norm": 0.02645360864698887, "learning_rate": 2.1145808875322847e-05, "loss": 0.0205, "step": 12790 }, { "epoch": 1.7618719889883, "grad_norm": 0.04989407956600189, "learning_rate": 2.1122329185254754e-05, "loss": 0.0004, "step": 12800 }, { "epoch": 1.763248451479697, "grad_norm": 0.0072103627026081085, "learning_rate": 2.1098849495186665e-05, "loss": 0.0005, "step": 12810 }, { "epoch": 1.7646249139710943, "grad_norm": 0.011418350972235203, "learning_rate": 2.1075369805118572e-05, "loss": 0.0515, "step": 12820 }, { "epoch": 1.7660013764624916, "grad_norm": 0.20928557217121124, "learning_rate": 2.1051890115050483e-05, "loss": 0.0019, "step": 12830 }, { "epoch": 1.7673778389538886, "grad_norm": 0.0906970202922821, "learning_rate": 2.1028410424982394e-05, "loss": 0.0461, "step": 12840 }, { "epoch": 1.7687543014452856, "grad_norm": 1.0802427530288696, "learning_rate": 2.1004930734914298e-05, "loss": 0.0147, "step": 12850 }, { "epoch": 1.7701307639366828, "grad_norm": 0.3978409767150879, "learning_rate": 2.098145104484621e-05, "loss": 0.0051, "step": 12860 }, { "epoch": 1.7715072264280798, "grad_norm": 0.10376014560461044, "learning_rate": 2.095797135477812e-05, "loss": 0.0059, "step": 12870 }, { "epoch": 1.7728836889194768, "grad_norm": 0.017478251829743385, "learning_rate": 2.0934491664710027e-05, "loss": 0.0566, "step": 12880 }, { "epoch": 1.774260151410874, "grad_norm": 0.015911513939499855, "learning_rate": 2.0911011974641937e-05, "loss": 0.0077, "step": 12890 }, { "epoch": 1.7756366139022712, "grad_norm": 0.02560059167444706, "learning_rate": 2.0887532284573845e-05, "loss": 0.0514, "step": 12900 }, { "epoch": 1.7770130763936682, "grad_norm": 0.030713137239217758, "learning_rate": 2.0864052594505752e-05, "loss": 0.0012, "step": 12910 }, { "epoch": 1.7783895388850652, "grad_norm": 1.8284029960632324, "learning_rate": 2.0840572904437663e-05, "loss": 0.0788, "step": 12920 }, { "epoch": 1.7797660013764625, "grad_norm": 0.05922043323516846, "learning_rate": 2.081709321436957e-05, "loss": 0.0053, "step": 12930 }, { "epoch": 1.7811424638678597, "grad_norm": 0.04786548390984535, "learning_rate": 2.079361352430148e-05, "loss": 0.0481, "step": 12940 }, { "epoch": 1.7825189263592567, "grad_norm": 0.04129546880722046, "learning_rate": 2.0770133834233388e-05, "loss": 0.0096, "step": 12950 }, { "epoch": 1.7838953888506537, "grad_norm": 0.03781941533088684, "learning_rate": 2.07466541441653e-05, "loss": 0.03, "step": 12960 }, { "epoch": 1.785271851342051, "grad_norm": 0.03458619490265846, "learning_rate": 2.0723174454097206e-05, "loss": 0.0437, "step": 12970 }, { "epoch": 1.7866483138334481, "grad_norm": 0.03710738196969032, "learning_rate": 2.0699694764029114e-05, "loss": 0.0171, "step": 12980 }, { "epoch": 1.7880247763248451, "grad_norm": 0.03228791058063507, "learning_rate": 2.0676215073961024e-05, "loss": 0.0013, "step": 12990 }, { "epoch": 1.7894012388162421, "grad_norm": 0.018600914627313614, "learning_rate": 2.0652735383892935e-05, "loss": 0.001, "step": 13000 }, { "epoch": 1.7907777013076394, "grad_norm": 0.02467518486082554, "learning_rate": 2.0629255693824842e-05, "loss": 0.0845, "step": 13010 }, { "epoch": 1.7921541637990366, "grad_norm": 0.03502516821026802, "learning_rate": 2.0605776003756753e-05, "loss": 0.0013, "step": 13020 }, { "epoch": 1.7935306262904336, "grad_norm": 0.0689443051815033, "learning_rate": 2.058229631368866e-05, "loss": 0.0126, "step": 13030 }, { "epoch": 1.7949070887818306, "grad_norm": 0.020423468202352524, "learning_rate": 2.0558816623620568e-05, "loss": 0.0202, "step": 13040 }, { "epoch": 1.7962835512732278, "grad_norm": 0.016915827989578247, "learning_rate": 2.053533693355248e-05, "loss": 0.0044, "step": 13050 }, { "epoch": 1.797660013764625, "grad_norm": 0.014452680014073849, "learning_rate": 2.0511857243484386e-05, "loss": 0.003, "step": 13060 }, { "epoch": 1.799036476256022, "grad_norm": 0.015959396958351135, "learning_rate": 2.0488377553416297e-05, "loss": 0.0044, "step": 13070 }, { "epoch": 1.800412938747419, "grad_norm": 0.010908365249633789, "learning_rate": 2.0464897863348204e-05, "loss": 0.0004, "step": 13080 }, { "epoch": 1.8017894012388163, "grad_norm": 0.010948669165372849, "learning_rate": 2.0441418173280115e-05, "loss": 0.0348, "step": 13090 }, { "epoch": 1.8031658637302135, "grad_norm": 0.01744878478348255, "learning_rate": 2.0417938483212022e-05, "loss": 0.0261, "step": 13100 }, { "epoch": 1.8045423262216105, "grad_norm": 0.09164771437644958, "learning_rate": 2.039445879314393e-05, "loss": 0.0021, "step": 13110 }, { "epoch": 1.8059187887130075, "grad_norm": 0.01617882214486599, "learning_rate": 2.037097910307584e-05, "loss": 0.0022, "step": 13120 }, { "epoch": 1.8072952512044047, "grad_norm": 0.008918128907680511, "learning_rate": 2.034749941300775e-05, "loss": 0.0296, "step": 13130 }, { "epoch": 1.808671713695802, "grad_norm": 0.011586562730371952, "learning_rate": 2.032401972293966e-05, "loss": 0.0004, "step": 13140 }, { "epoch": 1.810048176187199, "grad_norm": 0.010208925232291222, "learning_rate": 2.030054003287157e-05, "loss": 0.0004, "step": 13150 }, { "epoch": 1.811424638678596, "grad_norm": 0.008525541983544827, "learning_rate": 2.0277060342803477e-05, "loss": 0.0014, "step": 13160 }, { "epoch": 1.8128011011699932, "grad_norm": 2.0569634437561035, "learning_rate": 2.0253580652735384e-05, "loss": 0.103, "step": 13170 }, { "epoch": 1.8141775636613904, "grad_norm": 0.01340760849416256, "learning_rate": 2.0230100962667295e-05, "loss": 0.0093, "step": 13180 }, { "epoch": 1.8155540261527874, "grad_norm": 0.011951197870075703, "learning_rate": 2.0206621272599202e-05, "loss": 0.0005, "step": 13190 }, { "epoch": 1.8169304886441844, "grad_norm": 0.009214317426085472, "learning_rate": 2.0183141582531113e-05, "loss": 0.0006, "step": 13200 }, { "epoch": 1.8183069511355816, "grad_norm": 12.915182113647461, "learning_rate": 2.015966189246302e-05, "loss": 0.0374, "step": 13210 }, { "epoch": 1.8196834136269788, "grad_norm": 2.015533685684204, "learning_rate": 2.013618220239493e-05, "loss": 0.0412, "step": 13220 }, { "epoch": 1.8210598761183756, "grad_norm": 0.1951271891593933, "learning_rate": 2.0112702512326838e-05, "loss": 0.0012, "step": 13230 }, { "epoch": 1.8224363386097728, "grad_norm": 0.061943795531988144, "learning_rate": 2.0089222822258746e-05, "loss": 0.0732, "step": 13240 }, { "epoch": 1.82381280110117, "grad_norm": 0.045225612819194794, "learning_rate": 2.0065743132190656e-05, "loss": 0.0029, "step": 13250 }, { "epoch": 1.825189263592567, "grad_norm": 0.016883326694369316, "learning_rate": 2.0042263442122564e-05, "loss": 0.0262, "step": 13260 }, { "epoch": 1.826565726083964, "grad_norm": 0.009702551178634167, "learning_rate": 2.0018783752054474e-05, "loss": 0.0188, "step": 13270 }, { "epoch": 1.8279421885753613, "grad_norm": 1.955073356628418, "learning_rate": 1.9995304061986385e-05, "loss": 0.046, "step": 13280 }, { "epoch": 1.8293186510667585, "grad_norm": 2.7702863216400146, "learning_rate": 1.997182437191829e-05, "loss": 0.0491, "step": 13290 }, { "epoch": 1.8306951135581555, "grad_norm": 0.028814565390348434, "learning_rate": 1.99483446818502e-05, "loss": 0.0035, "step": 13300 }, { "epoch": 1.8320715760495525, "grad_norm": 0.02669665403664112, "learning_rate": 1.992486499178211e-05, "loss": 0.0092, "step": 13310 }, { "epoch": 1.8334480385409497, "grad_norm": 0.06526514887809753, "learning_rate": 1.9901385301714018e-05, "loss": 0.0428, "step": 13320 }, { "epoch": 1.834824501032347, "grad_norm": 0.03472983464598656, "learning_rate": 1.987790561164593e-05, "loss": 0.0305, "step": 13330 }, { "epoch": 1.836200963523744, "grad_norm": 0.10420204699039459, "learning_rate": 1.9854425921577836e-05, "loss": 0.0255, "step": 13340 }, { "epoch": 1.837577426015141, "grad_norm": 0.04072057083249092, "learning_rate": 1.9830946231509743e-05, "loss": 0.0031, "step": 13350 }, { "epoch": 1.8389538885065382, "grad_norm": 0.09574218094348907, "learning_rate": 1.9807466541441654e-05, "loss": 0.0176, "step": 13360 }, { "epoch": 1.8403303509979354, "grad_norm": 0.01084713451564312, "learning_rate": 1.978398685137356e-05, "loss": 0.067, "step": 13370 }, { "epoch": 1.8417068134893324, "grad_norm": 0.013614550232887268, "learning_rate": 1.9760507161305472e-05, "loss": 0.0015, "step": 13380 }, { "epoch": 1.8430832759807294, "grad_norm": 0.014630693942308426, "learning_rate": 1.973702747123738e-05, "loss": 0.0426, "step": 13390 }, { "epoch": 1.8444597384721266, "grad_norm": 0.020147480070590973, "learning_rate": 1.971354778116929e-05, "loss": 0.0559, "step": 13400 }, { "epoch": 1.8458362009635239, "grad_norm": 0.157150000333786, "learning_rate": 1.9690068091101198e-05, "loss": 0.0052, "step": 13410 }, { "epoch": 1.8472126634549209, "grad_norm": 0.017643120139837265, "learning_rate": 1.9666588401033105e-05, "loss": 0.0032, "step": 13420 }, { "epoch": 1.8485891259463179, "grad_norm": 0.016933830454945564, "learning_rate": 1.9643108710965016e-05, "loss": 0.0022, "step": 13430 }, { "epoch": 1.849965588437715, "grad_norm": 0.02108391560614109, "learning_rate": 1.9619629020896927e-05, "loss": 0.0653, "step": 13440 }, { "epoch": 1.8513420509291123, "grad_norm": 0.035726070404052734, "learning_rate": 1.9596149330828834e-05, "loss": 0.001, "step": 13450 }, { "epoch": 1.8527185134205093, "grad_norm": 0.1129087507724762, "learning_rate": 1.9572669640760745e-05, "loss": 0.0019, "step": 13460 }, { "epoch": 1.8540949759119063, "grad_norm": 0.02454477921128273, "learning_rate": 1.9549189950692652e-05, "loss": 0.0493, "step": 13470 }, { "epoch": 1.8554714384033035, "grad_norm": 0.03192034736275673, "learning_rate": 1.952571026062456e-05, "loss": 0.0423, "step": 13480 }, { "epoch": 1.8568479008947008, "grad_norm": 0.054306525737047195, "learning_rate": 1.950223057055647e-05, "loss": 0.0118, "step": 13490 }, { "epoch": 1.8582243633860978, "grad_norm": 0.035901427268981934, "learning_rate": 1.9478750880488377e-05, "loss": 0.0081, "step": 13500 }, { "epoch": 1.8596008258774948, "grad_norm": 0.02472263015806675, "learning_rate": 1.9455271190420288e-05, "loss": 0.0036, "step": 13510 }, { "epoch": 1.860977288368892, "grad_norm": 0.018902182579040527, "learning_rate": 1.9431791500352196e-05, "loss": 0.001, "step": 13520 }, { "epoch": 1.8623537508602892, "grad_norm": 0.9209521412849426, "learning_rate": 1.9408311810284106e-05, "loss": 0.0057, "step": 13530 }, { "epoch": 1.8637302133516862, "grad_norm": 0.012889071367681026, "learning_rate": 1.9384832120216014e-05, "loss": 0.0009, "step": 13540 }, { "epoch": 1.8651066758430832, "grad_norm": 0.012852941639721394, "learning_rate": 1.936135243014792e-05, "loss": 0.0509, "step": 13550 }, { "epoch": 1.8664831383344804, "grad_norm": 0.023172274231910706, "learning_rate": 1.9337872740079832e-05, "loss": 0.0027, "step": 13560 }, { "epoch": 1.8678596008258777, "grad_norm": 0.7110316753387451, "learning_rate": 1.9314393050011743e-05, "loss": 0.0342, "step": 13570 }, { "epoch": 1.8692360633172747, "grad_norm": 0.0161312073469162, "learning_rate": 1.929091335994365e-05, "loss": 0.0024, "step": 13580 }, { "epoch": 1.8706125258086717, "grad_norm": 0.03316260874271393, "learning_rate": 1.926743366987556e-05, "loss": 0.063, "step": 13590 }, { "epoch": 1.8719889883000689, "grad_norm": 0.1657414734363556, "learning_rate": 1.9243953979807468e-05, "loss": 0.0028, "step": 13600 }, { "epoch": 1.8733654507914659, "grad_norm": 0.027636444196105003, "learning_rate": 1.9220474289739375e-05, "loss": 0.0014, "step": 13610 }, { "epoch": 1.8747419132828629, "grad_norm": 0.01915459707379341, "learning_rate": 1.9196994599671286e-05, "loss": 0.0405, "step": 13620 }, { "epoch": 1.87611837577426, "grad_norm": 0.02367568202316761, "learning_rate": 1.9173514909603193e-05, "loss": 0.0009, "step": 13630 }, { "epoch": 1.8774948382656573, "grad_norm": 0.0176922045648098, "learning_rate": 1.9150035219535104e-05, "loss": 0.0009, "step": 13640 }, { "epoch": 1.8788713007570543, "grad_norm": 0.018141701817512512, "learning_rate": 1.912655552946701e-05, "loss": 0.0019, "step": 13650 }, { "epoch": 1.8802477632484513, "grad_norm": 0.012576526030898094, "learning_rate": 1.9103075839398922e-05, "loss": 0.0006, "step": 13660 }, { "epoch": 1.8816242257398486, "grad_norm": 0.015321780927479267, "learning_rate": 1.907959614933083e-05, "loss": 0.0005, "step": 13670 }, { "epoch": 1.8830006882312458, "grad_norm": 0.010657928884029388, "learning_rate": 1.9056116459262737e-05, "loss": 0.0517, "step": 13680 }, { "epoch": 1.8843771507226428, "grad_norm": 4.510225296020508, "learning_rate": 1.9032636769194648e-05, "loss": 0.0096, "step": 13690 }, { "epoch": 1.8857536132140398, "grad_norm": 0.016853289678692818, "learning_rate": 1.900915707912656e-05, "loss": 0.0007, "step": 13700 }, { "epoch": 1.887130075705437, "grad_norm": 0.016955044120550156, "learning_rate": 1.8985677389058466e-05, "loss": 0.0179, "step": 13710 }, { "epoch": 1.8885065381968342, "grad_norm": 0.012910791672766209, "learning_rate": 1.8962197698990377e-05, "loss": 0.0147, "step": 13720 }, { "epoch": 1.8898830006882312, "grad_norm": 0.01392082218080759, "learning_rate": 1.893871800892228e-05, "loss": 0.0139, "step": 13730 }, { "epoch": 1.8912594631796282, "grad_norm": 0.012979269959032536, "learning_rate": 1.891523831885419e-05, "loss": 0.0377, "step": 13740 }, { "epoch": 1.8926359256710255, "grad_norm": 0.011527977883815765, "learning_rate": 1.8891758628786102e-05, "loss": 0.003, "step": 13750 }, { "epoch": 1.8940123881624227, "grad_norm": 0.013026701286435127, "learning_rate": 1.886827893871801e-05, "loss": 0.0081, "step": 13760 }, { "epoch": 1.8953888506538197, "grad_norm": 0.012695708312094212, "learning_rate": 1.884479924864992e-05, "loss": 0.046, "step": 13770 }, { "epoch": 1.8967653131452167, "grad_norm": 0.013390621170401573, "learning_rate": 1.8821319558581827e-05, "loss": 0.0017, "step": 13780 }, { "epoch": 1.898141775636614, "grad_norm": 0.01494542509317398, "learning_rate": 1.8797839868513735e-05, "loss": 0.03, "step": 13790 }, { "epoch": 1.8995182381280111, "grad_norm": 0.011178439483046532, "learning_rate": 1.8774360178445646e-05, "loss": 0.0006, "step": 13800 }, { "epoch": 1.9008947006194081, "grad_norm": 0.010817430913448334, "learning_rate": 1.8750880488377553e-05, "loss": 0.0092, "step": 13810 }, { "epoch": 1.9022711631108051, "grad_norm": 0.012096919119358063, "learning_rate": 1.8727400798309464e-05, "loss": 0.0017, "step": 13820 }, { "epoch": 1.9036476256022024, "grad_norm": 0.026760948821902275, "learning_rate": 1.8703921108241374e-05, "loss": 0.0485, "step": 13830 }, { "epoch": 1.9050240880935996, "grad_norm": 0.07884696125984192, "learning_rate": 1.8680441418173282e-05, "loss": 0.0013, "step": 13840 }, { "epoch": 1.9064005505849966, "grad_norm": 0.019958876073360443, "learning_rate": 1.865696172810519e-05, "loss": 0.0131, "step": 13850 }, { "epoch": 1.9077770130763936, "grad_norm": 0.02350415475666523, "learning_rate": 1.8633482038037096e-05, "loss": 0.0028, "step": 13860 }, { "epoch": 1.9091534755677908, "grad_norm": 0.017545750364661217, "learning_rate": 1.8610002347969007e-05, "loss": 0.033, "step": 13870 }, { "epoch": 1.910529938059188, "grad_norm": 0.031122436746954918, "learning_rate": 1.8586522657900918e-05, "loss": 0.0505, "step": 13880 }, { "epoch": 1.911906400550585, "grad_norm": 0.028246723115444183, "learning_rate": 1.8563042967832825e-05, "loss": 0.0027, "step": 13890 }, { "epoch": 1.913282863041982, "grad_norm": 0.016807598993182182, "learning_rate": 1.8539563277764736e-05, "loss": 0.001, "step": 13900 }, { "epoch": 1.9146593255333793, "grad_norm": 34.673095703125, "learning_rate": 1.8516083587696643e-05, "loss": 0.015, "step": 13910 }, { "epoch": 1.9160357880247765, "grad_norm": 0.01144159585237503, "learning_rate": 1.849260389762855e-05, "loss": 0.0008, "step": 13920 }, { "epoch": 1.9174122505161735, "grad_norm": 0.0117637375369668, "learning_rate": 1.846912420756046e-05, "loss": 0.01, "step": 13930 }, { "epoch": 1.9187887130075705, "grad_norm": 0.010061141103506088, "learning_rate": 1.844564451749237e-05, "loss": 0.0006, "step": 13940 }, { "epoch": 1.9201651754989677, "grad_norm": 0.010193397291004658, "learning_rate": 1.842216482742428e-05, "loss": 0.0005, "step": 13950 }, { "epoch": 1.921541637990365, "grad_norm": 0.009352926164865494, "learning_rate": 1.839868513735619e-05, "loss": 0.0004, "step": 13960 }, { "epoch": 1.922918100481762, "grad_norm": 0.009108150377869606, "learning_rate": 1.8375205447288098e-05, "loss": 0.0008, "step": 13970 }, { "epoch": 1.924294562973159, "grad_norm": 0.007733062840998173, "learning_rate": 1.8351725757220005e-05, "loss": 0.0013, "step": 13980 }, { "epoch": 1.9256710254645562, "grad_norm": 0.05485077574849129, "learning_rate": 1.8328246067151912e-05, "loss": 0.0004, "step": 13990 }, { "epoch": 1.9270474879559532, "grad_norm": 0.008947255089879036, "learning_rate": 1.8304766377083823e-05, "loss": 0.0323, "step": 14000 }, { "epoch": 1.9284239504473502, "grad_norm": 0.07555273175239563, "learning_rate": 1.8281286687015734e-05, "loss": 0.0029, "step": 14010 }, { "epoch": 1.9298004129387474, "grad_norm": 0.011211949400603771, "learning_rate": 1.825780699694764e-05, "loss": 0.0578, "step": 14020 }, { "epoch": 1.9311768754301446, "grad_norm": 0.019071441143751144, "learning_rate": 1.8234327306879552e-05, "loss": 0.0013, "step": 14030 }, { "epoch": 1.9325533379215416, "grad_norm": 0.020604675635695457, "learning_rate": 1.821084761681146e-05, "loss": 0.0009, "step": 14040 }, { "epoch": 1.9339298004129386, "grad_norm": 2.0718631744384766, "learning_rate": 1.8187367926743367e-05, "loss": 0.002, "step": 14050 }, { "epoch": 1.9353062629043358, "grad_norm": 0.011949008330702782, "learning_rate": 1.8163888236675277e-05, "loss": 0.0007, "step": 14060 }, { "epoch": 1.936682725395733, "grad_norm": 0.010087862610816956, "learning_rate": 1.8140408546607185e-05, "loss": 0.0006, "step": 14070 }, { "epoch": 1.93805918788713, "grad_norm": 3.170825958251953, "learning_rate": 1.8116928856539096e-05, "loss": 0.0428, "step": 14080 }, { "epoch": 1.939435650378527, "grad_norm": 0.010439671576023102, "learning_rate": 1.8093449166471003e-05, "loss": 0.0008, "step": 14090 }, { "epoch": 1.9408121128699243, "grad_norm": 0.035380616784095764, "learning_rate": 1.8069969476402914e-05, "loss": 0.0005, "step": 14100 }, { "epoch": 1.9421885753613215, "grad_norm": 2.6345393657684326, "learning_rate": 1.804648978633482e-05, "loss": 0.0593, "step": 14110 }, { "epoch": 1.9435650378527185, "grad_norm": 0.009278692305088043, "learning_rate": 1.802301009626673e-05, "loss": 0.0052, "step": 14120 }, { "epoch": 1.9449415003441155, "grad_norm": 0.008539421483874321, "learning_rate": 1.799953040619864e-05, "loss": 0.0032, "step": 14130 }, { "epoch": 1.9463179628355127, "grad_norm": 0.009281005710363388, "learning_rate": 1.797605071613055e-05, "loss": 0.0024, "step": 14140 }, { "epoch": 1.94769442532691, "grad_norm": 0.03864012286067009, "learning_rate": 1.7952571026062457e-05, "loss": 0.0495, "step": 14150 }, { "epoch": 1.949070887818307, "grad_norm": 0.03348974511027336, "learning_rate": 1.7929091335994368e-05, "loss": 0.0419, "step": 14160 }, { "epoch": 1.950447350309704, "grad_norm": 0.0705091580748558, "learning_rate": 1.7905611645926272e-05, "loss": 0.0052, "step": 14170 }, { "epoch": 1.9518238128011012, "grad_norm": 0.027572311460971832, "learning_rate": 1.7882131955858183e-05, "loss": 0.0015, "step": 14180 }, { "epoch": 1.9532002752924984, "grad_norm": 0.012726387940347195, "learning_rate": 1.7858652265790093e-05, "loss": 0.0011, "step": 14190 }, { "epoch": 1.9545767377838954, "grad_norm": 0.08100269734859467, "learning_rate": 1.7835172575722e-05, "loss": 0.0012, "step": 14200 }, { "epoch": 1.9559532002752924, "grad_norm": 0.010158671997487545, "learning_rate": 1.781169288565391e-05, "loss": 0.0428, "step": 14210 }, { "epoch": 1.9573296627666896, "grad_norm": 0.08443813771009445, "learning_rate": 1.778821319558582e-05, "loss": 0.0016, "step": 14220 }, { "epoch": 1.9587061252580868, "grad_norm": 0.027394089847803116, "learning_rate": 1.7764733505517726e-05, "loss": 0.0004, "step": 14230 }, { "epoch": 1.9600825877494839, "grad_norm": 0.006580899935215712, "learning_rate": 1.7741253815449637e-05, "loss": 0.0007, "step": 14240 }, { "epoch": 1.9614590502408809, "grad_norm": 0.006730885710567236, "learning_rate": 1.7717774125381544e-05, "loss": 0.0005, "step": 14250 }, { "epoch": 1.962835512732278, "grad_norm": 0.0063227140344679356, "learning_rate": 1.7694294435313455e-05, "loss": 0.0003, "step": 14260 }, { "epoch": 1.9642119752236753, "grad_norm": 0.0676233246922493, "learning_rate": 1.7670814745245366e-05, "loss": 0.0015, "step": 14270 }, { "epoch": 1.9655884377150723, "grad_norm": 0.005627878941595554, "learning_rate": 1.7647335055177273e-05, "loss": 0.0286, "step": 14280 }, { "epoch": 1.9669649002064693, "grad_norm": 0.0059104119427502155, "learning_rate": 1.762385536510918e-05, "loss": 0.0048, "step": 14290 }, { "epoch": 1.9683413626978665, "grad_norm": 0.2746431529521942, "learning_rate": 1.7600375675041088e-05, "loss": 0.0025, "step": 14300 }, { "epoch": 1.9697178251892637, "grad_norm": 0.006931979209184647, "learning_rate": 1.7576895984973e-05, "loss": 0.0013, "step": 14310 }, { "epoch": 1.9710942876806608, "grad_norm": 0.0054471236653625965, "learning_rate": 1.755341629490491e-05, "loss": 0.0004, "step": 14320 }, { "epoch": 1.9724707501720578, "grad_norm": 0.006512288469821215, "learning_rate": 1.7529936604836817e-05, "loss": 0.0004, "step": 14330 }, { "epoch": 1.973847212663455, "grad_norm": 0.00530678266659379, "learning_rate": 1.7506456914768728e-05, "loss": 0.0017, "step": 14340 }, { "epoch": 1.975223675154852, "grad_norm": 0.005542337894439697, "learning_rate": 1.7482977224700635e-05, "loss": 0.0002, "step": 14350 }, { "epoch": 1.976600137646249, "grad_norm": 0.005094851367175579, "learning_rate": 1.7459497534632542e-05, "loss": 0.0008, "step": 14360 }, { "epoch": 1.9779766001376462, "grad_norm": 0.005683694966137409, "learning_rate": 1.7436017844564453e-05, "loss": 0.0002, "step": 14370 }, { "epoch": 1.9793530626290434, "grad_norm": 0.01256958395242691, "learning_rate": 1.741253815449636e-05, "loss": 0.0006, "step": 14380 }, { "epoch": 1.9807295251204404, "grad_norm": 0.006077440455555916, "learning_rate": 1.738905846442827e-05, "loss": 0.0006, "step": 14390 }, { "epoch": 1.9821059876118374, "grad_norm": 0.005488158669322729, "learning_rate": 1.7365578774360182e-05, "loss": 0.0002, "step": 14400 }, { "epoch": 1.9834824501032347, "grad_norm": 0.004299502354115248, "learning_rate": 1.734209908429209e-05, "loss": 0.0003, "step": 14410 }, { "epoch": 1.9848589125946319, "grad_norm": 0.004981528501957655, "learning_rate": 1.7318619394223997e-05, "loss": 0.0002, "step": 14420 }, { "epoch": 1.9862353750860289, "grad_norm": 0.0040782783180475235, "learning_rate": 1.7295139704155904e-05, "loss": 0.0533, "step": 14430 }, { "epoch": 1.9876118375774259, "grad_norm": 0.005624454002827406, "learning_rate": 1.7271660014087815e-05, "loss": 0.0002, "step": 14440 }, { "epoch": 1.988988300068823, "grad_norm": 0.0068679386749863625, "learning_rate": 1.7248180324019725e-05, "loss": 0.0273, "step": 14450 }, { "epoch": 1.9903647625602203, "grad_norm": 0.006082321982830763, "learning_rate": 1.7224700633951633e-05, "loss": 0.0004, "step": 14460 }, { "epoch": 1.9917412250516173, "grad_norm": 0.008290169760584831, "learning_rate": 1.7201220943883543e-05, "loss": 0.0003, "step": 14470 }, { "epoch": 1.9931176875430143, "grad_norm": 0.009760730899870396, "learning_rate": 1.717774125381545e-05, "loss": 0.0558, "step": 14480 }, { "epoch": 1.9944941500344116, "grad_norm": 0.016116272658109665, "learning_rate": 1.7154261563747358e-05, "loss": 0.0374, "step": 14490 }, { "epoch": 1.9958706125258088, "grad_norm": 0.01658851094543934, "learning_rate": 1.713078187367927e-05, "loss": 0.0246, "step": 14500 }, { "epoch": 1.9972470750172058, "grad_norm": 0.014818375930190086, "learning_rate": 1.7107302183611176e-05, "loss": 0.0012, "step": 14510 }, { "epoch": 1.9986235375086028, "grad_norm": 0.0107679832726717, "learning_rate": 1.7083822493543087e-05, "loss": 0.0233, "step": 14520 }, { "epoch": 2.0, "grad_norm": 0.011767382733523846, "learning_rate": 1.7060342803474998e-05, "loss": 0.0016, "step": 14530 }, { "epoch": 2.0, "eval_accuracy": 0.9960426703372333, "eval_f1": 0.9585286693112153, "eval_loss": 0.01602308452129364, "eval_precision": 0.9568034557235421, "eval_recall": 0.9602601156069365, "eval_runtime": 51.2015, "eval_samples_per_second": 567.561, "eval_steps_per_second": 35.487, "step": 14530 }, { "epoch": 2.0013764624913972, "grad_norm": 0.008866538293659687, "learning_rate": 1.7036863113406905e-05, "loss": 0.0011, "step": 14540 }, { "epoch": 2.002752924982794, "grad_norm": 0.024682050570845604, "learning_rate": 1.7013383423338812e-05, "loss": 0.0407, "step": 14550 }, { "epoch": 2.0041293874741912, "grad_norm": 0.11835715174674988, "learning_rate": 1.698990373327072e-05, "loss": 0.0213, "step": 14560 }, { "epoch": 2.0055058499655884, "grad_norm": 0.17038236558437347, "learning_rate": 1.696642404320263e-05, "loss": 0.0028, "step": 14570 }, { "epoch": 2.0068823124569857, "grad_norm": 0.009786905720829964, "learning_rate": 1.694294435313454e-05, "loss": 0.0012, "step": 14580 }, { "epoch": 2.0082587749483825, "grad_norm": 0.01710568554699421, "learning_rate": 1.691946466306645e-05, "loss": 0.0237, "step": 14590 }, { "epoch": 2.0096352374397797, "grad_norm": 0.010813843458890915, "learning_rate": 1.6895984972998356e-05, "loss": 0.0696, "step": 14600 }, { "epoch": 2.011011699931177, "grad_norm": 0.10460500419139862, "learning_rate": 1.6872505282930263e-05, "loss": 0.0018, "step": 14610 }, { "epoch": 2.012388162422574, "grad_norm": 0.0122667932882905, "learning_rate": 1.6849025592862174e-05, "loss": 0.0015, "step": 14620 }, { "epoch": 2.013764624913971, "grad_norm": 0.009771223179996014, "learning_rate": 1.6825545902794085e-05, "loss": 0.0009, "step": 14630 }, { "epoch": 2.015141087405368, "grad_norm": 0.00786112155765295, "learning_rate": 1.6802066212725992e-05, "loss": 0.0011, "step": 14640 }, { "epoch": 2.0165175498967653, "grad_norm": 0.0069689624942839146, "learning_rate": 1.6778586522657903e-05, "loss": 0.0007, "step": 14650 }, { "epoch": 2.0178940123881626, "grad_norm": 0.06915377825498581, "learning_rate": 1.675510683258981e-05, "loss": 0.0009, "step": 14660 }, { "epoch": 2.0192704748795594, "grad_norm": 0.0077467672526836395, "learning_rate": 1.6731627142521718e-05, "loss": 0.0211, "step": 14670 }, { "epoch": 2.0206469373709566, "grad_norm": 0.006269470788538456, "learning_rate": 1.670814745245363e-05, "loss": 0.0005, "step": 14680 }, { "epoch": 2.022023399862354, "grad_norm": 0.006324428133666515, "learning_rate": 1.6684667762385536e-05, "loss": 0.0011, "step": 14690 }, { "epoch": 2.023399862353751, "grad_norm": 0.009962160140275955, "learning_rate": 1.6661188072317447e-05, "loss": 0.0171, "step": 14700 }, { "epoch": 2.024776324845148, "grad_norm": 0.0746166929602623, "learning_rate": 1.6637708382249357e-05, "loss": 0.0005, "step": 14710 }, { "epoch": 2.026152787336545, "grad_norm": 0.007255637552589178, "learning_rate": 1.6614228692181265e-05, "loss": 0.055, "step": 14720 }, { "epoch": 2.0275292498279422, "grad_norm": 0.007976378314197063, "learning_rate": 1.6590749002113172e-05, "loss": 0.0004, "step": 14730 }, { "epoch": 2.0289057123193395, "grad_norm": 0.008148830384016037, "learning_rate": 1.656726931204508e-05, "loss": 0.0009, "step": 14740 }, { "epoch": 2.0302821748107363, "grad_norm": 2.4707279205322266, "learning_rate": 1.654378962197699e-05, "loss": 0.0497, "step": 14750 }, { "epoch": 2.0316586373021335, "grad_norm": 0.016896268352866173, "learning_rate": 1.65203099319089e-05, "loss": 0.0006, "step": 14760 }, { "epoch": 2.0330350997935307, "grad_norm": 0.0202204417437315, "learning_rate": 1.6496830241840808e-05, "loss": 0.0177, "step": 14770 }, { "epoch": 2.034411562284928, "grad_norm": 0.01204210240393877, "learning_rate": 1.647335055177272e-05, "loss": 0.0014, "step": 14780 }, { "epoch": 2.0357880247763247, "grad_norm": 0.021756399422883987, "learning_rate": 1.6449870861704626e-05, "loss": 0.0004, "step": 14790 }, { "epoch": 2.037164487267722, "grad_norm": 0.009363628923892975, "learning_rate": 1.6426391171636534e-05, "loss": 0.0027, "step": 14800 }, { "epoch": 2.038540949759119, "grad_norm": 54.625511169433594, "learning_rate": 1.6402911481568444e-05, "loss": 0.0053, "step": 14810 }, { "epoch": 2.0399174122505164, "grad_norm": 0.007546938955783844, "learning_rate": 1.6379431791500352e-05, "loss": 0.0637, "step": 14820 }, { "epoch": 2.041293874741913, "grad_norm": 0.007591840345412493, "learning_rate": 1.6355952101432262e-05, "loss": 0.0003, "step": 14830 }, { "epoch": 2.0426703372333104, "grad_norm": 0.013256668113172054, "learning_rate": 1.6332472411364173e-05, "loss": 0.0326, "step": 14840 }, { "epoch": 2.0440467997247076, "grad_norm": 0.007273561786860228, "learning_rate": 1.630899272129608e-05, "loss": 0.0012, "step": 14850 }, { "epoch": 2.045423262216105, "grad_norm": 0.006617393344640732, "learning_rate": 1.6285513031227988e-05, "loss": 0.0018, "step": 14860 }, { "epoch": 2.0467997247075016, "grad_norm": 0.015325683169066906, "learning_rate": 1.6262033341159895e-05, "loss": 0.0023, "step": 14870 }, { "epoch": 2.048176187198899, "grad_norm": 0.006566501688212156, "learning_rate": 1.6238553651091806e-05, "loss": 0.0007, "step": 14880 }, { "epoch": 2.049552649690296, "grad_norm": 0.005262897349894047, "learning_rate": 1.6215073961023717e-05, "loss": 0.0076, "step": 14890 }, { "epoch": 2.0509291121816933, "grad_norm": 0.005610655527561903, "learning_rate": 1.6191594270955624e-05, "loss": 0.0421, "step": 14900 }, { "epoch": 2.05230557467309, "grad_norm": 0.008396641351282597, "learning_rate": 1.6168114580887535e-05, "loss": 0.0172, "step": 14910 }, { "epoch": 2.0536820371644873, "grad_norm": 0.005525871645659208, "learning_rate": 1.6144634890819442e-05, "loss": 0.0003, "step": 14920 }, { "epoch": 2.0550584996558845, "grad_norm": 0.005806797184050083, "learning_rate": 1.612115520075135e-05, "loss": 0.0023, "step": 14930 }, { "epoch": 2.0564349621472813, "grad_norm": 0.029443582519888878, "learning_rate": 1.609767551068326e-05, "loss": 0.0003, "step": 14940 }, { "epoch": 2.0578114246386785, "grad_norm": 0.005531475879251957, "learning_rate": 1.6074195820615168e-05, "loss": 0.0005, "step": 14950 }, { "epoch": 2.0591878871300757, "grad_norm": 0.3197421133518219, "learning_rate": 1.605071613054708e-05, "loss": 0.0006, "step": 14960 }, { "epoch": 2.060564349621473, "grad_norm": 0.040478575974702835, "learning_rate": 1.602723644047899e-05, "loss": 0.0389, "step": 14970 }, { "epoch": 2.0619408121128697, "grad_norm": 0.017292339354753494, "learning_rate": 1.6003756750410893e-05, "loss": 0.0007, "step": 14980 }, { "epoch": 2.063317274604267, "grad_norm": 0.022878378629684448, "learning_rate": 1.5980277060342804e-05, "loss": 0.0011, "step": 14990 }, { "epoch": 2.064693737095664, "grad_norm": 0.005106889642775059, "learning_rate": 1.595679737027471e-05, "loss": 0.0003, "step": 15000 }, { "epoch": 2.0660701995870614, "grad_norm": 1.841195821762085, "learning_rate": 1.5933317680206622e-05, "loss": 0.0011, "step": 15010 }, { "epoch": 2.067446662078458, "grad_norm": 0.004719685297459364, "learning_rate": 1.5909837990138533e-05, "loss": 0.0011, "step": 15020 }, { "epoch": 2.0688231245698554, "grad_norm": 0.03911282867193222, "learning_rate": 1.588635830007044e-05, "loss": 0.0687, "step": 15030 }, { "epoch": 2.0701995870612526, "grad_norm": 0.10343584418296814, "learning_rate": 1.5862878610002347e-05, "loss": 0.0099, "step": 15040 }, { "epoch": 2.07157604955265, "grad_norm": 0.006051979027688503, "learning_rate": 1.5839398919934255e-05, "loss": 0.0022, "step": 15050 }, { "epoch": 2.0729525120440466, "grad_norm": 0.0430036224424839, "learning_rate": 1.5815919229866166e-05, "loss": 0.0011, "step": 15060 }, { "epoch": 2.074328974535444, "grad_norm": 0.006438880227506161, "learning_rate": 1.5792439539798076e-05, "loss": 0.0011, "step": 15070 }, { "epoch": 2.075705437026841, "grad_norm": 60.276432037353516, "learning_rate": 1.5768959849729984e-05, "loss": 0.0626, "step": 15080 }, { "epoch": 2.0770818995182383, "grad_norm": 0.053911324590444565, "learning_rate": 1.5745480159661894e-05, "loss": 0.0014, "step": 15090 }, { "epoch": 2.078458362009635, "grad_norm": 0.005304734688252211, "learning_rate": 1.5722000469593802e-05, "loss": 0.0203, "step": 15100 }, { "epoch": 2.0798348245010323, "grad_norm": 0.00624978169798851, "learning_rate": 1.569852077952571e-05, "loss": 0.0236, "step": 15110 }, { "epoch": 2.0812112869924295, "grad_norm": 0.004321543499827385, "learning_rate": 1.567504108945762e-05, "loss": 0.0017, "step": 15120 }, { "epoch": 2.0825877494838267, "grad_norm": 0.005269702058285475, "learning_rate": 1.5651561399389527e-05, "loss": 0.019, "step": 15130 }, { "epoch": 2.0839642119752235, "grad_norm": 0.004582828376442194, "learning_rate": 1.5628081709321438e-05, "loss": 0.0009, "step": 15140 }, { "epoch": 2.0853406744666207, "grad_norm": 0.037227753549814224, "learning_rate": 1.560460201925335e-05, "loss": 0.0004, "step": 15150 }, { "epoch": 2.086717136958018, "grad_norm": 0.004351510666310787, "learning_rate": 1.5581122329185256e-05, "loss": 0.0004, "step": 15160 }, { "epoch": 2.088093599449415, "grad_norm": 0.049037061631679535, "learning_rate": 1.5557642639117163e-05, "loss": 0.0642, "step": 15170 }, { "epoch": 2.089470061940812, "grad_norm": 0.13950757682323456, "learning_rate": 1.553416294904907e-05, "loss": 0.0032, "step": 15180 }, { "epoch": 2.090846524432209, "grad_norm": 0.011455926112830639, "learning_rate": 1.551068325898098e-05, "loss": 0.0021, "step": 15190 }, { "epoch": 2.0922229869236064, "grad_norm": 0.006481320597231388, "learning_rate": 1.5487203568912892e-05, "loss": 0.0007, "step": 15200 }, { "epoch": 2.0935994494150036, "grad_norm": 0.005417388863861561, "learning_rate": 1.54637238788448e-05, "loss": 0.0006, "step": 15210 }, { "epoch": 2.0949759119064004, "grad_norm": 0.004794521257281303, "learning_rate": 1.544024418877671e-05, "loss": 0.0005, "step": 15220 }, { "epoch": 2.0963523743977976, "grad_norm": 0.004543993156403303, "learning_rate": 1.5416764498708618e-05, "loss": 0.0341, "step": 15230 }, { "epoch": 2.097728836889195, "grad_norm": 0.005414238199591637, "learning_rate": 1.5393284808640525e-05, "loss": 0.0004, "step": 15240 }, { "epoch": 2.099105299380592, "grad_norm": 0.004324499983340502, "learning_rate": 1.5369805118572436e-05, "loss": 0.0023, "step": 15250 }, { "epoch": 2.100481761871989, "grad_norm": 0.004601080436259508, "learning_rate": 1.5346325428504343e-05, "loss": 0.0001, "step": 15260 }, { "epoch": 2.101858224363386, "grad_norm": 0.004050959832966328, "learning_rate": 1.5322845738436254e-05, "loss": 0.0127, "step": 15270 }, { "epoch": 2.1032346868547833, "grad_norm": 0.1295706182718277, "learning_rate": 1.5299366048368165e-05, "loss": 0.0015, "step": 15280 }, { "epoch": 2.10461114934618, "grad_norm": 0.003535772208124399, "learning_rate": 1.5275886358300072e-05, "loss": 0.0047, "step": 15290 }, { "epoch": 2.1059876118375773, "grad_norm": 0.003767473855987191, "learning_rate": 1.5252406668231981e-05, "loss": 0.0012, "step": 15300 }, { "epoch": 2.1073640743289745, "grad_norm": 0.004612368531525135, "learning_rate": 1.5228926978163888e-05, "loss": 0.0237, "step": 15310 }, { "epoch": 2.1087405368203718, "grad_norm": 0.005236895754933357, "learning_rate": 1.5205447288095797e-05, "loss": 0.0008, "step": 15320 }, { "epoch": 2.1101169993117685, "grad_norm": 0.6976677775382996, "learning_rate": 1.5181967598027708e-05, "loss": 0.0159, "step": 15330 }, { "epoch": 2.1114934618031658, "grad_norm": 0.005726093892008066, "learning_rate": 1.5158487907959616e-05, "loss": 0.0788, "step": 15340 }, { "epoch": 2.112869924294563, "grad_norm": 0.005639946553856134, "learning_rate": 1.5135008217891525e-05, "loss": 0.0002, "step": 15350 }, { "epoch": 2.11424638678596, "grad_norm": 0.005523793864995241, "learning_rate": 1.5111528527823435e-05, "loss": 0.0007, "step": 15360 }, { "epoch": 2.115622849277357, "grad_norm": 0.02015271969139576, "learning_rate": 1.5088048837755343e-05, "loss": 0.0195, "step": 15370 }, { "epoch": 2.116999311768754, "grad_norm": 0.006135217379778624, "learning_rate": 1.5064569147687252e-05, "loss": 0.0005, "step": 15380 }, { "epoch": 2.1183757742601514, "grad_norm": 0.004654943943023682, "learning_rate": 1.5041089457619159e-05, "loss": 0.0027, "step": 15390 }, { "epoch": 2.1197522367515487, "grad_norm": 0.004184729419648647, "learning_rate": 1.501760976755107e-05, "loss": 0.0005, "step": 15400 }, { "epoch": 2.1211286992429454, "grad_norm": 0.004361656494438648, "learning_rate": 1.4994130077482979e-05, "loss": 0.0004, "step": 15410 }, { "epoch": 2.1225051617343427, "grad_norm": 0.052996281534433365, "learning_rate": 1.4970650387414886e-05, "loss": 0.0022, "step": 15420 }, { "epoch": 2.12388162422574, "grad_norm": 0.004608427174389362, "learning_rate": 1.4947170697346797e-05, "loss": 0.0007, "step": 15430 }, { "epoch": 2.125258086717137, "grad_norm": 0.004510062281042337, "learning_rate": 1.4923691007278703e-05, "loss": 0.0002, "step": 15440 }, { "epoch": 2.126634549208534, "grad_norm": 0.00474014226347208, "learning_rate": 1.4900211317210613e-05, "loss": 0.0002, "step": 15450 }, { "epoch": 2.128011011699931, "grad_norm": 0.05187118798494339, "learning_rate": 1.4876731627142524e-05, "loss": 0.001, "step": 15460 }, { "epoch": 2.1293874741913283, "grad_norm": 0.0060547892935574055, "learning_rate": 1.485325193707443e-05, "loss": 0.0004, "step": 15470 }, { "epoch": 2.1307639366827256, "grad_norm": 0.0037264085840433836, "learning_rate": 1.482977224700634e-05, "loss": 0.0001, "step": 15480 }, { "epoch": 2.1321403991741223, "grad_norm": 0.003720672335475683, "learning_rate": 1.480629255693825e-05, "loss": 0.0282, "step": 15490 }, { "epoch": 2.1335168616655196, "grad_norm": 0.002936543431133032, "learning_rate": 1.4782812866870157e-05, "loss": 0.0215, "step": 15500 }, { "epoch": 2.134893324156917, "grad_norm": 0.004322501830756664, "learning_rate": 1.4759333176802068e-05, "loss": 0.0008, "step": 15510 }, { "epoch": 2.136269786648314, "grad_norm": 0.003767559537664056, "learning_rate": 1.4735853486733975e-05, "loss": 0.0006, "step": 15520 }, { "epoch": 2.137646249139711, "grad_norm": 0.005496096331626177, "learning_rate": 1.4712373796665884e-05, "loss": 0.0141, "step": 15530 }, { "epoch": 2.139022711631108, "grad_norm": 0.004007866606116295, "learning_rate": 1.4688894106597795e-05, "loss": 0.0581, "step": 15540 }, { "epoch": 2.1403991741225052, "grad_norm": 0.005200222600251436, "learning_rate": 1.4665414416529702e-05, "loss": 0.001, "step": 15550 }, { "epoch": 2.1417756366139025, "grad_norm": 0.49161410331726074, "learning_rate": 1.4641934726461611e-05, "loss": 0.0193, "step": 15560 }, { "epoch": 2.1431520991052992, "grad_norm": 0.004998934920877218, "learning_rate": 1.4618455036393519e-05, "loss": 0.0002, "step": 15570 }, { "epoch": 2.1445285615966965, "grad_norm": 0.005437081679701805, "learning_rate": 1.459497534632543e-05, "loss": 0.0002, "step": 15580 }, { "epoch": 2.1459050240880937, "grad_norm": 0.005025902763009071, "learning_rate": 1.4571495656257338e-05, "loss": 0.0006, "step": 15590 }, { "epoch": 2.147281486579491, "grad_norm": 0.0044281622394919395, "learning_rate": 1.4548015966189246e-05, "loss": 0.0008, "step": 15600 }, { "epoch": 2.1486579490708877, "grad_norm": 0.004281396511942148, "learning_rate": 1.4524536276121157e-05, "loss": 0.0013, "step": 15610 }, { "epoch": 2.150034411562285, "grad_norm": 0.014449895359575748, "learning_rate": 1.4501056586053066e-05, "loss": 0.0003, "step": 15620 }, { "epoch": 2.151410874053682, "grad_norm": 0.004536815453320742, "learning_rate": 1.4477576895984973e-05, "loss": 0.0582, "step": 15630 }, { "epoch": 2.152787336545079, "grad_norm": 0.008017223328351974, "learning_rate": 1.4454097205916884e-05, "loss": 0.0178, "step": 15640 }, { "epoch": 2.154163799036476, "grad_norm": 0.0069669149816036224, "learning_rate": 1.4430617515848791e-05, "loss": 0.0009, "step": 15650 }, { "epoch": 2.1555402615278734, "grad_norm": 0.006845965515822172, "learning_rate": 1.44071378257807e-05, "loss": 0.0003, "step": 15660 }, { "epoch": 2.1569167240192706, "grad_norm": 0.008999993093311787, "learning_rate": 1.438365813571261e-05, "loss": 0.0003, "step": 15670 }, { "epoch": 2.158293186510668, "grad_norm": 0.006189899519085884, "learning_rate": 1.4360178445644518e-05, "loss": 0.0002, "step": 15680 }, { "epoch": 2.1596696490020646, "grad_norm": 0.005932024214416742, "learning_rate": 1.4336698755576427e-05, "loss": 0.0017, "step": 15690 }, { "epoch": 2.161046111493462, "grad_norm": 7.173044681549072, "learning_rate": 1.4313219065508335e-05, "loss": 0.0222, "step": 15700 }, { "epoch": 2.162422573984859, "grad_norm": 0.00583347212523222, "learning_rate": 1.4289739375440245e-05, "loss": 0.0006, "step": 15710 }, { "epoch": 2.163799036476256, "grad_norm": 0.004940108396112919, "learning_rate": 1.4266259685372154e-05, "loss": 0.0178, "step": 15720 }, { "epoch": 2.165175498967653, "grad_norm": 0.0058240871876478195, "learning_rate": 1.4242779995304062e-05, "loss": 0.0007, "step": 15730 }, { "epoch": 2.1665519614590503, "grad_norm": 0.12519052624702454, "learning_rate": 1.4219300305235972e-05, "loss": 0.0022, "step": 15740 }, { "epoch": 2.1679284239504475, "grad_norm": 0.00462671322748065, "learning_rate": 1.419582061516788e-05, "loss": 0.0117, "step": 15750 }, { "epoch": 2.1693048864418443, "grad_norm": 0.10263559222221375, "learning_rate": 1.4172340925099789e-05, "loss": 0.0111, "step": 15760 }, { "epoch": 2.1706813489332415, "grad_norm": 0.006658340338617563, "learning_rate": 1.41488612350317e-05, "loss": 0.0006, "step": 15770 }, { "epoch": 2.1720578114246387, "grad_norm": 0.004906149581074715, "learning_rate": 1.4125381544963607e-05, "loss": 0.0017, "step": 15780 }, { "epoch": 2.173434273916036, "grad_norm": 0.005223682615906, "learning_rate": 1.4101901854895516e-05, "loss": 0.0002, "step": 15790 }, { "epoch": 2.1748107364074327, "grad_norm": 0.004606060683727264, "learning_rate": 1.4078422164827427e-05, "loss": 0.0029, "step": 15800 }, { "epoch": 2.17618719889883, "grad_norm": 0.0040675075724720955, "learning_rate": 1.4054942474759334e-05, "loss": 0.0125, "step": 15810 }, { "epoch": 2.177563661390227, "grad_norm": 0.00478462316095829, "learning_rate": 1.4031462784691243e-05, "loss": 0.0487, "step": 15820 }, { "epoch": 2.1789401238816244, "grad_norm": 0.006273125763982534, "learning_rate": 1.400798309462315e-05, "loss": 0.0147, "step": 15830 }, { "epoch": 2.180316586373021, "grad_norm": 0.058305833488702774, "learning_rate": 1.3984503404555061e-05, "loss": 0.0008, "step": 15840 }, { "epoch": 2.1816930488644184, "grad_norm": 0.009209031239151955, "learning_rate": 1.396102371448697e-05, "loss": 0.0031, "step": 15850 }, { "epoch": 2.1830695113558156, "grad_norm": 0.0057002785615623, "learning_rate": 1.3937544024418878e-05, "loss": 0.0314, "step": 15860 }, { "epoch": 2.184445973847213, "grad_norm": 0.005813268944621086, "learning_rate": 1.3914064334350788e-05, "loss": 0.0005, "step": 15870 }, { "epoch": 2.1858224363386096, "grad_norm": 0.006548648700118065, "learning_rate": 1.3890584644282694e-05, "loss": 0.0203, "step": 15880 }, { "epoch": 2.187198898830007, "grad_norm": 0.004864300135523081, "learning_rate": 1.3867104954214605e-05, "loss": 0.0147, "step": 15890 }, { "epoch": 2.188575361321404, "grad_norm": 0.00463848328217864, "learning_rate": 1.3843625264146514e-05, "loss": 0.0002, "step": 15900 }, { "epoch": 2.1899518238128013, "grad_norm": 0.004506588447839022, "learning_rate": 1.3820145574078421e-05, "loss": 0.0177, "step": 15910 }, { "epoch": 2.191328286304198, "grad_norm": 0.00651097996160388, "learning_rate": 1.3796665884010332e-05, "loss": 0.0014, "step": 15920 }, { "epoch": 2.1927047487955953, "grad_norm": 0.004669906571507454, "learning_rate": 1.3773186193942241e-05, "loss": 0.0002, "step": 15930 }, { "epoch": 2.1940812112869925, "grad_norm": 2.3310317993164062, "learning_rate": 1.3749706503874148e-05, "loss": 0.1084, "step": 15940 }, { "epoch": 2.1954576737783897, "grad_norm": 0.011468169279396534, "learning_rate": 1.3726226813806059e-05, "loss": 0.0019, "step": 15950 }, { "epoch": 2.1968341362697865, "grad_norm": 0.06027821823954582, "learning_rate": 1.3702747123737966e-05, "loss": 0.0031, "step": 15960 }, { "epoch": 2.1982105987611837, "grad_norm": 0.026332557201385498, "learning_rate": 1.3679267433669876e-05, "loss": 0.0532, "step": 15970 }, { "epoch": 2.199587061252581, "grad_norm": 0.021535011008381844, "learning_rate": 1.3655787743601786e-05, "loss": 0.0009, "step": 15980 }, { "epoch": 2.2009635237439777, "grad_norm": 0.01685076393187046, "learning_rate": 1.3632308053533694e-05, "loss": 0.0018, "step": 15990 }, { "epoch": 2.202339986235375, "grad_norm": 0.015705300495028496, "learning_rate": 1.3608828363465603e-05, "loss": 0.0011, "step": 16000 }, { "epoch": 2.203716448726772, "grad_norm": 0.01280893199145794, "learning_rate": 1.358534867339751e-05, "loss": 0.0014, "step": 16010 }, { "epoch": 2.2050929112181694, "grad_norm": 0.010068601928651333, "learning_rate": 1.356186898332942e-05, "loss": 0.0004, "step": 16020 }, { "epoch": 2.2064693737095666, "grad_norm": 0.007684720680117607, "learning_rate": 1.353838929326133e-05, "loss": 0.0258, "step": 16030 }, { "epoch": 2.2078458362009634, "grad_norm": 0.013119915500283241, "learning_rate": 1.3514909603193237e-05, "loss": 0.052, "step": 16040 }, { "epoch": 2.2092222986923606, "grad_norm": 0.034611575305461884, "learning_rate": 1.3491429913125148e-05, "loss": 0.0409, "step": 16050 }, { "epoch": 2.210598761183758, "grad_norm": 0.029080132022500038, "learning_rate": 1.3467950223057057e-05, "loss": 0.0015, "step": 16060 }, { "epoch": 2.2119752236751546, "grad_norm": 0.01701802760362625, "learning_rate": 1.3444470532988964e-05, "loss": 0.0724, "step": 16070 }, { "epoch": 2.213351686166552, "grad_norm": 0.01373692974448204, "learning_rate": 1.3420990842920875e-05, "loss": 0.0015, "step": 16080 }, { "epoch": 2.214728148657949, "grad_norm": 0.00990945566445589, "learning_rate": 1.3397511152852782e-05, "loss": 0.0302, "step": 16090 }, { "epoch": 2.2161046111493463, "grad_norm": 0.027871023863554, "learning_rate": 1.3374031462784692e-05, "loss": 0.0007, "step": 16100 }, { "epoch": 2.217481073640743, "grad_norm": 2.878763198852539, "learning_rate": 1.3350551772716602e-05, "loss": 0.0392, "step": 16110 }, { "epoch": 2.2188575361321403, "grad_norm": 0.012546445243060589, "learning_rate": 1.332707208264851e-05, "loss": 0.0023, "step": 16120 }, { "epoch": 2.2202339986235375, "grad_norm": 0.008298369124531746, "learning_rate": 1.3303592392580419e-05, "loss": 0.0016, "step": 16130 }, { "epoch": 2.2216104611149348, "grad_norm": 0.008873777464032173, "learning_rate": 1.3280112702512326e-05, "loss": 0.0013, "step": 16140 }, { "epoch": 2.2229869236063315, "grad_norm": 0.01491972990334034, "learning_rate": 1.3256633012444237e-05, "loss": 0.0478, "step": 16150 }, { "epoch": 2.2243633860977288, "grad_norm": 0.03752816095948219, "learning_rate": 1.3233153322376146e-05, "loss": 0.0017, "step": 16160 }, { "epoch": 2.225739848589126, "grad_norm": 0.047794315963983536, "learning_rate": 1.3209673632308053e-05, "loss": 0.0173, "step": 16170 }, { "epoch": 2.227116311080523, "grad_norm": 0.17070272564888, "learning_rate": 1.3186193942239964e-05, "loss": 0.0034, "step": 16180 }, { "epoch": 2.22849277357192, "grad_norm": 0.022974595427513123, "learning_rate": 1.3162714252171873e-05, "loss": 0.0032, "step": 16190 }, { "epoch": 2.229869236063317, "grad_norm": 0.0065156533382833, "learning_rate": 1.313923456210378e-05, "loss": 0.0016, "step": 16200 }, { "epoch": 2.2312456985547144, "grad_norm": 0.124773308634758, "learning_rate": 1.3115754872035691e-05, "loss": 0.0004, "step": 16210 }, { "epoch": 2.2326221610461117, "grad_norm": 0.009697481989860535, "learning_rate": 1.3092275181967598e-05, "loss": 0.0009, "step": 16220 }, { "epoch": 2.2339986235375084, "grad_norm": 0.0052283694967627525, "learning_rate": 1.3068795491899507e-05, "loss": 0.0003, "step": 16230 }, { "epoch": 2.2353750860289057, "grad_norm": 0.008450747467577457, "learning_rate": 1.3045315801831418e-05, "loss": 0.0003, "step": 16240 }, { "epoch": 2.236751548520303, "grad_norm": 0.004986380692571402, "learning_rate": 1.3021836111763326e-05, "loss": 0.0013, "step": 16250 }, { "epoch": 2.2381280110117, "grad_norm": 0.005081825889647007, "learning_rate": 1.2998356421695235e-05, "loss": 0.0002, "step": 16260 }, { "epoch": 2.239504473503097, "grad_norm": 0.0043195332400500774, "learning_rate": 1.2974876731627142e-05, "loss": 0.0011, "step": 16270 }, { "epoch": 2.240880935994494, "grad_norm": 0.0043087066151201725, "learning_rate": 1.2951397041559053e-05, "loss": 0.0002, "step": 16280 }, { "epoch": 2.2422573984858913, "grad_norm": 3.6520700454711914, "learning_rate": 1.2927917351490962e-05, "loss": 0.035, "step": 16290 }, { "epoch": 2.2436338609772886, "grad_norm": 1.0565367937088013, "learning_rate": 1.2904437661422869e-05, "loss": 0.0329, "step": 16300 }, { "epoch": 2.2450103234686853, "grad_norm": 0.004967827815562487, "learning_rate": 1.2880957971354778e-05, "loss": 0.0187, "step": 16310 }, { "epoch": 2.2463867859600826, "grad_norm": 0.06998451799154282, "learning_rate": 1.2857478281286689e-05, "loss": 0.002, "step": 16320 }, { "epoch": 2.24776324845148, "grad_norm": 0.00575461657717824, "learning_rate": 1.2833998591218596e-05, "loss": 0.0025, "step": 16330 }, { "epoch": 2.249139710942877, "grad_norm": 0.06366704404354095, "learning_rate": 1.2810518901150505e-05, "loss": 0.0019, "step": 16340 }, { "epoch": 2.250516173434274, "grad_norm": 0.004235418047755957, "learning_rate": 1.2787039211082413e-05, "loss": 0.0454, "step": 16350 }, { "epoch": 2.251892635925671, "grad_norm": 0.00466632004827261, "learning_rate": 1.2763559521014323e-05, "loss": 0.0313, "step": 16360 }, { "epoch": 2.2532690984170682, "grad_norm": 0.028230218216776848, "learning_rate": 1.2740079830946232e-05, "loss": 0.0044, "step": 16370 }, { "epoch": 2.2546455609084655, "grad_norm": 0.004655507393181324, "learning_rate": 1.271660014087814e-05, "loss": 0.0017, "step": 16380 }, { "epoch": 2.2560220233998622, "grad_norm": 0.00419363658875227, "learning_rate": 1.269312045081005e-05, "loss": 0.001, "step": 16390 }, { "epoch": 2.2573984858912595, "grad_norm": 0.00383185432292521, "learning_rate": 1.2669640760741958e-05, "loss": 0.0016, "step": 16400 }, { "epoch": 2.2587749483826567, "grad_norm": 0.1102212592959404, "learning_rate": 1.2646161070673867e-05, "loss": 0.0021, "step": 16410 }, { "epoch": 2.2601514108740535, "grad_norm": 0.007312215398997068, "learning_rate": 1.2622681380605778e-05, "loss": 0.0013, "step": 16420 }, { "epoch": 2.2615278733654507, "grad_norm": 0.03817540407180786, "learning_rate": 1.2599201690537685e-05, "loss": 0.0002, "step": 16430 }, { "epoch": 2.262904335856848, "grad_norm": 0.003868594067171216, "learning_rate": 1.2575722000469594e-05, "loss": 0.0003, "step": 16440 }, { "epoch": 2.264280798348245, "grad_norm": 0.003422378795221448, "learning_rate": 1.2552242310401505e-05, "loss": 0.0016, "step": 16450 }, { "epoch": 2.2656572608396424, "grad_norm": 0.0035754269920289516, "learning_rate": 1.2528762620333412e-05, "loss": 0.0012, "step": 16460 }, { "epoch": 2.267033723331039, "grad_norm": 0.0036260925699025393, "learning_rate": 1.2505282930265321e-05, "loss": 0.0327, "step": 16470 }, { "epoch": 2.2684101858224364, "grad_norm": 0.003300633979961276, "learning_rate": 1.248180324019723e-05, "loss": 0.0029, "step": 16480 }, { "epoch": 2.2697866483138336, "grad_norm": 0.006106718443334103, "learning_rate": 1.245832355012914e-05, "loss": 0.0001, "step": 16490 }, { "epoch": 2.2711631108052304, "grad_norm": 0.00421554408967495, "learning_rate": 1.2434843860061047e-05, "loss": 0.0017, "step": 16500 }, { "epoch": 2.2725395732966276, "grad_norm": 0.031289130449295044, "learning_rate": 1.2411364169992956e-05, "loss": 0.0016, "step": 16510 }, { "epoch": 2.273916035788025, "grad_norm": 0.0035773427225649357, "learning_rate": 1.2387884479924867e-05, "loss": 0.0008, "step": 16520 }, { "epoch": 2.275292498279422, "grad_norm": 0.0030972997192293406, "learning_rate": 1.2364404789856774e-05, "loss": 0.0001, "step": 16530 }, { "epoch": 2.276668960770819, "grad_norm": 0.0717763900756836, "learning_rate": 1.2340925099788683e-05, "loss": 0.0005, "step": 16540 }, { "epoch": 2.278045423262216, "grad_norm": 0.0031977801118046045, "learning_rate": 1.2317445409720592e-05, "loss": 0.0001, "step": 16550 }, { "epoch": 2.2794218857536133, "grad_norm": 0.0032542503904551268, "learning_rate": 1.2293965719652501e-05, "loss": 0.0011, "step": 16560 }, { "epoch": 2.2807983482450105, "grad_norm": 0.0029131637420505285, "learning_rate": 1.227048602958441e-05, "loss": 0.0001, "step": 16570 }, { "epoch": 2.2821748107364073, "grad_norm": 0.003221919760107994, "learning_rate": 1.2247006339516319e-05, "loss": 0.0001, "step": 16580 }, { "epoch": 2.2835512732278045, "grad_norm": 0.0030108862556517124, "learning_rate": 1.2223526649448228e-05, "loss": 0.0166, "step": 16590 }, { "epoch": 2.2849277357192017, "grad_norm": 0.0024079822469502687, "learning_rate": 1.2200046959380136e-05, "loss": 0.0312, "step": 16600 }, { "epoch": 2.286304198210599, "grad_norm": 0.05413069203495979, "learning_rate": 1.2176567269312046e-05, "loss": 0.0182, "step": 16610 }, { "epoch": 2.2876806607019957, "grad_norm": 0.1562635898590088, "learning_rate": 1.2153087579243955e-05, "loss": 0.0038, "step": 16620 }, { "epoch": 2.289057123193393, "grad_norm": 0.035557568073272705, "learning_rate": 1.2129607889175863e-05, "loss": 0.0155, "step": 16630 }, { "epoch": 2.29043358568479, "grad_norm": 0.0035333663690835238, "learning_rate": 1.2106128199107772e-05, "loss": 0.013, "step": 16640 }, { "epoch": 2.2918100481761874, "grad_norm": 0.09955685585737228, "learning_rate": 1.2082648509039682e-05, "loss": 0.0006, "step": 16650 }, { "epoch": 2.293186510667584, "grad_norm": 0.003153973026201129, "learning_rate": 1.205916881897159e-05, "loss": 0.0325, "step": 16660 }, { "epoch": 2.2945629731589814, "grad_norm": 0.022029578685760498, "learning_rate": 1.2035689128903499e-05, "loss": 0.0551, "step": 16670 }, { "epoch": 2.2959394356503786, "grad_norm": 0.0029600129928439856, "learning_rate": 1.2012209438835408e-05, "loss": 0.0004, "step": 16680 }, { "epoch": 2.2973158981417754, "grad_norm": 0.0034592950250953436, "learning_rate": 1.1988729748767317e-05, "loss": 0.002, "step": 16690 }, { "epoch": 2.2986923606331726, "grad_norm": 0.013330941088497639, "learning_rate": 1.1965250058699226e-05, "loss": 0.0015, "step": 16700 }, { "epoch": 2.30006882312457, "grad_norm": 0.004250243306159973, "learning_rate": 1.1941770368631135e-05, "loss": 0.0024, "step": 16710 }, { "epoch": 2.301445285615967, "grad_norm": 0.022681793197989464, "learning_rate": 1.1918290678563042e-05, "loss": 0.0358, "step": 16720 }, { "epoch": 2.3028217481073643, "grad_norm": 0.053822070360183716, "learning_rate": 1.1894810988494951e-05, "loss": 0.0016, "step": 16730 }, { "epoch": 2.304198210598761, "grad_norm": 0.004227160941809416, "learning_rate": 1.1871331298426862e-05, "loss": 0.0014, "step": 16740 }, { "epoch": 2.3055746730901583, "grad_norm": 0.017719721421599388, "learning_rate": 1.184785160835877e-05, "loss": 0.0019, "step": 16750 }, { "epoch": 2.3069511355815555, "grad_norm": 0.0421658530831337, "learning_rate": 1.1824371918290679e-05, "loss": 0.0263, "step": 16760 }, { "epoch": 2.3083275980729523, "grad_norm": 0.025251897051930428, "learning_rate": 1.1800892228222588e-05, "loss": 0.0012, "step": 16770 }, { "epoch": 2.3097040605643495, "grad_norm": 0.0024115366395562887, "learning_rate": 1.1777412538154497e-05, "loss": 0.0011, "step": 16780 }, { "epoch": 2.3110805230557467, "grad_norm": 0.0034553357400000095, "learning_rate": 1.1753932848086406e-05, "loss": 0.0014, "step": 16790 }, { "epoch": 2.312456985547144, "grad_norm": 0.009824160486459732, "learning_rate": 1.1730453158018315e-05, "loss": 0.0008, "step": 16800 }, { "epoch": 2.313833448038541, "grad_norm": 0.09683432430028915, "learning_rate": 1.1706973467950224e-05, "loss": 0.001, "step": 16810 }, { "epoch": 2.315209910529938, "grad_norm": 0.004298373591154814, "learning_rate": 1.1683493777882131e-05, "loss": 0.0429, "step": 16820 }, { "epoch": 2.316586373021335, "grad_norm": 0.0254741832613945, "learning_rate": 1.1660014087814042e-05, "loss": 0.0008, "step": 16830 }, { "epoch": 2.3179628355127324, "grad_norm": 0.003534565446898341, "learning_rate": 1.1636534397745951e-05, "loss": 0.0161, "step": 16840 }, { "epoch": 2.319339298004129, "grad_norm": 0.0776563286781311, "learning_rate": 1.1613054707677858e-05, "loss": 0.0003, "step": 16850 }, { "epoch": 2.3207157604955264, "grad_norm": 0.003251482732594013, "learning_rate": 1.1589575017609767e-05, "loss": 0.0831, "step": 16860 }, { "epoch": 2.3220922229869236, "grad_norm": 0.006613313220441341, "learning_rate": 1.1566095327541678e-05, "loss": 0.0379, "step": 16870 }, { "epoch": 2.323468685478321, "grad_norm": 0.005484608467668295, "learning_rate": 1.1542615637473586e-05, "loss": 0.0073, "step": 16880 }, { "epoch": 2.3248451479697176, "grad_norm": 0.03550928458571434, "learning_rate": 1.1519135947405495e-05, "loss": 0.0268, "step": 16890 }, { "epoch": 2.326221610461115, "grad_norm": 0.03681137412786484, "learning_rate": 1.1495656257337404e-05, "loss": 0.0025, "step": 16900 }, { "epoch": 2.327598072952512, "grad_norm": 0.004117549397051334, "learning_rate": 1.1472176567269313e-05, "loss": 0.0024, "step": 16910 }, { "epoch": 2.3289745354439093, "grad_norm": 0.022049039602279663, "learning_rate": 1.1448696877201222e-05, "loss": 0.0006, "step": 16920 }, { "epoch": 2.330350997935306, "grad_norm": 0.002901240484789014, "learning_rate": 1.142521718713313e-05, "loss": 0.0003, "step": 16930 }, { "epoch": 2.3317274604267033, "grad_norm": 0.0027352937031537294, "learning_rate": 1.1401737497065038e-05, "loss": 0.0186, "step": 16940 }, { "epoch": 2.3331039229181005, "grad_norm": 0.004185016732662916, "learning_rate": 1.1378257806996947e-05, "loss": 0.0166, "step": 16950 }, { "epoch": 2.3344803854094978, "grad_norm": 0.0034794046077877283, "learning_rate": 1.1354778116928858e-05, "loss": 0.0105, "step": 16960 }, { "epoch": 2.3358568479008945, "grad_norm": 0.0032284066546708345, "learning_rate": 1.1331298426860765e-05, "loss": 0.0003, "step": 16970 }, { "epoch": 2.3372333103922918, "grad_norm": 0.002739533083513379, "learning_rate": 1.1307818736792674e-05, "loss": 0.0013, "step": 16980 }, { "epoch": 2.338609772883689, "grad_norm": 0.02277335710823536, "learning_rate": 1.1284339046724583e-05, "loss": 0.0021, "step": 16990 }, { "epoch": 2.339986235375086, "grad_norm": 0.0025710256304591894, "learning_rate": 1.1260859356656492e-05, "loss": 0.0001, "step": 17000 }, { "epoch": 2.341362697866483, "grad_norm": 0.002985884202644229, "learning_rate": 1.1237379666588402e-05, "loss": 0.0011, "step": 17010 }, { "epoch": 2.34273916035788, "grad_norm": 0.005158576183021069, "learning_rate": 1.121389997652031e-05, "loss": 0.0002, "step": 17020 }, { "epoch": 2.3441156228492774, "grad_norm": 0.18208461999893188, "learning_rate": 1.119042028645222e-05, "loss": 0.0014, "step": 17030 }, { "epoch": 2.3454920853406747, "grad_norm": 0.13928323984146118, "learning_rate": 1.1166940596384129e-05, "loss": 0.0015, "step": 17040 }, { "epoch": 2.3468685478320714, "grad_norm": 0.08140414208173752, "learning_rate": 1.1143460906316038e-05, "loss": 0.0186, "step": 17050 }, { "epoch": 2.3482450103234687, "grad_norm": 0.0032377000898122787, "learning_rate": 1.1119981216247947e-05, "loss": 0.0005, "step": 17060 }, { "epoch": 2.349621472814866, "grad_norm": 0.002675029681995511, "learning_rate": 1.1096501526179854e-05, "loss": 0.0192, "step": 17070 }, { "epoch": 2.350997935306263, "grad_norm": 0.0035454551689326763, "learning_rate": 1.1073021836111763e-05, "loss": 0.0001, "step": 17080 }, { "epoch": 2.35237439779766, "grad_norm": 0.0021786089055240154, "learning_rate": 1.1049542146043674e-05, "loss": 0.0002, "step": 17090 }, { "epoch": 2.353750860289057, "grad_norm": 0.1036883220076561, "learning_rate": 1.1026062455975581e-05, "loss": 0.001, "step": 17100 }, { "epoch": 2.3551273227804543, "grad_norm": 0.02267100289463997, "learning_rate": 1.100258276590749e-05, "loss": 0.0227, "step": 17110 }, { "epoch": 2.356503785271851, "grad_norm": 0.00453278748318553, "learning_rate": 1.09791030758394e-05, "loss": 0.0155, "step": 17120 }, { "epoch": 2.3578802477632483, "grad_norm": 0.10005038231611252, "learning_rate": 1.0955623385771308e-05, "loss": 0.0181, "step": 17130 }, { "epoch": 2.3592567102546456, "grad_norm": 0.024886049330234528, "learning_rate": 1.0932143695703217e-05, "loss": 0.0006, "step": 17140 }, { "epoch": 2.360633172746043, "grad_norm": 0.10947442054748535, "learning_rate": 1.0908664005635127e-05, "loss": 0.0018, "step": 17150 }, { "epoch": 2.36200963523744, "grad_norm": 0.002400325145572424, "learning_rate": 1.0885184315567034e-05, "loss": 0.0002, "step": 17160 }, { "epoch": 2.363386097728837, "grad_norm": 0.06447102874517441, "learning_rate": 1.0861704625498943e-05, "loss": 0.0005, "step": 17170 }, { "epoch": 2.364762560220234, "grad_norm": 0.005458171479403973, "learning_rate": 1.0838224935430854e-05, "loss": 0.0014, "step": 17180 }, { "epoch": 2.3661390227116312, "grad_norm": 0.00378242414444685, "learning_rate": 1.0814745245362761e-05, "loss": 0.0015, "step": 17190 }, { "epoch": 2.367515485203028, "grad_norm": 0.0022008162923157215, "learning_rate": 1.079126555529467e-05, "loss": 0.0019, "step": 17200 }, { "epoch": 2.3688919476944252, "grad_norm": 0.09457189589738846, "learning_rate": 1.0767785865226579e-05, "loss": 0.0014, "step": 17210 }, { "epoch": 2.3702684101858225, "grad_norm": 0.0025666726287454367, "learning_rate": 1.0744306175158488e-05, "loss": 0.0007, "step": 17220 }, { "epoch": 2.3716448726772197, "grad_norm": 0.0021153443958610296, "learning_rate": 1.0720826485090397e-05, "loss": 0.001, "step": 17230 }, { "epoch": 2.3730213351686165, "grad_norm": 0.0024693270679563284, "learning_rate": 1.0697346795022306e-05, "loss": 0.0173, "step": 17240 }, { "epoch": 2.3743977976600137, "grad_norm": 0.0026640386786311865, "learning_rate": 1.0673867104954215e-05, "loss": 0.0008, "step": 17250 }, { "epoch": 2.375774260151411, "grad_norm": 0.03538583591580391, "learning_rate": 1.0650387414886124e-05, "loss": 0.0013, "step": 17260 }, { "epoch": 2.377150722642808, "grad_norm": 0.10732273757457733, "learning_rate": 1.0626907724818033e-05, "loss": 0.0496, "step": 17270 }, { "epoch": 2.378527185134205, "grad_norm": 0.004489613231271505, "learning_rate": 1.0603428034749942e-05, "loss": 0.0004, "step": 17280 }, { "epoch": 2.379903647625602, "grad_norm": 0.005731387063860893, "learning_rate": 1.057994834468185e-05, "loss": 0.0006, "step": 17290 }, { "epoch": 2.3812801101169994, "grad_norm": 0.0020768449176102877, "learning_rate": 1.0556468654613759e-05, "loss": 0.0007, "step": 17300 }, { "epoch": 2.3826565726083966, "grad_norm": 0.0030674710869789124, "learning_rate": 1.053298896454567e-05, "loss": 0.0006, "step": 17310 }, { "epoch": 2.3840330350997934, "grad_norm": 0.0051978821866214275, "learning_rate": 1.0509509274477577e-05, "loss": 0.0611, "step": 17320 }, { "epoch": 2.3854094975911906, "grad_norm": 0.004597569815814495, "learning_rate": 1.0486029584409486e-05, "loss": 0.0004, "step": 17330 }, { "epoch": 2.386785960082588, "grad_norm": 0.0471312515437603, "learning_rate": 1.0462549894341395e-05, "loss": 0.0356, "step": 17340 }, { "epoch": 2.388162422573985, "grad_norm": 0.010465112514793873, "learning_rate": 1.0439070204273304e-05, "loss": 0.001, "step": 17350 }, { "epoch": 2.389538885065382, "grad_norm": 0.02170352265238762, "learning_rate": 1.0415590514205213e-05, "loss": 0.0371, "step": 17360 }, { "epoch": 2.390915347556779, "grad_norm": 0.028568517416715622, "learning_rate": 1.0392110824137122e-05, "loss": 0.0009, "step": 17370 }, { "epoch": 2.3922918100481763, "grad_norm": 0.060332510620355606, "learning_rate": 1.036863113406903e-05, "loss": 0.0016, "step": 17380 }, { "epoch": 2.3936682725395735, "grad_norm": 0.029738182201981544, "learning_rate": 1.034515144400094e-05, "loss": 0.0555, "step": 17390 }, { "epoch": 2.3950447350309703, "grad_norm": 0.07651922851800919, "learning_rate": 1.032167175393285e-05, "loss": 0.0018, "step": 17400 }, { "epoch": 2.3964211975223675, "grad_norm": 0.9309929013252258, "learning_rate": 1.0298192063864757e-05, "loss": 0.0194, "step": 17410 }, { "epoch": 2.3977976600137647, "grad_norm": 0.0062381187453866005, "learning_rate": 1.0274712373796666e-05, "loss": 0.0006, "step": 17420 }, { "epoch": 2.399174122505162, "grad_norm": 0.005443766713142395, "learning_rate": 1.0251232683728575e-05, "loss": 0.0331, "step": 17430 }, { "epoch": 2.4005505849965587, "grad_norm": 0.006431065499782562, "learning_rate": 1.0227752993660484e-05, "loss": 0.0002, "step": 17440 }, { "epoch": 2.401927047487956, "grad_norm": 0.03066302090883255, "learning_rate": 1.0204273303592393e-05, "loss": 0.001, "step": 17450 }, { "epoch": 2.403303509979353, "grad_norm": 0.015972044318914413, "learning_rate": 1.0180793613524302e-05, "loss": 0.0012, "step": 17460 }, { "epoch": 2.40467997247075, "grad_norm": 0.004990411456674337, "learning_rate": 1.0157313923456211e-05, "loss": 0.0008, "step": 17470 }, { "epoch": 2.406056434962147, "grad_norm": 0.00502657238394022, "learning_rate": 1.013383423338812e-05, "loss": 0.0342, "step": 17480 }, { "epoch": 2.4074328974535444, "grad_norm": 0.003942796494811773, "learning_rate": 1.0110354543320029e-05, "loss": 0.0008, "step": 17490 }, { "epoch": 2.4088093599449416, "grad_norm": 0.0037104864604771137, "learning_rate": 1.0086874853251938e-05, "loss": 0.0354, "step": 17500 }, { "epoch": 2.410185822436339, "grad_norm": 0.003349665319547057, "learning_rate": 1.0063395163183846e-05, "loss": 0.0625, "step": 17510 }, { "epoch": 2.4115622849277356, "grad_norm": 0.1563953459262848, "learning_rate": 1.0039915473115756e-05, "loss": 0.0017, "step": 17520 }, { "epoch": 2.412938747419133, "grad_norm": 0.005117695778608322, "learning_rate": 1.0016435783047665e-05, "loss": 0.0023, "step": 17530 }, { "epoch": 2.41431520991053, "grad_norm": 0.003334884764626622, "learning_rate": 9.992956092979573e-06, "loss": 0.0008, "step": 17540 }, { "epoch": 2.415691672401927, "grad_norm": 0.004285046365112066, "learning_rate": 9.969476402911482e-06, "loss": 0.0196, "step": 17550 }, { "epoch": 2.417068134893324, "grad_norm": 0.0036209984682500362, "learning_rate": 9.94599671284339e-06, "loss": 0.0009, "step": 17560 }, { "epoch": 2.4184445973847213, "grad_norm": 0.003270169720053673, "learning_rate": 9.9225170227753e-06, "loss": 0.0019, "step": 17570 }, { "epoch": 2.4198210598761185, "grad_norm": 0.0028148540295660496, "learning_rate": 9.899037332707209e-06, "loss": 0.0015, "step": 17580 }, { "epoch": 2.4211975223675157, "grad_norm": 0.46761927008628845, "learning_rate": 9.875557642639118e-06, "loss": 0.0012, "step": 17590 }, { "epoch": 2.4225739848589125, "grad_norm": 0.0031428427901118994, "learning_rate": 9.852077952571025e-06, "loss": 0.0256, "step": 17600 }, { "epoch": 2.4239504473503097, "grad_norm": 0.0028510456904768944, "learning_rate": 9.828598262502936e-06, "loss": 0.0266, "step": 17610 }, { "epoch": 2.425326909841707, "grad_norm": 0.0037182350642979145, "learning_rate": 9.805118572434845e-06, "loss": 0.0162, "step": 17620 }, { "epoch": 2.4267033723331037, "grad_norm": 0.023671559989452362, "learning_rate": 9.781638882366752e-06, "loss": 0.0016, "step": 17630 }, { "epoch": 2.428079834824501, "grad_norm": 0.0025620930828154087, "learning_rate": 9.758159192298661e-06, "loss": 0.0001, "step": 17640 }, { "epoch": 2.429456297315898, "grad_norm": 0.08003072440624237, "learning_rate": 9.73467950223057e-06, "loss": 0.0014, "step": 17650 }, { "epoch": 2.4308327598072954, "grad_norm": 0.0027324145194143057, "learning_rate": 9.71119981216248e-06, "loss": 0.0016, "step": 17660 }, { "epoch": 2.432209222298692, "grad_norm": 0.0033126117195934057, "learning_rate": 9.687720122094389e-06, "loss": 0.0008, "step": 17670 }, { "epoch": 2.4335856847900894, "grad_norm": 0.0038483846001327038, "learning_rate": 9.664240432026298e-06, "loss": 0.0759, "step": 17680 }, { "epoch": 2.4349621472814866, "grad_norm": 0.02047133632004261, "learning_rate": 9.640760741958207e-06, "loss": 0.0341, "step": 17690 }, { "epoch": 2.436338609772884, "grad_norm": 0.02431926131248474, "learning_rate": 9.617281051890116e-06, "loss": 0.0074, "step": 17700 }, { "epoch": 2.4377150722642806, "grad_norm": 0.004275232553482056, "learning_rate": 9.593801361822025e-06, "loss": 0.0005, "step": 17710 }, { "epoch": 2.439091534755678, "grad_norm": 0.06768094003200531, "learning_rate": 9.570321671753934e-06, "loss": 0.0014, "step": 17720 }, { "epoch": 2.440467997247075, "grad_norm": 0.0058098770678043365, "learning_rate": 9.546841981685841e-06, "loss": 0.0342, "step": 17730 }, { "epoch": 2.4418444597384723, "grad_norm": 0.01322112511843443, "learning_rate": 9.523362291617752e-06, "loss": 0.0005, "step": 17740 }, { "epoch": 2.443220922229869, "grad_norm": 0.005498557351529598, "learning_rate": 9.499882601549661e-06, "loss": 0.0017, "step": 17750 }, { "epoch": 2.4445973847212663, "grad_norm": 0.003586405888199806, "learning_rate": 9.476402911481568e-06, "loss": 0.001, "step": 17760 }, { "epoch": 2.4459738472126635, "grad_norm": 0.0034812844824045897, "learning_rate": 9.452923221413477e-06, "loss": 0.0007, "step": 17770 }, { "epoch": 2.4473503097040608, "grad_norm": 0.004172442015260458, "learning_rate": 9.429443531345386e-06, "loss": 0.0007, "step": 17780 }, { "epoch": 2.4487267721954575, "grad_norm": 0.0035246103070676327, "learning_rate": 9.405963841277296e-06, "loss": 0.0007, "step": 17790 }, { "epoch": 2.4501032346868548, "grad_norm": 0.003187755588442087, "learning_rate": 9.382484151209205e-06, "loss": 0.0002, "step": 17800 }, { "epoch": 2.451479697178252, "grad_norm": 0.004040002357214689, "learning_rate": 9.359004461141114e-06, "loss": 0.0006, "step": 17810 }, { "epoch": 2.4528561596696488, "grad_norm": 0.004429070744663477, "learning_rate": 9.335524771073021e-06, "loss": 0.0046, "step": 17820 }, { "epoch": 2.454232622161046, "grad_norm": 0.00906100682914257, "learning_rate": 9.312045081004932e-06, "loss": 0.0065, "step": 17830 }, { "epoch": 2.455609084652443, "grad_norm": 0.0032097608782351017, "learning_rate": 9.28856539093684e-06, "loss": 0.0017, "step": 17840 }, { "epoch": 2.4569855471438404, "grad_norm": 0.0033619164023548365, "learning_rate": 9.265085700868748e-06, "loss": 0.007, "step": 17850 }, { "epoch": 2.4583620096352377, "grad_norm": 0.003231579903513193, "learning_rate": 9.241606010800657e-06, "loss": 0.0016, "step": 17860 }, { "epoch": 2.4597384721266344, "grad_norm": 0.003239681012928486, "learning_rate": 9.218126320732568e-06, "loss": 0.0011, "step": 17870 }, { "epoch": 2.4611149346180317, "grad_norm": 0.0029821773059666157, "learning_rate": 9.194646630664475e-06, "loss": 0.0007, "step": 17880 }, { "epoch": 2.462491397109429, "grad_norm": 0.002948109293356538, "learning_rate": 9.171166940596384e-06, "loss": 0.0002, "step": 17890 }, { "epoch": 2.4638678596008257, "grad_norm": 0.002904691267758608, "learning_rate": 9.147687250528293e-06, "loss": 0.0125, "step": 17900 }, { "epoch": 2.465244322092223, "grad_norm": 0.002750644227489829, "learning_rate": 9.124207560460202e-06, "loss": 0.0134, "step": 17910 }, { "epoch": 2.46662078458362, "grad_norm": 2.362738847732544, "learning_rate": 9.100727870392112e-06, "loss": 0.0575, "step": 17920 }, { "epoch": 2.4679972470750173, "grad_norm": 0.010659296065568924, "learning_rate": 9.07724818032402e-06, "loss": 0.0075, "step": 17930 }, { "epoch": 2.4693737095664146, "grad_norm": 0.00408196123316884, "learning_rate": 9.05376849025593e-06, "loss": 0.0001, "step": 17940 }, { "epoch": 2.4707501720578113, "grad_norm": 0.004041661974042654, "learning_rate": 9.030288800187837e-06, "loss": 0.0014, "step": 17950 }, { "epoch": 2.4721266345492086, "grad_norm": 0.003186454065144062, "learning_rate": 9.006809110119748e-06, "loss": 0.0025, "step": 17960 }, { "epoch": 2.473503097040606, "grad_norm": 0.044607799500226974, "learning_rate": 8.983329420051657e-06, "loss": 0.0005, "step": 17970 }, { "epoch": 2.4748795595320026, "grad_norm": 0.0034542980138212442, "learning_rate": 8.959849729983564e-06, "loss": 0.0004, "step": 17980 }, { "epoch": 2.4762560220234, "grad_norm": 0.004620147868990898, "learning_rate": 8.936370039915473e-06, "loss": 0.0227, "step": 17990 }, { "epoch": 2.477632484514797, "grad_norm": 0.0034398322459310293, "learning_rate": 8.912890349847382e-06, "loss": 0.0005, "step": 18000 }, { "epoch": 2.4790089470061942, "grad_norm": 0.0035394737496972084, "learning_rate": 8.889410659779291e-06, "loss": 0.0009, "step": 18010 }, { "epoch": 2.480385409497591, "grad_norm": 0.004152490757405758, "learning_rate": 8.8659309697112e-06, "loss": 0.0001, "step": 18020 }, { "epoch": 2.4817618719889882, "grad_norm": 0.003855903400108218, "learning_rate": 8.84245127964311e-06, "loss": 0.0004, "step": 18030 }, { "epoch": 2.4831383344803855, "grad_norm": 0.003463648958131671, "learning_rate": 8.818971589575017e-06, "loss": 0.0004, "step": 18040 }, { "epoch": 2.4845147969717827, "grad_norm": 0.003661786438897252, "learning_rate": 8.795491899506927e-06, "loss": 0.001, "step": 18050 }, { "epoch": 2.4858912594631795, "grad_norm": 0.0044890474528074265, "learning_rate": 8.772012209438837e-06, "loss": 0.0004, "step": 18060 }, { "epoch": 2.4872677219545767, "grad_norm": 0.0030545340850949287, "learning_rate": 8.748532519370744e-06, "loss": 0.0186, "step": 18070 }, { "epoch": 2.488644184445974, "grad_norm": 0.003303753212094307, "learning_rate": 8.725052829302653e-06, "loss": 0.0377, "step": 18080 }, { "epoch": 2.490020646937371, "grad_norm": 0.003316381247714162, "learning_rate": 8.701573139234564e-06, "loss": 0.0002, "step": 18090 }, { "epoch": 2.491397109428768, "grad_norm": 0.004243026487529278, "learning_rate": 8.678093449166471e-06, "loss": 0.0004, "step": 18100 }, { "epoch": 2.492773571920165, "grad_norm": 0.004384580068290234, "learning_rate": 8.65461375909838e-06, "loss": 0.0607, "step": 18110 }, { "epoch": 2.4941500344115624, "grad_norm": 0.0038801473565399647, "learning_rate": 8.631134069030289e-06, "loss": 0.048, "step": 18120 }, { "epoch": 2.4955264969029596, "grad_norm": 0.003646759781986475, "learning_rate": 8.607654378962198e-06, "loss": 0.001, "step": 18130 }, { "epoch": 2.4969029593943564, "grad_norm": 0.004383504390716553, "learning_rate": 8.584174688894107e-06, "loss": 0.0028, "step": 18140 }, { "epoch": 2.4982794218857536, "grad_norm": 0.06718702614307404, "learning_rate": 8.560694998826016e-06, "loss": 0.0005, "step": 18150 }, { "epoch": 2.499655884377151, "grad_norm": 0.004362696316093206, "learning_rate": 8.537215308757925e-06, "loss": 0.001, "step": 18160 }, { "epoch": 2.5010323468685476, "grad_norm": 0.0036143844481557608, "learning_rate": 8.513735618689833e-06, "loss": 0.0002, "step": 18170 }, { "epoch": 2.502408809359945, "grad_norm": 0.004395583178848028, "learning_rate": 8.490255928621743e-06, "loss": 0.0316, "step": 18180 }, { "epoch": 2.503785271851342, "grad_norm": 0.004048427566885948, "learning_rate": 8.466776238553652e-06, "loss": 0.0294, "step": 18190 }, { "epoch": 2.5051617343427393, "grad_norm": 0.004275417886674404, "learning_rate": 8.44329654848556e-06, "loss": 0.0005, "step": 18200 }, { "epoch": 2.5065381968341365, "grad_norm": 0.05066804960370064, "learning_rate": 8.419816858417469e-06, "loss": 0.0387, "step": 18210 }, { "epoch": 2.5079146593255333, "grad_norm": 0.007686094380915165, "learning_rate": 8.39633716834938e-06, "loss": 0.0024, "step": 18220 }, { "epoch": 2.5092911218169305, "grad_norm": 0.0039205607026815414, "learning_rate": 8.372857478281287e-06, "loss": 0.0615, "step": 18230 }, { "epoch": 2.5106675843083277, "grad_norm": 0.005317528732120991, "learning_rate": 8.349377788213196e-06, "loss": 0.0136, "step": 18240 }, { "epoch": 2.5120440467997245, "grad_norm": 0.9919801950454712, "learning_rate": 8.325898098145105e-06, "loss": 0.0037, "step": 18250 }, { "epoch": 2.5134205092911217, "grad_norm": 0.005125365685671568, "learning_rate": 8.302418408077012e-06, "loss": 0.0172, "step": 18260 }, { "epoch": 2.514796971782519, "grad_norm": 0.005410562735050917, "learning_rate": 8.278938718008923e-06, "loss": 0.0121, "step": 18270 }, { "epoch": 2.516173434273916, "grad_norm": 18.020517349243164, "learning_rate": 8.255459027940832e-06, "loss": 0.0117, "step": 18280 }, { "epoch": 2.5175498967653134, "grad_norm": 0.004844162613153458, "learning_rate": 8.23197933787274e-06, "loss": 0.0021, "step": 18290 }, { "epoch": 2.51892635925671, "grad_norm": 0.004951165057718754, "learning_rate": 8.208499647804649e-06, "loss": 0.0092, "step": 18300 }, { "epoch": 2.5203028217481074, "grad_norm": 0.004495666362345219, "learning_rate": 8.18501995773656e-06, "loss": 0.0013, "step": 18310 }, { "epoch": 2.5216792842395046, "grad_norm": 0.00570897338911891, "learning_rate": 8.161540267668467e-06, "loss": 0.0913, "step": 18320 }, { "epoch": 2.5230557467309014, "grad_norm": 0.008307446725666523, "learning_rate": 8.138060577600376e-06, "loss": 0.0564, "step": 18330 }, { "epoch": 2.5244322092222986, "grad_norm": 0.01065066084265709, "learning_rate": 8.114580887532285e-06, "loss": 0.0088, "step": 18340 }, { "epoch": 2.525808671713696, "grad_norm": 0.009945325553417206, "learning_rate": 8.091101197464194e-06, "loss": 0.0007, "step": 18350 }, { "epoch": 2.527185134205093, "grad_norm": 0.01124708354473114, "learning_rate": 8.067621507396103e-06, "loss": 0.046, "step": 18360 }, { "epoch": 2.5285615966964903, "grad_norm": 0.017190467566251755, "learning_rate": 8.044141817328012e-06, "loss": 0.0084, "step": 18370 }, { "epoch": 2.529938059187887, "grad_norm": 0.04266795143485069, "learning_rate": 8.020662127259921e-06, "loss": 0.0012, "step": 18380 }, { "epoch": 2.5313145216792843, "grad_norm": 0.015506861731410027, "learning_rate": 7.997182437191828e-06, "loss": 0.0016, "step": 18390 }, { "epoch": 2.5326909841706815, "grad_norm": 0.012267494574189186, "learning_rate": 7.973702747123739e-06, "loss": 0.0178, "step": 18400 }, { "epoch": 2.5340674466620783, "grad_norm": 0.017596907913684845, "learning_rate": 7.950223057055648e-06, "loss": 0.0472, "step": 18410 }, { "epoch": 2.5354439091534755, "grad_norm": 0.05161113291978836, "learning_rate": 7.926743366987556e-06, "loss": 0.0433, "step": 18420 }, { "epoch": 2.5368203716448727, "grad_norm": 0.0797748938202858, "learning_rate": 7.903263676919465e-06, "loss": 0.0537, "step": 18430 }, { "epoch": 2.53819683413627, "grad_norm": 0.041050296276807785, "learning_rate": 7.879783986851375e-06, "loss": 0.0145, "step": 18440 }, { "epoch": 2.5395732966276667, "grad_norm": 0.018233496695756912, "learning_rate": 7.856304296783283e-06, "loss": 0.0035, "step": 18450 }, { "epoch": 2.540949759119064, "grad_norm": 0.02511315792798996, "learning_rate": 7.832824606715192e-06, "loss": 0.0014, "step": 18460 }, { "epoch": 2.542326221610461, "grad_norm": 0.027422351762652397, "learning_rate": 7.8093449166471e-06, "loss": 0.0005, "step": 18470 }, { "epoch": 2.5437026841018584, "grad_norm": 0.012228838168084621, "learning_rate": 7.785865226579008e-06, "loss": 0.0009, "step": 18480 }, { "epoch": 2.545079146593255, "grad_norm": 0.008324486203491688, "learning_rate": 7.762385536510919e-06, "loss": 0.0004, "step": 18490 }, { "epoch": 2.5464556090846524, "grad_norm": 0.01068083755671978, "learning_rate": 7.738905846442828e-06, "loss": 0.0006, "step": 18500 }, { "epoch": 2.5478320715760496, "grad_norm": 6.41408109664917, "learning_rate": 7.715426156374735e-06, "loss": 0.0079, "step": 18510 }, { "epoch": 2.5492085340674464, "grad_norm": 0.007799135986715555, "learning_rate": 7.691946466306644e-06, "loss": 0.0674, "step": 18520 }, { "epoch": 2.5505849965588436, "grad_norm": 0.012690936215221882, "learning_rate": 7.668466776238555e-06, "loss": 0.0007, "step": 18530 }, { "epoch": 2.551961459050241, "grad_norm": 0.011871379800140858, "learning_rate": 7.644987086170462e-06, "loss": 0.0506, "step": 18540 }, { "epoch": 2.553337921541638, "grad_norm": 0.014154437929391861, "learning_rate": 7.6215073961023715e-06, "loss": 0.0006, "step": 18550 }, { "epoch": 2.5547143840330353, "grad_norm": 0.043574340641498566, "learning_rate": 7.5980277060342805e-06, "loss": 0.0008, "step": 18560 }, { "epoch": 2.556090846524432, "grad_norm": 0.01686745509505272, "learning_rate": 7.5745480159661904e-06, "loss": 0.0007, "step": 18570 }, { "epoch": 2.5574673090158293, "grad_norm": 0.013989301398396492, "learning_rate": 7.551068325898099e-06, "loss": 0.0109, "step": 18580 }, { "epoch": 2.5588437715072265, "grad_norm": 0.012241232208907604, "learning_rate": 7.527588635830008e-06, "loss": 0.002, "step": 18590 }, { "epoch": 2.5602202339986233, "grad_norm": 0.009555461816489697, "learning_rate": 7.504108945761916e-06, "loss": 0.009, "step": 18600 }, { "epoch": 2.5615966964900205, "grad_norm": 0.013078044168651104, "learning_rate": 7.480629255693825e-06, "loss": 0.0249, "step": 18610 }, { "epoch": 2.5629731589814178, "grad_norm": 0.008629821240901947, "learning_rate": 7.457149565625735e-06, "loss": 0.0004, "step": 18620 }, { "epoch": 2.564349621472815, "grad_norm": 0.0093661118298769, "learning_rate": 7.433669875557643e-06, "loss": 0.0007, "step": 18630 }, { "epoch": 2.565726083964212, "grad_norm": 0.01100209355354309, "learning_rate": 7.410190185489552e-06, "loss": 0.0003, "step": 18640 }, { "epoch": 2.567102546455609, "grad_norm": 0.7286455035209656, "learning_rate": 7.38671049542146e-06, "loss": 0.0075, "step": 18650 }, { "epoch": 2.568479008947006, "grad_norm": 0.008741943165659904, "learning_rate": 7.36323080535337e-06, "loss": 0.0088, "step": 18660 }, { "epoch": 2.5698554714384034, "grad_norm": 0.04528751224279404, "learning_rate": 7.339751115285278e-06, "loss": 0.0005, "step": 18670 }, { "epoch": 2.5712319339298, "grad_norm": 0.007430265657603741, "learning_rate": 7.3162714252171874e-06, "loss": 0.0003, "step": 18680 }, { "epoch": 2.5726083964211974, "grad_norm": 0.007825636304914951, "learning_rate": 7.292791735149096e-06, "loss": 0.0007, "step": 18690 }, { "epoch": 2.5739848589125947, "grad_norm": 0.007508194539695978, "learning_rate": 7.2693120450810055e-06, "loss": 0.0003, "step": 18700 }, { "epoch": 2.575361321403992, "grad_norm": 0.006888227537274361, "learning_rate": 7.245832355012915e-06, "loss": 0.0026, "step": 18710 }, { "epoch": 2.576737783895389, "grad_norm": 0.008085234090685844, "learning_rate": 7.222352664944823e-06, "loss": 0.0003, "step": 18720 }, { "epoch": 2.578114246386786, "grad_norm": 0.007945626974105835, "learning_rate": 7.198872974876732e-06, "loss": 0.0002, "step": 18730 }, { "epoch": 2.579490708878183, "grad_norm": 0.005616112612187862, "learning_rate": 7.17539328480864e-06, "loss": 0.0002, "step": 18740 }, { "epoch": 2.5808671713695803, "grad_norm": 0.11620043963193893, "learning_rate": 7.15191359474055e-06, "loss": 0.0025, "step": 18750 }, { "epoch": 2.582243633860977, "grad_norm": 0.005201470572501421, "learning_rate": 7.128433904672459e-06, "loss": 0.0005, "step": 18760 }, { "epoch": 2.5836200963523743, "grad_norm": 0.005957538262009621, "learning_rate": 7.104954214604367e-06, "loss": 0.0313, "step": 18770 }, { "epoch": 2.5849965588437716, "grad_norm": 0.005437659565359354, "learning_rate": 7.081474524536276e-06, "loss": 0.0083, "step": 18780 }, { "epoch": 2.5863730213351688, "grad_norm": 0.006864655762910843, "learning_rate": 7.057994834468186e-06, "loss": 0.0011, "step": 18790 }, { "epoch": 2.587749483826566, "grad_norm": 0.00630088010802865, "learning_rate": 7.034515144400094e-06, "loss": 0.0003, "step": 18800 }, { "epoch": 2.589125946317963, "grad_norm": 0.039472535252571106, "learning_rate": 7.011035454332003e-06, "loss": 0.012, "step": 18810 }, { "epoch": 2.59050240880936, "grad_norm": 0.00500042038038373, "learning_rate": 6.987555764263912e-06, "loss": 0.0075, "step": 18820 }, { "epoch": 2.5918788713007572, "grad_norm": 0.0058846077881753445, "learning_rate": 6.9640760741958215e-06, "loss": 0.0565, "step": 18830 }, { "epoch": 2.593255333792154, "grad_norm": 1.2770411968231201, "learning_rate": 6.9405963841277306e-06, "loss": 0.0071, "step": 18840 }, { "epoch": 2.5946317962835512, "grad_norm": 0.007312930654734373, "learning_rate": 6.917116694059639e-06, "loss": 0.0305, "step": 18850 }, { "epoch": 2.5960082587749485, "grad_norm": 0.008678222075104713, "learning_rate": 6.893637003991547e-06, "loss": 0.007, "step": 18860 }, { "epoch": 2.5973847212663452, "grad_norm": 0.10399042814970016, "learning_rate": 6.870157313923456e-06, "loss": 0.062, "step": 18870 }, { "epoch": 2.5987611837577425, "grad_norm": 0.012444945983588696, "learning_rate": 6.846677623855366e-06, "loss": 0.0082, "step": 18880 }, { "epoch": 2.6001376462491397, "grad_norm": 0.009172143414616585, "learning_rate": 6.823197933787274e-06, "loss": 0.0003, "step": 18890 }, { "epoch": 2.601514108740537, "grad_norm": 0.673055112361908, "learning_rate": 6.799718243719183e-06, "loss": 0.0024, "step": 18900 }, { "epoch": 2.602890571231934, "grad_norm": 0.00925646536052227, "learning_rate": 6.776238553651091e-06, "loss": 0.0005, "step": 18910 }, { "epoch": 2.604267033723331, "grad_norm": 1.6665762662887573, "learning_rate": 6.752758863583001e-06, "loss": 0.0106, "step": 18920 }, { "epoch": 2.605643496214728, "grad_norm": 0.24754302203655243, "learning_rate": 6.72927917351491e-06, "loss": 0.0258, "step": 18930 }, { "epoch": 2.6070199587061254, "grad_norm": 0.012733501382172108, "learning_rate": 6.7057994834468185e-06, "loss": 0.0018, "step": 18940 }, { "epoch": 2.608396421197522, "grad_norm": 0.006816320586949587, "learning_rate": 6.6823197933787276e-06, "loss": 0.0048, "step": 18950 }, { "epoch": 2.6097728836889194, "grad_norm": 0.007947875186800957, "learning_rate": 6.658840103310636e-06, "loss": 0.0004, "step": 18960 }, { "epoch": 2.6111493461803166, "grad_norm": 0.007081441581249237, "learning_rate": 6.635360413242546e-06, "loss": 0.0062, "step": 18970 }, { "epoch": 2.612525808671714, "grad_norm": 0.0058317226357758045, "learning_rate": 6.611880723174455e-06, "loss": 0.0038, "step": 18980 }, { "epoch": 2.613902271163111, "grad_norm": 0.006537840235978365, "learning_rate": 6.588401033106363e-06, "loss": 0.0004, "step": 18990 }, { "epoch": 2.615278733654508, "grad_norm": 0.005678875371813774, "learning_rate": 6.564921343038272e-06, "loss": 0.0005, "step": 19000 }, { "epoch": 2.616655196145905, "grad_norm": 0.04256076738238335, "learning_rate": 6.541441652970182e-06, "loss": 0.0004, "step": 19010 }, { "epoch": 2.6180316586373023, "grad_norm": 0.006289094686508179, "learning_rate": 6.51796196290209e-06, "loss": 0.0267, "step": 19020 }, { "epoch": 2.619408121128699, "grad_norm": 0.00480860797688365, "learning_rate": 6.494482272833999e-06, "loss": 0.0002, "step": 19030 }, { "epoch": 2.6207845836200963, "grad_norm": 0.005817321129143238, "learning_rate": 6.471002582765907e-06, "loss": 0.0524, "step": 19040 }, { "epoch": 2.6221610461114935, "grad_norm": 0.006093348376452923, "learning_rate": 6.447522892697817e-06, "loss": 0.0006, "step": 19050 }, { "epoch": 2.6235375086028907, "grad_norm": 0.04417076334357262, "learning_rate": 6.424043202629726e-06, "loss": 0.0004, "step": 19060 }, { "epoch": 2.624913971094288, "grad_norm": 0.0060739945620298386, "learning_rate": 6.4005635125616345e-06, "loss": 0.0004, "step": 19070 }, { "epoch": 2.6262904335856847, "grad_norm": 1.2983286380767822, "learning_rate": 6.377083822493543e-06, "loss": 0.0219, "step": 19080 }, { "epoch": 2.627666896077082, "grad_norm": 0.04400370642542839, "learning_rate": 6.353604132425452e-06, "loss": 0.0041, "step": 19090 }, { "epoch": 2.629043358568479, "grad_norm": 3.9816274642944336, "learning_rate": 6.330124442357362e-06, "loss": 0.0433, "step": 19100 }, { "epoch": 2.630419821059876, "grad_norm": 0.02237449772655964, "learning_rate": 6.30664475228927e-06, "loss": 0.0005, "step": 19110 }, { "epoch": 2.631796283551273, "grad_norm": 0.024591173976659775, "learning_rate": 6.283165062221179e-06, "loss": 0.0004, "step": 19120 }, { "epoch": 2.6331727460426704, "grad_norm": 0.01507965475320816, "learning_rate": 6.259685372153087e-06, "loss": 0.0328, "step": 19130 }, { "epoch": 2.6345492085340676, "grad_norm": 0.010558038018643856, "learning_rate": 6.236205682084997e-06, "loss": 0.0007, "step": 19140 }, { "epoch": 2.635925671025465, "grad_norm": 3.0602142810821533, "learning_rate": 6.212725992016906e-06, "loss": 0.0095, "step": 19150 }, { "epoch": 2.6373021335168616, "grad_norm": 0.015972502529621124, "learning_rate": 6.189246301948814e-06, "loss": 0.03, "step": 19160 }, { "epoch": 2.638678596008259, "grad_norm": 0.058577630668878555, "learning_rate": 6.165766611880724e-06, "loss": 0.0011, "step": 19170 }, { "epoch": 2.640055058499656, "grad_norm": 0.006526827812194824, "learning_rate": 6.142286921812632e-06, "loss": 0.001, "step": 19180 }, { "epoch": 2.641431520991053, "grad_norm": 0.006233698688447475, "learning_rate": 6.1188072317445405e-06, "loss": 0.0062, "step": 19190 }, { "epoch": 2.64280798348245, "grad_norm": 0.049199797213077545, "learning_rate": 6.0953275416764504e-06, "loss": 0.0117, "step": 19200 }, { "epoch": 2.6441844459738473, "grad_norm": 0.005571519490331411, "learning_rate": 6.071847851608359e-06, "loss": 0.0006, "step": 19210 }, { "epoch": 2.645560908465244, "grad_norm": 0.043522872030735016, "learning_rate": 6.048368161540268e-06, "loss": 0.001, "step": 19220 }, { "epoch": 2.6469373709566413, "grad_norm": 0.004998145159333944, "learning_rate": 6.024888471472177e-06, "loss": 0.0005, "step": 19230 }, { "epoch": 2.6483138334480385, "grad_norm": 0.0046535152941942215, "learning_rate": 6.001408781404086e-06, "loss": 0.0002, "step": 19240 }, { "epoch": 2.6496902959394357, "grad_norm": 0.005101282615214586, "learning_rate": 5.977929091335995e-06, "loss": 0.0002, "step": 19250 }, { "epoch": 2.651066758430833, "grad_norm": 0.039638858288526535, "learning_rate": 5.954449401267904e-06, "loss": 0.0006, "step": 19260 }, { "epoch": 2.6524432209222297, "grad_norm": 0.009360401891171932, "learning_rate": 5.930969711199812e-06, "loss": 0.0004, "step": 19270 }, { "epoch": 2.653819683413627, "grad_norm": 0.003996743820607662, "learning_rate": 5.907490021131722e-06, "loss": 0.0382, "step": 19280 }, { "epoch": 2.655196145905024, "grad_norm": 0.00421122694388032, "learning_rate": 5.88401033106363e-06, "loss": 0.001, "step": 19290 }, { "epoch": 2.656572608396421, "grad_norm": 0.020505066961050034, "learning_rate": 5.860530640995539e-06, "loss": 0.0002, "step": 19300 }, { "epoch": 2.657949070887818, "grad_norm": 0.011534849181771278, "learning_rate": 5.837050950927448e-06, "loss": 0.0049, "step": 19310 }, { "epoch": 2.6593255333792154, "grad_norm": 0.0037951041013002396, "learning_rate": 5.8135712608593565e-06, "loss": 0.001, "step": 19320 }, { "epoch": 2.6607019958706126, "grad_norm": 0.004129904322326183, "learning_rate": 5.7900915707912655e-06, "loss": 0.0002, "step": 19330 }, { "epoch": 2.66207845836201, "grad_norm": 0.003837422700598836, "learning_rate": 5.766611880723175e-06, "loss": 0.0004, "step": 19340 }, { "epoch": 2.6634549208534066, "grad_norm": 0.0044785537756979465, "learning_rate": 5.743132190655084e-06, "loss": 0.0239, "step": 19350 }, { "epoch": 2.664831383344804, "grad_norm": 0.005386338569223881, "learning_rate": 5.719652500586993e-06, "loss": 0.0065, "step": 19360 }, { "epoch": 2.666207845836201, "grad_norm": 0.0050216373056173325, "learning_rate": 5.696172810518902e-06, "loss": 0.0049, "step": 19370 }, { "epoch": 2.667584308327598, "grad_norm": 0.004063263535499573, "learning_rate": 5.67269312045081e-06, "loss": 0.022, "step": 19380 }, { "epoch": 2.668960770818995, "grad_norm": 2.536984443664551, "learning_rate": 5.64921343038272e-06, "loss": 0.007, "step": 19390 }, { "epoch": 2.6703372333103923, "grad_norm": 0.004037255886942148, "learning_rate": 5.625733740314628e-06, "loss": 0.0079, "step": 19400 }, { "epoch": 2.6717136958017895, "grad_norm": 0.0038250356446951628, "learning_rate": 5.602254050246537e-06, "loss": 0.0003, "step": 19410 }, { "epoch": 2.6730901582931867, "grad_norm": 0.0036689569242298603, "learning_rate": 5.578774360178446e-06, "loss": 0.0043, "step": 19420 }, { "epoch": 2.6744666207845835, "grad_norm": 0.03208768367767334, "learning_rate": 5.555294670110354e-06, "loss": 0.0014, "step": 19430 }, { "epoch": 2.6758430832759807, "grad_norm": 0.0035674653481692076, "learning_rate": 5.531814980042263e-06, "loss": 0.0003, "step": 19440 }, { "epoch": 2.677219545767378, "grad_norm": 0.0034957369789481163, "learning_rate": 5.5083352899741724e-06, "loss": 0.0006, "step": 19450 }, { "epoch": 2.6785960082587748, "grad_norm": 0.004075723700225353, "learning_rate": 5.4848555999060815e-06, "loss": 0.0003, "step": 19460 }, { "epoch": 2.679972470750172, "grad_norm": 0.0035140025429427624, "learning_rate": 5.4613759098379905e-06, "loss": 0.0272, "step": 19470 }, { "epoch": 2.681348933241569, "grad_norm": 0.003611938562244177, "learning_rate": 5.4378962197699e-06, "loss": 0.0007, "step": 19480 }, { "epoch": 2.6827253957329664, "grad_norm": 0.0033822436816990376, "learning_rate": 5.414416529701808e-06, "loss": 0.0003, "step": 19490 }, { "epoch": 2.6841018582243636, "grad_norm": 0.006646830588579178, "learning_rate": 5.390936839633718e-06, "loss": 0.0011, "step": 19500 }, { "epoch": 2.6854783207157604, "grad_norm": 0.0038820113986730576, "learning_rate": 5.367457149565626e-06, "loss": 0.0617, "step": 19510 }, { "epoch": 2.6868547832071576, "grad_norm": 0.00490087503567338, "learning_rate": 5.343977459497535e-06, "loss": 0.0006, "step": 19520 }, { "epoch": 2.688231245698555, "grad_norm": 4.868660926818848, "learning_rate": 5.320497769429444e-06, "loss": 0.0164, "step": 19530 }, { "epoch": 2.6896077081899517, "grad_norm": 0.004284149035811424, "learning_rate": 5.297018079361353e-06, "loss": 0.0008, "step": 19540 }, { "epoch": 2.690984170681349, "grad_norm": 0.05161572992801666, "learning_rate": 5.273538389293261e-06, "loss": 0.0008, "step": 19550 }, { "epoch": 2.692360633172746, "grad_norm": 0.003029824700206518, "learning_rate": 5.25005869922517e-06, "loss": 0.039, "step": 19560 }, { "epoch": 2.693737095664143, "grad_norm": 0.0042708879336714745, "learning_rate": 5.226579009157079e-06, "loss": 0.0006, "step": 19570 }, { "epoch": 2.69511355815554, "grad_norm": 0.14658531546592712, "learning_rate": 5.203099319088988e-06, "loss": 0.0268, "step": 19580 }, { "epoch": 2.6964900206469373, "grad_norm": 0.0034185429103672504, "learning_rate": 5.1796196290208974e-06, "loss": 0.0017, "step": 19590 }, { "epoch": 2.6978664831383345, "grad_norm": 0.0033439507242292166, "learning_rate": 5.156139938952806e-06, "loss": 0.0367, "step": 19600 }, { "epoch": 2.6992429456297318, "grad_norm": 0.0033403674606233835, "learning_rate": 5.1326602488847155e-06, "loss": 0.0012, "step": 19610 }, { "epoch": 2.7006194081211286, "grad_norm": 0.0036226417869329453, "learning_rate": 5.109180558816624e-06, "loss": 0.0005, "step": 19620 }, { "epoch": 2.7019958706125258, "grad_norm": 0.0379006452858448, "learning_rate": 5.085700868748533e-06, "loss": 0.0003, "step": 19630 }, { "epoch": 2.703372333103923, "grad_norm": 0.005044223740696907, "learning_rate": 5.062221178680442e-06, "loss": 0.0001, "step": 19640 }, { "epoch": 2.7047487955953198, "grad_norm": 0.003378689056262374, "learning_rate": 5.038741488612351e-06, "loss": 0.0126, "step": 19650 }, { "epoch": 2.706125258086717, "grad_norm": 0.003268527565523982, "learning_rate": 5.015261798544259e-06, "loss": 0.0018, "step": 19660 }, { "epoch": 2.7075017205781142, "grad_norm": 0.0033989809453487396, "learning_rate": 4.991782108476168e-06, "loss": 0.0009, "step": 19670 }, { "epoch": 2.7088781830695114, "grad_norm": 0.0035981147084385157, "learning_rate": 4.968302418408077e-06, "loss": 0.0008, "step": 19680 }, { "epoch": 2.7102546455609087, "grad_norm": 0.4333048164844513, "learning_rate": 4.944822728339986e-06, "loss": 0.0556, "step": 19690 }, { "epoch": 2.7116311080523054, "grad_norm": 0.003537156619131565, "learning_rate": 4.921343038271895e-06, "loss": 0.0003, "step": 19700 }, { "epoch": 2.7130075705437027, "grad_norm": 0.004502769559621811, "learning_rate": 4.8978633482038035e-06, "loss": 0.0545, "step": 19710 }, { "epoch": 2.7143840330351, "grad_norm": 0.0043348027393221855, "learning_rate": 4.874383658135713e-06, "loss": 0.0005, "step": 19720 }, { "epoch": 2.7157604955264967, "grad_norm": 0.0048585874028503895, "learning_rate": 4.850903968067622e-06, "loss": 0.024, "step": 19730 }, { "epoch": 2.717136958017894, "grad_norm": 0.004925417248159647, "learning_rate": 4.827424277999531e-06, "loss": 0.0006, "step": 19740 }, { "epoch": 2.718513420509291, "grad_norm": 0.05101995915174484, "learning_rate": 4.80394458793144e-06, "loss": 0.0058, "step": 19750 }, { "epoch": 2.7198898830006883, "grad_norm": 0.031387392431497574, "learning_rate": 4.780464897863349e-06, "loss": 0.0005, "step": 19760 }, { "epoch": 2.7212663454920856, "grad_norm": 0.0041635469533503056, "learning_rate": 4.756985207795257e-06, "loss": 0.0001, "step": 19770 }, { "epoch": 2.7226428079834823, "grad_norm": 0.004330832045525312, "learning_rate": 4.733505517727166e-06, "loss": 0.0163, "step": 19780 }, { "epoch": 2.7240192704748796, "grad_norm": 0.004832245409488678, "learning_rate": 4.710025827659075e-06, "loss": 0.0004, "step": 19790 }, { "epoch": 2.725395732966277, "grad_norm": 0.005771843250840902, "learning_rate": 4.686546137590984e-06, "loss": 0.0146, "step": 19800 }, { "epoch": 2.7267721954576736, "grad_norm": 0.004430850967764854, "learning_rate": 4.663066447522893e-06, "loss": 0.0044, "step": 19810 }, { "epoch": 2.728148657949071, "grad_norm": 0.004212536383420229, "learning_rate": 4.639586757454801e-06, "loss": 0.0583, "step": 19820 }, { "epoch": 2.729525120440468, "grad_norm": 0.005218318663537502, "learning_rate": 4.616107067386711e-06, "loss": 0.0003, "step": 19830 }, { "epoch": 2.7309015829318652, "grad_norm": 0.005164403468370438, "learning_rate": 4.5926273773186195e-06, "loss": 0.0008, "step": 19840 }, { "epoch": 2.7322780454232625, "grad_norm": 0.006057046353816986, "learning_rate": 4.5691476872505285e-06, "loss": 0.0047, "step": 19850 }, { "epoch": 2.7336545079146592, "grad_norm": 0.005149207077920437, "learning_rate": 4.5456679971824376e-06, "loss": 0.0039, "step": 19860 }, { "epoch": 2.7350309704060565, "grad_norm": 0.7401258945465088, "learning_rate": 4.522188307114347e-06, "loss": 0.0042, "step": 19870 }, { "epoch": 2.7364074328974537, "grad_norm": 0.005428907927125692, "learning_rate": 4.498708617046255e-06, "loss": 0.0003, "step": 19880 }, { "epoch": 2.7377838953888505, "grad_norm": 0.004474862478673458, "learning_rate": 4.475228926978165e-06, "loss": 0.0009, "step": 19890 }, { "epoch": 2.7391603578802477, "grad_norm": 0.0052408576011657715, "learning_rate": 4.451749236910073e-06, "loss": 0.0456, "step": 19900 }, { "epoch": 2.740536820371645, "grad_norm": 0.005202410276979208, "learning_rate": 4.428269546841982e-06, "loss": 0.0001, "step": 19910 }, { "epoch": 2.741913282863042, "grad_norm": 0.0049728890880942345, "learning_rate": 4.404789856773891e-06, "loss": 0.0002, "step": 19920 }, { "epoch": 2.7432897453544394, "grad_norm": 0.004456256981939077, "learning_rate": 4.381310166705799e-06, "loss": 0.0577, "step": 19930 }, { "epoch": 2.744666207845836, "grad_norm": 11.45100212097168, "learning_rate": 4.357830476637709e-06, "loss": 0.0329, "step": 19940 }, { "epoch": 2.7460426703372334, "grad_norm": 0.004116313997656107, "learning_rate": 4.334350786569617e-06, "loss": 0.022, "step": 19950 }, { "epoch": 2.7474191328286306, "grad_norm": 0.0049108220264315605, "learning_rate": 4.310871096501526e-06, "loss": 0.0008, "step": 19960 }, { "epoch": 2.7487955953200274, "grad_norm": 0.004905202426016331, "learning_rate": 4.287391406433435e-06, "loss": 0.0004, "step": 19970 }, { "epoch": 2.7501720578114246, "grad_norm": 0.006841752678155899, "learning_rate": 4.2639117163653445e-06, "loss": 0.0144, "step": 19980 }, { "epoch": 2.751548520302822, "grad_norm": 0.004826438147574663, "learning_rate": 4.240432026297253e-06, "loss": 0.0045, "step": 19990 }, { "epoch": 2.7529249827942186, "grad_norm": 0.005043282639235258, "learning_rate": 4.2169523362291626e-06, "loss": 0.0008, "step": 20000 }, { "epoch": 2.754301445285616, "grad_norm": 0.0049483440816402435, "learning_rate": 4.193472646161071e-06, "loss": 0.0005, "step": 20010 }, { "epoch": 2.755677907777013, "grad_norm": 0.005649138242006302, "learning_rate": 4.16999295609298e-06, "loss": 0.0564, "step": 20020 }, { "epoch": 2.7570543702684103, "grad_norm": 0.012305212207138538, "learning_rate": 4.146513266024889e-06, "loss": 0.0023, "step": 20030 }, { "epoch": 2.7584308327598075, "grad_norm": 0.307921439409256, "learning_rate": 4.123033575956797e-06, "loss": 0.0025, "step": 20040 }, { "epoch": 2.7598072952512043, "grad_norm": 1.1222519874572754, "learning_rate": 4.099553885888707e-06, "loss": 0.0056, "step": 20050 }, { "epoch": 2.7611837577426015, "grad_norm": 0.004743290599435568, "learning_rate": 4.076074195820615e-06, "loss": 0.0004, "step": 20060 }, { "epoch": 2.7625602202339987, "grad_norm": 0.006287624593824148, "learning_rate": 4.052594505752524e-06, "loss": 0.0569, "step": 20070 }, { "epoch": 2.7639366827253955, "grad_norm": 0.00570255983620882, "learning_rate": 4.029114815684433e-06, "loss": 0.0008, "step": 20080 }, { "epoch": 2.7653131452167927, "grad_norm": 0.005643543321639299, "learning_rate": 4.005635125616342e-06, "loss": 0.0006, "step": 20090 }, { "epoch": 2.76668960770819, "grad_norm": 0.006388967391103506, "learning_rate": 3.9821554355482505e-06, "loss": 0.0004, "step": 20100 }, { "epoch": 2.768066070199587, "grad_norm": 0.005465967580676079, "learning_rate": 3.9586757454801604e-06, "loss": 0.0002, "step": 20110 }, { "epoch": 2.7694425326909844, "grad_norm": 0.0689219981431961, "learning_rate": 3.935196055412069e-06, "loss": 0.0013, "step": 20120 }, { "epoch": 2.770818995182381, "grad_norm": 0.4551244378089905, "learning_rate": 3.911716365343978e-06, "loss": 0.005, "step": 20130 }, { "epoch": 2.7721954576737784, "grad_norm": 0.004078532103449106, "learning_rate": 3.888236675275887e-06, "loss": 0.0003, "step": 20140 }, { "epoch": 2.7735719201651756, "grad_norm": 0.004984228406101465, "learning_rate": 3.864756985207795e-06, "loss": 0.0007, "step": 20150 }, { "epoch": 2.7749483826565724, "grad_norm": 0.004633408971130848, "learning_rate": 3.841277295139705e-06, "loss": 0.0003, "step": 20160 }, { "epoch": 2.7763248451479696, "grad_norm": 0.004313938319683075, "learning_rate": 3.817797605071613e-06, "loss": 0.0005, "step": 20170 }, { "epoch": 2.777701307639367, "grad_norm": 0.0057646650820970535, "learning_rate": 3.794317915003522e-06, "loss": 0.0008, "step": 20180 }, { "epoch": 2.779077770130764, "grad_norm": 0.004843806382268667, "learning_rate": 3.7708382249354307e-06, "loss": 0.0212, "step": 20190 }, { "epoch": 2.7804542326221613, "grad_norm": 0.004186523612588644, "learning_rate": 3.74735853486734e-06, "loss": 0.0013, "step": 20200 }, { "epoch": 2.781830695113558, "grad_norm": 0.00556503189727664, "learning_rate": 3.723878844799249e-06, "loss": 0.0001, "step": 20210 }, { "epoch": 2.7832071576049553, "grad_norm": 0.006421599071472883, "learning_rate": 3.700399154731158e-06, "loss": 0.0003, "step": 20220 }, { "epoch": 2.7845836200963525, "grad_norm": 0.004170421045273542, "learning_rate": 3.6769194646630665e-06, "loss": 0.0255, "step": 20230 }, { "epoch": 2.7859600825877493, "grad_norm": 0.006257891654968262, "learning_rate": 3.653439774594976e-06, "loss": 0.0007, "step": 20240 }, { "epoch": 2.7873365450791465, "grad_norm": 0.004026244394481182, "learning_rate": 3.6299600845268846e-06, "loss": 0.0002, "step": 20250 }, { "epoch": 2.7887130075705437, "grad_norm": 0.08671367168426514, "learning_rate": 3.6064803944587928e-06, "loss": 0.0006, "step": 20260 }, { "epoch": 2.790089470061941, "grad_norm": 0.07655234634876251, "learning_rate": 3.5830007043907023e-06, "loss": 0.0012, "step": 20270 }, { "epoch": 2.791465932553338, "grad_norm": 0.004642071668058634, "learning_rate": 3.559521014322611e-06, "loss": 0.0502, "step": 20280 }, { "epoch": 2.792842395044735, "grad_norm": 0.11229307949542999, "learning_rate": 3.53604132425452e-06, "loss": 0.007, "step": 20290 }, { "epoch": 2.794218857536132, "grad_norm": 0.004117736127227545, "learning_rate": 3.5125616341864286e-06, "loss": 0.0335, "step": 20300 }, { "epoch": 2.7955953200275294, "grad_norm": 0.0041410657577216625, "learning_rate": 3.489081944118338e-06, "loss": 0.0009, "step": 20310 }, { "epoch": 2.796971782518926, "grad_norm": 0.13495752215385437, "learning_rate": 3.4656022540502467e-06, "loss": 0.0005, "step": 20320 }, { "epoch": 2.7983482450103234, "grad_norm": 0.004456101451069117, "learning_rate": 3.4421225639821557e-06, "loss": 0.0309, "step": 20330 }, { "epoch": 2.7997247075017206, "grad_norm": 0.004087483510375023, "learning_rate": 3.4186428739140643e-06, "loss": 0.0007, "step": 20340 }, { "epoch": 2.8011011699931174, "grad_norm": 0.004397843033075333, "learning_rate": 3.395163183845974e-06, "loss": 0.007, "step": 20350 }, { "epoch": 2.8024776324845146, "grad_norm": 0.0036495975218713284, "learning_rate": 3.3716834937778824e-06, "loss": 0.0003, "step": 20360 }, { "epoch": 2.803854094975912, "grad_norm": 0.0039611561223864555, "learning_rate": 3.3482038037097915e-06, "loss": 0.0002, "step": 20370 }, { "epoch": 2.805230557467309, "grad_norm": 0.019836846739053726, "learning_rate": 3.3247241136417e-06, "loss": 0.0584, "step": 20380 }, { "epoch": 2.8066070199587063, "grad_norm": 0.030797531828284264, "learning_rate": 3.3012444235736087e-06, "loss": 0.0349, "step": 20390 }, { "epoch": 2.807983482450103, "grad_norm": 0.005021429155021906, "learning_rate": 3.277764733505518e-06, "loss": 0.006, "step": 20400 }, { "epoch": 2.8093599449415003, "grad_norm": 0.005451617296785116, "learning_rate": 3.2542850434374264e-06, "loss": 0.0008, "step": 20410 }, { "epoch": 2.8107364074328975, "grad_norm": 0.0049826521426439285, "learning_rate": 3.230805353369336e-06, "loss": 0.0003, "step": 20420 }, { "epoch": 2.8121128699242943, "grad_norm": 0.005054602399468422, "learning_rate": 3.2073256633012445e-06, "loss": 0.0003, "step": 20430 }, { "epoch": 2.8134893324156915, "grad_norm": 0.004861037712544203, "learning_rate": 3.1838459732331536e-06, "loss": 0.0227, "step": 20440 }, { "epoch": 2.8148657949070888, "grad_norm": 0.06910067796707153, "learning_rate": 3.160366283165062e-06, "loss": 0.0208, "step": 20450 }, { "epoch": 2.816242257398486, "grad_norm": 0.07083255052566528, "learning_rate": 3.1368865930969717e-06, "loss": 0.0004, "step": 20460 }, { "epoch": 2.817618719889883, "grad_norm": 0.0043411701917648315, "learning_rate": 3.1134069030288803e-06, "loss": 0.0004, "step": 20470 }, { "epoch": 2.81899518238128, "grad_norm": 0.09297284483909607, "learning_rate": 3.089927212960789e-06, "loss": 0.0011, "step": 20480 }, { "epoch": 2.820371644872677, "grad_norm": 0.004413216840475798, "learning_rate": 3.066447522892698e-06, "loss": 0.0007, "step": 20490 }, { "epoch": 2.8217481073640744, "grad_norm": 0.44820624589920044, "learning_rate": 3.042967832824607e-06, "loss": 0.0617, "step": 20500 }, { "epoch": 2.823124569855471, "grad_norm": 0.004886975046247244, "learning_rate": 3.0194881427565156e-06, "loss": 0.0007, "step": 20510 }, { "epoch": 2.8245010323468684, "grad_norm": 0.08625947684049606, "learning_rate": 2.9960084526884247e-06, "loss": 0.0008, "step": 20520 }, { "epoch": 2.8258774948382657, "grad_norm": 0.005348458420485258, "learning_rate": 2.9725287626203337e-06, "loss": 0.0007, "step": 20530 }, { "epoch": 2.827253957329663, "grad_norm": 0.005517769604921341, "learning_rate": 2.949049072552243e-06, "loss": 0.0173, "step": 20540 }, { "epoch": 2.82863041982106, "grad_norm": 0.02463040128350258, "learning_rate": 2.9255693824841514e-06, "loss": 0.0003, "step": 20550 }, { "epoch": 2.830006882312457, "grad_norm": 0.0046638669446110725, "learning_rate": 2.90208969241606e-06, "loss": 0.0015, "step": 20560 }, { "epoch": 2.831383344803854, "grad_norm": 0.10519939661026001, "learning_rate": 2.878610002347969e-06, "loss": 0.0006, "step": 20570 }, { "epoch": 2.8327598072952513, "grad_norm": 0.004858760628849268, "learning_rate": 2.855130312279878e-06, "loss": 0.0487, "step": 20580 }, { "epoch": 2.834136269786648, "grad_norm": 0.2699757516384125, "learning_rate": 2.8316506222117868e-06, "loss": 0.0011, "step": 20590 }, { "epoch": 2.8355127322780453, "grad_norm": 0.0043495348654687405, "learning_rate": 2.808170932143696e-06, "loss": 0.0004, "step": 20600 }, { "epoch": 2.8368891947694426, "grad_norm": 0.004762982949614525, "learning_rate": 2.784691242075605e-06, "loss": 0.0043, "step": 20610 }, { "epoch": 2.83826565726084, "grad_norm": 0.005085165146738291, "learning_rate": 2.7612115520075135e-06, "loss": 0.0004, "step": 20620 }, { "epoch": 2.839642119752237, "grad_norm": 0.006411368027329445, "learning_rate": 2.7377318619394226e-06, "loss": 0.0252, "step": 20630 }, { "epoch": 2.841018582243634, "grad_norm": 0.00515042245388031, "learning_rate": 2.7142521718713316e-06, "loss": 0.0006, "step": 20640 }, { "epoch": 2.842395044735031, "grad_norm": 0.004030673298984766, "learning_rate": 2.6907724818032407e-06, "loss": 0.0002, "step": 20650 }, { "epoch": 2.8437715072264282, "grad_norm": 0.005205986555665731, "learning_rate": 2.6672927917351493e-06, "loss": 0.0027, "step": 20660 }, { "epoch": 2.845147969717825, "grad_norm": 0.005025054328143597, "learning_rate": 2.643813101667058e-06, "loss": 0.0167, "step": 20670 }, { "epoch": 2.8465244322092222, "grad_norm": 0.004381037782877684, "learning_rate": 2.620333411598967e-06, "loss": 0.013, "step": 20680 }, { "epoch": 2.8479008947006195, "grad_norm": 0.004999776836484671, "learning_rate": 2.596853721530876e-06, "loss": 0.0006, "step": 20690 }, { "epoch": 2.8492773571920162, "grad_norm": 0.0045098233968019485, "learning_rate": 2.5733740314627846e-06, "loss": 0.0004, "step": 20700 }, { "epoch": 2.8506538196834135, "grad_norm": 0.004159578587859869, "learning_rate": 2.5498943413946937e-06, "loss": 0.0002, "step": 20710 }, { "epoch": 2.8520302821748107, "grad_norm": 0.004347478039562702, "learning_rate": 2.5264146513266027e-06, "loss": 0.0003, "step": 20720 }, { "epoch": 2.853406744666208, "grad_norm": 0.08565700799226761, "learning_rate": 2.5029349612585114e-06, "loss": 0.0004, "step": 20730 }, { "epoch": 2.854783207157605, "grad_norm": 0.022747129201889038, "learning_rate": 2.4794552711904204e-06, "loss": 0.013, "step": 20740 }, { "epoch": 2.856159669649002, "grad_norm": 0.053441137075424194, "learning_rate": 2.4559755811223295e-06, "loss": 0.0003, "step": 20750 }, { "epoch": 2.857536132140399, "grad_norm": 0.004235512111335993, "learning_rate": 2.432495891054238e-06, "loss": 0.1067, "step": 20760 }, { "epoch": 2.8589125946317964, "grad_norm": 0.004918124992400408, "learning_rate": 2.409016200986147e-06, "loss": 0.0003, "step": 20770 }, { "epoch": 2.860289057123193, "grad_norm": 0.004634925164282322, "learning_rate": 2.385536510918056e-06, "loss": 0.0026, "step": 20780 }, { "epoch": 2.8616655196145904, "grad_norm": 0.004428219981491566, "learning_rate": 2.362056820849965e-06, "loss": 0.0003, "step": 20790 }, { "epoch": 2.8630419821059876, "grad_norm": 0.005164165049791336, "learning_rate": 2.338577130781874e-06, "loss": 0.0004, "step": 20800 }, { "epoch": 2.864418444597385, "grad_norm": 0.09739003330469131, "learning_rate": 2.3150974407137825e-06, "loss": 0.051, "step": 20810 }, { "epoch": 2.865794907088782, "grad_norm": 0.11762718856334686, "learning_rate": 2.2916177506456915e-06, "loss": 0.0005, "step": 20820 }, { "epoch": 2.867171369580179, "grad_norm": 0.005923726130276918, "learning_rate": 2.2681380605776006e-06, "loss": 0.0006, "step": 20830 }, { "epoch": 2.868547832071576, "grad_norm": 0.004946379456669092, "learning_rate": 2.2446583705095092e-06, "loss": 0.0006, "step": 20840 }, { "epoch": 2.8699242945629733, "grad_norm": 0.004479643888771534, "learning_rate": 2.2211786804414183e-06, "loss": 0.0005, "step": 20850 }, { "epoch": 2.87130075705437, "grad_norm": 0.04147579148411751, "learning_rate": 2.1976989903733273e-06, "loss": 0.0006, "step": 20860 }, { "epoch": 2.8726772195457673, "grad_norm": 0.005070742219686508, "learning_rate": 2.174219300305236e-06, "loss": 0.0006, "step": 20870 }, { "epoch": 2.8740536820371645, "grad_norm": 0.0067503079771995544, "learning_rate": 2.150739610237145e-06, "loss": 0.0529, "step": 20880 }, { "epoch": 2.8754301445285617, "grad_norm": 0.0054399678483605385, "learning_rate": 2.127259920169054e-06, "loss": 0.0002, "step": 20890 }, { "epoch": 2.876806607019959, "grad_norm": 0.09573568403720856, "learning_rate": 2.103780230100963e-06, "loss": 0.0467, "step": 20900 }, { "epoch": 2.8781830695113557, "grad_norm": 0.005411568563431501, "learning_rate": 2.0803005400328713e-06, "loss": 0.0026, "step": 20910 }, { "epoch": 2.879559532002753, "grad_norm": 0.008412584662437439, "learning_rate": 2.0568208499647803e-06, "loss": 0.0061, "step": 20920 }, { "epoch": 2.88093599449415, "grad_norm": 0.033355168998241425, "learning_rate": 2.0333411598966894e-06, "loss": 0.0066, "step": 20930 }, { "epoch": 2.882312456985547, "grad_norm": 0.004690178669989109, "learning_rate": 2.0098614698285984e-06, "loss": 0.0004, "step": 20940 }, { "epoch": 2.883688919476944, "grad_norm": 0.005773876793682575, "learning_rate": 1.986381779760507e-06, "loss": 0.0204, "step": 20950 }, { "epoch": 2.8850653819683414, "grad_norm": 0.004950038623064756, "learning_rate": 1.962902089692416e-06, "loss": 0.0017, "step": 20960 }, { "epoch": 2.8864418444597386, "grad_norm": 0.006148640997707844, "learning_rate": 1.939422399624325e-06, "loss": 0.0882, "step": 20970 }, { "epoch": 2.887818306951136, "grad_norm": 0.048229116946458817, "learning_rate": 1.915942709556234e-06, "loss": 0.034, "step": 20980 }, { "epoch": 2.8891947694425326, "grad_norm": 0.006473532412201166, "learning_rate": 1.8924630194881429e-06, "loss": 0.0071, "step": 20990 }, { "epoch": 2.89057123193393, "grad_norm": 0.04012025520205498, "learning_rate": 1.868983329420052e-06, "loss": 0.001, "step": 21000 }, { "epoch": 2.891947694425327, "grad_norm": 0.08814006298780441, "learning_rate": 1.8455036393519607e-06, "loss": 0.0008, "step": 21010 }, { "epoch": 2.893324156916724, "grad_norm": 0.040213849395513535, "learning_rate": 1.8220239492838698e-06, "loss": 0.0008, "step": 21020 }, { "epoch": 2.894700619408121, "grad_norm": 0.006959440186619759, "learning_rate": 1.7985442592157782e-06, "loss": 0.0004, "step": 21030 }, { "epoch": 2.8960770818995183, "grad_norm": 0.005593619309365749, "learning_rate": 1.7750645691476873e-06, "loss": 0.0088, "step": 21040 }, { "epoch": 2.897453544390915, "grad_norm": 0.005697543267160654, "learning_rate": 1.751584879079596e-06, "loss": 0.0062, "step": 21050 }, { "epoch": 2.8988300068823123, "grad_norm": 0.042203452438116074, "learning_rate": 1.7281051890115051e-06, "loss": 0.0006, "step": 21060 }, { "epoch": 2.9002064693737095, "grad_norm": 0.005650228355079889, "learning_rate": 1.704625498943414e-06, "loss": 0.0003, "step": 21070 }, { "epoch": 2.9015829318651067, "grad_norm": 0.006505673751235008, "learning_rate": 1.6811458088753228e-06, "loss": 0.0005, "step": 21080 }, { "epoch": 2.902959394356504, "grad_norm": 0.03064986877143383, "learning_rate": 1.6576661188072319e-06, "loss": 0.0003, "step": 21090 }, { "epoch": 2.9043358568479007, "grad_norm": 0.006291820667684078, "learning_rate": 1.6341864287391407e-06, "loss": 0.0012, "step": 21100 }, { "epoch": 2.905712319339298, "grad_norm": 0.005568502005189657, "learning_rate": 1.6107067386710498e-06, "loss": 0.0004, "step": 21110 }, { "epoch": 2.907088781830695, "grad_norm": 0.005912845488637686, "learning_rate": 1.5872270486029586e-06, "loss": 0.0475, "step": 21120 }, { "epoch": 2.908465244322092, "grad_norm": 0.005125691648572683, "learning_rate": 1.5637473585348676e-06, "loss": 0.0003, "step": 21130 }, { "epoch": 2.909841706813489, "grad_norm": 0.004653359763324261, "learning_rate": 1.5402676684667763e-06, "loss": 0.0006, "step": 21140 }, { "epoch": 2.9112181693048864, "grad_norm": 0.005419536959379911, "learning_rate": 1.5167879783986853e-06, "loss": 0.001, "step": 21150 }, { "epoch": 2.9125946317962836, "grad_norm": 0.08378367871046066, "learning_rate": 1.4933082883305942e-06, "loss": 0.0007, "step": 21160 }, { "epoch": 2.913971094287681, "grad_norm": 0.00514714140444994, "learning_rate": 1.469828598262503e-06, "loss": 0.0006, "step": 21170 }, { "epoch": 2.9153475567790776, "grad_norm": 0.0046628136187791824, "learning_rate": 1.4463489081944118e-06, "loss": 0.0009, "step": 21180 }, { "epoch": 2.916724019270475, "grad_norm": 1.3783539533615112, "learning_rate": 1.4228692181263207e-06, "loss": 0.0498, "step": 21190 }, { "epoch": 2.918100481761872, "grad_norm": 0.005679844878613949, "learning_rate": 1.3993895280582297e-06, "loss": 0.0002, "step": 21200 }, { "epoch": 2.919476944253269, "grad_norm": 0.10758664458990097, "learning_rate": 1.3759098379901386e-06, "loss": 0.0149, "step": 21210 }, { "epoch": 2.920853406744666, "grad_norm": 0.005442201159894466, "learning_rate": 1.3524301479220476e-06, "loss": 0.0004, "step": 21220 }, { "epoch": 2.9222298692360633, "grad_norm": 0.00516867870464921, "learning_rate": 1.3289504578539565e-06, "loss": 0.0057, "step": 21230 }, { "epoch": 2.9236063317274605, "grad_norm": 0.005495723336935043, "learning_rate": 1.3054707677858653e-06, "loss": 0.0601, "step": 21240 }, { "epoch": 2.9249827942188578, "grad_norm": 2.693856716156006, "learning_rate": 1.2819910777177741e-06, "loss": 0.0229, "step": 21250 }, { "epoch": 2.9263592567102545, "grad_norm": 0.08575022220611572, "learning_rate": 1.2585113876496832e-06, "loss": 0.0046, "step": 21260 }, { "epoch": 2.9277357192016518, "grad_norm": 0.0791582241654396, "learning_rate": 1.235031697581592e-06, "loss": 0.0295, "step": 21270 }, { "epoch": 2.929112181693049, "grad_norm": 0.0983717069029808, "learning_rate": 1.2115520075135009e-06, "loss": 0.0242, "step": 21280 }, { "epoch": 2.9304886441844458, "grad_norm": 0.006015075370669365, "learning_rate": 1.18807231744541e-06, "loss": 0.0004, "step": 21290 }, { "epoch": 2.931865106675843, "grad_norm": 0.031094053760170937, "learning_rate": 1.1645926273773185e-06, "loss": 0.0008, "step": 21300 }, { "epoch": 2.93324156916724, "grad_norm": 3.4912655353546143, "learning_rate": 1.1411129373092276e-06, "loss": 0.0622, "step": 21310 }, { "epoch": 2.9346180316586374, "grad_norm": 0.11082141101360321, "learning_rate": 1.1176332472411364e-06, "loss": 0.0037, "step": 21320 }, { "epoch": 2.9359944941500347, "grad_norm": 0.005436692386865616, "learning_rate": 1.0941535571730455e-06, "loss": 0.0188, "step": 21330 }, { "epoch": 2.9373709566414314, "grad_norm": 0.006797828711569309, "learning_rate": 1.0706738671049543e-06, "loss": 0.0004, "step": 21340 }, { "epoch": 2.9387474191328287, "grad_norm": 0.005579444579780102, "learning_rate": 1.0471941770368631e-06, "loss": 0.0029, "step": 21350 }, { "epoch": 2.940123881624226, "grad_norm": 0.004749068524688482, "learning_rate": 1.023714486968772e-06, "loss": 0.0052, "step": 21360 }, { "epoch": 2.9415003441156227, "grad_norm": 0.0834852084517479, "learning_rate": 1.0002347969006808e-06, "loss": 0.0013, "step": 21370 }, { "epoch": 2.94287680660702, "grad_norm": 0.011617741547524929, "learning_rate": 9.767551068325899e-07, "loss": 0.0251, "step": 21380 }, { "epoch": 2.944253269098417, "grad_norm": 0.0056719304993748665, "learning_rate": 9.532754167644988e-07, "loss": 0.0006, "step": 21390 }, { "epoch": 2.9456297315898143, "grad_norm": 0.007793138734996319, "learning_rate": 9.297957266964078e-07, "loss": 0.0005, "step": 21400 }, { "epoch": 2.9470061940812116, "grad_norm": 0.009565806947648525, "learning_rate": 9.063160366283166e-07, "loss": 0.0005, "step": 21410 }, { "epoch": 2.9483826565726083, "grad_norm": 0.005277134012430906, "learning_rate": 8.828363465602254e-07, "loss": 0.0033, "step": 21420 }, { "epoch": 2.9497591190640056, "grad_norm": 0.0062510790303349495, "learning_rate": 8.593566564921343e-07, "loss": 0.0015, "step": 21430 }, { "epoch": 2.951135581555403, "grad_norm": 0.005442068446427584, "learning_rate": 8.358769664240432e-07, "loss": 0.0235, "step": 21440 }, { "epoch": 2.9525120440467996, "grad_norm": 0.00562697509303689, "learning_rate": 8.123972763559522e-07, "loss": 0.0002, "step": 21450 }, { "epoch": 2.953888506538197, "grad_norm": 0.005244623403996229, "learning_rate": 7.889175862878611e-07, "loss": 0.0002, "step": 21460 }, { "epoch": 2.955264969029594, "grad_norm": 0.0052780346013605595, "learning_rate": 7.654378962197699e-07, "loss": 0.0556, "step": 21470 }, { "epoch": 2.956641431520991, "grad_norm": 0.029858004301786423, "learning_rate": 7.419582061516789e-07, "loss": 0.0012, "step": 21480 }, { "epoch": 2.958017894012388, "grad_norm": 0.12054148316383362, "learning_rate": 7.184785160835877e-07, "loss": 0.0027, "step": 21490 }, { "epoch": 2.9593943565037852, "grad_norm": 0.005949034355580807, "learning_rate": 6.949988260154967e-07, "loss": 0.024, "step": 21500 }, { "epoch": 2.9607708189951825, "grad_norm": 0.005305220372974873, "learning_rate": 6.715191359474055e-07, "loss": 0.0007, "step": 21510 }, { "epoch": 2.9621472814865797, "grad_norm": 0.037529367953538895, "learning_rate": 6.480394458793143e-07, "loss": 0.008, "step": 21520 }, { "epoch": 2.9635237439779765, "grad_norm": 0.005595346447080374, "learning_rate": 6.245597558112233e-07, "loss": 0.0009, "step": 21530 }, { "epoch": 2.9649002064693737, "grad_norm": 0.005723931826651096, "learning_rate": 6.010800657431322e-07, "loss": 0.0006, "step": 21540 }, { "epoch": 2.966276668960771, "grad_norm": 0.05509048327803612, "learning_rate": 5.776003756750411e-07, "loss": 0.0431, "step": 21550 }, { "epoch": 2.9676531314521677, "grad_norm": 0.09578588604927063, "learning_rate": 5.5412068560695e-07, "loss": 0.0007, "step": 21560 }, { "epoch": 2.969029593943565, "grad_norm": 0.004910743795335293, "learning_rate": 5.30640995538859e-07, "loss": 0.0186, "step": 21570 }, { "epoch": 2.970406056434962, "grad_norm": 0.03952249139547348, "learning_rate": 5.071613054707678e-07, "loss": 0.0003, "step": 21580 }, { "epoch": 2.9717825189263594, "grad_norm": 0.005249264650046825, "learning_rate": 4.836816154026767e-07, "loss": 0.0055, "step": 21590 }, { "epoch": 2.9731589814177566, "grad_norm": 0.005884993821382523, "learning_rate": 4.6020192533458564e-07, "loss": 0.0005, "step": 21600 }, { "epoch": 2.9745354439091534, "grad_norm": 0.00481320358812809, "learning_rate": 4.367222352664945e-07, "loss": 0.001, "step": 21610 }, { "epoch": 2.9759119064005506, "grad_norm": 0.004874872509390116, "learning_rate": 4.132425451984034e-07, "loss": 0.0002, "step": 21620 }, { "epoch": 2.977288368891948, "grad_norm": 0.005336121190339327, "learning_rate": 3.897628551303123e-07, "loss": 0.0512, "step": 21630 }, { "epoch": 2.9786648313833446, "grad_norm": 0.04259374365210533, "learning_rate": 3.662831650622212e-07, "loss": 0.0248, "step": 21640 }, { "epoch": 2.980041293874742, "grad_norm": 0.005379998590797186, "learning_rate": 3.428034749941301e-07, "loss": 0.0005, "step": 21650 }, { "epoch": 2.981417756366139, "grad_norm": 0.0341486819088459, "learning_rate": 3.19323784926039e-07, "loss": 0.0006, "step": 21660 }, { "epoch": 2.9827942188575363, "grad_norm": 0.0051844920963048935, "learning_rate": 2.958440948579479e-07, "loss": 0.0069, "step": 21670 }, { "epoch": 2.9841706813489335, "grad_norm": 0.0054930453188717365, "learning_rate": 2.723644047898568e-07, "loss": 0.0008, "step": 21680 }, { "epoch": 2.9855471438403303, "grad_norm": 0.00616528419777751, "learning_rate": 2.4888471472176566e-07, "loss": 0.0002, "step": 21690 }, { "epoch": 2.9869236063317275, "grad_norm": 0.06524072587490082, "learning_rate": 2.2540502465367455e-07, "loss": 0.0007, "step": 21700 }, { "epoch": 2.9883000688231247, "grad_norm": 0.005343121010810137, "learning_rate": 2.019253345855835e-07, "loss": 0.001, "step": 21710 }, { "epoch": 2.9896765313145215, "grad_norm": 0.005664136726409197, "learning_rate": 1.7844564451749238e-07, "loss": 0.0006, "step": 21720 }, { "epoch": 2.9910529938059187, "grad_norm": 1.8913332223892212, "learning_rate": 1.5496595444940128e-07, "loss": 0.034, "step": 21730 }, { "epoch": 2.992429456297316, "grad_norm": 0.0076236422173678875, "learning_rate": 1.314862643813102e-07, "loss": 0.0004, "step": 21740 }, { "epoch": 2.993805918788713, "grad_norm": 0.03445188328623772, "learning_rate": 1.0800657431321907e-07, "loss": 0.0157, "step": 21750 }, { "epoch": 2.9951823812801104, "grad_norm": 0.032660964876413345, "learning_rate": 8.452688424512798e-08, "loss": 0.0006, "step": 21760 }, { "epoch": 2.996558843771507, "grad_norm": 0.10467182099819183, "learning_rate": 6.104719417703687e-08, "loss": 0.0004, "step": 21770 }, { "epoch": 2.9979353062629044, "grad_norm": 0.005670404527336359, "learning_rate": 3.7567504108945765e-08, "loss": 0.0004, "step": 21780 }, { "epoch": 2.9993117687543016, "grad_norm": 0.006718012038618326, "learning_rate": 1.408781404085466e-08, "loss": 0.0011, "step": 21790 }, { "epoch": 3.0, "eval_accuracy": 0.9970061940812113, "eval_f1": 0.9681435371658733, "eval_loss": 0.013353521004319191, "eval_precision": 0.9814402375649591, "eval_recall": 0.9552023121387283, "eval_runtime": 51.1984, "eval_samples_per_second": 567.596, "eval_steps_per_second": 35.489, "step": 21795 } ], "logging_steps": 10, "max_steps": 21795, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.619363385712435e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }