{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.8377807640622, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007679017085813016, "grad_norm": 9.727725365375315, "learning_rate": 1.9969278033794163e-05, "loss": 1.0813, "step": 10 }, { "epoch": 0.015358034171626032, "grad_norm": 5.891390510276554, "learning_rate": 1.993855606758833e-05, "loss": 0.2409, "step": 20 }, { "epoch": 0.02303705125743905, "grad_norm": 5.316733043697516, "learning_rate": 1.990783410138249e-05, "loss": 0.1393, "step": 30 }, { "epoch": 0.030716068343252065, "grad_norm": 1.7740681981145512, "learning_rate": 1.9877112135176652e-05, "loss": 0.0493, "step": 40 }, { "epoch": 0.03839508542906508, "grad_norm": 0.21916356325831438, "learning_rate": 1.9846390168970814e-05, "loss": 0.0162, "step": 50 }, { "epoch": 0.0460741025148781, "grad_norm": 0.10526533851381863, "learning_rate": 1.981566820276498e-05, "loss": 0.0091, "step": 60 }, { "epoch": 0.05375311960069111, "grad_norm": 0.09814045024832824, "learning_rate": 1.978494623655914e-05, "loss": 0.0059, "step": 70 }, { "epoch": 0.06143213668650413, "grad_norm": 0.07005501460032271, "learning_rate": 1.9754224270353303e-05, "loss": 0.004, "step": 80 }, { "epoch": 0.06911115377231715, "grad_norm": 0.05895087166496951, "learning_rate": 1.9723502304147465e-05, "loss": 0.0029, "step": 90 }, { "epoch": 0.07679017085813016, "grad_norm": 0.05370999256341492, "learning_rate": 1.969278033794163e-05, "loss": 0.0021, "step": 100 }, { "epoch": 0.08446918794394317, "grad_norm": 0.03834816398116016, "learning_rate": 1.9662058371735792e-05, "loss": 0.0017, "step": 110 }, { "epoch": 0.0921482050297562, "grad_norm": 0.02734932863664795, "learning_rate": 1.9631336405529954e-05, "loss": 0.0013, "step": 120 }, { "epoch": 0.0998272221155692, "grad_norm": 0.036636985676517964, "learning_rate": 1.960061443932412e-05, "loss": 0.0011, "step": 130 }, { "epoch": 0.10750623920138222, "grad_norm": 0.02624781225455215, "learning_rate": 1.956989247311828e-05, "loss": 0.001, "step": 140 }, { "epoch": 0.11518525628719524, "grad_norm": 0.020107478390890325, "learning_rate": 1.9539170506912443e-05, "loss": 0.0009, "step": 150 }, { "epoch": 0.12286427337300826, "grad_norm": 0.01800807960296447, "learning_rate": 1.9508448540706605e-05, "loss": 0.0008, "step": 160 }, { "epoch": 0.13054329045882127, "grad_norm": 0.020228009242971072, "learning_rate": 1.947772657450077e-05, "loss": 0.0008, "step": 170 }, { "epoch": 0.1382223075446343, "grad_norm": 0.014493824912246944, "learning_rate": 1.9447004608294932e-05, "loss": 0.0007, "step": 180 }, { "epoch": 0.1459013246304473, "grad_norm": 0.017644672369553523, "learning_rate": 1.9416282642089094e-05, "loss": 0.0007, "step": 190 }, { "epoch": 0.1535803417162603, "grad_norm": 0.010751743718197211, "learning_rate": 1.9385560675883256e-05, "loss": 0.0007, "step": 200 }, { "epoch": 0.16125935880207334, "grad_norm": 0.010740308456505745, "learning_rate": 1.935483870967742e-05, "loss": 0.0006, "step": 210 }, { "epoch": 0.16893837588788635, "grad_norm": 0.012713528482636717, "learning_rate": 1.9324116743471583e-05, "loss": 0.0006, "step": 220 }, { "epoch": 0.17661739297369936, "grad_norm": 0.015802969815143914, "learning_rate": 1.9293394777265745e-05, "loss": 0.0006, "step": 230 }, { "epoch": 0.1842964100595124, "grad_norm": 0.014091331879241371, "learning_rate": 1.926267281105991e-05, "loss": 0.0006, "step": 240 }, { "epoch": 0.1919754271453254, "grad_norm": 0.015514382294291343, "learning_rate": 1.923195084485407e-05, "loss": 0.0006, "step": 250 }, { "epoch": 0.1996544442311384, "grad_norm": 0.013206328203358594, "learning_rate": 1.9201228878648233e-05, "loss": 0.0005, "step": 260 }, { "epoch": 0.20733346131695143, "grad_norm": 0.009242022580630511, "learning_rate": 1.91705069124424e-05, "loss": 0.0006, "step": 270 }, { "epoch": 0.21501247840276444, "grad_norm": 0.020571611640529425, "learning_rate": 1.913978494623656e-05, "loss": 0.0006, "step": 280 }, { "epoch": 0.22269149548857747, "grad_norm": 0.010955465950333863, "learning_rate": 1.9109062980030722e-05, "loss": 0.0006, "step": 290 }, { "epoch": 0.23037051257439048, "grad_norm": 0.010007446119499878, "learning_rate": 1.9078341013824884e-05, "loss": 0.0005, "step": 300 }, { "epoch": 0.23804952966020348, "grad_norm": 0.007350763980205989, "learning_rate": 1.904761904761905e-05, "loss": 0.0005, "step": 310 }, { "epoch": 0.24572854674601652, "grad_norm": 0.01779827202427705, "learning_rate": 1.901689708141321e-05, "loss": 0.0005, "step": 320 }, { "epoch": 0.25340756383182955, "grad_norm": 0.011252332653060129, "learning_rate": 1.8986175115207373e-05, "loss": 0.0005, "step": 330 }, { "epoch": 0.26108658091764253, "grad_norm": 0.008048724916684773, "learning_rate": 1.895545314900154e-05, "loss": 0.0005, "step": 340 }, { "epoch": 0.26876559800345556, "grad_norm": 0.008708169011978722, "learning_rate": 1.89247311827957e-05, "loss": 0.0005, "step": 350 }, { "epoch": 0.2764446150892686, "grad_norm": 0.011334018261947772, "learning_rate": 1.8894009216589862e-05, "loss": 0.0005, "step": 360 }, { "epoch": 0.2841236321750816, "grad_norm": 0.012541055386254458, "learning_rate": 1.8863287250384027e-05, "loss": 0.0005, "step": 370 }, { "epoch": 0.2918026492608946, "grad_norm": 0.00797864009319901, "learning_rate": 1.883256528417819e-05, "loss": 0.0005, "step": 380 }, { "epoch": 0.29948166634670764, "grad_norm": 0.020543218292001533, "learning_rate": 1.880184331797235e-05, "loss": 0.0005, "step": 390 }, { "epoch": 0.3071606834325206, "grad_norm": 0.008833716167768964, "learning_rate": 1.8771121351766516e-05, "loss": 0.0005, "step": 400 }, { "epoch": 0.31483970051833365, "grad_norm": 0.01304769806714082, "learning_rate": 1.8740399385560678e-05, "loss": 0.0005, "step": 410 }, { "epoch": 0.3225187176041467, "grad_norm": 0.0069234640254174085, "learning_rate": 1.870967741935484e-05, "loss": 0.0005, "step": 420 }, { "epoch": 0.33019773468995967, "grad_norm": 0.006673799615850355, "learning_rate": 1.8678955453149005e-05, "loss": 0.0005, "step": 430 }, { "epoch": 0.3378767517757727, "grad_norm": 0.008446803942572372, "learning_rate": 1.8648233486943167e-05, "loss": 0.0005, "step": 440 }, { "epoch": 0.34555576886158573, "grad_norm": 0.007927264290360291, "learning_rate": 1.861751152073733e-05, "loss": 0.0005, "step": 450 }, { "epoch": 0.3532347859473987, "grad_norm": 0.008214795632921428, "learning_rate": 1.858678955453149e-05, "loss": 0.0005, "step": 460 }, { "epoch": 0.36091380303321174, "grad_norm": 0.0753894373723428, "learning_rate": 1.8556067588325656e-05, "loss": 0.0005, "step": 470 }, { "epoch": 0.3685928201190248, "grad_norm": 0.023758962572301385, "learning_rate": 1.8525345622119818e-05, "loss": 0.0011, "step": 480 }, { "epoch": 0.37627183720483776, "grad_norm": 0.008016402676739795, "learning_rate": 1.849462365591398e-05, "loss": 0.0007, "step": 490 }, { "epoch": 0.3839508542906508, "grad_norm": 0.008412657527867743, "learning_rate": 1.8463901689708145e-05, "loss": 0.0005, "step": 500 }, { "epoch": 0.3916298713764638, "grad_norm": 0.009837140434012532, "learning_rate": 1.8433179723502307e-05, "loss": 0.0005, "step": 510 }, { "epoch": 0.3993088884622768, "grad_norm": 0.007645537883711128, "learning_rate": 1.840245775729647e-05, "loss": 0.0005, "step": 520 }, { "epoch": 0.40698790554808983, "grad_norm": 0.009833102435437406, "learning_rate": 1.837173579109063e-05, "loss": 0.0005, "step": 530 }, { "epoch": 0.41466692263390287, "grad_norm": 0.011054813245860173, "learning_rate": 1.8341013824884796e-05, "loss": 0.0004, "step": 540 }, { "epoch": 0.4223459397197159, "grad_norm": 0.007490270162929283, "learning_rate": 1.8310291858678958e-05, "loss": 0.0004, "step": 550 }, { "epoch": 0.4300249568055289, "grad_norm": 0.004870404631077036, "learning_rate": 1.827956989247312e-05, "loss": 0.0004, "step": 560 }, { "epoch": 0.4377039738913419, "grad_norm": 0.0066189666941918876, "learning_rate": 1.8248847926267285e-05, "loss": 0.0004, "step": 570 }, { "epoch": 0.44538299097715495, "grad_norm": 0.006057086083441172, "learning_rate": 1.8218125960061447e-05, "loss": 0.0004, "step": 580 }, { "epoch": 0.4530620080629679, "grad_norm": 0.013285236122907804, "learning_rate": 1.818740399385561e-05, "loss": 0.0004, "step": 590 }, { "epoch": 0.46074102514878096, "grad_norm": 0.0060099205496900975, "learning_rate": 1.815668202764977e-05, "loss": 0.0004, "step": 600 }, { "epoch": 0.468420042234594, "grad_norm": 0.014047318223341236, "learning_rate": 1.8125960061443936e-05, "loss": 0.0004, "step": 610 }, { "epoch": 0.47609905932040697, "grad_norm": 0.006607072647025668, "learning_rate": 1.8095238095238097e-05, "loss": 0.0004, "step": 620 }, { "epoch": 0.48377807640622, "grad_norm": 0.006231396108942757, "learning_rate": 1.806451612903226e-05, "loss": 0.0004, "step": 630 }, { "epoch": 0.49145709349203304, "grad_norm": 0.007977818806990321, "learning_rate": 1.803379416282642e-05, "loss": 0.0004, "step": 640 }, { "epoch": 0.499136110577846, "grad_norm": 0.004452562656745065, "learning_rate": 1.8003072196620586e-05, "loss": 0.0004, "step": 650 }, { "epoch": 0.5068151276636591, "grad_norm": 0.014825032602307369, "learning_rate": 1.7972350230414748e-05, "loss": 0.0004, "step": 660 }, { "epoch": 0.5144941447494721, "grad_norm": 0.007726258660291533, "learning_rate": 1.794162826420891e-05, "loss": 0.0004, "step": 670 }, { "epoch": 0.5221731618352851, "grad_norm": 0.004901555909874864, "learning_rate": 1.7910906298003075e-05, "loss": 0.0004, "step": 680 }, { "epoch": 0.5298521789210981, "grad_norm": 0.008374026896139112, "learning_rate": 1.7880184331797237e-05, "loss": 0.0004, "step": 690 }, { "epoch": 0.5375311960069111, "grad_norm": 0.012547419917709076, "learning_rate": 1.78494623655914e-05, "loss": 0.0004, "step": 700 }, { "epoch": 0.5452102130927241, "grad_norm": 0.006258152294015092, "learning_rate": 1.781874039938556e-05, "loss": 0.0003, "step": 710 }, { "epoch": 0.5528892301785372, "grad_norm": 0.005083584292665606, "learning_rate": 1.7788018433179726e-05, "loss": 0.0004, "step": 720 }, { "epoch": 0.5605682472643502, "grad_norm": 0.004891854049254944, "learning_rate": 1.7757296466973888e-05, "loss": 0.0003, "step": 730 }, { "epoch": 0.5682472643501632, "grad_norm": 0.0072081474858841825, "learning_rate": 1.772657450076805e-05, "loss": 0.0003, "step": 740 }, { "epoch": 0.5759262814359762, "grad_norm": 0.007757147865401733, "learning_rate": 1.7695852534562215e-05, "loss": 0.0003, "step": 750 }, { "epoch": 0.5836052985217892, "grad_norm": 0.009479916603674836, "learning_rate": 1.7665130568356377e-05, "loss": 0.0003, "step": 760 }, { "epoch": 0.5912843156076022, "grad_norm": 0.0032364838449939247, "learning_rate": 1.763440860215054e-05, "loss": 0.0003, "step": 770 }, { "epoch": 0.5989633326934153, "grad_norm": 0.006901552705099111, "learning_rate": 1.76036866359447e-05, "loss": 0.0003, "step": 780 }, { "epoch": 0.6066423497792283, "grad_norm": 0.015422089144709121, "learning_rate": 1.7572964669738866e-05, "loss": 0.0004, "step": 790 }, { "epoch": 0.6143213668650412, "grad_norm": 0.008708065391951672, "learning_rate": 1.7542242703533028e-05, "loss": 0.0003, "step": 800 }, { "epoch": 0.6220003839508543, "grad_norm": 0.005612862812218234, "learning_rate": 1.751152073732719e-05, "loss": 0.0004, "step": 810 }, { "epoch": 0.6296794010366673, "grad_norm": 0.004164448177684985, "learning_rate": 1.748079877112135e-05, "loss": 0.0003, "step": 820 }, { "epoch": 0.6373584181224803, "grad_norm": 0.005805489468439844, "learning_rate": 1.7450076804915517e-05, "loss": 0.0003, "step": 830 }, { "epoch": 0.6450374352082934, "grad_norm": 0.0058336408998533274, "learning_rate": 1.741935483870968e-05, "loss": 0.0003, "step": 840 }, { "epoch": 0.6527164522941064, "grad_norm": 0.004581976763955479, "learning_rate": 1.738863287250384e-05, "loss": 0.0003, "step": 850 }, { "epoch": 0.6603954693799193, "grad_norm": 0.007306575244655952, "learning_rate": 1.7357910906298005e-05, "loss": 0.0003, "step": 860 }, { "epoch": 0.6680744864657324, "grad_norm": 0.003820631484850308, "learning_rate": 1.7327188940092167e-05, "loss": 0.0003, "step": 870 }, { "epoch": 0.6757535035515454, "grad_norm": 0.0051408749682255814, "learning_rate": 1.729646697388633e-05, "loss": 0.0003, "step": 880 }, { "epoch": 0.6834325206373584, "grad_norm": 0.006921982048724673, "learning_rate": 1.726574500768049e-05, "loss": 0.0003, "step": 890 }, { "epoch": 0.6911115377231715, "grad_norm": 0.004784894049846067, "learning_rate": 1.7235023041474656e-05, "loss": 0.0003, "step": 900 }, { "epoch": 0.6987905548089844, "grad_norm": 0.0031159854313391635, "learning_rate": 1.7204301075268818e-05, "loss": 0.0003, "step": 910 }, { "epoch": 0.7064695718947974, "grad_norm": 0.006605791714352256, "learning_rate": 1.717357910906298e-05, "loss": 0.0003, "step": 920 }, { "epoch": 0.7141485889806105, "grad_norm": 0.004936488600986064, "learning_rate": 1.7142857142857142e-05, "loss": 0.0003, "step": 930 }, { "epoch": 0.7218276060664235, "grad_norm": 0.007674349562955001, "learning_rate": 1.7112135176651307e-05, "loss": 0.0003, "step": 940 }, { "epoch": 0.7295066231522365, "grad_norm": 0.0076727356784045695, "learning_rate": 1.708141321044547e-05, "loss": 0.0003, "step": 950 }, { "epoch": 0.7371856402380496, "grad_norm": 0.00553702977922986, "learning_rate": 1.705069124423963e-05, "loss": 0.0003, "step": 960 }, { "epoch": 0.7448646573238625, "grad_norm": 0.007299313911910154, "learning_rate": 1.7019969278033796e-05, "loss": 0.0003, "step": 970 }, { "epoch": 0.7525436744096755, "grad_norm": 0.004258926555903475, "learning_rate": 1.6989247311827958e-05, "loss": 0.0003, "step": 980 }, { "epoch": 0.7602226914954886, "grad_norm": 0.004547740732820229, "learning_rate": 1.695852534562212e-05, "loss": 0.0003, "step": 990 }, { "epoch": 0.7679017085813016, "grad_norm": 0.005203964533047756, "learning_rate": 1.6927803379416285e-05, "loss": 0.0003, "step": 1000 }, { "epoch": 0.7755807256671146, "grad_norm": 0.01302332966364172, "learning_rate": 1.6897081413210447e-05, "loss": 0.0003, "step": 1010 }, { "epoch": 0.7832597427529276, "grad_norm": 0.0093180048231896, "learning_rate": 1.686635944700461e-05, "loss": 0.0003, "step": 1020 }, { "epoch": 0.7909387598387406, "grad_norm": 0.008803117247590506, "learning_rate": 1.683563748079877e-05, "loss": 0.0003, "step": 1030 }, { "epoch": 0.7986177769245536, "grad_norm": 0.0048423473942718395, "learning_rate": 1.6804915514592936e-05, "loss": 0.0003, "step": 1040 }, { "epoch": 0.8062967940103667, "grad_norm": 0.00871594900669087, "learning_rate": 1.6774193548387098e-05, "loss": 0.0003, "step": 1050 }, { "epoch": 0.8139758110961797, "grad_norm": 0.004213591332826992, "learning_rate": 1.674347158218126e-05, "loss": 0.0003, "step": 1060 }, { "epoch": 0.8216548281819928, "grad_norm": 0.008233410874863824, "learning_rate": 1.6712749615975425e-05, "loss": 0.0003, "step": 1070 }, { "epoch": 0.8293338452678057, "grad_norm": 0.004842583832484554, "learning_rate": 1.6682027649769587e-05, "loss": 0.0003, "step": 1080 }, { "epoch": 0.8370128623536187, "grad_norm": 0.012551492004310723, "learning_rate": 1.665130568356375e-05, "loss": 0.0003, "step": 1090 }, { "epoch": 0.8446918794394318, "grad_norm": 0.010999047619194315, "learning_rate": 1.6620583717357914e-05, "loss": 0.0003, "step": 1100 }, { "epoch": 0.8523708965252448, "grad_norm": 0.00940161449318046, "learning_rate": 1.6589861751152075e-05, "loss": 0.0003, "step": 1110 }, { "epoch": 0.8600499136110578, "grad_norm": 0.005629135501887116, "learning_rate": 1.6559139784946237e-05, "loss": 0.0003, "step": 1120 }, { "epoch": 0.8677289306968708, "grad_norm": 0.005695864014209226, "learning_rate": 1.6528417818740403e-05, "loss": 0.0003, "step": 1130 }, { "epoch": 0.8754079477826838, "grad_norm": 0.008141397000999681, "learning_rate": 1.6497695852534564e-05, "loss": 0.0003, "step": 1140 }, { "epoch": 0.8830869648684968, "grad_norm": 0.010312822716836551, "learning_rate": 1.6466973886328726e-05, "loss": 0.0003, "step": 1150 }, { "epoch": 0.8907659819543099, "grad_norm": 0.004299526696605698, "learning_rate": 1.643625192012289e-05, "loss": 0.0003, "step": 1160 }, { "epoch": 0.8984449990401229, "grad_norm": 0.007880227129562899, "learning_rate": 1.6405529953917053e-05, "loss": 0.0003, "step": 1170 }, { "epoch": 0.9061240161259358, "grad_norm": 0.0038386080131062204, "learning_rate": 1.6374807987711215e-05, "loss": 0.0003, "step": 1180 }, { "epoch": 0.9138030332117489, "grad_norm": 0.005902343886207709, "learning_rate": 1.6344086021505377e-05, "loss": 0.0003, "step": 1190 }, { "epoch": 0.9214820502975619, "grad_norm": 0.005315908218014497, "learning_rate": 1.6313364055299542e-05, "loss": 0.0003, "step": 1200 }, { "epoch": 0.9291610673833749, "grad_norm": 0.004817638329770463, "learning_rate": 1.6282642089093704e-05, "loss": 0.0003, "step": 1210 }, { "epoch": 0.936840084469188, "grad_norm": 0.004282276912369252, "learning_rate": 1.6251920122887866e-05, "loss": 0.0003, "step": 1220 }, { "epoch": 0.944519101555001, "grad_norm": 0.0030553467454727474, "learning_rate": 1.622119815668203e-05, "loss": 0.0003, "step": 1230 }, { "epoch": 0.9521981186408139, "grad_norm": 0.003669066757736195, "learning_rate": 1.6190476190476193e-05, "loss": 0.0003, "step": 1240 }, { "epoch": 0.959877135726627, "grad_norm": 0.007997281797816693, "learning_rate": 1.6159754224270355e-05, "loss": 0.0003, "step": 1250 }, { "epoch": 0.96755615281244, "grad_norm": 0.0033293025096136717, "learning_rate": 1.6129032258064517e-05, "loss": 0.0003, "step": 1260 }, { "epoch": 0.975235169898253, "grad_norm": 0.0090924908378621, "learning_rate": 1.6098310291858682e-05, "loss": 0.0003, "step": 1270 }, { "epoch": 0.9829141869840661, "grad_norm": 0.01333845903017061, "learning_rate": 1.6067588325652844e-05, "loss": 0.0003, "step": 1280 }, { "epoch": 0.990593204069879, "grad_norm": 0.004317437784916082, "learning_rate": 1.6036866359447006e-05, "loss": 0.0003, "step": 1290 }, { "epoch": 0.998272221155692, "grad_norm": 0.006218136692693676, "learning_rate": 1.600614439324117e-05, "loss": 0.0003, "step": 1300 }, { "epoch": 1.005375311960069, "grad_norm": 0.012722862337500376, "learning_rate": 1.5975422427035333e-05, "loss": 0.0003, "step": 1310 }, { "epoch": 1.0130543290458822, "grad_norm": 0.012347661878448313, "learning_rate": 1.5944700460829495e-05, "loss": 0.0003, "step": 1320 }, { "epoch": 1.0207333461316952, "grad_norm": 0.013680655581236744, "learning_rate": 1.5913978494623657e-05, "loss": 0.0003, "step": 1330 }, { "epoch": 1.0284123632175082, "grad_norm": 0.008212234787974039, "learning_rate": 1.5883256528417822e-05, "loss": 0.0003, "step": 1340 }, { "epoch": 1.0360913803033212, "grad_norm": 0.004326507441533446, "learning_rate": 1.5852534562211984e-05, "loss": 0.0003, "step": 1350 }, { "epoch": 1.0437703973891341, "grad_norm": 0.006702741273813344, "learning_rate": 1.5821812596006145e-05, "loss": 0.0003, "step": 1360 }, { "epoch": 1.051449414474947, "grad_norm": 0.007151020722561525, "learning_rate": 1.5791090629800307e-05, "loss": 0.0003, "step": 1370 }, { "epoch": 1.0591284315607603, "grad_norm": 0.012743255489323653, "learning_rate": 1.5760368663594473e-05, "loss": 0.0003, "step": 1380 }, { "epoch": 1.0668074486465733, "grad_norm": 0.008818497275557231, "learning_rate": 1.5729646697388634e-05, "loss": 0.0003, "step": 1390 }, { "epoch": 1.0744864657323863, "grad_norm": 0.002839593309524096, "learning_rate": 1.5698924731182796e-05, "loss": 0.0003, "step": 1400 }, { "epoch": 1.0821654828181992, "grad_norm": 0.006499416407964513, "learning_rate": 1.566820276497696e-05, "loss": 0.0003, "step": 1410 }, { "epoch": 1.0898444999040122, "grad_norm": 0.0043902156072960176, "learning_rate": 1.5637480798771123e-05, "loss": 0.0003, "step": 1420 }, { "epoch": 1.0975235169898252, "grad_norm": 0.011284861249978177, "learning_rate": 1.5606758832565285e-05, "loss": 0.0003, "step": 1430 }, { "epoch": 1.1052025340756384, "grad_norm": 0.004036096758704275, "learning_rate": 1.5576036866359447e-05, "loss": 0.0003, "step": 1440 }, { "epoch": 1.1128815511614514, "grad_norm": 0.006193352867852987, "learning_rate": 1.5545314900153612e-05, "loss": 0.0003, "step": 1450 }, { "epoch": 1.1205605682472644, "grad_norm": 0.0043243328755989815, "learning_rate": 1.5514592933947774e-05, "loss": 0.0003, "step": 1460 }, { "epoch": 1.1282395853330773, "grad_norm": 0.005942275681812123, "learning_rate": 1.5483870967741936e-05, "loss": 0.0003, "step": 1470 }, { "epoch": 1.1359186024188903, "grad_norm": 0.004708153262372667, "learning_rate": 1.5453149001536098e-05, "loss": 0.0003, "step": 1480 }, { "epoch": 1.1435976195047033, "grad_norm": 0.007691673975032789, "learning_rate": 1.5422427035330263e-05, "loss": 0.0003, "step": 1490 }, { "epoch": 1.1512766365905165, "grad_norm": 0.012308448692961365, "learning_rate": 1.5391705069124425e-05, "loss": 0.0003, "step": 1500 }, { "epoch": 1.1589556536763295, "grad_norm": 0.008574072497790838, "learning_rate": 1.5360983102918587e-05, "loss": 0.0003, "step": 1510 }, { "epoch": 1.1666346707621424, "grad_norm": 0.0022349584876796874, "learning_rate": 1.5330261136712752e-05, "loss": 0.0003, "step": 1520 }, { "epoch": 1.1743136878479554, "grad_norm": 0.0039426732001856726, "learning_rate": 1.5299539170506914e-05, "loss": 0.0003, "step": 1530 }, { "epoch": 1.1819927049337684, "grad_norm": 0.0076643241645843055, "learning_rate": 1.5268817204301076e-05, "loss": 0.0003, "step": 1540 }, { "epoch": 1.1896717220195816, "grad_norm": 0.00801480580123044, "learning_rate": 1.523809523809524e-05, "loss": 0.0003, "step": 1550 }, { "epoch": 1.1973507391053946, "grad_norm": 0.0063236711582617235, "learning_rate": 1.5207373271889403e-05, "loss": 0.0003, "step": 1560 }, { "epoch": 1.2050297561912076, "grad_norm": 0.0026532800936888, "learning_rate": 1.5176651305683565e-05, "loss": 0.0003, "step": 1570 }, { "epoch": 1.2127087732770205, "grad_norm": 0.004399678225568667, "learning_rate": 1.5145929339477728e-05, "loss": 0.0003, "step": 1580 }, { "epoch": 1.2203877903628335, "grad_norm": 0.003175980432596953, "learning_rate": 1.511520737327189e-05, "loss": 0.0003, "step": 1590 }, { "epoch": 1.2280668074486465, "grad_norm": 0.003690809213724381, "learning_rate": 1.5084485407066054e-05, "loss": 0.0003, "step": 1600 }, { "epoch": 1.2357458245344595, "grad_norm": 0.002885795037112433, "learning_rate": 1.5053763440860215e-05, "loss": 0.0003, "step": 1610 }, { "epoch": 1.2434248416202727, "grad_norm": 0.004662223632430856, "learning_rate": 1.5023041474654379e-05, "loss": 0.0003, "step": 1620 }, { "epoch": 1.2511038587060856, "grad_norm": 0.0038277697276091482, "learning_rate": 1.4992319508448543e-05, "loss": 0.0003, "step": 1630 }, { "epoch": 1.2587828757918986, "grad_norm": 0.004926059671498492, "learning_rate": 1.4961597542242704e-05, "loss": 0.0003, "step": 1640 }, { "epoch": 1.2664618928777116, "grad_norm": 0.00463367088053743, "learning_rate": 1.4930875576036868e-05, "loss": 0.0003, "step": 1650 }, { "epoch": 1.2741409099635246, "grad_norm": 0.005726418059037605, "learning_rate": 1.490015360983103e-05, "loss": 0.0003, "step": 1660 }, { "epoch": 1.2818199270493378, "grad_norm": 0.005247011723204908, "learning_rate": 1.4869431643625193e-05, "loss": 0.0003, "step": 1670 }, { "epoch": 1.2894989441351508, "grad_norm": 0.007164559324630275, "learning_rate": 1.4838709677419357e-05, "loss": 0.0003, "step": 1680 }, { "epoch": 1.2971779612209637, "grad_norm": 0.005041996622130852, "learning_rate": 1.4807987711213519e-05, "loss": 0.0003, "step": 1690 }, { "epoch": 1.3048569783067767, "grad_norm": 0.005487598186113812, "learning_rate": 1.477726574500768e-05, "loss": 0.0003, "step": 1700 }, { "epoch": 1.3125359953925897, "grad_norm": 0.003135256998223166, "learning_rate": 1.4746543778801846e-05, "loss": 0.0003, "step": 1710 }, { "epoch": 1.320215012478403, "grad_norm": 0.0069210555218662635, "learning_rate": 1.4715821812596008e-05, "loss": 0.0003, "step": 1720 }, { "epoch": 1.3278940295642156, "grad_norm": 0.003417481046551585, "learning_rate": 1.468509984639017e-05, "loss": 0.0003, "step": 1730 }, { "epoch": 1.3355730466500289, "grad_norm": 0.0076945646315396184, "learning_rate": 1.4654377880184335e-05, "loss": 0.0003, "step": 1740 }, { "epoch": 1.3432520637358418, "grad_norm": 0.004664070136688662, "learning_rate": 1.4623655913978497e-05, "loss": 0.0003, "step": 1750 }, { "epoch": 1.3509310808216548, "grad_norm": 0.003957679244338998, "learning_rate": 1.4592933947772658e-05, "loss": 0.0003, "step": 1760 }, { "epoch": 1.3586100979074678, "grad_norm": 0.003471942169350295, "learning_rate": 1.456221198156682e-05, "loss": 0.0003, "step": 1770 }, { "epoch": 1.3662891149932808, "grad_norm": 0.006157537653481742, "learning_rate": 1.4531490015360986e-05, "loss": 0.0003, "step": 1780 }, { "epoch": 1.373968132079094, "grad_norm": 0.008728892287992982, "learning_rate": 1.4500768049155147e-05, "loss": 0.0003, "step": 1790 }, { "epoch": 1.381647149164907, "grad_norm": 0.00566547815883834, "learning_rate": 1.447004608294931e-05, "loss": 0.0003, "step": 1800 }, { "epoch": 1.38932616625072, "grad_norm": 0.008658651412825806, "learning_rate": 1.4439324116743471e-05, "loss": 0.0003, "step": 1810 }, { "epoch": 1.397005183336533, "grad_norm": 0.005166116309061509, "learning_rate": 1.4408602150537636e-05, "loss": 0.0003, "step": 1820 }, { "epoch": 1.4046842004223459, "grad_norm": 0.0028075866905398405, "learning_rate": 1.4377880184331798e-05, "loss": 0.0003, "step": 1830 }, { "epoch": 1.412363217508159, "grad_norm": 0.005329755957597182, "learning_rate": 1.434715821812596e-05, "loss": 0.0003, "step": 1840 }, { "epoch": 1.420042234593972, "grad_norm": 0.0025789510506500093, "learning_rate": 1.4316436251920125e-05, "loss": 0.0003, "step": 1850 }, { "epoch": 1.427721251679785, "grad_norm": 0.00305980645847167, "learning_rate": 1.4285714285714287e-05, "loss": 0.0003, "step": 1860 }, { "epoch": 1.435400268765598, "grad_norm": 0.0070892296996524756, "learning_rate": 1.4254992319508449e-05, "loss": 0.0003, "step": 1870 }, { "epoch": 1.443079285851411, "grad_norm": 0.004636892035159531, "learning_rate": 1.422427035330261e-05, "loss": 0.0003, "step": 1880 }, { "epoch": 1.450758302937224, "grad_norm": 0.007196283698693798, "learning_rate": 1.4193548387096776e-05, "loss": 0.0003, "step": 1890 }, { "epoch": 1.458437320023037, "grad_norm": 0.002244250906729436, "learning_rate": 1.4162826420890938e-05, "loss": 0.0003, "step": 1900 }, { "epoch": 1.4661163371088501, "grad_norm": 0.003662139205215455, "learning_rate": 1.41321044546851e-05, "loss": 0.0003, "step": 1910 }, { "epoch": 1.4737953541946631, "grad_norm": 0.00899296235470264, "learning_rate": 1.4101382488479263e-05, "loss": 0.0003, "step": 1920 }, { "epoch": 1.481474371280476, "grad_norm": 0.007554011023119674, "learning_rate": 1.4070660522273427e-05, "loss": 0.0003, "step": 1930 }, { "epoch": 1.489153388366289, "grad_norm": 0.006664687059239841, "learning_rate": 1.4039938556067589e-05, "loss": 0.0003, "step": 1940 }, { "epoch": 1.496832405452102, "grad_norm": 0.003654880039156468, "learning_rate": 1.4009216589861752e-05, "loss": 0.0003, "step": 1950 }, { "epoch": 1.5045114225379153, "grad_norm": 0.007520628392798319, "learning_rate": 1.3978494623655916e-05, "loss": 0.0003, "step": 1960 }, { "epoch": 1.512190439623728, "grad_norm": 0.0074722854847562, "learning_rate": 1.3947772657450078e-05, "loss": 0.0003, "step": 1970 }, { "epoch": 1.5198694567095412, "grad_norm": 0.009940245405058647, "learning_rate": 1.3917050691244241e-05, "loss": 0.0003, "step": 1980 }, { "epoch": 1.5275484737953542, "grad_norm": 0.006304573730328701, "learning_rate": 1.3886328725038403e-05, "loss": 0.0003, "step": 1990 }, { "epoch": 1.5352274908811672, "grad_norm": 0.0055550452573098165, "learning_rate": 1.3855606758832567e-05, "loss": 0.0003, "step": 2000 }, { "epoch": 1.5429065079669804, "grad_norm": 0.008909260445155288, "learning_rate": 1.382488479262673e-05, "loss": 0.0003, "step": 2010 }, { "epoch": 1.5505855250527931, "grad_norm": 0.002599064360242108, "learning_rate": 1.3794162826420892e-05, "loss": 0.0003, "step": 2020 }, { "epoch": 1.5582645421386063, "grad_norm": 0.0034038539972756267, "learning_rate": 1.3763440860215056e-05, "loss": 0.0003, "step": 2030 }, { "epoch": 1.5659435592244193, "grad_norm": 0.013808170488498894, "learning_rate": 1.3732718894009217e-05, "loss": 0.0003, "step": 2040 }, { "epoch": 1.5736225763102323, "grad_norm": 0.0017545504012708383, "learning_rate": 1.3701996927803381e-05, "loss": 0.0003, "step": 2050 }, { "epoch": 1.5813015933960453, "grad_norm": 0.003842048923808368, "learning_rate": 1.3671274961597543e-05, "loss": 0.0003, "step": 2060 }, { "epoch": 1.5889806104818582, "grad_norm": 0.004326236565390302, "learning_rate": 1.3640552995391706e-05, "loss": 0.0003, "step": 2070 }, { "epoch": 1.5966596275676714, "grad_norm": 0.0035407621847916644, "learning_rate": 1.360983102918587e-05, "loss": 0.0003, "step": 2080 }, { "epoch": 1.6043386446534842, "grad_norm": 0.004974256724902474, "learning_rate": 1.3579109062980032e-05, "loss": 0.0003, "step": 2090 }, { "epoch": 1.6120176617392974, "grad_norm": 0.002810217043936606, "learning_rate": 1.3548387096774194e-05, "loss": 0.0003, "step": 2100 }, { "epoch": 1.6196966788251104, "grad_norm": 0.002954993487071089, "learning_rate": 1.3517665130568359e-05, "loss": 0.0003, "step": 2110 }, { "epoch": 1.6273756959109233, "grad_norm": 0.003028755651427543, "learning_rate": 1.348694316436252e-05, "loss": 0.0003, "step": 2120 }, { "epoch": 1.6350547129967365, "grad_norm": 0.0026674905313092702, "learning_rate": 1.3456221198156683e-05, "loss": 0.0003, "step": 2130 }, { "epoch": 1.6427337300825493, "grad_norm": 0.006409597579717579, "learning_rate": 1.3425499231950848e-05, "loss": 0.0003, "step": 2140 }, { "epoch": 1.6504127471683625, "grad_norm": 0.0036828809413129507, "learning_rate": 1.339477726574501e-05, "loss": 0.0003, "step": 2150 }, { "epoch": 1.6580917642541755, "grad_norm": 0.007891670676920814, "learning_rate": 1.3364055299539171e-05, "loss": 0.0003, "step": 2160 }, { "epoch": 1.6657707813399885, "grad_norm": 0.006626613367159732, "learning_rate": 1.3333333333333333e-05, "loss": 0.0003, "step": 2170 }, { "epoch": 1.6734497984258017, "grad_norm": 0.003534695341590609, "learning_rate": 1.3302611367127499e-05, "loss": 0.0003, "step": 2180 }, { "epoch": 1.6811288155116144, "grad_norm": 0.002799573693646372, "learning_rate": 1.327188940092166e-05, "loss": 0.0003, "step": 2190 }, { "epoch": 1.6888078325974276, "grad_norm": 0.007920181925607207, "learning_rate": 1.3241167434715822e-05, "loss": 0.0003, "step": 2200 }, { "epoch": 1.6964868496832406, "grad_norm": 0.003197909917687604, "learning_rate": 1.3210445468509984e-05, "loss": 0.0003, "step": 2210 }, { "epoch": 1.7041658667690536, "grad_norm": 0.0019020952054658064, "learning_rate": 1.317972350230415e-05, "loss": 0.0003, "step": 2220 }, { "epoch": 1.7118448838548666, "grad_norm": 0.003430345573430971, "learning_rate": 1.3149001536098311e-05, "loss": 0.0003, "step": 2230 }, { "epoch": 1.7195239009406795, "grad_norm": 0.005966901533330741, "learning_rate": 1.3118279569892473e-05, "loss": 0.0003, "step": 2240 }, { "epoch": 1.7272029180264927, "grad_norm": 0.003453325797399688, "learning_rate": 1.3087557603686638e-05, "loss": 0.0003, "step": 2250 }, { "epoch": 1.7348819351123055, "grad_norm": 0.004117138090969933, "learning_rate": 1.30568356374808e-05, "loss": 0.0003, "step": 2260 }, { "epoch": 1.7425609521981187, "grad_norm": 0.015933078305414367, "learning_rate": 1.3026113671274962e-05, "loss": 0.0003, "step": 2270 }, { "epoch": 1.7502399692839317, "grad_norm": 0.00695674170034877, "learning_rate": 1.2995391705069126e-05, "loss": 0.0003, "step": 2280 }, { "epoch": 1.7579189863697446, "grad_norm": 0.0030599729808705334, "learning_rate": 1.2964669738863289e-05, "loss": 0.0003, "step": 2290 }, { "epoch": 1.7655980034555578, "grad_norm": 0.005764241880693109, "learning_rate": 1.2933947772657451e-05, "loss": 0.0003, "step": 2300 }, { "epoch": 1.7732770205413706, "grad_norm": 0.0024572213319480774, "learning_rate": 1.2903225806451613e-05, "loss": 0.0003, "step": 2310 }, { "epoch": 1.7809560376271838, "grad_norm": 0.005442108190635756, "learning_rate": 1.2872503840245776e-05, "loss": 0.0003, "step": 2320 }, { "epoch": 1.7886350547129968, "grad_norm": 0.005249849493945225, "learning_rate": 1.284178187403994e-05, "loss": 0.0003, "step": 2330 }, { "epoch": 1.7963140717988098, "grad_norm": 0.003760686952515486, "learning_rate": 1.2811059907834102e-05, "loss": 0.0003, "step": 2340 }, { "epoch": 1.8039930888846227, "grad_norm": 0.003326565830912095, "learning_rate": 1.2780337941628265e-05, "loss": 0.0003, "step": 2350 }, { "epoch": 1.8116721059704357, "grad_norm": 0.00687928832968373, "learning_rate": 1.2749615975422429e-05, "loss": 0.0003, "step": 2360 }, { "epoch": 1.819351123056249, "grad_norm": 0.002523920816120679, "learning_rate": 1.271889400921659e-05, "loss": 0.0003, "step": 2370 }, { "epoch": 1.8270301401420617, "grad_norm": 0.0035826335995729104, "learning_rate": 1.2688172043010754e-05, "loss": 0.0003, "step": 2380 }, { "epoch": 1.8347091572278749, "grad_norm": 0.0042202748472642045, "learning_rate": 1.2657450076804916e-05, "loss": 0.0003, "step": 2390 }, { "epoch": 1.8423881743136878, "grad_norm": 0.00433061878504225, "learning_rate": 1.262672811059908e-05, "loss": 0.0003, "step": 2400 }, { "epoch": 1.8500671913995008, "grad_norm": 0.006032498913999335, "learning_rate": 1.2596006144393243e-05, "loss": 0.0003, "step": 2410 }, { "epoch": 1.857746208485314, "grad_norm": 0.002731884836982076, "learning_rate": 1.2565284178187405e-05, "loss": 0.0003, "step": 2420 }, { "epoch": 1.8654252255711268, "grad_norm": 0.003451471394001781, "learning_rate": 1.2534562211981567e-05, "loss": 0.0003, "step": 2430 }, { "epoch": 1.87310424265694, "grad_norm": 0.002472343027285463, "learning_rate": 1.2503840245775732e-05, "loss": 0.0003, "step": 2440 }, { "epoch": 1.880783259742753, "grad_norm": 0.003320944102999081, "learning_rate": 1.2473118279569894e-05, "loss": 0.0003, "step": 2450 }, { "epoch": 1.888462276828566, "grad_norm": 0.0018486199226611809, "learning_rate": 1.2442396313364056e-05, "loss": 0.0003, "step": 2460 }, { "epoch": 1.896141293914379, "grad_norm": 0.0024515391018999654, "learning_rate": 1.2411674347158221e-05, "loss": 0.0003, "step": 2470 }, { "epoch": 1.903820311000192, "grad_norm": 0.0039409109050494015, "learning_rate": 1.2380952380952383e-05, "loss": 0.0003, "step": 2480 }, { "epoch": 1.911499328086005, "grad_norm": 0.0020024603662128597, "learning_rate": 1.2350230414746545e-05, "loss": 0.0003, "step": 2490 }, { "epoch": 1.9191783451718178, "grad_norm": 0.004837460528352513, "learning_rate": 1.2319508448540707e-05, "loss": 0.0003, "step": 2500 }, { "epoch": 1.926857362257631, "grad_norm": 0.001684979658088187, "learning_rate": 1.2288786482334872e-05, "loss": 0.0003, "step": 2510 }, { "epoch": 1.934536379343444, "grad_norm": 0.0036424135207038444, "learning_rate": 1.2258064516129034e-05, "loss": 0.0003, "step": 2520 }, { "epoch": 1.942215396429257, "grad_norm": 0.003460021636897484, "learning_rate": 1.2227342549923195e-05, "loss": 0.0003, "step": 2530 }, { "epoch": 1.9498944135150702, "grad_norm": 0.0012857496658247277, "learning_rate": 1.2196620583717357e-05, "loss": 0.0003, "step": 2540 }, { "epoch": 1.957573430600883, "grad_norm": 0.006405995384537319, "learning_rate": 1.2165898617511523e-05, "loss": 0.0003, "step": 2550 }, { "epoch": 1.9652524476866962, "grad_norm": 0.0027885557407680427, "learning_rate": 1.2135176651305684e-05, "loss": 0.0003, "step": 2560 }, { "epoch": 1.9729314647725091, "grad_norm": 0.0046197235806071674, "learning_rate": 1.2104454685099846e-05, "loss": 0.0003, "step": 2570 }, { "epoch": 1.9806104818583221, "grad_norm": 0.002923714387690518, "learning_rate": 1.2073732718894012e-05, "loss": 0.0003, "step": 2580 }, { "epoch": 1.9882894989441353, "grad_norm": 0.004546312959559587, "learning_rate": 1.2043010752688173e-05, "loss": 0.0003, "step": 2590 }, { "epoch": 1.995968516029948, "grad_norm": 0.0027543773295028978, "learning_rate": 1.2012288786482335e-05, "loss": 0.0003, "step": 2600 }, { "epoch": 2.0030716068343253, "grad_norm": 0.00445399533446676, "learning_rate": 1.1981566820276497e-05, "loss": 0.0002, "step": 2610 }, { "epoch": 2.010750623920138, "grad_norm": 0.009066513947313037, "learning_rate": 1.1950844854070662e-05, "loss": 0.0003, "step": 2620 }, { "epoch": 2.0184296410059512, "grad_norm": 0.0022089803112435333, "learning_rate": 1.1920122887864824e-05, "loss": 0.0003, "step": 2630 }, { "epoch": 2.0261086580917644, "grad_norm": 0.007859382516213974, "learning_rate": 1.1889400921658986e-05, "loss": 0.0003, "step": 2640 }, { "epoch": 2.033787675177577, "grad_norm": 0.002280973635314502, "learning_rate": 1.185867895545315e-05, "loss": 0.0003, "step": 2650 }, { "epoch": 2.0414666922633904, "grad_norm": 0.003812714680306737, "learning_rate": 1.1827956989247313e-05, "loss": 0.0003, "step": 2660 }, { "epoch": 2.049145709349203, "grad_norm": 0.003865003763224919, "learning_rate": 1.1797235023041475e-05, "loss": 0.0003, "step": 2670 }, { "epoch": 2.0568247264350163, "grad_norm": 0.0056210714836978015, "learning_rate": 1.1766513056835639e-05, "loss": 0.0003, "step": 2680 }, { "epoch": 2.0645037435208295, "grad_norm": 0.0029786676822015994, "learning_rate": 1.1735791090629802e-05, "loss": 0.0003, "step": 2690 }, { "epoch": 2.0721827606066423, "grad_norm": 0.007577207373633205, "learning_rate": 1.1705069124423964e-05, "loss": 0.0003, "step": 2700 }, { "epoch": 2.0798617776924555, "grad_norm": 0.004911935203582005, "learning_rate": 1.1674347158218127e-05, "loss": 0.0003, "step": 2710 }, { "epoch": 2.0875407947782683, "grad_norm": 0.002348567372539777, "learning_rate": 1.164362519201229e-05, "loss": 0.0003, "step": 2720 }, { "epoch": 2.0952198118640815, "grad_norm": 0.0021975557273255456, "learning_rate": 1.1612903225806453e-05, "loss": 0.0003, "step": 2730 }, { "epoch": 2.102898828949894, "grad_norm": 0.004404237994021701, "learning_rate": 1.1582181259600616e-05, "loss": 0.0003, "step": 2740 }, { "epoch": 2.1105778460357074, "grad_norm": 0.0018039936664214415, "learning_rate": 1.1551459293394778e-05, "loss": 0.0003, "step": 2750 }, { "epoch": 2.1182568631215206, "grad_norm": 0.010542570288986714, "learning_rate": 1.152073732718894e-05, "loss": 0.0003, "step": 2760 }, { "epoch": 2.1259358802073334, "grad_norm": 0.007212811136834576, "learning_rate": 1.1490015360983104e-05, "loss": 0.0003, "step": 2770 }, { "epoch": 2.1336148972931466, "grad_norm": 0.002463133011224361, "learning_rate": 1.1459293394777267e-05, "loss": 0.0003, "step": 2780 }, { "epoch": 2.1412939143789593, "grad_norm": 0.0030969432185738934, "learning_rate": 1.1428571428571429e-05, "loss": 0.0003, "step": 2790 }, { "epoch": 2.1489729314647725, "grad_norm": 0.0009640256914357676, "learning_rate": 1.1397849462365593e-05, "loss": 0.0003, "step": 2800 }, { "epoch": 2.1566519485505857, "grad_norm": 0.0062156621929774095, "learning_rate": 1.1367127496159756e-05, "loss": 0.0003, "step": 2810 }, { "epoch": 2.1643309656363985, "grad_norm": 0.006538407111075363, "learning_rate": 1.1336405529953918e-05, "loss": 0.0003, "step": 2820 }, { "epoch": 2.1720099827222117, "grad_norm": 0.0028212937587065077, "learning_rate": 1.130568356374808e-05, "loss": 0.0003, "step": 2830 }, { "epoch": 2.1796889998080244, "grad_norm": 0.005023107122791538, "learning_rate": 1.1274961597542245e-05, "loss": 0.0003, "step": 2840 }, { "epoch": 2.1873680168938376, "grad_norm": 0.0011156564572317208, "learning_rate": 1.1244239631336407e-05, "loss": 0.0003, "step": 2850 }, { "epoch": 2.1950470339796504, "grad_norm": 0.001942852230799726, "learning_rate": 1.1213517665130569e-05, "loss": 0.0003, "step": 2860 }, { "epoch": 2.2027260510654636, "grad_norm": 0.008378616547487394, "learning_rate": 1.118279569892473e-05, "loss": 0.0003, "step": 2870 }, { "epoch": 2.210405068151277, "grad_norm": 0.002600735996609255, "learning_rate": 1.1152073732718896e-05, "loss": 0.0003, "step": 2880 }, { "epoch": 2.2180840852370896, "grad_norm": 0.0015136314382080984, "learning_rate": 1.1121351766513058e-05, "loss": 0.0003, "step": 2890 }, { "epoch": 2.2257631023229028, "grad_norm": 0.007485965818712587, "learning_rate": 1.109062980030722e-05, "loss": 0.0003, "step": 2900 }, { "epoch": 2.2334421194087155, "grad_norm": 0.006013194482848518, "learning_rate": 1.1059907834101385e-05, "loss": 0.0003, "step": 2910 }, { "epoch": 2.2411211364945287, "grad_norm": 0.005253526138111572, "learning_rate": 1.1029185867895547e-05, "loss": 0.0003, "step": 2920 }, { "epoch": 2.248800153580342, "grad_norm": 0.0030844920407275436, "learning_rate": 1.0998463901689708e-05, "loss": 0.0003, "step": 2930 }, { "epoch": 2.2564791706661547, "grad_norm": 0.00448225831555134, "learning_rate": 1.096774193548387e-05, "loss": 0.0003, "step": 2940 }, { "epoch": 2.264158187751968, "grad_norm": 0.002766089015696827, "learning_rate": 1.0937019969278036e-05, "loss": 0.0003, "step": 2950 }, { "epoch": 2.2718372048377806, "grad_norm": 0.00432406954984362, "learning_rate": 1.0906298003072197e-05, "loss": 0.0003, "step": 2960 }, { "epoch": 2.279516221923594, "grad_norm": 0.005297571043727681, "learning_rate": 1.087557603686636e-05, "loss": 0.0003, "step": 2970 }, { "epoch": 2.2871952390094066, "grad_norm": 0.005051571714735924, "learning_rate": 1.0844854070660523e-05, "loss": 0.0003, "step": 2980 }, { "epoch": 2.2948742560952198, "grad_norm": 0.0036863856281938016, "learning_rate": 1.0814132104454686e-05, "loss": 0.0003, "step": 2990 }, { "epoch": 2.302553273181033, "grad_norm": 0.0024327974031678975, "learning_rate": 1.0783410138248848e-05, "loss": 0.0003, "step": 3000 }, { "epoch": 2.3102322902668457, "grad_norm": 0.0032986912710764884, "learning_rate": 1.0752688172043012e-05, "loss": 0.0003, "step": 3010 }, { "epoch": 2.317911307352659, "grad_norm": 0.002895373170239971, "learning_rate": 1.0721966205837175e-05, "loss": 0.0003, "step": 3020 }, { "epoch": 2.3255903244384717, "grad_norm": 0.0037467096830764678, "learning_rate": 1.0691244239631337e-05, "loss": 0.0003, "step": 3030 }, { "epoch": 2.333269341524285, "grad_norm": 0.0041338587084730925, "learning_rate": 1.0660522273425499e-05, "loss": 0.0003, "step": 3040 }, { "epoch": 2.340948358610098, "grad_norm": 0.004584463378907932, "learning_rate": 1.0629800307219663e-05, "loss": 0.0003, "step": 3050 }, { "epoch": 2.348627375695911, "grad_norm": 0.005390217173101364, "learning_rate": 1.0599078341013826e-05, "loss": 0.0003, "step": 3060 }, { "epoch": 2.356306392781724, "grad_norm": 0.006574519516791052, "learning_rate": 1.0568356374807988e-05, "loss": 0.0003, "step": 3070 }, { "epoch": 2.363985409867537, "grad_norm": 0.0031029456990706457, "learning_rate": 1.0537634408602151e-05, "loss": 0.0003, "step": 3080 }, { "epoch": 2.37166442695335, "grad_norm": 0.003485382502658449, "learning_rate": 1.0506912442396313e-05, "loss": 0.0003, "step": 3090 }, { "epoch": 2.379343444039163, "grad_norm": 0.0033652977877385503, "learning_rate": 1.0476190476190477e-05, "loss": 0.0003, "step": 3100 }, { "epoch": 2.387022461124976, "grad_norm": 0.001994262459548078, "learning_rate": 1.044546850998464e-05, "loss": 0.0003, "step": 3110 }, { "epoch": 2.394701478210789, "grad_norm": 0.0027030822644481534, "learning_rate": 1.0414746543778802e-05, "loss": 0.0003, "step": 3120 }, { "epoch": 2.402380495296602, "grad_norm": 0.0075004858676295996, "learning_rate": 1.0384024577572966e-05, "loss": 0.0003, "step": 3130 }, { "epoch": 2.410059512382415, "grad_norm": 0.005277345871616036, "learning_rate": 1.035330261136713e-05, "loss": 0.0003, "step": 3140 }, { "epoch": 2.4177385294682283, "grad_norm": 0.0034500505732812984, "learning_rate": 1.0322580645161291e-05, "loss": 0.0003, "step": 3150 }, { "epoch": 2.425417546554041, "grad_norm": 0.004072496071322172, "learning_rate": 1.0291858678955453e-05, "loss": 0.0003, "step": 3160 }, { "epoch": 2.4330965636398543, "grad_norm": 0.0037519391939282247, "learning_rate": 1.0261136712749618e-05, "loss": 0.0003, "step": 3170 }, { "epoch": 2.440775580725667, "grad_norm": 0.0029126430566717857, "learning_rate": 1.023041474654378e-05, "loss": 0.0003, "step": 3180 }, { "epoch": 2.4484545978114802, "grad_norm": 0.006804725581164672, "learning_rate": 1.0199692780337942e-05, "loss": 0.0003, "step": 3190 }, { "epoch": 2.456133614897293, "grad_norm": 0.003394434324862927, "learning_rate": 1.0168970814132104e-05, "loss": 0.0003, "step": 3200 }, { "epoch": 2.463812631983106, "grad_norm": 0.0047839322011928136, "learning_rate": 1.0138248847926269e-05, "loss": 0.0003, "step": 3210 }, { "epoch": 2.471491649068919, "grad_norm": 0.0021047452286355496, "learning_rate": 1.0107526881720431e-05, "loss": 0.0003, "step": 3220 }, { "epoch": 2.479170666154732, "grad_norm": 0.002910893106529187, "learning_rate": 1.0076804915514593e-05, "loss": 0.0003, "step": 3230 }, { "epoch": 2.4868496832405453, "grad_norm": 0.0025188863038248495, "learning_rate": 1.0046082949308758e-05, "loss": 0.0003, "step": 3240 }, { "epoch": 2.494528700326358, "grad_norm": 0.005110455562647162, "learning_rate": 1.001536098310292e-05, "loss": 0.0003, "step": 3250 }, { "epoch": 2.5022077174121713, "grad_norm": 0.0035194967193856925, "learning_rate": 9.984639016897082e-06, "loss": 0.0003, "step": 3260 }, { "epoch": 2.509886734497984, "grad_norm": 0.004112839294946175, "learning_rate": 9.953917050691245e-06, "loss": 0.0003, "step": 3270 }, { "epoch": 2.5175657515837973, "grad_norm": 0.003219601043538841, "learning_rate": 9.923195084485407e-06, "loss": 0.0003, "step": 3280 }, { "epoch": 2.5252447686696105, "grad_norm": 0.0021615101929151476, "learning_rate": 9.89247311827957e-06, "loss": 0.0003, "step": 3290 }, { "epoch": 2.532923785755423, "grad_norm": 0.0026956859004236954, "learning_rate": 9.861751152073733e-06, "loss": 0.0003, "step": 3300 }, { "epoch": 2.5406028028412364, "grad_norm": 0.0012769547330191422, "learning_rate": 9.831029185867896e-06, "loss": 0.0003, "step": 3310 }, { "epoch": 2.548281819927049, "grad_norm": 0.004837667895037462, "learning_rate": 9.80030721966206e-06, "loss": 0.0003, "step": 3320 }, { "epoch": 2.5559608370128624, "grad_norm": 0.004331759593272771, "learning_rate": 9.769585253456221e-06, "loss": 0.0003, "step": 3330 }, { "epoch": 2.5636398540986756, "grad_norm": 0.003806749090919161, "learning_rate": 9.738863287250385e-06, "loss": 0.0003, "step": 3340 }, { "epoch": 2.5713188711844883, "grad_norm": 0.003405808798578046, "learning_rate": 9.708141321044547e-06, "loss": 0.0003, "step": 3350 }, { "epoch": 2.5789978882703015, "grad_norm": 0.0018090209587433655, "learning_rate": 9.67741935483871e-06, "loss": 0.0003, "step": 3360 }, { "epoch": 2.5866769053561143, "grad_norm": 0.0024778977279488216, "learning_rate": 9.646697388632872e-06, "loss": 0.0003, "step": 3370 }, { "epoch": 2.5943559224419275, "grad_norm": 0.005359718689721543, "learning_rate": 9.615975422427036e-06, "loss": 0.0003, "step": 3380 }, { "epoch": 2.6020349395277407, "grad_norm": 0.0012868512440999584, "learning_rate": 9.5852534562212e-06, "loss": 0.0003, "step": 3390 }, { "epoch": 2.6097139566135534, "grad_norm": 0.005696079089497556, "learning_rate": 9.554531490015361e-06, "loss": 0.0003, "step": 3400 }, { "epoch": 2.6173929736993666, "grad_norm": 0.0032109625254054023, "learning_rate": 9.523809523809525e-06, "loss": 0.0003, "step": 3410 }, { "epoch": 2.6250719907851794, "grad_norm": 0.002903890729728573, "learning_rate": 9.493087557603687e-06, "loss": 0.0003, "step": 3420 }, { "epoch": 2.6327510078709926, "grad_norm": 0.003523170524614984, "learning_rate": 9.46236559139785e-06, "loss": 0.0003, "step": 3430 }, { "epoch": 2.640430024956806, "grad_norm": 0.0036252760580902602, "learning_rate": 9.431643625192014e-06, "loss": 0.0003, "step": 3440 }, { "epoch": 2.6481090420426185, "grad_norm": 0.002714708446513513, "learning_rate": 9.400921658986176e-06, "loss": 0.0003, "step": 3450 }, { "epoch": 2.6557880591284313, "grad_norm": 0.0037700018885341927, "learning_rate": 9.370199692780339e-06, "loss": 0.0003, "step": 3460 }, { "epoch": 2.6634670762142445, "grad_norm": 0.005122776482125785, "learning_rate": 9.339477726574503e-06, "loss": 0.0003, "step": 3470 }, { "epoch": 2.6711460933000577, "grad_norm": 0.0013201671036341795, "learning_rate": 9.308755760368664e-06, "loss": 0.0003, "step": 3480 }, { "epoch": 2.6788251103858705, "grad_norm": 0.003020186145598462, "learning_rate": 9.278033794162828e-06, "loss": 0.0003, "step": 3490 }, { "epoch": 2.6865041274716837, "grad_norm": 0.011069671357401941, "learning_rate": 9.24731182795699e-06, "loss": 0.0003, "step": 3500 }, { "epoch": 2.6941831445574964, "grad_norm": 0.007485690656388163, "learning_rate": 9.216589861751153e-06, "loss": 0.0003, "step": 3510 }, { "epoch": 2.7018621616433096, "grad_norm": 0.00636654094660092, "learning_rate": 9.185867895545315e-06, "loss": 0.0003, "step": 3520 }, { "epoch": 2.709541178729123, "grad_norm": 0.0039619478349876185, "learning_rate": 9.155145929339479e-06, "loss": 0.0003, "step": 3530 }, { "epoch": 2.7172201958149356, "grad_norm": 0.0054654628329376094, "learning_rate": 9.124423963133642e-06, "loss": 0.0003, "step": 3540 }, { "epoch": 2.7248992129007488, "grad_norm": 0.0023486404063996135, "learning_rate": 9.093701996927804e-06, "loss": 0.0003, "step": 3550 }, { "epoch": 2.7325782299865615, "grad_norm": 0.002849175524143881, "learning_rate": 9.062980030721968e-06, "loss": 0.0003, "step": 3560 }, { "epoch": 2.7402572470723747, "grad_norm": 0.0040654911920692495, "learning_rate": 9.03225806451613e-06, "loss": 0.0003, "step": 3570 }, { "epoch": 2.747936264158188, "grad_norm": 0.003611352214922628, "learning_rate": 9.001536098310293e-06, "loss": 0.0003, "step": 3580 }, { "epoch": 2.7556152812440007, "grad_norm": 0.002668802943203528, "learning_rate": 8.970814132104455e-06, "loss": 0.0003, "step": 3590 }, { "epoch": 2.763294298329814, "grad_norm": 0.0033794836750422907, "learning_rate": 8.940092165898619e-06, "loss": 0.0003, "step": 3600 }, { "epoch": 2.7709733154156266, "grad_norm": 0.0019714078072412106, "learning_rate": 8.90937019969278e-06, "loss": 0.0003, "step": 3610 }, { "epoch": 2.77865233250144, "grad_norm": 0.0036424341616288908, "learning_rate": 8.878648233486944e-06, "loss": 0.0003, "step": 3620 }, { "epoch": 2.786331349587253, "grad_norm": 0.0025731242429069758, "learning_rate": 8.847926267281107e-06, "loss": 0.0003, "step": 3630 }, { "epoch": 2.794010366673066, "grad_norm": 0.003112848869608825, "learning_rate": 8.81720430107527e-06, "loss": 0.0003, "step": 3640 }, { "epoch": 2.801689383758879, "grad_norm": 0.0024507056173105482, "learning_rate": 8.786482334869433e-06, "loss": 0.0003, "step": 3650 }, { "epoch": 2.8093684008446917, "grad_norm": 0.003942355547919082, "learning_rate": 8.755760368663595e-06, "loss": 0.0003, "step": 3660 }, { "epoch": 2.817047417930505, "grad_norm": 0.007985074482092179, "learning_rate": 8.725038402457758e-06, "loss": 0.0003, "step": 3670 }, { "epoch": 2.824726435016318, "grad_norm": 0.006570601645436398, "learning_rate": 8.69431643625192e-06, "loss": 0.0003, "step": 3680 }, { "epoch": 2.832405452102131, "grad_norm": 0.002711244143718766, "learning_rate": 8.663594470046084e-06, "loss": 0.0003, "step": 3690 }, { "epoch": 2.840084469187944, "grad_norm": 0.0021197096154111942, "learning_rate": 8.632872503840246e-06, "loss": 0.0003, "step": 3700 }, { "epoch": 2.847763486273757, "grad_norm": 0.0030651493700618623, "learning_rate": 8.602150537634409e-06, "loss": 0.0003, "step": 3710 }, { "epoch": 2.85544250335957, "grad_norm": 0.0038564973802781943, "learning_rate": 8.571428571428571e-06, "loss": 0.0003, "step": 3720 }, { "epoch": 2.8631215204453833, "grad_norm": 0.0019345140882013855, "learning_rate": 8.540706605222734e-06, "loss": 0.0003, "step": 3730 }, { "epoch": 2.870800537531196, "grad_norm": 0.003530046987716614, "learning_rate": 8.509984639016898e-06, "loss": 0.0003, "step": 3740 }, { "epoch": 2.8784795546170088, "grad_norm": 0.0045638471491203396, "learning_rate": 8.47926267281106e-06, "loss": 0.0003, "step": 3750 }, { "epoch": 2.886158571702822, "grad_norm": 0.002548977990330342, "learning_rate": 8.448540706605223e-06, "loss": 0.0003, "step": 3760 }, { "epoch": 2.893837588788635, "grad_norm": 0.004637676739276992, "learning_rate": 8.417818740399385e-06, "loss": 0.0003, "step": 3770 }, { "epoch": 2.901516605874448, "grad_norm": 0.004249182216049807, "learning_rate": 8.387096774193549e-06, "loss": 0.0003, "step": 3780 }, { "epoch": 2.909195622960261, "grad_norm": 0.0020834658623780523, "learning_rate": 8.356374807987712e-06, "loss": 0.0003, "step": 3790 }, { "epoch": 2.916874640046074, "grad_norm": 0.005570319306302508, "learning_rate": 8.325652841781874e-06, "loss": 0.0003, "step": 3800 }, { "epoch": 2.924553657131887, "grad_norm": 0.004441691840249149, "learning_rate": 8.294930875576038e-06, "loss": 0.0003, "step": 3810 }, { "epoch": 2.9322326742177003, "grad_norm": 0.004852997098567165, "learning_rate": 8.264208909370201e-06, "loss": 0.0003, "step": 3820 }, { "epoch": 2.939911691303513, "grad_norm": 0.004796673169729304, "learning_rate": 8.233486943164363e-06, "loss": 0.0003, "step": 3830 }, { "epoch": 2.9475907083893262, "grad_norm": 0.00303272221365764, "learning_rate": 8.202764976958527e-06, "loss": 0.0003, "step": 3840 }, { "epoch": 2.955269725475139, "grad_norm": 0.009776414261676187, "learning_rate": 8.172043010752689e-06, "loss": 0.0003, "step": 3850 }, { "epoch": 2.962948742560952, "grad_norm": 0.0019734177655631514, "learning_rate": 8.141321044546852e-06, "loss": 0.0003, "step": 3860 }, { "epoch": 2.9706277596467654, "grad_norm": 0.005414209178109374, "learning_rate": 8.110599078341016e-06, "loss": 0.0003, "step": 3870 }, { "epoch": 2.978306776732578, "grad_norm": 0.0020584308196663873, "learning_rate": 8.079877112135177e-06, "loss": 0.0003, "step": 3880 }, { "epoch": 2.9859857938183914, "grad_norm": 0.002197200567243655, "learning_rate": 8.049155145929341e-06, "loss": 0.0003, "step": 3890 }, { "epoch": 2.993664810904204, "grad_norm": 0.0012909809217368014, "learning_rate": 8.018433179723503e-06, "loss": 0.0003, "step": 3900 }, { "epoch": 3.0007679017085813, "grad_norm": 0.005912347993649129, "learning_rate": 7.987711213517666e-06, "loss": 0.0003, "step": 3910 }, { "epoch": 3.0084469187943945, "grad_norm": 0.0024746506269591262, "learning_rate": 7.956989247311828e-06, "loss": 0.0003, "step": 3920 }, { "epoch": 3.0161259358802073, "grad_norm": 0.003743031987316316, "learning_rate": 7.926267281105992e-06, "loss": 0.0003, "step": 3930 }, { "epoch": 3.0238049529660205, "grad_norm": 0.0020760288460637535, "learning_rate": 7.895545314900154e-06, "loss": 0.0003, "step": 3940 }, { "epoch": 3.0314839700518332, "grad_norm": 0.010111413246967849, "learning_rate": 7.864823348694317e-06, "loss": 0.0003, "step": 3950 }, { "epoch": 3.0391629871376464, "grad_norm": 0.00321449927030987, "learning_rate": 7.83410138248848e-06, "loss": 0.0003, "step": 3960 }, { "epoch": 3.046842004223459, "grad_norm": 0.0022266492617884176, "learning_rate": 7.803379416282643e-06, "loss": 0.0003, "step": 3970 }, { "epoch": 3.0545210213092724, "grad_norm": 0.3330681312874651, "learning_rate": 7.772657450076806e-06, "loss": 0.0006, "step": 3980 }, { "epoch": 3.0622000383950856, "grad_norm": 0.0929644329244732, "learning_rate": 7.741935483870968e-06, "loss": 0.0031, "step": 3990 }, { "epoch": 3.0698790554808983, "grad_norm": 2.5444639341537942, "learning_rate": 7.711213517665132e-06, "loss": 0.0499, "step": 4000 }, { "epoch": 3.0775580725667115, "grad_norm": 2.706681402967684, "learning_rate": 7.680491551459293e-06, "loss": 0.0123, "step": 4010 }, { "epoch": 3.0852370896525243, "grad_norm": 0.3106533133384103, "learning_rate": 7.649769585253457e-06, "loss": 0.012, "step": 4020 }, { "epoch": 3.0929161067383375, "grad_norm": 0.011659651180711347, "learning_rate": 7.61904761904762e-06, "loss": 0.0007, "step": 4030 }, { "epoch": 3.1005951238241507, "grad_norm": 0.01130908041600874, "learning_rate": 7.588325652841782e-06, "loss": 0.0004, "step": 4040 }, { "epoch": 3.1082741409099635, "grad_norm": 0.008227145166831094, "learning_rate": 7.557603686635945e-06, "loss": 0.0003, "step": 4050 }, { "epoch": 3.1159531579957767, "grad_norm": 0.0055737792826447054, "learning_rate": 7.526881720430108e-06, "loss": 0.0003, "step": 4060 }, { "epoch": 3.1236321750815894, "grad_norm": 0.002555421706411845, "learning_rate": 7.496159754224271e-06, "loss": 0.0003, "step": 4070 }, { "epoch": 3.1313111921674026, "grad_norm": 0.006555677014598616, "learning_rate": 7.465437788018434e-06, "loss": 0.0003, "step": 4080 }, { "epoch": 3.1389902092532154, "grad_norm": 0.009434120428132338, "learning_rate": 7.434715821812597e-06, "loss": 0.0003, "step": 4090 }, { "epoch": 3.1466692263390286, "grad_norm": 0.0041818007155617395, "learning_rate": 7.403993855606759e-06, "loss": 0.0003, "step": 4100 }, { "epoch": 3.1543482434248418, "grad_norm": 0.002877849745067617, "learning_rate": 7.373271889400923e-06, "loss": 0.0003, "step": 4110 }, { "epoch": 3.1620272605106545, "grad_norm": 0.0030447348846358723, "learning_rate": 7.342549923195085e-06, "loss": 0.0003, "step": 4120 }, { "epoch": 3.1697062775964677, "grad_norm": 0.0060218718801864175, "learning_rate": 7.311827956989248e-06, "loss": 0.0003, "step": 4130 }, { "epoch": 3.1773852946822805, "grad_norm": 0.006332525678995614, "learning_rate": 7.28110599078341e-06, "loss": 0.0003, "step": 4140 }, { "epoch": 3.1850643117680937, "grad_norm": 0.003215146996700883, "learning_rate": 7.250384024577574e-06, "loss": 0.0003, "step": 4150 }, { "epoch": 3.192743328853907, "grad_norm": 0.005935997258576502, "learning_rate": 7.2196620583717355e-06, "loss": 0.0003, "step": 4160 }, { "epoch": 3.2004223459397196, "grad_norm": 0.004157021390061587, "learning_rate": 7.188940092165899e-06, "loss": 0.0003, "step": 4170 }, { "epoch": 3.208101363025533, "grad_norm": 0.004859684777513284, "learning_rate": 7.158218125960063e-06, "loss": 0.0003, "step": 4180 }, { "epoch": 3.2157803801113456, "grad_norm": 0.004857019863143934, "learning_rate": 7.1274961597542245e-06, "loss": 0.0003, "step": 4190 }, { "epoch": 3.223459397197159, "grad_norm": 0.003878455168237981, "learning_rate": 7.096774193548388e-06, "loss": 0.0003, "step": 4200 }, { "epoch": 3.231138414282972, "grad_norm": 0.004425680070729378, "learning_rate": 7.06605222734255e-06, "loss": 0.0003, "step": 4210 }, { "epoch": 3.2388174313687847, "grad_norm": 0.0012776092639396753, "learning_rate": 7.0353302611367134e-06, "loss": 0.0003, "step": 4220 }, { "epoch": 3.246496448454598, "grad_norm": 0.0032977925007112736, "learning_rate": 7.004608294930876e-06, "loss": 0.0003, "step": 4230 }, { "epoch": 3.2541754655404107, "grad_norm": 0.00520607446673023, "learning_rate": 6.973886328725039e-06, "loss": 0.0003, "step": 4240 }, { "epoch": 3.261854482626224, "grad_norm": 0.005908417082190133, "learning_rate": 6.9431643625192015e-06, "loss": 0.0003, "step": 4250 }, { "epoch": 3.269533499712037, "grad_norm": 0.005104388415579662, "learning_rate": 6.912442396313365e-06, "loss": 0.0003, "step": 4260 }, { "epoch": 3.27721251679785, "grad_norm": 0.00279843439440394, "learning_rate": 6.881720430107528e-06, "loss": 0.0003, "step": 4270 }, { "epoch": 3.284891533883663, "grad_norm": 0.004221914284256829, "learning_rate": 6.8509984639016905e-06, "loss": 0.0003, "step": 4280 }, { "epoch": 3.292570550969476, "grad_norm": 0.002789017236217476, "learning_rate": 6.820276497695853e-06, "loss": 0.0003, "step": 4290 }, { "epoch": 3.300249568055289, "grad_norm": 0.003591747007618594, "learning_rate": 6.789554531490016e-06, "loss": 0.0003, "step": 4300 }, { "epoch": 3.3079285851411018, "grad_norm": 0.0056326237606971275, "learning_rate": 6.758832565284179e-06, "loss": 0.0003, "step": 4310 }, { "epoch": 3.315607602226915, "grad_norm": 0.0027487580853665806, "learning_rate": 6.728110599078341e-06, "loss": 0.0003, "step": 4320 }, { "epoch": 3.3232866193127277, "grad_norm": 0.005221571566163444, "learning_rate": 6.697388632872505e-06, "loss": 0.0003, "step": 4330 }, { "epoch": 3.330965636398541, "grad_norm": 0.005764385454293689, "learning_rate": 6.666666666666667e-06, "loss": 0.0003, "step": 4340 }, { "epoch": 3.338644653484354, "grad_norm": 0.0019604837190694527, "learning_rate": 6.63594470046083e-06, "loss": 0.0003, "step": 4350 }, { "epoch": 3.346323670570167, "grad_norm": 0.002896385918252742, "learning_rate": 6.605222734254992e-06, "loss": 0.0003, "step": 4360 }, { "epoch": 3.35400268765598, "grad_norm": 0.00606820751300495, "learning_rate": 6.574500768049156e-06, "loss": 0.0003, "step": 4370 }, { "epoch": 3.361681704741793, "grad_norm": 0.004591568141116001, "learning_rate": 6.543778801843319e-06, "loss": 0.0003, "step": 4380 }, { "epoch": 3.369360721827606, "grad_norm": 0.002433120666611045, "learning_rate": 6.513056835637481e-06, "loss": 0.0003, "step": 4390 }, { "epoch": 3.3770397389134192, "grad_norm": 0.00531173620847677, "learning_rate": 6.4823348694316445e-06, "loss": 0.0003, "step": 4400 }, { "epoch": 3.384718755999232, "grad_norm": 0.005122482104609612, "learning_rate": 6.451612903225806e-06, "loss": 0.0003, "step": 4410 }, { "epoch": 3.392397773085045, "grad_norm": 0.004100500912973401, "learning_rate": 6.42089093701997e-06, "loss": 0.0003, "step": 4420 }, { "epoch": 3.400076790170858, "grad_norm": 0.0023809509840975367, "learning_rate": 6.390168970814133e-06, "loss": 0.0003, "step": 4430 }, { "epoch": 3.407755807256671, "grad_norm": 0.0026348003137030064, "learning_rate": 6.359447004608295e-06, "loss": 0.0003, "step": 4440 }, { "epoch": 3.4154348243424844, "grad_norm": 0.0013064532447140981, "learning_rate": 6.328725038402458e-06, "loss": 0.0003, "step": 4450 }, { "epoch": 3.423113841428297, "grad_norm": 0.004540940802478115, "learning_rate": 6.2980030721966216e-06, "loss": 0.0003, "step": 4460 }, { "epoch": 3.4307928585141103, "grad_norm": 0.0026972329703791845, "learning_rate": 6.267281105990783e-06, "loss": 0.0003, "step": 4470 }, { "epoch": 3.438471875599923, "grad_norm": 0.004636444780767564, "learning_rate": 6.236559139784947e-06, "loss": 0.0003, "step": 4480 }, { "epoch": 3.4461508926857363, "grad_norm": 0.003960381641830104, "learning_rate": 6.2058371735791105e-06, "loss": 0.0003, "step": 4490 }, { "epoch": 3.4538299097715495, "grad_norm": 0.00474082033962727, "learning_rate": 6.175115207373272e-06, "loss": 0.0003, "step": 4500 }, { "epoch": 3.4615089268573622, "grad_norm": 0.0038782360963232256, "learning_rate": 6.144393241167436e-06, "loss": 0.0003, "step": 4510 }, { "epoch": 3.4691879439431754, "grad_norm": 0.0035297720880386315, "learning_rate": 6.113671274961598e-06, "loss": 0.0003, "step": 4520 }, { "epoch": 3.476866961028988, "grad_norm": 0.004232110212456926, "learning_rate": 6.082949308755761e-06, "loss": 0.0003, "step": 4530 }, { "epoch": 3.4845459781148014, "grad_norm": 0.002775233265931185, "learning_rate": 6.052227342549923e-06, "loss": 0.0002, "step": 4540 }, { "epoch": 3.492224995200614, "grad_norm": 0.0022484687297900418, "learning_rate": 6.021505376344087e-06, "loss": 0.0003, "step": 4550 }, { "epoch": 3.4999040122864273, "grad_norm": 0.010059652937527065, "learning_rate": 5.9907834101382485e-06, "loss": 0.0003, "step": 4560 }, { "epoch": 3.50758302937224, "grad_norm": 0.005860058288903284, "learning_rate": 5.960061443932412e-06, "loss": 0.0003, "step": 4570 }, { "epoch": 3.5152620464580533, "grad_norm": 0.001771873531568768, "learning_rate": 5.929339477726575e-06, "loss": 0.0003, "step": 4580 }, { "epoch": 3.5229410635438665, "grad_norm": 0.0037856677349401477, "learning_rate": 5.8986175115207375e-06, "loss": 0.0003, "step": 4590 }, { "epoch": 3.5306200806296792, "grad_norm": 0.00262376974978298, "learning_rate": 5.867895545314901e-06, "loss": 0.0003, "step": 4600 }, { "epoch": 3.5382990977154924, "grad_norm": 0.003097531567235156, "learning_rate": 5.837173579109064e-06, "loss": 0.0003, "step": 4610 }, { "epoch": 3.545978114801305, "grad_norm": 0.0019368382797062594, "learning_rate": 5.806451612903226e-06, "loss": 0.0003, "step": 4620 }, { "epoch": 3.5536571318871184, "grad_norm": 0.004336527548689335, "learning_rate": 5.775729646697389e-06, "loss": 0.0003, "step": 4630 }, { "epoch": 3.5613361489729316, "grad_norm": 0.001542836625256801, "learning_rate": 5.745007680491552e-06, "loss": 0.0003, "step": 4640 }, { "epoch": 3.5690151660587444, "grad_norm": 0.006404316601820908, "learning_rate": 5.7142857142857145e-06, "loss": 0.0003, "step": 4650 }, { "epoch": 3.5766941831445576, "grad_norm": 0.0029147588687753185, "learning_rate": 5.683563748079878e-06, "loss": 0.0003, "step": 4660 }, { "epoch": 3.5843732002303703, "grad_norm": 0.009747259780473038, "learning_rate": 5.65284178187404e-06, "loss": 0.0003, "step": 4670 }, { "epoch": 3.5920522173161835, "grad_norm": 0.005435655230219059, "learning_rate": 5.6221198156682035e-06, "loss": 0.0003, "step": 4680 }, { "epoch": 3.5997312344019967, "grad_norm": 0.0011809589959859185, "learning_rate": 5.591397849462365e-06, "loss": 0.0003, "step": 4690 }, { "epoch": 3.6074102514878095, "grad_norm": 0.0043535225911655855, "learning_rate": 5.560675883256529e-06, "loss": 0.0003, "step": 4700 }, { "epoch": 3.6150892685736227, "grad_norm": 0.002409686221015935, "learning_rate": 5.529953917050692e-06, "loss": 0.0003, "step": 4710 }, { "epoch": 3.6227682856594354, "grad_norm": 0.0021895075558719937, "learning_rate": 5.499231950844854e-06, "loss": 0.0003, "step": 4720 }, { "epoch": 3.6304473027452486, "grad_norm": 0.002325096639061682, "learning_rate": 5.468509984639018e-06, "loss": 0.0003, "step": 4730 }, { "epoch": 3.638126319831062, "grad_norm": 0.005217362404820908, "learning_rate": 5.43778801843318e-06, "loss": 0.0003, "step": 4740 }, { "epoch": 3.6458053369168746, "grad_norm": 0.0025128558421436426, "learning_rate": 5.407066052227343e-06, "loss": 0.0003, "step": 4750 }, { "epoch": 3.653484354002688, "grad_norm": 0.003260458572828957, "learning_rate": 5.376344086021506e-06, "loss": 0.0003, "step": 4760 }, { "epoch": 3.6611633710885005, "grad_norm": 0.004920360483939583, "learning_rate": 5.345622119815669e-06, "loss": 0.0003, "step": 4770 }, { "epoch": 3.6688423881743137, "grad_norm": 0.006017572517411142, "learning_rate": 5.314900153609831e-06, "loss": 0.0003, "step": 4780 }, { "epoch": 3.676521405260127, "grad_norm": 0.003921825806007615, "learning_rate": 5.284178187403994e-06, "loss": 0.0003, "step": 4790 }, { "epoch": 3.6842004223459397, "grad_norm": 0.0026317828696058375, "learning_rate": 5.253456221198157e-06, "loss": 0.0003, "step": 4800 }, { "epoch": 3.691879439431753, "grad_norm": 0.0018516494095851901, "learning_rate": 5.22273425499232e-06, "loss": 0.0003, "step": 4810 }, { "epoch": 3.6995584565175657, "grad_norm": 0.0018334101081003715, "learning_rate": 5.192012288786483e-06, "loss": 0.0003, "step": 4820 }, { "epoch": 3.707237473603379, "grad_norm": 0.0026102899650512555, "learning_rate": 5.161290322580646e-06, "loss": 0.0003, "step": 4830 }, { "epoch": 3.714916490689192, "grad_norm": 0.0041980444717698105, "learning_rate": 5.130568356374809e-06, "loss": 0.0002, "step": 4840 }, { "epoch": 3.722595507775005, "grad_norm": 0.002162407360619667, "learning_rate": 5.099846390168971e-06, "loss": 0.0003, "step": 4850 }, { "epoch": 3.7302745248608176, "grad_norm": 0.003927892142640596, "learning_rate": 5.0691244239631346e-06, "loss": 0.0003, "step": 4860 }, { "epoch": 3.7379535419466308, "grad_norm": 0.006489628564389115, "learning_rate": 5.038402457757296e-06, "loss": 0.0003, "step": 4870 }, { "epoch": 3.745632559032444, "grad_norm": 0.0021265975651571616, "learning_rate": 5.00768049155146e-06, "loss": 0.0003, "step": 4880 }, { "epoch": 3.7533115761182567, "grad_norm": 0.008523718567678173, "learning_rate": 4.976958525345623e-06, "loss": 0.0002, "step": 4890 }, { "epoch": 3.76099059320407, "grad_norm": 0.00219483779562531, "learning_rate": 4.946236559139785e-06, "loss": 0.0003, "step": 4900 }, { "epoch": 3.7686696102898827, "grad_norm": 0.0026242522484158557, "learning_rate": 4.915514592933948e-06, "loss": 0.0003, "step": 4910 }, { "epoch": 3.776348627375696, "grad_norm": 0.0030264706878712897, "learning_rate": 4.884792626728111e-06, "loss": 0.0003, "step": 4920 }, { "epoch": 3.784027644461509, "grad_norm": 0.006051061980646295, "learning_rate": 4.8540706605222734e-06, "loss": 0.0003, "step": 4930 }, { "epoch": 3.791706661547322, "grad_norm": 0.0076497369041367514, "learning_rate": 4.823348694316436e-06, "loss": 0.0003, "step": 4940 }, { "epoch": 3.799385678633135, "grad_norm": 0.003304104700477534, "learning_rate": 4.7926267281106e-06, "loss": 0.0003, "step": 4950 }, { "epoch": 3.807064695718948, "grad_norm": 0.001277891356393671, "learning_rate": 4.761904761904762e-06, "loss": 0.0003, "step": 4960 }, { "epoch": 3.814743712804761, "grad_norm": 0.0016109781447346389, "learning_rate": 4.731182795698925e-06, "loss": 0.0002, "step": 4970 }, { "epoch": 3.822422729890574, "grad_norm": 0.0032221247449133765, "learning_rate": 4.700460829493088e-06, "loss": 0.0003, "step": 4980 }, { "epoch": 3.830101746976387, "grad_norm": 0.0023617321542409296, "learning_rate": 4.669738863287251e-06, "loss": 0.0003, "step": 4990 }, { "epoch": 3.8377807640622, "grad_norm": 0.0016875889883648163, "learning_rate": 4.639016897081414e-06, "loss": 0.0002, "step": 5000 } ], "logging_steps": 10, "max_steps": 6510, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.088806659181773e+16, "train_batch_size": 12, "trial_name": null, "trial_params": null }