{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.118375, "eval_steps": 500, "global_step": 2900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00125, "grad_norm": 1.0962235927581787, "learning_rate": 1.3499999999999998e-05, "loss": 4.542208099365235, "step": 10 }, { "epoch": 0.0025, "grad_norm": 0.6401466727256775, "learning_rate": 2.8499999999999998e-05, "loss": 4.2099754333496096, "step": 20 }, { "epoch": 0.00375, "grad_norm": 0.5453175902366638, "learning_rate": 4.3499999999999993e-05, "loss": 4.047785949707031, "step": 30 }, { "epoch": 0.005, "grad_norm": 0.48525792360305786, "learning_rate": 5.85e-05, "loss": 3.964551544189453, "step": 40 }, { "epoch": 0.00625, "grad_norm": 0.4704461395740509, "learning_rate": 7.35e-05, "loss": 3.8913909912109377, "step": 50 }, { "epoch": 0.0075, "grad_norm": 0.46138691902160645, "learning_rate": 8.849999999999998e-05, "loss": 3.830010986328125, "step": 60 }, { "epoch": 0.00875, "grad_norm": 0.4441743493080139, "learning_rate": 0.00010349999999999998, "loss": 3.7685489654541016, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.45079436898231506, "learning_rate": 0.0001185, "loss": 3.7476680755615233, "step": 80 }, { "epoch": 0.01125, "grad_norm": 0.4830528497695923, "learning_rate": 0.0001335, "loss": 3.6839160919189453, "step": 90 }, { "epoch": 0.0125, "grad_norm": 0.44532763957977295, "learning_rate": 0.00014849999999999998, "loss": 3.6974281311035155, "step": 100 }, { "epoch": 0.01375, "grad_norm": 0.4425051808357239, "learning_rate": 0.0001635, "loss": 3.6414600372314454, "step": 110 }, { "epoch": 0.015, "grad_norm": 0.44826197624206543, "learning_rate": 0.00017849999999999997, "loss": 3.678114318847656, "step": 120 }, { "epoch": 0.01625, "grad_norm": 0.4449610412120819, "learning_rate": 0.0001935, "loss": 3.620401382446289, "step": 130 }, { "epoch": 0.0175, "grad_norm": 0.4469984173774719, "learning_rate": 0.00020849999999999997, "loss": 3.5873218536376954, "step": 140 }, { "epoch": 0.01875, "grad_norm": 0.43191787600517273, "learning_rate": 0.00022349999999999998, "loss": 3.6034412384033203, "step": 150 }, { "epoch": 0.02, "grad_norm": 0.42065128684043884, "learning_rate": 0.0002385, "loss": 3.5601234436035156, "step": 160 }, { "epoch": 0.02125, "grad_norm": 0.4359514117240906, "learning_rate": 0.0002535, "loss": 3.569548797607422, "step": 170 }, { "epoch": 0.0225, "grad_norm": 0.4924817681312561, "learning_rate": 0.00026849999999999997, "loss": 3.5676345825195312, "step": 180 }, { "epoch": 0.02375, "grad_norm": 0.45101380348205566, "learning_rate": 0.00028349999999999995, "loss": 3.548577880859375, "step": 190 }, { "epoch": 0.025, "grad_norm": 0.43757978081703186, "learning_rate": 0.0002985, "loss": 3.542619323730469, "step": 200 }, { "epoch": 0.02625, "grad_norm": 0.42400574684143066, "learning_rate": 0.00029999901450063963, "loss": 3.519863510131836, "step": 210 }, { "epoch": 0.0275, "grad_norm": 0.4135052263736725, "learning_rate": 0.00029999560785280927, "loss": 3.489061737060547, "step": 220 }, { "epoch": 0.02875, "grad_norm": 0.42550331354141235, "learning_rate": 0.0002999897679451004, "loss": 3.5228813171386717, "step": 230 }, { "epoch": 0.03, "grad_norm": 0.4192274808883667, "learning_rate": 0.0002999814948722491, "loss": 3.500740814208984, "step": 240 }, { "epoch": 0.03125, "grad_norm": 0.4208814799785614, "learning_rate": 0.00029997078876846286, "loss": 3.4928520202636717, "step": 250 }, { "epoch": 0.0325, "grad_norm": 0.4207195043563843, "learning_rate": 0.00029995764980741843, "loss": 3.455915069580078, "step": 260 }, { "epoch": 0.03375, "grad_norm": 0.4257533550262451, "learning_rate": 0.00029994207820225867, "loss": 3.4429149627685547, "step": 270 }, { "epoch": 0.035, "grad_norm": 0.4169400930404663, "learning_rate": 0.0002999240742055895, "loss": 3.423783874511719, "step": 280 }, { "epoch": 0.03625, "grad_norm": 0.4206733703613281, "learning_rate": 0.0002999036381094753, "loss": 3.463421630859375, "step": 290 }, { "epoch": 0.0375, "grad_norm": 0.4171896278858185, "learning_rate": 0.0002998807702454349, "loss": 3.4295799255371096, "step": 300 }, { "epoch": 0.03875, "grad_norm": 0.4365955889225006, "learning_rate": 0.00029985547098443534, "loss": 3.395406723022461, "step": 310 }, { "epoch": 0.04, "grad_norm": 0.405113160610199, "learning_rate": 0.00029982774073688656, "loss": 3.4092079162597657, "step": 320 }, { "epoch": 0.04125, "grad_norm": 0.4027758836746216, "learning_rate": 0.0002997975799526344, "loss": 3.3900100708007814, "step": 330 }, { "epoch": 0.0425, "grad_norm": 0.36929693818092346, "learning_rate": 0.0002997649891209534, "loss": 3.3978004455566406, "step": 340 }, { "epoch": 0.04375, "grad_norm": 0.36560943722724915, "learning_rate": 0.00029972996877053866, "loss": 3.389325714111328, "step": 350 }, { "epoch": 0.045, "grad_norm": 0.381977915763855, "learning_rate": 0.0002996925194694977, "loss": 3.3937278747558595, "step": 360 }, { "epoch": 0.04625, "grad_norm": 0.4034512937068939, "learning_rate": 0.0002996526418253408, "loss": 3.3755035400390625, "step": 370 }, { "epoch": 0.0475, "grad_norm": 0.40442460775375366, "learning_rate": 0.00029961033648497136, "loss": 3.3367355346679686, "step": 380 }, { "epoch": 0.04875, "grad_norm": 0.4017426073551178, "learning_rate": 0.00029956560413467545, "loss": 3.358184051513672, "step": 390 }, { "epoch": 0.05, "grad_norm": 0.39038756489753723, "learning_rate": 0.00029951844550011046, "loss": 3.343119430541992, "step": 400 }, { "epoch": 0.05125, "grad_norm": 0.3888174295425415, "learning_rate": 0.00029946886134629366, "loss": 3.3283355712890623, "step": 410 }, { "epoch": 0.0525, "grad_norm": 0.37037158012390137, "learning_rate": 0.0002994168524775894, "loss": 3.3299789428710938, "step": 420 }, { "epoch": 0.05375, "grad_norm": 0.3683609366416931, "learning_rate": 0.0002993624197376964, "loss": 3.3183124542236326, "step": 430 }, { "epoch": 0.055, "grad_norm": 0.38251495361328125, "learning_rate": 0.00029930556400963374, "loss": 3.3277027130126955, "step": 440 }, { "epoch": 0.05625, "grad_norm": 0.3685538172721863, "learning_rate": 0.00029924628621572693, "loss": 3.3209506988525392, "step": 450 }, { "epoch": 0.0575, "grad_norm": 0.40296629071235657, "learning_rate": 0.0002991845873175927, "loss": 3.328449249267578, "step": 460 }, { "epoch": 0.05875, "grad_norm": 0.38991594314575195, "learning_rate": 0.0002991204683161233, "loss": 3.3068389892578125, "step": 470 }, { "epoch": 0.06, "grad_norm": 0.39050525426864624, "learning_rate": 0.00029905393025147044, "loss": 3.3071823120117188, "step": 480 }, { "epoch": 0.06125, "grad_norm": 0.3846362829208374, "learning_rate": 0.00029898497420302855, "loss": 3.280712127685547, "step": 490 }, { "epoch": 0.0625, "grad_norm": 0.3796476125717163, "learning_rate": 0.0002989136012894168, "loss": 3.2889366149902344, "step": 500 }, { "epoch": 0.06375, "grad_norm": 0.36522796750068665, "learning_rate": 0.0002988398126684615, "loss": 3.291548156738281, "step": 510 }, { "epoch": 0.065, "grad_norm": 0.3771977722644806, "learning_rate": 0.000298763609537177, "loss": 3.2864120483398436, "step": 520 }, { "epoch": 0.06625, "grad_norm": 0.35490748286247253, "learning_rate": 0.00029868499313174624, "loss": 3.285882568359375, "step": 530 }, { "epoch": 0.0675, "grad_norm": 0.37846773862838745, "learning_rate": 0.00029860396472750083, "loss": 3.279650115966797, "step": 540 }, { "epoch": 0.06875, "grad_norm": 0.38366758823394775, "learning_rate": 0.0002985205256389005, "loss": 3.2765472412109373, "step": 550 }, { "epoch": 0.07, "grad_norm": 0.3582383990287781, "learning_rate": 0.0002984346772195113, "loss": 3.283465576171875, "step": 560 }, { "epoch": 0.07125, "grad_norm": 0.35587790608406067, "learning_rate": 0.00029834642086198427, "loss": 3.271703338623047, "step": 570 }, { "epoch": 0.0725, "grad_norm": 0.37970319390296936, "learning_rate": 0.0002982557579980322, "loss": 3.245616912841797, "step": 580 }, { "epoch": 0.07375, "grad_norm": 0.3639717400074005, "learning_rate": 0.000298162690098407, "loss": 3.258502960205078, "step": 590 }, { "epoch": 0.075, "grad_norm": 0.36604151129722595, "learning_rate": 0.0002980672186728754, "loss": 3.2504783630371095, "step": 600 }, { "epoch": 0.07625, "grad_norm": 0.3567405343055725, "learning_rate": 0.0002979693452701947, "loss": 3.2632408142089844, "step": 610 }, { "epoch": 0.0775, "grad_norm": 0.3561687171459198, "learning_rate": 0.0002978690714780875, "loss": 3.2412956237792967, "step": 620 }, { "epoch": 0.07875, "grad_norm": 0.3695826232433319, "learning_rate": 0.00029776639892321606, "loss": 3.2321800231933593, "step": 630 }, { "epoch": 0.08, "grad_norm": 0.3597981035709381, "learning_rate": 0.000297661329271156, "loss": 3.205059814453125, "step": 640 }, { "epoch": 0.08125, "grad_norm": 0.3703027367591858, "learning_rate": 0.00029755386422636884, "loss": 3.2197303771972656, "step": 650 }, { "epoch": 0.0825, "grad_norm": 0.34924793243408203, "learning_rate": 0.000297444005532175, "loss": 3.1991270065307615, "step": 660 }, { "epoch": 0.08375, "grad_norm": 0.3698681592941284, "learning_rate": 0.0002973317549707249, "loss": 3.2529460906982424, "step": 670 }, { "epoch": 0.085, "grad_norm": 0.3563082814216614, "learning_rate": 0.0002972171143629705, "loss": 3.2158493041992187, "step": 680 }, { "epoch": 0.08625, "grad_norm": 0.34349241852760315, "learning_rate": 0.0002971000855686355, "loss": 3.1953773498535156, "step": 690 }, { "epoch": 0.0875, "grad_norm": 0.35246145725250244, "learning_rate": 0.00029698067048618536, "loss": 3.1936809539794924, "step": 700 }, { "epoch": 0.08875, "grad_norm": 0.372384637594223, "learning_rate": 0.00029685887105279624, "loss": 3.1743907928466797, "step": 710 }, { "epoch": 0.09, "grad_norm": 0.36444413661956787, "learning_rate": 0.0002967346892443239, "loss": 3.1915206909179688, "step": 720 }, { "epoch": 0.09125, "grad_norm": 0.351936012506485, "learning_rate": 0.00029660812707527133, "loss": 3.2071929931640626, "step": 730 }, { "epoch": 0.0925, "grad_norm": 0.34430351853370667, "learning_rate": 0.00029647918659875635, "loss": 3.181209182739258, "step": 740 }, { "epoch": 0.09375, "grad_norm": 0.35013672709465027, "learning_rate": 0.0002963478699064781, "loss": 3.1906646728515624, "step": 750 }, { "epoch": 0.095, "grad_norm": 0.34257224202156067, "learning_rate": 0.00029621417912868323, "loss": 3.228814697265625, "step": 760 }, { "epoch": 0.09625, "grad_norm": 0.3684031367301941, "learning_rate": 0.00029607811643413135, "loss": 3.1923717498779296, "step": 770 }, { "epoch": 0.0975, "grad_norm": 0.3443935513496399, "learning_rate": 0.0002959396840300596, "loss": 3.19796085357666, "step": 780 }, { "epoch": 0.09875, "grad_norm": 0.35859978199005127, "learning_rate": 0.0002957988841621472, "loss": 3.2059059143066406, "step": 790 }, { "epoch": 0.1, "grad_norm": 0.35764068365097046, "learning_rate": 0.00029565571911447893, "loss": 3.197886276245117, "step": 800 }, { "epoch": 0.10125, "grad_norm": 0.34071439504623413, "learning_rate": 0.0002955101912095078, "loss": 3.1616336822509767, "step": 810 }, { "epoch": 0.1025, "grad_norm": 0.3420080244541168, "learning_rate": 0.00029536230280801767, "loss": 3.1601337432861327, "step": 820 }, { "epoch": 0.10375, "grad_norm": 0.33847373723983765, "learning_rate": 0.000295212056309085, "loss": 3.1570825576782227, "step": 830 }, { "epoch": 0.105, "grad_norm": 0.33834919333457947, "learning_rate": 0.00029505945415003954, "loss": 3.1625667572021485, "step": 840 }, { "epoch": 0.10625, "grad_norm": 0.33340954780578613, "learning_rate": 0.0002949044988064253, "loss": 3.180934524536133, "step": 850 }, { "epoch": 0.1075, "grad_norm": 0.36460039019584656, "learning_rate": 0.0002947471927919599, "loss": 3.157451629638672, "step": 860 }, { "epoch": 0.10875, "grad_norm": 0.3350942134857178, "learning_rate": 0.00029458753865849424, "loss": 3.16195068359375, "step": 870 }, { "epoch": 0.11, "grad_norm": 0.3496924936771393, "learning_rate": 0.00029442553899597075, "loss": 3.151328468322754, "step": 880 }, { "epoch": 0.11125, "grad_norm": 0.33595219254493713, "learning_rate": 0.0002942611964323817, "loss": 3.1474483489990233, "step": 890 }, { "epoch": 0.1125, "grad_norm": 0.34623855352401733, "learning_rate": 0.00029409451363372605, "loss": 3.151190757751465, "step": 900 }, { "epoch": 0.11375, "grad_norm": 0.3318001925945282, "learning_rate": 0.00029392549330396696, "loss": 3.166637420654297, "step": 910 }, { "epoch": 0.115, "grad_norm": 0.3412299156188965, "learning_rate": 0.0002937541381849872, "loss": 3.1225704193115233, "step": 920 }, { "epoch": 0.11625, "grad_norm": 0.3505077064037323, "learning_rate": 0.000293580451056545, "loss": 3.1353290557861326, "step": 930 }, { "epoch": 0.1175, "grad_norm": 0.36463502049446106, "learning_rate": 0.00029340443473622915, "loss": 3.136840057373047, "step": 940 }, { "epoch": 0.11875, "grad_norm": 0.3351852297782898, "learning_rate": 0.00029322609207941283, "loss": 3.13529109954834, "step": 950 }, { "epoch": 0.12, "grad_norm": 0.34219565987586975, "learning_rate": 0.00029304542597920766, "loss": 3.158438873291016, "step": 960 }, { "epoch": 0.12125, "grad_norm": 0.32809194922447205, "learning_rate": 0.0002928624393664166, "loss": 3.1299543380737305, "step": 970 }, { "epoch": 0.1225, "grad_norm": 0.3327496647834778, "learning_rate": 0.00029267713520948643, "loss": 3.127825164794922, "step": 980 }, { "epoch": 0.12375, "grad_norm": 0.34798717498779297, "learning_rate": 0.00029248951651445973, "loss": 3.1128513336181642, "step": 990 }, { "epoch": 0.125, "grad_norm": 0.3409072756767273, "learning_rate": 0.0002922995863249258, "loss": 3.0872947692871096, "step": 1000 }, { "epoch": 0.12625, "grad_norm": 0.3241552412509918, "learning_rate": 0.00029210734772197166, "loss": 3.135254669189453, "step": 1010 }, { "epoch": 0.1275, "grad_norm": 0.33775585889816284, "learning_rate": 0.0002919128038241318, "loss": 3.1159934997558594, "step": 1020 }, { "epoch": 0.12875, "grad_norm": 0.3477814197540283, "learning_rate": 0.0002917159577873377, "loss": 3.1170124053955077, "step": 1030 }, { "epoch": 0.13, "grad_norm": 0.34592533111572266, "learning_rate": 0.00029151681280486656, "loss": 3.108189010620117, "step": 1040 }, { "epoch": 0.13125, "grad_norm": 0.3351367712020874, "learning_rate": 0.00029131537210728975, "loss": 3.1113187789916994, "step": 1050 }, { "epoch": 0.1325, "grad_norm": 0.354574054479599, "learning_rate": 0.00029111163896241996, "loss": 3.114427947998047, "step": 1060 }, { "epoch": 0.13375, "grad_norm": 0.3495411276817322, "learning_rate": 0.0002909056166752586, "loss": 3.099652862548828, "step": 1070 }, { "epoch": 0.135, "grad_norm": 0.32834145426750183, "learning_rate": 0.0002906973085879419, "loss": 3.1386039733886717, "step": 1080 }, { "epoch": 0.13625, "grad_norm": 0.3264714181423187, "learning_rate": 0.000290486718079687, "loss": 3.0912006378173826, "step": 1090 }, { "epoch": 0.1375, "grad_norm": 0.3366532623767853, "learning_rate": 0.0002902738485667367, "loss": 3.066594123840332, "step": 1100 }, { "epoch": 0.13875, "grad_norm": 0.3517299294471741, "learning_rate": 0.00029005870350230453, "loss": 3.1176836013793947, "step": 1110 }, { "epoch": 0.14, "grad_norm": 0.3267839550971985, "learning_rate": 0.00028984128637651825, "loss": 3.0968740463256834, "step": 1120 }, { "epoch": 0.14125, "grad_norm": 0.3284844160079956, "learning_rate": 0.0002896216007163637, "loss": 3.0969438552856445, "step": 1130 }, { "epoch": 0.1425, "grad_norm": 0.32528674602508545, "learning_rate": 0.0002893996500856272, "loss": 3.1029247283935546, "step": 1140 }, { "epoch": 0.14375, "grad_norm": 0.3221900463104248, "learning_rate": 0.00028917543808483796, "loss": 3.1135093688964846, "step": 1150 }, { "epoch": 0.145, "grad_norm": 0.33406203985214233, "learning_rate": 0.0002889489683512096, "loss": 3.0768909454345703, "step": 1160 }, { "epoch": 0.14625, "grad_norm": 0.3397602140903473, "learning_rate": 0.0002887202445585811, "loss": 3.0875574111938477, "step": 1170 }, { "epoch": 0.1475, "grad_norm": 0.33265799283981323, "learning_rate": 0.0002884892704173573, "loss": 3.0731714248657225, "step": 1180 }, { "epoch": 0.14875, "grad_norm": 0.32758811116218567, "learning_rate": 0.00028825604967444866, "loss": 3.12725772857666, "step": 1190 }, { "epoch": 0.15, "grad_norm": 0.3414333164691925, "learning_rate": 0.0002880205861132105, "loss": 3.0664628982543944, "step": 1200 }, { "epoch": 0.15125, "grad_norm": 0.33994749188423157, "learning_rate": 0.00028778288355338144, "loss": 3.0672843933105467, "step": 1210 }, { "epoch": 0.1525, "grad_norm": 0.33031970262527466, "learning_rate": 0.0002875429458510219, "loss": 3.1195865631103517, "step": 1220 }, { "epoch": 0.15375, "grad_norm": 0.3435764014720917, "learning_rate": 0.0002873007768984511, "loss": 3.0530136108398436, "step": 1230 }, { "epoch": 0.155, "grad_norm": 0.32233837246894836, "learning_rate": 0.00028705638062418386, "loss": 3.076219177246094, "step": 1240 }, { "epoch": 0.15625, "grad_norm": 0.32135722041130066, "learning_rate": 0.0002868097609928674, "loss": 3.051331329345703, "step": 1250 }, { "epoch": 0.1575, "grad_norm": 0.3371255099773407, "learning_rate": 0.0002865609220052165, "loss": 3.060790252685547, "step": 1260 }, { "epoch": 0.15875, "grad_norm": 0.344938188791275, "learning_rate": 0.000286309867697949, "loss": 3.0386356353759765, "step": 1270 }, { "epoch": 0.16, "grad_norm": 0.3418465256690979, "learning_rate": 0.0002860566021437197, "loss": 3.0650426864624025, "step": 1280 }, { "epoch": 0.16125, "grad_norm": 0.3343369960784912, "learning_rate": 0.0002858011294510552, "loss": 3.04056396484375, "step": 1290 }, { "epoch": 0.1625, "grad_norm": 0.34760987758636475, "learning_rate": 0.0002855434537642865, "loss": 3.057343292236328, "step": 1300 }, { "epoch": 0.16375, "grad_norm": 0.3561476767063141, "learning_rate": 0.000285283579263482, "loss": 3.0452644348144533, "step": 1310 }, { "epoch": 0.165, "grad_norm": 0.3393990993499756, "learning_rate": 0.00028502151016437986, "loss": 3.0280082702636717, "step": 1320 }, { "epoch": 0.16625, "grad_norm": 0.34054502844810486, "learning_rate": 0.0002847572507183193, "loss": 3.05167293548584, "step": 1330 }, { "epoch": 0.1675, "grad_norm": 0.33443859219551086, "learning_rate": 0.000284490805212172, "loss": 3.090429496765137, "step": 1340 }, { "epoch": 0.16875, "grad_norm": 0.33417847752571106, "learning_rate": 0.00028422217796827216, "loss": 3.064510726928711, "step": 1350 }, { "epoch": 0.17, "grad_norm": 0.33783411979675293, "learning_rate": 0.00028395137334434676, "loss": 3.03808479309082, "step": 1360 }, { "epoch": 0.17125, "grad_norm": 0.3298597037792206, "learning_rate": 0.00028367839573344454, "loss": 3.0556320190429687, "step": 1370 }, { "epoch": 0.1725, "grad_norm": 0.3494487702846527, "learning_rate": 0.000283403249563865, "loss": 3.039119338989258, "step": 1380 }, { "epoch": 0.17375, "grad_norm": 0.3314499258995056, "learning_rate": 0.0002831259392990864, "loss": 3.0571495056152345, "step": 1390 }, { "epoch": 0.175, "grad_norm": 0.32631996273994446, "learning_rate": 0.00028284646943769337, "loss": 3.0531475067138674, "step": 1400 }, { "epoch": 0.17625, "grad_norm": 0.32792332768440247, "learning_rate": 0.00028256484451330403, "loss": 3.026413917541504, "step": 1410 }, { "epoch": 0.1775, "grad_norm": 0.31503939628601074, "learning_rate": 0.0002822810690944963, "loss": 3.052714538574219, "step": 1420 }, { "epoch": 0.17875, "grad_norm": 0.33248430490493774, "learning_rate": 0.000281995147784734, "loss": 3.050276184082031, "step": 1430 }, { "epoch": 0.18, "grad_norm": 0.3251739740371704, "learning_rate": 0.0002817070852222918, "loss": 3.057173156738281, "step": 1440 }, { "epoch": 0.18125, "grad_norm": 0.32668808102607727, "learning_rate": 0.0002814168860801806, "loss": 3.0323795318603515, "step": 1450 }, { "epoch": 0.1825, "grad_norm": 0.3308984339237213, "learning_rate": 0.0002811245550660709, "loss": 3.0549407958984376, "step": 1460 }, { "epoch": 0.18375, "grad_norm": 0.33381035923957825, "learning_rate": 0.0002808300969222172, "loss": 3.038156509399414, "step": 1470 }, { "epoch": 0.185, "grad_norm": 0.3257412016391754, "learning_rate": 0.0002805335164253806, "loss": 2.9872032165527345, "step": 1480 }, { "epoch": 0.18625, "grad_norm": 0.33056777715682983, "learning_rate": 0.0002802348183867514, "loss": 3.0133747100830077, "step": 1490 }, { "epoch": 0.1875, "grad_norm": 0.3441354036331177, "learning_rate": 0.00027993400765187124, "loss": 2.9739742279052734, "step": 1500 }, { "epoch": 0.18875, "grad_norm": 0.33665338158607483, "learning_rate": 0.0002796310891005542, "loss": 3.0187236785888674, "step": 1510 }, { "epoch": 0.19, "grad_norm": 0.3456818163394928, "learning_rate": 0.00027932606764680796, "loss": 3.000716781616211, "step": 1520 }, { "epoch": 0.19125, "grad_norm": 0.31569328904151917, "learning_rate": 0.00027901894823875387, "loss": 3.003590774536133, "step": 1530 }, { "epoch": 0.1925, "grad_norm": 0.33296287059783936, "learning_rate": 0.00027870973585854665, "loss": 3.010061264038086, "step": 1540 }, { "epoch": 0.19375, "grad_norm": 0.3256262242794037, "learning_rate": 0.0002783984355222937, "loss": 3.009804534912109, "step": 1550 }, { "epoch": 0.195, "grad_norm": 0.33614271879196167, "learning_rate": 0.0002780850522799737, "loss": 3.0124454498291016, "step": 1560 }, { "epoch": 0.19625, "grad_norm": 0.3145756721496582, "learning_rate": 0.00027776959121535464, "loss": 3.0317821502685547, "step": 1570 }, { "epoch": 0.1975, "grad_norm": 0.33354440331459045, "learning_rate": 0.0002774520574459113, "loss": 2.9961336135864256, "step": 1580 }, { "epoch": 0.19875, "grad_norm": 0.33014026284217834, "learning_rate": 0.00027713245612274247, "loss": 2.996552658081055, "step": 1590 }, { "epoch": 0.2, "grad_norm": 0.3215552866458893, "learning_rate": 0.00027681079243048717, "loss": 3.0088550567626955, "step": 1600 }, { "epoch": 0.20125, "grad_norm": 0.33168256282806396, "learning_rate": 0.0002764870715872405, "loss": 3.017764663696289, "step": 1610 }, { "epoch": 0.2025, "grad_norm": 0.3189021646976471, "learning_rate": 0.00027616129884446916, "loss": 3.013848876953125, "step": 1620 }, { "epoch": 0.20375, "grad_norm": 0.3378245532512665, "learning_rate": 0.0002758334794869262, "loss": 2.9824432373046874, "step": 1630 }, { "epoch": 0.205, "grad_norm": 0.3281719386577606, "learning_rate": 0.00027550361883256535, "loss": 2.998569679260254, "step": 1640 }, { "epoch": 0.20625, "grad_norm": 0.3160180151462555, "learning_rate": 0.00027517172223245445, "loss": 2.9816024780273436, "step": 1650 }, { "epoch": 0.2075, "grad_norm": 0.33755186200141907, "learning_rate": 0.00027483779507068913, "loss": 2.9858730316162108, "step": 1660 }, { "epoch": 0.20875, "grad_norm": 0.3391866683959961, "learning_rate": 0.0002745018427643051, "loss": 3.0096504211425783, "step": 1670 }, { "epoch": 0.21, "grad_norm": 0.31814366579055786, "learning_rate": 0.00027416387076319035, "loss": 3.0215930938720703, "step": 1680 }, { "epoch": 0.21125, "grad_norm": 0.33236604928970337, "learning_rate": 0.00027382388454999686, "loss": 2.9959911346435546, "step": 1690 }, { "epoch": 0.2125, "grad_norm": 0.31699317693710327, "learning_rate": 0.00027348188964005147, "loss": 2.9976850509643556, "step": 1700 }, { "epoch": 0.21375, "grad_norm": 0.3221769630908966, "learning_rate": 0.00027313789158126667, "loss": 2.992999267578125, "step": 1710 }, { "epoch": 0.215, "grad_norm": 0.3359137773513794, "learning_rate": 0.00027279189595405036, "loss": 3.005726623535156, "step": 1720 }, { "epoch": 0.21625, "grad_norm": 0.3230113685131073, "learning_rate": 0.0002724439083712153, "loss": 2.9472639083862306, "step": 1730 }, { "epoch": 0.2175, "grad_norm": 0.3287331461906433, "learning_rate": 0.00027209393447788835, "loss": 2.979531478881836, "step": 1740 }, { "epoch": 0.21875, "grad_norm": 0.33660778403282166, "learning_rate": 0.00027174197995141866, "loss": 2.972854232788086, "step": 1750 }, { "epoch": 0.22, "grad_norm": 0.3239246904850006, "learning_rate": 0.0002713880505012855, "loss": 2.9944473266601563, "step": 1760 }, { "epoch": 0.22125, "grad_norm": 0.33757129311561584, "learning_rate": 0.00027103215186900597, "loss": 2.984470748901367, "step": 1770 }, { "epoch": 0.2225, "grad_norm": 0.3297254741191864, "learning_rate": 0.0002706742898280415, "loss": 2.9779335021972657, "step": 1780 }, { "epoch": 0.22375, "grad_norm": 0.3173220753669739, "learning_rate": 0.0002703144701837044, "loss": 2.989591598510742, "step": 1790 }, { "epoch": 0.225, "grad_norm": 0.3255234658718109, "learning_rate": 0.00026995269877306356, "loss": 2.9630172729492186, "step": 1800 }, { "epoch": 0.22625, "grad_norm": 0.31826311349868774, "learning_rate": 0.0002695889814648499, "loss": 2.949393463134766, "step": 1810 }, { "epoch": 0.2275, "grad_norm": 0.3174227774143219, "learning_rate": 0.00026922332415936116, "loss": 2.962572479248047, "step": 1820 }, { "epoch": 0.22875, "grad_norm": 0.3316509425640106, "learning_rate": 0.0002688557327883659, "loss": 2.940341567993164, "step": 1830 }, { "epoch": 0.23, "grad_norm": 0.3224658668041229, "learning_rate": 0.00026848621331500766, "loss": 2.96124267578125, "step": 1840 }, { "epoch": 0.23125, "grad_norm": 0.32638388872146606, "learning_rate": 0.00026811477173370815, "loss": 2.967144775390625, "step": 1850 }, { "epoch": 0.2325, "grad_norm": 0.3130776286125183, "learning_rate": 0.0002677414140700696, "loss": 2.946596145629883, "step": 1860 }, { "epoch": 0.23375, "grad_norm": 0.3233889937400818, "learning_rate": 0.0002673661463807776, "loss": 2.967226028442383, "step": 1870 }, { "epoch": 0.235, "grad_norm": 0.32231318950653076, "learning_rate": 0.00026698897475350254, "loss": 2.9710044860839844, "step": 1880 }, { "epoch": 0.23625, "grad_norm": 0.32844141125679016, "learning_rate": 0.0002666099053068007, "loss": 2.98897762298584, "step": 1890 }, { "epoch": 0.2375, "grad_norm": 0.32306230068206787, "learning_rate": 0.00026622894419001537, "loss": 2.978479766845703, "step": 1900 }, { "epoch": 0.23875, "grad_norm": 0.3146902620792389, "learning_rate": 0.0002658460975831769, "loss": 2.9550348281860352, "step": 1910 }, { "epoch": 0.24, "grad_norm": 0.3284965753555298, "learning_rate": 0.00026546137169690235, "loss": 2.954052734375, "step": 1920 }, { "epoch": 0.24125, "grad_norm": 0.3169645071029663, "learning_rate": 0.00026507477277229496, "loss": 2.9700557708740236, "step": 1930 }, { "epoch": 0.2425, "grad_norm": 0.32831114530563354, "learning_rate": 0.0002646863070808425, "loss": 2.965903091430664, "step": 1940 }, { "epoch": 0.24375, "grad_norm": 0.3175643980503082, "learning_rate": 0.0002642959809243163, "loss": 2.9406291961669924, "step": 1950 }, { "epoch": 1.000875, "grad_norm": 0.3207930624485016, "learning_rate": 0.00026390380063466806, "loss": 3.2728614807128906, "step": 1960 }, { "epoch": 1.002125, "grad_norm": 0.33733275532722473, "learning_rate": 0.000263509772573928, "loss": 2.9439430236816406, "step": 1970 }, { "epoch": 1.003375, "grad_norm": 0.3284999132156372, "learning_rate": 0.00026311390313410097, "loss": 2.9343544006347657, "step": 1980 }, { "epoch": 1.004625, "grad_norm": 0.31731417775154114, "learning_rate": 0.0002627161987370632, "loss": 2.9394100189208983, "step": 1990 }, { "epoch": 1.005875, "grad_norm": 0.3323110044002533, "learning_rate": 0.000262316665834458, "loss": 2.932010269165039, "step": 2000 }, { "epoch": 1.007125, "grad_norm": 0.3103737533092499, "learning_rate": 0.000261915310907591, "loss": 2.9386672973632812, "step": 2010 }, { "epoch": 1.008375, "grad_norm": 0.311427503824234, "learning_rate": 0.0002615121404673251, "loss": 2.893377113342285, "step": 2020 }, { "epoch": 1.009625, "grad_norm": 0.31239134073257446, "learning_rate": 0.00026110716105397485, "loss": 2.9058563232421877, "step": 2030 }, { "epoch": 1.010875, "grad_norm": 0.31634724140167236, "learning_rate": 0.0002607003792372004, "loss": 2.9108051300048827, "step": 2040 }, { "epoch": 1.012125, "grad_norm": 0.3243110775947571, "learning_rate": 0.00026029180161590067, "loss": 2.919489288330078, "step": 2050 }, { "epoch": 1.013375, "grad_norm": 0.3139530420303345, "learning_rate": 0.0002598814348181068, "loss": 2.901732635498047, "step": 2060 }, { "epoch": 1.014625, "grad_norm": 0.33199912309646606, "learning_rate": 0.000259469285500874, "loss": 2.9169565200805665, "step": 2070 }, { "epoch": 1.015875, "grad_norm": 0.31367039680480957, "learning_rate": 0.0002590553603501741, "loss": 2.8976207733154298, "step": 2080 }, { "epoch": 1.017125, "grad_norm": 0.3230586349964142, "learning_rate": 0.00025863966608078673, "loss": 2.882274055480957, "step": 2090 }, { "epoch": 1.018375, "grad_norm": 0.313128262758255, "learning_rate": 0.0002582222094361907, "loss": 2.8793052673339843, "step": 2100 }, { "epoch": 1.019625, "grad_norm": 0.31868046522140503, "learning_rate": 0.00025780299718845416, "loss": 2.8837726593017576, "step": 2110 }, { "epoch": 1.020875, "grad_norm": 0.32915839552879333, "learning_rate": 0.00025738203613812543, "loss": 2.8813737869262694, "step": 2120 }, { "epoch": 1.022125, "grad_norm": 0.3112785220146179, "learning_rate": 0.0002569593331141218, "loss": 2.8455078125, "step": 2130 }, { "epoch": 1.023375, "grad_norm": 0.29719340801239014, "learning_rate": 0.0002565348949736196, "loss": 2.8642330169677734, "step": 2140 }, { "epoch": 1.024625, "grad_norm": 0.3277027904987335, "learning_rate": 0.0002561087286019424, "loss": 2.8876218795776367, "step": 2150 }, { "epoch": 1.025875, "grad_norm": 0.3183266520500183, "learning_rate": 0.0002556808409124494, "loss": 2.883536720275879, "step": 2160 }, { "epoch": 1.027125, "grad_norm": 0.3155848979949951, "learning_rate": 0.00025525123884642366, "loss": 2.862305450439453, "step": 2170 }, { "epoch": 1.028375, "grad_norm": 0.3132951855659485, "learning_rate": 0.000254819929372959, "loss": 2.856226348876953, "step": 2180 }, { "epoch": 1.029625, "grad_norm": 0.3311610221862793, "learning_rate": 0.0002543869194888471, "loss": 2.854077911376953, "step": 2190 }, { "epoch": 1.030875, "grad_norm": 0.31400758028030396, "learning_rate": 0.00025395221621846435, "loss": 2.8614845275878906, "step": 2200 }, { "epoch": 1.032125, "grad_norm": 0.30809491872787476, "learning_rate": 0.00025351582661365724, "loss": 2.849281883239746, "step": 2210 }, { "epoch": 1.033375, "grad_norm": 0.3170127272605896, "learning_rate": 0.00025307775775362855, "loss": 2.8551118850708006, "step": 2220 }, { "epoch": 1.034625, "grad_norm": 0.33024269342422485, "learning_rate": 0.0002526380167448223, "loss": 2.867789459228516, "step": 2230 }, { "epoch": 1.035875, "grad_norm": 0.31417202949523926, "learning_rate": 0.0002521966107208084, "loss": 2.841164779663086, "step": 2240 }, { "epoch": 1.037125, "grad_norm": 0.31379812955856323, "learning_rate": 0.0002517535468421669, "loss": 2.8253313064575196, "step": 2250 }, { "epoch": 1.038375, "grad_norm": 0.32265710830688477, "learning_rate": 0.00025130883229637196, "loss": 2.7978553771972656, "step": 2260 }, { "epoch": 1.039625, "grad_norm": 0.3156164288520813, "learning_rate": 0.0002508624742976753, "loss": 2.8533695220947264, "step": 2270 }, { "epoch": 1.040875, "grad_norm": 0.31064850091934204, "learning_rate": 0.000250414480086989, "loss": 2.8257659912109374, "step": 2280 }, { "epoch": 1.042125, "grad_norm": 0.31829559803009033, "learning_rate": 0.00024996485693176815, "loss": 2.8458267211914063, "step": 2290 }, { "epoch": 1.043375, "grad_norm": 0.3130020499229431, "learning_rate": 0.0002495136121258928, "loss": 2.823675537109375, "step": 2300 }, { "epoch": 1.044625, "grad_norm": 0.31633248925209045, "learning_rate": 0.0002490607529895499, "loss": 2.831635093688965, "step": 2310 }, { "epoch": 1.045875, "grad_norm": 0.3220139741897583, "learning_rate": 0.00024860628686911436, "loss": 2.8292957305908204, "step": 2320 }, { "epoch": 1.047125, "grad_norm": 0.3184780180454254, "learning_rate": 0.0002481502211370298, "loss": 2.813566207885742, "step": 2330 }, { "epoch": 1.048375, "grad_norm": 0.3212302625179291, "learning_rate": 0.00024769256319168923, "loss": 2.820956993103027, "step": 2340 }, { "epoch": 1.049625, "grad_norm": 0.31229448318481445, "learning_rate": 0.00024723332045731484, "loss": 2.8484216690063477, "step": 2350 }, { "epoch": 1.050875, "grad_norm": 0.3293566107749939, "learning_rate": 0.0002467725003838375, "loss": 2.822405242919922, "step": 2360 }, { "epoch": 1.052125, "grad_norm": 0.32199931144714355, "learning_rate": 0.00024631011044677615, "loss": 2.8012535095214846, "step": 2370 }, { "epoch": 1.053375, "grad_norm": 0.3443029820919037, "learning_rate": 0.00024584615814711626, "loss": 2.8285900115966798, "step": 2380 }, { "epoch": 1.054625, "grad_norm": 0.31065329909324646, "learning_rate": 0.00024538065101118833, "loss": 2.8233985900878906, "step": 2390 }, { "epoch": 1.055875, "grad_norm": 0.31999537348747253, "learning_rate": 0.0002449135965905457, "loss": 2.819304656982422, "step": 2400 }, { "epoch": 1.057125, "grad_norm": 0.3119924068450928, "learning_rate": 0.0002444450024618422, "loss": 2.794045257568359, "step": 2410 }, { "epoch": 1.058375, "grad_norm": 0.30880260467529297, "learning_rate": 0.00024397487622670894, "loss": 2.8043006896972655, "step": 2420 }, { "epoch": 1.059625, "grad_norm": 0.33332186937332153, "learning_rate": 0.0002435032255116313, "loss": 2.795323371887207, "step": 2430 }, { "epoch": 1.060875, "grad_norm": 0.313965767621994, "learning_rate": 0.00024303005796782508, "loss": 2.823124313354492, "step": 2440 }, { "epoch": 1.062125, "grad_norm": 0.3164885938167572, "learning_rate": 0.0002425553812711123, "loss": 2.8160400390625, "step": 2450 }, { "epoch": 1.063375, "grad_norm": 0.3221667408943176, "learning_rate": 0.00024207920312179686, "loss": 2.7901626586914063, "step": 2460 }, { "epoch": 1.064625, "grad_norm": 0.31535276770591736, "learning_rate": 0.0002416015312445396, "loss": 2.7759862899780274, "step": 2470 }, { "epoch": 1.065875, "grad_norm": 0.3354146480560303, "learning_rate": 0.0002411223733882328, "loss": 2.7960500717163086, "step": 2480 }, { "epoch": 1.067125, "grad_norm": 0.36151498556137085, "learning_rate": 0.0002406417373258746, "loss": 2.787026596069336, "step": 2490 }, { "epoch": 1.068375, "grad_norm": 0.32125601172447205, "learning_rate": 0.00024015963085444297, "loss": 2.8015857696533204, "step": 2500 }, { "epoch": 1.069625, "grad_norm": 0.31707149744033813, "learning_rate": 0.00023967606179476914, "loss": 2.7936481475830077, "step": 2510 }, { "epoch": 1.070875, "grad_norm": 0.3259861171245575, "learning_rate": 0.00023919103799141078, "loss": 2.7787807464599608, "step": 2520 }, { "epoch": 1.072125, "grad_norm": 0.32442429661750793, "learning_rate": 0.00023870456731252466, "loss": 2.7599460601806642, "step": 2530 }, { "epoch": 1.073375, "grad_norm": 0.3134000599384308, "learning_rate": 0.0002382166576497391, "loss": 2.802600288391113, "step": 2540 }, { "epoch": 1.074625, "grad_norm": 0.31214284896850586, "learning_rate": 0.00023772731691802583, "loss": 2.783060073852539, "step": 2550 }, { "epoch": 1.075875, "grad_norm": 0.31292805075645447, "learning_rate": 0.00023723655305557187, "loss": 2.766581153869629, "step": 2560 }, { "epoch": 1.077125, "grad_norm": 0.3185879588127136, "learning_rate": 0.0002367443740236504, "loss": 2.7777988433837892, "step": 2570 }, { "epoch": 1.078375, "grad_norm": 0.33435672521591187, "learning_rate": 0.00023625078780649178, "loss": 2.783607292175293, "step": 2580 }, { "epoch": 1.079625, "grad_norm": 0.32837480306625366, "learning_rate": 0.00023575580241115408, "loss": 2.778690719604492, "step": 2590 }, { "epoch": 1.080875, "grad_norm": 0.3214893639087677, "learning_rate": 0.00023525942586739309, "loss": 2.7992782592773438, "step": 2600 }, { "epoch": 1.082125, "grad_norm": 0.31656068563461304, "learning_rate": 0.00023476166622753212, "loss": 2.7780582427978517, "step": 2610 }, { "epoch": 1.083375, "grad_norm": 0.32466331124305725, "learning_rate": 0.0002342625315663314, "loss": 2.756012535095215, "step": 2620 }, { "epoch": 1.084625, "grad_norm": 0.33582648634910583, "learning_rate": 0.0002337620299808569, "loss": 2.791756820678711, "step": 2630 }, { "epoch": 1.085875, "grad_norm": 0.320222944021225, "learning_rate": 0.00023326016959034922, "loss": 2.7860565185546875, "step": 2640 }, { "epoch": 1.087125, "grad_norm": 0.30881232023239136, "learning_rate": 0.00023275695853609184, "loss": 2.765717315673828, "step": 2650 }, { "epoch": 1.088375, "grad_norm": 0.32089343667030334, "learning_rate": 0.00023225240498127883, "loss": 2.798817825317383, "step": 2660 }, { "epoch": 1.089625, "grad_norm": 0.3162356913089752, "learning_rate": 0.00023174651711088272, "loss": 2.7743175506591795, "step": 2670 }, { "epoch": 1.090875, "grad_norm": 0.3252812623977661, "learning_rate": 0.0002312393031315215, "loss": 2.74603271484375, "step": 2680 }, { "epoch": 1.092125, "grad_norm": 0.32010287046432495, "learning_rate": 0.00023073077127132562, "loss": 2.7529132843017576, "step": 2690 }, { "epoch": 1.093375, "grad_norm": 0.31325286626815796, "learning_rate": 0.00023022092977980442, "loss": 2.759819984436035, "step": 2700 }, { "epoch": 1.094625, "grad_norm": 0.31815576553344727, "learning_rate": 0.00022970978692771242, "loss": 2.762770080566406, "step": 2710 }, { "epoch": 1.095875, "grad_norm": 0.3144858181476593, "learning_rate": 0.00022919735100691504, "loss": 2.7560394287109373, "step": 2720 }, { "epoch": 1.097125, "grad_norm": 0.3316778540611267, "learning_rate": 0.00022868363033025406, "loss": 2.7469573974609376, "step": 2730 }, { "epoch": 1.098375, "grad_norm": 0.3257002830505371, "learning_rate": 0.000228168633231413, "loss": 2.743033218383789, "step": 2740 }, { "epoch": 1.099625, "grad_norm": 0.31909677386283875, "learning_rate": 0.00022765236806478154, "loss": 2.7673528671264647, "step": 2750 }, { "epoch": 1.100875, "grad_norm": 0.31668204069137573, "learning_rate": 0.00022713484320532055, "loss": 2.7211772918701174, "step": 2760 }, { "epoch": 1.102125, "grad_norm": 0.3331785202026367, "learning_rate": 0.00022661606704842558, "loss": 2.722859191894531, "step": 2770 }, { "epoch": 1.103375, "grad_norm": 0.33396124839782715, "learning_rate": 0.00022609604800979111, "loss": 2.7449823379516602, "step": 2780 }, { "epoch": 1.104625, "grad_norm": 0.31729063391685486, "learning_rate": 0.00022557479452527392, "loss": 2.725127601623535, "step": 2790 }, { "epoch": 1.105875, "grad_norm": 0.3257978558540344, "learning_rate": 0.00022505231505075613, "loss": 2.7355205535888674, "step": 2800 }, { "epoch": 1.107125, "grad_norm": 0.33255940675735474, "learning_rate": 0.00022452861806200838, "loss": 2.7257015228271486, "step": 2810 }, { "epoch": 1.108375, "grad_norm": 0.32612425088882446, "learning_rate": 0.00022400371205455176, "loss": 2.767654800415039, "step": 2820 }, { "epoch": 1.109625, "grad_norm": 0.3252048194408417, "learning_rate": 0.0002234776055435205, "loss": 2.7518121719360353, "step": 2830 }, { "epoch": 1.110875, "grad_norm": 0.31889963150024414, "learning_rate": 0.00022295030706352356, "loss": 2.7312660217285156, "step": 2840 }, { "epoch": 1.112125, "grad_norm": 0.319558709859848, "learning_rate": 0.00022242182516850635, "loss": 2.740167236328125, "step": 2850 }, { "epoch": 1.113375, "grad_norm": 0.3091806173324585, "learning_rate": 0.0002218921684316119, "loss": 2.748370552062988, "step": 2860 }, { "epoch": 1.114625, "grad_norm": 0.33689746260643005, "learning_rate": 0.00022136134544504163, "loss": 2.787190628051758, "step": 2870 }, { "epoch": 1.115875, "grad_norm": 0.31660985946655273, "learning_rate": 0.0002208293648199162, "loss": 2.7415428161621094, "step": 2880 }, { "epoch": 1.117125, "grad_norm": 0.325989305973053, "learning_rate": 0.0002202962351861357, "loss": 2.7200296401977537, "step": 2890 }, { "epoch": 1.118375, "grad_norm": 0.3281361758708954, "learning_rate": 0.0002197619651922397, "loss": 2.7388254165649415, "step": 2900 } ], "logging_steps": 10, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.8847708770441626e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }