{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 834, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012004801920768308, "grad_norm": 1.126847743988037, "learning_rate": 1.142857142857143e-06, "loss": 1.2938, "step": 5 }, { "epoch": 0.024009603841536616, "grad_norm": 0.9763688445091248, "learning_rate": 2.5714285714285716e-06, "loss": 1.2541, "step": 10 }, { "epoch": 0.03601440576230492, "grad_norm": 0.7441720366477966, "learning_rate": 4e-06, "loss": 1.2122, "step": 15 }, { "epoch": 0.04801920768307323, "grad_norm": 0.6003404259681702, "learning_rate": 5.428571428571429e-06, "loss": 1.2006, "step": 20 }, { "epoch": 0.060024009603841535, "grad_norm": 0.5410875678062439, "learning_rate": 6.857142857142857e-06, "loss": 1.2223, "step": 25 }, { "epoch": 0.07202881152460984, "grad_norm": 0.5632787346839905, "learning_rate": 8.285714285714287e-06, "loss": 1.155, "step": 30 }, { "epoch": 0.08403361344537816, "grad_norm": 0.5073860287666321, "learning_rate": 9.714285714285715e-06, "loss": 1.1372, "step": 35 }, { "epoch": 0.09603841536614646, "grad_norm": 0.4458916187286377, "learning_rate": 1.1142857142857143e-05, "loss": 1.1612, "step": 40 }, { "epoch": 0.10804321728691477, "grad_norm": 0.4466385841369629, "learning_rate": 1.2571428571428572e-05, "loss": 1.15, "step": 45 }, { "epoch": 0.12004801920768307, "grad_norm": 0.5031300187110901, "learning_rate": 1.4e-05, "loss": 1.1447, "step": 50 }, { "epoch": 0.13205282112845138, "grad_norm": 0.44504547119140625, "learning_rate": 1.5428571428571428e-05, "loss": 1.1461, "step": 55 }, { "epoch": 0.14405762304921968, "grad_norm": 0.5289918184280396, "learning_rate": 1.6857142857142858e-05, "loss": 1.1246, "step": 60 }, { "epoch": 0.15606242496998798, "grad_norm": 0.4286825954914093, "learning_rate": 1.8285714285714288e-05, "loss": 1.08, "step": 65 }, { "epoch": 0.16806722689075632, "grad_norm": 0.4959764778614044, "learning_rate": 1.9714285714285714e-05, "loss": 1.0991, "step": 70 }, { "epoch": 0.18007202881152462, "grad_norm": 0.4261907935142517, "learning_rate": 2.1142857142857144e-05, "loss": 1.108, "step": 75 }, { "epoch": 0.19207683073229292, "grad_norm": 0.39412474632263184, "learning_rate": 2.257142857142857e-05, "loss": 1.0524, "step": 80 }, { "epoch": 0.20408163265306123, "grad_norm": 0.457573264837265, "learning_rate": 2.4e-05, "loss": 1.09, "step": 85 }, { "epoch": 0.21608643457382953, "grad_norm": 0.4494602084159851, "learning_rate": 2.5428571428571427e-05, "loss": 1.0803, "step": 90 }, { "epoch": 0.22809123649459784, "grad_norm": 0.5574950575828552, "learning_rate": 2.685714285714286e-05, "loss": 1.0802, "step": 95 }, { "epoch": 0.24009603841536614, "grad_norm": 0.581778883934021, "learning_rate": 2.8285714285714287e-05, "loss": 1.0872, "step": 100 }, { "epoch": 0.25210084033613445, "grad_norm": 0.5143195986747742, "learning_rate": 2.9714285714285717e-05, "loss": 1.0241, "step": 105 }, { "epoch": 0.26410564225690275, "grad_norm": 0.6538636684417725, "learning_rate": 2.9999697901093597e-05, "loss": 1.0171, "step": 110 }, { "epoch": 0.27611044417767105, "grad_norm": 0.46839553117752075, "learning_rate": 2.99984706451415e-05, "loss": 1.0474, "step": 115 }, { "epoch": 0.28811524609843936, "grad_norm": 0.45771801471710205, "learning_rate": 2.999629942814264e-05, "loss": 1.0063, "step": 120 }, { "epoch": 0.30012004801920766, "grad_norm": 0.5769961476325989, "learning_rate": 2.9993184386747226e-05, "loss": 1.0608, "step": 125 }, { "epoch": 0.31212484993997597, "grad_norm": 0.5172315835952759, "learning_rate": 2.9989125717007107e-05, "loss": 0.9765, "step": 130 }, { "epoch": 0.3241296518607443, "grad_norm": 0.6754844784736633, "learning_rate": 2.9984123674363393e-05, "loss": 1.0156, "step": 135 }, { "epoch": 0.33613445378151263, "grad_norm": 0.8809567093849182, "learning_rate": 2.9978178573630414e-05, "loss": 0.9547, "step": 140 }, { "epoch": 0.34813925570228094, "grad_norm": 0.614689826965332, "learning_rate": 2.9971290788975884e-05, "loss": 0.9682, "step": 145 }, { "epoch": 0.36014405762304924, "grad_norm": 0.6198326349258423, "learning_rate": 2.9963460753897364e-05, "loss": 0.9623, "step": 150 }, { "epoch": 0.37214885954381755, "grad_norm": 0.5520050525665283, "learning_rate": 2.9954688961194974e-05, "loss": 0.9664, "step": 155 }, { "epoch": 0.38415366146458585, "grad_norm": 0.5535730719566345, "learning_rate": 2.994497596294037e-05, "loss": 0.9632, "step": 160 }, { "epoch": 0.39615846338535415, "grad_norm": 0.7371352314949036, "learning_rate": 2.9934322370442022e-05, "loss": 0.9166, "step": 165 }, { "epoch": 0.40816326530612246, "grad_norm": 0.5894187688827515, "learning_rate": 2.9922728854206704e-05, "loss": 0.9436, "step": 170 }, { "epoch": 0.42016806722689076, "grad_norm": 0.6794357299804688, "learning_rate": 2.9910196143897334e-05, "loss": 0.9549, "step": 175 }, { "epoch": 0.43217286914765907, "grad_norm": 0.7796855568885803, "learning_rate": 2.9896725028287017e-05, "loss": 0.9387, "step": 180 }, { "epoch": 0.44417767106842737, "grad_norm": 0.6924332976341248, "learning_rate": 2.9882316355209423e-05, "loss": 0.9127, "step": 185 }, { "epoch": 0.4561824729891957, "grad_norm": 0.78730708360672, "learning_rate": 2.986697103150542e-05, "loss": 0.9377, "step": 190 }, { "epoch": 0.468187274909964, "grad_norm": 0.6436963677406311, "learning_rate": 2.9850690022965995e-05, "loss": 0.9075, "step": 195 }, { "epoch": 0.4801920768307323, "grad_norm": 0.6622775793075562, "learning_rate": 2.9833474354271487e-05, "loss": 0.8795, "step": 200 }, { "epoch": 0.4921968787515006, "grad_norm": 0.6374281644821167, "learning_rate": 2.981532510892707e-05, "loss": 0.8786, "step": 205 }, { "epoch": 0.5042016806722689, "grad_norm": 0.6543936133384705, "learning_rate": 2.9796243429194578e-05, "loss": 0.8963, "step": 210 }, { "epoch": 0.5162064825930373, "grad_norm": 0.8162550330162048, "learning_rate": 2.977623051602062e-05, "loss": 0.8398, "step": 215 }, { "epoch": 0.5282112845138055, "grad_norm": 0.6448330283164978, "learning_rate": 2.9755287628960982e-05, "loss": 0.8528, "step": 220 }, { "epoch": 0.5402160864345739, "grad_norm": 4.620177268981934, "learning_rate": 2.9733416086101356e-05, "loss": 0.8638, "step": 225 }, { "epoch": 0.5522208883553421, "grad_norm": 0.7428641319274902, "learning_rate": 2.9710617263974385e-05, "loss": 0.8236, "step": 230 }, { "epoch": 0.5642256902761105, "grad_norm": 0.7523469924926758, "learning_rate": 2.968689259747304e-05, "loss": 0.8339, "step": 235 }, { "epoch": 0.5762304921968787, "grad_norm": 0.7694314122200012, "learning_rate": 2.966224357976029e-05, "loss": 0.8341, "step": 240 }, { "epoch": 0.5882352941176471, "grad_norm": 8.718358993530273, "learning_rate": 2.9636671762175145e-05, "loss": 0.8247, "step": 245 }, { "epoch": 0.6002400960384153, "grad_norm": 0.8020578026771545, "learning_rate": 2.9610178754135005e-05, "loss": 0.7834, "step": 250 }, { "epoch": 0.6122448979591837, "grad_norm": 0.7112622857093811, "learning_rate": 2.958276622303438e-05, "loss": 0.8256, "step": 255 }, { "epoch": 0.6242496998799519, "grad_norm": 0.7433765530586243, "learning_rate": 2.9554435894139945e-05, "loss": 0.7668, "step": 260 }, { "epoch": 0.6362545018007203, "grad_norm": 0.7709916234016418, "learning_rate": 2.9525189550481955e-05, "loss": 0.8346, "step": 265 }, { "epoch": 0.6482593037214885, "grad_norm": 1.9189757108688354, "learning_rate": 2.9495029032742025e-05, "loss": 0.7604, "step": 270 }, { "epoch": 0.6602641056422569, "grad_norm": 0.8425916433334351, "learning_rate": 2.9463956239137287e-05, "loss": 0.8273, "step": 275 }, { "epoch": 0.6722689075630253, "grad_norm": 0.8174909353256226, "learning_rate": 2.943197312530091e-05, "loss": 0.7908, "step": 280 }, { "epoch": 0.6842737094837935, "grad_norm": 0.6657884120941162, "learning_rate": 2.9399081704159034e-05, "loss": 0.7703, "step": 285 }, { "epoch": 0.6962785114045619, "grad_norm": 0.8427141904830933, "learning_rate": 2.936528404580408e-05, "loss": 0.8011, "step": 290 }, { "epoch": 0.7082833133253301, "grad_norm": 0.8150597214698792, "learning_rate": 2.9330582277364453e-05, "loss": 0.7654, "step": 295 }, { "epoch": 0.7202881152460985, "grad_norm": 0.7505553364753723, "learning_rate": 2.929497858287067e-05, "loss": 0.7376, "step": 300 }, { "epoch": 0.7322929171668667, "grad_norm": 0.7946950793266296, "learning_rate": 2.925847520311791e-05, "loss": 0.7239, "step": 305 }, { "epoch": 0.7442977190876351, "grad_norm": 0.8390210866928101, "learning_rate": 2.9221074435524995e-05, "loss": 0.7742, "step": 310 }, { "epoch": 0.7563025210084033, "grad_norm": 0.8262509107589722, "learning_rate": 2.9182778633989756e-05, "loss": 0.759, "step": 315 }, { "epoch": 0.7683073229291717, "grad_norm": 0.9196308255195618, "learning_rate": 2.9143590208740925e-05, "loss": 0.7897, "step": 320 }, { "epoch": 0.78031212484994, "grad_norm": 0.8700764775276184, "learning_rate": 2.9103511626186444e-05, "loss": 0.7395, "step": 325 }, { "epoch": 0.7923169267707083, "grad_norm": 0.8262081742286682, "learning_rate": 2.9062545408758193e-05, "loss": 0.7038, "step": 330 }, { "epoch": 0.8043217286914766, "grad_norm": 0.8056402206420898, "learning_rate": 2.902069413475329e-05, "loss": 0.7026, "step": 335 }, { "epoch": 0.8163265306122449, "grad_norm": 0.8297358751296997, "learning_rate": 2.8977960438171787e-05, "loss": 0.7308, "step": 340 }, { "epoch": 0.8283313325330132, "grad_norm": 0.8549435138702393, "learning_rate": 2.8934347008550886e-05, "loss": 0.6703, "step": 345 }, { "epoch": 0.8403361344537815, "grad_norm": 0.9239634871482849, "learning_rate": 2.8889856590795705e-05, "loss": 0.7002, "step": 350 }, { "epoch": 0.8523409363745498, "grad_norm": 0.8247022032737732, "learning_rate": 2.8844491985006486e-05, "loss": 0.7076, "step": 355 }, { "epoch": 0.8643457382953181, "grad_norm": 0.9243834614753723, "learning_rate": 2.8798256046302376e-05, "loss": 0.6776, "step": 360 }, { "epoch": 0.8763505402160864, "grad_norm": 0.9479813575744629, "learning_rate": 2.875115168464174e-05, "loss": 0.6979, "step": 365 }, { "epoch": 0.8883553421368547, "grad_norm": 0.988172173500061, "learning_rate": 2.8703181864639013e-05, "loss": 0.7221, "step": 370 }, { "epoch": 0.9003601440576231, "grad_norm": 1.0673149824142456, "learning_rate": 2.865434960537811e-05, "loss": 0.6326, "step": 375 }, { "epoch": 0.9123649459783914, "grad_norm": 0.9364232420921326, "learning_rate": 2.860465798022242e-05, "loss": 0.6799, "step": 380 }, { "epoch": 0.9243697478991597, "grad_norm": 0.9759359955787659, "learning_rate": 2.8554110116621376e-05, "loss": 0.6705, "step": 385 }, { "epoch": 0.936374549819928, "grad_norm": 0.8587977290153503, "learning_rate": 2.8502709195913617e-05, "loss": 0.6405, "step": 390 }, { "epoch": 0.9483793517406963, "grad_norm": 0.8970200419425964, "learning_rate": 2.8450458453126773e-05, "loss": 0.6404, "step": 395 }, { "epoch": 0.9603841536614646, "grad_norm": 0.8413552045822144, "learning_rate": 2.839736117677386e-05, "loss": 0.6313, "step": 400 }, { "epoch": 0.9723889555822329, "grad_norm": 0.9825181365013123, "learning_rate": 2.8343420708646292e-05, "loss": 0.5839, "step": 405 }, { "epoch": 0.9843937575030012, "grad_norm": 0.8249124884605408, "learning_rate": 2.8288640443603587e-05, "loss": 0.6214, "step": 410 }, { "epoch": 0.9963985594237695, "grad_norm": 1.0160948038101196, "learning_rate": 2.823302382935968e-05, "loss": 0.6417, "step": 415 }, { "epoch": 1.007202881152461, "grad_norm": 0.9784197211265564, "learning_rate": 2.8176574366265963e-05, "loss": 0.5966, "step": 420 }, { "epoch": 1.0192076830732293, "grad_norm": 0.9859529137611389, "learning_rate": 2.8119295607090936e-05, "loss": 0.5378, "step": 425 }, { "epoch": 1.0312124849939976, "grad_norm": 1.0028948783874512, "learning_rate": 2.8061191156796658e-05, "loss": 0.5948, "step": 430 }, { "epoch": 1.043217286914766, "grad_norm": 0.9926419854164124, "learning_rate": 2.8002264672311822e-05, "loss": 0.5835, "step": 435 }, { "epoch": 1.0552220888355341, "grad_norm": 1.0303875207901, "learning_rate": 2.7942519862301618e-05, "loss": 0.4992, "step": 440 }, { "epoch": 1.0672268907563025, "grad_norm": 0.8923658132553101, "learning_rate": 2.7881960486934314e-05, "loss": 0.5432, "step": 445 }, { "epoch": 1.0792316926770709, "grad_norm": 0.8442468643188477, "learning_rate": 2.7820590357644604e-05, "loss": 0.5246, "step": 450 }, { "epoch": 1.0912364945978392, "grad_norm": 0.9535374641418457, "learning_rate": 2.7758413336893725e-05, "loss": 0.5832, "step": 455 }, { "epoch": 1.1032412965186074, "grad_norm": 1.013627290725708, "learning_rate": 2.7695433337926362e-05, "loss": 0.5381, "step": 460 }, { "epoch": 1.1152460984393757, "grad_norm": 0.9319940805435181, "learning_rate": 2.763165432452438e-05, "loss": 0.5282, "step": 465 }, { "epoch": 1.127250900360144, "grad_norm": 0.8916425704956055, "learning_rate": 2.756708031075731e-05, "loss": 0.5252, "step": 470 }, { "epoch": 1.1392557022809124, "grad_norm": 0.9970868825912476, "learning_rate": 2.7501715360729766e-05, "loss": 0.5196, "step": 475 }, { "epoch": 1.1512605042016806, "grad_norm": 0.9539552330970764, "learning_rate": 2.7435563588325627e-05, "loss": 0.4874, "step": 480 }, { "epoch": 1.163265306122449, "grad_norm": 1.0665792226791382, "learning_rate": 2.7368629156949137e-05, "loss": 0.5093, "step": 485 }, { "epoch": 1.1752701080432173, "grad_norm": 1.048183798789978, "learning_rate": 2.7300916279262868e-05, "loss": 0.522, "step": 490 }, { "epoch": 1.1872749099639857, "grad_norm": 1.1342055797576904, "learning_rate": 2.7232429216922583e-05, "loss": 0.5293, "step": 495 }, { "epoch": 1.199279711884754, "grad_norm": 0.9771571159362793, "learning_rate": 2.7163172280309028e-05, "loss": 0.5041, "step": 500 }, { "epoch": 1.2112845138055222, "grad_norm": 1.2090368270874023, "learning_rate": 2.7093149828256643e-05, "loss": 0.5112, "step": 505 }, { "epoch": 1.2232893157262905, "grad_norm": 1.1582231521606445, "learning_rate": 2.702236626777923e-05, "loss": 0.4962, "step": 510 }, { "epoch": 1.2352941176470589, "grad_norm": 1.037467360496521, "learning_rate": 2.6950826053792586e-05, "loss": 0.4958, "step": 515 }, { "epoch": 1.247298919567827, "grad_norm": 0.9874077439308167, "learning_rate": 2.6878533688834125e-05, "loss": 0.4696, "step": 520 }, { "epoch": 1.2593037214885954, "grad_norm": 1.03226900100708, "learning_rate": 2.6805493722779513e-05, "loss": 0.4994, "step": 525 }, { "epoch": 1.2713085234093637, "grad_norm": 0.986599326133728, "learning_rate": 2.6731710752556293e-05, "loss": 0.4679, "step": 530 }, { "epoch": 1.283313325330132, "grad_norm": 0.8920894861221313, "learning_rate": 2.6657189421854564e-05, "loss": 0.5125, "step": 535 }, { "epoch": 1.2953181272509005, "grad_norm": 1.0151013135910034, "learning_rate": 2.6581934420834754e-05, "loss": 0.508, "step": 540 }, { "epoch": 1.3073229291716686, "grad_norm": 1.0022249221801758, "learning_rate": 2.65059504858324e-05, "loss": 0.4258, "step": 545 }, { "epoch": 1.319327731092437, "grad_norm": 0.912742018699646, "learning_rate": 2.6429242399060063e-05, "loss": 0.5101, "step": 550 }, { "epoch": 1.3313325330132053, "grad_norm": 1.016350507736206, "learning_rate": 2.635181498830637e-05, "loss": 0.4921, "step": 555 }, { "epoch": 1.3433373349339737, "grad_norm": 0.8971051573753357, "learning_rate": 2.627367312663214e-05, "loss": 0.4477, "step": 560 }, { "epoch": 1.355342136854742, "grad_norm": 0.9581533074378967, "learning_rate": 2.6194821732063708e-05, "loss": 0.4356, "step": 565 }, { "epoch": 1.3673469387755102, "grad_norm": 1.00229012966156, "learning_rate": 2.6115265767283377e-05, "loss": 0.4632, "step": 570 }, { "epoch": 1.3793517406962785, "grad_norm": 0.9300605654716492, "learning_rate": 2.6035010239317106e-05, "loss": 0.5101, "step": 575 }, { "epoch": 1.3913565426170469, "grad_norm": 0.9333869814872742, "learning_rate": 2.5954060199219364e-05, "loss": 0.4389, "step": 580 }, { "epoch": 1.403361344537815, "grad_norm": 1.121485710144043, "learning_rate": 2.587242074175523e-05, "loss": 0.4602, "step": 585 }, { "epoch": 1.4153661464585834, "grad_norm": 0.9909285306930542, "learning_rate": 2.5790097005079766e-05, "loss": 0.4386, "step": 590 }, { "epoch": 1.4273709483793517, "grad_norm": 1.1501377820968628, "learning_rate": 2.5707094170414606e-05, "loss": 0.4577, "step": 595 }, { "epoch": 1.43937575030012, "grad_norm": 1.0231988430023193, "learning_rate": 2.5623417461721887e-05, "loss": 0.4762, "step": 600 }, { "epoch": 1.4513805522208885, "grad_norm": 0.9353159070014954, "learning_rate": 2.5539072145375452e-05, "loss": 0.4456, "step": 605 }, { "epoch": 1.4633853541416566, "grad_norm": 1.0465285778045654, "learning_rate": 2.5454063529829405e-05, "loss": 0.4379, "step": 610 }, { "epoch": 1.475390156062425, "grad_norm": 1.1565396785736084, "learning_rate": 2.5368396965284017e-05, "loss": 0.4122, "step": 615 }, { "epoch": 1.4873949579831933, "grad_norm": 1.128082513809204, "learning_rate": 2.5282077843349e-05, "loss": 0.4581, "step": 620 }, { "epoch": 1.4993997599039615, "grad_norm": 0.9868873357772827, "learning_rate": 2.519511159670417e-05, "loss": 0.4402, "step": 625 }, { "epoch": 1.51140456182473, "grad_norm": 1.0035743713378906, "learning_rate": 2.510750369875752e-05, "loss": 0.4489, "step": 630 }, { "epoch": 1.5234093637454982, "grad_norm": 1.1779558658599854, "learning_rate": 2.5019259663300758e-05, "loss": 0.4474, "step": 635 }, { "epoch": 1.5354141656662665, "grad_norm": 1.0303417444229126, "learning_rate": 2.4930385044162282e-05, "loss": 0.4338, "step": 640 }, { "epoch": 1.547418967587035, "grad_norm": 1.0198581218719482, "learning_rate": 2.4840885434857608e-05, "loss": 0.399, "step": 645 }, { "epoch": 1.559423769507803, "grad_norm": 0.9304978847503662, "learning_rate": 2.4750766468237388e-05, "loss": 0.4285, "step": 650 }, { "epoch": 1.5714285714285714, "grad_norm": 0.9525741338729858, "learning_rate": 2.466003381613283e-05, "loss": 0.4253, "step": 655 }, { "epoch": 1.5834333733493398, "grad_norm": 0.9774038791656494, "learning_rate": 2.456869318899878e-05, "loss": 0.4608, "step": 660 }, { "epoch": 1.595438175270108, "grad_norm": 0.9870194792747498, "learning_rate": 2.447675033555429e-05, "loss": 0.3993, "step": 665 }, { "epoch": 1.6074429771908765, "grad_norm": 0.9398269653320312, "learning_rate": 2.4384211042420826e-05, "loss": 0.4121, "step": 670 }, { "epoch": 1.6194477791116446, "grad_norm": 0.9968813061714172, "learning_rate": 2.4291081133758063e-05, "loss": 0.4298, "step": 675 }, { "epoch": 1.631452581032413, "grad_norm": 1.1225817203521729, "learning_rate": 2.4197366470897352e-05, "loss": 0.3736, "step": 680 }, { "epoch": 1.6434573829531813, "grad_norm": 1.1227293014526367, "learning_rate": 2.4103072951972794e-05, "loss": 0.4418, "step": 685 }, { "epoch": 1.6554621848739495, "grad_norm": 1.0404561758041382, "learning_rate": 2.400820651155005e-05, "loss": 0.3952, "step": 690 }, { "epoch": 1.667466986794718, "grad_norm": 0.9955992102622986, "learning_rate": 2.3912773120252822e-05, "loss": 0.387, "step": 695 }, { "epoch": 1.6794717887154862, "grad_norm": 1.0180872678756714, "learning_rate": 2.3816778784387097e-05, "loss": 0.3817, "step": 700 }, { "epoch": 1.6914765906362546, "grad_norm": 0.9844180345535278, "learning_rate": 2.372022954556311e-05, "loss": 0.4103, "step": 705 }, { "epoch": 1.703481392557023, "grad_norm": 1.05225670337677, "learning_rate": 2.3623131480315107e-05, "loss": 0.367, "step": 710 }, { "epoch": 1.715486194477791, "grad_norm": 1.0176974534988403, "learning_rate": 2.352549069971891e-05, "loss": 0.3605, "step": 715 }, { "epoch": 1.7274909963985594, "grad_norm": 1.0177741050720215, "learning_rate": 2.3427313349007302e-05, "loss": 0.3782, "step": 720 }, { "epoch": 1.7394957983193278, "grad_norm": 0.9692652821540833, "learning_rate": 2.3328605607183255e-05, "loss": 0.3554, "step": 725 }, { "epoch": 1.751500600240096, "grad_norm": 0.892565131187439, "learning_rate": 2.322937368663105e-05, "loss": 0.3714, "step": 730 }, { "epoch": 1.7635054021608645, "grad_norm": 0.9849589467048645, "learning_rate": 2.312962383272529e-05, "loss": 0.3648, "step": 735 }, { "epoch": 1.7755102040816326, "grad_norm": 1.1365302801132202, "learning_rate": 2.3029362323437823e-05, "loss": 0.3451, "step": 740 }, { "epoch": 1.787515006002401, "grad_norm": 0.9451067447662354, "learning_rate": 2.292859546894263e-05, "loss": 0.3465, "step": 745 }, { "epoch": 1.7995198079231693, "grad_norm": 1.0000107288360596, "learning_rate": 2.282732961121869e-05, "loss": 0.3297, "step": 750 }, { "epoch": 1.8115246098439375, "grad_norm": 0.9930965900421143, "learning_rate": 2.2725571123650814e-05, "loss": 0.3413, "step": 755 }, { "epoch": 1.8235294117647058, "grad_norm": 1.1068987846374512, "learning_rate": 2.2623326410628537e-05, "loss": 0.3501, "step": 760 }, { "epoch": 1.8355342136854742, "grad_norm": 1.0239458084106445, "learning_rate": 2.2520601907143045e-05, "loss": 0.3738, "step": 765 }, { "epoch": 1.8475390156062423, "grad_norm": 1.0029224157333374, "learning_rate": 2.2417404078382153e-05, "loss": 0.3683, "step": 770 }, { "epoch": 1.859543817527011, "grad_norm": 1.0009812116622925, "learning_rate": 2.2313739419323443e-05, "loss": 0.3782, "step": 775 }, { "epoch": 1.871548619447779, "grad_norm": 1.0867785215377808, "learning_rate": 2.220961445432546e-05, "loss": 0.3504, "step": 780 }, { "epoch": 1.8835534213685474, "grad_norm": 1.0827844142913818, "learning_rate": 2.2105035736717086e-05, "loss": 0.3416, "step": 785 }, { "epoch": 1.8955582232893158, "grad_norm": 0.9669961929321289, "learning_rate": 2.2000009848385107e-05, "loss": 0.3535, "step": 790 }, { "epoch": 1.907563025210084, "grad_norm": 1.0839868783950806, "learning_rate": 2.189454339935996e-05, "loss": 0.3245, "step": 795 }, { "epoch": 1.9195678271308525, "grad_norm": 1.1661854982376099, "learning_rate": 2.1788643027399725e-05, "loss": 0.3481, "step": 800 }, { "epoch": 1.9315726290516206, "grad_norm": 1.0360976457595825, "learning_rate": 2.1682315397572344e-05, "loss": 0.3725, "step": 805 }, { "epoch": 1.943577430972389, "grad_norm": 0.9784966707229614, "learning_rate": 2.157556720183616e-05, "loss": 0.318, "step": 810 }, { "epoch": 1.9555822328931574, "grad_norm": 1.1009594202041626, "learning_rate": 2.1468405158618744e-05, "loss": 0.3223, "step": 815 }, { "epoch": 1.9675870348139255, "grad_norm": 0.9810320734977722, "learning_rate": 2.136083601239403e-05, "loss": 0.3622, "step": 820 }, { "epoch": 1.9795918367346939, "grad_norm": 1.039110779762268, "learning_rate": 2.125286653325787e-05, "loss": 0.3408, "step": 825 }, { "epoch": 1.9915966386554622, "grad_norm": 1.1904340982437134, "learning_rate": 2.114450351650193e-05, "loss": 0.3264, "step": 830 } ], "logging_steps": 5, "max_steps": 2085, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3201796128560906e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }