{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 834,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012004801920768308,
      "grad_norm": 1.126847743988037,
      "learning_rate": 1.142857142857143e-06,
      "loss": 1.2938,
      "step": 5
    },
    {
      "epoch": 0.024009603841536616,
      "grad_norm": 0.9763688445091248,
      "learning_rate": 2.5714285714285716e-06,
      "loss": 1.2541,
      "step": 10
    },
    {
      "epoch": 0.03601440576230492,
      "grad_norm": 0.7441720366477966,
      "learning_rate": 4e-06,
      "loss": 1.2122,
      "step": 15
    },
    {
      "epoch": 0.04801920768307323,
      "grad_norm": 0.6003404259681702,
      "learning_rate": 5.428571428571429e-06,
      "loss": 1.2006,
      "step": 20
    },
    {
      "epoch": 0.060024009603841535,
      "grad_norm": 0.5410875678062439,
      "learning_rate": 6.857142857142857e-06,
      "loss": 1.2223,
      "step": 25
    },
    {
      "epoch": 0.07202881152460984,
      "grad_norm": 0.5632787346839905,
      "learning_rate": 8.285714285714287e-06,
      "loss": 1.155,
      "step": 30
    },
    {
      "epoch": 0.08403361344537816,
      "grad_norm": 0.5073860287666321,
      "learning_rate": 9.714285714285715e-06,
      "loss": 1.1372,
      "step": 35
    },
    {
      "epoch": 0.09603841536614646,
      "grad_norm": 0.4458916187286377,
      "learning_rate": 1.1142857142857143e-05,
      "loss": 1.1612,
      "step": 40
    },
    {
      "epoch": 0.10804321728691477,
      "grad_norm": 0.4466385841369629,
      "learning_rate": 1.2571428571428572e-05,
      "loss": 1.15,
      "step": 45
    },
    {
      "epoch": 0.12004801920768307,
      "grad_norm": 0.5031300187110901,
      "learning_rate": 1.4e-05,
      "loss": 1.1447,
      "step": 50
    },
    {
      "epoch": 0.13205282112845138,
      "grad_norm": 0.44504547119140625,
      "learning_rate": 1.5428571428571428e-05,
      "loss": 1.1461,
      "step": 55
    },
    {
      "epoch": 0.14405762304921968,
      "grad_norm": 0.5289918184280396,
      "learning_rate": 1.6857142857142858e-05,
      "loss": 1.1246,
      "step": 60
    },
    {
      "epoch": 0.15606242496998798,
      "grad_norm": 0.4286825954914093,
      "learning_rate": 1.8285714285714288e-05,
      "loss": 1.08,
      "step": 65
    },
    {
      "epoch": 0.16806722689075632,
      "grad_norm": 0.4959764778614044,
      "learning_rate": 1.9714285714285714e-05,
      "loss": 1.0991,
      "step": 70
    },
    {
      "epoch": 0.18007202881152462,
      "grad_norm": 0.4261907935142517,
      "learning_rate": 2.1142857142857144e-05,
      "loss": 1.108,
      "step": 75
    },
    {
      "epoch": 0.19207683073229292,
      "grad_norm": 0.39412474632263184,
      "learning_rate": 2.257142857142857e-05,
      "loss": 1.0524,
      "step": 80
    },
    {
      "epoch": 0.20408163265306123,
      "grad_norm": 0.457573264837265,
      "learning_rate": 2.4e-05,
      "loss": 1.09,
      "step": 85
    },
    {
      "epoch": 0.21608643457382953,
      "grad_norm": 0.4494602084159851,
      "learning_rate": 2.5428571428571427e-05,
      "loss": 1.0803,
      "step": 90
    },
    {
      "epoch": 0.22809123649459784,
      "grad_norm": 0.5574950575828552,
      "learning_rate": 2.685714285714286e-05,
      "loss": 1.0802,
      "step": 95
    },
    {
      "epoch": 0.24009603841536614,
      "grad_norm": 0.581778883934021,
      "learning_rate": 2.8285714285714287e-05,
      "loss": 1.0872,
      "step": 100
    },
    {
      "epoch": 0.25210084033613445,
      "grad_norm": 0.5143195986747742,
      "learning_rate": 2.9714285714285717e-05,
      "loss": 1.0241,
      "step": 105
    },
    {
      "epoch": 0.26410564225690275,
      "grad_norm": 0.6538636684417725,
      "learning_rate": 2.9999697901093597e-05,
      "loss": 1.0171,
      "step": 110
    },
    {
      "epoch": 0.27611044417767105,
      "grad_norm": 0.46839553117752075,
      "learning_rate": 2.99984706451415e-05,
      "loss": 1.0474,
      "step": 115
    },
    {
      "epoch": 0.28811524609843936,
      "grad_norm": 0.45771801471710205,
      "learning_rate": 2.999629942814264e-05,
      "loss": 1.0063,
      "step": 120
    },
    {
      "epoch": 0.30012004801920766,
      "grad_norm": 0.5769961476325989,
      "learning_rate": 2.9993184386747226e-05,
      "loss": 1.0608,
      "step": 125
    },
    {
      "epoch": 0.31212484993997597,
      "grad_norm": 0.5172315835952759,
      "learning_rate": 2.9989125717007107e-05,
      "loss": 0.9765,
      "step": 130
    },
    {
      "epoch": 0.3241296518607443,
      "grad_norm": 0.6754844784736633,
      "learning_rate": 2.9984123674363393e-05,
      "loss": 1.0156,
      "step": 135
    },
    {
      "epoch": 0.33613445378151263,
      "grad_norm": 0.8809567093849182,
      "learning_rate": 2.9978178573630414e-05,
      "loss": 0.9547,
      "step": 140
    },
    {
      "epoch": 0.34813925570228094,
      "grad_norm": 0.614689826965332,
      "learning_rate": 2.9971290788975884e-05,
      "loss": 0.9682,
      "step": 145
    },
    {
      "epoch": 0.36014405762304924,
      "grad_norm": 0.6198326349258423,
      "learning_rate": 2.9963460753897364e-05,
      "loss": 0.9623,
      "step": 150
    },
    {
      "epoch": 0.37214885954381755,
      "grad_norm": 0.5520050525665283,
      "learning_rate": 2.9954688961194974e-05,
      "loss": 0.9664,
      "step": 155
    },
    {
      "epoch": 0.38415366146458585,
      "grad_norm": 0.5535730719566345,
      "learning_rate": 2.994497596294037e-05,
      "loss": 0.9632,
      "step": 160
    },
    {
      "epoch": 0.39615846338535415,
      "grad_norm": 0.7371352314949036,
      "learning_rate": 2.9934322370442022e-05,
      "loss": 0.9166,
      "step": 165
    },
    {
      "epoch": 0.40816326530612246,
      "grad_norm": 0.5894187688827515,
      "learning_rate": 2.9922728854206704e-05,
      "loss": 0.9436,
      "step": 170
    },
    {
      "epoch": 0.42016806722689076,
      "grad_norm": 0.6794357299804688,
      "learning_rate": 2.9910196143897334e-05,
      "loss": 0.9549,
      "step": 175
    },
    {
      "epoch": 0.43217286914765907,
      "grad_norm": 0.7796855568885803,
      "learning_rate": 2.9896725028287017e-05,
      "loss": 0.9387,
      "step": 180
    },
    {
      "epoch": 0.44417767106842737,
      "grad_norm": 0.6924332976341248,
      "learning_rate": 2.9882316355209423e-05,
      "loss": 0.9127,
      "step": 185
    },
    {
      "epoch": 0.4561824729891957,
      "grad_norm": 0.78730708360672,
      "learning_rate": 2.986697103150542e-05,
      "loss": 0.9377,
      "step": 190
    },
    {
      "epoch": 0.468187274909964,
      "grad_norm": 0.6436963677406311,
      "learning_rate": 2.9850690022965995e-05,
      "loss": 0.9075,
      "step": 195
    },
    {
      "epoch": 0.4801920768307323,
      "grad_norm": 0.6622775793075562,
      "learning_rate": 2.9833474354271487e-05,
      "loss": 0.8795,
      "step": 200
    },
    {
      "epoch": 0.4921968787515006,
      "grad_norm": 0.6374281644821167,
      "learning_rate": 2.981532510892707e-05,
      "loss": 0.8786,
      "step": 205
    },
    {
      "epoch": 0.5042016806722689,
      "grad_norm": 0.6543936133384705,
      "learning_rate": 2.9796243429194578e-05,
      "loss": 0.8963,
      "step": 210
    },
    {
      "epoch": 0.5162064825930373,
      "grad_norm": 0.8162550330162048,
      "learning_rate": 2.977623051602062e-05,
      "loss": 0.8398,
      "step": 215
    },
    {
      "epoch": 0.5282112845138055,
      "grad_norm": 0.6448330283164978,
      "learning_rate": 2.9755287628960982e-05,
      "loss": 0.8528,
      "step": 220
    },
    {
      "epoch": 0.5402160864345739,
      "grad_norm": 4.620177268981934,
      "learning_rate": 2.9733416086101356e-05,
      "loss": 0.8638,
      "step": 225
    },
    {
      "epoch": 0.5522208883553421,
      "grad_norm": 0.7428641319274902,
      "learning_rate": 2.9710617263974385e-05,
      "loss": 0.8236,
      "step": 230
    },
    {
      "epoch": 0.5642256902761105,
      "grad_norm": 0.7523469924926758,
      "learning_rate": 2.968689259747304e-05,
      "loss": 0.8339,
      "step": 235
    },
    {
      "epoch": 0.5762304921968787,
      "grad_norm": 0.7694314122200012,
      "learning_rate": 2.966224357976029e-05,
      "loss": 0.8341,
      "step": 240
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 8.718358993530273,
      "learning_rate": 2.9636671762175145e-05,
      "loss": 0.8247,
      "step": 245
    },
    {
      "epoch": 0.6002400960384153,
      "grad_norm": 0.8020578026771545,
      "learning_rate": 2.9610178754135005e-05,
      "loss": 0.7834,
      "step": 250
    },
    {
      "epoch": 0.6122448979591837,
      "grad_norm": 0.7112622857093811,
      "learning_rate": 2.958276622303438e-05,
      "loss": 0.8256,
      "step": 255
    },
    {
      "epoch": 0.6242496998799519,
      "grad_norm": 0.7433765530586243,
      "learning_rate": 2.9554435894139945e-05,
      "loss": 0.7668,
      "step": 260
    },
    {
      "epoch": 0.6362545018007203,
      "grad_norm": 0.7709916234016418,
      "learning_rate": 2.9525189550481955e-05,
      "loss": 0.8346,
      "step": 265
    },
    {
      "epoch": 0.6482593037214885,
      "grad_norm": 1.9189757108688354,
      "learning_rate": 2.9495029032742025e-05,
      "loss": 0.7604,
      "step": 270
    },
    {
      "epoch": 0.6602641056422569,
      "grad_norm": 0.8425916433334351,
      "learning_rate": 2.9463956239137287e-05,
      "loss": 0.8273,
      "step": 275
    },
    {
      "epoch": 0.6722689075630253,
      "grad_norm": 0.8174909353256226,
      "learning_rate": 2.943197312530091e-05,
      "loss": 0.7908,
      "step": 280
    },
    {
      "epoch": 0.6842737094837935,
      "grad_norm": 0.6657884120941162,
      "learning_rate": 2.9399081704159034e-05,
      "loss": 0.7703,
      "step": 285
    },
    {
      "epoch": 0.6962785114045619,
      "grad_norm": 0.8427141904830933,
      "learning_rate": 2.936528404580408e-05,
      "loss": 0.8011,
      "step": 290
    },
    {
      "epoch": 0.7082833133253301,
      "grad_norm": 0.8150597214698792,
      "learning_rate": 2.9330582277364453e-05,
      "loss": 0.7654,
      "step": 295
    },
    {
      "epoch": 0.7202881152460985,
      "grad_norm": 0.7505553364753723,
      "learning_rate": 2.929497858287067e-05,
      "loss": 0.7376,
      "step": 300
    },
    {
      "epoch": 0.7322929171668667,
      "grad_norm": 0.7946950793266296,
      "learning_rate": 2.925847520311791e-05,
      "loss": 0.7239,
      "step": 305
    },
    {
      "epoch": 0.7442977190876351,
      "grad_norm": 0.8390210866928101,
      "learning_rate": 2.9221074435524995e-05,
      "loss": 0.7742,
      "step": 310
    },
    {
      "epoch": 0.7563025210084033,
      "grad_norm": 0.8262509107589722,
      "learning_rate": 2.9182778633989756e-05,
      "loss": 0.759,
      "step": 315
    },
    {
      "epoch": 0.7683073229291717,
      "grad_norm": 0.9196308255195618,
      "learning_rate": 2.9143590208740925e-05,
      "loss": 0.7897,
      "step": 320
    },
    {
      "epoch": 0.78031212484994,
      "grad_norm": 0.8700764775276184,
      "learning_rate": 2.9103511626186444e-05,
      "loss": 0.7395,
      "step": 325
    },
    {
      "epoch": 0.7923169267707083,
      "grad_norm": 0.8262081742286682,
      "learning_rate": 2.9062545408758193e-05,
      "loss": 0.7038,
      "step": 330
    },
    {
      "epoch": 0.8043217286914766,
      "grad_norm": 0.8056402206420898,
      "learning_rate": 2.902069413475329e-05,
      "loss": 0.7026,
      "step": 335
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 0.8297358751296997,
      "learning_rate": 2.8977960438171787e-05,
      "loss": 0.7308,
      "step": 340
    },
    {
      "epoch": 0.8283313325330132,
      "grad_norm": 0.8549435138702393,
      "learning_rate": 2.8934347008550886e-05,
      "loss": 0.6703,
      "step": 345
    },
    {
      "epoch": 0.8403361344537815,
      "grad_norm": 0.9239634871482849,
      "learning_rate": 2.8889856590795705e-05,
      "loss": 0.7002,
      "step": 350
    },
    {
      "epoch": 0.8523409363745498,
      "grad_norm": 0.8247022032737732,
      "learning_rate": 2.8844491985006486e-05,
      "loss": 0.7076,
      "step": 355
    },
    {
      "epoch": 0.8643457382953181,
      "grad_norm": 0.9243834614753723,
      "learning_rate": 2.8798256046302376e-05,
      "loss": 0.6776,
      "step": 360
    },
    {
      "epoch": 0.8763505402160864,
      "grad_norm": 0.9479813575744629,
      "learning_rate": 2.875115168464174e-05,
      "loss": 0.6979,
      "step": 365
    },
    {
      "epoch": 0.8883553421368547,
      "grad_norm": 0.988172173500061,
      "learning_rate": 2.8703181864639013e-05,
      "loss": 0.7221,
      "step": 370
    },
    {
      "epoch": 0.9003601440576231,
      "grad_norm": 1.0673149824142456,
      "learning_rate": 2.865434960537811e-05,
      "loss": 0.6326,
      "step": 375
    },
    {
      "epoch": 0.9123649459783914,
      "grad_norm": 0.9364232420921326,
      "learning_rate": 2.860465798022242e-05,
      "loss": 0.6799,
      "step": 380
    },
    {
      "epoch": 0.9243697478991597,
      "grad_norm": 0.9759359955787659,
      "learning_rate": 2.8554110116621376e-05,
      "loss": 0.6705,
      "step": 385
    },
    {
      "epoch": 0.936374549819928,
      "grad_norm": 0.8587977290153503,
      "learning_rate": 2.8502709195913617e-05,
      "loss": 0.6405,
      "step": 390
    },
    {
      "epoch": 0.9483793517406963,
      "grad_norm": 0.8970200419425964,
      "learning_rate": 2.8450458453126773e-05,
      "loss": 0.6404,
      "step": 395
    },
    {
      "epoch": 0.9603841536614646,
      "grad_norm": 0.8413552045822144,
      "learning_rate": 2.839736117677386e-05,
      "loss": 0.6313,
      "step": 400
    },
    {
      "epoch": 0.9723889555822329,
      "grad_norm": 0.9825181365013123,
      "learning_rate": 2.8343420708646292e-05,
      "loss": 0.5839,
      "step": 405
    },
    {
      "epoch": 0.9843937575030012,
      "grad_norm": 0.8249124884605408,
      "learning_rate": 2.8288640443603587e-05,
      "loss": 0.6214,
      "step": 410
    },
    {
      "epoch": 0.9963985594237695,
      "grad_norm": 1.0160948038101196,
      "learning_rate": 2.823302382935968e-05,
      "loss": 0.6417,
      "step": 415
    },
    {
      "epoch": 1.007202881152461,
      "grad_norm": 0.9784197211265564,
      "learning_rate": 2.8176574366265963e-05,
      "loss": 0.5966,
      "step": 420
    },
    {
      "epoch": 1.0192076830732293,
      "grad_norm": 0.9859529137611389,
      "learning_rate": 2.8119295607090936e-05,
      "loss": 0.5378,
      "step": 425
    },
    {
      "epoch": 1.0312124849939976,
      "grad_norm": 1.0028948783874512,
      "learning_rate": 2.8061191156796658e-05,
      "loss": 0.5948,
      "step": 430
    },
    {
      "epoch": 1.043217286914766,
      "grad_norm": 0.9926419854164124,
      "learning_rate": 2.8002264672311822e-05,
      "loss": 0.5835,
      "step": 435
    },
    {
      "epoch": 1.0552220888355341,
      "grad_norm": 1.0303875207901,
      "learning_rate": 2.7942519862301618e-05,
      "loss": 0.4992,
      "step": 440
    },
    {
      "epoch": 1.0672268907563025,
      "grad_norm": 0.8923658132553101,
      "learning_rate": 2.7881960486934314e-05,
      "loss": 0.5432,
      "step": 445
    },
    {
      "epoch": 1.0792316926770709,
      "grad_norm": 0.8442468643188477,
      "learning_rate": 2.7820590357644604e-05,
      "loss": 0.5246,
      "step": 450
    },
    {
      "epoch": 1.0912364945978392,
      "grad_norm": 0.9535374641418457,
      "learning_rate": 2.7758413336893725e-05,
      "loss": 0.5832,
      "step": 455
    },
    {
      "epoch": 1.1032412965186074,
      "grad_norm": 1.013627290725708,
      "learning_rate": 2.7695433337926362e-05,
      "loss": 0.5381,
      "step": 460
    },
    {
      "epoch": 1.1152460984393757,
      "grad_norm": 0.9319940805435181,
      "learning_rate": 2.763165432452438e-05,
      "loss": 0.5282,
      "step": 465
    },
    {
      "epoch": 1.127250900360144,
      "grad_norm": 0.8916425704956055,
      "learning_rate": 2.756708031075731e-05,
      "loss": 0.5252,
      "step": 470
    },
    {
      "epoch": 1.1392557022809124,
      "grad_norm": 0.9970868825912476,
      "learning_rate": 2.7501715360729766e-05,
      "loss": 0.5196,
      "step": 475
    },
    {
      "epoch": 1.1512605042016806,
      "grad_norm": 0.9539552330970764,
      "learning_rate": 2.7435563588325627e-05,
      "loss": 0.4874,
      "step": 480
    },
    {
      "epoch": 1.163265306122449,
      "grad_norm": 1.0665792226791382,
      "learning_rate": 2.7368629156949137e-05,
      "loss": 0.5093,
      "step": 485
    },
    {
      "epoch": 1.1752701080432173,
      "grad_norm": 1.048183798789978,
      "learning_rate": 2.7300916279262868e-05,
      "loss": 0.522,
      "step": 490
    },
    {
      "epoch": 1.1872749099639857,
      "grad_norm": 1.1342055797576904,
      "learning_rate": 2.7232429216922583e-05,
      "loss": 0.5293,
      "step": 495
    },
    {
      "epoch": 1.199279711884754,
      "grad_norm": 0.9771571159362793,
      "learning_rate": 2.7163172280309028e-05,
      "loss": 0.5041,
      "step": 500
    },
    {
      "epoch": 1.2112845138055222,
      "grad_norm": 1.2090368270874023,
      "learning_rate": 2.7093149828256643e-05,
      "loss": 0.5112,
      "step": 505
    },
    {
      "epoch": 1.2232893157262905,
      "grad_norm": 1.1582231521606445,
      "learning_rate": 2.702236626777923e-05,
      "loss": 0.4962,
      "step": 510
    },
    {
      "epoch": 1.2352941176470589,
      "grad_norm": 1.037467360496521,
      "learning_rate": 2.6950826053792586e-05,
      "loss": 0.4958,
      "step": 515
    },
    {
      "epoch": 1.247298919567827,
      "grad_norm": 0.9874077439308167,
      "learning_rate": 2.6878533688834125e-05,
      "loss": 0.4696,
      "step": 520
    },
    {
      "epoch": 1.2593037214885954,
      "grad_norm": 1.03226900100708,
      "learning_rate": 2.6805493722779513e-05,
      "loss": 0.4994,
      "step": 525
    },
    {
      "epoch": 1.2713085234093637,
      "grad_norm": 0.986599326133728,
      "learning_rate": 2.6731710752556293e-05,
      "loss": 0.4679,
      "step": 530
    },
    {
      "epoch": 1.283313325330132,
      "grad_norm": 0.8920894861221313,
      "learning_rate": 2.6657189421854564e-05,
      "loss": 0.5125,
      "step": 535
    },
    {
      "epoch": 1.2953181272509005,
      "grad_norm": 1.0151013135910034,
      "learning_rate": 2.6581934420834754e-05,
      "loss": 0.508,
      "step": 540
    },
    {
      "epoch": 1.3073229291716686,
      "grad_norm": 1.0022249221801758,
      "learning_rate": 2.65059504858324e-05,
      "loss": 0.4258,
      "step": 545
    },
    {
      "epoch": 1.319327731092437,
      "grad_norm": 0.912742018699646,
      "learning_rate": 2.6429242399060063e-05,
      "loss": 0.5101,
      "step": 550
    },
    {
      "epoch": 1.3313325330132053,
      "grad_norm": 1.016350507736206,
      "learning_rate": 2.635181498830637e-05,
      "loss": 0.4921,
      "step": 555
    },
    {
      "epoch": 1.3433373349339737,
      "grad_norm": 0.8971051573753357,
      "learning_rate": 2.627367312663214e-05,
      "loss": 0.4477,
      "step": 560
    },
    {
      "epoch": 1.355342136854742,
      "grad_norm": 0.9581533074378967,
      "learning_rate": 2.6194821732063708e-05,
      "loss": 0.4356,
      "step": 565
    },
    {
      "epoch": 1.3673469387755102,
      "grad_norm": 1.00229012966156,
      "learning_rate": 2.6115265767283377e-05,
      "loss": 0.4632,
      "step": 570
    },
    {
      "epoch": 1.3793517406962785,
      "grad_norm": 0.9300605654716492,
      "learning_rate": 2.6035010239317106e-05,
      "loss": 0.5101,
      "step": 575
    },
    {
      "epoch": 1.3913565426170469,
      "grad_norm": 0.9333869814872742,
      "learning_rate": 2.5954060199219364e-05,
      "loss": 0.4389,
      "step": 580
    },
    {
      "epoch": 1.403361344537815,
      "grad_norm": 1.121485710144043,
      "learning_rate": 2.587242074175523e-05,
      "loss": 0.4602,
      "step": 585
    },
    {
      "epoch": 1.4153661464585834,
      "grad_norm": 0.9909285306930542,
      "learning_rate": 2.5790097005079766e-05,
      "loss": 0.4386,
      "step": 590
    },
    {
      "epoch": 1.4273709483793517,
      "grad_norm": 1.1501377820968628,
      "learning_rate": 2.5707094170414606e-05,
      "loss": 0.4577,
      "step": 595
    },
    {
      "epoch": 1.43937575030012,
      "grad_norm": 1.0231988430023193,
      "learning_rate": 2.5623417461721887e-05,
      "loss": 0.4762,
      "step": 600
    },
    {
      "epoch": 1.4513805522208885,
      "grad_norm": 0.9353159070014954,
      "learning_rate": 2.5539072145375452e-05,
      "loss": 0.4456,
      "step": 605
    },
    {
      "epoch": 1.4633853541416566,
      "grad_norm": 1.0465285778045654,
      "learning_rate": 2.5454063529829405e-05,
      "loss": 0.4379,
      "step": 610
    },
    {
      "epoch": 1.475390156062425,
      "grad_norm": 1.1565396785736084,
      "learning_rate": 2.5368396965284017e-05,
      "loss": 0.4122,
      "step": 615
    },
    {
      "epoch": 1.4873949579831933,
      "grad_norm": 1.128082513809204,
      "learning_rate": 2.5282077843349e-05,
      "loss": 0.4581,
      "step": 620
    },
    {
      "epoch": 1.4993997599039615,
      "grad_norm": 0.9868873357772827,
      "learning_rate": 2.519511159670417e-05,
      "loss": 0.4402,
      "step": 625
    },
    {
      "epoch": 1.51140456182473,
      "grad_norm": 1.0035743713378906,
      "learning_rate": 2.510750369875752e-05,
      "loss": 0.4489,
      "step": 630
    },
    {
      "epoch": 1.5234093637454982,
      "grad_norm": 1.1779558658599854,
      "learning_rate": 2.5019259663300758e-05,
      "loss": 0.4474,
      "step": 635
    },
    {
      "epoch": 1.5354141656662665,
      "grad_norm": 1.0303417444229126,
      "learning_rate": 2.4930385044162282e-05,
      "loss": 0.4338,
      "step": 640
    },
    {
      "epoch": 1.547418967587035,
      "grad_norm": 1.0198581218719482,
      "learning_rate": 2.4840885434857608e-05,
      "loss": 0.399,
      "step": 645
    },
    {
      "epoch": 1.559423769507803,
      "grad_norm": 0.9304978847503662,
      "learning_rate": 2.4750766468237388e-05,
      "loss": 0.4285,
      "step": 650
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 0.9525741338729858,
      "learning_rate": 2.466003381613283e-05,
      "loss": 0.4253,
      "step": 655
    },
    {
      "epoch": 1.5834333733493398,
      "grad_norm": 0.9774038791656494,
      "learning_rate": 2.456869318899878e-05,
      "loss": 0.4608,
      "step": 660
    },
    {
      "epoch": 1.595438175270108,
      "grad_norm": 0.9870194792747498,
      "learning_rate": 2.447675033555429e-05,
      "loss": 0.3993,
      "step": 665
    },
    {
      "epoch": 1.6074429771908765,
      "grad_norm": 0.9398269653320312,
      "learning_rate": 2.4384211042420826e-05,
      "loss": 0.4121,
      "step": 670
    },
    {
      "epoch": 1.6194477791116446,
      "grad_norm": 0.9968813061714172,
      "learning_rate": 2.4291081133758063e-05,
      "loss": 0.4298,
      "step": 675
    },
    {
      "epoch": 1.631452581032413,
      "grad_norm": 1.1225817203521729,
      "learning_rate": 2.4197366470897352e-05,
      "loss": 0.3736,
      "step": 680
    },
    {
      "epoch": 1.6434573829531813,
      "grad_norm": 1.1227293014526367,
      "learning_rate": 2.4103072951972794e-05,
      "loss": 0.4418,
      "step": 685
    },
    {
      "epoch": 1.6554621848739495,
      "grad_norm": 1.0404561758041382,
      "learning_rate": 2.400820651155005e-05,
      "loss": 0.3952,
      "step": 690
    },
    {
      "epoch": 1.667466986794718,
      "grad_norm": 0.9955992102622986,
      "learning_rate": 2.3912773120252822e-05,
      "loss": 0.387,
      "step": 695
    },
    {
      "epoch": 1.6794717887154862,
      "grad_norm": 1.0180872678756714,
      "learning_rate": 2.3816778784387097e-05,
      "loss": 0.3817,
      "step": 700
    },
    {
      "epoch": 1.6914765906362546,
      "grad_norm": 0.9844180345535278,
      "learning_rate": 2.372022954556311e-05,
      "loss": 0.4103,
      "step": 705
    },
    {
      "epoch": 1.703481392557023,
      "grad_norm": 1.05225670337677,
      "learning_rate": 2.3623131480315107e-05,
      "loss": 0.367,
      "step": 710
    },
    {
      "epoch": 1.715486194477791,
      "grad_norm": 1.0176974534988403,
      "learning_rate": 2.352549069971891e-05,
      "loss": 0.3605,
      "step": 715
    },
    {
      "epoch": 1.7274909963985594,
      "grad_norm": 1.0177741050720215,
      "learning_rate": 2.3427313349007302e-05,
      "loss": 0.3782,
      "step": 720
    },
    {
      "epoch": 1.7394957983193278,
      "grad_norm": 0.9692652821540833,
      "learning_rate": 2.3328605607183255e-05,
      "loss": 0.3554,
      "step": 725
    },
    {
      "epoch": 1.751500600240096,
      "grad_norm": 0.892565131187439,
      "learning_rate": 2.322937368663105e-05,
      "loss": 0.3714,
      "step": 730
    },
    {
      "epoch": 1.7635054021608645,
      "grad_norm": 0.9849589467048645,
      "learning_rate": 2.312962383272529e-05,
      "loss": 0.3648,
      "step": 735
    },
    {
      "epoch": 1.7755102040816326,
      "grad_norm": 1.1365302801132202,
      "learning_rate": 2.3029362323437823e-05,
      "loss": 0.3451,
      "step": 740
    },
    {
      "epoch": 1.787515006002401,
      "grad_norm": 0.9451067447662354,
      "learning_rate": 2.292859546894263e-05,
      "loss": 0.3465,
      "step": 745
    },
    {
      "epoch": 1.7995198079231693,
      "grad_norm": 1.0000107288360596,
      "learning_rate": 2.282732961121869e-05,
      "loss": 0.3297,
      "step": 750
    },
    {
      "epoch": 1.8115246098439375,
      "grad_norm": 0.9930965900421143,
      "learning_rate": 2.2725571123650814e-05,
      "loss": 0.3413,
      "step": 755
    },
    {
      "epoch": 1.8235294117647058,
      "grad_norm": 1.1068987846374512,
      "learning_rate": 2.2623326410628537e-05,
      "loss": 0.3501,
      "step": 760
    },
    {
      "epoch": 1.8355342136854742,
      "grad_norm": 1.0239458084106445,
      "learning_rate": 2.2520601907143045e-05,
      "loss": 0.3738,
      "step": 765
    },
    {
      "epoch": 1.8475390156062423,
      "grad_norm": 1.0029224157333374,
      "learning_rate": 2.2417404078382153e-05,
      "loss": 0.3683,
      "step": 770
    },
    {
      "epoch": 1.859543817527011,
      "grad_norm": 1.0009812116622925,
      "learning_rate": 2.2313739419323443e-05,
      "loss": 0.3782,
      "step": 775
    },
    {
      "epoch": 1.871548619447779,
      "grad_norm": 1.0867785215377808,
      "learning_rate": 2.220961445432546e-05,
      "loss": 0.3504,
      "step": 780
    },
    {
      "epoch": 1.8835534213685474,
      "grad_norm": 1.0827844142913818,
      "learning_rate": 2.2105035736717086e-05,
      "loss": 0.3416,
      "step": 785
    },
    {
      "epoch": 1.8955582232893158,
      "grad_norm": 0.9669961929321289,
      "learning_rate": 2.2000009848385107e-05,
      "loss": 0.3535,
      "step": 790
    },
    {
      "epoch": 1.907563025210084,
      "grad_norm": 1.0839868783950806,
      "learning_rate": 2.189454339935996e-05,
      "loss": 0.3245,
      "step": 795
    },
    {
      "epoch": 1.9195678271308525,
      "grad_norm": 1.1661854982376099,
      "learning_rate": 2.1788643027399725e-05,
      "loss": 0.3481,
      "step": 800
    },
    {
      "epoch": 1.9315726290516206,
      "grad_norm": 1.0360976457595825,
      "learning_rate": 2.1682315397572344e-05,
      "loss": 0.3725,
      "step": 805
    },
    {
      "epoch": 1.943577430972389,
      "grad_norm": 0.9784966707229614,
      "learning_rate": 2.157556720183616e-05,
      "loss": 0.318,
      "step": 810
    },
    {
      "epoch": 1.9555822328931574,
      "grad_norm": 1.1009594202041626,
      "learning_rate": 2.1468405158618744e-05,
      "loss": 0.3223,
      "step": 815
    },
    {
      "epoch": 1.9675870348139255,
      "grad_norm": 0.9810320734977722,
      "learning_rate": 2.136083601239403e-05,
      "loss": 0.3622,
      "step": 820
    },
    {
      "epoch": 1.9795918367346939,
      "grad_norm": 1.039110779762268,
      "learning_rate": 2.125286653325787e-05,
      "loss": 0.3408,
      "step": 825
    },
    {
      "epoch": 1.9915966386554622,
      "grad_norm": 1.1904340982437134,
      "learning_rate": 2.114450351650193e-05,
      "loss": 0.3264,
      "step": 830
    }
  ],
  "logging_steps": 5,
  "max_steps": 2085,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3201796128560906e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}