{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 894,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011198208286674132,
      "grad_norm": 1.0978649854660034,
      "learning_rate": 1.0714285714285714e-06,
      "loss": 1.2808,
      "step": 5
    },
    {
      "epoch": 0.022396416573348264,
      "grad_norm": 0.865485429763794,
      "learning_rate": 2.410714285714286e-06,
      "loss": 1.3219,
      "step": 10
    },
    {
      "epoch": 0.0335946248600224,
      "grad_norm": 0.7268191576004028,
      "learning_rate": 3.75e-06,
      "loss": 1.2607,
      "step": 15
    },
    {
      "epoch": 0.04479283314669653,
      "grad_norm": 0.6289479732513428,
      "learning_rate": 5.0892857142857146e-06,
      "loss": 1.2549,
      "step": 20
    },
    {
      "epoch": 0.055991041433370664,
      "grad_norm": 0.5705351829528809,
      "learning_rate": 6.428571428571429e-06,
      "loss": 1.2545,
      "step": 25
    },
    {
      "epoch": 0.0671892497200448,
      "grad_norm": 0.8970493078231812,
      "learning_rate": 7.767857142857144e-06,
      "loss": 1.2063,
      "step": 30
    },
    {
      "epoch": 0.07838745800671892,
      "grad_norm": 0.5323396325111389,
      "learning_rate": 9.107142857142856e-06,
      "loss": 1.1831,
      "step": 35
    },
    {
      "epoch": 0.08958566629339305,
      "grad_norm": 0.4824765622615814,
      "learning_rate": 1.044642857142857e-05,
      "loss": 1.1912,
      "step": 40
    },
    {
      "epoch": 0.10078387458006718,
      "grad_norm": 0.5041835308074951,
      "learning_rate": 1.1785714285714286e-05,
      "loss": 1.1675,
      "step": 45
    },
    {
      "epoch": 0.11198208286674133,
      "grad_norm": 0.4978722631931305,
      "learning_rate": 1.3125e-05,
      "loss": 1.1681,
      "step": 50
    },
    {
      "epoch": 0.12318029115341546,
      "grad_norm": 0.428523451089859,
      "learning_rate": 1.4464285714285715e-05,
      "loss": 1.154,
      "step": 55
    },
    {
      "epoch": 0.1343784994400896,
      "grad_norm": 0.4901936650276184,
      "learning_rate": 1.580357142857143e-05,
      "loss": 1.1488,
      "step": 60
    },
    {
      "epoch": 0.1455767077267637,
      "grad_norm": 0.4443103075027466,
      "learning_rate": 1.7142857142857142e-05,
      "loss": 1.2096,
      "step": 65
    },
    {
      "epoch": 0.15677491601343785,
      "grad_norm": 0.5249817967414856,
      "learning_rate": 1.848214285714286e-05,
      "loss": 1.1517,
      "step": 70
    },
    {
      "epoch": 0.167973124300112,
      "grad_norm": 0.4530683755874634,
      "learning_rate": 1.982142857142857e-05,
      "loss": 1.1722,
      "step": 75
    },
    {
      "epoch": 0.1791713325867861,
      "grad_norm": 0.47545140981674194,
      "learning_rate": 2.1160714285714287e-05,
      "loss": 1.1244,
      "step": 80
    },
    {
      "epoch": 0.19036954087346025,
      "grad_norm": 0.4221179187297821,
      "learning_rate": 2.25e-05,
      "loss": 1.0931,
      "step": 85
    },
    {
      "epoch": 0.20156774916013437,
      "grad_norm": 0.4603467285633087,
      "learning_rate": 2.3839285714285713e-05,
      "loss": 1.1179,
      "step": 90
    },
    {
      "epoch": 0.2127659574468085,
      "grad_norm": 0.48785561323165894,
      "learning_rate": 2.517857142857143e-05,
      "loss": 1.1119,
      "step": 95
    },
    {
      "epoch": 0.22396416573348266,
      "grad_norm": 0.46568208932876587,
      "learning_rate": 2.6517857142857143e-05,
      "loss": 1.1662,
      "step": 100
    },
    {
      "epoch": 0.23516237402015677,
      "grad_norm": 0.43084895610809326,
      "learning_rate": 2.7857142857142858e-05,
      "loss": 1.1031,
      "step": 105
    },
    {
      "epoch": 0.24636058230683092,
      "grad_norm": 0.46002936363220215,
      "learning_rate": 2.9196428571428573e-05,
      "loss": 1.1044,
      "step": 110
    },
    {
      "epoch": 0.25755879059350506,
      "grad_norm": 0.5120033025741577,
      "learning_rate": 2.9999934306758047e-05,
      "loss": 1.0682,
      "step": 115
    },
    {
      "epoch": 0.2687569988801792,
      "grad_norm": 0.5539029836654663,
      "learning_rate": 2.9999195264394326e-05,
      "loss": 1.0544,
      "step": 120
    },
    {
      "epoch": 0.2799552071668533,
      "grad_norm": 0.5363937616348267,
      "learning_rate": 2.9997635103707554e-05,
      "loss": 1.0822,
      "step": 125
    },
    {
      "epoch": 0.2911534154535274,
      "grad_norm": 0.49808958172798157,
      "learning_rate": 2.999525391010742e-05,
      "loss": 1.0351,
      "step": 130
    },
    {
      "epoch": 0.3023516237402016,
      "grad_norm": 0.6337416768074036,
      "learning_rate": 2.9992051813950364e-05,
      "loss": 1.0445,
      "step": 135
    },
    {
      "epoch": 0.3135498320268757,
      "grad_norm": 0.5711157321929932,
      "learning_rate": 2.998802899053244e-05,
      "loss": 1.0135,
      "step": 140
    },
    {
      "epoch": 0.3247480403135498,
      "grad_norm": 0.6856613755226135,
      "learning_rate": 2.998318566007973e-05,
      "loss": 1.013,
      "step": 145
    },
    {
      "epoch": 0.335946248600224,
      "grad_norm": 0.5463095307350159,
      "learning_rate": 2.99775220877363e-05,
      "loss": 1.0258,
      "step": 150
    },
    {
      "epoch": 0.3471444568868981,
      "grad_norm": 0.6269132494926453,
      "learning_rate": 2.9971038583549633e-05,
      "loss": 1.0128,
      "step": 155
    },
    {
      "epoch": 0.3583426651735722,
      "grad_norm": 0.5507866740226746,
      "learning_rate": 2.9963735502453715e-05,
      "loss": 0.9991,
      "step": 160
    },
    {
      "epoch": 0.36954087346024633,
      "grad_norm": 0.6331682205200195,
      "learning_rate": 2.995561324424958e-05,
      "loss": 0.9721,
      "step": 165
    },
    {
      "epoch": 0.3807390817469205,
      "grad_norm": 0.6370624303817749,
      "learning_rate": 2.9946672253583415e-05,
      "loss": 0.9839,
      "step": 170
    },
    {
      "epoch": 0.3919372900335946,
      "grad_norm": 0.588683545589447,
      "learning_rate": 2.9936913019922235e-05,
      "loss": 0.9616,
      "step": 175
    },
    {
      "epoch": 0.40313549832026874,
      "grad_norm": 0.6478800773620605,
      "learning_rate": 2.9926336077527062e-05,
      "loss": 0.9782,
      "step": 180
    },
    {
      "epoch": 0.4143337066069429,
      "grad_norm": 0.6072382926940918,
      "learning_rate": 2.9914942005423723e-05,
      "loss": 0.9035,
      "step": 185
    },
    {
      "epoch": 0.425531914893617,
      "grad_norm": 0.830245852470398,
      "learning_rate": 2.9902731427371096e-05,
      "loss": 0.9616,
      "step": 190
    },
    {
      "epoch": 0.43673012318029114,
      "grad_norm": 0.7037926912307739,
      "learning_rate": 2.9889705011827006e-05,
      "loss": 0.9044,
      "step": 195
    },
    {
      "epoch": 0.4479283314669653,
      "grad_norm": 0.689367949962616,
      "learning_rate": 2.9875863471911608e-05,
      "loss": 0.9293,
      "step": 200
    },
    {
      "epoch": 0.45912653975363943,
      "grad_norm": 0.7107216715812683,
      "learning_rate": 2.9861207565368363e-05,
      "loss": 0.908,
      "step": 205
    },
    {
      "epoch": 0.47032474804031354,
      "grad_norm": 0.7383051514625549,
      "learning_rate": 2.9845738094522533e-05,
      "loss": 0.9435,
      "step": 210
    },
    {
      "epoch": 0.48152295632698766,
      "grad_norm": 0.7229745388031006,
      "learning_rate": 2.9829455906237287e-05,
      "loss": 0.8984,
      "step": 215
    },
    {
      "epoch": 0.49272116461366183,
      "grad_norm": 0.746848464012146,
      "learning_rate": 2.9812361891867325e-05,
      "loss": 0.8979,
      "step": 220
    },
    {
      "epoch": 0.503919372900336,
      "grad_norm": 0.723301112651825,
      "learning_rate": 2.979445698721007e-05,
      "loss": 0.9337,
      "step": 225
    },
    {
      "epoch": 0.5151175811870101,
      "grad_norm": 0.7699582576751709,
      "learning_rate": 2.9775742172454473e-05,
      "loss": 0.9443,
      "step": 230
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 0.7237251400947571,
      "learning_rate": 2.9756218472127302e-05,
      "loss": 0.8489,
      "step": 235
    },
    {
      "epoch": 0.5375139977603584,
      "grad_norm": 0.8070515990257263,
      "learning_rate": 2.9735886955037118e-05,
      "loss": 0.8639,
      "step": 240
    },
    {
      "epoch": 0.5487122060470325,
      "grad_norm": 0.7536729574203491,
      "learning_rate": 2.9714748734215714e-05,
      "loss": 0.8083,
      "step": 245
    },
    {
      "epoch": 0.5599104143337066,
      "grad_norm": 0.8057552576065063,
      "learning_rate": 2.969280496685719e-05,
      "loss": 0.8714,
      "step": 250
    },
    {
      "epoch": 0.5711086226203808,
      "grad_norm": 0.8718234896659851,
      "learning_rate": 2.9670056854254634e-05,
      "loss": 0.8651,
      "step": 255
    },
    {
      "epoch": 0.5823068309070548,
      "grad_norm": 0.8957020044326782,
      "learning_rate": 2.9646505641734324e-05,
      "loss": 0.8428,
      "step": 260
    },
    {
      "epoch": 0.593505039193729,
      "grad_norm": 0.7812408804893494,
      "learning_rate": 2.9622152618587576e-05,
      "loss": 0.8646,
      "step": 265
    },
    {
      "epoch": 0.6047032474804032,
      "grad_norm": 0.918906033039093,
      "learning_rate": 2.9596999118000145e-05,
      "loss": 0.8891,
      "step": 270
    },
    {
      "epoch": 0.6159014557670772,
      "grad_norm": 0.9143264293670654,
      "learning_rate": 2.9571046516979256e-05,
      "loss": 0.814,
      "step": 275
    },
    {
      "epoch": 0.6270996640537514,
      "grad_norm": 0.9321989417076111,
      "learning_rate": 2.954429623627821e-05,
      "loss": 0.7757,
      "step": 280
    },
    {
      "epoch": 0.6382978723404256,
      "grad_norm": 0.7887775301933289,
      "learning_rate": 2.9516749740318623e-05,
      "loss": 0.8416,
      "step": 285
    },
    {
      "epoch": 0.6494960806270996,
      "grad_norm": 0.8512288331985474,
      "learning_rate": 2.948840853711022e-05,
      "loss": 0.8131,
      "step": 290
    },
    {
      "epoch": 0.6606942889137738,
      "grad_norm": 0.8105642199516296,
      "learning_rate": 2.9459274178168335e-05,
      "loss": 0.8115,
      "step": 295
    },
    {
      "epoch": 0.671892497200448,
      "grad_norm": 0.7953616976737976,
      "learning_rate": 2.9429348258428933e-05,
      "loss": 0.8266,
      "step": 300
    },
    {
      "epoch": 0.683090705487122,
      "grad_norm": 2.0142805576324463,
      "learning_rate": 2.9398632416161298e-05,
      "loss": 0.7923,
      "step": 305
    },
    {
      "epoch": 0.6942889137737962,
      "grad_norm": 0.9255501627922058,
      "learning_rate": 2.936712833287837e-05,
      "loss": 0.7464,
      "step": 310
    },
    {
      "epoch": 0.7054871220604704,
      "grad_norm": 0.8493944406509399,
      "learning_rate": 2.9334837733244686e-05,
      "loss": 0.7682,
      "step": 315
    },
    {
      "epoch": 0.7166853303471444,
      "grad_norm": 1.1492375135421753,
      "learning_rate": 2.9301762384981944e-05,
      "loss": 0.7543,
      "step": 320
    },
    {
      "epoch": 0.7278835386338186,
      "grad_norm": 0.9279423952102661,
      "learning_rate": 2.926790409877225e-05,
      "loss": 0.7954,
      "step": 325
    },
    {
      "epoch": 0.7390817469204927,
      "grad_norm": 0.8645827770233154,
      "learning_rate": 2.9233264728158997e-05,
      "loss": 0.8014,
      "step": 330
    },
    {
      "epoch": 0.7502799552071668,
      "grad_norm": 0.8726319670677185,
      "learning_rate": 2.9197846169445376e-05,
      "loss": 0.7396,
      "step": 335
    },
    {
      "epoch": 0.761478163493841,
      "grad_norm": 0.9269015192985535,
      "learning_rate": 2.916165036159058e-05,
      "loss": 0.7109,
      "step": 340
    },
    {
      "epoch": 0.7726763717805151,
      "grad_norm": 0.9171246290206909,
      "learning_rate": 2.912467928610366e-05,
      "loss": 0.7041,
      "step": 345
    },
    {
      "epoch": 0.7838745800671892,
      "grad_norm": 0.8157719373703003,
      "learning_rate": 2.9086934966935015e-05,
      "loss": 0.7498,
      "step": 350
    },
    {
      "epoch": 0.7950727883538634,
      "grad_norm": 1.0310955047607422,
      "learning_rate": 2.9048419470365656e-05,
      "loss": 0.7538,
      "step": 355
    },
    {
      "epoch": 0.8062709966405375,
      "grad_norm": 0.9810894131660461,
      "learning_rate": 2.9009134904894015e-05,
      "loss": 0.698,
      "step": 360
    },
    {
      "epoch": 0.8174692049272116,
      "grad_norm": 0.8990864157676697,
      "learning_rate": 2.8969083421120587e-05,
      "loss": 0.6898,
      "step": 365
    },
    {
      "epoch": 0.8286674132138858,
      "grad_norm": 0.8737045526504517,
      "learning_rate": 2.892826721163013e-05,
      "loss": 0.7175,
      "step": 370
    },
    {
      "epoch": 0.8398656215005599,
      "grad_norm": 0.8609009981155396,
      "learning_rate": 2.8886688510871706e-05,
      "loss": 0.7466,
      "step": 375
    },
    {
      "epoch": 0.851063829787234,
      "grad_norm": 0.9786498546600342,
      "learning_rate": 2.884434959503628e-05,
      "loss": 0.7221,
      "step": 380
    },
    {
      "epoch": 0.8622620380739082,
      "grad_norm": 0.9019100069999695,
      "learning_rate": 2.8801252781932194e-05,
      "loss": 0.7047,
      "step": 385
    },
    {
      "epoch": 0.8734602463605823,
      "grad_norm": 0.9129027128219604,
      "learning_rate": 2.8757400430858193e-05,
      "loss": 0.7235,
      "step": 390
    },
    {
      "epoch": 0.8846584546472565,
      "grad_norm": 0.9829080700874329,
      "learning_rate": 2.871279494247435e-05,
      "loss": 0.6777,
      "step": 395
    },
    {
      "epoch": 0.8958566629339306,
      "grad_norm": 0.9491150975227356,
      "learning_rate": 2.8667438758670582e-05,
      "loss": 0.7271,
      "step": 400
    },
    {
      "epoch": 0.9070548712206047,
      "grad_norm": 0.9399238228797913,
      "learning_rate": 2.8621334362433017e-05,
      "loss": 0.6362,
      "step": 405
    },
    {
      "epoch": 0.9182530795072789,
      "grad_norm": 0.9197304844856262,
      "learning_rate": 2.857448427770802e-05,
      "loss": 0.666,
      "step": 410
    },
    {
      "epoch": 0.9294512877939529,
      "grad_norm": 0.9620344042778015,
      "learning_rate": 2.8526891069264058e-05,
      "loss": 0.677,
      "step": 415
    },
    {
      "epoch": 0.9406494960806271,
      "grad_norm": 0.8626346588134766,
      "learning_rate": 2.847855734255128e-05,
      "loss": 0.6432,
      "step": 420
    },
    {
      "epoch": 0.9518477043673013,
      "grad_norm": 0.9515690803527832,
      "learning_rate": 2.8429485743558876e-05,
      "loss": 0.6336,
      "step": 425
    },
    {
      "epoch": 0.9630459126539753,
      "grad_norm": 1.0451574325561523,
      "learning_rate": 2.8379678958670245e-05,
      "loss": 0.6471,
      "step": 430
    },
    {
      "epoch": 0.9742441209406495,
      "grad_norm": 0.9338580369949341,
      "learning_rate": 2.8329139714515916e-05,
      "loss": 0.6526,
      "step": 435
    },
    {
      "epoch": 0.9854423292273237,
      "grad_norm": 1.034649133682251,
      "learning_rate": 2.8277870777824278e-05,
      "loss": 0.6459,
      "step": 440
    },
    {
      "epoch": 0.9966405375139977,
      "grad_norm": 1.0309592485427856,
      "learning_rate": 2.822587495527013e-05,
      "loss": 0.6495,
      "step": 445
    },
    {
      "epoch": 1.0067189249720045,
      "grad_norm": 0.9272093772888184,
      "learning_rate": 2.817315509332102e-05,
      "loss": 0.6473,
      "step": 450
    },
    {
      "epoch": 1.0179171332586787,
      "grad_norm": 1.0917242765426636,
      "learning_rate": 2.8119714078081428e-05,
      "loss": 0.5854,
      "step": 455
    },
    {
      "epoch": 1.0291153415453527,
      "grad_norm": 0.9767428040504456,
      "learning_rate": 2.8065554835134766e-05,
      "loss": 0.5468,
      "step": 460
    },
    {
      "epoch": 1.0403135498320268,
      "grad_norm": 0.9543578028678894,
      "learning_rate": 2.8010680329383213e-05,
      "loss": 0.5365,
      "step": 465
    },
    {
      "epoch": 1.051511758118701,
      "grad_norm": 0.9958118200302124,
      "learning_rate": 2.7955093564885412e-05,
      "loss": 0.5602,
      "step": 470
    },
    {
      "epoch": 1.0627099664053752,
      "grad_norm": 0.9233402609825134,
      "learning_rate": 2.7898797584692003e-05,
      "loss": 0.5523,
      "step": 475
    },
    {
      "epoch": 1.0739081746920494,
      "grad_norm": 1.1012506484985352,
      "learning_rate": 2.784179547067906e-05,
      "loss": 0.5821,
      "step": 480
    },
    {
      "epoch": 1.0851063829787233,
      "grad_norm": 0.9179466962814331,
      "learning_rate": 2.7784090343379337e-05,
      "loss": 0.5227,
      "step": 485
    },
    {
      "epoch": 1.0963045912653975,
      "grad_norm": 1.1877224445343018,
      "learning_rate": 2.7725685361811472e-05,
      "loss": 0.5462,
      "step": 490
    },
    {
      "epoch": 1.1075027995520716,
      "grad_norm": 0.9985042810440063,
      "learning_rate": 2.7666583723307035e-05,
      "loss": 0.5437,
      "step": 495
    },
    {
      "epoch": 1.1187010078387458,
      "grad_norm": 1.211738109588623,
      "learning_rate": 2.7606788663335498e-05,
      "loss": 0.5293,
      "step": 500
    },
    {
      "epoch": 1.12989921612542,
      "grad_norm": 1.0043666362762451,
      "learning_rate": 2.7546303455327093e-05,
      "loss": 0.5199,
      "step": 505
    },
    {
      "epoch": 1.1410974244120942,
      "grad_norm": 1.134454369544983,
      "learning_rate": 2.7485131410493644e-05,
      "loss": 0.5272,
      "step": 510
    },
    {
      "epoch": 1.1522956326987681,
      "grad_norm": 1.006064534187317,
      "learning_rate": 2.742327587764726e-05,
      "loss": 0.5601,
      "step": 515
    },
    {
      "epoch": 1.1634938409854423,
      "grad_norm": 1.0664838552474976,
      "learning_rate": 2.7360740243017042e-05,
      "loss": 0.5324,
      "step": 520
    },
    {
      "epoch": 1.1746920492721165,
      "grad_norm": 0.9521324634552002,
      "learning_rate": 2.729752793006368e-05,
      "loss": 0.5258,
      "step": 525
    },
    {
      "epoch": 1.1858902575587906,
      "grad_norm": 1.0168689489364624,
      "learning_rate": 2.723364239929206e-05,
      "loss": 0.5062,
      "step": 530
    },
    {
      "epoch": 1.1970884658454648,
      "grad_norm": 1.09732985496521,
      "learning_rate": 2.71690871480618e-05,
      "loss": 0.5249,
      "step": 535
    },
    {
      "epoch": 1.2082866741321387,
      "grad_norm": 1.062366008758545,
      "learning_rate": 2.7103865710395803e-05,
      "loss": 0.5003,
      "step": 540
    },
    {
      "epoch": 1.219484882418813,
      "grad_norm": 1.0284899473190308,
      "learning_rate": 2.7037981656786802e-05,
      "loss": 0.4832,
      "step": 545
    },
    {
      "epoch": 1.230683090705487,
      "grad_norm": 1.1590033769607544,
      "learning_rate": 2.6971438594001862e-05,
      "loss": 0.502,
      "step": 550
    },
    {
      "epoch": 1.2418812989921613,
      "grad_norm": 1.2048217058181763,
      "learning_rate": 2.690424016488496e-05,
      "loss": 0.5068,
      "step": 555
    },
    {
      "epoch": 1.2530795072788354,
      "grad_norm": 1.0057001113891602,
      "learning_rate": 2.6836390048157555e-05,
      "loss": 0.501,
      "step": 560
    },
    {
      "epoch": 1.2642777155655094,
      "grad_norm": 1.0815175771713257,
      "learning_rate": 2.676789195821719e-05,
      "loss": 0.4754,
      "step": 565
    },
    {
      "epoch": 1.2754759238521838,
      "grad_norm": 0.9652005434036255,
      "learning_rate": 2.6698749644934155e-05,
      "loss": 0.4928,
      "step": 570
    },
    {
      "epoch": 1.2866741321388577,
      "grad_norm": 1.0275248289108276,
      "learning_rate": 2.6628966893446215e-05,
      "loss": 0.4819,
      "step": 575
    },
    {
      "epoch": 1.297872340425532,
      "grad_norm": 1.0320004224777222,
      "learning_rate": 2.655854752395137e-05,
      "loss": 0.5305,
      "step": 580
    },
    {
      "epoch": 1.309070548712206,
      "grad_norm": 1.03120756149292,
      "learning_rate": 2.6487495391498757e-05,
      "loss": 0.4958,
      "step": 585
    },
    {
      "epoch": 1.3202687569988802,
      "grad_norm": 0.9786254167556763,
      "learning_rate": 2.6415814385777565e-05,
      "loss": 0.4638,
      "step": 590
    },
    {
      "epoch": 1.3314669652855544,
      "grad_norm": 1.0948402881622314,
      "learning_rate": 2.634350843090414e-05,
      "loss": 0.4689,
      "step": 595
    },
    {
      "epoch": 1.3426651735722284,
      "grad_norm": 1.3032407760620117,
      "learning_rate": 2.6270581485207137e-05,
      "loss": 0.4767,
      "step": 600
    },
    {
      "epoch": 1.3538633818589025,
      "grad_norm": 1.0457333326339722,
      "learning_rate": 2.6197037541010827e-05,
      "loss": 0.4747,
      "step": 605
    },
    {
      "epoch": 1.3650615901455767,
      "grad_norm": 0.9719673991203308,
      "learning_rate": 2.6122880624416553e-05,
      "loss": 0.4693,
      "step": 610
    },
    {
      "epoch": 1.3762597984322509,
      "grad_norm": 1.0594805479049683,
      "learning_rate": 2.604811479508231e-05,
      "loss": 0.4199,
      "step": 615
    },
    {
      "epoch": 1.387458006718925,
      "grad_norm": 1.0848315954208374,
      "learning_rate": 2.5972744146000504e-05,
      "loss": 0.4375,
      "step": 620
    },
    {
      "epoch": 1.398656215005599,
      "grad_norm": 1.1041808128356934,
      "learning_rate": 2.5896772803273903e-05,
      "loss": 0.5127,
      "step": 625
    },
    {
      "epoch": 1.4098544232922732,
      "grad_norm": 1.0456072092056274,
      "learning_rate": 2.582020492588973e-05,
      "loss": 0.484,
      "step": 630
    },
    {
      "epoch": 1.4210526315789473,
      "grad_norm": 0.9886185526847839,
      "learning_rate": 2.574304470549201e-05,
      "loss": 0.4442,
      "step": 635
    },
    {
      "epoch": 1.4322508398656215,
      "grad_norm": 1.148207426071167,
      "learning_rate": 2.566529636615207e-05,
      "loss": 0.4223,
      "step": 640
    },
    {
      "epoch": 1.4434490481522957,
      "grad_norm": 0.9983200430870056,
      "learning_rate": 2.5586964164137325e-05,
      "loss": 0.4234,
      "step": 645
    },
    {
      "epoch": 1.4546472564389696,
      "grad_norm": 1.0275447368621826,
      "learning_rate": 2.5508052387678258e-05,
      "loss": 0.4618,
      "step": 650
    },
    {
      "epoch": 1.465845464725644,
      "grad_norm": 1.1754274368286133,
      "learning_rate": 2.5428565356733672e-05,
      "loss": 0.4258,
      "step": 655
    },
    {
      "epoch": 1.477043673012318,
      "grad_norm": 1.3802127838134766,
      "learning_rate": 2.534850742275418e-05,
      "loss": 0.4388,
      "step": 660
    },
    {
      "epoch": 1.4882418812989922,
      "grad_norm": 1.1148616075515747,
      "learning_rate": 2.5267882968444017e-05,
      "loss": 0.4366,
      "step": 665
    },
    {
      "epoch": 1.4994400895856663,
      "grad_norm": 1.0787454843521118,
      "learning_rate": 2.518669640752109e-05,
      "loss": 0.4121,
      "step": 670
    },
    {
      "epoch": 1.5106382978723403,
      "grad_norm": 1.1478720903396606,
      "learning_rate": 2.5104952184475346e-05,
      "loss": 0.4201,
      "step": 675
    },
    {
      "epoch": 1.5218365061590147,
      "grad_norm": 0.9818703532218933,
      "learning_rate": 2.5022654774325507e-05,
      "loss": 0.4343,
      "step": 680
    },
    {
      "epoch": 1.5330347144456886,
      "grad_norm": 1.0454514026641846,
      "learning_rate": 2.4939808682374028e-05,
      "loss": 0.4594,
      "step": 685
    },
    {
      "epoch": 1.5442329227323628,
      "grad_norm": 0.996567964553833,
      "learning_rate": 2.48564184439605e-05,
      "loss": 0.4373,
      "step": 690
    },
    {
      "epoch": 1.555431131019037,
      "grad_norm": 1.1099140644073486,
      "learning_rate": 2.4772488624213352e-05,
      "loss": 0.4076,
      "step": 695
    },
    {
      "epoch": 1.5666293393057111,
      "grad_norm": 1.1161383390426636,
      "learning_rate": 2.4688023817799944e-05,
      "loss": 0.4096,
      "step": 700
    },
    {
      "epoch": 1.5778275475923853,
      "grad_norm": 1.0627449750900269,
      "learning_rate": 2.460302864867502e-05,
      "loss": 0.4008,
      "step": 705
    },
    {
      "epoch": 1.5890257558790593,
      "grad_norm": 1.0296977758407593,
      "learning_rate": 2.4517507769827598e-05,
      "loss": 0.469,
      "step": 710
    },
    {
      "epoch": 1.6002239641657336,
      "grad_norm": 1.1758477687835693,
      "learning_rate": 2.4431465863026223e-05,
      "loss": 0.3854,
      "step": 715
    },
    {
      "epoch": 1.6114221724524076,
      "grad_norm": 1.0715276002883911,
      "learning_rate": 2.434490763856268e-05,
      "loss": 0.4006,
      "step": 720
    },
    {
      "epoch": 1.6226203807390818,
      "grad_norm": 1.1095898151397705,
      "learning_rate": 2.4257837834994123e-05,
      "loss": 0.3902,
      "step": 725
    },
    {
      "epoch": 1.633818589025756,
      "grad_norm": 1.100608468055725,
      "learning_rate": 2.4170261218883686e-05,
      "loss": 0.4034,
      "step": 730
    },
    {
      "epoch": 1.64501679731243,
      "grad_norm": 1.1237363815307617,
      "learning_rate": 2.4082182584539526e-05,
      "loss": 0.4103,
      "step": 735
    },
    {
      "epoch": 1.6562150055991043,
      "grad_norm": 1.0138239860534668,
      "learning_rate": 2.3993606753752356e-05,
      "loss": 0.3801,
      "step": 740
    },
    {
      "epoch": 1.6674132138857782,
      "grad_norm": 1.0473990440368652,
      "learning_rate": 2.39045385755315e-05,
      "loss": 0.3642,
      "step": 745
    },
    {
      "epoch": 1.6786114221724524,
      "grad_norm": 0.9463981986045837,
      "learning_rate": 2.381498292583942e-05,
      "loss": 0.4,
      "step": 750
    },
    {
      "epoch": 1.6898096304591266,
      "grad_norm": 1.0383639335632324,
      "learning_rate": 2.37249447073248e-05,
      "loss": 0.4388,
      "step": 755
    },
    {
      "epoch": 1.7010078387458005,
      "grad_norm": 1.0742672681808472,
      "learning_rate": 2.3634428849054156e-05,
      "loss": 0.3784,
      "step": 760
    },
    {
      "epoch": 1.712206047032475,
      "grad_norm": 0.9601068496704102,
      "learning_rate": 2.3543440306241965e-05,
      "loss": 0.4132,
      "step": 765
    },
    {
      "epoch": 1.7234042553191489,
      "grad_norm": 1.1224182844161987,
      "learning_rate": 2.3451984059979444e-05,
      "loss": 0.3628,
      "step": 770
    },
    {
      "epoch": 1.734602463605823,
      "grad_norm": 1.1569995880126953,
      "learning_rate": 2.336006511696184e-05,
      "loss": 0.3678,
      "step": 775
    },
    {
      "epoch": 1.7458006718924972,
      "grad_norm": 1.1215687990188599,
      "learning_rate": 2.3267688509214335e-05,
      "loss": 0.3658,
      "step": 780
    },
    {
      "epoch": 1.7569988801791714,
      "grad_norm": 1.0910794734954834,
      "learning_rate": 2.317485929381658e-05,
      "loss": 0.3702,
      "step": 785
    },
    {
      "epoch": 1.7681970884658456,
      "grad_norm": 1.161943793296814,
      "learning_rate": 2.3081582552625867e-05,
      "loss": 0.3851,
      "step": 790
    },
    {
      "epoch": 1.7793952967525195,
      "grad_norm": 1.1181591749191284,
      "learning_rate": 2.29878633919989e-05,
      "loss": 0.3623,
      "step": 795
    },
    {
      "epoch": 1.7905935050391937,
      "grad_norm": 0.9931478500366211,
      "learning_rate": 2.2893706942512257e-05,
      "loss": 0.3783,
      "step": 800
    },
    {
      "epoch": 1.8017917133258678,
      "grad_norm": 1.125067114830017,
      "learning_rate": 2.2799118358681535e-05,
      "loss": 0.3615,
      "step": 805
    },
    {
      "epoch": 1.812989921612542,
      "grad_norm": 1.1069613695144653,
      "learning_rate": 2.2704102818679164e-05,
      "loss": 0.3572,
      "step": 810
    },
    {
      "epoch": 1.8241881298992162,
      "grad_norm": 0.9140533208847046,
      "learning_rate": 2.2608665524050923e-05,
      "loss": 0.3553,
      "step": 815
    },
    {
      "epoch": 1.8353863381858901,
      "grad_norm": 0.987232506275177,
      "learning_rate": 2.25128116994312e-05,
      "loss": 0.3563,
      "step": 820
    },
    {
      "epoch": 1.8465845464725645,
      "grad_norm": 1.1312346458435059,
      "learning_rate": 2.241654659225696e-05,
      "loss": 0.3563,
      "step": 825
    },
    {
      "epoch": 1.8577827547592385,
      "grad_norm": 1.0891382694244385,
      "learning_rate": 2.231987547248049e-05,
      "loss": 0.3586,
      "step": 830
    },
    {
      "epoch": 1.8689809630459127,
      "grad_norm": 1.134059190750122,
      "learning_rate": 2.2222803632280894e-05,
      "loss": 0.3539,
      "step": 835
    },
    {
      "epoch": 1.8801791713325868,
      "grad_norm": 1.0670528411865234,
      "learning_rate": 2.2125336385774385e-05,
      "loss": 0.3506,
      "step": 840
    },
    {
      "epoch": 1.8913773796192608,
      "grad_norm": 1.1700224876403809,
      "learning_rate": 2.2027479068723345e-05,
      "loss": 0.344,
      "step": 845
    },
    {
      "epoch": 1.9025755879059352,
      "grad_norm": 1.1153310537338257,
      "learning_rate": 2.1929237038244254e-05,
      "loss": 0.335,
      "step": 850
    },
    {
      "epoch": 1.9137737961926091,
      "grad_norm": 3.077934741973877,
      "learning_rate": 2.1830615672514404e-05,
      "loss": 0.3383,
      "step": 855
    },
    {
      "epoch": 1.9249720044792833,
      "grad_norm": 0.9678553342819214,
      "learning_rate": 2.1731620370477468e-05,
      "loss": 0.337,
      "step": 860
    },
    {
      "epoch": 1.9361702127659575,
      "grad_norm": 0.9908361434936523,
      "learning_rate": 2.1632256551547952e-05,
      "loss": 0.3673,
      "step": 865
    },
    {
      "epoch": 1.9473684210526314,
      "grad_norm": 1.0404701232910156,
      "learning_rate": 2.1532529655314514e-05,
      "loss": 0.3276,
      "step": 870
    },
    {
      "epoch": 1.9585666293393058,
      "grad_norm": 0.9922332763671875,
      "learning_rate": 2.1432445141242166e-05,
      "loss": 0.3408,
      "step": 875
    },
    {
      "epoch": 1.9697648376259798,
      "grad_norm": 1.036054253578186,
      "learning_rate": 2.1332008488373417e-05,
      "loss": 0.3758,
      "step": 880
    },
    {
      "epoch": 1.980963045912654,
      "grad_norm": 1.0212777853012085,
      "learning_rate": 2.12312251950283e-05,
      "loss": 0.3191,
      "step": 885
    },
    {
      "epoch": 1.992161254199328,
      "grad_norm": 1.0531727075576782,
      "learning_rate": 2.1130100778503407e-05,
      "loss": 0.3344,
      "step": 890
    }
  ],
  "logging_steps": 5,
  "max_steps": 2235,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.352177971225428e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|