{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1068,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.028089887640449437,
      "grad_norm": 44.3452575996642,
      "learning_rate": 8.411214953271029e-07,
      "loss": 4.2299,
      "step": 10
    },
    {
      "epoch": 0.056179775280898875,
      "grad_norm": 16.696144417512347,
      "learning_rate": 1.7757009345794394e-06,
      "loss": 3.4728,
      "step": 20
    },
    {
      "epoch": 0.08426966292134831,
      "grad_norm": 5.245647070749203,
      "learning_rate": 2.7102803738317757e-06,
      "loss": 2.2396,
      "step": 30
    },
    {
      "epoch": 0.11235955056179775,
      "grad_norm": 3.747589322083775,
      "learning_rate": 3.6448598130841123e-06,
      "loss": 1.9973,
      "step": 40
    },
    {
      "epoch": 0.1404494382022472,
      "grad_norm": 3.43796713948135,
      "learning_rate": 4.579439252336449e-06,
      "loss": 1.8521,
      "step": 50
    },
    {
      "epoch": 0.16853932584269662,
      "grad_norm": 3.514480101837343,
      "learning_rate": 5.514018691588785e-06,
      "loss": 1.8285,
      "step": 60
    },
    {
      "epoch": 0.19662921348314608,
      "grad_norm": 3.392412234015579,
      "learning_rate": 6.448598130841122e-06,
      "loss": 1.7364,
      "step": 70
    },
    {
      "epoch": 0.2247191011235955,
      "grad_norm": 4.233842511487955,
      "learning_rate": 7.383177570093458e-06,
      "loss": 1.6781,
      "step": 80
    },
    {
      "epoch": 0.25280898876404495,
      "grad_norm": 3.715743054676569,
      "learning_rate": 8.317757009345795e-06,
      "loss": 1.6416,
      "step": 90
    },
    {
      "epoch": 0.2808988764044944,
      "grad_norm": 3.143134814755476,
      "learning_rate": 9.252336448598132e-06,
      "loss": 1.6064,
      "step": 100
    },
    {
      "epoch": 0.3089887640449438,
      "grad_norm": 2.7477799274995416,
      "learning_rate": 9.999893131079397e-06,
      "loss": 1.4984,
      "step": 110
    },
    {
      "epoch": 0.33707865168539325,
      "grad_norm": 2.3802092180915273,
      "learning_rate": 9.996153198516951e-06,
      "loss": 1.5675,
      "step": 120
    },
    {
      "epoch": 0.3651685393258427,
      "grad_norm": 2.672410802886858,
      "learning_rate": 9.987074387433024e-06,
      "loss": 1.4511,
      "step": 130
    },
    {
      "epoch": 0.39325842696629215,
      "grad_norm": 2.624658269919838,
      "learning_rate": 9.972666399425538e-06,
      "loss": 1.4829,
      "step": 140
    },
    {
      "epoch": 0.42134831460674155,
      "grad_norm": 2.222084135278492,
      "learning_rate": 9.952944630839371e-06,
      "loss": 1.5174,
      "step": 150
    },
    {
      "epoch": 0.449438202247191,
      "grad_norm": 2.373569909234205,
      "learning_rate": 9.92793015631386e-06,
      "loss": 1.4802,
      "step": 160
    },
    {
      "epoch": 0.47752808988764045,
      "grad_norm": 2.6373847502517203,
      "learning_rate": 9.897649706262474e-06,
      "loss": 1.4245,
      "step": 170
    },
    {
      "epoch": 0.5056179775280899,
      "grad_norm": 2.849217346030171,
      "learning_rate": 9.862135638308763e-06,
      "loss": 1.4519,
      "step": 180
    },
    {
      "epoch": 0.5337078651685393,
      "grad_norm": 2.8063076239177334,
      "learning_rate": 9.821425902709072e-06,
      "loss": 1.3854,
      "step": 190
    },
    {
      "epoch": 0.5617977528089888,
      "grad_norm": 2.4489682971962914,
      "learning_rate": 9.775564001798973e-06,
      "loss": 1.3687,
      "step": 200
    },
    {
      "epoch": 0.5898876404494382,
      "grad_norm": 2.6550254325393516,
      "learning_rate": 9.724598943506762e-06,
      "loss": 1.3376,
      "step": 210
    },
    {
      "epoch": 0.6179775280898876,
      "grad_norm": 2.5844459484825153,
      "learning_rate": 9.6685851889837e-06,
      "loss": 1.3022,
      "step": 220
    },
    {
      "epoch": 0.6460674157303371,
      "grad_norm": 2.8049982042093564,
      "learning_rate": 9.607582594406941e-06,
      "loss": 1.3088,
      "step": 230
    },
    {
      "epoch": 0.6741573033707865,
      "grad_norm": 2.4864422239085604,
      "learning_rate": 9.541656347017345e-06,
      "loss": 1.2908,
      "step": 240
    },
    {
      "epoch": 0.702247191011236,
      "grad_norm": 2.964748175555427,
      "learning_rate": 9.470876895460545e-06,
      "loss": 1.2868,
      "step": 250
    },
    {
      "epoch": 0.7303370786516854,
      "grad_norm": 2.604536489470611,
      "learning_rate": 9.395319874505661e-06,
      "loss": 1.277,
      "step": 260
    },
    {
      "epoch": 0.7584269662921348,
      "grad_norm": 2.5620862270285185,
      "learning_rate": 9.315066024222163e-06,
      "loss": 1.2643,
      "step": 270
    },
    {
      "epoch": 0.7865168539325843,
      "grad_norm": 2.653270358757971,
      "learning_rate": 9.230201103701201e-06,
      "loss": 1.205,
      "step": 280
    },
    {
      "epoch": 0.8146067415730337,
      "grad_norm": 2.5808237804954923,
      "learning_rate": 9.140815799413624e-06,
      "loss": 1.2222,
      "step": 290
    },
    {
      "epoch": 0.8426966292134831,
      "grad_norm": 2.898027686306709,
      "learning_rate": 9.047005628302617e-06,
      "loss": 1.1676,
      "step": 300
    },
    {
      "epoch": 0.8707865168539326,
      "grad_norm": 3.518431492195722,
      "learning_rate": 8.948870835714491e-06,
      "loss": 1.1993,
      "step": 310
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 2.925246568058356,
      "learning_rate": 8.846516288276743e-06,
      "loss": 1.1115,
      "step": 320
    },
    {
      "epoch": 0.9269662921348315,
      "grad_norm": 2.9427782034508527,
      "learning_rate": 8.740051361837786e-06,
      "loss": 1.1041,
      "step": 330
    },
    {
      "epoch": 0.9550561797752809,
      "grad_norm": 2.5514195156882518,
      "learning_rate": 8.629589824588158e-06,
      "loss": 1.143,
      "step": 340
    },
    {
      "epoch": 0.9831460674157303,
      "grad_norm": 3.015436546535006,
      "learning_rate": 8.515249715488085e-06,
      "loss": 1.0505,
      "step": 350
    },
    {
      "epoch": 1.0112359550561798,
      "grad_norm": 3.2863183728567975,
      "learning_rate": 8.397153218131297e-06,
      "loss": 0.8597,
      "step": 360
    },
    {
      "epoch": 1.0393258426966292,
      "grad_norm": 3.2434695923935033,
      "learning_rate": 8.2754265301799e-06,
      "loss": 0.6543,
      "step": 370
    },
    {
      "epoch": 1.0674157303370786,
      "grad_norm": 4.0641803528122145,
      "learning_rate": 8.150199728509844e-06,
      "loss": 0.6335,
      "step": 380
    },
    {
      "epoch": 1.095505617977528,
      "grad_norm": 3.2752657889372885,
      "learning_rate": 8.02160663021103e-06,
      "loss": 0.6254,
      "step": 390
    },
    {
      "epoch": 1.1235955056179776,
      "grad_norm": 3.326866921104294,
      "learning_rate": 7.889784649590673e-06,
      "loss": 0.6102,
      "step": 400
    },
    {
      "epoch": 1.151685393258427,
      "grad_norm": 3.4005649282793846,
      "learning_rate": 7.754874651332671e-06,
      "loss": 0.5881,
      "step": 410
    },
    {
      "epoch": 1.1797752808988764,
      "grad_norm": 3.138502719677173,
      "learning_rate": 7.617020799969895e-06,
      "loss": 0.5858,
      "step": 420
    },
    {
      "epoch": 1.2078651685393258,
      "grad_norm": 3.343328236660075,
      "learning_rate": 7.476370405830293e-06,
      "loss": 0.6526,
      "step": 430
    },
    {
      "epoch": 1.2359550561797752,
      "grad_norm": 3.435537024900103,
      "learning_rate": 7.333073767621385e-06,
      "loss": 0.5759,
      "step": 440
    },
    {
      "epoch": 1.2640449438202248,
      "grad_norm": 3.0857557861079643,
      "learning_rate": 7.18728401182139e-06,
      "loss": 0.5646,
      "step": 450
    },
    {
      "epoch": 1.2921348314606742,
      "grad_norm": 3.5691357734753724,
      "learning_rate": 7.039156929048603e-06,
      "loss": 0.5574,
      "step": 460
    },
    {
      "epoch": 1.3202247191011236,
      "grad_norm": 3.449734373780758,
      "learning_rate": 6.888850807583875e-06,
      "loss": 0.5308,
      "step": 470
    },
    {
      "epoch": 1.348314606741573,
      "grad_norm": 3.8917107638385215,
      "learning_rate": 6.736526264224101e-06,
      "loss": 0.5457,
      "step": 480
    },
    {
      "epoch": 1.3764044943820224,
      "grad_norm": 3.530441704545947,
      "learning_rate": 6.582346072647455e-06,
      "loss": 0.5429,
      "step": 490
    },
    {
      "epoch": 1.404494382022472,
      "grad_norm": 3.2288633404653657,
      "learning_rate": 6.426474989473785e-06,
      "loss": 0.5216,
      "step": 500
    },
    {
      "epoch": 1.4325842696629214,
      "grad_norm": 4.041005860018691,
      "learning_rate": 6.2690795782060535e-06,
      "loss": 0.4646,
      "step": 510
    },
    {
      "epoch": 1.4606741573033708,
      "grad_norm": 4.067173197200558,
      "learning_rate": 6.1103280312409355e-06,
      "loss": 0.4637,
      "step": 520
    },
    {
      "epoch": 1.4887640449438202,
      "grad_norm": 3.326339667248253,
      "learning_rate": 5.950389990138774e-06,
      "loss": 0.4783,
      "step": 530
    },
    {
      "epoch": 1.5168539325842696,
      "grad_norm": 3.83419253395647,
      "learning_rate": 5.789436364344998e-06,
      "loss": 0.4588,
      "step": 540
    },
    {
      "epoch": 1.5449438202247192,
      "grad_norm": 3.139254395895648,
      "learning_rate": 5.627639148556638e-06,
      "loss": 0.4374,
      "step": 550
    },
    {
      "epoch": 1.5730337078651684,
      "grad_norm": 3.9341533967371403,
      "learning_rate": 5.465171238929173e-06,
      "loss": 0.455,
      "step": 560
    },
    {
      "epoch": 1.601123595505618,
      "grad_norm": 2.9583490118609435,
      "learning_rate": 5.30220624832007e-06,
      "loss": 0.3843,
      "step": 570
    },
    {
      "epoch": 1.6292134831460674,
      "grad_norm": 3.7095899864138606,
      "learning_rate": 5.13891832076646e-06,
      "loss": 0.4115,
      "step": 580
    },
    {
      "epoch": 1.6573033707865168,
      "grad_norm": 3.2360479692986153,
      "learning_rate": 4.9754819453951986e-06,
      "loss": 0.3814,
      "step": 590
    },
    {
      "epoch": 1.6853932584269664,
      "grad_norm": 4.383809713794338,
      "learning_rate": 4.8120717699641535e-06,
      "loss": 0.3791,
      "step": 600
    },
    {
      "epoch": 1.7134831460674156,
      "grad_norm": 3.4036732065921993,
      "learning_rate": 4.648862414233998e-06,
      "loss": 0.3517,
      "step": 610
    },
    {
      "epoch": 1.7415730337078652,
      "grad_norm": 3.8998605138651325,
      "learning_rate": 4.486028283369901e-06,
      "loss": 0.3603,
      "step": 620
    },
    {
      "epoch": 1.7696629213483146,
      "grad_norm": 3.6292532967844835,
      "learning_rate": 4.323743381572557e-06,
      "loss": 0.3184,
      "step": 630
    },
    {
      "epoch": 1.797752808988764,
      "grad_norm": 4.428693489997381,
      "learning_rate": 4.162181126137658e-06,
      "loss": 0.3807,
      "step": 640
    },
    {
      "epoch": 1.8258426966292136,
      "grad_norm": 3.955306267340941,
      "learning_rate": 4.001514162142559e-06,
      "loss": 0.3074,
      "step": 650
    },
    {
      "epoch": 1.8539325842696628,
      "grad_norm": 3.5747162718635197,
      "learning_rate": 3.84191417795811e-06,
      "loss": 0.3111,
      "step": 660
    },
    {
      "epoch": 1.8820224719101124,
      "grad_norm": 3.9923845873645742,
      "learning_rate": 3.6835517217828442e-06,
      "loss": 0.3005,
      "step": 670
    },
    {
      "epoch": 1.9101123595505618,
      "grad_norm": 3.65999276518314,
      "learning_rate": 3.5265960193955338e-06,
      "loss": 0.2559,
      "step": 680
    },
    {
      "epoch": 1.9382022471910112,
      "grad_norm": 3.7068235753264123,
      "learning_rate": 3.3712147933208885e-06,
      "loss": 0.2737,
      "step": 690
    },
    {
      "epoch": 1.9662921348314608,
      "grad_norm": 4.090893668780354,
      "learning_rate": 3.2175740836016323e-06,
      "loss": 0.231,
      "step": 700
    },
    {
      "epoch": 1.99438202247191,
      "grad_norm": 3.90282406209805,
      "learning_rate": 3.065838070368469e-06,
      "loss": 0.2496,
      "step": 710
    },
    {
      "epoch": 2.0224719101123596,
      "grad_norm": 2.796358555674757,
      "learning_rate": 2.9161688983975466e-06,
      "loss": 0.1056,
      "step": 720
    },
    {
      "epoch": 2.050561797752809,
      "grad_norm": 1.840479273209187,
      "learning_rate": 2.7687265038429074e-06,
      "loss": 0.063,
      "step": 730
    },
    {
      "epoch": 2.0786516853932584,
      "grad_norm": 2.5759762149420924,
      "learning_rate": 2.6236684433290494e-06,
      "loss": 0.0596,
      "step": 740
    },
    {
      "epoch": 2.106741573033708,
      "grad_norm": 1.9486659149321488,
      "learning_rate": 2.4811497255862634e-06,
      "loss": 0.0632,
      "step": 750
    },
    {
      "epoch": 2.134831460674157,
      "grad_norm": 2.2514702517323926,
      "learning_rate": 2.341322645808642e-06,
      "loss": 0.0658,
      "step": 760
    },
    {
      "epoch": 2.162921348314607,
      "grad_norm": 2.1967246146223345,
      "learning_rate": 2.204336622911753e-06,
      "loss": 0.0604,
      "step": 770
    },
    {
      "epoch": 2.191011235955056,
      "grad_norm": 2.486502088549764,
      "learning_rate": 2.070338039863917e-06,
      "loss": 0.0627,
      "step": 780
    },
    {
      "epoch": 2.2191011235955056,
      "grad_norm": 1.687530407531317,
      "learning_rate": 1.9394700872616856e-06,
      "loss": 0.0581,
      "step": 790
    },
    {
      "epoch": 2.247191011235955,
      "grad_norm": 2.6462802513977897,
      "learning_rate": 1.8118726103166706e-06,
      "loss": 0.061,
      "step": 800
    },
    {
      "epoch": 2.2752808988764044,
      "grad_norm": 2.016987283249722,
      "learning_rate": 1.6876819594172578e-06,
      "loss": 0.0619,
      "step": 810
    },
    {
      "epoch": 2.303370786516854,
      "grad_norm": 2.0842256218362163,
      "learning_rate": 1.5670308444248777e-06,
      "loss": 0.0584,
      "step": 820
    },
    {
      "epoch": 2.331460674157303,
      "grad_norm": 1.8494552666391486,
      "learning_rate": 1.4500481928605304e-06,
      "loss": 0.0506,
      "step": 830
    },
    {
      "epoch": 2.359550561797753,
      "grad_norm": 2.2638744740418755,
      "learning_rate": 1.3368590121331166e-06,
      "loss": 0.0544,
      "step": 840
    },
    {
      "epoch": 2.3876404494382024,
      "grad_norm": 2.2496173269208737,
      "learning_rate": 1.2275842559567947e-06,
      "loss": 0.0508,
      "step": 850
    },
    {
      "epoch": 2.4157303370786516,
      "grad_norm": 1.5930060349264639,
      "learning_rate": 1.1223406951000936e-06,
      "loss": 0.0501,
      "step": 860
    },
    {
      "epoch": 2.443820224719101,
      "grad_norm": 1.8725457508959784,
      "learning_rate": 1.021240792604929e-06,
      "loss": 0.048,
      "step": 870
    },
    {
      "epoch": 2.4719101123595504,
      "grad_norm": 1.9419600970183988,
      "learning_rate": 9.243925836088386e-07,
      "loss": 0.045,
      "step": 880
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.8713563535913174,
      "learning_rate": 8.318995598988649e-07,
      "loss": 0.0461,
      "step": 890
    },
    {
      "epoch": 2.5280898876404496,
      "grad_norm": 1.995760237288878,
      "learning_rate": 7.438605593204562e-07,
      "loss": 0.0481,
      "step": 900
    },
    {
      "epoch": 2.556179775280899,
      "grad_norm": 2.109074394489752,
      "learning_rate": 6.603696601595577e-07,
      "loss": 0.0396,
      "step": 910
    },
    {
      "epoch": 2.5842696629213484,
      "grad_norm": 3.85749040186571,
      "learning_rate": 5.8151608061076e-07,
      "loss": 0.05,
      "step": 920
    },
    {
      "epoch": 2.6123595505617976,
      "grad_norm": 2.4295027083907246,
      "learning_rate": 5.073840834389293e-07,
      "loss": 0.0448,
      "step": 930
    },
    {
      "epoch": 2.640449438202247,
      "grad_norm": 3.3753210520643893,
      "learning_rate": 4.380528859361954e-07,
      "loss": 0.0502,
      "step": 940
    },
    {
      "epoch": 2.668539325842697,
      "grad_norm": 1.4432588529836197,
      "learning_rate": 3.735965752705256e-07,
      "loss": 0.0448,
      "step": 950
    },
    {
      "epoch": 2.696629213483146,
      "grad_norm": 2.3912332708241815,
      "learning_rate": 3.1408402931634163e-07,
      "loss": 0.0446,
      "step": 960
    },
    {
      "epoch": 2.7247191011235956,
      "grad_norm": 1.6107794104662452,
      "learning_rate": 2.595788430517637e-07,
      "loss": 0.038,
      "step": 970
    },
    {
      "epoch": 2.752808988764045,
      "grad_norm": 1.6211044234112233,
      "learning_rate": 2.1013926060116042e-07,
      "loss": 0.0389,
      "step": 980
    },
    {
      "epoch": 2.7808988764044944,
      "grad_norm": 2.5135992092458546,
      "learning_rate": 1.6581811299560212e-07,
      "loss": 0.0417,
      "step": 990
    },
    {
      "epoch": 2.808988764044944,
      "grad_norm": 1.392970025455271,
      "learning_rate": 1.2666276171773073e-07,
      "loss": 0.0396,
      "step": 1000
    },
    {
      "epoch": 2.837078651685393,
      "grad_norm": 1.8357472070480059,
      "learning_rate": 9.271504809138854e-08,
      "loss": 0.0377,
      "step": 1010
    },
    {
      "epoch": 2.865168539325843,
      "grad_norm": 1.5659783547206103,
      "learning_rate": 6.401124857006502e-08,
      "loss": 0.0369,
      "step": 1020
    },
    {
      "epoch": 2.893258426966292,
      "grad_norm": 1.6270134391199909,
      "learning_rate": 4.058203597195831e-08,
      "loss": 0.0334,
      "step": 1030
    },
    {
      "epoch": 2.9213483146067416,
      "grad_norm": 1.5836178259212397,
      "learning_rate": 2.2452446703067897e-08,
      "loss": 0.0332,
      "step": 1040
    },
    {
      "epoch": 2.949438202247191,
      "grad_norm": 1.5726583876755116,
      "learning_rate": 9.641854003346607e-09,
      "loss": 0.0341,
      "step": 1050
    },
    {
      "epoch": 2.9775280898876404,
      "grad_norm": 1.720292896166018,
      "learning_rate": 2.1639472444956454e-09,
      "loss": 0.0336,
      "step": 1060
    },
    {
      "epoch": 3.0,
      "step": 1068,
      "total_flos": 49945662455808.0,
      "train_loss": 0.6897849787152215,
      "train_runtime": 5575.3809,
      "train_samples_per_second": 12.252,
      "train_steps_per_second": 0.192
    }
  ],
  "logging_steps": 10,
  "max_steps": 1068,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 49945662455808.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}