{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1068, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028089887640449437, "grad_norm": 44.3452575996642, "learning_rate": 8.411214953271029e-07, "loss": 4.2299, "step": 10 }, { "epoch": 0.056179775280898875, "grad_norm": 16.696144417512347, "learning_rate": 1.7757009345794394e-06, "loss": 3.4728, "step": 20 }, { "epoch": 0.08426966292134831, "grad_norm": 5.245647070749203, "learning_rate": 2.7102803738317757e-06, "loss": 2.2396, "step": 30 }, { "epoch": 0.11235955056179775, "grad_norm": 3.747589322083775, "learning_rate": 3.6448598130841123e-06, "loss": 1.9973, "step": 40 }, { "epoch": 0.1404494382022472, "grad_norm": 3.43796713948135, "learning_rate": 4.579439252336449e-06, "loss": 1.8521, "step": 50 }, { "epoch": 0.16853932584269662, "grad_norm": 3.514480101837343, "learning_rate": 5.514018691588785e-06, "loss": 1.8285, "step": 60 }, { "epoch": 0.19662921348314608, "grad_norm": 3.392412234015579, "learning_rate": 6.448598130841122e-06, "loss": 1.7364, "step": 70 }, { "epoch": 0.2247191011235955, "grad_norm": 4.233842511487955, "learning_rate": 7.383177570093458e-06, "loss": 1.6781, "step": 80 }, { "epoch": 0.25280898876404495, "grad_norm": 3.715743054676569, "learning_rate": 8.317757009345795e-06, "loss": 1.6416, "step": 90 }, { "epoch": 0.2808988764044944, "grad_norm": 3.143134814755476, "learning_rate": 9.252336448598132e-06, "loss": 1.6064, "step": 100 }, { "epoch": 0.3089887640449438, "grad_norm": 2.7477799274995416, "learning_rate": 9.999893131079397e-06, "loss": 1.4984, "step": 110 }, { "epoch": 0.33707865168539325, "grad_norm": 2.3802092180915273, "learning_rate": 9.996153198516951e-06, "loss": 1.5675, "step": 120 }, { "epoch": 0.3651685393258427, "grad_norm": 2.672410802886858, "learning_rate": 9.987074387433024e-06, "loss": 1.4511, "step": 130 }, { "epoch": 0.39325842696629215, "grad_norm": 2.624658269919838, "learning_rate": 9.972666399425538e-06, "loss": 1.4829, "step": 140 }, { "epoch": 0.42134831460674155, "grad_norm": 2.222084135278492, "learning_rate": 9.952944630839371e-06, "loss": 1.5174, "step": 150 }, { "epoch": 0.449438202247191, "grad_norm": 2.373569909234205, "learning_rate": 9.92793015631386e-06, "loss": 1.4802, "step": 160 }, { "epoch": 0.47752808988764045, "grad_norm": 2.6373847502517203, "learning_rate": 9.897649706262474e-06, "loss": 1.4245, "step": 170 }, { "epoch": 0.5056179775280899, "grad_norm": 2.849217346030171, "learning_rate": 9.862135638308763e-06, "loss": 1.4519, "step": 180 }, { "epoch": 0.5337078651685393, "grad_norm": 2.8063076239177334, "learning_rate": 9.821425902709072e-06, "loss": 1.3854, "step": 190 }, { "epoch": 0.5617977528089888, "grad_norm": 2.4489682971962914, "learning_rate": 9.775564001798973e-06, "loss": 1.3687, "step": 200 }, { "epoch": 0.5898876404494382, "grad_norm": 2.6550254325393516, "learning_rate": 9.724598943506762e-06, "loss": 1.3376, "step": 210 }, { "epoch": 0.6179775280898876, "grad_norm": 2.5844459484825153, "learning_rate": 9.6685851889837e-06, "loss": 1.3022, "step": 220 }, { "epoch": 0.6460674157303371, "grad_norm": 2.8049982042093564, "learning_rate": 9.607582594406941e-06, "loss": 1.3088, "step": 230 }, { "epoch": 0.6741573033707865, "grad_norm": 2.4864422239085604, "learning_rate": 9.541656347017345e-06, "loss": 1.2908, "step": 240 }, { "epoch": 0.702247191011236, "grad_norm": 2.964748175555427, "learning_rate": 9.470876895460545e-06, "loss": 1.2868, "step": 250 }, { "epoch": 0.7303370786516854, "grad_norm": 2.604536489470611, "learning_rate": 9.395319874505661e-06, "loss": 1.277, "step": 260 }, { "epoch": 0.7584269662921348, "grad_norm": 2.5620862270285185, "learning_rate": 9.315066024222163e-06, "loss": 1.2643, "step": 270 }, { "epoch": 0.7865168539325843, "grad_norm": 2.653270358757971, "learning_rate": 9.230201103701201e-06, "loss": 1.205, "step": 280 }, { "epoch": 0.8146067415730337, "grad_norm": 2.5808237804954923, "learning_rate": 9.140815799413624e-06, "loss": 1.2222, "step": 290 }, { "epoch": 0.8426966292134831, "grad_norm": 2.898027686306709, "learning_rate": 9.047005628302617e-06, "loss": 1.1676, "step": 300 }, { "epoch": 0.8707865168539326, "grad_norm": 3.518431492195722, "learning_rate": 8.948870835714491e-06, "loss": 1.1993, "step": 310 }, { "epoch": 0.898876404494382, "grad_norm": 2.925246568058356, "learning_rate": 8.846516288276743e-06, "loss": 1.1115, "step": 320 }, { "epoch": 0.9269662921348315, "grad_norm": 2.9427782034508527, "learning_rate": 8.740051361837786e-06, "loss": 1.1041, "step": 330 }, { "epoch": 0.9550561797752809, "grad_norm": 2.5514195156882518, "learning_rate": 8.629589824588158e-06, "loss": 1.143, "step": 340 }, { "epoch": 0.9831460674157303, "grad_norm": 3.015436546535006, "learning_rate": 8.515249715488085e-06, "loss": 1.0505, "step": 350 }, { "epoch": 1.0112359550561798, "grad_norm": 3.2863183728567975, "learning_rate": 8.397153218131297e-06, "loss": 0.8597, "step": 360 }, { "epoch": 1.0393258426966292, "grad_norm": 3.2434695923935033, "learning_rate": 8.2754265301799e-06, "loss": 0.6543, "step": 370 }, { "epoch": 1.0674157303370786, "grad_norm": 4.0641803528122145, "learning_rate": 8.150199728509844e-06, "loss": 0.6335, "step": 380 }, { "epoch": 1.095505617977528, "grad_norm": 3.2752657889372885, "learning_rate": 8.02160663021103e-06, "loss": 0.6254, "step": 390 }, { "epoch": 1.1235955056179776, "grad_norm": 3.326866921104294, "learning_rate": 7.889784649590673e-06, "loss": 0.6102, "step": 400 }, { "epoch": 1.151685393258427, "grad_norm": 3.4005649282793846, "learning_rate": 7.754874651332671e-06, "loss": 0.5881, "step": 410 }, { "epoch": 1.1797752808988764, "grad_norm": 3.138502719677173, "learning_rate": 7.617020799969895e-06, "loss": 0.5858, "step": 420 }, { "epoch": 1.2078651685393258, "grad_norm": 3.343328236660075, "learning_rate": 7.476370405830293e-06, "loss": 0.6526, "step": 430 }, { "epoch": 1.2359550561797752, "grad_norm": 3.435537024900103, "learning_rate": 7.333073767621385e-06, "loss": 0.5759, "step": 440 }, { "epoch": 1.2640449438202248, "grad_norm": 3.0857557861079643, "learning_rate": 7.18728401182139e-06, "loss": 0.5646, "step": 450 }, { "epoch": 1.2921348314606742, "grad_norm": 3.5691357734753724, "learning_rate": 7.039156929048603e-06, "loss": 0.5574, "step": 460 }, { "epoch": 1.3202247191011236, "grad_norm": 3.449734373780758, "learning_rate": 6.888850807583875e-06, "loss": 0.5308, "step": 470 }, { "epoch": 1.348314606741573, "grad_norm": 3.8917107638385215, "learning_rate": 6.736526264224101e-06, "loss": 0.5457, "step": 480 }, { "epoch": 1.3764044943820224, "grad_norm": 3.530441704545947, "learning_rate": 6.582346072647455e-06, "loss": 0.5429, "step": 490 }, { "epoch": 1.404494382022472, "grad_norm": 3.2288633404653657, "learning_rate": 6.426474989473785e-06, "loss": 0.5216, "step": 500 }, { "epoch": 1.4325842696629214, "grad_norm": 4.041005860018691, "learning_rate": 6.2690795782060535e-06, "loss": 0.4646, "step": 510 }, { "epoch": 1.4606741573033708, "grad_norm": 4.067173197200558, "learning_rate": 6.1103280312409355e-06, "loss": 0.4637, "step": 520 }, { "epoch": 1.4887640449438202, "grad_norm": 3.326339667248253, "learning_rate": 5.950389990138774e-06, "loss": 0.4783, "step": 530 }, { "epoch": 1.5168539325842696, "grad_norm": 3.83419253395647, "learning_rate": 5.789436364344998e-06, "loss": 0.4588, "step": 540 }, { "epoch": 1.5449438202247192, "grad_norm": 3.139254395895648, "learning_rate": 5.627639148556638e-06, "loss": 0.4374, "step": 550 }, { "epoch": 1.5730337078651684, "grad_norm": 3.9341533967371403, "learning_rate": 5.465171238929173e-06, "loss": 0.455, "step": 560 }, { "epoch": 1.601123595505618, "grad_norm": 2.9583490118609435, "learning_rate": 5.30220624832007e-06, "loss": 0.3843, "step": 570 }, { "epoch": 1.6292134831460674, "grad_norm": 3.7095899864138606, "learning_rate": 5.13891832076646e-06, "loss": 0.4115, "step": 580 }, { "epoch": 1.6573033707865168, "grad_norm": 3.2360479692986153, "learning_rate": 4.9754819453951986e-06, "loss": 0.3814, "step": 590 }, { "epoch": 1.6853932584269664, "grad_norm": 4.383809713794338, "learning_rate": 4.8120717699641535e-06, "loss": 0.3791, "step": 600 }, { "epoch": 1.7134831460674156, "grad_norm": 3.4036732065921993, "learning_rate": 4.648862414233998e-06, "loss": 0.3517, "step": 610 }, { "epoch": 1.7415730337078652, "grad_norm": 3.8998605138651325, "learning_rate": 4.486028283369901e-06, "loss": 0.3603, "step": 620 }, { "epoch": 1.7696629213483146, "grad_norm": 3.6292532967844835, "learning_rate": 4.323743381572557e-06, "loss": 0.3184, "step": 630 }, { "epoch": 1.797752808988764, "grad_norm": 4.428693489997381, "learning_rate": 4.162181126137658e-06, "loss": 0.3807, "step": 640 }, { "epoch": 1.8258426966292136, "grad_norm": 3.955306267340941, "learning_rate": 4.001514162142559e-06, "loss": 0.3074, "step": 650 }, { "epoch": 1.8539325842696628, "grad_norm": 3.5747162718635197, "learning_rate": 3.84191417795811e-06, "loss": 0.3111, "step": 660 }, { "epoch": 1.8820224719101124, "grad_norm": 3.9923845873645742, "learning_rate": 3.6835517217828442e-06, "loss": 0.3005, "step": 670 }, { "epoch": 1.9101123595505618, "grad_norm": 3.65999276518314, "learning_rate": 3.5265960193955338e-06, "loss": 0.2559, "step": 680 }, { "epoch": 1.9382022471910112, "grad_norm": 3.7068235753264123, "learning_rate": 3.3712147933208885e-06, "loss": 0.2737, "step": 690 }, { "epoch": 1.9662921348314608, "grad_norm": 4.090893668780354, "learning_rate": 3.2175740836016323e-06, "loss": 0.231, "step": 700 }, { "epoch": 1.99438202247191, "grad_norm": 3.90282406209805, "learning_rate": 3.065838070368469e-06, "loss": 0.2496, "step": 710 }, { "epoch": 2.0224719101123596, "grad_norm": 2.796358555674757, "learning_rate": 2.9161688983975466e-06, "loss": 0.1056, "step": 720 }, { "epoch": 2.050561797752809, "grad_norm": 1.840479273209187, "learning_rate": 2.7687265038429074e-06, "loss": 0.063, "step": 730 }, { "epoch": 2.0786516853932584, "grad_norm": 2.5759762149420924, "learning_rate": 2.6236684433290494e-06, "loss": 0.0596, "step": 740 }, { "epoch": 2.106741573033708, "grad_norm": 1.9486659149321488, "learning_rate": 2.4811497255862634e-06, "loss": 0.0632, "step": 750 }, { "epoch": 2.134831460674157, "grad_norm": 2.2514702517323926, "learning_rate": 2.341322645808642e-06, "loss": 0.0658, "step": 760 }, { "epoch": 2.162921348314607, "grad_norm": 2.1967246146223345, "learning_rate": 2.204336622911753e-06, "loss": 0.0604, "step": 770 }, { "epoch": 2.191011235955056, "grad_norm": 2.486502088549764, "learning_rate": 2.070338039863917e-06, "loss": 0.0627, "step": 780 }, { "epoch": 2.2191011235955056, "grad_norm": 1.687530407531317, "learning_rate": 1.9394700872616856e-06, "loss": 0.0581, "step": 790 }, { "epoch": 2.247191011235955, "grad_norm": 2.6462802513977897, "learning_rate": 1.8118726103166706e-06, "loss": 0.061, "step": 800 }, { "epoch": 2.2752808988764044, "grad_norm": 2.016987283249722, "learning_rate": 1.6876819594172578e-06, "loss": 0.0619, "step": 810 }, { "epoch": 2.303370786516854, "grad_norm": 2.0842256218362163, "learning_rate": 1.5670308444248777e-06, "loss": 0.0584, "step": 820 }, { "epoch": 2.331460674157303, "grad_norm": 1.8494552666391486, "learning_rate": 1.4500481928605304e-06, "loss": 0.0506, "step": 830 }, { "epoch": 2.359550561797753, "grad_norm": 2.2638744740418755, "learning_rate": 1.3368590121331166e-06, "loss": 0.0544, "step": 840 }, { "epoch": 2.3876404494382024, "grad_norm": 2.2496173269208737, "learning_rate": 1.2275842559567947e-06, "loss": 0.0508, "step": 850 }, { "epoch": 2.4157303370786516, "grad_norm": 1.5930060349264639, "learning_rate": 1.1223406951000936e-06, "loss": 0.0501, "step": 860 }, { "epoch": 2.443820224719101, "grad_norm": 1.8725457508959784, "learning_rate": 1.021240792604929e-06, "loss": 0.048, "step": 870 }, { "epoch": 2.4719101123595504, "grad_norm": 1.9419600970183988, "learning_rate": 9.243925836088386e-07, "loss": 0.045, "step": 880 }, { "epoch": 2.5, "grad_norm": 1.8713563535913174, "learning_rate": 8.318995598988649e-07, "loss": 0.0461, "step": 890 }, { "epoch": 2.5280898876404496, "grad_norm": 1.995760237288878, "learning_rate": 7.438605593204562e-07, "loss": 0.0481, "step": 900 }, { "epoch": 2.556179775280899, "grad_norm": 2.109074394489752, "learning_rate": 6.603696601595577e-07, "loss": 0.0396, "step": 910 }, { "epoch": 2.5842696629213484, "grad_norm": 3.85749040186571, "learning_rate": 5.8151608061076e-07, "loss": 0.05, "step": 920 }, { "epoch": 2.6123595505617976, "grad_norm": 2.4295027083907246, "learning_rate": 5.073840834389293e-07, "loss": 0.0448, "step": 930 }, { "epoch": 2.640449438202247, "grad_norm": 3.3753210520643893, "learning_rate": 4.380528859361954e-07, "loss": 0.0502, "step": 940 }, { "epoch": 2.668539325842697, "grad_norm": 1.4432588529836197, "learning_rate": 3.735965752705256e-07, "loss": 0.0448, "step": 950 }, { "epoch": 2.696629213483146, "grad_norm": 2.3912332708241815, "learning_rate": 3.1408402931634163e-07, "loss": 0.0446, "step": 960 }, { "epoch": 2.7247191011235956, "grad_norm": 1.6107794104662452, "learning_rate": 2.595788430517637e-07, "loss": 0.038, "step": 970 }, { "epoch": 2.752808988764045, "grad_norm": 1.6211044234112233, "learning_rate": 2.1013926060116042e-07, "loss": 0.0389, "step": 980 }, { "epoch": 2.7808988764044944, "grad_norm": 2.5135992092458546, "learning_rate": 1.6581811299560212e-07, "loss": 0.0417, "step": 990 }, { "epoch": 2.808988764044944, "grad_norm": 1.392970025455271, "learning_rate": 1.2666276171773073e-07, "loss": 0.0396, "step": 1000 }, { "epoch": 2.837078651685393, "grad_norm": 1.8357472070480059, "learning_rate": 9.271504809138854e-08, "loss": 0.0377, "step": 1010 }, { "epoch": 2.865168539325843, "grad_norm": 1.5659783547206103, "learning_rate": 6.401124857006502e-08, "loss": 0.0369, "step": 1020 }, { "epoch": 2.893258426966292, "grad_norm": 1.6270134391199909, "learning_rate": 4.058203597195831e-08, "loss": 0.0334, "step": 1030 }, { "epoch": 2.9213483146067416, "grad_norm": 1.5836178259212397, "learning_rate": 2.2452446703067897e-08, "loss": 0.0332, "step": 1040 }, { "epoch": 2.949438202247191, "grad_norm": 1.5726583876755116, "learning_rate": 9.641854003346607e-09, "loss": 0.0341, "step": 1050 }, { "epoch": 2.9775280898876404, "grad_norm": 1.720292896166018, "learning_rate": 2.1639472444956454e-09, "loss": 0.0336, "step": 1060 }, { "epoch": 3.0, "step": 1068, "total_flos": 49945662455808.0, "train_loss": 0.6897849787152215, "train_runtime": 5575.3809, "train_samples_per_second": 12.252, "train_steps_per_second": 0.192 } ], "logging_steps": 10, "max_steps": 1068, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 49945662455808.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }