{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 503, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019880715705765406, "grad_norm": 127.0667237907462, "learning_rate": 5.882352941176471e-08, "loss": 3.918, "step": 1 }, { "epoch": 0.019880715705765408, "grad_norm": 115.98143785016711, "learning_rate": 5.882352941176471e-07, "loss": 3.7431, "step": 10 }, { "epoch": 0.039761431411530816, "grad_norm": 65.61390946147996, "learning_rate": 1.1764705882352942e-06, "loss": 3.0734, "step": 20 }, { "epoch": 0.05964214711729622, "grad_norm": 41.009392563213375, "learning_rate": 1.7647058823529412e-06, "loss": 2.4079, "step": 30 }, { "epoch": 0.07952286282306163, "grad_norm": 25.739828109506355, "learning_rate": 2.3529411764705885e-06, "loss": 2.1062, "step": 40 }, { "epoch": 0.09940357852882704, "grad_norm": 17.26914793098219, "learning_rate": 2.941176470588235e-06, "loss": 2.0659, "step": 50 }, { "epoch": 0.11928429423459244, "grad_norm": 25.05959744665656, "learning_rate": 2.9970662200387674e-06, "loss": 2.0592, "step": 60 }, { "epoch": 0.13916500994035785, "grad_norm": 15.43081495953451, "learning_rate": 2.986939491128791e-06, "loss": 2.0407, "step": 70 }, { "epoch": 0.15904572564612326, "grad_norm": 12.58490370084457, "learning_rate": 2.969632483038685e-06, "loss": 2.0257, "step": 80 }, { "epoch": 0.17892644135188868, "grad_norm": 20.125125391550814, "learning_rate": 2.94522876954573e-06, "loss": 1.9816, "step": 90 }, { "epoch": 0.1988071570576541, "grad_norm": 8.501045071308724, "learning_rate": 2.9138461936939467e-06, "loss": 1.9523, "step": 100 }, { "epoch": 0.1988071570576541, "eval_loss": 1.9134721755981445, "eval_runtime": 63.5356, "eval_samples_per_second": 26.662, "eval_steps_per_second": 0.425, "step": 100 }, { "epoch": 0.21868787276341947, "grad_norm": 22.082329977354508, "learning_rate": 2.875636298742058e-06, "loss": 1.9368, "step": 110 }, { "epoch": 0.23856858846918488, "grad_norm": 15.818359711105115, "learning_rate": 2.8307835963765403e-06, "loss": 1.9224, "step": 120 }, { "epoch": 0.2584493041749503, "grad_norm": 13.730610615135147, "learning_rate": 2.779504675723508e-06, "loss": 1.9146, "step": 130 }, { "epoch": 0.2783300198807157, "grad_norm": 6.635422891471246, "learning_rate": 2.722047157461906e-06, "loss": 1.9164, "step": 140 }, { "epoch": 0.2982107355864811, "grad_norm": 8.086513909738859, "learning_rate": 2.6586884980885044e-06, "loss": 1.9076, "step": 150 }, { "epoch": 0.31809145129224653, "grad_norm": 13.60628995566901, "learning_rate": 2.5897346501087633e-06, "loss": 1.9047, "step": 160 }, { "epoch": 0.3379721669980119, "grad_norm": 18.644035876158316, "learning_rate": 2.5155185846233844e-06, "loss": 1.8866, "step": 170 }, { "epoch": 0.35785288270377735, "grad_norm": 17.47074603496702, "learning_rate": 2.43639868344482e-06, "loss": 1.8956, "step": 180 }, { "epoch": 0.37773359840954274, "grad_norm": 7.178728301299398, "learning_rate": 2.3527570085080407e-06, "loss": 1.9043, "step": 190 }, { "epoch": 0.3976143141153082, "grad_norm": 12.253857653229236, "learning_rate": 2.264997456932413e-06, "loss": 1.9159, "step": 200 }, { "epoch": 0.3976143141153082, "eval_loss": 1.8660345077514648, "eval_runtime": 63.4283, "eval_samples_per_second": 26.707, "eval_steps_per_second": 0.426, "step": 200 }, { "epoch": 0.41749502982107356, "grad_norm": 11.988930445624767, "learning_rate": 2.1735438106436967e-06, "loss": 1.9004, "step": 210 }, { "epoch": 0.43737574552683894, "grad_norm": 10.753394166683348, "learning_rate": 2.078837689974332e-06, "loss": 1.9172, "step": 220 }, { "epoch": 0.4572564612326044, "grad_norm": 9.351072895819225, "learning_rate": 1.981336421123892e-06, "loss": 1.9192, "step": 230 }, { "epoch": 0.47713717693836977, "grad_norm": 7.682035183703906, "learning_rate": 1.8815108277774976e-06, "loss": 1.8959, "step": 240 }, { "epoch": 0.4970178926441352, "grad_norm": 11.950308335271014, "learning_rate": 1.7798429575462477e-06, "loss": 1.8733, "step": 250 }, { "epoch": 0.5168986083499006, "grad_norm": 10.982955764593422, "learning_rate": 1.6768237542084645e-06, "loss": 1.8827, "step": 260 }, { "epoch": 0.536779324055666, "grad_norm": 11.904679442862472, "learning_rate": 1.5729506869922447e-06, "loss": 1.8765, "step": 270 }, { "epoch": 0.5566600397614314, "grad_norm": 9.827244740110832, "learning_rate": 1.4687253483472872e-06, "loss": 1.8841, "step": 280 }, { "epoch": 0.5765407554671969, "grad_norm": 7.306293618482458, "learning_rate": 1.3646510318060986e-06, "loss": 1.8773, "step": 290 }, { "epoch": 0.5964214711729622, "grad_norm": 9.071120593329336, "learning_rate": 1.2612303016308466e-06, "loss": 1.875, "step": 300 }, { "epoch": 0.5964214711729622, "eval_loss": 1.8548645973205566, "eval_runtime": 63.4556, "eval_samples_per_second": 26.696, "eval_steps_per_second": 0.425, "step": 300 }, { "epoch": 0.6163021868787276, "grad_norm": 9.443760924559943, "learning_rate": 1.1589625659817845e-06, "loss": 1.8568, "step": 310 }, { "epoch": 0.6361829025844931, "grad_norm": 11.674966838003883, "learning_rate": 1.0583416653261663e-06, "loss": 1.877, "step": 320 }, { "epoch": 0.6560636182902585, "grad_norm": 8.948890555802585, "learning_rate": 9.598534877329919e-07, "loss": 1.8663, "step": 330 }, { "epoch": 0.6759443339960238, "grad_norm": 7.926486654587874, "learning_rate": 8.639736225690654e-07, "loss": 1.8776, "step": 340 }, { "epoch": 0.6958250497017893, "grad_norm": 6.133488811515441, "learning_rate": 7.711650639264374e-07, "loss": 1.8669, "step": 350 }, { "epoch": 0.7157057654075547, "grad_norm": 5.995054734148766, "learning_rate": 6.818759748711476e-07, "loss": 1.8661, "step": 360 }, { "epoch": 0.73558648111332, "grad_norm": 6.532247908566974, "learning_rate": 5.965375233094762e-07, "loss": 1.8429, "step": 370 }, { "epoch": 0.7554671968190855, "grad_norm": 8.52788084676906, "learning_rate": 5.155617999220938e-07, "loss": 1.883, "step": 380 }, { "epoch": 0.7753479125248509, "grad_norm": 7.412951150651719, "learning_rate": 4.3933982822017883e-07, "loss": 1.8518, "step": 390 }, { "epoch": 0.7952286282306164, "grad_norm": 8.704060015623933, "learning_rate": 3.6823967633276183e-07, "loss": 1.841, "step": 400 }, { "epoch": 0.7952286282306164, "eval_loss": 1.834498643875122, "eval_runtime": 63.4011, "eval_samples_per_second": 26.719, "eval_steps_per_second": 0.426, "step": 400 }, { "epoch": 0.8151093439363817, "grad_norm": 10.837619200928202, "learning_rate": 3.026046796432582e-07, "loss": 1.8274, "step": 410 }, { "epoch": 0.8349900596421471, "grad_norm": 9.650000403237328, "learning_rate": 2.4275178285790973e-07, "loss": 1.8457, "step": 420 }, { "epoch": 0.8548707753479126, "grad_norm": 8.514388424330665, "learning_rate": 1.889700095121219e-07, "loss": 1.8333, "step": 430 }, { "epoch": 0.8747514910536779, "grad_norm": 11.640897943920702, "learning_rate": 1.4151906630527865e-07, "loss": 1.8412, "step": 440 }, { "epoch": 0.8946322067594433, "grad_norm": 13.422663396524422, "learning_rate": 1.00628089003575e-07, "loss": 1.8505, "step": 450 }, { "epoch": 0.9145129224652088, "grad_norm": 6.751723671685878, "learning_rate": 6.649453596676663e-08, "loss": 1.8411, "step": 460 }, { "epoch": 0.9343936381709742, "grad_norm": 7.821816393267081, "learning_rate": 3.928323464188621e-08, "loss": 1.8268, "step": 470 }, { "epoch": 0.9542743538767395, "grad_norm": 7.257934599669054, "learning_rate": 1.9125585628307407e-08, "loss": 1.8413, "step": 480 }, { "epoch": 0.974155069582505, "grad_norm": 7.949497125452414, "learning_rate": 6.118928157650749e-09, "loss": 1.8531, "step": 490 }, { "epoch": 0.9940357852882704, "grad_norm": 7.167162679479334, "learning_rate": 3.260700525591909e-10, "loss": 1.8309, "step": 500 }, { "epoch": 0.9940357852882704, "eval_loss": 1.8293424844741821, "eval_runtime": 63.4663, "eval_samples_per_second": 26.691, "eval_steps_per_second": 0.425, "step": 500 } ], "logging_steps": 10, "max_steps": 503, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 162675912867840.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }