{ "best_global_step": 800, "best_metric": 0.5648624300956726, "best_model_checkpoint": "models/generation/description/checkpoint-800", "epoch": 17.5, "eval_steps": 400, "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.25, "grad_norm": 5.374443531036377, "learning_rate": 8.166666666666667e-05, "loss": 12.4423, "step": 50 }, { "epoch": 0.5, "grad_norm": 4.090597152709961, "learning_rate": 9.996828927498017e-05, "loss": 2.8795, "step": 100 }, { "epoch": 0.75, "grad_norm": 3.756592035293579, "learning_rate": 9.983493166277486e-05, "loss": 2.5534, "step": 150 }, { "epoch": 1.0, "grad_norm": 3.0228748321533203, "learning_rate": 9.959768225002347e-05, "loss": 2.5155, "step": 200 }, { "epoch": 1.25, "grad_norm": 3.339592456817627, "learning_rate": 9.925703563494947e-05, "loss": 2.3755, "step": 250 }, { "epoch": 1.5, "grad_norm": 2.7049825191497803, "learning_rate": 9.881370196982982e-05, "loss": 2.3446, "step": 300 }, { "epoch": 1.75, "grad_norm": 2.8367323875427246, "learning_rate": 9.826860548052725e-05, "loss": 2.3269, "step": 350 }, { "epoch": 2.0, "grad_norm": 2.6648616790771484, "learning_rate": 9.76228825397397e-05, "loss": 2.3153, "step": 400 }, { "epoch": 2.0, "eval_loss": 0.5770760178565979, "eval_runtime": 15.1499, "eval_samples_per_second": 26.403, "eval_steps_per_second": 6.601, "step": 400 }, { "epoch": 2.25, "grad_norm": 2.7784676551818848, "learning_rate": 9.687787929798317e-05, "loss": 2.1744, "step": 450 }, { "epoch": 2.5, "grad_norm": 2.5936291217803955, "learning_rate": 9.603514887724691e-05, "loss": 2.2059, "step": 500 }, { "epoch": 2.75, "grad_norm": 2.7534546852111816, "learning_rate": 9.509644813317144e-05, "loss": 2.187, "step": 550 }, { "epoch": 3.0, "grad_norm": 2.6206445693969727, "learning_rate": 9.406373399249911e-05, "loss": 2.1917, "step": 600 }, { "epoch": 3.25, "grad_norm": 2.880136251449585, "learning_rate": 9.293915937343299e-05, "loss": 2.0629, "step": 650 }, { "epoch": 3.5, "grad_norm": 2.911586284637451, "learning_rate": 9.172506869740849e-05, "loss": 2.0542, "step": 700 }, { "epoch": 3.75, "grad_norm": 2.876952886581421, "learning_rate": 9.042399300163484e-05, "loss": 2.0751, "step": 750 }, { "epoch": 4.0, "grad_norm": 2.7687273025512695, "learning_rate": 8.90386446625952e-05, "loss": 2.0719, "step": 800 }, { "epoch": 4.0, "eval_loss": 0.5648624300956726, "eval_runtime": 15.2421, "eval_samples_per_second": 26.243, "eval_steps_per_second": 6.561, "step": 800 }, { "epoch": 4.25, "grad_norm": 3.4546947479248047, "learning_rate": 8.757191174150532e-05, "loss": 1.9407, "step": 850 }, { "epoch": 4.5, "grad_norm": 3.255340337753296, "learning_rate": 8.60268519635192e-05, "loss": 1.935, "step": 900 }, { "epoch": 4.75, "grad_norm": 3.166053056716919, "learning_rate": 8.440668634323305e-05, "loss": 1.9591, "step": 950 }, { "epoch": 5.0, "grad_norm": 3.0402660369873047, "learning_rate": 8.271479246977678e-05, "loss": 1.9412, "step": 1000 }, { "epoch": 5.25, "grad_norm": 3.6618967056274414, "learning_rate": 8.095469746549172e-05, "loss": 1.8086, "step": 1050 }, { "epoch": 5.5, "grad_norm": 3.8914847373962402, "learning_rate": 7.913007063287361e-05, "loss": 1.8275, "step": 1100 }, { "epoch": 5.75, "grad_norm": 3.528841018676758, "learning_rate": 7.724471580511021e-05, "loss": 1.8043, "step": 1150 }, { "epoch": 6.0, "grad_norm": 3.459420919418335, "learning_rate": 7.530256341615994e-05, "loss": 1.8232, "step": 1200 }, { "epoch": 6.0, "eval_loss": 0.5697617530822754, "eval_runtime": 15.3206, "eval_samples_per_second": 26.109, "eval_steps_per_second": 6.527, "step": 1200 }, { "epoch": 6.25, "grad_norm": 3.8563907146453857, "learning_rate": 7.33076623069039e-05, "loss": 1.6684, "step": 1250 }, { "epoch": 6.5, "grad_norm": 4.100020408630371, "learning_rate": 7.126417128445263e-05, "loss": 1.6671, "step": 1300 }, { "epoch": 6.75, "grad_norm": 4.32282829284668, "learning_rate": 6.917635045220425e-05, "loss": 1.687, "step": 1350 }, { "epoch": 7.0, "grad_norm": 4.05823278427124, "learning_rate": 6.704855232872843e-05, "loss": 1.6991, "step": 1400 }, { "epoch": 7.25, "grad_norm": 4.55335807800293, "learning_rate": 6.488521277399067e-05, "loss": 1.5066, "step": 1450 }, { "epoch": 7.5, "grad_norm": 4.818643093109131, "learning_rate": 6.26908417418333e-05, "loss": 1.5315, "step": 1500 }, { "epoch": 7.75, "grad_norm": 4.753720283508301, "learning_rate": 6.0470013877991525e-05, "loss": 1.5572, "step": 1550 }, { "epoch": 8.0, "grad_norm": 4.679745197296143, "learning_rate": 5.8227358983245274e-05, "loss": 1.5593, "step": 1600 }, { "epoch": 8.0, "eval_loss": 0.6191036105155945, "eval_runtime": 15.2291, "eval_samples_per_second": 26.266, "eval_steps_per_second": 6.566, "step": 1600 }, { "epoch": 8.25, "grad_norm": 5.900501728057861, "learning_rate": 5.5967552361588e-05, "loss": 1.3844, "step": 1650 }, { "epoch": 8.5, "grad_norm": 5.828335285186768, "learning_rate": 5.3695305073534455e-05, "loss": 1.3716, "step": 1700 }, { "epoch": 8.75, "grad_norm": 5.216298580169678, "learning_rate": 5.141535411488584e-05, "loss": 1.4014, "step": 1750 }, { "epoch": 9.0, "grad_norm": 5.470153331756592, "learning_rate": 4.913245254142751e-05, "loss": 1.4239, "step": 1800 }, { "epoch": 9.25, "grad_norm": 6.552476406097412, "learning_rate": 4.685135956014587e-05, "loss": 1.2293, "step": 1850 }, { "epoch": 9.5, "grad_norm": 6.554981231689453, "learning_rate": 4.4576830607621834e-05, "loss": 1.2586, "step": 1900 }, { "epoch": 9.75, "grad_norm": 5.98954963684082, "learning_rate": 4.231360743628464e-05, "loss": 1.2697, "step": 1950 }, { "epoch": 10.0, "grad_norm": 6.276147842407227, "learning_rate": 4.00664082291931e-05, "loss": 1.2779, "step": 2000 }, { "epoch": 10.0, "eval_loss": 0.6903724074363708, "eval_runtime": 15.0396, "eval_samples_per_second": 26.596, "eval_steps_per_second": 6.649, "step": 2000 }, { "epoch": 10.25, "grad_norm": 7.1180267333984375, "learning_rate": 3.78399177639524e-05, "loss": 1.1174, "step": 2050 }, { "epoch": 10.5, "grad_norm": 6.683684825897217, "learning_rate": 3.563877764627195e-05, "loss": 1.1296, "step": 2100 }, { "epoch": 10.75, "grad_norm": 7.034246444702148, "learning_rate": 3.34675766335243e-05, "loss": 1.1328, "step": 2150 }, { "epoch": 11.0, "grad_norm": 6.708085536956787, "learning_rate": 3.13308410684782e-05, "loss": 1.1434, "step": 2200 }, { "epoch": 11.25, "grad_norm": 6.986971378326416, "learning_rate": 2.9233025443148317e-05, "loss": 0.9911, "step": 2250 }, { "epoch": 11.5, "grad_norm": 6.527721881866455, "learning_rate": 2.7178503112433672e-05, "loss": 1.0082, "step": 2300 }, { "epoch": 11.75, "grad_norm": 7.1417341232299805, "learning_rate": 2.517155717690404e-05, "loss": 1.0233, "step": 2350 }, { "epoch": 12.0, "grad_norm": 7.950260639190674, "learning_rate": 2.3216371553741295e-05, "loss": 1.0301, "step": 2400 }, { "epoch": 12.0, "eval_loss": 0.8407155871391296, "eval_runtime": 15.0924, "eval_samples_per_second": 26.503, "eval_steps_per_second": 6.626, "step": 2400 }, { "epoch": 12.25, "grad_norm": 6.401738166809082, "learning_rate": 2.131702225445008e-05, "loss": 0.8974, "step": 2450 }, { "epoch": 12.5, "grad_norm": 7.200794219970703, "learning_rate": 1.9477468887521627e-05, "loss": 0.9053, "step": 2500 }, { "epoch": 12.75, "grad_norm": 7.977651119232178, "learning_rate": 1.770154640376479e-05, "loss": 0.9308, "step": 2550 }, { "epoch": 13.0, "grad_norm": 7.930706024169922, "learning_rate": 1.5992957101513524e-05, "loss": 0.9281, "step": 2600 }, { "epoch": 13.25, "grad_norm": 7.262845039367676, "learning_rate": 1.4355262908377271e-05, "loss": 0.8209, "step": 2650 }, { "epoch": 13.5, "grad_norm": 8.71459674835205, "learning_rate": 1.2791877955624859e-05, "loss": 0.8412, "step": 2700 }, { "epoch": 13.75, "grad_norm": 8.987716674804688, "learning_rate": 1.1306061460682072e-05, "loss": 0.8469, "step": 2750 }, { "epoch": 14.0, "grad_norm": 7.137842178344727, "learning_rate": 9.90091093258102e-06, "loss": 0.8318, "step": 2800 }, { "epoch": 14.0, "eval_loss": 0.9897236824035645, "eval_runtime": 15.2729, "eval_samples_per_second": 26.19, "eval_steps_per_second": 6.548, "step": 2800 }, { "epoch": 14.25, "grad_norm": 6.863259792327881, "learning_rate": 8.579355714525994e-06, "loss": 0.7727, "step": 2850 }, { "epoch": 14.5, "grad_norm": 7.495807647705078, "learning_rate": 7.3441508770376975e-06, "loss": 0.7773, "step": 2900 }, { "epoch": 14.75, "grad_norm": 7.848569869995117, "learning_rate": 6.197871474406936e-06, "loss": 0.7709, "step": 2950 }, { "epoch": 15.0, "grad_norm": 7.177825450897217, "learning_rate": 5.142907176431455e-06, "loss": 0.7807, "step": 3000 }, { "epoch": 15.25, "grad_norm": 7.384244918823242, "learning_rate": 4.181457286627316e-06, "loss": 0.7236, "step": 3050 }, { "epoch": 15.5, "grad_norm": 7.398818492889404, "learning_rate": 3.3155261573003195e-06, "loss": 0.7448, "step": 3100 }, { "epoch": 15.75, "grad_norm": 7.505466461181641, "learning_rate": 2.5469190110357475e-06, "loss": 0.7335, "step": 3150 }, { "epoch": 16.0, "grad_norm": 7.520777702331543, "learning_rate": 1.8772381773176417e-06, "loss": 0.7423, "step": 3200 }, { "epoch": 16.0, "eval_loss": 1.061659574508667, "eval_runtime": 15.1874, "eval_samples_per_second": 26.338, "eval_steps_per_second": 6.584, "step": 3200 }, { "epoch": 16.25, "grad_norm": 8.022893905639648, "learning_rate": 1.307879752122948e-06, "loss": 0.7108, "step": 3250 }, { "epoch": 16.5, "grad_norm": 7.253084659576416, "learning_rate": 8.40030687454535e-07, "loss": 0.7164, "step": 3300 }, { "epoch": 16.75, "grad_norm": 7.7122087478637695, "learning_rate": 4.746663168804566e-07, "loss": 0.708, "step": 3350 }, { "epoch": 17.0, "grad_norm": 7.264121055603027, "learning_rate": 2.1254832223808196e-07, "loss": 0.7214, "step": 3400 }, { "epoch": 17.25, "grad_norm": 7.239109516143799, "learning_rate": 5.4223145741943983e-08, "loss": 0.7047, "step": 3450 }, { "epoch": 17.5, "grad_norm": 7.081119060516357, "learning_rate": 2.08508055765666e-11, "loss": 0.6976, "step": 3500 } ], "logging_steps": 50, "max_steps": 3500, "num_input_tokens_seen": 0, "num_train_epochs": 18, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.7756473836306752e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }