| { |
| "best_global_step": 2000, |
| "best_metric": 4.951307773590088, |
| "best_model_checkpoint": "/kaggle/working/checkpoints/checkpoint-2000", |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 2452, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.04079967360261118, |
| "grad_norm": 541.5139770507812, |
| "learning_rate": 3.310810810810811e-05, |
| "loss": 116.98884765625, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.08159934720522236, |
| "grad_norm": 285.58978271484375, |
| "learning_rate": 4.947434819175778e-05, |
| "loss": 110.386572265625, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.12239902080783353, |
| "grad_norm": 268.13824462890625, |
| "learning_rate": 4.8423044575273343e-05, |
| "loss": 103.394189453125, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.1631986944104447, |
| "grad_norm": 321.2715148925781, |
| "learning_rate": 4.7371740958788904e-05, |
| "loss": 94.964775390625, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.2039983680130559, |
| "grad_norm": 254.8035125732422, |
| "learning_rate": 4.6320437342304465e-05, |
| "loss": 89.278173828125, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.24479804161566707, |
| "grad_norm": 125.82566833496094, |
| "learning_rate": 4.526913372582002e-05, |
| "loss": 83.7441796875, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.28559771521827826, |
| "grad_norm": 220.39035034179688, |
| "learning_rate": 4.421783010933558e-05, |
| "loss": 77.793349609375, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.3263973888208894, |
| "grad_norm": 171.42626953125, |
| "learning_rate": 4.316652649285114e-05, |
| "loss": 73.357890625, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.3671970624235006, |
| "grad_norm": 98.98644256591797, |
| "learning_rate": 4.2115222876366694e-05, |
| "loss": 69.091845703125, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.4079967360261118, |
| "grad_norm": 146.9388885498047, |
| "learning_rate": 4.1063919259882255e-05, |
| "loss": 65.099111328125, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4079967360261118, |
| "eval_loss": 10.7504301071167, |
| "eval_runtime": 13.6455, |
| "eval_samples_per_second": 58.114, |
| "eval_steps_per_second": 7.328, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.44879640962872297, |
| "grad_norm": 156.0268096923828, |
| "learning_rate": 4.0012615643397815e-05, |
| "loss": 60.6353271484375, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.48959608323133413, |
| "grad_norm": 47.11336135864258, |
| "learning_rate": 3.8961312026913376e-05, |
| "loss": 55.8525439453125, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.5303957568339454, |
| "grad_norm": 102.2573471069336, |
| "learning_rate": 3.791000841042893e-05, |
| "loss": 51.6678466796875, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.5711954304365565, |
| "grad_norm": 21.42934799194336, |
| "learning_rate": 3.685870479394449e-05, |
| "loss": 47.7211572265625, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.6119951040391677, |
| "grad_norm": 18.592626571655273, |
| "learning_rate": 3.580740117746005e-05, |
| "loss": 44.35724609375, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.6527947776417788, |
| "grad_norm": 47.07696533203125, |
| "learning_rate": 3.475609756097561e-05, |
| "loss": 41.478544921875, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.69359445124439, |
| "grad_norm": 9.859332084655762, |
| "learning_rate": 3.370479394449117e-05, |
| "loss": 39.09307861328125, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.7343941248470012, |
| "grad_norm": 5.591737747192383, |
| "learning_rate": 3.2653490328006734e-05, |
| "loss": 37.41315185546875, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.7751937984496124, |
| "grad_norm": 21.721343994140625, |
| "learning_rate": 3.1602186711522294e-05, |
| "loss": 35.67253173828125, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.8159934720522236, |
| "grad_norm": 4.840476989746094, |
| "learning_rate": 3.055088309503785e-05, |
| "loss": 34.03028564453125, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.8159934720522236, |
| "eval_loss": 6.185283184051514, |
| "eval_runtime": 13.5808, |
| "eval_samples_per_second": 58.391, |
| "eval_steps_per_second": 7.363, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.8567931456548348, |
| "grad_norm": 2.8973324298858643, |
| "learning_rate": 2.949957947855341e-05, |
| "loss": 33.30818603515625, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.8975928192574459, |
| "grad_norm": 1.8784422874450684, |
| "learning_rate": 2.844827586206897e-05, |
| "loss": 32.28, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.9383924928600571, |
| "grad_norm": 4.1729865074157715, |
| "learning_rate": 2.7396972245584523e-05, |
| "loss": 31.48105712890625, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.9791921664626683, |
| "grad_norm": 2.748990058898926, |
| "learning_rate": 2.6345668629100084e-05, |
| "loss": 30.878486328125, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.0195838433292534, |
| "grad_norm": 1.7622333765029907, |
| "learning_rate": 2.5294365012615645e-05, |
| "loss": 30.14660400390625, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.0603835169318645, |
| "grad_norm": 2.544262647628784, |
| "learning_rate": 2.4243061396131202e-05, |
| "loss": 29.9266455078125, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.1011831905344758, |
| "grad_norm": 1.752083659172058, |
| "learning_rate": 2.3191757779646763e-05, |
| "loss": 29.6922900390625, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.1419828641370868, |
| "grad_norm": 1.3774715662002563, |
| "learning_rate": 2.2140454163162324e-05, |
| "loss": 29.419208984375, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.182782537739698, |
| "grad_norm": 1.3631178140640259, |
| "learning_rate": 2.1089150546677884e-05, |
| "loss": 29.1697119140625, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.2235822113423094, |
| "grad_norm": 1.5763075351715088, |
| "learning_rate": 2.003784693019344e-05, |
| "loss": 28.77746826171875, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.2235822113423094, |
| "eval_loss": 5.188190460205078, |
| "eval_runtime": 13.686, |
| "eval_samples_per_second": 57.942, |
| "eval_steps_per_second": 7.307, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.2643818849449204, |
| "grad_norm": 1.314376950263977, |
| "learning_rate": 1.8986543313709e-05, |
| "loss": 28.54221923828125, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.3051815585475315, |
| "grad_norm": 8.528578758239746, |
| "learning_rate": 1.793523969722456e-05, |
| "loss": 28.40603515625, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.3459812321501428, |
| "grad_norm": 8.729783058166504, |
| "learning_rate": 1.6883936080740117e-05, |
| "loss": 28.39514892578125, |
| "step": 1650 |
| }, |
| { |
| "epoch": 1.386780905752754, |
| "grad_norm": 1.1846702098846436, |
| "learning_rate": 1.5832632464255678e-05, |
| "loss": 28.02635009765625, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.427580579355365, |
| "grad_norm": 6.678456783294678, |
| "learning_rate": 1.4781328847771237e-05, |
| "loss": 27.882392578125, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.4683802529579764, |
| "grad_norm": 1.1704760789871216, |
| "learning_rate": 1.3730025231286797e-05, |
| "loss": 27.820341796875, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.5091799265605874, |
| "grad_norm": 4.103906154632568, |
| "learning_rate": 1.2678721614802355e-05, |
| "loss": 27.7095068359375, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.5499796001631987, |
| "grad_norm": 1.0563730001449585, |
| "learning_rate": 1.1627417998317915e-05, |
| "loss": 27.6540380859375, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.59077927376581, |
| "grad_norm": 1.4707422256469727, |
| "learning_rate": 1.0576114381833474e-05, |
| "loss": 27.472509765625, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.631578947368421, |
| "grad_norm": 1.8527077436447144, |
| "learning_rate": 9.524810765349033e-06, |
| "loss": 27.4132763671875, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.631578947368421, |
| "eval_loss": 4.951307773590088, |
| "eval_runtime": 13.75, |
| "eval_samples_per_second": 57.673, |
| "eval_steps_per_second": 7.273, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.672378620971032, |
| "grad_norm": 1.2665544748306274, |
| "learning_rate": 8.473507148864592e-06, |
| "loss": 27.26576416015625, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.7131782945736433, |
| "grad_norm": 1.6418918371200562, |
| "learning_rate": 7.422203532380152e-06, |
| "loss": 27.26886474609375, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.7539779681762546, |
| "grad_norm": 1.1888850927352905, |
| "learning_rate": 6.370899915895712e-06, |
| "loss": 27.21632568359375, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.794777641778866, |
| "grad_norm": 1.3705418109893799, |
| "learning_rate": 5.31959629941127e-06, |
| "loss": 27.19008544921875, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.835577315381477, |
| "grad_norm": 1.1785783767700195, |
| "learning_rate": 4.26829268292683e-06, |
| "loss": 27.07623046875, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.876376988984088, |
| "grad_norm": 1.0306082963943481, |
| "learning_rate": 3.2169890664423886e-06, |
| "loss": 27.08888916015625, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.9171766625866993, |
| "grad_norm": 5.090639114379883, |
| "learning_rate": 2.165685449957948e-06, |
| "loss": 26.97588623046875, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.9579763361893106, |
| "grad_norm": 1.3182185888290405, |
| "learning_rate": 1.1143818334735072e-06, |
| "loss": 27.0556298828125, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.9987760097919218, |
| "grad_norm": 1.3252017498016357, |
| "learning_rate": 6.307821698906644e-08, |
| "loss": 26.97030517578125, |
| "step": 2450 |
| }, |
| { |
| "epoch": 2.0, |
| "step": 2452, |
| "total_flos": 2.081454582398976e+16, |
| "train_loss": 44.073582747243165, |
| "train_runtime": 6114.8596, |
| "train_samples_per_second": 25.649, |
| "train_steps_per_second": 0.401 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 2452, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.081454582398976e+16, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|