| { | |
| "best_metric": 1.719967246055603, | |
| "best_model_checkpoint": "ckpts/sft_OLMo-1B-hf/checkpoint-940", | |
| "epoch": 4.96042216358839, | |
| "eval_steps": 20, | |
| "global_step": 940, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.10554089709762533, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 2.4799, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.10554089709762533, | |
| "eval_loss": 2.3184142112731934, | |
| "eval_runtime": 4.0729, | |
| "eval_samples_per_second": 49.105, | |
| "eval_steps_per_second": 12.276, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.21108179419525067, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 2.2916, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.21108179419525067, | |
| "eval_loss": 2.265637159347534, | |
| "eval_runtime": 4.034, | |
| "eval_samples_per_second": 49.579, | |
| "eval_steps_per_second": 12.395, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.316622691292876, | |
| "grad_norm": 7.1875, | |
| "learning_rate": 1e-05, | |
| "loss": 2.2737, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.316622691292876, | |
| "eval_loss": 2.2540640830993652, | |
| "eval_runtime": 3.9951, | |
| "eval_samples_per_second": 50.062, | |
| "eval_steps_per_second": 12.515, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.42216358839050133, | |
| "grad_norm": 7.25, | |
| "learning_rate": 1e-05, | |
| "loss": 2.1889, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.42216358839050133, | |
| "eval_loss": 2.2447378635406494, | |
| "eval_runtime": 4.3965, | |
| "eval_samples_per_second": 45.491, | |
| "eval_steps_per_second": 11.373, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.5277044854881267, | |
| "grad_norm": 8.0, | |
| "learning_rate": 1e-05, | |
| "loss": 2.2005, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.5277044854881267, | |
| "eval_loss": 2.225715398788452, | |
| "eval_runtime": 4.0633, | |
| "eval_samples_per_second": 49.221, | |
| "eval_steps_per_second": 12.305, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.633245382585752, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 1e-05, | |
| "loss": 2.1915, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.633245382585752, | |
| "eval_loss": 2.208789587020874, | |
| "eval_runtime": 4.4371, | |
| "eval_samples_per_second": 45.075, | |
| "eval_steps_per_second": 11.269, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.7387862796833773, | |
| "grad_norm": 7.875, | |
| "learning_rate": 1e-05, | |
| "loss": 2.2115, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.7387862796833773, | |
| "eval_loss": 2.189687728881836, | |
| "eval_runtime": 4.427, | |
| "eval_samples_per_second": 45.177, | |
| "eval_steps_per_second": 11.294, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.8443271767810027, | |
| "grad_norm": 7.28125, | |
| "learning_rate": 1e-05, | |
| "loss": 2.1754, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.8443271767810027, | |
| "eval_loss": 2.1662068367004395, | |
| "eval_runtime": 4.4352, | |
| "eval_samples_per_second": 45.094, | |
| "eval_steps_per_second": 11.273, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.9498680738786279, | |
| "grad_norm": 7.4375, | |
| "learning_rate": 1e-05, | |
| "loss": 2.1529, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.9498680738786279, | |
| "eval_loss": 2.151796340942383, | |
| "eval_runtime": 4.578, | |
| "eval_samples_per_second": 43.688, | |
| "eval_steps_per_second": 10.922, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.0554089709762533, | |
| "grad_norm": 7.4375, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0596, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.0554089709762533, | |
| "eval_loss": 2.150334119796753, | |
| "eval_runtime": 4.3064, | |
| "eval_samples_per_second": 46.443, | |
| "eval_steps_per_second": 11.611, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.1609498680738786, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9336, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.1609498680738786, | |
| "eval_loss": 2.138848066329956, | |
| "eval_runtime": 4.7845, | |
| "eval_samples_per_second": 41.802, | |
| "eval_steps_per_second": 10.45, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.266490765171504, | |
| "grad_norm": 8.125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.917, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.266490765171504, | |
| "eval_loss": 2.1307833194732666, | |
| "eval_runtime": 4.549, | |
| "eval_samples_per_second": 43.966, | |
| "eval_steps_per_second": 10.992, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.3720316622691293, | |
| "grad_norm": 8.375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9214, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.3720316622691293, | |
| "eval_loss": 2.11213755607605, | |
| "eval_runtime": 4.4528, | |
| "eval_samples_per_second": 44.916, | |
| "eval_steps_per_second": 11.229, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.4775725593667546, | |
| "grad_norm": 8.25, | |
| "learning_rate": 1e-05, | |
| "loss": 1.9631, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.4775725593667546, | |
| "eval_loss": 2.0881500244140625, | |
| "eval_runtime": 4.489, | |
| "eval_samples_per_second": 44.553, | |
| "eval_steps_per_second": 11.138, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.58311345646438, | |
| "grad_norm": 8.5, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8888, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.58311345646438, | |
| "eval_loss": 2.0727522373199463, | |
| "eval_runtime": 4.6234, | |
| "eval_samples_per_second": 43.258, | |
| "eval_steps_per_second": 10.815, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.6886543535620053, | |
| "grad_norm": 8.75, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8634, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.6886543535620053, | |
| "eval_loss": 2.0583410263061523, | |
| "eval_runtime": 4.3979, | |
| "eval_samples_per_second": 45.476, | |
| "eval_steps_per_second": 11.369, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.7941952506596306, | |
| "grad_norm": 9.625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8716, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.7941952506596306, | |
| "eval_loss": 2.0440073013305664, | |
| "eval_runtime": 4.4336, | |
| "eval_samples_per_second": 45.111, | |
| "eval_steps_per_second": 11.278, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.899736147757256, | |
| "grad_norm": 8.625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8626, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.899736147757256, | |
| "eval_loss": 2.027642011642456, | |
| "eval_runtime": 4.5994, | |
| "eval_samples_per_second": 43.484, | |
| "eval_steps_per_second": 10.871, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.005277044854881, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8374, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.005277044854881, | |
| "eval_loss": 2.023581027984619, | |
| "eval_runtime": 4.513, | |
| "eval_samples_per_second": 44.316, | |
| "eval_steps_per_second": 11.079, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.1108179419525066, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.6156, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.1108179419525066, | |
| "eval_loss": 2.034921169281006, | |
| "eval_runtime": 4.3548, | |
| "eval_samples_per_second": 45.926, | |
| "eval_steps_per_second": 11.482, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.216358839050132, | |
| "grad_norm": 10.5, | |
| "learning_rate": 1e-05, | |
| "loss": 1.571, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.216358839050132, | |
| "eval_loss": 2.0253899097442627, | |
| "eval_runtime": 4.4896, | |
| "eval_samples_per_second": 44.547, | |
| "eval_steps_per_second": 11.137, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.321899736147757, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.5824, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.321899736147757, | |
| "eval_loss": 2.000455141067505, | |
| "eval_runtime": 4.3294, | |
| "eval_samples_per_second": 46.195, | |
| "eval_steps_per_second": 11.549, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.4274406332453826, | |
| "grad_norm": 11.25, | |
| "learning_rate": 1e-05, | |
| "loss": 1.532, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.4274406332453826, | |
| "eval_loss": 2.0012362003326416, | |
| "eval_runtime": 4.4248, | |
| "eval_samples_per_second": 45.2, | |
| "eval_steps_per_second": 11.3, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.532981530343008, | |
| "grad_norm": 10.875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.538, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.532981530343008, | |
| "eval_loss": 1.9685580730438232, | |
| "eval_runtime": 4.6986, | |
| "eval_samples_per_second": 42.566, | |
| "eval_steps_per_second": 10.642, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.638522427440633, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.5482, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.638522427440633, | |
| "eval_loss": 1.945511817932129, | |
| "eval_runtime": 4.3643, | |
| "eval_samples_per_second": 45.826, | |
| "eval_steps_per_second": 11.457, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.7440633245382586, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.5028, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.7440633245382586, | |
| "eval_loss": 1.9389147758483887, | |
| "eval_runtime": 4.5184, | |
| "eval_samples_per_second": 44.263, | |
| "eval_steps_per_second": 11.066, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.849604221635884, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.4947, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.849604221635884, | |
| "eval_loss": 1.9430372714996338, | |
| "eval_runtime": 4.4992, | |
| "eval_samples_per_second": 44.453, | |
| "eval_steps_per_second": 11.113, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.955145118733509, | |
| "grad_norm": 11.9375, | |
| "learning_rate": 1e-05, | |
| "loss": 1.5243, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.955145118733509, | |
| "eval_loss": 1.9145066738128662, | |
| "eval_runtime": 4.4679, | |
| "eval_samples_per_second": 44.764, | |
| "eval_steps_per_second": 11.191, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 3.0606860158311346, | |
| "grad_norm": 15.0, | |
| "learning_rate": 1e-05, | |
| "loss": 1.3297, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 3.0606860158311346, | |
| "eval_loss": 1.9249849319458008, | |
| "eval_runtime": 4.5014, | |
| "eval_samples_per_second": 44.43, | |
| "eval_steps_per_second": 11.108, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 3.16622691292876, | |
| "grad_norm": 13.25, | |
| "learning_rate": 1e-05, | |
| "loss": 1.21, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.16622691292876, | |
| "eval_loss": 1.9324084520339966, | |
| "eval_runtime": 4.7117, | |
| "eval_samples_per_second": 42.447, | |
| "eval_steps_per_second": 10.612, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.271767810026385, | |
| "grad_norm": 15.875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2001, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 3.271767810026385, | |
| "eval_loss": 1.9431959390640259, | |
| "eval_runtime": 4.4958, | |
| "eval_samples_per_second": 44.486, | |
| "eval_steps_per_second": 11.121, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 3.3773087071240107, | |
| "grad_norm": 15.125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1686, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 3.3773087071240107, | |
| "eval_loss": 1.9009323120117188, | |
| "eval_runtime": 4.3205, | |
| "eval_samples_per_second": 46.291, | |
| "eval_steps_per_second": 11.573, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 3.4828496042216357, | |
| "grad_norm": 16.5, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1798, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 3.4828496042216357, | |
| "eval_loss": 1.8920202255249023, | |
| "eval_runtime": 4.5772, | |
| "eval_samples_per_second": 43.695, | |
| "eval_steps_per_second": 10.924, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 3.588390501319261, | |
| "grad_norm": 14.1875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.197, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 3.588390501319261, | |
| "eval_loss": 1.8691601753234863, | |
| "eval_runtime": 4.5889, | |
| "eval_samples_per_second": 43.584, | |
| "eval_steps_per_second": 10.896, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 3.6939313984168867, | |
| "grad_norm": 15.25, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1745, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.6939313984168867, | |
| "eval_loss": 1.8563519716262817, | |
| "eval_runtime": 4.09, | |
| "eval_samples_per_second": 48.9, | |
| "eval_steps_per_second": 12.225, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.7994722955145117, | |
| "grad_norm": 16.125, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1083, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 3.7994722955145117, | |
| "eval_loss": 1.8388882875442505, | |
| "eval_runtime": 4.1971, | |
| "eval_samples_per_second": 47.653, | |
| "eval_steps_per_second": 11.913, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 3.905013192612137, | |
| "grad_norm": 16.25, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1325, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 3.905013192612137, | |
| "eval_loss": 1.8317779302597046, | |
| "eval_runtime": 4.353, | |
| "eval_samples_per_second": 45.945, | |
| "eval_steps_per_second": 11.486, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 4.010554089709762, | |
| "grad_norm": 16.25, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0731, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 4.010554089709762, | |
| "eval_loss": 1.8252906799316406, | |
| "eval_runtime": 4.5472, | |
| "eval_samples_per_second": 43.983, | |
| "eval_steps_per_second": 10.996, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 4.116094986807388, | |
| "grad_norm": 15.75, | |
| "learning_rate": 1e-05, | |
| "loss": 0.8763, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 4.116094986807388, | |
| "eval_loss": 1.8407686948776245, | |
| "eval_runtime": 4.341, | |
| "eval_samples_per_second": 46.073, | |
| "eval_steps_per_second": 11.518, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 4.221635883905013, | |
| "grad_norm": 19.25, | |
| "learning_rate": 1e-05, | |
| "loss": 0.8789, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.221635883905013, | |
| "eval_loss": 1.836584210395813, | |
| "eval_runtime": 4.5537, | |
| "eval_samples_per_second": 43.921, | |
| "eval_steps_per_second": 10.98, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.327176781002638, | |
| "grad_norm": 18.625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.8585, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 4.327176781002638, | |
| "eval_loss": 1.826670527458191, | |
| "eval_runtime": 4.5393, | |
| "eval_samples_per_second": 44.06, | |
| "eval_steps_per_second": 11.015, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 4.432717678100264, | |
| "grad_norm": 18.0, | |
| "learning_rate": 1e-05, | |
| "loss": 0.7994, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 4.432717678100264, | |
| "eval_loss": 1.823104977607727, | |
| "eval_runtime": 4.5835, | |
| "eval_samples_per_second": 43.635, | |
| "eval_steps_per_second": 10.909, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 4.538258575197889, | |
| "grad_norm": 17.125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.828, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 4.538258575197889, | |
| "eval_loss": 1.7835866212844849, | |
| "eval_runtime": 4.4222, | |
| "eval_samples_per_second": 45.227, | |
| "eval_steps_per_second": 11.307, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 4.643799472295514, | |
| "grad_norm": 15.8125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.8055, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 4.643799472295514, | |
| "eval_loss": 1.776289939880371, | |
| "eval_runtime": 4.451, | |
| "eval_samples_per_second": 44.934, | |
| "eval_steps_per_second": 11.234, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 4.74934036939314, | |
| "grad_norm": 16.125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.8072, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 4.74934036939314, | |
| "eval_loss": 1.7724196910858154, | |
| "eval_runtime": 4.6227, | |
| "eval_samples_per_second": 43.264, | |
| "eval_steps_per_second": 10.816, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 4.854881266490765, | |
| "grad_norm": 15.1875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.8029, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 4.854881266490765, | |
| "eval_loss": 1.7451767921447754, | |
| "eval_runtime": 4.5459, | |
| "eval_samples_per_second": 43.995, | |
| "eval_steps_per_second": 10.999, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 4.96042216358839, | |
| "grad_norm": 19.125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.7929, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 4.96042216358839, | |
| "eval_loss": 1.719967246055603, | |
| "eval_runtime": 4.7451, | |
| "eval_samples_per_second": 42.148, | |
| "eval_steps_per_second": 10.537, | |
| "step": 940 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 9450, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 20, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 5, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.968166912425984e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |