{
  "best_metric": 0.9927281737327576,
  "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-290",
  "epoch": 42.666666666666664,
  "eval_steps": 5,
  "global_step": 320,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.67,
      "grad_norm": 0.243179589509964,
      "learning_rate": 9.857142857142858e-05,
      "loss": 1.9956,
      "step": 5
    },
    {
      "epoch": 0.67,
      "eval_loss": 1.9701930284500122,
      "eval_runtime": 17.115,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 5
    },
    {
      "epoch": 1.33,
      "grad_norm": 0.34590908885002136,
      "learning_rate": 9.714285714285715e-05,
      "loss": 1.9758,
      "step": 10
    },
    {
      "epoch": 1.33,
      "eval_loss": 1.8941271305084229,
      "eval_runtime": 17.0912,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 10
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.31595832109451294,
      "learning_rate": 9.571428571428573e-05,
      "loss": 1.849,
      "step": 15
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.8046789169311523,
      "eval_runtime": 17.098,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 15
    },
    {
      "epoch": 2.67,
      "grad_norm": 0.3428090512752533,
      "learning_rate": 9.428571428571429e-05,
      "loss": 1.789,
      "step": 20
    },
    {
      "epoch": 2.67,
      "eval_loss": 1.7658358812332153,
      "eval_runtime": 17.0734,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 20
    },
    {
      "epoch": 3.33,
      "grad_norm": 0.3102028965950012,
      "learning_rate": 9.285714285714286e-05,
      "loss": 1.7789,
      "step": 25
    },
    {
      "epoch": 3.33,
      "eval_loss": 1.7225048542022705,
      "eval_runtime": 17.0972,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 25
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.38602885603904724,
      "learning_rate": 9.142857142857143e-05,
      "loss": 1.7003,
      "step": 30
    },
    {
      "epoch": 4.0,
      "eval_loss": 1.6749440431594849,
      "eval_runtime": 17.1034,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 30
    },
    {
      "epoch": 4.67,
      "grad_norm": 0.37120407819747925,
      "learning_rate": 9e-05,
      "loss": 1.6424,
      "step": 35
    },
    {
      "epoch": 4.67,
      "eval_loss": 1.6231099367141724,
      "eval_runtime": 17.1067,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 35
    },
    {
      "epoch": 5.33,
      "grad_norm": 0.4633428454399109,
      "learning_rate": 8.857142857142857e-05,
      "loss": 1.6023,
      "step": 40
    },
    {
      "epoch": 5.33,
      "eval_loss": 1.5727053880691528,
      "eval_runtime": 17.1002,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 40
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.5034663081169128,
      "learning_rate": 8.714285714285715e-05,
      "loss": 1.5322,
      "step": 45
    },
    {
      "epoch": 6.0,
      "eval_loss": 1.5312587022781372,
      "eval_runtime": 17.1159,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 45
    },
    {
      "epoch": 6.67,
      "grad_norm": 0.5549929141998291,
      "learning_rate": 8.571428571428571e-05,
      "loss": 1.4788,
      "step": 50
    },
    {
      "epoch": 6.67,
      "eval_loss": 1.492464303970337,
      "eval_runtime": 17.0823,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 50
    },
    {
      "epoch": 7.33,
      "grad_norm": 0.49194690585136414,
      "learning_rate": 8.428571428571429e-05,
      "loss": 1.4632,
      "step": 55
    },
    {
      "epoch": 7.33,
      "eval_loss": 1.4622489213943481,
      "eval_runtime": 17.1022,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 55
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.5866131782531738,
      "learning_rate": 8.285714285714287e-05,
      "loss": 1.3951,
      "step": 60
    },
    {
      "epoch": 8.0,
      "eval_loss": 1.435951828956604,
      "eval_runtime": 17.1087,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 60
    },
    {
      "epoch": 8.67,
      "grad_norm": 0.6252542734146118,
      "learning_rate": 8.142857142857143e-05,
      "loss": 1.3796,
      "step": 65
    },
    {
      "epoch": 8.67,
      "eval_loss": 1.413227915763855,
      "eval_runtime": 17.0914,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 65
    },
    {
      "epoch": 9.33,
      "grad_norm": 0.6751863360404968,
      "learning_rate": 8e-05,
      "loss": 1.3257,
      "step": 70
    },
    {
      "epoch": 9.33,
      "eval_loss": 1.395649790763855,
      "eval_runtime": 17.0885,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 70
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.8878222703933716,
      "learning_rate": 7.857142857142858e-05,
      "loss": 1.2795,
      "step": 75
    },
    {
      "epoch": 10.0,
      "eval_loss": 1.3699487447738647,
      "eval_runtime": 17.1031,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 75
    },
    {
      "epoch": 10.67,
      "grad_norm": 0.8470121026039124,
      "learning_rate": 7.714285714285715e-05,
      "loss": 1.2449,
      "step": 80
    },
    {
      "epoch": 10.67,
      "eval_loss": 1.347831130027771,
      "eval_runtime": 17.0985,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 80
    },
    {
      "epoch": 11.33,
      "grad_norm": 1.0655425786972046,
      "learning_rate": 7.571428571428571e-05,
      "loss": 1.1983,
      "step": 85
    },
    {
      "epoch": 11.33,
      "eval_loss": 1.3311971426010132,
      "eval_runtime": 17.0784,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 85
    },
    {
      "epoch": 12.0,
      "grad_norm": 1.2651888132095337,
      "learning_rate": 7.428571428571429e-05,
      "loss": 1.1467,
      "step": 90
    },
    {
      "epoch": 12.0,
      "eval_loss": 1.3095277547836304,
      "eval_runtime": 17.0903,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 90
    },
    {
      "epoch": 12.67,
      "grad_norm": 1.248926043510437,
      "learning_rate": 7.285714285714286e-05,
      "loss": 1.0922,
      "step": 95
    },
    {
      "epoch": 12.67,
      "eval_loss": 1.2942878007888794,
      "eval_runtime": 17.0947,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 95
    },
    {
      "epoch": 13.33,
      "grad_norm": 1.896952509880066,
      "learning_rate": 7.142857142857143e-05,
      "loss": 1.0403,
      "step": 100
    },
    {
      "epoch": 13.33,
      "eval_loss": 1.2803159952163696,
      "eval_runtime": 17.0819,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 100
    },
    {
      "epoch": 14.0,
      "grad_norm": 1.862244725227356,
      "learning_rate": 7e-05,
      "loss": 1.0049,
      "step": 105
    },
    {
      "epoch": 14.0,
      "eval_loss": 1.2643567323684692,
      "eval_runtime": 17.0849,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 105
    },
    {
      "epoch": 14.67,
      "grad_norm": 1.7487821578979492,
      "learning_rate": 6.857142857142858e-05,
      "loss": 0.9262,
      "step": 110
    },
    {
      "epoch": 14.67,
      "eval_loss": 1.2471646070480347,
      "eval_runtime": 17.1278,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 110
    },
    {
      "epoch": 15.33,
      "grad_norm": 1.838605284690857,
      "learning_rate": 6.714285714285714e-05,
      "loss": 0.8965,
      "step": 115
    },
    {
      "epoch": 15.33,
      "eval_loss": 1.2377034425735474,
      "eval_runtime": 17.0731,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 115
    },
    {
      "epoch": 16.0,
      "grad_norm": 3.117398977279663,
      "learning_rate": 6.571428571428571e-05,
      "loss": 0.8581,
      "step": 120
    },
    {
      "epoch": 16.0,
      "eval_loss": 1.2083133459091187,
      "eval_runtime": 17.1304,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 120
    },
    {
      "epoch": 16.67,
      "grad_norm": 2.5655250549316406,
      "learning_rate": 6.428571428571429e-05,
      "loss": 0.7929,
      "step": 125
    },
    {
      "epoch": 16.67,
      "eval_loss": 1.1945828199386597,
      "eval_runtime": 17.104,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 125
    },
    {
      "epoch": 17.33,
      "grad_norm": 2.168546199798584,
      "learning_rate": 6.285714285714286e-05,
      "loss": 0.7543,
      "step": 130
    },
    {
      "epoch": 17.33,
      "eval_loss": 1.1876276731491089,
      "eval_runtime": 17.1046,
      "eval_samples_per_second": 0.409,
      "eval_steps_per_second": 0.058,
      "step": 130
    },
    {
      "epoch": 18.0,
      "grad_norm": 2.5984208583831787,
      "learning_rate": 6.142857142857143e-05,
      "loss": 0.716,
      "step": 135
    },
    {
      "epoch": 18.0,
      "eval_loss": 1.1714750528335571,
      "eval_runtime": 17.0807,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 135
    },
    {
      "epoch": 18.67,
      "grad_norm": 3.479024887084961,
      "learning_rate": 6e-05,
      "loss": 0.6681,
      "step": 140
    },
    {
      "epoch": 18.67,
      "eval_loss": 1.169895052909851,
      "eval_runtime": 17.0681,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 140
    },
    {
      "epoch": 19.33,
      "grad_norm": 2.563386917114258,
      "learning_rate": 5.8571428571428575e-05,
      "loss": 0.6306,
      "step": 145
    },
    {
      "epoch": 19.33,
      "eval_loss": 1.1741083860397339,
      "eval_runtime": 17.0568,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 145
    },
    {
      "epoch": 20.0,
      "grad_norm": 2.96592116355896,
      "learning_rate": 5.714285714285714e-05,
      "loss": 0.6183,
      "step": 150
    },
    {
      "epoch": 20.0,
      "eval_loss": 1.1455965042114258,
      "eval_runtime": 17.073,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 150
    },
    {
      "epoch": 20.67,
      "grad_norm": 2.6751275062561035,
      "learning_rate": 5.571428571428572e-05,
      "loss": 0.5464,
      "step": 155
    },
    {
      "epoch": 20.67,
      "eval_loss": 1.131102204322815,
      "eval_runtime": 17.0578,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 155
    },
    {
      "epoch": 21.33,
      "grad_norm": 2.3700051307678223,
      "learning_rate": 5.428571428571428e-05,
      "loss": 0.551,
      "step": 160
    },
    {
      "epoch": 21.33,
      "eval_loss": 1.127384066581726,
      "eval_runtime": 17.0546,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 160
    },
    {
      "epoch": 22.0,
      "grad_norm": 3.3827567100524902,
      "learning_rate": 5.285714285714286e-05,
      "loss": 0.5179,
      "step": 165
    },
    {
      "epoch": 22.0,
      "eval_loss": 1.111584186553955,
      "eval_runtime": 17.0812,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 165
    },
    {
      "epoch": 22.67,
      "grad_norm": 3.55790114402771,
      "learning_rate": 5.142857142857143e-05,
      "loss": 0.4831,
      "step": 170
    },
    {
      "epoch": 22.67,
      "eval_loss": 1.0948525667190552,
      "eval_runtime": 17.0547,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 170
    },
    {
      "epoch": 23.33,
      "grad_norm": 3.0782699584960938,
      "learning_rate": 5e-05,
      "loss": 0.4587,
      "step": 175
    },
    {
      "epoch": 23.33,
      "eval_loss": 1.0906586647033691,
      "eval_runtime": 17.0666,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 175
    },
    {
      "epoch": 24.0,
      "grad_norm": 3.3993167877197266,
      "learning_rate": 4.8571428571428576e-05,
      "loss": 0.4203,
      "step": 180
    },
    {
      "epoch": 24.0,
      "eval_loss": 1.0688152313232422,
      "eval_runtime": 17.0721,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 180
    },
    {
      "epoch": 24.67,
      "grad_norm": 3.319303035736084,
      "learning_rate": 4.714285714285714e-05,
      "loss": 0.3975,
      "step": 185
    },
    {
      "epoch": 24.67,
      "eval_loss": 1.0746583938598633,
      "eval_runtime": 17.0709,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 185
    },
    {
      "epoch": 25.33,
      "grad_norm": 2.4532127380371094,
      "learning_rate": 4.5714285714285716e-05,
      "loss": 0.3832,
      "step": 190
    },
    {
      "epoch": 25.33,
      "eval_loss": 1.0772522687911987,
      "eval_runtime": 17.0619,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 190
    },
    {
      "epoch": 26.0,
      "grad_norm": 3.956822156906128,
      "learning_rate": 4.428571428571428e-05,
      "loss": 0.3725,
      "step": 195
    },
    {
      "epoch": 26.0,
      "eval_loss": 1.0638784170150757,
      "eval_runtime": 17.0807,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 195
    },
    {
      "epoch": 26.67,
      "grad_norm": 2.76033353805542,
      "learning_rate": 4.2857142857142856e-05,
      "loss": 0.3473,
      "step": 200
    },
    {
      "epoch": 26.67,
      "eval_loss": 1.04669988155365,
      "eval_runtime": 17.0774,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 200
    },
    {
      "epoch": 27.33,
      "grad_norm": 3.8683507442474365,
      "learning_rate": 4.1428571428571437e-05,
      "loss": 0.3243,
      "step": 205
    },
    {
      "epoch": 27.33,
      "eval_loss": 1.0470303297042847,
      "eval_runtime": 17.0718,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 205
    },
    {
      "epoch": 28.0,
      "grad_norm": 4.535538196563721,
      "learning_rate": 4e-05,
      "loss": 0.3202,
      "step": 210
    },
    {
      "epoch": 28.0,
      "eval_loss": 1.025539517402649,
      "eval_runtime": 17.0604,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 210
    },
    {
      "epoch": 28.67,
      "grad_norm": 2.6224355697631836,
      "learning_rate": 3.857142857142858e-05,
      "loss": 0.2958,
      "step": 215
    },
    {
      "epoch": 28.67,
      "eval_loss": 1.0192126035690308,
      "eval_runtime": 17.0657,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 215
    },
    {
      "epoch": 29.33,
      "grad_norm": 2.5870041847229004,
      "learning_rate": 3.7142857142857143e-05,
      "loss": 0.2783,
      "step": 220
    },
    {
      "epoch": 29.33,
      "eval_loss": 1.0211580991744995,
      "eval_runtime": 17.0857,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 220
    },
    {
      "epoch": 30.0,
      "grad_norm": 3.4565751552581787,
      "learning_rate": 3.571428571428572e-05,
      "loss": 0.2773,
      "step": 225
    },
    {
      "epoch": 30.0,
      "eval_loss": 1.006419062614441,
      "eval_runtime": 17.0807,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 225
    },
    {
      "epoch": 30.67,
      "grad_norm": 2.4756500720977783,
      "learning_rate": 3.428571428571429e-05,
      "loss": 0.2482,
      "step": 230
    },
    {
      "epoch": 30.67,
      "eval_loss": 1.0081219673156738,
      "eval_runtime": 17.0576,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 230
    },
    {
      "epoch": 31.33,
      "grad_norm": 2.38002610206604,
      "learning_rate": 3.285714285714286e-05,
      "loss": 0.2464,
      "step": 235
    },
    {
      "epoch": 31.33,
      "eval_loss": 1.0151804685592651,
      "eval_runtime": 17.0587,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 235
    },
    {
      "epoch": 32.0,
      "grad_norm": 3.7081105709075928,
      "learning_rate": 3.142857142857143e-05,
      "loss": 0.2442,
      "step": 240
    },
    {
      "epoch": 32.0,
      "eval_loss": 1.0032445192337036,
      "eval_runtime": 17.1613,
      "eval_samples_per_second": 0.408,
      "eval_steps_per_second": 0.058,
      "step": 240
    },
    {
      "epoch": 32.67,
      "grad_norm": 2.55924391746521,
      "learning_rate": 3e-05,
      "loss": 0.2193,
      "step": 245
    },
    {
      "epoch": 32.67,
      "eval_loss": 0.9989615082740784,
      "eval_runtime": 17.0447,
      "eval_samples_per_second": 0.411,
      "eval_steps_per_second": 0.059,
      "step": 245
    },
    {
      "epoch": 33.33,
      "grad_norm": 1.9451407194137573,
      "learning_rate": 2.857142857142857e-05,
      "loss": 0.2101,
      "step": 250
    },
    {
      "epoch": 33.33,
      "eval_loss": 1.0029457807540894,
      "eval_runtime": 17.0816,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 250
    },
    {
      "epoch": 34.0,
      "grad_norm": 2.713731527328491,
      "learning_rate": 2.714285714285714e-05,
      "loss": 0.2194,
      "step": 255
    },
    {
      "epoch": 34.0,
      "eval_loss": 0.9959421753883362,
      "eval_runtime": 17.0747,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 255
    },
    {
      "epoch": 34.67,
      "grad_norm": 2.1633846759796143,
      "learning_rate": 2.5714285714285714e-05,
      "loss": 0.1958,
      "step": 260
    },
    {
      "epoch": 34.67,
      "eval_loss": 0.9989770650863647,
      "eval_runtime": 17.0821,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 260
    },
    {
      "epoch": 35.33,
      "grad_norm": 3.9233529567718506,
      "learning_rate": 2.4285714285714288e-05,
      "loss": 0.1831,
      "step": 265
    },
    {
      "epoch": 35.33,
      "eval_loss": 1.0072578191757202,
      "eval_runtime": 17.0564,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 265
    },
    {
      "epoch": 36.0,
      "grad_norm": 2.4143056869506836,
      "learning_rate": 2.2857142857142858e-05,
      "loss": 0.1753,
      "step": 270
    },
    {
      "epoch": 36.0,
      "eval_loss": 0.9938892722129822,
      "eval_runtime": 17.0668,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 270
    },
    {
      "epoch": 36.67,
      "grad_norm": 2.706679582595825,
      "learning_rate": 2.1428571428571428e-05,
      "loss": 0.1698,
      "step": 275
    },
    {
      "epoch": 36.67,
      "eval_loss": 0.9969200491905212,
      "eval_runtime": 17.0643,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 275
    },
    {
      "epoch": 37.33,
      "grad_norm": 1.872753620147705,
      "learning_rate": 2e-05,
      "loss": 0.16,
      "step": 280
    },
    {
      "epoch": 37.33,
      "eval_loss": 0.9940390586853027,
      "eval_runtime": 17.0728,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 280
    },
    {
      "epoch": 38.0,
      "grad_norm": 2.7510581016540527,
      "learning_rate": 1.8571428571428572e-05,
      "loss": 0.1614,
      "step": 285
    },
    {
      "epoch": 38.0,
      "eval_loss": 1.0066231489181519,
      "eval_runtime": 17.072,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 285
    },
    {
      "epoch": 38.67,
      "grad_norm": 1.8461092710494995,
      "learning_rate": 1.7142857142857145e-05,
      "loss": 0.1506,
      "step": 290
    },
    {
      "epoch": 38.67,
      "eval_loss": 0.9927281737327576,
      "eval_runtime": 17.0481,
      "eval_samples_per_second": 0.411,
      "eval_steps_per_second": 0.059,
      "step": 290
    },
    {
      "epoch": 39.33,
      "grad_norm": 1.8425017595291138,
      "learning_rate": 1.5714285714285715e-05,
      "loss": 0.1419,
      "step": 295
    },
    {
      "epoch": 39.33,
      "eval_loss": 1.0133570432662964,
      "eval_runtime": 17.0642,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 295
    },
    {
      "epoch": 40.0,
      "grad_norm": 2.0457987785339355,
      "learning_rate": 1.4285714285714285e-05,
      "loss": 0.1459,
      "step": 300
    },
    {
      "epoch": 40.0,
      "eval_loss": 1.0127934217453003,
      "eval_runtime": 17.0581,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 300
    },
    {
      "epoch": 40.67,
      "grad_norm": 1.5630775690078735,
      "learning_rate": 1.2857142857142857e-05,
      "loss": 0.1225,
      "step": 305
    },
    {
      "epoch": 40.67,
      "eval_loss": 1.0092624425888062,
      "eval_runtime": 17.0483,
      "eval_samples_per_second": 0.411,
      "eval_steps_per_second": 0.059,
      "step": 305
    },
    {
      "epoch": 41.33,
      "grad_norm": 1.37598717212677,
      "learning_rate": 1.1428571428571429e-05,
      "loss": 0.146,
      "step": 310
    },
    {
      "epoch": 41.33,
      "eval_loss": 1.0083317756652832,
      "eval_runtime": 17.0804,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 310
    },
    {
      "epoch": 42.0,
      "grad_norm": 1.8867217302322388,
      "learning_rate": 1e-05,
      "loss": 0.13,
      "step": 315
    },
    {
      "epoch": 42.0,
      "eval_loss": 1.0165104866027832,
      "eval_runtime": 17.0621,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 315
    },
    {
      "epoch": 42.67,
      "grad_norm": 1.4643555879592896,
      "learning_rate": 8.571428571428573e-06,
      "loss": 0.131,
      "step": 320
    },
    {
      "epoch": 42.67,
      "eval_loss": 1.0264887809753418,
      "eval_runtime": 17.0554,
      "eval_samples_per_second": 0.41,
      "eval_steps_per_second": 0.059,
      "step": 320
    }
  ],
  "logging_steps": 5,
  "max_steps": 350,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 10,
  "total_flos": 2.1266150580320993e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}