{ "best_metric": 0.9927281737327576, "best_model_checkpoint": "/scratch/kwamea/llama-output/checkpoint-290", "epoch": 38.666666666666664, "eval_steps": 5, "global_step": 290, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.67, "grad_norm": 0.243179589509964, "learning_rate": 9.857142857142858e-05, "loss": 1.9956, "step": 5 }, { "epoch": 0.67, "eval_loss": 1.9701930284500122, "eval_runtime": 17.115, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 5 }, { "epoch": 1.33, "grad_norm": 0.34590908885002136, "learning_rate": 9.714285714285715e-05, "loss": 1.9758, "step": 10 }, { "epoch": 1.33, "eval_loss": 1.8941271305084229, "eval_runtime": 17.0912, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 10 }, { "epoch": 2.0, "grad_norm": 0.31595832109451294, "learning_rate": 9.571428571428573e-05, "loss": 1.849, "step": 15 }, { "epoch": 2.0, "eval_loss": 1.8046789169311523, "eval_runtime": 17.098, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 15 }, { "epoch": 2.67, "grad_norm": 0.3428090512752533, "learning_rate": 9.428571428571429e-05, "loss": 1.789, "step": 20 }, { "epoch": 2.67, "eval_loss": 1.7658358812332153, "eval_runtime": 17.0734, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 20 }, { "epoch": 3.33, "grad_norm": 0.3102028965950012, "learning_rate": 9.285714285714286e-05, "loss": 1.7789, "step": 25 }, { "epoch": 3.33, "eval_loss": 1.7225048542022705, "eval_runtime": 17.0972, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 25 }, { "epoch": 4.0, "grad_norm": 0.38602885603904724, "learning_rate": 9.142857142857143e-05, "loss": 1.7003, "step": 30 }, { "epoch": 4.0, "eval_loss": 1.6749440431594849, "eval_runtime": 17.1034, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 30 }, { "epoch": 4.67, "grad_norm": 0.37120407819747925, "learning_rate": 9e-05, "loss": 1.6424, "step": 35 }, { "epoch": 4.67, "eval_loss": 1.6231099367141724, "eval_runtime": 17.1067, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 35 }, { "epoch": 5.33, "grad_norm": 0.4633428454399109, "learning_rate": 8.857142857142857e-05, "loss": 1.6023, "step": 40 }, { "epoch": 5.33, "eval_loss": 1.5727053880691528, "eval_runtime": 17.1002, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 40 }, { "epoch": 6.0, "grad_norm": 0.5034663081169128, "learning_rate": 8.714285714285715e-05, "loss": 1.5322, "step": 45 }, { "epoch": 6.0, "eval_loss": 1.5312587022781372, "eval_runtime": 17.1159, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 45 }, { "epoch": 6.67, "grad_norm": 0.5549929141998291, "learning_rate": 8.571428571428571e-05, "loss": 1.4788, "step": 50 }, { "epoch": 6.67, "eval_loss": 1.492464303970337, "eval_runtime": 17.0823, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 50 }, { "epoch": 7.33, "grad_norm": 0.49194690585136414, "learning_rate": 8.428571428571429e-05, "loss": 1.4632, "step": 55 }, { "epoch": 7.33, "eval_loss": 1.4622489213943481, "eval_runtime": 17.1022, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 55 }, { "epoch": 8.0, "grad_norm": 0.5866131782531738, "learning_rate": 8.285714285714287e-05, "loss": 1.3951, "step": 60 }, { "epoch": 8.0, "eval_loss": 1.435951828956604, "eval_runtime": 17.1087, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 60 }, { "epoch": 8.67, "grad_norm": 0.6252542734146118, "learning_rate": 8.142857142857143e-05, "loss": 1.3796, "step": 65 }, { "epoch": 8.67, "eval_loss": 1.413227915763855, "eval_runtime": 17.0914, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 65 }, { "epoch": 9.33, "grad_norm": 0.6751863360404968, "learning_rate": 8e-05, "loss": 1.3257, "step": 70 }, { "epoch": 9.33, "eval_loss": 1.395649790763855, "eval_runtime": 17.0885, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 70 }, { "epoch": 10.0, "grad_norm": 0.8878222703933716, "learning_rate": 7.857142857142858e-05, "loss": 1.2795, "step": 75 }, { "epoch": 10.0, "eval_loss": 1.3699487447738647, "eval_runtime": 17.1031, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 75 }, { "epoch": 10.67, "grad_norm": 0.8470121026039124, "learning_rate": 7.714285714285715e-05, "loss": 1.2449, "step": 80 }, { "epoch": 10.67, "eval_loss": 1.347831130027771, "eval_runtime": 17.0985, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 80 }, { "epoch": 11.33, "grad_norm": 1.0655425786972046, "learning_rate": 7.571428571428571e-05, "loss": 1.1983, "step": 85 }, { "epoch": 11.33, "eval_loss": 1.3311971426010132, "eval_runtime": 17.0784, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 85 }, { "epoch": 12.0, "grad_norm": 1.2651888132095337, "learning_rate": 7.428571428571429e-05, "loss": 1.1467, "step": 90 }, { "epoch": 12.0, "eval_loss": 1.3095277547836304, "eval_runtime": 17.0903, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 90 }, { "epoch": 12.67, "grad_norm": 1.248926043510437, "learning_rate": 7.285714285714286e-05, "loss": 1.0922, "step": 95 }, { "epoch": 12.67, "eval_loss": 1.2942878007888794, "eval_runtime": 17.0947, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 95 }, { "epoch": 13.33, "grad_norm": 1.896952509880066, "learning_rate": 7.142857142857143e-05, "loss": 1.0403, "step": 100 }, { "epoch": 13.33, "eval_loss": 1.2803159952163696, "eval_runtime": 17.0819, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 100 }, { "epoch": 14.0, "grad_norm": 1.862244725227356, "learning_rate": 7e-05, "loss": 1.0049, "step": 105 }, { "epoch": 14.0, "eval_loss": 1.2643567323684692, "eval_runtime": 17.0849, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 105 }, { "epoch": 14.67, "grad_norm": 1.7487821578979492, "learning_rate": 6.857142857142858e-05, "loss": 0.9262, "step": 110 }, { "epoch": 14.67, "eval_loss": 1.2471646070480347, "eval_runtime": 17.1278, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 110 }, { "epoch": 15.33, "grad_norm": 1.838605284690857, "learning_rate": 6.714285714285714e-05, "loss": 0.8965, "step": 115 }, { "epoch": 15.33, "eval_loss": 1.2377034425735474, "eval_runtime": 17.0731, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 115 }, { "epoch": 16.0, "grad_norm": 3.117398977279663, "learning_rate": 6.571428571428571e-05, "loss": 0.8581, "step": 120 }, { "epoch": 16.0, "eval_loss": 1.2083133459091187, "eval_runtime": 17.1304, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 120 }, { "epoch": 16.67, "grad_norm": 2.5655250549316406, "learning_rate": 6.428571428571429e-05, "loss": 0.7929, "step": 125 }, { "epoch": 16.67, "eval_loss": 1.1945828199386597, "eval_runtime": 17.104, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 125 }, { "epoch": 17.33, "grad_norm": 2.168546199798584, "learning_rate": 6.285714285714286e-05, "loss": 0.7543, "step": 130 }, { "epoch": 17.33, "eval_loss": 1.1876276731491089, "eval_runtime": 17.1046, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.058, "step": 130 }, { "epoch": 18.0, "grad_norm": 2.5984208583831787, "learning_rate": 6.142857142857143e-05, "loss": 0.716, "step": 135 }, { "epoch": 18.0, "eval_loss": 1.1714750528335571, "eval_runtime": 17.0807, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 135 }, { "epoch": 18.67, "grad_norm": 3.479024887084961, "learning_rate": 6e-05, "loss": 0.6681, "step": 140 }, { "epoch": 18.67, "eval_loss": 1.169895052909851, "eval_runtime": 17.0681, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 140 }, { "epoch": 19.33, "grad_norm": 2.563386917114258, "learning_rate": 5.8571428571428575e-05, "loss": 0.6306, "step": 145 }, { "epoch": 19.33, "eval_loss": 1.1741083860397339, "eval_runtime": 17.0568, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 145 }, { "epoch": 20.0, "grad_norm": 2.96592116355896, "learning_rate": 5.714285714285714e-05, "loss": 0.6183, "step": 150 }, { "epoch": 20.0, "eval_loss": 1.1455965042114258, "eval_runtime": 17.073, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 150 }, { "epoch": 20.67, "grad_norm": 2.6751275062561035, "learning_rate": 5.571428571428572e-05, "loss": 0.5464, "step": 155 }, { "epoch": 20.67, "eval_loss": 1.131102204322815, "eval_runtime": 17.0578, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 155 }, { "epoch": 21.33, "grad_norm": 2.3700051307678223, "learning_rate": 5.428571428571428e-05, "loss": 0.551, "step": 160 }, { "epoch": 21.33, "eval_loss": 1.127384066581726, "eval_runtime": 17.0546, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 160 }, { "epoch": 22.0, "grad_norm": 3.3827567100524902, "learning_rate": 5.285714285714286e-05, "loss": 0.5179, "step": 165 }, { "epoch": 22.0, "eval_loss": 1.111584186553955, "eval_runtime": 17.0812, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 165 }, { "epoch": 22.67, "grad_norm": 3.55790114402771, "learning_rate": 5.142857142857143e-05, "loss": 0.4831, "step": 170 }, { "epoch": 22.67, "eval_loss": 1.0948525667190552, "eval_runtime": 17.0547, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 170 }, { "epoch": 23.33, "grad_norm": 3.0782699584960938, "learning_rate": 5e-05, "loss": 0.4587, "step": 175 }, { "epoch": 23.33, "eval_loss": 1.0906586647033691, "eval_runtime": 17.0666, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 175 }, { "epoch": 24.0, "grad_norm": 3.3993167877197266, "learning_rate": 4.8571428571428576e-05, "loss": 0.4203, "step": 180 }, { "epoch": 24.0, "eval_loss": 1.0688152313232422, "eval_runtime": 17.0721, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 180 }, { "epoch": 24.67, "grad_norm": 3.319303035736084, "learning_rate": 4.714285714285714e-05, "loss": 0.3975, "step": 185 }, { "epoch": 24.67, "eval_loss": 1.0746583938598633, "eval_runtime": 17.0709, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 185 }, { "epoch": 25.33, "grad_norm": 2.4532127380371094, "learning_rate": 4.5714285714285716e-05, "loss": 0.3832, "step": 190 }, { "epoch": 25.33, "eval_loss": 1.0772522687911987, "eval_runtime": 17.0619, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 190 }, { "epoch": 26.0, "grad_norm": 3.956822156906128, "learning_rate": 4.428571428571428e-05, "loss": 0.3725, "step": 195 }, { "epoch": 26.0, "eval_loss": 1.0638784170150757, "eval_runtime": 17.0807, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 195 }, { "epoch": 26.67, "grad_norm": 2.76033353805542, "learning_rate": 4.2857142857142856e-05, "loss": 0.3473, "step": 200 }, { "epoch": 26.67, "eval_loss": 1.04669988155365, "eval_runtime": 17.0774, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 200 }, { "epoch": 27.33, "grad_norm": 3.8683507442474365, "learning_rate": 4.1428571428571437e-05, "loss": 0.3243, "step": 205 }, { "epoch": 27.33, "eval_loss": 1.0470303297042847, "eval_runtime": 17.0718, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 205 }, { "epoch": 28.0, "grad_norm": 4.535538196563721, "learning_rate": 4e-05, "loss": 0.3202, "step": 210 }, { "epoch": 28.0, "eval_loss": 1.025539517402649, "eval_runtime": 17.0604, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 210 }, { "epoch": 28.67, "grad_norm": 2.6224355697631836, "learning_rate": 3.857142857142858e-05, "loss": 0.2958, "step": 215 }, { "epoch": 28.67, "eval_loss": 1.0192126035690308, "eval_runtime": 17.0657, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 215 }, { "epoch": 29.33, "grad_norm": 2.5870041847229004, "learning_rate": 3.7142857142857143e-05, "loss": 0.2783, "step": 220 }, { "epoch": 29.33, "eval_loss": 1.0211580991744995, "eval_runtime": 17.0857, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 220 }, { "epoch": 30.0, "grad_norm": 3.4565751552581787, "learning_rate": 3.571428571428572e-05, "loss": 0.2773, "step": 225 }, { "epoch": 30.0, "eval_loss": 1.006419062614441, "eval_runtime": 17.0807, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 225 }, { "epoch": 30.67, "grad_norm": 2.4756500720977783, "learning_rate": 3.428571428571429e-05, "loss": 0.2482, "step": 230 }, { "epoch": 30.67, "eval_loss": 1.0081219673156738, "eval_runtime": 17.0576, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 230 }, { "epoch": 31.33, "grad_norm": 2.38002610206604, "learning_rate": 3.285714285714286e-05, "loss": 0.2464, "step": 235 }, { "epoch": 31.33, "eval_loss": 1.0151804685592651, "eval_runtime": 17.0587, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 235 }, { "epoch": 32.0, "grad_norm": 3.7081105709075928, "learning_rate": 3.142857142857143e-05, "loss": 0.2442, "step": 240 }, { "epoch": 32.0, "eval_loss": 1.0032445192337036, "eval_runtime": 17.1613, "eval_samples_per_second": 0.408, "eval_steps_per_second": 0.058, "step": 240 }, { "epoch": 32.67, "grad_norm": 2.55924391746521, "learning_rate": 3e-05, "loss": 0.2193, "step": 245 }, { "epoch": 32.67, "eval_loss": 0.9989615082740784, "eval_runtime": 17.0447, "eval_samples_per_second": 0.411, "eval_steps_per_second": 0.059, "step": 245 }, { "epoch": 33.33, "grad_norm": 1.9451407194137573, "learning_rate": 2.857142857142857e-05, "loss": 0.2101, "step": 250 }, { "epoch": 33.33, "eval_loss": 1.0029457807540894, "eval_runtime": 17.0816, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 250 }, { "epoch": 34.0, "grad_norm": 2.713731527328491, "learning_rate": 2.714285714285714e-05, "loss": 0.2194, "step": 255 }, { "epoch": 34.0, "eval_loss": 0.9959421753883362, "eval_runtime": 17.0747, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 255 }, { "epoch": 34.67, "grad_norm": 2.1633846759796143, "learning_rate": 2.5714285714285714e-05, "loss": 0.1958, "step": 260 }, { "epoch": 34.67, "eval_loss": 0.9989770650863647, "eval_runtime": 17.0821, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 260 }, { "epoch": 35.33, "grad_norm": 3.9233529567718506, "learning_rate": 2.4285714285714288e-05, "loss": 0.1831, "step": 265 }, { "epoch": 35.33, "eval_loss": 1.0072578191757202, "eval_runtime": 17.0564, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 265 }, { "epoch": 36.0, "grad_norm": 2.4143056869506836, "learning_rate": 2.2857142857142858e-05, "loss": 0.1753, "step": 270 }, { "epoch": 36.0, "eval_loss": 0.9938892722129822, "eval_runtime": 17.0668, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 270 }, { "epoch": 36.67, "grad_norm": 2.706679582595825, "learning_rate": 2.1428571428571428e-05, "loss": 0.1698, "step": 275 }, { "epoch": 36.67, "eval_loss": 0.9969200491905212, "eval_runtime": 17.0643, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 275 }, { "epoch": 37.33, "grad_norm": 1.872753620147705, "learning_rate": 2e-05, "loss": 0.16, "step": 280 }, { "epoch": 37.33, "eval_loss": 0.9940390586853027, "eval_runtime": 17.0728, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 280 }, { "epoch": 38.0, "grad_norm": 2.7510581016540527, "learning_rate": 1.8571428571428572e-05, "loss": 0.1614, "step": 285 }, { "epoch": 38.0, "eval_loss": 1.0066231489181519, "eval_runtime": 17.072, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.059, "step": 285 }, { "epoch": 38.67, "grad_norm": 1.8461092710494995, "learning_rate": 1.7142857142857145e-05, "loss": 0.1506, "step": 290 }, { "epoch": 38.67, "eval_loss": 0.9927281737327576, "eval_runtime": 17.0481, "eval_samples_per_second": 0.411, "eval_steps_per_second": 0.059, "step": 290 } ], "logging_steps": 5, "max_steps": 350, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 10, "total_flos": 1.9272976816637215e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }