| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.4893465185034152, |
| "global_step": 1200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0, |
| "learning_rate": 0.0001, |
| "loss": 0.335, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.01, |
| "learning_rate": 0.0001, |
| "loss": 0.1841, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.01, |
| "learning_rate": 0.0001, |
| "loss": 0.1169, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.02, |
| "learning_rate": 0.0001, |
| "loss": 0.0871, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.02, |
| "learning_rate": 0.0001, |
| "loss": 0.0975, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.02, |
| "eval_loss": 0.1183294802904129, |
| "eval_runtime": 568.4607, |
| "eval_samples_per_second": 0.88, |
| "eval_steps_per_second": 0.88, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.02, |
| "learning_rate": 0.0001, |
| "loss": 0.1664, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.03, |
| "learning_rate": 0.0001, |
| "loss": 0.1231, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.03, |
| "learning_rate": 0.0001, |
| "loss": 0.0884, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.04, |
| "learning_rate": 0.0001, |
| "loss": 0.0694, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.04, |
| "learning_rate": 0.0001, |
| "loss": 0.0856, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04, |
| "eval_loss": 0.09814266860485077, |
| "eval_runtime": 568.4487, |
| "eval_samples_per_second": 0.88, |
| "eval_steps_per_second": 0.88, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04, |
| "learning_rate": 0.0001, |
| "loss": 0.1289, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.05, |
| "learning_rate": 0.0001, |
| "loss": 0.1118, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.05, |
| "learning_rate": 0.0001, |
| "loss": 0.0768, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.06, |
| "learning_rate": 0.0001, |
| "loss": 0.0653, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.06, |
| "learning_rate": 0.0001, |
| "loss": 0.0802, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.06, |
| "eval_loss": 0.09313643723726273, |
| "eval_runtime": 568.3479, |
| "eval_samples_per_second": 0.88, |
| "eval_steps_per_second": 0.88, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.07, |
| "learning_rate": 0.0001, |
| "loss": 0.1312, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.07, |
| "learning_rate": 0.0001, |
| "loss": 0.1045, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.07, |
| "learning_rate": 0.0001, |
| "loss": 0.0693, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.08, |
| "learning_rate": 0.0001, |
| "loss": 0.0622, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.08, |
| "learning_rate": 0.0001, |
| "loss": 0.0763, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_loss": 0.09021304547786713, |
| "eval_runtime": 568.4392, |
| "eval_samples_per_second": 0.88, |
| "eval_steps_per_second": 0.88, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.09, |
| "learning_rate": 0.0001, |
| "loss": 0.1246, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.09, |
| "learning_rate": 0.0001, |
| "loss": 0.1021, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.09, |
| "learning_rate": 0.0001, |
| "loss": 0.0768, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.1, |
| "learning_rate": 0.0001, |
| "loss": 0.061, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.1, |
| "learning_rate": 0.0001, |
| "loss": 0.0721, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.1, |
| "eval_loss": 0.08687781542539597, |
| "eval_runtime": 568.2572, |
| "eval_samples_per_second": 0.88, |
| "eval_steps_per_second": 0.88, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.11, |
| "learning_rate": 0.0001, |
| "loss": 0.1198, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.11, |
| "learning_rate": 0.0001, |
| "loss": 0.1007, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.11, |
| "learning_rate": 0.0001, |
| "loss": 0.0664, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.12, |
| "learning_rate": 0.0001, |
| "loss": 0.0588, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.12, |
| "learning_rate": 0.0001, |
| "loss": 0.0756, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.12, |
| "eval_loss": 0.08552172780036926, |
| "eval_runtime": 568.4354, |
| "eval_samples_per_second": 0.88, |
| "eval_steps_per_second": 0.88, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.13, |
| "learning_rate": 0.0001, |
| "loss": 0.1157, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.13, |
| "learning_rate": 0.0001, |
| "loss": 0.1, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.13, |
| "learning_rate": 0.0001, |
| "loss": 0.0687, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.14, |
| "learning_rate": 0.0001, |
| "loss": 0.0588, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.14, |
| "learning_rate": 0.0001, |
| "loss": 0.0689, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.14, |
| "eval_loss": 0.08368493616580963, |
| "eval_runtime": 568.2797, |
| "eval_samples_per_second": 0.88, |
| "eval_steps_per_second": 0.88, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.15, |
| "learning_rate": 0.0001, |
| "loss": 0.113, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.15, |
| "learning_rate": 0.0001, |
| "loss": 0.0959, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.15, |
| "learning_rate": 0.0001, |
| "loss": 0.0649, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.16, |
| "learning_rate": 0.0001, |
| "loss": 0.0545, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.16, |
| "learning_rate": 0.0001, |
| "loss": 0.0709, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_loss": 0.0824669748544693, |
| "eval_runtime": 568.3264, |
| "eval_samples_per_second": 0.88, |
| "eval_steps_per_second": 0.88, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.17, |
| "learning_rate": 0.0001, |
| "loss": 0.1198, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.17, |
| "learning_rate": 0.0001, |
| "loss": 0.0977, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.18, |
| "learning_rate": 0.0001, |
| "loss": 0.0706, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.18, |
| "learning_rate": 0.0001, |
| "loss": 0.0638, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.18, |
| "learning_rate": 0.0001, |
| "loss": 0.0691, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.18, |
| "eval_loss": 0.08339440822601318, |
| "eval_runtime": 568.6, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.19, |
| "learning_rate": 0.0001, |
| "loss": 0.1139, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.19, |
| "learning_rate": 0.0001, |
| "loss": 0.0941, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.2, |
| "learning_rate": 0.0001, |
| "loss": 0.0635, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.2, |
| "learning_rate": 0.0001, |
| "loss": 0.0588, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.2, |
| "learning_rate": 0.0001, |
| "loss": 0.0669, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.2, |
| "eval_loss": 0.08025918900966644, |
| "eval_runtime": 568.5809, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.21, |
| "learning_rate": 0.0001, |
| "loss": 0.1129, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.21, |
| "learning_rate": 0.0001, |
| "loss": 0.0957, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.22, |
| "learning_rate": 0.0001, |
| "loss": 0.0671, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.22, |
| "learning_rate": 0.0001, |
| "loss": 0.0563, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.22, |
| "learning_rate": 0.0001, |
| "loss": 0.0624, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.22, |
| "eval_loss": 0.07949241995811462, |
| "eval_runtime": 568.8464, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.23, |
| "learning_rate": 0.0001, |
| "loss": 0.114, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.23, |
| "learning_rate": 0.0001, |
| "loss": 0.0895, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.24, |
| "learning_rate": 0.0001, |
| "loss": 0.0672, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.24, |
| "learning_rate": 0.0001, |
| "loss": 0.06, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.24, |
| "learning_rate": 0.0001, |
| "loss": 0.0656, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_loss": 0.07967128604650497, |
| "eval_runtime": 568.9044, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.25, |
| "learning_rate": 0.0001, |
| "loss": 0.1091, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.25, |
| "learning_rate": 0.0001, |
| "loss": 0.0913, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.26, |
| "learning_rate": 0.0001, |
| "loss": 0.0733, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.26, |
| "learning_rate": 0.0001, |
| "loss": 0.0557, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.27, |
| "learning_rate": 0.0001, |
| "loss": 0.06, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.27, |
| "eval_loss": 0.08012031763792038, |
| "eval_runtime": 569.0972, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.27, |
| "learning_rate": 0.0001, |
| "loss": 0.1083, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.27, |
| "learning_rate": 0.0001, |
| "loss": 0.0894, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.28, |
| "learning_rate": 0.0001, |
| "loss": 0.0591, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.28, |
| "learning_rate": 0.0001, |
| "loss": 0.0573, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.29, |
| "learning_rate": 0.0001, |
| "loss": 0.0674, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.29, |
| "eval_loss": 0.07716764509677887, |
| "eval_runtime": 569.0522, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.29, |
| "learning_rate": 0.0001, |
| "loss": 0.1088, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.29, |
| "learning_rate": 0.0001, |
| "loss": 0.0912, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.3, |
| "learning_rate": 0.0001, |
| "loss": 0.0699, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.3, |
| "learning_rate": 0.0001, |
| "loss": 0.0553, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.31, |
| "learning_rate": 0.0001, |
| "loss": 0.0649, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.31, |
| "eval_loss": 0.0776296928524971, |
| "eval_runtime": 568.9262, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.31, |
| "learning_rate": 0.0001, |
| "loss": 0.109, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.31, |
| "learning_rate": 0.0001, |
| "loss": 0.0914, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.32, |
| "learning_rate": 0.0001, |
| "loss": 0.0669, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.32, |
| "learning_rate": 0.0001, |
| "loss": 0.0522, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.33, |
| "learning_rate": 0.0001, |
| "loss": 0.0616, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.33, |
| "eval_loss": 0.0761963352560997, |
| "eval_runtime": 568.7533, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.33, |
| "learning_rate": 0.0001, |
| "loss": 0.1023, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.33, |
| "learning_rate": 0.0001, |
| "loss": 0.0895, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.34, |
| "learning_rate": 0.0001, |
| "loss": 0.0611, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.34, |
| "learning_rate": 0.0001, |
| "loss": 0.0578, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.35, |
| "learning_rate": 0.0001, |
| "loss": 0.0609, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.35, |
| "eval_loss": 0.07499316334724426, |
| "eval_runtime": 568.7486, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.35, |
| "learning_rate": 0.0001, |
| "loss": 0.1043, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.35, |
| "learning_rate": 0.0001, |
| "loss": 0.0918, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.36, |
| "learning_rate": 0.0001, |
| "loss": 0.0631, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.36, |
| "learning_rate": 0.0001, |
| "loss": 0.0527, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.37, |
| "learning_rate": 0.0001, |
| "loss": 0.0575, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.37, |
| "eval_loss": 0.07557275891304016, |
| "eval_runtime": 568.8145, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.37, |
| "learning_rate": 0.0001, |
| "loss": 0.1076, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.38, |
| "learning_rate": 0.0001, |
| "loss": 0.0895, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.38, |
| "learning_rate": 0.0001, |
| "loss": 0.0683, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.38, |
| "learning_rate": 0.0001, |
| "loss": 0.0524, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.39, |
| "learning_rate": 0.0001, |
| "loss": 0.0547, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.39, |
| "eval_loss": 0.07596061378717422, |
| "eval_runtime": 568.7574, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.39, |
| "learning_rate": 0.0001, |
| "loss": 0.1121, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.4, |
| "learning_rate": 0.0001, |
| "loss": 0.0905, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.4, |
| "learning_rate": 0.0001, |
| "loss": 0.0606, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.4, |
| "learning_rate": 0.0001, |
| "loss": 0.0592, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.41, |
| "learning_rate": 0.0001, |
| "loss": 0.064, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.41, |
| "eval_loss": 0.07461731135845184, |
| "eval_runtime": 568.8501, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.41, |
| "learning_rate": 0.0001, |
| "loss": 0.1058, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.42, |
| "learning_rate": 0.0001, |
| "loss": 0.0854, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.42, |
| "learning_rate": 0.0001, |
| "loss": 0.0666, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.42, |
| "learning_rate": 0.0001, |
| "loss": 0.0554, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.43, |
| "learning_rate": 0.0001, |
| "loss": 0.0621, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.43, |
| "eval_loss": 0.0737040713429451, |
| "eval_runtime": 568.5368, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.43, |
| "learning_rate": 0.0001, |
| "loss": 0.1088, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.44, |
| "learning_rate": 0.0001, |
| "loss": 0.0862, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.44, |
| "learning_rate": 0.0001, |
| "loss": 0.0628, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.44, |
| "learning_rate": 0.0001, |
| "loss": 0.0535, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.45, |
| "learning_rate": 0.0001, |
| "loss": 0.0588, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.45, |
| "eval_loss": 0.07249170541763306, |
| "eval_runtime": 568.6042, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.45, |
| "learning_rate": 0.0001, |
| "loss": 0.1073, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.46, |
| "learning_rate": 0.0001, |
| "loss": 0.0894, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.46, |
| "learning_rate": 0.0001, |
| "loss": 0.067, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.46, |
| "learning_rate": 0.0001, |
| "loss": 0.0523, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.47, |
| "learning_rate": 0.0001, |
| "loss": 0.0567, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.47, |
| "eval_loss": 0.07228324562311172, |
| "eval_runtime": 568.454, |
| "eval_samples_per_second": 0.88, |
| "eval_steps_per_second": 0.88, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.47, |
| "learning_rate": 0.0001, |
| "loss": 0.1036, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.48, |
| "learning_rate": 0.0001, |
| "loss": 0.0858, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.48, |
| "learning_rate": 0.0001, |
| "loss": 0.0636, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.49, |
| "learning_rate": 0.0001, |
| "loss": 0.0527, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.49, |
| "learning_rate": 0.0001, |
| "loss": 0.0634, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.49, |
| "eval_loss": 0.07318860292434692, |
| "eval_runtime": 568.59, |
| "eval_samples_per_second": 0.879, |
| "eval_steps_per_second": 0.879, |
| "step": 1200 |
| } |
| ], |
| "max_steps": 5000, |
| "num_train_epochs": 3, |
| "total_flos": 8.961246418076467e+17, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|