queue status update
Browse files- queue_status.json +130 -130
queue_status.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"timestamp": "2026-04-12 08:
|
| 3 |
"total": 120,
|
| 4 |
-
"pending":
|
| 5 |
-
"running":
|
| 6 |
"done": 15,
|
| 7 |
-
"failed":
|
| 8 |
"stale": 0,
|
| 9 |
"retrying": 0,
|
| 10 |
"jobs": [
|
|
@@ -15,7 +15,7 @@
|
|
| 15 |
"gpu": 0,
|
| 16 |
"status": "done",
|
| 17 |
"elapsed": 1786,
|
| 18 |
-
"idle_time":
|
| 19 |
"exit_code": 0,
|
| 20 |
"retries": 0,
|
| 21 |
"log_file": "/tmp/gpu_queue/job_000_add_sub_baseline_10K_gpu0.log"
|
|
@@ -27,7 +27,7 @@
|
|
| 27 |
"gpu": 1,
|
| 28 |
"status": "done",
|
| 29 |
"elapsed": 2101,
|
| 30 |
-
"idle_time":
|
| 31 |
"exit_code": 0,
|
| 32 |
"retries": 0,
|
| 33 |
"log_file": "/tmp/gpu_queue/job_001_add_sub_baseline_25K_gpu1.log"
|
|
@@ -39,7 +39,7 @@
|
|
| 39 |
"gpu": 2,
|
| 40 |
"status": "done",
|
| 41 |
"elapsed": 4753,
|
| 42 |
-
"idle_time":
|
| 43 |
"exit_code": 0,
|
| 44 |
"retries": 0,
|
| 45 |
"log_file": "/tmp/gpu_queue/job_002_as_sorl_abs10_K1_25K_gpu2.log"
|
|
@@ -51,7 +51,7 @@
|
|
| 51 |
"gpu": 0,
|
| 52 |
"status": "done",
|
| 53 |
"elapsed": 2366,
|
| 54 |
-
"idle_time":
|
| 55 |
"exit_code": 0,
|
| 56 |
"retries": 0,
|
| 57 |
"log_file": "/tmp/gpu_queue/job_003_add_sub_baseline_50K_gpu0.log"
|
|
@@ -63,7 +63,7 @@
|
|
| 63 |
"gpu": 1,
|
| 64 |
"status": "done",
|
| 65 |
"elapsed": 6727,
|
| 66 |
-
"idle_time":
|
| 67 |
"exit_code": 0,
|
| 68 |
"retries": 0,
|
| 69 |
"log_file": "/tmp/gpu_queue/job_004_as_sorl_abs10_K1_50K_gpu1.log"
|
|
@@ -75,7 +75,7 @@
|
|
| 75 |
"gpu": 2,
|
| 76 |
"status": "done",
|
| 77 |
"elapsed": 3112,
|
| 78 |
-
"idle_time":
|
| 79 |
"exit_code": 0,
|
| 80 |
"retries": 0,
|
| 81 |
"log_file": "/tmp/gpu_queue/job_005_add_sub_baseline_100K_gpu2.log"
|
|
@@ -87,7 +87,7 @@
|
|
| 87 |
"gpu": 0,
|
| 88 |
"status": "done",
|
| 89 |
"elapsed": 8856,
|
| 90 |
-
"idle_time":
|
| 91 |
"exit_code": 0,
|
| 92 |
"retries": 0,
|
| 93 |
"log_file": "/tmp/gpu_queue/job_006_as_sorl_abs10_K1_100K_gpu0.log"
|
|
@@ -144,13 +144,13 @@
|
|
| 144 |
"job_id": 11,
|
| 145 |
"name": "as_sorl_abs2_K4_500K",
|
| 146 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 2 --K 4 --num",
|
| 147 |
-
"gpu":
|
| 148 |
-
"status": "
|
| 149 |
-
"elapsed":
|
| 150 |
-
"idle_time":
|
| 151 |
-
"exit_code": -
|
| 152 |
-
"retries":
|
| 153 |
-
"log_file": ""
|
| 154 |
},
|
| 155 |
{
|
| 156 |
"job_id": 12,
|
|
@@ -171,7 +171,7 @@
|
|
| 171 |
"gpu": 0,
|
| 172 |
"status": "failed",
|
| 173 |
"elapsed": 5,
|
| 174 |
-
"idle_time":
|
| 175 |
"exit_code": -9,
|
| 176 |
"retries": 1,
|
| 177 |
"log_file": "/tmp/gpu_queue/job_013_as_sorl_abs10_K4_500K_gpu0.log"
|
|
@@ -228,13 +228,13 @@
|
|
| 228 |
"job_id": 18,
|
| 229 |
"name": "as_sorl_abs100_K4_500K",
|
| 230 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 100 --K 4 --n",
|
| 231 |
-
"gpu":
|
| 232 |
-
"status": "
|
| 233 |
-
"elapsed":
|
| 234 |
-
"idle_time":
|
| 235 |
"exit_code": -1,
|
| 236 |
"retries": 0,
|
| 237 |
-
"log_file": ""
|
| 238 |
},
|
| 239 |
{
|
| 240 |
"job_id": 19,
|
|
@@ -336,13 +336,13 @@
|
|
| 336 |
"job_id": 27,
|
| 337 |
"name": "as_sorl_abs5_K1_25K",
|
| 338 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 25000 --abs_vocab 5 --K 1 --num_",
|
| 339 |
-
"gpu":
|
| 340 |
-
"status": "
|
| 341 |
-
"elapsed":
|
| 342 |
-
"idle_time":
|
| 343 |
-
"exit_code": -
|
| 344 |
-
"retries":
|
| 345 |
-
"log_file": ""
|
| 346 |
},
|
| 347 |
{
|
| 348 |
"job_id": 28,
|
|
@@ -459,7 +459,7 @@
|
|
| 459 |
"gpu": 0,
|
| 460 |
"status": "done",
|
| 461 |
"elapsed": 4510,
|
| 462 |
-
"idle_time":
|
| 463 |
"exit_code": 0,
|
| 464 |
"retries": 0,
|
| 465 |
"log_file": "/tmp/gpu_queue/job_037_as_sorl_abs10_K4_25K_gpu0.log"
|
|
@@ -504,13 +504,13 @@
|
|
| 504 |
"job_id": 41,
|
| 505 |
"name": "as_sorl_abs10_K4_50K",
|
| 506 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 50000 --abs_vocab 10 --K 4 --num",
|
| 507 |
-
"gpu":
|
| 508 |
-
"status": "
|
| 509 |
-
"elapsed":
|
| 510 |
-
"idle_time":
|
| 511 |
-
"exit_code": -
|
| 512 |
-
"retries":
|
| 513 |
-
"log_file": ""
|
| 514 |
},
|
| 515 |
{
|
| 516 |
"job_id": 42,
|
|
@@ -531,7 +531,7 @@
|
|
| 531 |
"gpu": 0,
|
| 532 |
"status": "done",
|
| 533 |
"elapsed": 5510,
|
| 534 |
-
"idle_time":
|
| 535 |
"exit_code": 0,
|
| 536 |
"retries": 0,
|
| 537 |
"log_file": "/tmp/gpu_queue/job_043_as_sorl_abs50_K4_50K_gpu0.log"
|
|
@@ -552,13 +552,13 @@
|
|
| 552 |
"job_id": 45,
|
| 553 |
"name": "as_sorl_abs10_K4_100K",
|
| 554 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 10 --K 4 --nu",
|
| 555 |
-
"gpu":
|
| 556 |
-
"status": "
|
| 557 |
-
"elapsed":
|
| 558 |
-
"idle_time":
|
| 559 |
"exit_code": -1,
|
| 560 |
"retries": 0,
|
| 561 |
-
"log_file": ""
|
| 562 |
},
|
| 563 |
{
|
| 564 |
"job_id": 46,
|
|
@@ -612,13 +612,13 @@
|
|
| 612 |
"job_id": 50,
|
| 613 |
"name": "as_sorl_abs10_K1_zipf10.0_500K",
|
| 614 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --al",
|
| 615 |
-
"gpu":
|
| 616 |
-
"status": "
|
| 617 |
-
"elapsed":
|
| 618 |
-
"idle_time":
|
| 619 |
-
"exit_code": -
|
| 620 |
-
"retries":
|
| 621 |
-
"log_file": ""
|
| 622 |
},
|
| 623 |
{
|
| 624 |
"job_id": 51,
|
|
@@ -651,7 +651,7 @@
|
|
| 651 |
"gpu": 1,
|
| 652 |
"status": "failed",
|
| 653 |
"elapsed": 5,
|
| 654 |
-
"idle_time":
|
| 655 |
"exit_code": -9,
|
| 656 |
"retries": 1,
|
| 657 |
"log_file": "/tmp/gpu_queue/job_053_as_sorl_abs10_K4_zipf10.0_500K_gpu1.log"
|
|
@@ -675,7 +675,7 @@
|
|
| 675 |
"gpu": 0,
|
| 676 |
"status": "failed",
|
| 677 |
"elapsed": 5,
|
| 678 |
-
"idle_time":
|
| 679 |
"exit_code": -9,
|
| 680 |
"retries": 1,
|
| 681 |
"log_file": "/tmp/gpu_queue/job_055_as_sorl_abs100_K1_zipf5.0_500K_gpu0.log"
|
|
@@ -708,13 +708,13 @@
|
|
| 708 |
"job_id": 58,
|
| 709 |
"name": "as_sorl_abs100_K4_zipf5.0_500K",
|
| 710 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 100 --K 4 --a",
|
| 711 |
-
"gpu":
|
| 712 |
-
"status": "
|
| 713 |
-
"elapsed":
|
| 714 |
-
"idle_time":
|
| 715 |
"exit_code": -1,
|
| 716 |
"retries": 0,
|
| 717 |
-
"log_file": ""
|
| 718 |
},
|
| 719 |
{
|
| 720 |
"job_id": 59,
|
|
@@ -756,13 +756,13 @@
|
|
| 756 |
"job_id": 62,
|
| 757 |
"name": "as_baseline_50K_1L3H510d",
|
| 758 |
"cmd": "python -m arithmetic.train --mode baseline --ops add_sub --dataset_size 50000 --num_epochs 20 --n_la",
|
| 759 |
-
"gpu":
|
| 760 |
-
"status": "
|
| 761 |
-
"elapsed":
|
| 762 |
"idle_time": 0,
|
| 763 |
-
"exit_code": -
|
| 764 |
-
"retries":
|
| 765 |
-
"log_file": ""
|
| 766 |
},
|
| 767 |
{
|
| 768 |
"job_id": 63,
|
|
@@ -771,7 +771,7 @@
|
|
| 771 |
"gpu": 0,
|
| 772 |
"status": "done",
|
| 773 |
"elapsed": 5352,
|
| 774 |
-
"idle_time":
|
| 775 |
"exit_code": 0,
|
| 776 |
"retries": 0,
|
| 777 |
"log_file": "/tmp/gpu_queue/job_063_as_sorl_abs10_K1_50K_1L3H510d_gpu0.log"
|
|
@@ -780,24 +780,24 @@
|
|
| 780 |
"job_id": 64,
|
| 781 |
"name": "as_baseline_100K_1L3H510d",
|
| 782 |
"cmd": "python -m arithmetic.train --mode baseline --ops add_sub --dataset_size 100000 --num_epochs 20 --n_l",
|
| 783 |
-
"gpu":
|
| 784 |
-
"status": "
|
| 785 |
-
"elapsed":
|
| 786 |
-
"idle_time":
|
| 787 |
-
"exit_code": -
|
| 788 |
-
"retries":
|
| 789 |
-
"log_file": ""
|
| 790 |
},
|
| 791 |
{
|
| 792 |
"job_id": 65,
|
| 793 |
"name": "as_sorl_abs10_K1_100K_1L3H510d",
|
| 794 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 10 --K 1 --nu",
|
| 795 |
"gpu": 0,
|
| 796 |
-
"status": "
|
| 797 |
-
"elapsed":
|
| 798 |
-
"idle_time":
|
| 799 |
-
"exit_code": -
|
| 800 |
-
"retries":
|
| 801 |
"log_file": "/tmp/gpu_queue/job_065_as_sorl_abs10_K1_100K_1L3H510d_gpu0.log"
|
| 802 |
},
|
| 803 |
{
|
|
@@ -841,24 +841,24 @@
|
|
| 841 |
"name": "as_sorl_abs10_K1_500K_1L3H510d",
|
| 842 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --nu",
|
| 843 |
"gpu": 2,
|
| 844 |
-
"status": "
|
| 845 |
-
"elapsed":
|
| 846 |
-
"idle_time":
|
| 847 |
-
"exit_code": -
|
| 848 |
-
"retries":
|
| 849 |
"log_file": "/tmp/gpu_queue/job_069_as_sorl_abs10_K1_500K_1L3H510d_gpu2.log"
|
| 850 |
},
|
| 851 |
{
|
| 852 |
"job_id": 70,
|
| 853 |
"name": "as_baseline_25K_1L2H256d",
|
| 854 |
"cmd": "python -m arithmetic.train --mode baseline --ops add_sub --dataset_size 25000 --num_epochs 20 --n_la",
|
| 855 |
-
"gpu":
|
| 856 |
-
"status": "
|
| 857 |
-
"elapsed":
|
| 858 |
-
"idle_time":
|
| 859 |
-
"exit_code": -
|
| 860 |
-
"retries":
|
| 861 |
-
"log_file": "/tmp/gpu_queue/
|
| 862 |
},
|
| 863 |
{
|
| 864 |
"job_id": 71,
|
|
@@ -903,7 +903,7 @@
|
|
| 903 |
"gpu": 2,
|
| 904 |
"status": "done",
|
| 905 |
"elapsed": 2797,
|
| 906 |
-
"idle_time":
|
| 907 |
"exit_code": 0,
|
| 908 |
"retries": 0,
|
| 909 |
"log_file": "/tmp/gpu_queue/job_074_as_baseline_100K_1L2H256d_gpu2.log"
|
|
@@ -912,13 +912,13 @@
|
|
| 912 |
"job_id": 75,
|
| 913 |
"name": "as_sorl_abs10_K1_100K_1L2H256d",
|
| 914 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 10 --K 1 --nu",
|
| 915 |
-
"gpu":
|
| 916 |
-
"status": "
|
| 917 |
-
"elapsed":
|
| 918 |
-
"idle_time":
|
| 919 |
-
"exit_code": -
|
| 920 |
-
"retries":
|
| 921 |
-
"log_file": ""
|
| 922 |
},
|
| 923 |
{
|
| 924 |
"job_id": 76,
|
|
@@ -927,7 +927,7 @@
|
|
| 927 |
"gpu": 0,
|
| 928 |
"status": "done",
|
| 929 |
"elapsed": 4832,
|
| 930 |
-
"idle_time":
|
| 931 |
"exit_code": 0,
|
| 932 |
"retries": 0,
|
| 933 |
"log_file": "/tmp/gpu_queue/job_076_as_baseline_250K_1L2H256d_gpu0.log"
|
|
@@ -951,7 +951,7 @@
|
|
| 951 |
"gpu": 2,
|
| 952 |
"status": "done",
|
| 953 |
"elapsed": 7064,
|
| 954 |
-
"idle_time":
|
| 955 |
"exit_code": 0,
|
| 956 |
"retries": 0,
|
| 957 |
"log_file": "/tmp/gpu_queue/job_078_as_baseline_500K_1L2H256d_gpu2.log"
|
|
@@ -975,7 +975,7 @@
|
|
| 975 |
"gpu": 1,
|
| 976 |
"status": "done",
|
| 977 |
"elapsed": 1971,
|
| 978 |
-
"idle_time":
|
| 979 |
"exit_code": 0,
|
| 980 |
"retries": 0,
|
| 981 |
"log_file": "/tmp/gpu_queue/job_080_as_baseline_25K_2L1H128d_gpu1.log"
|
|
@@ -1057,11 +1057,11 @@
|
|
| 1057 |
"name": "as_sorl_abs10_K1_250K_2L1H128d",
|
| 1058 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 250000 --abs_vocab 10 --K 1 --nu",
|
| 1059 |
"gpu": 1,
|
| 1060 |
-
"status": "
|
| 1061 |
-
"elapsed":
|
| 1062 |
-
"idle_time":
|
| 1063 |
-
"exit_code": -
|
| 1064 |
-
"retries":
|
| 1065 |
"log_file": "/tmp/gpu_queue/job_087_as_sorl_abs10_K1_250K_2L1H128d_gpu1.log"
|
| 1066 |
},
|
| 1067 |
{
|
|
@@ -1081,11 +1081,11 @@
|
|
| 1081 |
"name": "as_sorl_abs10_K1_500K_2L1H128d",
|
| 1082 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --nu",
|
| 1083 |
"gpu": 1,
|
| 1084 |
-
"status": "
|
| 1085 |
-
"elapsed":
|
| 1086 |
-
"idle_time":
|
| 1087 |
-
"exit_code": -
|
| 1088 |
-
"retries":
|
| 1089 |
"log_file": "/tmp/gpu_queue/job_089_as_sorl_abs10_K1_500K_2L1H128d_gpu1.log"
|
| 1090 |
},
|
| 1091 |
{
|
|
@@ -1119,7 +1119,7 @@
|
|
| 1119 |
"gpu": 0,
|
| 1120 |
"status": "done",
|
| 1121 |
"elapsed": 3477,
|
| 1122 |
-
"idle_time":
|
| 1123 |
"exit_code": 0,
|
| 1124 |
"retries": 0,
|
| 1125 |
"log_file": "/tmp/gpu_queue/job_092_as_sorl_abs30_K4_10K_gpu0.log"
|
|
@@ -1153,11 +1153,11 @@
|
|
| 1153 |
"name": "as_sorl_abs10_K4_250K",
|
| 1154 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 250000 --abs_vocab 10 --K 4 --nu",
|
| 1155 |
"gpu": 2,
|
| 1156 |
-
"status": "
|
| 1157 |
-
"elapsed":
|
| 1158 |
-
"idle_time":
|
| 1159 |
-
"exit_code": -
|
| 1160 |
-
"retries":
|
| 1161 |
"log_file": "/tmp/gpu_queue/job_095_as_sorl_abs10_K4_250K_gpu2.log"
|
| 1162 |
},
|
| 1163 |
{
|
|
@@ -1188,25 +1188,25 @@
|
|
| 1188 |
"job_id": 98,
|
| 1189 |
"name": "as_sorl_abs2_K1_100K",
|
| 1190 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 2 --K 1 --num",
|
| 1191 |
-
"gpu":
|
| 1192 |
-
"status": "
|
| 1193 |
-
"elapsed":
|
| 1194 |
-
"idle_time":
|
| 1195 |
-
"exit_code": -
|
| 1196 |
-
"retries":
|
| 1197 |
-
"log_file": ""
|
| 1198 |
},
|
| 1199 |
{
|
| 1200 |
"job_id": 99,
|
| 1201 |
"name": "as_sorl_abs2_K4_100K",
|
| 1202 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 2 --K 4 --num",
|
| 1203 |
-
"gpu":
|
| 1204 |
-
"status": "
|
| 1205 |
-
"elapsed":
|
| 1206 |
"idle_time": 0,
|
| 1207 |
-
"exit_code": -
|
| 1208 |
-
"retries":
|
| 1209 |
-
"log_file": ""
|
| 1210 |
},
|
| 1211 |
{
|
| 1212 |
"job_id": 100,
|
|
|
|
| 1 |
{
|
| 2 |
+
"timestamp": "2026-04-12 08:07:30",
|
| 3 |
"total": 120,
|
| 4 |
+
"pending": 84,
|
| 5 |
+
"running": 3,
|
| 6 |
"done": 15,
|
| 7 |
+
"failed": 18,
|
| 8 |
"stale": 0,
|
| 9 |
"retrying": 0,
|
| 10 |
"jobs": [
|
|
|
|
| 15 |
"gpu": 0,
|
| 16 |
"status": "done",
|
| 17 |
"elapsed": 1786,
|
| 18 |
+
"idle_time": 20388,
|
| 19 |
"exit_code": 0,
|
| 20 |
"retries": 0,
|
| 21 |
"log_file": "/tmp/gpu_queue/job_000_add_sub_baseline_10K_gpu0.log"
|
|
|
|
| 27 |
"gpu": 1,
|
| 28 |
"status": "done",
|
| 29 |
"elapsed": 2101,
|
| 30 |
+
"idle_time": 20073,
|
| 31 |
"exit_code": 0,
|
| 32 |
"retries": 0,
|
| 33 |
"log_file": "/tmp/gpu_queue/job_001_add_sub_baseline_25K_gpu1.log"
|
|
|
|
| 39 |
"gpu": 2,
|
| 40 |
"status": "done",
|
| 41 |
"elapsed": 4753,
|
| 42 |
+
"idle_time": 17421,
|
| 43 |
"exit_code": 0,
|
| 44 |
"retries": 0,
|
| 45 |
"log_file": "/tmp/gpu_queue/job_002_as_sorl_abs10_K1_25K_gpu2.log"
|
|
|
|
| 51 |
"gpu": 0,
|
| 52 |
"status": "done",
|
| 53 |
"elapsed": 2366,
|
| 54 |
+
"idle_time": 19808,
|
| 55 |
"exit_code": 0,
|
| 56 |
"retries": 0,
|
| 57 |
"log_file": "/tmp/gpu_queue/job_003_add_sub_baseline_50K_gpu0.log"
|
|
|
|
| 63 |
"gpu": 1,
|
| 64 |
"status": "done",
|
| 65 |
"elapsed": 6727,
|
| 66 |
+
"idle_time": 15447,
|
| 67 |
"exit_code": 0,
|
| 68 |
"retries": 0,
|
| 69 |
"log_file": "/tmp/gpu_queue/job_004_as_sorl_abs10_K1_50K_gpu1.log"
|
|
|
|
| 75 |
"gpu": 2,
|
| 76 |
"status": "done",
|
| 77 |
"elapsed": 3112,
|
| 78 |
+
"idle_time": 19062,
|
| 79 |
"exit_code": 0,
|
| 80 |
"retries": 0,
|
| 81 |
"log_file": "/tmp/gpu_queue/job_005_add_sub_baseline_100K_gpu2.log"
|
|
|
|
| 87 |
"gpu": 0,
|
| 88 |
"status": "done",
|
| 89 |
"elapsed": 8856,
|
| 90 |
+
"idle_time": 11528,
|
| 91 |
"exit_code": 0,
|
| 92 |
"retries": 0,
|
| 93 |
"log_file": "/tmp/gpu_queue/job_006_as_sorl_abs10_K1_100K_gpu0.log"
|
|
|
|
| 144 |
"job_id": 11,
|
| 145 |
"name": "as_sorl_abs2_K4_500K",
|
| 146 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 2 --K 4 --num",
|
| 147 |
+
"gpu": 2,
|
| 148 |
+
"status": "failed",
|
| 149 |
+
"elapsed": 5,
|
| 150 |
+
"idle_time": 1,
|
| 151 |
+
"exit_code": -9,
|
| 152 |
+
"retries": 1,
|
| 153 |
+
"log_file": "/tmp/gpu_queue/job_011_as_sorl_abs2_K4_500K_gpu2.log"
|
| 154 |
},
|
| 155 |
{
|
| 156 |
"job_id": 12,
|
|
|
|
| 171 |
"gpu": 0,
|
| 172 |
"status": "failed",
|
| 173 |
"elapsed": 5,
|
| 174 |
+
"idle_time": 1035,
|
| 175 |
"exit_code": -9,
|
| 176 |
"retries": 1,
|
| 177 |
"log_file": "/tmp/gpu_queue/job_013_as_sorl_abs10_K4_500K_gpu0.log"
|
|
|
|
| 228 |
"job_id": 18,
|
| 229 |
"name": "as_sorl_abs100_K4_500K",
|
| 230 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 100 --K 4 --n",
|
| 231 |
+
"gpu": 2,
|
| 232 |
+
"status": "running",
|
| 233 |
+
"elapsed": 1,
|
| 234 |
+
"idle_time": 1,
|
| 235 |
"exit_code": -1,
|
| 236 |
"retries": 0,
|
| 237 |
+
"log_file": "/tmp/gpu_queue/job_018_as_sorl_abs100_K4_500K_gpu2.log"
|
| 238 |
},
|
| 239 |
{
|
| 240 |
"job_id": 19,
|
|
|
|
| 336 |
"job_id": 27,
|
| 337 |
"name": "as_sorl_abs5_K1_25K",
|
| 338 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 25000 --abs_vocab 5 --K 1 --num_",
|
| 339 |
+
"gpu": 0,
|
| 340 |
+
"status": "failed",
|
| 341 |
+
"elapsed": 6,
|
| 342 |
+
"idle_time": 5,
|
| 343 |
+
"exit_code": -9,
|
| 344 |
+
"retries": 1,
|
| 345 |
+
"log_file": "/tmp/gpu_queue/job_027_as_sorl_abs5_K1_25K_gpu0.log"
|
| 346 |
},
|
| 347 |
{
|
| 348 |
"job_id": 28,
|
|
|
|
| 459 |
"gpu": 0,
|
| 460 |
"status": "done",
|
| 461 |
"elapsed": 4510,
|
| 462 |
+
"idle_time": 15293,
|
| 463 |
"exit_code": 0,
|
| 464 |
"retries": 0,
|
| 465 |
"log_file": "/tmp/gpu_queue/job_037_as_sorl_abs10_K4_25K_gpu0.log"
|
|
|
|
| 504 |
"job_id": 41,
|
| 505 |
"name": "as_sorl_abs10_K4_50K",
|
| 506 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 50000 --abs_vocab 10 --K 4 --num",
|
| 507 |
+
"gpu": 2,
|
| 508 |
+
"status": "failed",
|
| 509 |
+
"elapsed": 6,
|
| 510 |
+
"idle_time": 5,
|
| 511 |
+
"exit_code": -9,
|
| 512 |
+
"retries": 1,
|
| 513 |
+
"log_file": "/tmp/gpu_queue/job_041_as_sorl_abs10_K4_50K_gpu2.log"
|
| 514 |
},
|
| 515 |
{
|
| 516 |
"job_id": 42,
|
|
|
|
| 531 |
"gpu": 0,
|
| 532 |
"status": "done",
|
| 533 |
"elapsed": 5510,
|
| 534 |
+
"idle_time": 9779,
|
| 535 |
"exit_code": 0,
|
| 536 |
"retries": 0,
|
| 537 |
"log_file": "/tmp/gpu_queue/job_043_as_sorl_abs50_K4_50K_gpu0.log"
|
|
|
|
| 552 |
"job_id": 45,
|
| 553 |
"name": "as_sorl_abs10_K4_100K",
|
| 554 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 10 --K 4 --nu",
|
| 555 |
+
"gpu": 0,
|
| 556 |
+
"status": "running",
|
| 557 |
+
"elapsed": 1,
|
| 558 |
+
"idle_time": 1,
|
| 559 |
"exit_code": -1,
|
| 560 |
"retries": 0,
|
| 561 |
+
"log_file": "/tmp/gpu_queue/job_045_as_sorl_abs10_K4_100K_gpu0.log"
|
| 562 |
},
|
| 563 |
{
|
| 564 |
"job_id": 46,
|
|
|
|
| 612 |
"job_id": 50,
|
| 613 |
"name": "as_sorl_abs10_K1_zipf10.0_500K",
|
| 614 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --al",
|
| 615 |
+
"gpu": 2,
|
| 616 |
+
"status": "failed",
|
| 617 |
+
"elapsed": 6,
|
| 618 |
+
"idle_time": 11,
|
| 619 |
+
"exit_code": -9,
|
| 620 |
+
"retries": 1,
|
| 621 |
+
"log_file": "/tmp/gpu_queue/job_050_as_sorl_abs10_K1_zipf10.0_500K_gpu2.log"
|
| 622 |
},
|
| 623 |
{
|
| 624 |
"job_id": 51,
|
|
|
|
| 651 |
"gpu": 1,
|
| 652 |
"status": "failed",
|
| 653 |
"elapsed": 5,
|
| 654 |
+
"idle_time": 13457,
|
| 655 |
"exit_code": -9,
|
| 656 |
"retries": 1,
|
| 657 |
"log_file": "/tmp/gpu_queue/job_053_as_sorl_abs10_K4_zipf10.0_500K_gpu1.log"
|
|
|
|
| 675 |
"gpu": 0,
|
| 676 |
"status": "failed",
|
| 677 |
"elapsed": 5,
|
| 678 |
+
"idle_time": 6157,
|
| 679 |
"exit_code": -9,
|
| 680 |
"retries": 1,
|
| 681 |
"log_file": "/tmp/gpu_queue/job_055_as_sorl_abs100_K1_zipf5.0_500K_gpu0.log"
|
|
|
|
| 708 |
"job_id": 58,
|
| 709 |
"name": "as_sorl_abs100_K4_zipf5.0_500K",
|
| 710 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 100 --K 4 --a",
|
| 711 |
+
"gpu": 1,
|
| 712 |
+
"status": "running",
|
| 713 |
+
"elapsed": 1,
|
| 714 |
+
"idle_time": 1,
|
| 715 |
"exit_code": -1,
|
| 716 |
"retries": 0,
|
| 717 |
+
"log_file": "/tmp/gpu_queue/job_058_as_sorl_abs100_K4_zipf5.0_500K_gpu1.log"
|
| 718 |
},
|
| 719 |
{
|
| 720 |
"job_id": 59,
|
|
|
|
| 756 |
"job_id": 62,
|
| 757 |
"name": "as_baseline_50K_1L3H510d",
|
| 758 |
"cmd": "python -m arithmetic.train --mode baseline --ops add_sub --dataset_size 50000 --num_epochs 20 --n_la",
|
| 759 |
+
"gpu": 0,
|
| 760 |
+
"status": "failed",
|
| 761 |
+
"elapsed": 5,
|
| 762 |
"idle_time": 0,
|
| 763 |
+
"exit_code": -9,
|
| 764 |
+
"retries": 1,
|
| 765 |
+
"log_file": "/tmp/gpu_queue/job_062_as_baseline_50K_1L3H510d_gpu0.log"
|
| 766 |
},
|
| 767 |
{
|
| 768 |
"job_id": 63,
|
|
|
|
| 771 |
"gpu": 0,
|
| 772 |
"status": "done",
|
| 773 |
"elapsed": 5352,
|
| 774 |
+
"idle_time": 6171,
|
| 775 |
"exit_code": 0,
|
| 776 |
"retries": 0,
|
| 777 |
"log_file": "/tmp/gpu_queue/job_063_as_sorl_abs10_K1_50K_1L3H510d_gpu0.log"
|
|
|
|
| 780 |
"job_id": 64,
|
| 781 |
"name": "as_baseline_100K_1L3H510d",
|
| 782 |
"cmd": "python -m arithmetic.train --mode baseline --ops add_sub --dataset_size 100000 --num_epochs 20 --n_l",
|
| 783 |
+
"gpu": 1,
|
| 784 |
+
"status": "failed",
|
| 785 |
+
"elapsed": 4,
|
| 786 |
+
"idle_time": 21,
|
| 787 |
+
"exit_code": -15,
|
| 788 |
+
"retries": 1,
|
| 789 |
+
"log_file": "/tmp/gpu_queue/job_064_as_baseline_100K_1L3H510d_gpu1.log"
|
| 790 |
},
|
| 791 |
{
|
| 792 |
"job_id": 65,
|
| 793 |
"name": "as_sorl_abs10_K1_100K_1L3H510d",
|
| 794 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 10 --K 1 --nu",
|
| 795 |
"gpu": 0,
|
| 796 |
+
"status": "failed",
|
| 797 |
+
"elapsed": 6,
|
| 798 |
+
"idle_time": 12,
|
| 799 |
+
"exit_code": -9,
|
| 800 |
+
"retries": 1,
|
| 801 |
"log_file": "/tmp/gpu_queue/job_065_as_sorl_abs10_K1_100K_1L3H510d_gpu0.log"
|
| 802 |
},
|
| 803 |
{
|
|
|
|
| 841 |
"name": "as_sorl_abs10_K1_500K_1L3H510d",
|
| 842 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --nu",
|
| 843 |
"gpu": 2,
|
| 844 |
+
"status": "failed",
|
| 845 |
+
"elapsed": 2,
|
| 846 |
+
"idle_time": 19,
|
| 847 |
+
"exit_code": -15,
|
| 848 |
+
"retries": 1,
|
| 849 |
"log_file": "/tmp/gpu_queue/job_069_as_sorl_abs10_K1_500K_1L3H510d_gpu2.log"
|
| 850 |
},
|
| 851 |
{
|
| 852 |
"job_id": 70,
|
| 853 |
"name": "as_baseline_25K_1L2H256d",
|
| 854 |
"cmd": "python -m arithmetic.train --mode baseline --ops add_sub --dataset_size 25000 --num_epochs 20 --n_la",
|
| 855 |
+
"gpu": 1,
|
| 856 |
+
"status": "failed",
|
| 857 |
+
"elapsed": 6,
|
| 858 |
+
"idle_time": 11,
|
| 859 |
+
"exit_code": -9,
|
| 860 |
+
"retries": 1,
|
| 861 |
+
"log_file": "/tmp/gpu_queue/job_070_as_baseline_25K_1L2H256d_gpu1.log"
|
| 862 |
},
|
| 863 |
{
|
| 864 |
"job_id": 71,
|
|
|
|
| 903 |
"gpu": 2,
|
| 904 |
"status": "done",
|
| 905 |
"elapsed": 2797,
|
| 906 |
+
"idle_time": 14622,
|
| 907 |
"exit_code": 0,
|
| 908 |
"retries": 0,
|
| 909 |
"log_file": "/tmp/gpu_queue/job_074_as_baseline_100K_1L2H256d_gpu2.log"
|
|
|
|
| 912 |
"job_id": 75,
|
| 913 |
"name": "as_sorl_abs10_K1_100K_1L2H256d",
|
| 914 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 10 --K 1 --nu",
|
| 915 |
+
"gpu": 1,
|
| 916 |
+
"status": "failed",
|
| 917 |
+
"elapsed": 6,
|
| 918 |
+
"idle_time": 5,
|
| 919 |
+
"exit_code": -9,
|
| 920 |
+
"retries": 1,
|
| 921 |
+
"log_file": "/tmp/gpu_queue/job_075_as_sorl_abs10_K1_100K_1L2H256d_gpu1.log"
|
| 922 |
},
|
| 923 |
{
|
| 924 |
"job_id": 76,
|
|
|
|
| 927 |
"gpu": 0,
|
| 928 |
"status": "done",
|
| 929 |
"elapsed": 4832,
|
| 930 |
+
"idle_time": 1467,
|
| 931 |
"exit_code": 0,
|
| 932 |
"retries": 0,
|
| 933 |
"log_file": "/tmp/gpu_queue/job_076_as_baseline_250K_1L2H256d_gpu0.log"
|
|
|
|
| 951 |
"gpu": 2,
|
| 952 |
"status": "done",
|
| 953 |
"elapsed": 7064,
|
| 954 |
+
"idle_time": 11995,
|
| 955 |
"exit_code": 0,
|
| 956 |
"retries": 0,
|
| 957 |
"log_file": "/tmp/gpu_queue/job_078_as_baseline_500K_1L2H256d_gpu2.log"
|
|
|
|
| 975 |
"gpu": 1,
|
| 976 |
"status": "done",
|
| 977 |
"elapsed": 1971,
|
| 978 |
+
"idle_time": 13473,
|
| 979 |
"exit_code": 0,
|
| 980 |
"retries": 0,
|
| 981 |
"log_file": "/tmp/gpu_queue/job_080_as_baseline_25K_2L1H128d_gpu1.log"
|
|
|
|
| 1057 |
"name": "as_sorl_abs10_K1_250K_2L1H128d",
|
| 1058 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 250000 --abs_vocab 10 --K 1 --nu",
|
| 1059 |
"gpu": 1,
|
| 1060 |
+
"status": "failed",
|
| 1061 |
+
"elapsed": 5,
|
| 1062 |
+
"idle_time": 27,
|
| 1063 |
+
"exit_code": -9,
|
| 1064 |
+
"retries": 1,
|
| 1065 |
"log_file": "/tmp/gpu_queue/job_087_as_sorl_abs10_K1_250K_2L1H128d_gpu1.log"
|
| 1066 |
},
|
| 1067 |
{
|
|
|
|
| 1081 |
"name": "as_sorl_abs10_K1_500K_2L1H128d",
|
| 1082 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --nu",
|
| 1083 |
"gpu": 1,
|
| 1084 |
+
"status": "failed",
|
| 1085 |
+
"elapsed": 5,
|
| 1086 |
+
"idle_time": 26,
|
| 1087 |
+
"exit_code": -9,
|
| 1088 |
+
"retries": 1,
|
| 1089 |
"log_file": "/tmp/gpu_queue/job_089_as_sorl_abs10_K1_500K_2L1H128d_gpu1.log"
|
| 1090 |
},
|
| 1091 |
{
|
|
|
|
| 1119 |
"gpu": 0,
|
| 1120 |
"status": "done",
|
| 1121 |
"elapsed": 3477,
|
| 1122 |
+
"idle_time": 6302,
|
| 1123 |
"exit_code": 0,
|
| 1124 |
"retries": 0,
|
| 1125 |
"log_file": "/tmp/gpu_queue/job_092_as_sorl_abs30_K4_10K_gpu0.log"
|
|
|
|
| 1153 |
"name": "as_sorl_abs10_K4_250K",
|
| 1154 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 250000 --abs_vocab 10 --K 4 --nu",
|
| 1155 |
"gpu": 2,
|
| 1156 |
+
"status": "failed",
|
| 1157 |
+
"elapsed": 5,
|
| 1158 |
+
"idle_time": 23,
|
| 1159 |
+
"exit_code": -9,
|
| 1160 |
+
"retries": 1,
|
| 1161 |
"log_file": "/tmp/gpu_queue/job_095_as_sorl_abs10_K4_250K_gpu2.log"
|
| 1162 |
},
|
| 1163 |
{
|
|
|
|
| 1188 |
"job_id": 98,
|
| 1189 |
"name": "as_sorl_abs2_K1_100K",
|
| 1190 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 2 --K 1 --num",
|
| 1191 |
+
"gpu": 1,
|
| 1192 |
+
"status": "failed",
|
| 1193 |
+
"elapsed": 3,
|
| 1194 |
+
"idle_time": 20,
|
| 1195 |
+
"exit_code": -15,
|
| 1196 |
+
"retries": 1,
|
| 1197 |
+
"log_file": "/tmp/gpu_queue/job_098_as_sorl_abs2_K1_100K_gpu1.log"
|
| 1198 |
},
|
| 1199 |
{
|
| 1200 |
"job_id": 99,
|
| 1201 |
"name": "as_sorl_abs2_K4_100K",
|
| 1202 |
"cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 2 --K 4 --num",
|
| 1203 |
+
"gpu": 1,
|
| 1204 |
+
"status": "failed",
|
| 1205 |
+
"elapsed": 5,
|
| 1206 |
"idle_time": 0,
|
| 1207 |
+
"exit_code": -9,
|
| 1208 |
+
"retries": 1,
|
| 1209 |
+
"log_file": "/tmp/gpu_queue/job_099_as_sorl_abs2_K4_100K_gpu1.log"
|
| 1210 |
},
|
| 1211 |
{
|
| 1212 |
"job_id": 100,
|