amirali1985 commited on
Commit
983ce29
·
verified ·
1 Parent(s): 02ef935

queue status update

Browse files
Files changed (1) hide show
  1. queue_status.json +130 -130
queue_status.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "timestamp": "2026-04-12 08:05:30",
3
  "total": 120,
4
- "pending": 96,
5
- "running": 6,
6
  "done": 15,
7
- "failed": 3,
8
  "stale": 0,
9
  "retrying": 0,
10
  "jobs": [
@@ -15,7 +15,7 @@
15
  "gpu": 0,
16
  "status": "done",
17
  "elapsed": 1786,
18
- "idle_time": 20267,
19
  "exit_code": 0,
20
  "retries": 0,
21
  "log_file": "/tmp/gpu_queue/job_000_add_sub_baseline_10K_gpu0.log"
@@ -27,7 +27,7 @@
27
  "gpu": 1,
28
  "status": "done",
29
  "elapsed": 2101,
30
- "idle_time": 19952,
31
  "exit_code": 0,
32
  "retries": 0,
33
  "log_file": "/tmp/gpu_queue/job_001_add_sub_baseline_25K_gpu1.log"
@@ -39,7 +39,7 @@
39
  "gpu": 2,
40
  "status": "done",
41
  "elapsed": 4753,
42
- "idle_time": 17301,
43
  "exit_code": 0,
44
  "retries": 0,
45
  "log_file": "/tmp/gpu_queue/job_002_as_sorl_abs10_K1_25K_gpu2.log"
@@ -51,7 +51,7 @@
51
  "gpu": 0,
52
  "status": "done",
53
  "elapsed": 2366,
54
- "idle_time": 19687,
55
  "exit_code": 0,
56
  "retries": 0,
57
  "log_file": "/tmp/gpu_queue/job_003_add_sub_baseline_50K_gpu0.log"
@@ -63,7 +63,7 @@
63
  "gpu": 1,
64
  "status": "done",
65
  "elapsed": 6727,
66
- "idle_time": 15326,
67
  "exit_code": 0,
68
  "retries": 0,
69
  "log_file": "/tmp/gpu_queue/job_004_as_sorl_abs10_K1_50K_gpu1.log"
@@ -75,7 +75,7 @@
75
  "gpu": 2,
76
  "status": "done",
77
  "elapsed": 3112,
78
- "idle_time": 18941,
79
  "exit_code": 0,
80
  "retries": 0,
81
  "log_file": "/tmp/gpu_queue/job_005_add_sub_baseline_100K_gpu2.log"
@@ -87,7 +87,7 @@
87
  "gpu": 0,
88
  "status": "done",
89
  "elapsed": 8856,
90
- "idle_time": 11407,
91
  "exit_code": 0,
92
  "retries": 0,
93
  "log_file": "/tmp/gpu_queue/job_006_as_sorl_abs10_K1_100K_gpu0.log"
@@ -144,13 +144,13 @@
144
  "job_id": 11,
145
  "name": "as_sorl_abs2_K4_500K",
146
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 2 --K 4 --num",
147
- "gpu": -1,
148
- "status": "pending",
149
- "elapsed": 0,
150
- "idle_time": 0,
151
- "exit_code": -1,
152
- "retries": 0,
153
- "log_file": ""
154
  },
155
  {
156
  "job_id": 12,
@@ -171,7 +171,7 @@
171
  "gpu": 0,
172
  "status": "failed",
173
  "elapsed": 5,
174
- "idle_time": 914,
175
  "exit_code": -9,
176
  "retries": 1,
177
  "log_file": "/tmp/gpu_queue/job_013_as_sorl_abs10_K4_500K_gpu0.log"
@@ -228,13 +228,13 @@
228
  "job_id": 18,
229
  "name": "as_sorl_abs100_K4_500K",
230
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 100 --K 4 --n",
231
- "gpu": -1,
232
- "status": "pending",
233
- "elapsed": 0,
234
- "idle_time": 0,
235
  "exit_code": -1,
236
  "retries": 0,
237
- "log_file": ""
238
  },
239
  {
240
  "job_id": 19,
@@ -336,13 +336,13 @@
336
  "job_id": 27,
337
  "name": "as_sorl_abs5_K1_25K",
338
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 25000 --abs_vocab 5 --K 1 --num_",
339
- "gpu": -1,
340
- "status": "pending",
341
- "elapsed": 0,
342
- "idle_time": 0,
343
- "exit_code": -1,
344
- "retries": 0,
345
- "log_file": ""
346
  },
347
  {
348
  "job_id": 28,
@@ -459,7 +459,7 @@
459
  "gpu": 0,
460
  "status": "done",
461
  "elapsed": 4510,
462
- "idle_time": 15173,
463
  "exit_code": 0,
464
  "retries": 0,
465
  "log_file": "/tmp/gpu_queue/job_037_as_sorl_abs10_K4_25K_gpu0.log"
@@ -504,13 +504,13 @@
504
  "job_id": 41,
505
  "name": "as_sorl_abs10_K4_50K",
506
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 50000 --abs_vocab 10 --K 4 --num",
507
- "gpu": -1,
508
- "status": "pending",
509
- "elapsed": 0,
510
- "idle_time": 0,
511
- "exit_code": -1,
512
- "retries": 0,
513
- "log_file": ""
514
  },
515
  {
516
  "job_id": 42,
@@ -531,7 +531,7 @@
531
  "gpu": 0,
532
  "status": "done",
533
  "elapsed": 5510,
534
- "idle_time": 9658,
535
  "exit_code": 0,
536
  "retries": 0,
537
  "log_file": "/tmp/gpu_queue/job_043_as_sorl_abs50_K4_50K_gpu0.log"
@@ -552,13 +552,13 @@
552
  "job_id": 45,
553
  "name": "as_sorl_abs10_K4_100K",
554
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 10 --K 4 --nu",
555
- "gpu": -1,
556
- "status": "pending",
557
- "elapsed": 0,
558
- "idle_time": 0,
559
  "exit_code": -1,
560
  "retries": 0,
561
- "log_file": ""
562
  },
563
  {
564
  "job_id": 46,
@@ -612,13 +612,13 @@
612
  "job_id": 50,
613
  "name": "as_sorl_abs10_K1_zipf10.0_500K",
614
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --al",
615
- "gpu": -1,
616
- "status": "pending",
617
- "elapsed": 0,
618
- "idle_time": 0,
619
- "exit_code": -1,
620
- "retries": 0,
621
- "log_file": ""
622
  },
623
  {
624
  "job_id": 51,
@@ -651,7 +651,7 @@
651
  "gpu": 1,
652
  "status": "failed",
653
  "elapsed": 5,
654
- "idle_time": 13337,
655
  "exit_code": -9,
656
  "retries": 1,
657
  "log_file": "/tmp/gpu_queue/job_053_as_sorl_abs10_K4_zipf10.0_500K_gpu1.log"
@@ -675,7 +675,7 @@
675
  "gpu": 0,
676
  "status": "failed",
677
  "elapsed": 5,
678
- "idle_time": 6037,
679
  "exit_code": -9,
680
  "retries": 1,
681
  "log_file": "/tmp/gpu_queue/job_055_as_sorl_abs100_K1_zipf5.0_500K_gpu0.log"
@@ -708,13 +708,13 @@
708
  "job_id": 58,
709
  "name": "as_sorl_abs100_K4_zipf5.0_500K",
710
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 100 --K 4 --a",
711
- "gpu": -1,
712
- "status": "pending",
713
- "elapsed": 0,
714
- "idle_time": 0,
715
  "exit_code": -1,
716
  "retries": 0,
717
- "log_file": ""
718
  },
719
  {
720
  "job_id": 59,
@@ -756,13 +756,13 @@
756
  "job_id": 62,
757
  "name": "as_baseline_50K_1L3H510d",
758
  "cmd": "python -m arithmetic.train --mode baseline --ops add_sub --dataset_size 50000 --num_epochs 20 --n_la",
759
- "gpu": -1,
760
- "status": "pending",
761
- "elapsed": 0,
762
  "idle_time": 0,
763
- "exit_code": -1,
764
- "retries": 0,
765
- "log_file": ""
766
  },
767
  {
768
  "job_id": 63,
@@ -771,7 +771,7 @@
771
  "gpu": 0,
772
  "status": "done",
773
  "elapsed": 5352,
774
- "idle_time": 6051,
775
  "exit_code": 0,
776
  "retries": 0,
777
  "log_file": "/tmp/gpu_queue/job_063_as_sorl_abs10_K1_50K_1L3H510d_gpu0.log"
@@ -780,24 +780,24 @@
780
  "job_id": 64,
781
  "name": "as_baseline_100K_1L3H510d",
782
  "cmd": "python -m arithmetic.train --mode baseline --ops add_sub --dataset_size 100000 --num_epochs 20 --n_l",
783
- "gpu": -1,
784
- "status": "pending",
785
- "elapsed": 0,
786
- "idle_time": 0,
787
- "exit_code": -1,
788
- "retries": 0,
789
- "log_file": ""
790
  },
791
  {
792
  "job_id": 65,
793
  "name": "as_sorl_abs10_K1_100K_1L3H510d",
794
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 10 --K 1 --nu",
795
  "gpu": 0,
796
- "status": "running",
797
- "elapsed": 1341,
798
- "idle_time": 15,
799
- "exit_code": -1,
800
- "retries": 0,
801
  "log_file": "/tmp/gpu_queue/job_065_as_sorl_abs10_K1_100K_1L3H510d_gpu0.log"
802
  },
803
  {
@@ -841,24 +841,24 @@
841
  "name": "as_sorl_abs10_K1_500K_1L3H510d",
842
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --nu",
843
  "gpu": 2,
844
- "status": "running",
845
- "elapsed": 14496,
846
- "idle_time": 25,
847
- "exit_code": -1,
848
- "retries": 0,
849
  "log_file": "/tmp/gpu_queue/job_069_as_sorl_abs10_K1_500K_1L3H510d_gpu2.log"
850
  },
851
  {
852
  "job_id": 70,
853
  "name": "as_baseline_25K_1L2H256d",
854
  "cmd": "python -m arithmetic.train --mode baseline --ops add_sub --dataset_size 25000 --num_epochs 20 --n_la",
855
- "gpu": 0,
856
- "status": "running",
857
- "elapsed": 911,
858
- "idle_time": 55,
859
- "exit_code": -1,
860
- "retries": 0,
861
- "log_file": "/tmp/gpu_queue/job_070_as_baseline_25K_1L2H256d_gpu0.log"
862
  },
863
  {
864
  "job_id": 71,
@@ -903,7 +903,7 @@
903
  "gpu": 2,
904
  "status": "done",
905
  "elapsed": 2797,
906
- "idle_time": 14501,
907
  "exit_code": 0,
908
  "retries": 0,
909
  "log_file": "/tmp/gpu_queue/job_074_as_baseline_100K_1L2H256d_gpu2.log"
@@ -912,13 +912,13 @@
912
  "job_id": 75,
913
  "name": "as_sorl_abs10_K1_100K_1L2H256d",
914
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 10 --K 1 --nu",
915
- "gpu": -1,
916
- "status": "pending",
917
- "elapsed": 0,
918
- "idle_time": 0,
919
- "exit_code": -1,
920
- "retries": 0,
921
- "log_file": ""
922
  },
923
  {
924
  "job_id": 76,
@@ -927,7 +927,7 @@
927
  "gpu": 0,
928
  "status": "done",
929
  "elapsed": 4832,
930
- "idle_time": 1346,
931
  "exit_code": 0,
932
  "retries": 0,
933
  "log_file": "/tmp/gpu_queue/job_076_as_baseline_250K_1L2H256d_gpu0.log"
@@ -951,7 +951,7 @@
951
  "gpu": 2,
952
  "status": "done",
953
  "elapsed": 7064,
954
- "idle_time": 11874,
955
  "exit_code": 0,
956
  "retries": 0,
957
  "log_file": "/tmp/gpu_queue/job_078_as_baseline_500K_1L2H256d_gpu2.log"
@@ -975,7 +975,7 @@
975
  "gpu": 1,
976
  "status": "done",
977
  "elapsed": 1971,
978
- "idle_time": 13352,
979
  "exit_code": 0,
980
  "retries": 0,
981
  "log_file": "/tmp/gpu_queue/job_080_as_baseline_25K_2L1H128d_gpu1.log"
@@ -1057,11 +1057,11 @@
1057
  "name": "as_sorl_abs10_K1_250K_2L1H128d",
1058
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 250000 --abs_vocab 10 --K 1 --nu",
1059
  "gpu": 1,
1060
- "status": "running",
1061
- "elapsed": 13336,
1062
- "idle_time": 12,
1063
- "exit_code": -1,
1064
- "retries": 0,
1065
  "log_file": "/tmp/gpu_queue/job_087_as_sorl_abs10_K1_250K_2L1H128d_gpu1.log"
1066
  },
1067
  {
@@ -1081,11 +1081,11 @@
1081
  "name": "as_sorl_abs10_K1_500K_2L1H128d",
1082
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --nu",
1083
  "gpu": 1,
1084
- "status": "running",
1085
- "elapsed": 19946,
1086
- "idle_time": 12,
1087
- "exit_code": -1,
1088
- "retries": 0,
1089
  "log_file": "/tmp/gpu_queue/job_089_as_sorl_abs10_K1_500K_2L1H128d_gpu1.log"
1090
  },
1091
  {
@@ -1119,7 +1119,7 @@
1119
  "gpu": 0,
1120
  "status": "done",
1121
  "elapsed": 3477,
1122
- "idle_time": 6181,
1123
  "exit_code": 0,
1124
  "retries": 0,
1125
  "log_file": "/tmp/gpu_queue/job_092_as_sorl_abs30_K4_10K_gpu0.log"
@@ -1153,11 +1153,11 @@
1153
  "name": "as_sorl_abs10_K4_250K",
1154
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 250000 --abs_vocab 10 --K 4 --nu",
1155
  "gpu": 2,
1156
- "status": "running",
1157
- "elapsed": 11871,
1158
- "idle_time": 12,
1159
- "exit_code": -1,
1160
- "retries": 0,
1161
  "log_file": "/tmp/gpu_queue/job_095_as_sorl_abs10_K4_250K_gpu2.log"
1162
  },
1163
  {
@@ -1188,25 +1188,25 @@
1188
  "job_id": 98,
1189
  "name": "as_sorl_abs2_K1_100K",
1190
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 2 --K 1 --num",
1191
- "gpu": -1,
1192
- "status": "pending",
1193
- "elapsed": 0,
1194
- "idle_time": 0,
1195
- "exit_code": -1,
1196
- "retries": 0,
1197
- "log_file": ""
1198
  },
1199
  {
1200
  "job_id": 99,
1201
  "name": "as_sorl_abs2_K4_100K",
1202
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 2 --K 4 --num",
1203
- "gpu": -1,
1204
- "status": "pending",
1205
- "elapsed": 0,
1206
  "idle_time": 0,
1207
- "exit_code": -1,
1208
- "retries": 0,
1209
- "log_file": ""
1210
  },
1211
  {
1212
  "job_id": 100,
 
1
  {
2
+ "timestamp": "2026-04-12 08:07:30",
3
  "total": 120,
4
+ "pending": 84,
5
+ "running": 3,
6
  "done": 15,
7
+ "failed": 18,
8
  "stale": 0,
9
  "retrying": 0,
10
  "jobs": [
 
15
  "gpu": 0,
16
  "status": "done",
17
  "elapsed": 1786,
18
+ "idle_time": 20388,
19
  "exit_code": 0,
20
  "retries": 0,
21
  "log_file": "/tmp/gpu_queue/job_000_add_sub_baseline_10K_gpu0.log"
 
27
  "gpu": 1,
28
  "status": "done",
29
  "elapsed": 2101,
30
+ "idle_time": 20073,
31
  "exit_code": 0,
32
  "retries": 0,
33
  "log_file": "/tmp/gpu_queue/job_001_add_sub_baseline_25K_gpu1.log"
 
39
  "gpu": 2,
40
  "status": "done",
41
  "elapsed": 4753,
42
+ "idle_time": 17421,
43
  "exit_code": 0,
44
  "retries": 0,
45
  "log_file": "/tmp/gpu_queue/job_002_as_sorl_abs10_K1_25K_gpu2.log"
 
51
  "gpu": 0,
52
  "status": "done",
53
  "elapsed": 2366,
54
+ "idle_time": 19808,
55
  "exit_code": 0,
56
  "retries": 0,
57
  "log_file": "/tmp/gpu_queue/job_003_add_sub_baseline_50K_gpu0.log"
 
63
  "gpu": 1,
64
  "status": "done",
65
  "elapsed": 6727,
66
+ "idle_time": 15447,
67
  "exit_code": 0,
68
  "retries": 0,
69
  "log_file": "/tmp/gpu_queue/job_004_as_sorl_abs10_K1_50K_gpu1.log"
 
75
  "gpu": 2,
76
  "status": "done",
77
  "elapsed": 3112,
78
+ "idle_time": 19062,
79
  "exit_code": 0,
80
  "retries": 0,
81
  "log_file": "/tmp/gpu_queue/job_005_add_sub_baseline_100K_gpu2.log"
 
87
  "gpu": 0,
88
  "status": "done",
89
  "elapsed": 8856,
90
+ "idle_time": 11528,
91
  "exit_code": 0,
92
  "retries": 0,
93
  "log_file": "/tmp/gpu_queue/job_006_as_sorl_abs10_K1_100K_gpu0.log"
 
144
  "job_id": 11,
145
  "name": "as_sorl_abs2_K4_500K",
146
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 2 --K 4 --num",
147
+ "gpu": 2,
148
+ "status": "failed",
149
+ "elapsed": 5,
150
+ "idle_time": 1,
151
+ "exit_code": -9,
152
+ "retries": 1,
153
+ "log_file": "/tmp/gpu_queue/job_011_as_sorl_abs2_K4_500K_gpu2.log"
154
  },
155
  {
156
  "job_id": 12,
 
171
  "gpu": 0,
172
  "status": "failed",
173
  "elapsed": 5,
174
+ "idle_time": 1035,
175
  "exit_code": -9,
176
  "retries": 1,
177
  "log_file": "/tmp/gpu_queue/job_013_as_sorl_abs10_K4_500K_gpu0.log"
 
228
  "job_id": 18,
229
  "name": "as_sorl_abs100_K4_500K",
230
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 100 --K 4 --n",
231
+ "gpu": 2,
232
+ "status": "running",
233
+ "elapsed": 1,
234
+ "idle_time": 1,
235
  "exit_code": -1,
236
  "retries": 0,
237
+ "log_file": "/tmp/gpu_queue/job_018_as_sorl_abs100_K4_500K_gpu2.log"
238
  },
239
  {
240
  "job_id": 19,
 
336
  "job_id": 27,
337
  "name": "as_sorl_abs5_K1_25K",
338
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 25000 --abs_vocab 5 --K 1 --num_",
339
+ "gpu": 0,
340
+ "status": "failed",
341
+ "elapsed": 6,
342
+ "idle_time": 5,
343
+ "exit_code": -9,
344
+ "retries": 1,
345
+ "log_file": "/tmp/gpu_queue/job_027_as_sorl_abs5_K1_25K_gpu0.log"
346
  },
347
  {
348
  "job_id": 28,
 
459
  "gpu": 0,
460
  "status": "done",
461
  "elapsed": 4510,
462
+ "idle_time": 15293,
463
  "exit_code": 0,
464
  "retries": 0,
465
  "log_file": "/tmp/gpu_queue/job_037_as_sorl_abs10_K4_25K_gpu0.log"
 
504
  "job_id": 41,
505
  "name": "as_sorl_abs10_K4_50K",
506
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 50000 --abs_vocab 10 --K 4 --num",
507
+ "gpu": 2,
508
+ "status": "failed",
509
+ "elapsed": 6,
510
+ "idle_time": 5,
511
+ "exit_code": -9,
512
+ "retries": 1,
513
+ "log_file": "/tmp/gpu_queue/job_041_as_sorl_abs10_K4_50K_gpu2.log"
514
  },
515
  {
516
  "job_id": 42,
 
531
  "gpu": 0,
532
  "status": "done",
533
  "elapsed": 5510,
534
+ "idle_time": 9779,
535
  "exit_code": 0,
536
  "retries": 0,
537
  "log_file": "/tmp/gpu_queue/job_043_as_sorl_abs50_K4_50K_gpu0.log"
 
552
  "job_id": 45,
553
  "name": "as_sorl_abs10_K4_100K",
554
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 10 --K 4 --nu",
555
+ "gpu": 0,
556
+ "status": "running",
557
+ "elapsed": 1,
558
+ "idle_time": 1,
559
  "exit_code": -1,
560
  "retries": 0,
561
+ "log_file": "/tmp/gpu_queue/job_045_as_sorl_abs10_K4_100K_gpu0.log"
562
  },
563
  {
564
  "job_id": 46,
 
612
  "job_id": 50,
613
  "name": "as_sorl_abs10_K1_zipf10.0_500K",
614
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --al",
615
+ "gpu": 2,
616
+ "status": "failed",
617
+ "elapsed": 6,
618
+ "idle_time": 11,
619
+ "exit_code": -9,
620
+ "retries": 1,
621
+ "log_file": "/tmp/gpu_queue/job_050_as_sorl_abs10_K1_zipf10.0_500K_gpu2.log"
622
  },
623
  {
624
  "job_id": 51,
 
651
  "gpu": 1,
652
  "status": "failed",
653
  "elapsed": 5,
654
+ "idle_time": 13457,
655
  "exit_code": -9,
656
  "retries": 1,
657
  "log_file": "/tmp/gpu_queue/job_053_as_sorl_abs10_K4_zipf10.0_500K_gpu1.log"
 
675
  "gpu": 0,
676
  "status": "failed",
677
  "elapsed": 5,
678
+ "idle_time": 6157,
679
  "exit_code": -9,
680
  "retries": 1,
681
  "log_file": "/tmp/gpu_queue/job_055_as_sorl_abs100_K1_zipf5.0_500K_gpu0.log"
 
708
  "job_id": 58,
709
  "name": "as_sorl_abs100_K4_zipf5.0_500K",
710
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 100 --K 4 --a",
711
+ "gpu": 1,
712
+ "status": "running",
713
+ "elapsed": 1,
714
+ "idle_time": 1,
715
  "exit_code": -1,
716
  "retries": 0,
717
+ "log_file": "/tmp/gpu_queue/job_058_as_sorl_abs100_K4_zipf5.0_500K_gpu1.log"
718
  },
719
  {
720
  "job_id": 59,
 
756
  "job_id": 62,
757
  "name": "as_baseline_50K_1L3H510d",
758
  "cmd": "python -m arithmetic.train --mode baseline --ops add_sub --dataset_size 50000 --num_epochs 20 --n_la",
759
+ "gpu": 0,
760
+ "status": "failed",
761
+ "elapsed": 5,
762
  "idle_time": 0,
763
+ "exit_code": -9,
764
+ "retries": 1,
765
+ "log_file": "/tmp/gpu_queue/job_062_as_baseline_50K_1L3H510d_gpu0.log"
766
  },
767
  {
768
  "job_id": 63,
 
771
  "gpu": 0,
772
  "status": "done",
773
  "elapsed": 5352,
774
+ "idle_time": 6171,
775
  "exit_code": 0,
776
  "retries": 0,
777
  "log_file": "/tmp/gpu_queue/job_063_as_sorl_abs10_K1_50K_1L3H510d_gpu0.log"
 
780
  "job_id": 64,
781
  "name": "as_baseline_100K_1L3H510d",
782
  "cmd": "python -m arithmetic.train --mode baseline --ops add_sub --dataset_size 100000 --num_epochs 20 --n_l",
783
+ "gpu": 1,
784
+ "status": "failed",
785
+ "elapsed": 4,
786
+ "idle_time": 21,
787
+ "exit_code": -15,
788
+ "retries": 1,
789
+ "log_file": "/tmp/gpu_queue/job_064_as_baseline_100K_1L3H510d_gpu1.log"
790
  },
791
  {
792
  "job_id": 65,
793
  "name": "as_sorl_abs10_K1_100K_1L3H510d",
794
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 10 --K 1 --nu",
795
  "gpu": 0,
796
+ "status": "failed",
797
+ "elapsed": 6,
798
+ "idle_time": 12,
799
+ "exit_code": -9,
800
+ "retries": 1,
801
  "log_file": "/tmp/gpu_queue/job_065_as_sorl_abs10_K1_100K_1L3H510d_gpu0.log"
802
  },
803
  {
 
841
  "name": "as_sorl_abs10_K1_500K_1L3H510d",
842
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --nu",
843
  "gpu": 2,
844
+ "status": "failed",
845
+ "elapsed": 2,
846
+ "idle_time": 19,
847
+ "exit_code": -15,
848
+ "retries": 1,
849
  "log_file": "/tmp/gpu_queue/job_069_as_sorl_abs10_K1_500K_1L3H510d_gpu2.log"
850
  },
851
  {
852
  "job_id": 70,
853
  "name": "as_baseline_25K_1L2H256d",
854
  "cmd": "python -m arithmetic.train --mode baseline --ops add_sub --dataset_size 25000 --num_epochs 20 --n_la",
855
+ "gpu": 1,
856
+ "status": "failed",
857
+ "elapsed": 6,
858
+ "idle_time": 11,
859
+ "exit_code": -9,
860
+ "retries": 1,
861
+ "log_file": "/tmp/gpu_queue/job_070_as_baseline_25K_1L2H256d_gpu1.log"
862
  },
863
  {
864
  "job_id": 71,
 
903
  "gpu": 2,
904
  "status": "done",
905
  "elapsed": 2797,
906
+ "idle_time": 14622,
907
  "exit_code": 0,
908
  "retries": 0,
909
  "log_file": "/tmp/gpu_queue/job_074_as_baseline_100K_1L2H256d_gpu2.log"
 
912
  "job_id": 75,
913
  "name": "as_sorl_abs10_K1_100K_1L2H256d",
914
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 10 --K 1 --nu",
915
+ "gpu": 1,
916
+ "status": "failed",
917
+ "elapsed": 6,
918
+ "idle_time": 5,
919
+ "exit_code": -9,
920
+ "retries": 1,
921
+ "log_file": "/tmp/gpu_queue/job_075_as_sorl_abs10_K1_100K_1L2H256d_gpu1.log"
922
  },
923
  {
924
  "job_id": 76,
 
927
  "gpu": 0,
928
  "status": "done",
929
  "elapsed": 4832,
930
+ "idle_time": 1467,
931
  "exit_code": 0,
932
  "retries": 0,
933
  "log_file": "/tmp/gpu_queue/job_076_as_baseline_250K_1L2H256d_gpu0.log"
 
951
  "gpu": 2,
952
  "status": "done",
953
  "elapsed": 7064,
954
+ "idle_time": 11995,
955
  "exit_code": 0,
956
  "retries": 0,
957
  "log_file": "/tmp/gpu_queue/job_078_as_baseline_500K_1L2H256d_gpu2.log"
 
975
  "gpu": 1,
976
  "status": "done",
977
  "elapsed": 1971,
978
+ "idle_time": 13473,
979
  "exit_code": 0,
980
  "retries": 0,
981
  "log_file": "/tmp/gpu_queue/job_080_as_baseline_25K_2L1H128d_gpu1.log"
 
1057
  "name": "as_sorl_abs10_K1_250K_2L1H128d",
1058
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 250000 --abs_vocab 10 --K 1 --nu",
1059
  "gpu": 1,
1060
+ "status": "failed",
1061
+ "elapsed": 5,
1062
+ "idle_time": 27,
1063
+ "exit_code": -9,
1064
+ "retries": 1,
1065
  "log_file": "/tmp/gpu_queue/job_087_as_sorl_abs10_K1_250K_2L1H128d_gpu1.log"
1066
  },
1067
  {
 
1081
  "name": "as_sorl_abs10_K1_500K_2L1H128d",
1082
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --nu",
1083
  "gpu": 1,
1084
+ "status": "failed",
1085
+ "elapsed": 5,
1086
+ "idle_time": 26,
1087
+ "exit_code": -9,
1088
+ "retries": 1,
1089
  "log_file": "/tmp/gpu_queue/job_089_as_sorl_abs10_K1_500K_2L1H128d_gpu1.log"
1090
  },
1091
  {
 
1119
  "gpu": 0,
1120
  "status": "done",
1121
  "elapsed": 3477,
1122
+ "idle_time": 6302,
1123
  "exit_code": 0,
1124
  "retries": 0,
1125
  "log_file": "/tmp/gpu_queue/job_092_as_sorl_abs30_K4_10K_gpu0.log"
 
1153
  "name": "as_sorl_abs10_K4_250K",
1154
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 250000 --abs_vocab 10 --K 4 --nu",
1155
  "gpu": 2,
1156
+ "status": "failed",
1157
+ "elapsed": 5,
1158
+ "idle_time": 23,
1159
+ "exit_code": -9,
1160
+ "retries": 1,
1161
  "log_file": "/tmp/gpu_queue/job_095_as_sorl_abs10_K4_250K_gpu2.log"
1162
  },
1163
  {
 
1188
  "job_id": 98,
1189
  "name": "as_sorl_abs2_K1_100K",
1190
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 2 --K 1 --num",
1191
+ "gpu": 1,
1192
+ "status": "failed",
1193
+ "elapsed": 3,
1194
+ "idle_time": 20,
1195
+ "exit_code": -15,
1196
+ "retries": 1,
1197
+ "log_file": "/tmp/gpu_queue/job_098_as_sorl_abs2_K1_100K_gpu1.log"
1198
  },
1199
  {
1200
  "job_id": 99,
1201
  "name": "as_sorl_abs2_K4_100K",
1202
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 100000 --abs_vocab 2 --K 4 --num",
1203
+ "gpu": 1,
1204
+ "status": "failed",
1205
+ "elapsed": 5,
1206
  "idle_time": 0,
1207
+ "exit_code": -9,
1208
+ "retries": 1,
1209
+ "log_file": "/tmp/gpu_queue/job_099_as_sorl_abs2_K4_100K_gpu1.log"
1210
  },
1211
  {
1212
  "job_id": 100,