amirali1985 commited on
Commit
d43d5ff
·
verified ·
1 Parent(s): 5f7d914

queue status update

Browse files
Files changed (1) hide show
  1. queue_status.json +41 -41
queue_status.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "timestamp": "2026-04-12 06:22:31",
3
  "total": 98,
4
- "pending": 79,
5
  "running": 5,
6
- "done": 13,
7
- "failed": 1,
8
  "stale": 0,
9
  "retrying": 0,
10
  "jobs": [
@@ -15,7 +15,7 @@
15
  "gpu": 0,
16
  "status": "done",
17
  "elapsed": 1786,
18
- "idle_time": 14088,
19
  "exit_code": 0,
20
  "retries": 0,
21
  "log_file": "/tmp/gpu_queue/job_000_add_sub_baseline_10K_gpu0.log"
@@ -27,7 +27,7 @@
27
  "gpu": 1,
28
  "status": "done",
29
  "elapsed": 2101,
30
- "idle_time": 13773,
31
  "exit_code": 0,
32
  "retries": 0,
33
  "log_file": "/tmp/gpu_queue/job_001_add_sub_baseline_25K_gpu1.log"
@@ -39,7 +39,7 @@
39
  "gpu": 2,
40
  "status": "done",
41
  "elapsed": 4753,
42
- "idle_time": 11122,
43
  "exit_code": 0,
44
  "retries": 0,
45
  "log_file": "/tmp/gpu_queue/job_002_as_sorl_abs10_K1_25K_gpu2.log"
@@ -51,7 +51,7 @@
51
  "gpu": 0,
52
  "status": "done",
53
  "elapsed": 2366,
54
- "idle_time": 13508,
55
  "exit_code": 0,
56
  "retries": 0,
57
  "log_file": "/tmp/gpu_queue/job_003_add_sub_baseline_50K_gpu0.log"
@@ -63,7 +63,7 @@
63
  "gpu": 1,
64
  "status": "done",
65
  "elapsed": 6727,
66
- "idle_time": 9147,
67
  "exit_code": 0,
68
  "retries": 0,
69
  "log_file": "/tmp/gpu_queue/job_004_as_sorl_abs10_K1_50K_gpu1.log"
@@ -75,7 +75,7 @@
75
  "gpu": 2,
76
  "status": "done",
77
  "elapsed": 3112,
78
- "idle_time": 12762,
79
  "exit_code": 0,
80
  "retries": 0,
81
  "log_file": "/tmp/gpu_queue/job_005_add_sub_baseline_100K_gpu2.log"
@@ -87,7 +87,7 @@
87
  "gpu": 0,
88
  "status": "done",
89
  "elapsed": 8856,
90
- "idle_time": 5228,
91
  "exit_code": 0,
92
  "retries": 0,
93
  "log_file": "/tmp/gpu_queue/job_006_as_sorl_abs10_K1_100K_gpu0.log"
@@ -459,7 +459,7 @@
459
  "gpu": 0,
460
  "status": "done",
461
  "elapsed": 4510,
462
- "idle_time": 8993,
463
  "exit_code": 0,
464
  "retries": 0,
465
  "log_file": "/tmp/gpu_queue/job_037_as_sorl_abs10_K4_25K_gpu0.log"
@@ -531,7 +531,7 @@
531
  "gpu": 0,
532
  "status": "done",
533
  "elapsed": 5510,
534
- "idle_time": 3479,
535
  "exit_code": 0,
536
  "retries": 0,
537
  "log_file": "/tmp/gpu_queue/job_043_as_sorl_abs50_K4_50K_gpu0.log"
@@ -651,7 +651,7 @@
651
  "gpu": 1,
652
  "status": "failed",
653
  "elapsed": 5,
654
- "idle_time": 7158,
655
  "exit_code": -9,
656
  "retries": 1,
657
  "log_file": "/tmp/gpu_queue/job_053_as_sorl_abs10_K4_zipf10.0_500K_gpu1.log"
@@ -672,13 +672,13 @@
672
  "job_id": 55,
673
  "name": "as_sorl_abs100_K1_zipf5.0_500K",
674
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 100 --K 1 --a",
675
- "gpu": -1,
676
- "status": "pending",
677
- "elapsed": 0,
678
  "idle_time": 0,
679
- "exit_code": -1,
680
- "retries": 0,
681
- "log_file": ""
682
  },
683
  {
684
  "job_id": 56,
@@ -769,10 +769,10 @@
769
  "name": "as_sorl_abs10_K1_50K_1L3H510d",
770
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 50000 --abs_vocab 10 --K 1 --num",
771
  "gpu": 0,
772
- "status": "running",
773
- "elapsed": 5222,
774
- "idle_time": 84,
775
- "exit_code": -1,
776
  "retries": 0,
777
  "log_file": "/tmp/gpu_queue/job_063_as_sorl_abs10_K1_50K_1L3H510d_gpu0.log"
778
  },
@@ -842,8 +842,8 @@
842
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --nu",
843
  "gpu": 2,
844
  "status": "running",
845
- "elapsed": 8317,
846
- "idle_time": 21,
847
  "exit_code": -1,
848
  "retries": 0,
849
  "log_file": "/tmp/gpu_queue/job_069_as_sorl_abs10_K1_500K_1L3H510d_gpu2.log"
@@ -903,7 +903,7 @@
903
  "gpu": 2,
904
  "status": "done",
905
  "elapsed": 2797,
906
- "idle_time": 8322,
907
  "exit_code": 0,
908
  "retries": 0,
909
  "log_file": "/tmp/gpu_queue/job_074_as_baseline_100K_1L2H256d_gpu2.log"
@@ -924,13 +924,13 @@
924
  "job_id": 76,
925
  "name": "as_baseline_250K_1L2H256d",
926
  "cmd": "python -m arithmetic.train --mode baseline --ops add_sub --dataset_size 250000 --num_epochs 20 --n_l",
927
- "gpu": -1,
928
- "status": "pending",
929
- "elapsed": 0,
930
- "idle_time": 0,
931
  "exit_code": -1,
932
  "retries": 0,
933
- "log_file": ""
934
  },
935
  {
936
  "job_id": 77,
@@ -951,7 +951,7 @@
951
  "gpu": 2,
952
  "status": "done",
953
  "elapsed": 7064,
954
- "idle_time": 5695,
955
  "exit_code": 0,
956
  "retries": 0,
957
  "log_file": "/tmp/gpu_queue/job_078_as_baseline_500K_1L2H256d_gpu2.log"
@@ -975,7 +975,7 @@
975
  "gpu": 1,
976
  "status": "done",
977
  "elapsed": 1971,
978
- "idle_time": 7173,
979
  "exit_code": 0,
980
  "retries": 0,
981
  "log_file": "/tmp/gpu_queue/job_080_as_baseline_25K_2L1H128d_gpu1.log"
@@ -1058,8 +1058,8 @@
1058
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 250000 --abs_vocab 10 --K 1 --nu",
1059
  "gpu": 1,
1060
  "status": "running",
1061
- "elapsed": 7157,
1062
- "idle_time": 3,
1063
  "exit_code": -1,
1064
  "retries": 0,
1065
  "log_file": "/tmp/gpu_queue/job_087_as_sorl_abs10_K1_250K_2L1H128d_gpu1.log"
@@ -1082,8 +1082,8 @@
1082
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --nu",
1083
  "gpu": 1,
1084
  "status": "running",
1085
- "elapsed": 13767,
1086
- "idle_time": 263,
1087
  "exit_code": -1,
1088
  "retries": 0,
1089
  "log_file": "/tmp/gpu_queue/job_089_as_sorl_abs10_K1_500K_2L1H128d_gpu1.log"
@@ -1119,7 +1119,7 @@
1119
  "gpu": 0,
1120
  "status": "done",
1121
  "elapsed": 3477,
1122
- "idle_time": 2,
1123
  "exit_code": 0,
1124
  "retries": 0,
1125
  "log_file": "/tmp/gpu_queue/job_092_as_sorl_abs30_K4_10K_gpu0.log"
@@ -1154,8 +1154,8 @@
1154
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 250000 --abs_vocab 10 --K 4 --nu",
1155
  "gpu": 2,
1156
  "status": "running",
1157
- "elapsed": 5692,
1158
- "idle_time": 4,
1159
  "exit_code": -1,
1160
  "retries": 0,
1161
  "log_file": "/tmp/gpu_queue/job_095_as_sorl_abs10_K4_250K_gpu2.log"
 
1
  {
2
+ "timestamp": "2026-04-12 06:24:53",
3
  "total": 98,
4
+ "pending": 77,
5
  "running": 5,
6
+ "done": 14,
7
+ "failed": 2,
8
  "stale": 0,
9
  "retrying": 0,
10
  "jobs": [
 
15
  "gpu": 0,
16
  "status": "done",
17
  "elapsed": 1786,
18
+ "idle_time": 14231,
19
  "exit_code": 0,
20
  "retries": 0,
21
  "log_file": "/tmp/gpu_queue/job_000_add_sub_baseline_10K_gpu0.log"
 
27
  "gpu": 1,
28
  "status": "done",
29
  "elapsed": 2101,
30
+ "idle_time": 13916,
31
  "exit_code": 0,
32
  "retries": 0,
33
  "log_file": "/tmp/gpu_queue/job_001_add_sub_baseline_25K_gpu1.log"
 
39
  "gpu": 2,
40
  "status": "done",
41
  "elapsed": 4753,
42
+ "idle_time": 11264,
43
  "exit_code": 0,
44
  "retries": 0,
45
  "log_file": "/tmp/gpu_queue/job_002_as_sorl_abs10_K1_25K_gpu2.log"
 
51
  "gpu": 0,
52
  "status": "done",
53
  "elapsed": 2366,
54
+ "idle_time": 13650,
55
  "exit_code": 0,
56
  "retries": 0,
57
  "log_file": "/tmp/gpu_queue/job_003_add_sub_baseline_50K_gpu0.log"
 
63
  "gpu": 1,
64
  "status": "done",
65
  "elapsed": 6727,
66
+ "idle_time": 9290,
67
  "exit_code": 0,
68
  "retries": 0,
69
  "log_file": "/tmp/gpu_queue/job_004_as_sorl_abs10_K1_50K_gpu1.log"
 
75
  "gpu": 2,
76
  "status": "done",
77
  "elapsed": 3112,
78
+ "idle_time": 12904,
79
  "exit_code": 0,
80
  "retries": 0,
81
  "log_file": "/tmp/gpu_queue/job_005_add_sub_baseline_100K_gpu2.log"
 
87
  "gpu": 0,
88
  "status": "done",
89
  "elapsed": 8856,
90
+ "idle_time": 5371,
91
  "exit_code": 0,
92
  "retries": 0,
93
  "log_file": "/tmp/gpu_queue/job_006_as_sorl_abs10_K1_100K_gpu0.log"
 
459
  "gpu": 0,
460
  "status": "done",
461
  "elapsed": 4510,
462
+ "idle_time": 9136,
463
  "exit_code": 0,
464
  "retries": 0,
465
  "log_file": "/tmp/gpu_queue/job_037_as_sorl_abs10_K4_25K_gpu0.log"
 
531
  "gpu": 0,
532
  "status": "done",
533
  "elapsed": 5510,
534
+ "idle_time": 3622,
535
  "exit_code": 0,
536
  "retries": 0,
537
  "log_file": "/tmp/gpu_queue/job_043_as_sorl_abs50_K4_50K_gpu0.log"
 
651
  "gpu": 1,
652
  "status": "failed",
653
  "elapsed": 5,
654
+ "idle_time": 7300,
655
  "exit_code": -9,
656
  "retries": 1,
657
  "log_file": "/tmp/gpu_queue/job_053_as_sorl_abs10_K4_zipf10.0_500K_gpu1.log"
 
672
  "job_id": 55,
673
  "name": "as_sorl_abs100_K1_zipf5.0_500K",
674
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 100 --K 1 --a",
675
+ "gpu": 0,
676
+ "status": "failed",
677
+ "elapsed": 5,
678
  "idle_time": 0,
679
+ "exit_code": -9,
680
+ "retries": 1,
681
+ "log_file": "/tmp/gpu_queue/job_055_as_sorl_abs100_K1_zipf5.0_500K_gpu0.log"
682
  },
683
  {
684
  "job_id": 56,
 
769
  "name": "as_sorl_abs10_K1_50K_1L3H510d",
770
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 50000 --abs_vocab 10 --K 1 --num",
771
  "gpu": 0,
772
+ "status": "done",
773
+ "elapsed": 5352,
774
+ "idle_time": 14,
775
+ "exit_code": 0,
776
  "retries": 0,
777
  "log_file": "/tmp/gpu_queue/job_063_as_sorl_abs10_K1_50K_1L3H510d_gpu0.log"
778
  },
 
842
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --nu",
843
  "gpu": 2,
844
  "status": "running",
845
+ "elapsed": 8459,
846
+ "idle_time": 12,
847
  "exit_code": -1,
848
  "retries": 0,
849
  "log_file": "/tmp/gpu_queue/job_069_as_sorl_abs10_K1_500K_1L3H510d_gpu2.log"
 
903
  "gpu": 2,
904
  "status": "done",
905
  "elapsed": 2797,
906
+ "idle_time": 8464,
907
  "exit_code": 0,
908
  "retries": 0,
909
  "log_file": "/tmp/gpu_queue/job_074_as_baseline_100K_1L2H256d_gpu2.log"
 
924
  "job_id": 76,
925
  "name": "as_baseline_250K_1L2H256d",
926
  "cmd": "python -m arithmetic.train --mode baseline --ops add_sub --dataset_size 250000 --num_epochs 20 --n_l",
927
+ "gpu": 0,
928
+ "status": "running",
929
+ "elapsed": 139,
930
+ "idle_time": 27,
931
  "exit_code": -1,
932
  "retries": 0,
933
+ "log_file": "/tmp/gpu_queue/job_076_as_baseline_250K_1L2H256d_gpu0.log"
934
  },
935
  {
936
  "job_id": 77,
 
951
  "gpu": 2,
952
  "status": "done",
953
  "elapsed": 7064,
954
+ "idle_time": 5837,
955
  "exit_code": 0,
956
  "retries": 0,
957
  "log_file": "/tmp/gpu_queue/job_078_as_baseline_500K_1L2H256d_gpu2.log"
 
975
  "gpu": 1,
976
  "status": "done",
977
  "elapsed": 1971,
978
+ "idle_time": 7316,
979
  "exit_code": 0,
980
  "retries": 0,
981
  "log_file": "/tmp/gpu_queue/job_080_as_baseline_25K_2L1H128d_gpu1.log"
 
1058
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 250000 --abs_vocab 10 --K 1 --nu",
1059
  "gpu": 1,
1060
  "status": "running",
1061
+ "elapsed": 7299,
1062
+ "idle_time": 7,
1063
  "exit_code": -1,
1064
  "retries": 0,
1065
  "log_file": "/tmp/gpu_queue/job_087_as_sorl_abs10_K1_250K_2L1H128d_gpu1.log"
 
1082
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 500000 --abs_vocab 10 --K 1 --nu",
1083
  "gpu": 1,
1084
  "status": "running",
1085
+ "elapsed": 13909,
1086
+ "idle_time": 13,
1087
  "exit_code": -1,
1088
  "retries": 0,
1089
  "log_file": "/tmp/gpu_queue/job_089_as_sorl_abs10_K1_500K_2L1H128d_gpu1.log"
 
1119
  "gpu": 0,
1120
  "status": "done",
1121
  "elapsed": 3477,
1122
+ "idle_time": 145,
1123
  "exit_code": 0,
1124
  "retries": 0,
1125
  "log_file": "/tmp/gpu_queue/job_092_as_sorl_abs30_K4_10K_gpu0.log"
 
1154
  "cmd": "python -m arithmetic.train --mode sorl --ops add_sub --dataset_size 250000 --abs_vocab 10 --K 4 --nu",
1155
  "gpu": 2,
1156
  "status": "running",
1157
+ "elapsed": 5834,
1158
+ "idle_time": 8,
1159
  "exit_code": -1,
1160
  "retries": 0,
1161
  "log_file": "/tmp/gpu_queue/job_095_as_sorl_abs10_K4_250K_gpu2.log"