Spaces:
Runtime error
Runtime error
Yixin Liu committed on
Commit ·
5f30d29
1
Parent(s): 1782b42
upload
Browse files
main.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
# import imp
|
|
|
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
import numpy as np
|
|
@@ -18,6 +19,9 @@ import time
|
|
| 18 |
# title
|
| 19 |
st.title("Exp Command Generator")
|
| 20 |
|
|
|
|
|
|
|
|
|
|
| 21 |
## 检查框
|
| 22 |
debug = st.checkbox("Debug:选择则会串行地执行命令", value=True)
|
| 23 |
# st.write(f"checkbox的值是{res}")
|
|
@@ -41,19 +45,26 @@ lr=5e-5""")
|
|
| 41 |
|
| 42 |
## gpu 相关参数
|
| 43 |
gpu_list = st.multiselect("multi select", range(10), [1, 2, 3, 4, 5, 6, 7, 8, 9])
|
| 44 |
-
print(gpu_list)
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
sleep_time_after_loading_task= st.number_input("加载任务后等待秒数", value=20, min_value=0,step=5)
|
| 48 |
all_full_sleep_time = st.number_input("全满之后等待秒数", value=20, min_value=0,step=5)
|
| 49 |
|
| 50 |
gpu_list_str = ' '.join([str(i) for i in gpu_list])
|
| 51 |
-
gpu_hyper =
|
| 52 |
gpu_hyper+=f"allow_gpu_memory_threshold={allow_gpu_memory_threshold}\n"
|
| 53 |
gpu_hyper+=f"gpu_threshold={gpu_threshold}\n"
|
| 54 |
gpu_hyper+=f"sleep_time_after_loading_task={sleep_time_after_loading_task}s\n"
|
| 55 |
gpu_hyper+=f"all_full_sleep_time={all_full_sleep_time}s\n"
|
| 56 |
gpu_hyper+=f"gpunum={len(gpu_list)}\n"
|
|
|
|
| 57 |
|
| 58 |
main_loop = st.text_area("Main loop", """for lambda_1 in 1 3;do
|
| 59 |
for lambda_2 in 1 10;do
|
|
@@ -83,7 +94,6 @@ if g:
|
|
| 83 |
s += gpu_hyper + "\n\n"
|
| 84 |
s += hyper_loop + "\n\n"
|
| 85 |
s += """
|
| 86 |
-
i=0 # we search from the first gpu
|
| 87 |
while true; do
|
| 88 |
gpu_id=${gpu[$i]}
|
| 89 |
# nvidia-smi --query-gpu=utilization.gpu --format=csv -i 2 | grep -Eo "[0-9]+"
|
|
@@ -103,6 +113,10 @@ while true; do
|
|
| 103 |
done
|
| 104 |
|
| 105 |
gpu_id=${gpu[$i]}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $gpu_id | grep -Eo "[0-9]+")
|
| 107 |
gpu_u=$(nvidia-smi --query-gpu=utilization.gpu --format=csv -i $gpu_id | grep -Eo "[0-9]+")
|
| 108 |
export CUDA_VISIBLE_DEVICES=$gpu_id
|
|
@@ -113,7 +127,10 @@ echo "use gpu id is ${gpu[$i]}, free memory is $free_mem, it utilization is ${gp
|
|
| 113 |
s += "echo ==========================================================================================\n"
|
| 114 |
if debug:
|
| 115 |
s += "$com\n"
|
|
|
|
|
|
|
| 116 |
else:
|
|
|
|
| 117 |
s += "mkdir -p ./logs/\n"
|
| 118 |
s += "nohup $com > ./logs/$exp_name-$RANDOM.log 2>&1 &\n"
|
| 119 |
s += """echo "sleep for $sleep_time_after_loading_task to wait the task loaded"
|
|
|
|
| 1 |
# import imp
|
| 2 |
+
from email.policy import default
|
| 3 |
import streamlit as st
|
| 4 |
import pandas as pd
|
| 5 |
import numpy as np
|
|
|
|
| 19 |
# title
|
| 20 |
st.title("Exp Command Generator")
|
| 21 |
|
| 22 |
+
# experiment mode
|
| 23 |
+
exp_mode = st.selectbox("Select Experiment Mode", ["OneExpOnecard", "MultipleExpOnecard"],key="MultipleExpOnecard")
|
| 24 |
+
|
| 25 |
## 检查框
|
| 26 |
debug = st.checkbox("Debug:选择则会串行地执行命令", value=True)
|
| 27 |
# st.write(f"checkbox的值是{res}")
|
|
|
|
| 45 |
|
| 46 |
## gpu 相关参数
|
| 47 |
gpu_list = st.multiselect("multi select", range(10), [1, 2, 3, 4, 5, 6, 7, 8, 9])
|
| 48 |
+
# print(gpu_list)
|
| 49 |
+
if exp_mode == "OneExpOnecard":
|
| 50 |
+
allow_gpu_memory_threshold_default = 20000
|
| 51 |
+
gpu_threshold_default = 1
|
| 52 |
+
elif exp_mode == "MultipleExpOnecard":
|
| 53 |
+
allow_gpu_memory_threshold_default = 5000
|
| 54 |
+
gpu_threshold_default = 70
|
| 55 |
+
allow_gpu_memory_threshold = st.number_input("最小单卡剩余容量", value=allow_gpu_memory_threshold_default, min_value=0, max_value=30000, step=1000)
|
| 56 |
+
gpu_threshold = st.number_input("最大单卡利用率", value=gpu_threshold_default, min_value=0, max_value=100, step=10)
|
| 57 |
sleep_time_after_loading_task= st.number_input("加载任务后等待秒数", value=20, min_value=0,step=5)
|
| 58 |
all_full_sleep_time = st.number_input("全满之后等待秒数", value=20, min_value=0,step=5)
|
| 59 |
|
| 60 |
gpu_list_str = ' '.join([str(i) for i in gpu_list])
|
| 61 |
+
gpu_hyper = "gpu=$\{#gpu[@]}\n"
|
| 62 |
gpu_hyper+=f"allow_gpu_memory_threshold={allow_gpu_memory_threshold}\n"
|
| 63 |
gpu_hyper+=f"gpu_threshold={gpu_threshold}\n"
|
| 64 |
gpu_hyper+=f"sleep_time_after_loading_task={sleep_time_after_loading_task}s\n"
|
| 65 |
gpu_hyper+=f"all_full_sleep_time={all_full_sleep_time}s\n"
|
| 66 |
gpu_hyper+=f"gpunum={len(gpu_list)}\n"
|
| 67 |
+
gpu_hyper+="i=0\n"
|
| 68 |
|
| 69 |
main_loop = st.text_area("Main loop", """for lambda_1 in 1 3;do
|
| 70 |
for lambda_2 in 1 10;do
|
|
|
|
| 94 |
s += gpu_hyper + "\n\n"
|
| 95 |
s += hyper_loop + "\n\n"
|
| 96 |
s += """
|
|
|
|
| 97 |
while true; do
|
| 98 |
gpu_id=${gpu[$i]}
|
| 99 |
# nvidia-smi --query-gpu=utilization.gpu --format=csv -i 2 | grep -Eo "[0-9]+"
|
|
|
|
| 113 |
done
|
| 114 |
|
| 115 |
gpu_id=${gpu[$i]}
|
| 116 |
+
# search from the next gpu
|
| 117 |
+
i=`expr $i + 1`
|
| 118 |
+
i=`expr $i % $gpunum`
|
| 119 |
+
|
| 120 |
free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $gpu_id | grep -Eo "[0-9]+")
|
| 121 |
gpu_u=$(nvidia-smi --query-gpu=utilization.gpu --format=csv -i $gpu_id | grep -Eo "[0-9]+")
|
| 122 |
export CUDA_VISIBLE_DEVICES=$gpu_id
|
|
|
|
| 127 |
s += "echo ==========================================================================================\n"
|
| 128 |
if debug:
|
| 129 |
s += "$com\n"
|
| 130 |
+
s += "# mkdir -p ./logs/\n"
|
| 131 |
+
s += "# nohup $com > ./logs/$exp_name-$RANDOM.log 2>&1 &\n"
|
| 132 |
else:
|
| 133 |
+
s += "# $com\n"
|
| 134 |
s += "mkdir -p ./logs/\n"
|
| 135 |
s += "nohup $com > ./logs/$exp_name-$RANDOM.log 2>&1 &\n"
|
| 136 |
s += """echo "sleep for $sleep_time_after_loading_task to wait the task loaded"
|