# File size: 1,903 Bytes
# Revision: a80200a
import subprocess
import time
import logging
# Root-logger setup: send DEBUG-and-above records to gpu_hunter.log with a
# timestamp prefix.
#
# The handler, formatter and level are configured explicitly rather than via
# logging.basicConfig(): basicConfig() silently does nothing when the root
# logger already has a handler, so attaching a FileHandler first and calling
# basicConfig() afterwards leaves the level and format unapplied.
logger = logging.getLogger()
log_file_path = "gpu_hunter.log"
_file_handler = logging.FileHandler(log_file_path)
_file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
logger.addHandler(_file_handler)
logger.setLevel(logging.DEBUG)
print("start hunting")
def get_gpu_memory_usage(gpu_ids="0,1,2,3"):
    """Query ``nvidia-smi`` for the used and total memory of selected GPUs.

    Args:
        gpu_ids: GPU selection passed to ``nvidia-smi -i``. Either a
            comma-separated string (default ``"0,1,2,3"``, the value that
            used to be hard-coded) or an iterable of indices. ``None`` omits
            the ``-i`` option altogether.

    Returns:
        A list with one stripped output line per GPU (the
        ``memory.used,memory.total`` fields in ``csv,noheader,nounits``
        format), an empty list if ``nvidia-smi`` printed nothing, or
        ``None`` if the query could not be run.
    """
    cmd = [
        "nvidia-smi",
        "--query-gpu=memory.used,memory.total",
        "--format=csv,noheader,nounits",
    ]
    if gpu_ids is not None:
        if not isinstance(gpu_ids, str):
            gpu_ids = ",".join(str(index) for index in gpu_ids)
        cmd += ["-i", gpu_ids]

    try:
        output = subprocess.check_output(cmd, universal_newlines=True)
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError) as e:
        # Missing binary, non-zero exit status or undecodable output: report
        # it and let the caller treat the GPUs as "state unknown".
        print(f"Error getting GPU info: {e}")
        return None

    # Drop blank lines so empty output yields [] instead of [''], which the
    # caller would fail to parse as integers.
    return [line.strip() for line in output.splitlines() if line.strip()]
def check_low_usage(threshold=10):
    """Report whether every queried GPU is below a memory-usage threshold.

    Args:
        threshold: Memory usage limit in percent. A GPU counts as busy when
            ``used / total * 100`` is greater than or equal to this value.

    Returns:
        ``True`` only if ``get_gpu_memory_usage()`` returned at least one
        line and every line parsed to a usage strictly below ``threshold``.
        ``False`` if the query failed or returned nothing, if any line could
        not be parsed, if a GPU reports a non-positive total, or if any GPU
        is at or above the threshold.
    """
    gpu_data = get_gpu_memory_usage()
    if not gpu_data:
        return False

    for gpu in gpu_data:
        try:
            # int() tolerates surrounding whitespace, so splitting on a bare
            # comma accepts both "used, total" and "used,total". A wrong
            # field count or a non-numeric field (e.g. "[N/A]") raises
            # ValueError here.
            used, total = (int(field) for field in gpu.split(','))
        except ValueError:
            print(f"Could not parse GPU memory line: {gpu!r}")
            return False

        # A non-positive total makes the percentage meaningless; treat the
        # GPU as busy instead of dividing by zero.
        if total <= 0:
            return False

        usage_percent = (used / total) * 100
        if usage_percent >= threshold:
            return False

    return True
def main():
    """Poll GPU memory usage and launch the evaluation script once idle.

    The GPUs must be seen below the usage threshold on two checks separated
    by ``check_interval`` seconds before the script is started. After the
    script finishes the function returns, so the job runs at most once.

    Raises:
        OSError: If the ``conda`` executable cannot be started.
    """
    check_interval = 60 * 10  # seconds between checks
    threshold = 10  # percent of GPU memory in use
    conda_env = "QMath-wxr"
    script_path = "/mnt/lyc/wuxinrui/Qwen2.5-Math/evaluation/remaining_eval/TCMv4_Nratio_copy.sh"

    # `conda activate` run through its own subprocess only changes that
    # short-lived shell and never reaches the process that runs the script.
    # `conda run -n <env>` executes the script inside the environment in a
    # single process; --no-capture-output keeps the script's output streaming.
    command_to_run = [
        "conda", "run", "-n", conda_env, "--no-capture-output",
        "bash", script_path,
    ]

    while True:
        if not check_low_usage(threshold=threshold):
            print("GPUs are in use. Waiting...")
            time.sleep(check_interval)
            continue

        # First reading was low: wait one interval and confirm, so a brief
        # gap between other jobs does not trigger the launch.
        time.sleep(check_interval)
        if not check_low_usage(threshold=threshold):
            continue

        print(f"All GPUs have memory usage below {threshold}%. Executing command...")
        result = subprocess.run(command_to_run)
        if result.returncode != 0:
            print(f"Command exited with status {result.returncode}.")
        print("Command executed. Exiting GPU monitoring.")
        break  # stop monitoring once the command has run
# Run the watcher only when the file is executed as a script.
if __name__ == "__main__":
    main()