# Source: QWen2.5-eval-NEWA800 / GPU_hunter.py
# Uploaded by Xin-Rui using huggingface_hub (commit a80200a, verified).
import subprocess
import time
import logging
# Configure the root logger: DEBUG level, timestamped records written to a
# log file in the current working directory.
log_file_path = "gpu_hunter.log"
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# The handler is configured explicitly instead of through
# logging.basicConfig(): basicConfig() does nothing once the root logger
# already has a handler, so calling it after addHandler() silently dropped
# both the DEBUG level and the timestamp format.
_file_handler = logging.FileHandler(log_file_path)
_file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
logger.addHandler(_file_handler)
print("start hunting")
def get_gpu_memory_usage(gpu_ids="0,1,2,3"):
    """Query used and total memory for the selected GPUs via ``nvidia-smi``.

    Args:
        gpu_ids: Comma-separated GPU indices passed to ``nvidia-smi -i``.
            Defaults to ``"0,1,2,3"``, the set this script has always watched.

    Returns:
        A list with one ``"<used>, <total>"`` string per non-empty output
        line (CSV without header or units, as requested by the ``--format``
        flag), or ``None`` if ``nvidia-smi`` could not be started or exited
        with a non-zero status.
    """
    try:
        output = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits", "-i", gpu_ids],
            universal_newlines=True,
        )
    except (OSError, subprocess.SubprocessError) as e:
        # OSError: executable missing or not runnable.
        # SubprocessError: non-zero exit status (CalledProcessError).
        print(f"Error getting GPU info: {e}")
        return None
    # Drop blank lines so empty output yields [] rather than [''], which
    # callers would otherwise try to parse as a GPU row.
    return [line.strip() for line in output.splitlines() if line.strip()]
def check_low_usage(threshold=10):
    """Report whether every queried GPU is below a memory-usage threshold.

    Args:
        threshold: Usage percentage (used / total * 100). A GPU at or above
            this value counts as busy.

    Returns:
        True only if GPU data was obtained and every row parsed to a usage
        strictly below ``threshold``. False if the query returned nothing,
        any row could not be parsed, any total is not positive, or any GPU
        is at or above the threshold.
    """
    gpu_data = get_gpu_memory_usage()
    if not gpu_data:
        return False
    for gpu in gpu_data:
        try:
            # Split on the bare comma and strip each field so the parse does
            # not depend on the exact spacing after the separator.
            used, total = (int(field.strip()) for field in gpu.split(','))
        except ValueError:
            # Wrong field count or a non-numeric value: the GPU cannot be
            # shown to be idle, so report "not low" instead of crashing.
            print(f"Unexpected GPU memory row: {gpu!r}")
            return False
        if total <= 0:
            # Avoid dividing by zero on a nonsensical total.
            return False
        usage_percent = (used / total) * 100
        if usage_percent >= threshold:
            return False
    return True
def main():
    """Wait until all GPUs look idle, then run the evaluation script once.

    The loop polls ``check_low_usage``. When a check reports low usage it
    waits one interval and checks again; only if the second check also
    reports low usage is the script launched, after which the loop ends.
    When a check reports the GPUs are busy it prints a message and waits one
    interval before polling again.
    """
    check_interval = 60 * 10  # seconds between checks
    conda_env = "QMath-wxr"
    # Replace with the script that should be executed.
    script_path = "/mnt/lyc/wuxinrui/Qwen2.5-Math/evaluation/remaining_eval/TCMv4_Nratio_copy.sh"
    while True:
        if check_low_usage(threshold=10):
            # Confirm the GPUs are still idle one interval later before
            # launching anything.
            time.sleep(check_interval)
            if check_low_usage(threshold=10):
                print("All GPUs have memory usage below 10%. Executing command...")
                # "conda activate" run through its own subprocess only
                # affects that short-lived shell, so the environment is
                # selected with "conda run" in the same invocation that
                # starts the script.
                try:
                    result = subprocess.run(
                        ["conda", "run", "--no-capture-output", "-n", conda_env,
                         "bash", script_path]
                    )
                except OSError as e:
                    print(f"Failed to launch command: {e}")
                else:
                    if result.returncode != 0:
                        print(f"Command exited with status {result.returncode}.")
                print("Command executed. Exiting GPU monitoring.")
                break  # leave the loop and stop monitoring
        else:
            print("GPUs are in use. Waiting...")
            time.sleep(check_interval)
# Start the monitor only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()