| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| |
|
| | import subprocess |
| | import time |
| | import threading |
| | import torch |
| | from collections import deque |
| |
|
def get_gpu_details(gpu_id):
    """Query nvidia-smi for one GPU's utilization and memory figures.

    Args:
        gpu_id: Integer index of the GPU to query.

    Returns:
        Tuple ``(utilization_percent, used_memory, total_memory)`` of ints,
        parsed from nvidia-smi's CSV output (memory values are in MiB per
        the ``nounits`` flag).

    Raises:
        subprocess.CalledProcessError: if nvidia-smi exits non-zero.
        FileNotFoundError: if nvidia-smi is not on PATH.
        ValueError: if the output cannot be split into three integers.
    """
    cmd = ['nvidia-smi', '--id=' + str(gpu_id),
           '--query-gpu=utilization.gpu,memory.used,memory.total',
           '--format=csv,noheader,nounits']
    # check=True surfaces nvidia-smi failures directly; without it an error
    # would only show up as a confusing unpack ValueError on empty stdout.
    result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True, check=True)
    utilization, used_memory, total_memory = result.stdout.strip().split(', ')
    return int(utilization), int(used_memory), int(total_memory)
| |
|
def matrix_calculation_task(gpu_id, stop_event, task_running):
    """Busy-loop large matrix multiplications on one GPU until told to stop.

    Args:
        gpu_id: Index of the GPU to occupy (selected via torch.cuda.set_device).
        stop_event: threading.Event; the loop exits once it is set.
        task_running: Shared list of per-GPU booleans; entry ``gpu_id`` is
            True while this task is alive.
    """
    torch.cuda.set_device(gpu_id)
    task_running[gpu_id] = True
    try:
        while not stop_event.is_set():
            # Two 55000x55000 fp32 tensors (~12 GiB each); the matmul result
            # is discarded — this exists purely to generate GPU load.
            a = torch.rand(55000, 55000, device='cuda')
            b = torch.rand(55000, 55000, device='cuda')
            torch.matmul(a, b)
    finally:
        # Reset the flag even if the loop dies (e.g. CUDA OOM); otherwise the
        # monitor would believe a task is still running and never start a
        # replacement for this GPU.
        task_running[gpu_id] = False
| |
|
def monitor_and_manage_gpu(gpu_id, stop_event, task_running):
    """Poll one GPU every second and start an occupying task when it idles.

    Keeps a rolling window of the last 30 utilization samples (~30 s). Once
    the window is full, if average utilization drops below 90% and no task is
    running for this GPU, a matrix_calculation_task worker is started. Runs
    forever; never sets stop_event itself.

    Args:
        gpu_id: Index of the GPU to watch.
        stop_event: threading.Event passed through to the worker thread.
        task_running: Shared list of per-GPU booleans maintained by the worker.
    """
    utilization_data = deque(maxlen=30)  # rolling 30-sample (~30 s) window
    while True:
        utilization, _, _ = get_gpu_details(gpu_id)
        utilization_data.append(utilization)
        if len(utilization_data) == 30:  # act only once the window is full
            avg_utilization = round(sum(utilization_data) / len(utilization_data), 1)
            if avg_utilization < 90 and not task_running[gpu_id]:
                print(f"Average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is underutilized, starting task.")
                stop_event.clear()
                # daemon=True so a lingering worker cannot keep the whole
                # process alive after the main program wants to exit.
                threading.Thread(target=matrix_calculation_task,
                                 args=(gpu_id, stop_event, task_running),
                                 daemon=True).start()
            elif avg_utilization >= 90 and task_running[gpu_id]:
                print(f"Average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is normal, keep running.")
            else:
                if task_running[gpu_id]:
                    print(f"Occupying task just starts, and average GPU {gpu_id} ({avg_utilization}%) is increasing, keep monitoring.")
                else:
                    print(f"No occupying task running, but average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is normal.")
        time.sleep(1)
| |
|
# Script entry: configure shared state and launch one monitor thread per GPU.
num_gpus = 8
stop_events = [threading.Event() for _ in range(num_gpus)]
task_running = [False for _ in range(num_gpus)]

# NOTE(review): the range starts at 1, so GPU 0 never gets a monitor thread —
# presumably it is reserved for other work; confirm this is intentional.
monitor_threads = [
    threading.Thread(target=monitor_and_manage_gpu,
                     args=(gpu_id, stop_events[gpu_id], task_running))
    for gpu_id in range(1, num_gpus)
]
for thread in monitor_threads:
    thread.start()
| |
|