#!/usr/bin/env python3
# GPU occupation script: spawns one worker process per target GPU; each worker
# allocates most of that GPU's free memory and then runs an endless matmul
# loop to keep utilization high. Stop with Ctrl+C.
import torch
import multiprocessing
import time
import os
import sys
def occupy_gpu(device_id, memory_fraction=0.90, compute_size=8192):
    """
    Target function for a process to occupy a specific GPU.

    Allocates a large block of the GPU's free memory, then spins forever on
    matrix multiplications to keep utilization high. Runs until the process
    is terminated (or Ctrl+C reaches it).

    Args:
        device_id (int): The ID of the GPU to occupy (e.g., 1 for cuda:1).
        memory_fraction (float): Fraction of free memory to try and allocate;
            clamped into the range 0.0 .. 1.0.
        compute_size (int): Dimension of square matrices for matmul compute load.
                            Larger values increase compute intensity but also use some memory.
    """
    # Compute identifiers BEFORE entering the try-block: the except handlers
    # below reference them, so they must exist even if set_device() raises.
    process_id = os.getpid()
    device = f'cuda:{device_id}'
    try:
        # Ensure this process targets the correct GPU
        torch.cuda.set_device(device_id)
        print(f"[PID {process_id}] Targeting {device}...")

        # Guard against nonsensical fractions (e.g. a typo like 9.0 or -0.5).
        memory_fraction = min(max(memory_fraction, 0.0), 1.0)

        # --- 1. Allocate Memory ---
        # Keep a reference alive for the lifetime of the process so the
        # allocation is never garbage-collected.
        allocated_tensor = None
        try:
            # Get free memory and total memory
            free_mem, total_mem = torch.cuda.mem_get_info(device_id)
            target_alloc_bytes = int(free_mem * memory_fraction)
            print(f"[PID {process_id}] {device}: Total Mem={total_mem/1024**3:.2f} GB, Free Mem={free_mem/1024**3:.2f} GB")
            print(f"[PID {process_id}] {device}: Attempting to allocate ~{target_alloc_bytes/1024**3:.2f} GB ({memory_fraction*100:.0f}% of free)...")
            # float32 = 4 bytes per element; a 1D tensor keeps sizing simple.
            elements_needed = target_alloc_bytes // 4
            if elements_needed > 0:
                allocated_tensor = torch.empty(elements_needed, dtype=torch.float32, device=device)
                # Touch the memory so the allocation actually happens
                # (CUDA allocation can otherwise be lazy).
                allocated_tensor.fill_(1.0)
                torch.cuda.synchronize(device_id)  # Wait for allocation to complete
                # Approximate: PyTorch reserves some overhead on top of this.
                allocated_bytes = allocated_tensor.nelement() * allocated_tensor.element_size()
                print(f"[PID {process_id}] {device}: Successfully allocated tensor using ~{allocated_bytes/1024**3:.2f} GB.")
            else:
                print(f"[PID {process_id}] {device}: Nothing to allocate (no free memory or fraction is 0).")
        except RuntimeError as e:
            print(f"[PID {process_id}] {device}: ERROR allocating memory - {e}. Memory usage might be lower.")
            print(f"[PID {process_id}] {device}: Check if {memory_fraction*100:.0f}% is too high or other processes are using memory.")
            # Continue to compute loop even if memory allocation failed partially or fully

        # --- 2. Run Compute Load ---
        print(f"[PID {process_id}] {device}: Starting compute loop (matmul {compute_size}x{compute_size})...")
        try:
            a = torch.randn(compute_size, compute_size, dtype=torch.float32, device=device)
            b = torch.randn(compute_size, compute_size, dtype=torch.float32, device=device)
        except RuntimeError as e:
            print(f"[PID {process_id}] {device}: ERROR creating compute tensors ({compute_size}x{compute_size}) - {e}.")
            print(f"[PID {process_id}] {device}: GPU might not have enough remaining memory for this compute size. Try reducing 'compute_size'. Exiting process.")
            return  # Exit this process if we can't even create compute tensors

        # Infinite compute loop: the result is discarded, the point is to
        # keep the SMs busy. No sleep -> maximum utilization.
        while True:
            torch.matmul(a, b)
    except KeyboardInterrupt:
        # Ctrl+C in the terminal is delivered to the whole process group;
        # exit quietly instead of dumping a traceback from every child.
        print(f"[PID {process_id}] {device}: Interrupted, exiting.")
    except Exception as e:
        # Top-level boundary for this worker process: log anything unexpected
        # (including CUDA init failures) and let the process end.
        print(f"[PID {process_id}] {device}: UNEXPECTED ERROR - {e}")
if __name__ == "__main__":
    # --- Configuration ---
    # GPU IDs to occupy; [0, 1] targets cuda:0 and cuda:1.
    TARGET_GPU_IDS = [0, 1]
    # Fraction of *free* memory to grab per GPU (0.8 to 0.95 is typical).
    MEMORY_FRACTION_TO_USE = 0.85
    # Dimension for matmul (e.g., 8192, 10240, 12288).
    # Larger = more compute intensive bursts, but uses more temp memory.
    # Adjust based on GPU capability and remaining memory after allocation.
    COMPUTE_MATRIX_DIM = 8192
    # --- End Configuration ---

    # Check CUDA availability and device count
    if not torch.cuda.is_available():
        print("Error: CUDA is not available. Please check your PyTorch installation and CUDA drivers.")
        sys.exit(1)
    num_gpus = torch.cuda.device_count()
    print(f"Found {num_gpus} CUDA devices.")

    # Drop any configured IDs that don't exist on this machine.
    valid_target_gpus = []
    for gpu_id in TARGET_GPU_IDS:
        if gpu_id < 0 or gpu_id >= num_gpus:
            print(f"Warning: GPU ID {gpu_id} is invalid (must be between 0 and {num_gpus-1}). Skipping.")
        else:
            valid_target_gpus.append(gpu_id)
    if not valid_target_gpus:
        print("Error: No valid target GPUs specified or available. Exiting.")
        sys.exit(1)

    print(f"Attempting to occupy GPUs: {valid_target_gpus}")
    print(f"Memory target: {MEMORY_FRACTION_TO_USE*100:.0f}% of free memory per GPU.")
    print(f"Compute load: Matrix multiplication of size {COMPUTE_MATRIX_DIM}x{COMPUTE_MATRIX_DIM}.")
    print("-" * 30)

    # 'spawn' is required for CUDA in child processes ('fork' inherits a
    # half-initialized CUDA context and breaks).
    try:
        multiprocessing.set_start_method('spawn', force=True)
    except RuntimeError:
        print("Note: Could not set multiprocessing start method to 'spawn'. Using default.")

    # One worker process per target GPU.
    processes = []
    for gpu_id in valid_target_gpus:
        p = multiprocessing.Process(target=occupy_gpu, args=(gpu_id, MEMORY_FRACTION_TO_USE, COMPUTE_MATRIX_DIM))
        processes.append(p)
        p.start()

    print("\nProcesses started. Monitor GPU usage with 'nvidia-smi'.")
    print("Press Ctrl+C to stop the script and terminate processes.")

    def _terminate_children():
        # Best-effort cleanup: SIGTERM first, escalate to SIGKILL if a child
        # doesn't exit within 5 seconds.
        for p in processes:
            if p.is_alive():
                p.terminate()  # Send SIGTERM
            p.join(timeout=5)
            if p.is_alive():
                print(f"Process {p.pid} did not terminate gracefully, killing.")
                p.kill()   # Send SIGKILL if necessary
                p.join()   # Wait for kill

    try:
        # Keep the main script alive while child processes run; they only
        # return on error or when terminated.
        for p in processes:
            p.join()
    except KeyboardInterrupt:
        print("\nCtrl+C detected. Terminating GPU occupation processes...")
        _terminate_children()
        print("All processes terminated.")
    except Exception as main_e:
        print(f"An error occurred in the main process: {main_e}")
        _terminate_children()