|
|
import torch |
|
|
import multiprocessing |
|
|
import time |
|
|
import os |
|
|
import sys |
|
|
|
|
|
def occupy_gpu(device_id, memory_fraction=0.90, compute_size=8192):
    """
    Target function for a process to occupy a specific GPU.

    Allocates a large fraction of the GPU's currently-free memory, then spins
    forever in a matrix-multiplication loop to generate compute load. The
    function only returns early if the compute tensors cannot be created;
    otherwise it runs until the process is terminated externally.

    Args:
        device_id (int): The ID of the GPU to occupy (e.g., 1 for cuda:1).
        memory_fraction (float): Fraction of free memory to try and allocate (0.0 to 1.0).
        compute_size (int): Dimension of square matrices for matmul compute load.
            Larger values increase compute intensity but also use some memory.
    """
    # Bind these BEFORE the try block: the generic handler below references
    # them, and in the original code a failure inside torch.cuda.set_device()
    # would have raised a NameError in the handler, masking the real error.
    process_id = os.getpid()
    device = f'cuda:{device_id}'
    try:
        torch.cuda.set_device(device_id)
        print(f"[PID {process_id}] Targeting {device}...")

        # Keep a live reference so the allocation is not garbage-collected.
        allocated_tensor = None
        try:
            # mem_get_info returns (free_bytes, total_bytes) for the device.
            free_mem, total_mem = torch.cuda.mem_get_info(device_id)
            target_alloc_bytes = int(free_mem * memory_fraction)
            print(f"[PID {process_id}] {device}: Total Mem={total_mem/1024**3:.2f} GB, Free Mem={free_mem/1024**3:.2f} GB")
            print(f"[PID {process_id}] {device}: Attempting to allocate ~{target_alloc_bytes/1024**3:.2f} GB ({memory_fraction*100:.0f}% of free)...")

            # Derive the element size from the dtype instead of hard-coding 4.
            bytes_per_element = torch.tensor([], dtype=torch.float32).element_size()
            elements_needed = target_alloc_bytes // bytes_per_element

            allocated_tensor = torch.empty(elements_needed, dtype=torch.float32, device=device)
            # Touch every element so the pages are actually committed.
            allocated_tensor.fill_(1.0)
            torch.cuda.synchronize(device_id)

            allocated_bytes = allocated_tensor.nelement() * allocated_tensor.element_size()
            print(f"[PID {process_id}] {device}: Successfully allocated tensor using ~{allocated_bytes/1024**3:.2f} GB.")
        except RuntimeError as e:
            # Allocation failure is non-fatal: fall through and still generate
            # compute load with whatever memory is available.
            print(f"[PID {process_id}] {device}: ERROR allocating memory - {e}. Memory usage might be lower.")
            print(f"[PID {process_id}] {device}: Check if {memory_fraction*100:.0f}% is too high or other processes are using memory.")

        print(f"[PID {process_id}] {device}: Starting compute loop (matmul {compute_size}x{compute_size})...")

        try:
            a = torch.randn(compute_size, compute_size, dtype=torch.float32, device=device)
            b = torch.randn(compute_size, compute_size, dtype=torch.float32, device=device)
        except RuntimeError as e:
            print(f"[PID {process_id}] {device}: ERROR creating compute tensors ({compute_size}x{compute_size}) - {e}.")
            print(f"[PID {process_id}] {device}: GPU might not have enough remaining memory for this compute size. Try reducing 'compute_size'. Exiting process.")
            return

        iteration = 0
        while True:
            # Result is intentionally discarded; the goal is GPU utilization.
            c = torch.matmul(a, b)
            iteration += 1
            # Synchronize periodically so the CUDA kernel-launch queue stays
            # bounded and the process remains responsive to terminate().
            if iteration % 100 == 0:
                torch.cuda.synchronize(device_id)

    except Exception as e:
        print(f"[PID {process_id}] {device}: UNEXPECTED ERROR - {e}")
|
|
|
|
if __name__ == "__main__":
    # --- Configuration ---
    TARGET_GPU_IDS = [0,1]              # GPUs to occupy
    MEMORY_FRACTION_TO_USE = 0.85       # fraction of each GPU's free memory to grab
    COMPUTE_MATRIX_DIM = 8192           # matmul dimension for the compute load

    if not torch.cuda.is_available():
        print("Error: CUDA is not available. Please check your PyTorch installation and CUDA drivers.")
        sys.exit(1)

    num_gpus = torch.cuda.device_count()
    print(f"Found {num_gpus} CUDA devices.")

    # Filter out GPU IDs that do not exist on this machine.
    valid_target_gpus = []
    for gpu_id in TARGET_GPU_IDS:
        if gpu_id < 0 or gpu_id >= num_gpus:
            print(f"Warning: GPU ID {gpu_id} is invalid (must be between 0 and {num_gpus-1}). Skipping.")
        else:
            valid_target_gpus.append(gpu_id)

    if not valid_target_gpus:
        print("Error: No valid target GPUs specified or available. Exiting.")
        sys.exit(1)

    print(f"Attempting to occupy GPUs: {valid_target_gpus}")
    print(f"Memory target: {MEMORY_FRACTION_TO_USE*100:.0f}% of free memory per GPU.")
    print(f"Compute load: Matrix multiplication of size {COMPUTE_MATRIX_DIM}x{COMPUTE_MATRIX_DIM}.")
    print("-" * 30)

    # CUDA does not survive fork(); 'spawn' gives each child a clean
    # interpreter so it can initialize its own CUDA context.
    try:
        multiprocessing.set_start_method('spawn', force=True)
    except RuntimeError:
        print("Note: Could not set multiprocessing start method to 'spawn'. Using default.")

    # One worker process per target GPU.
    processes = []
    for gpu_id in valid_target_gpus:
        p = multiprocessing.Process(target=occupy_gpu, args=(gpu_id, MEMORY_FRACTION_TO_USE, COMPUTE_MATRIX_DIM))
        processes.append(p)
        p.start()

    print("\nProcesses started. Monitor GPU usage with 'nvidia-smi'.")
    print("Press Ctrl+C to stop the script and terminate processes.")

    try:
        # Workers run forever, so this blocks until Ctrl+C (or a crash).
        for p in processes:
            p.join()
    except KeyboardInterrupt:
        print("\nCtrl+C detected. Terminating GPU occupation processes...")
        for p in processes:
            if p.is_alive():
                p.terminate()
                p.join(timeout=5)
                # Escalate to SIGKILL if terminate() was ignored.
                if p.is_alive():
                    print(f"Process {p.pid} did not terminate gracefully, killing.")
                    p.kill()
                    p.join()
        print("All processes terminated.")
    except Exception as main_e:
        print(f"An error occurred in the main process: {main_e}")
        # Best-effort cleanup: terminate AND reap surviving children so they
        # are not left running (or as zombies) after the main process exits.
        for p in processes:
            if p.is_alive():
                p.terminate()
                p.join(timeout=5)