import torch
import multiprocessing
import os
import sys


def occupy_gpu(device_id, memory_fraction=0.90, compute_size=8192):
    """
    Target function for a process to occupy a specific GPU.

    Args:
        device_id (int): The ID of the GPU to occupy (e.g., 1 for cuda:1).
        memory_fraction (float): Fraction of free memory to try to allocate (0.0 to 1.0).
        compute_size (int): Dimension of the square matrices used for the matmul
            compute load. Larger values increase compute intensity but also use
            some additional memory.
    """
    # Define these before the try block so the except handler below can
    # reference them even if torch.cuda.set_device() itself fails.
    process_id = os.getpid()
    device = f'cuda:{device_id}'
    try:
        # Ensure this process targets the correct GPU
        torch.cuda.set_device(device_id)
        print(f"[PID {process_id}] Targeting {device}...")

        # --- 1. Allocate Memory ---
        allocated_tensor = None
        try:
            # Query free and total memory on this device (in bytes)
            free_mem, total_mem = torch.cuda.mem_get_info(device_id)
            target_alloc_bytes = int(free_mem * memory_fraction)
            print(f"[PID {process_id}] {device}: Total Mem={total_mem/1024**3:.2f} GB, Free Mem={free_mem/1024**3:.2f} GB")
            print(f"[PID {process_id}] {device}: Attempting to allocate ~{target_alloc_bytes/1024**3:.2f} GB ({memory_fraction*100:.0f}% of free)...")

            # Calculate tensor size (float32 = 4 bytes per element)
            elements_needed = target_alloc_bytes // 4

            # Create a 1D tensor, since its size is simplest to calculate
            allocated_tensor = torch.empty(elements_needed, dtype=torch.float32, device=device)
            # Fill it with data to force the allocation to actually happen
            # (allocation can otherwise be lazy)
            allocated_tensor.fill_(1.0)
            torch.cuda.synchronize(device_id)  # Wait for the allocation to complete

            # Report allocated memory (approximate, as PyTorch reserves some overhead)
            allocated_bytes = allocated_tensor.nelement() * allocated_tensor.element_size()
            print(f"[PID {process_id}] {device}: Successfully allocated tensor using ~{allocated_bytes/1024**3:.2f} GB.")
            # The reference to allocated_tensor keeps the memory alive
        except RuntimeError as e:
            print(f"[PID {process_id}] {device}: ERROR allocating memory - {e}. Actual usage may be lower than requested.")
            print(f"[PID {process_id}] {device}: Check whether {memory_fraction*100:.0f}% is too high or other processes are using memory.")
            # Continue to the compute loop even if allocation failed partially or fully

        # --- 2. Run Compute Load ---
        print(f"[PID {process_id}] {device}: Starting compute loop (matmul {compute_size}x{compute_size})...")
        # Create tensors for computation
        try:
            a = torch.randn(compute_size, compute_size, dtype=torch.float32, device=device)
            b = torch.randn(compute_size, compute_size, dtype=torch.float32, device=device)
        except RuntimeError as e:
            print(f"[PID {process_id}] {device}: ERROR creating compute tensors ({compute_size}x{compute_size}) - {e}.")
            print(f"[PID {process_id}] {device}: The GPU may not have enough memory left for this compute size. Try reducing 'compute_size'. Exiting process.")
            return  # Exit this process if we can't even create the compute tensors

        # Infinite compute loop
        while True:
            # Perform a compute-intensive operation
            c = torch.matmul(a, b)
            # Optional: add more operations if matmul alone isn't maxing out utilization
            # a = a * 1.0001  # Avoid values growing too large/small quickly
            # b = b + 0.0001
            # torch.cuda.synchronize(device_id)  # Usually not needed in a tight loop like this
            # We don't need to do anything with 'c'; the goal is just the computation.
            # No sleep here, we want maximum utilization.
    except Exception as e:
        print(f"[PID {process_id}] {device}: UNEXPECTED ERROR - {e}")  # Log any other errors


if __name__ == "__main__":
    # --- Configuration ---
    TARGET_GPU_IDS = [0, 1]        # <<< Your target GPU IDs here (e.g., [0, 1] for cuda:0 and cuda:1)
    MEMORY_FRACTION_TO_USE = 0.85  # <<< Try to use 85% of *free* memory. Adjust if needed (0.8 to 0.95 is typical)
    COMPUTE_MATRIX_DIM = 8192      # <<< Dimension for matmul (e.g., 8192, 10240, 12288).
                                   #     Larger = more compute-intensive bursts, but uses more temporary memory.
                                   #     Adjust based on GPU capability and memory remaining after allocation.
    # --- End Configuration ---

    # Check CUDA availability and device count
    if not torch.cuda.is_available():
        print("Error: CUDA is not available. Please check your PyTorch installation and CUDA drivers.")
        sys.exit(1)

    num_gpus = torch.cuda.device_count()
    print(f"Found {num_gpus} CUDA devices.")

    valid_target_gpus = []
    for gpu_id in TARGET_GPU_IDS:
        if gpu_id < 0 or gpu_id >= num_gpus:
            print(f"Warning: GPU ID {gpu_id} is invalid (must be between 0 and {num_gpus - 1}). Skipping.")
        else:
            valid_target_gpus.append(gpu_id)

    if not valid_target_gpus:
        print("Error: No valid target GPUs specified or available. Exiting.")
        sys.exit(1)

    print(f"Attempting to occupy GPUs: {valid_target_gpus}")
    print(f"Memory target: {MEMORY_FRACTION_TO_USE*100:.0f}% of free memory per GPU.")
    print(f"Compute load: Matrix multiplication of size {COMPUTE_MATRIX_DIM}x{COMPUTE_MATRIX_DIM}.")
    print("-" * 30)

    # Use the 'spawn' start method: CUDA cannot be re-initialized in a forked
    # child process, so each worker must start with a fresh interpreter.
    try:
        multiprocessing.set_start_method('spawn', force=True)
    except RuntimeError:
        print("Note: Could not set multiprocessing start method to 'spawn'. Using default.")

    processes = []
    for gpu_id in valid_target_gpus:
        p = multiprocessing.Process(target=occupy_gpu,
                                    args=(gpu_id, MEMORY_FRACTION_TO_USE, COMPUTE_MATRIX_DIM))
        processes.append(p)
        p.start()

    print("\nProcesses started. Monitor GPU usage with 'nvidia-smi'.")
    print("Press Ctrl+C to stop the script and terminate the processes.")

    try:
        # Keep the main script alive while the child processes run
        for p in processes:
            p.join()  # Blocks forever unless a child errors or is terminated
    except KeyboardInterrupt:
        print("\nCtrl+C detected. Terminating GPU occupation processes...")
        for p in processes:
            if p.is_alive():
                p.terminate()      # Send SIGTERM
                p.join(timeout=5)  # Wait up to 5 seconds for a graceful exit
                if p.is_alive():
                    print(f"Process {p.pid} did not terminate gracefully, killing.")
                    p.kill()       # Send SIGKILL if necessary
                    p.join()       # Wait for the kill to complete
        print("All processes terminated.")
    except Exception as main_e:
        print(f"An error occurred in the main process: {main_e}")
        # Clean up child processes here too
        for p in processes:
            if p.is_alive():
                p.terminate()
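# ---------------------------------------------------------------------------
# Note: a minimal alternative sketch, assuming PyTorch >= 1.8. Instead of
# pre-filling memory with a large tensor, the caching allocator can be capped
# per process:
#
#     torch.cuda.set_per_process_memory_fraction(0.85, device=0)
#
# This only limits how much the process *may* allocate; it does not reserve
# memory up front, so the torch.empty()/fill_() approach above is still what
# makes the memory show as "in use" in nvidia-smi.
# ---------------------------------------------------------------------------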