ArthurY's picture
update source
c3d0544
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
try:
import rmm
RMM_AVAILABLE = True
except ImportError:
RMM_AVAILABLE = False
try:
import cupy
CUPY_AVAILABLE = True
except ImportError:
CUPY_AVAILABLE = False
"""
Using a unified GPU memory provider, we consolidate the pool into just a
single allocator for cupy/rapids and torch. Ideally, we add warp to this someday.
To use this, you need to add the following to your code at or near the top
(before allocating any GPU memory):
```python
from physicsnemo.utils.memory import unified_gpu_memory
```
"""
def srt2bool(val: str) -> bool:
    """Interpret a textual flag (e.g. an environment variable) as a bool.

    Parameters
    ----------
    val : str or bool
        Value to interpret. A ``bool`` is returned unchanged. A string is
        matched case-insensitively, ignoring surrounding whitespace, against
        ``true/1/yes/y`` and ``false/0/no/n``.

    Returns
    -------
    bool
        The boolean the input spells out.

    Raises
    ------
    ValueError
        If ``val`` is a string that is not one of the recognized spellings.
    """
    if isinstance(val, bool):
        return val

    # Normalize once. Stripping tolerates the stray spaces/newlines that
    # easily end up in values exported from shell scripts or job files.
    normalized = val.strip().lower()
    if normalized in ("true", "1", "yes", "y"):
        return True
    if normalized in ("false", "0", "no", "n"):
        return False
    raise ValueError(f"Invalid boolean value: {val}")
# Opt-out switch read once at import time: when the PHYSICSNEMO_DISABLE_RMM
# environment variable parses as true, the RMM allocator setup below is
# skipped. An unset variable falls back to the boolean default False.
DISABLE_RMM = srt2bool(os.environ.get("PHYSICSNEMO_DISABLE_RMM", False))
def _setup_unified_gpu_memory():
    """Route torch (and cupy, if installed) GPU allocations through RMM.

    Does nothing when RMM is not importable or when ``DISABLE_RMM`` is set.
    Otherwise it reinitializes RMM with a pool allocator (initial pool size
    ``"1024MB"``) on the device given by this process's local rank, installs
    the RMM allocator into torch when CUDA is available, and installs the
    RMM allocator into cupy when cupy is importable.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``PHYSICSNEMO_DISTRIBUTED_INITIALIZATION_METHOD`` names an unknown
        method, or names a known method whose rank variable is not set.
    """
    # Skip if RMM is missing or explicitly disabled
    if not RMM_AVAILABLE or DISABLE_RMM:
        return

    # Determine the local rank first so that we allocate on the right device.
    local_rank = _resolve_local_rank()

    # Initialize RMM
    rmm.reinitialize(
        pool_allocator=True, devices=local_rank, initial_pool_size="1024MB"
    )

    # Set PyTorch allocator if available
    from rmm.allocators.torch import rmm_torch_allocator

    if torch.cuda.is_available():
        torch.cuda.memory.change_current_allocator(rmm_torch_allocator)

    # Set CuPy allocator if available
    if CUPY_AVAILABLE:
        from rmm.allocators.cupy import rmm_cupy_allocator

        cupy.cuda.set_allocator(rmm_cupy_allocator)


def _resolve_local_rank():
    """Return this process's local rank as read from the environment.

    The variables are meant to be tested in the same order as
    DistributedManager. We can't actually initialize it, though, since we
    have to unify mallocs before torch init.

    Returns
    -------
    int
        The local rank; ``0`` when no method is forced and no launcher
        variable is set.

    Raises
    ------
    ValueError
        If the forced initialization method is unknown, or if the variable
        that method relies on is not present in the environment.
    """
    init_method = os.environ.get("PHYSICSNEMO_DISTRIBUTED_INITIALIZATION_METHOD")

    if init_method is None:
        # No method forced: take the first launcher variable that is set,
        # defaulting to rank 0.
        for rank_var in ("LOCAL_RANK", "OMPI_COMM_WORLD_LOCAL_RANK", "SLURM_LOCALID"):
            raw_rank = os.environ.get(rank_var)
            if raw_rank is not None:
                return int(raw_rank)
        return 0

    rank_var_by_method = {
        "ENV": "LOCAL_RANK",
        "SLURM": "SLURM_LOCALID",
        "OPENMPI": "OMPI_COMM_WORLD_LOCAL_RANK",
    }
    rank_var = rank_var_by_method.get(init_method)
    if rank_var is None:
        raise ValueError(f"Unknown initialization method: {init_method}")

    raw_rank = os.environ.get(rank_var)
    if raw_rank is None:
        # Fail with an actionable message instead of the opaque
        # "int() argument must be ... not 'NoneType'" TypeError.
        raise ValueError(
            f"Initialization method {init_method} requires the environment "
            f"variable {rank_var}, but it is not set"
        )
    return int(raw_rank)
# This is what gets executed when someone does "from memory import unified_gpu_memory"
def __getattr__(name):
    """Module-level attribute hook that triggers the allocator setup.

    Looking up ``unified_gpu_memory`` on this module runs
    ``_setup_unified_gpu_memory()`` and yields its return value. Any other
    missing attribute raises ``AttributeError`` as usual.

    Parameters
    ----------
    name : str
        Name of the attribute that was not found in the module namespace.

    Raises
    ------
    AttributeError
        If ``name`` is anything other than ``"unified_gpu_memory"``.
    """
    if name == "unified_gpu_memory":
        result = _setup_unified_gpu_memory()
        # Store the attribute in the module namespace so later lookups
        # resolve normally and never reach this hook again. Without this,
        # every additional import of the name would re-run the setup and
        # reinitialize the allocator after memory may already be in use.
        globals()[name] = result
        return result
    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")