import logging

import torch
from pytorch_lightning import Callback, Trainer, LightningModule

# We want a logger for each process, not just rank 0.
log = logging.getLogger(__name__)


def l2_promote():
    """Raise cudaLimitMaxL2FetchGranularity to 128 bytes on the current device."""
    import ctypes

    _libcudart = ctypes.CDLL('libcudart.so')
    # Set device limit on the current device.
    # cudaLimitMaxL2FetchGranularity = 0x05
    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
    # Read the limit back and check that the new value took effect.
    _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
    assert pValue.contents.value == 128
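

# Sketch (not in the original file): l2_promote() could be called once per
# process before training starts, e.g.
#
#   if torch.cuda.is_available():
#       l2_promote()
#
# but note the memory caveat recorded in set_affinity() below before enabling it.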


def set_affinity(trainer):
    try:
        # Imported lazily so a missing helper module is handled by the except below.
        from src.utils.gpu_affinity import set_affinity
        nproc_per_node = torch.cuda.device_count()
        affinity = set_affinity(trainer.local_rank, nproc_per_node, 'socket_unique_continuous')
        log.info(f'{trainer.local_rank}: thread affinity: {affinity}')
        # TD [2022-05-07] Somehow calling this causes GPU 0 to allocate an extra ~800MB of
        # memory per GPU (e.g., 6.4GB of extra memory in an 8-GPU setup). H/t Dan.
        # l2_promote()
    except Exception:
        # Affinity setting is best-effort: don't crash training if the helper
        # module is missing or the topology query fails.
        log.warning(f'{trainer.local_rank}: could not set GPU affinity', exc_info=True)
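

# For context: per the NVIDIA source linked in the class docstring below, the
# imported helper pins each process to CPU cores on the socket closest to its
# GPU. A minimal sketch of the same idea using only the standard library
# (Linux-only; os.sched_setaffinity does not exist on all platforms, and this
# naive version ignores the NUMA topology the NVIDIA helper accounts for):
#
#   import os
#   def naive_set_affinity(local_rank, nproc_per_node):
#       cores = sorted(os.sched_getaffinity(0))          # cores we may run on
#       per_proc = max(1, len(cores) // nproc_per_node)  # even split per process
#       chunk = set(cores[local_rank * per_proc:(local_rank + 1) * per_proc])
#       os.sched_setaffinity(0, chunk)
#       return chunk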


class GpuAffinity(Callback):
    """Set GPU affinity (and, if re-enabled, increase the L2 fetch granularity).

    Adapted from
    https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/Transformer-XL
    """

    def setup(self, trainer: Trainer, pl_module: LightningModule, stage=None) -> None:
        set_affinity(trainer)
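

# Usage sketch (not from the original file): register the callback with a
# Lightning Trainer; the Trainer arguments here are illustrative only.
#
#   from pytorch_lightning import Trainer
#   trainer = Trainer(accelerator='gpu', devices=8, callbacks=[GpuAffinity()])
#   trainer.fit(model)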