""" |
|
|
NCCL Memory Conflict Resolution |
|
|
|
|
|
This script addresses the "illegal memory access" error in multi-GPU training |
|
|
by implementing memory management strategies and fallback mechanisms. |
|
|
""" |
|
|
|
|
|
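
# Typical invocation (assumed filename; the script is self-contained):
#   python nccl_memory_fix.py    # checks and clears GPU memory, then exports the NCCL settings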

import gc
import logging
import os
import subprocess

import torch
import torch.distributed as dist
from typing import Any, Dict

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def check_gpu_memory_usage() -> Dict[int, Dict[str, float]]:
    """Check current GPU memory usage."""
    memory_info = {}
    for i in range(torch.cuda.device_count()):
        allocated = torch.cuda.memory_allocated(i) / 1024**3
        reserved = torch.cuda.memory_reserved(i) / 1024**3
        total = torch.cuda.get_device_properties(i).total_memory / 1024**3
        free = total - reserved

        memory_info[i] = {
            'allocated': allocated,
            'reserved': reserved,
            'total': total,
            'free': free,
            'usage_percent': (reserved / total) * 100,
        }

        logger.info(f"GPU {i}: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved, "
                    f"{free:.1f}GB free ({memory_info[i]['usage_percent']:.1f}% used)")

    return memory_info


def clear_gpu_memory():
    """Clear GPU memory and cache on every visible device."""
    logger.info("Clearing GPU memory...")
    for i in range(torch.cuda.device_count()):
        torch.cuda.set_device(i)
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

    # Collect unreachable Python objects so their CUDA allocations can actually be released.
    gc.collect()

    logger.info("GPU memory cleared")


def kill_competing_processes():
    """Kill processes that might be holding GPU memory."""
    try:
        # Ask nvidia-smi which compute processes are currently using GPU memory.
        result = subprocess.run(['nvidia-smi', '--query-compute-apps=pid,process_name,used_memory',
                                 '--format=csv,noheader,nounits'],
                                capture_output=True, text=True)

        if result.returncode == 0:
            lines = result.stdout.strip().split('\n')
            for line in lines:
                if line.strip():
                    parts = line.split(', ')
                    if len(parts) >= 3:
                        pid, name, memory = parts[0], parts[1], parts[2]
                        # Target llama processes, or anything holding more than ~1GB.
                        if 'llama' in name.lower() or (memory.isdigit() and int(memory) > 1000):
                            logger.info(f"Found competing process: {name} (PID: {pid}, Memory: {memory}MB)")
                            try:
                                subprocess.run(['kill', '-9', pid], check=True)
                                logger.info(f"Killed process {pid}")
                            except subprocess.CalledProcessError:
                                logger.warning(f"Could not kill process {pid}")

    except Exception as e:
        logger.warning(f"Could not check/kill competing processes: {e}")


def setup_nccl_environment():
    """Set conservative NCCL environment variables (must run before NCCL is initialized)."""
    nccl_env = {
        'NCCL_DEBUG': 'INFO',                     # verbose NCCL logging for diagnosis
        'NCCL_IB_DISABLE': '1',                   # disable the InfiniBand transport
        'NCCL_P2P_DISABLE': '1',                  # disable direct GPU peer-to-peer transfers
        'NCCL_SHM_DISABLE': '0',                  # keep the shared-memory transport enabled
        'NCCL_SOCKET_IFNAME': 'enp6s18',          # network interface for NCCL sockets (machine-specific)
        'NCCL_NET_GDR_LEVEL': '0',                # effectively disable GPUDirect RDMA
        'NCCL_CROSS_NIC': '0',                    # keep each ring on a single NIC
        'NCCL_ALGO': 'Ring',                      # restrict collectives to the ring algorithm
        'CUDA_LAUNCH_BLOCKING': '1',              # synchronous launches so CUDA errors surface at the faulting call
        'TORCH_NCCL_ASYNC_ERROR_HANDLING': '1',   # raise NCCL errors instead of hanging
        'TOKENIZERS_PARALLELISM': 'false',        # silence the HF tokenizers fork warning
    }

    for key, value in nccl_env.items():
        os.environ[key] = value
        logger.info(f"Set {key}={value}")


def create_memory_efficient_config(base_config: Dict[str, Any]) -> Dict[str, Any]:
    """Derive a memory-efficient training configuration from a base config."""
    memory_config = base_config.copy()

    # Conservative overrides that trade throughput for a smaller memory footprint.
    memory_config.update({
        'per_device_train_batch_size': 1,
        'gradient_accumulation_steps': 8,
        'eval_batch_size': 1,
        'max_seq_length': 512,
        'fp16': True,
        'bf16': False,
        'pin_memory': False,
        'num_workers': 0,
    })

    logger.info("Created memory-efficient configuration")
    return memory_config
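

# Example (illustrative): shrink an existing config before retrying a run that hit OOM.
#   base = {'per_device_train_batch_size': 4, 'max_seq_length': 2048, 'bf16': True}
#   safe = create_memory_efficient_config(base)
#   # safe now uses batch size 1, 8 gradient-accumulation steps, fp16, and 512-token sequences.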


def test_nccl_communication():
    """Test NCCL communication without running any training."""
    logger.info("Testing NCCL communication...")

    try:
        # Bind this process to its own GPU before any CUDA or NCCL work; two ranks
        # sharing one device is a classic source of illegal memory access errors.
        local_rank = int(os.environ.get('LOCAL_RANK', '0'))
        torch.cuda.set_device(local_rank)

        if not dist.is_initialized():
            dist.init_process_group(backend='nccl')

        rank = dist.get_rank()
        world_size = dist.get_world_size()

        logger.info(f"Rank {rank}/{world_size} initialized")

        # Rank 0 contributes ones, every other rank contributes zeros.
        if rank == 0:
            tensor = torch.ones(10, device='cuda')
        else:
            tensor = torch.zeros(10, device='cuda')
        logger.info(f"Rank {rank} tensor before all_reduce: {tensor}")

        # Sum the tensors across all ranks; every rank should end up with ones.
        dist.all_reduce(tensor)
        logger.info(f"Rank {rank} after all_reduce: {tensor}")

        # Make sure every rank reaches this point before declaring success.
        dist.barrier()
        logger.info(f"Rank {rank} passed barrier")

        logger.info("✅ NCCL communication test PASSED")
        return True

    except Exception as e:
        logger.error(f"❌ NCCL communication test FAILED: {e}")
        return False
    finally:
        if dist.is_initialized():
            dist.destroy_process_group()
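

# Note: test_nccl_communication() expects the process-group environment (RANK, WORLD_SIZE,
# MASTER_ADDR/MASTER_PORT, LOCAL_RANK) to be provided by a distributed launcher. A minimal
# sketch, assuming a hypothetical driver script run_nccl_test.py that just imports and calls
# test_nccl_communication():
#   torchrun --nproc_per_node=2 run_nccl_test.py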


def run_memory_safe_training(config_path: str):
    """Run training with memory safety measures applied up front."""
    logger.info("Starting memory-safe training...")

    # Step 1: release cached allocations on every GPU.
    clear_gpu_memory()

    # Step 2: remove stray processes still holding GPU memory.
    kill_competing_processes()

    # Step 3: pin down NCCL behaviour before any process group is created.
    setup_nccl_environment()

    # Step 4: log the memory state we are starting from.
    check_gpu_memory_usage()

    # Step 5: verify collectives work before committing to a full run.
    if not test_nccl_communication():
        logger.error("NCCL communication test failed, falling back to single GPU")
        return False

    # All checks passed; the actual trainer (not included in this script) is expected
    # to consume `config_path` from here.
    logger.info("Running memory-safe multi-GPU training...")

    return True
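

# Illustrative only: run_memory_safe_training() stops short of launching a trainer. A hand-off
# could look like the following, where the 'train.py' entry point and its '--config' flag are
# assumptions, not files provided by this project:
#   subprocess.run(['torchrun', f'--nproc_per_node={torch.cuda.device_count()}',
#                   'train.py', '--config', config_path], check=True)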


def main():
    """Main function for testing memory fixes."""
    print("🚀 NCCL Memory Conflict Resolution")
    print("=" * 50)

    print("\n📊 Initial GPU Memory State:")
    check_gpu_memory_usage()

    print("\n🧹 Clearing GPU Memory:")
    clear_gpu_memory()

    print("\n📊 GPU Memory After Cleanup:")
    check_gpu_memory_usage()

    print("\n⚙️ Setting up NCCL Environment:")
    setup_nccl_environment()

    print("\n✅ Memory management setup complete!")
    print("   Ready for memory-safe multi-GPU training")


if __name__ == "__main__":
    main()