#!/usr/bin/env python3
"""
Minimal 8-GPU test: DDP init + one small op per GPU + all_reduce.

Run from repo root:
  torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu

If NCCL segfaults (exit -11), try workaround env vars first:
  bash scripts/try_nccl_8gpu.sh

Else use gloo:
  NANOCHAT_DDP_BACKEND=gloo torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu
"""
import os
import sys

import torch
import torch.distributed as dist


def main():
    """Run the distributed smoke test on the GPU assigned to this rank.

    Reads ``RANK``, ``LOCAL_RANK`` and ``WORLD_SIZE`` from the environment
    (torchrun sets these) and the optional ``NANOCHAT_DDP_BACKEND``
    ("nccl" or "gloo", default "nccl"). Each rank multiplies two n x n
    matrices of ones on its GPU, the per-rank sums are all-reduced, and the
    result is checked against the analytically known value before rank 0
    prints the success message.

    Raises:
        KeyError: if a required torchrun environment variable is missing.
        RuntimeError: if the all-reduced sum does not match the expected value.
    """
    # torchrun sets these
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    requested = os.environ.get("NANOCHAT_DDP_BACKEND", "nccl").lower()
    backend = requested if requested in ("nccl", "gloo") else "nccl"
    if backend != requested and rank == 0:
        # Keep the lenient fallback, but make a typo in the env var visible.
        print(
            f"test_8gpu: unknown NANOCHAT_DDP_BACKEND={requested!r}; "
            "falling back to nccl",
            file=sys.stderr,
        )

    device = torch.device("cuda", local_rank)
    torch.cuda.set_device(device)

    try:
        if backend == "nccl":
            dist.init_process_group(backend="nccl", device_id=device)
        else:
            dist.init_process_group(backend="gloo")

        # One small op on this GPU
        n = 1024
        a = torch.ones(n, n, device=device, dtype=torch.float32)
        b = torch.ones(n, n, device=device, dtype=torch.float32)
        c = a @ b
        s = c.sum().item()

        # All-reduce so all ranks agree we got here
        t = torch.tensor([s], dtype=torch.float32, device=device)
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        dist.barrier()

        # ones @ ones gives n in every cell, so each rank contributes n**3.
        # With n a power of two the total stays exactly representable in
        # float32, which makes an exact comparison safe here.
        total = t.item()
        expected = float(world_size * n**3)
        if total != expected:
            raise RuntimeError(
                f"test_8gpu FAILED on rank {rank}: "
                f"all_reduce sum={total:.0f}, expected {expected:.0f}"
            )

        if rank == 0:
            # Report the device this rank actually used, not a fixed index.
            gpu_name = torch.cuda.get_device_name(device)
            print(f"test_8gpu OK: world_size={world_size} backend={backend} device={gpu_name}")
            print(f"  (each rank did {n}x{n} matmul; all_reduce sum={total:.0f})")
    finally:
        # Tear the process group down even when a step above raised;
        # skip it if initialisation itself never completed.
        if dist.is_initialized():
            dist.destroy_process_group()


if __name__ == "__main__":
    main()