| |
| """ |
| Minimal 8-GPU test: DDP init + one small op per GPU + all_reduce. |
| Run from repo root: |
| torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu |
| If NCCL segfaults (exit -11), try workaround env vars first: |
| bash scripts/try_nccl_8gpu.sh |
| Else use gloo: |
| NANOCHAT_DDP_BACKEND=gloo torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu |
| """ |
| import os |
| import torch |
| import torch.distributed as dist |
|
|
def main():
    """Run a minimal multi-GPU smoke test and verify its result.

    Each rank binds to its own CUDA device, joins the process group,
    multiplies two ``n x n`` matrices of ones, and contributes the sum of
    the product to a SUM all_reduce. The reduced value is checked against
    the analytically known answer before rank 0 reports success.

    Environment:
        RANK, LOCAL_RANK, WORLD_SIZE: required integers (the module
            docstring shows the ``torchrun`` launch that provides them).
        NANOCHAT_DDP_BACKEND: optional, ``"nccl"`` (default) or ``"gloo"``;
            any other value falls back to ``"nccl"`` with a notice on rank 0.

    Raises:
        KeyError: if RANK, LOCAL_RANK or WORLD_SIZE is missing.
        RuntimeError: if the all-reduced sum differs from the expected value.
    """
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    requested = os.environ.get("NANOCHAT_DDP_BACKEND", "nccl").lower()
    backend = requested if requested in ("nccl", "gloo") else "nccl"
    if backend != requested and rank == 0:
        # Keep the original fallback, but make it visible: a typo here would
        # otherwise silently select the backend the user tried to avoid.
        print(
            f"test_8gpu: unknown NANOCHAT_DDP_BACKEND={requested!r}; "
            "falling back to nccl"
        )

    device = torch.device("cuda", local_rank)
    torch.cuda.set_device(device)

    if backend == "nccl":
        dist.init_process_group(backend="nccl", device_id=device)
    else:
        dist.init_process_group(backend="gloo")

    try:
        # One small op per GPU: ones @ ones gives a matrix filled with n,
        # so the per-rank sum is n**3.
        n = 1024
        a = torch.ones(n, n, device=device, dtype=torch.float32)
        b = torch.ones(n, n, device=device, dtype=torch.float32)
        c = a @ b
        s = c.sum().item()

        # Exercise the collective path: every rank contributes its sum.
        t = torch.tensor([s], dtype=torch.float32, device=device)
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        dist.barrier()

        # Verify before claiming success. After the all_reduce every rank
        # holds the same value, so all ranks pass or fail together.
        total = t.item()
        expected = float(world_size) * n ** 3
        if abs(total - expected) > 1e-6 * expected:
            raise RuntimeError(
                f"test_8gpu FAILED on rank {rank}: all_reduce sum={total:.0f}, "
                f"expected {expected:.0f} "
                f"(world_size={world_size} backend={backend})"
            )

        if rank == 0:
            gpu_name = torch.cuda.get_device_name(device)
            print(f"test_8gpu OK: world_size={world_size} backend={backend} device={gpu_name}")
            print(f" (each rank did {n}x{n} matmul; all_reduce sum={total:.0f})")
    finally:
        # Tear the process group down even when the test body raises.
        dist.destroy_process_group()
|
|
# Script entry point: run the smoke test only when this module is executed
# directly (e.g. `torchrun ... -m scripts.test_8gpu`), not when it is imported.
if __name__ == "__main__":
    main()
|
|