#!/usr/bin/env python3
"""
Minimal 8-GPU test: DDP init + one small op per GPU + all_reduce.
Run from repo root:
  torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu
If NCCL segfaults (exit -11), try workaround env vars first:
  bash scripts/try_nccl_8gpu.sh
Else use gloo:
  NANOCHAT_DDP_BACKEND=gloo torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu
"""
import os
import torch
import torch.distributed as dist

def main():
    """Run a one-shot multi-GPU smoke test: init, matmul, all_reduce, verify.

    Reads RANK, LOCAL_RANK and WORLD_SIZE from the environment (torchrun sets
    them) and the optional NANOCHAT_DDP_BACKEND ("nccl" by default, or
    "gloo"; any other value falls back to "nccl" with a warning). Each rank
    multiplies two n x n matrices of ones on its own CUDA device, the
    per-rank sums are all-reduced, the result is checked, and rank 0 prints
    a summary.

    Raises:
        KeyError: if one of the torchrun variables is missing.
        RuntimeError: if the all-reduced total is not world_size * n**3.
    """
    import sys  # local import: only used for the stderr warning below

    # torchrun sets these; add a launch hint instead of a bare KeyError.
    try:
        rank = int(os.environ["RANK"])
        local_rank = int(os.environ["LOCAL_RANK"])
        world_size = int(os.environ["WORLD_SIZE"])
    except KeyError as err:
        raise KeyError(
            f"{err.args[0]} is not set; launch with torchrun, e.g. "
            "torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu"
        ) from err

    requested = os.environ.get("NANOCHAT_DDP_BACKEND", "nccl").lower()
    backend = requested
    if backend not in ("nccl", "gloo"):
        backend = "nccl"
        # Keep the fallback, but make it visible: a typo in the variable
        # would otherwise silently select the backend being avoided.
        if rank == 0:
            print(
                f"test_8gpu: unknown NANOCHAT_DDP_BACKEND={requested!r}, "
                "falling back to nccl",
                file=sys.stderr,
            )

    device = torch.device("cuda", local_rank)
    torch.cuda.set_device(device)

    if backend == "nccl":
        dist.init_process_group(backend="nccl", device_id=device)
    else:
        dist.init_process_group(backend="gloo")

    try:
        # One small op on this GPU
        n = 1024
        a = torch.ones(n, n, device=device, dtype=torch.float32)
        b = torch.ones(n, n, device=device, dtype=torch.float32)
        c = a @ b
        s = c.sum().item()

        # All-reduce so all ranks agree we got here
        t = torch.tensor([s], dtype=torch.float32, device=device)
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        dist.barrier()

        # Every operand is 1.0, so each entry of c is n, each rank's sum is
        # n**3, and the reduced total must be world_size * n**3.
        total = t.item()
        expected = float(world_size * n ** 3)
        if total != expected:
            raise RuntimeError(
                f"test_8gpu FAILED on rank {rank}: all_reduce sum={total:.0f}, "
                f"expected {expected:.0f}"
            )

        if rank == 0:
            gpu_name = torch.cuda.get_device_name(device)
            print(f"test_8gpu OK: world_size={world_size} backend={backend} device={gpu_name}")
            print(f"  (each rank did {n}x{n} matmul; all_reduce sum={total:.0f})")
    finally:
        # Tear the process group down even when the op or the check fails.
        dist.destroy_process_group()

# Run the smoke test only when executed as a script/module (e.g. via
# `torchrun ... -m scripts.test_8gpu`), not when imported.
if __name__ == "__main__":
    main()