#!/usr/bin/env python3
"""Quick GPU diagnostic for MI300X before full training."""
import sys
import torch

print("=" * 50)
print("  MI300X GPU Diagnostic")
print("=" * 50)

# Step 1: Basic GPU info
print("\n[1] GPU Info:")
print(f"  PyTorch: {torch.__version__}")
print(f"  CUDA available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    print("  FATAL: No GPU!")
    sys.exit(1)

print(f"  GPU: {torch.cuda.get_device_name(0)}")
props = torch.cuda.get_device_properties(0)
vram_gb = props.total_memory / (1024**3)
print(f"  VRAM: {vram_gb:.0f} GB")
print(f"  ROCm: {torch.version.hip}")

# Step 2: Small tensor test
print("\n[2] Small tensor test:")
x = torch.randn(10, 10, device='cuda', dtype=torch.bfloat16)
y = x @ x.T
print(f"  bf16 matmul: OK (shape={y.shape})")
del x, y
torch.cuda.empty_cache()

# Step 3: Larger allocation
print("\n[3] Large allocation test (1GB):")
big = torch.zeros(256, 1024, 1024, dtype=torch.bfloat16, device='cuda')
print(f"  1GB alloc: OK")
del big
torch.cuda.empty_cache()

# Step 4: Try loading model with from_pretrained on CPU
print("\n[4] Loading Qwen2.5-Coder-7B to CPU ...")
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-Coder-7B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map=None,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
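# low_cpu_mem_usage=True loads weights shard by shard instead of materializing
# a random init first, keeping peak host RAM near one model's worth.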
param_count = sum(p.numel() for p in model.parameters())
print(f"  Loaded: {param_count / 1e9:.2f}B params on CPU")

# Step 5: Move the full model to the GPU
print("\n[5] Moving model to CUDA ...")
try:
    model = model.to('cuda')
    print("  Model on CUDA: OK")
    used = torch.cuda.memory_allocated() / (1024**3)
    print(f"  VRAM used: {used:.1f} GB")
except Exception as e:
    print(f"  FAILED: {e}")
    print("  Trying half() first ...")
    model = model.half().to('cuda')
    used = torch.cuda.memory_allocated() / (1024**3)
    print(f"  VRAM used: {used:.1f} GB")

# Step 6: Quick forward pass
print("\n[6] Forward pass test ...")
input_ids = torch.tensor([[1, 2, 3, 4, 5]], device='cuda')
with torch.no_grad():
    out = model(input_ids)
print(f"  Forward: OK (logits shape={out.logits.shape})")

print("\n" + "=" * 50)
print("  ALL TESTS PASSED!")
print("=" * 50)