Faaz committed on
Commit
5fb9ec3
·
1 Parent(s): 35fd5fc

Add GPU diagnostic script, fix architecture loading with low_cpu_mem_usage and sync

Browse files
scripts/gpu_diagnostic.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""Quick GPU diagnostic for MI300X before full training.

Runs six escalating checks and stops at the first one that fails:

1. Print PyTorch / device / VRAM / ROCm information.
2. Small bf16 matmul on the GPU.
3. A 1 GiB bf16 allocation on the GPU.
4. Load the base model onto the CPU.
5. Move the whole model to the GPU (with an fp16 fallback).
6. One forward pass on a tiny input.

Exit status is 1 when no GPU is visible and 0 when every step ran.
"""
import sys

import torch

GIB = 1024**3
MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
DIAG_DTYPE = torch.bfloat16
BANNER = "=" * 50


def bytes_to_gib(n_bytes):
    """Return *n_bytes* expressed in GiB (1 GiB = 1024**3 bytes)."""
    return n_bytes / GIB


def elements_for_bytes(n_bytes, dtype):
    """Return how many elements of *dtype* fit in *n_bytes*.

    Used to size the allocation test from the dtype's real element size
    instead of a hand-computed shape.
    """
    return n_bytes // torch.empty(0, dtype=dtype).element_size()


def main() -> int:
    """Run all diagnostic steps and return the process exit code."""
    print(BANNER)
    print(" MI300X GPU Diagnostic")
    print(BANNER)

    # Step 1: Basic GPU info
    print("\n[1] GPU Info:")
    print(f" PyTorch: {torch.__version__}")
    cuda_ok = torch.cuda.is_available()
    print(f" CUDA available: {cuda_ok}")
    if not cuda_ok:
        print(" FATAL: No GPU!")
        return 1

    print(f" GPU: {torch.cuda.get_device_name(0)}")
    props = torch.cuda.get_device_properties(0)
    print(f" VRAM: {bytes_to_gib(props.total_memory):.0f} GB")
    print(f" ROCm: {torch.version.hip}")

    # Step 2: Small tensor test
    print("\n[2] Small tensor test:")
    x = torch.randn(10, 10, device="cuda", dtype=DIAG_DTYPE)
    y = x @ x.T
    # GPU kernels run asynchronously; wait for the matmul so a kernel fault
    # is raised here rather than surfacing at some later step.
    torch.cuda.synchronize()
    print(f" bf16 matmul: OK (shape={y.shape})")
    del x, y
    torch.cuda.empty_cache()

    # Step 3: Larger allocation
    print("\n[3] Large allocation test (1GB):")
    # bf16 is 2 bytes per element, so 1 GiB needs 512 Mi elements; the
    # element count is derived from the dtype to keep the label truthful.
    big = torch.zeros(
        elements_for_bytes(GIB, DIAG_DTYPE), dtype=DIAG_DTYPE, device="cuda"
    )
    torch.cuda.synchronize()
    print(" 1GB alloc: OK")
    del big
    torch.cuda.empty_cache()

    # Step 4: Try loading model with from_pretrained on CPU
    print("\n[4] Loading Qwen2.5-Coder-7B to CPU ...")
    # Imported here so steps 1-3 still run when transformers is missing.
    from transformers import AutoModelForCausalLM

    # NOTE(review): trust_remote_code=True allows code from the model repo to
    # run locally; confirm it is actually required for this checkpoint.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=DIAG_DTYPE,
        device_map=None,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    param_count = sum(p.numel() for p in model.parameters())
    print(f" Loaded: {param_count / 1e9:.2f}B params on CPU")

    # Step 5: Move the whole model to the GPU in a single .to() call
    print("\n[5] Moving model to CUDA ...")
    used_fallback = False
    try:
        model = model.to("cuda")
        torch.cuda.synchronize()
        print(" Model on CUDA: OK")
    except Exception as e:  # diagnostic boundary: report, then try fallback
        print(f" FAILED: {e}")
        print(" Trying half() first ...")
        # Release cached blocks left over from the failed transfer before
        # retrying. half() converts the weights to fp16, so a success here
        # does NOT validate the bf16 path.
        torch.cuda.empty_cache()
        model = model.half().to("cuda")
        torch.cuda.synchronize()
        used_fallback = True
        print(" Model on CUDA: OK (fp16 fallback)")
    print(f" VRAM used: {bytes_to_gib(torch.cuda.memory_allocated()):.1f} GB")

    # Step 6: Quick forward pass
    print("\n[6] Forward pass test ...")
    input_ids = torch.tensor([[1, 2, 3, 4, 5]], device="cuda")
    with torch.no_grad():
        out = model(input_ids)
    # Wait for the forward kernels so failures are attributed to this step.
    torch.cuda.synchronize()
    print(f" Forward: OK (logits shape={out.logits.shape})")

    print("\n" + BANNER)
    if used_fallback:
        print(" PASSED WITH WARNINGS: bf16 transfer failed, fp16 fallback used")
    else:
        print(" ALL TESTS PASSED!")
    print(BANNER)
    return 0


if __name__ == "__main__":
    sys.exit(main())
src/model/architecture.py CHANGED
@@ -55,16 +55,32 @@ class MINDIArchitecture:
55
  def _load_model(self) -> None:
56
  """Load the base model and tokenizer from HuggingFace or cache."""
57
  print(f"[MINDIArchitecture] Loading {self.model_name} ...")
 
 
 
 
 
 
 
58
  self.model = AutoModelForCausalLM.from_pretrained(
59
  self.model_name,
60
- cache_dir=str(self.cache_dir),
61
  torch_dtype=self.torch_dtype,
62
- device_map="auto" if self.device == "cuda" else None,
63
  trust_remote_code=True,
 
64
  )
 
 
 
 
 
 
 
 
 
 
65
  self.tokenizer = AutoTokenizer.from_pretrained(
66
  self.model_name,
67
- cache_dir=str(self.cache_dir),
68
  trust_remote_code=True,
69
  )
70
  print(f"[MINDIArchitecture] Loaded on {self.device} "
 
55
  def _load_model(self) -> None:
56
  """Load the base model and tokenizer from HuggingFace or cache."""
57
  print(f"[MINDIArchitecture] Loading {self.model_name} ...")
58
+
59
+ if self.device == "cuda":
60
+ # Clear GPU state before loading
61
+ torch.cuda.empty_cache()
62
+ torch.cuda.synchronize()
63
+ print(f"[MINDIArchitecture] GPU cleared, loading to CPU first ...")
64
+
65
  self.model = AutoModelForCausalLM.from_pretrained(
66
  self.model_name,
 
67
  torch_dtype=self.torch_dtype,
68
+ device_map=None,
69
  trust_remote_code=True,
70
+ low_cpu_mem_usage=True,
71
  )
72
+ param_count = sum(p.numel() for p in self.model.parameters())
73
+ print(f"[MINDIArchitecture] CPU load done ({param_count / 1e9:.2f}B params)")
74
+
75
+ if self.device == "cuda":
76
+ print(f"[MINDIArchitecture] Moving to CUDA ...")
77
+ self.model = self.model.to("cuda")
78
+ torch.cuda.synchronize()
79
+ vram_gb = torch.cuda.memory_allocated() / (1024**3)
80
+ print(f"[MINDIArchitecture] CUDA transfer done ({vram_gb:.1f} GB VRAM)")
81
+
82
  self.tokenizer = AutoTokenizer.from_pretrained(
83
  self.model_name,
 
84
  trust_remote_code=True,
85
  )
86
  print(f"[MINDIArchitecture] Loaded on {self.device} "