Text Generation
PEFT
lora
trl
naming
brand-generation
controllable-generation
krystv committed on
Commit
a544dd6
·
verified ·
1 Parent(s): 39399b9

Add GPU preflight diagnostics

Browse files
Files changed (1) hide show
  1. scripts/preflight_gpu.py +27 -0
scripts/preflight_gpu.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """GPU preflight diagnostics for Nomen-AI training.
2
+
3
+ Run before smoke/SFT/DPO to fail early if CUDA or VRAM is unavailable.
4
+ """
5
+ import sys
6
+ import torch
7
+
8
+
9
def main(min_vram_gb: float = 14) -> None:
    """Print GPU diagnostics and abort early if the GPU is unusable.

    Prints the torch version and CUDA availability, then, for the current
    CUDA device, its name, total VRAM and compute capability. VRAM is
    reported in decimal gigabytes (``total_memory / 1e9``). On success the
    marker line ``GPU_PREFLIGHT_PASS`` is printed last.

    Args:
        min_vram_gb: Minimum total VRAM, in decimal gigabytes, that the
            current device must report. Defaults to 14, the threshold that
            was previously hard-coded.

    Raises:
        SystemExit: If CUDA is not available, or if the current device
            reports less than ``min_vram_gb`` of total memory.
    """
    print('torch:', torch.__version__)
    # Query availability once so the printed value and the check cannot differ.
    cuda_ok = torch.cuda.is_available()
    print('cuda_available:', cuda_ok)
    if not cuda_ok:
        raise SystemExit('ERROR: CUDA is not available. Use a Colab GPU/T4 runtime or Docker with NVIDIA runtime.')

    device = torch.cuda.current_device()
    name = torch.cuda.get_device_name(device)
    props = torch.cuda.get_device_properties(device)
    total_gb = props.total_memory / 1e9
    print('gpu_name:', name)
    print('total_vram_gb:', round(total_gb, 2))
    print('compute_capability:', f'{props.major}.{props.minor}')

    if total_gb < min_vram_gb:
        # Report the threshold actually enforced, not only the nominal T4 size.
        raise SystemExit(
            f'ERROR: VRAM {total_gb:.1f}GB is below the required {min_vram_gb:g}GB minimum '
            '(expected T4-class 15GB). Use T4/A10G or larger.'
        )
    print('GPU_PREFLIGHT_PASS')
24
+
25
+
26
# Script entry point: run the preflight only when executed directly, not on import.
if __name__ == '__main__':
    # main() returns None on success, so this exits with status 0; a
    # SystemExit raised inside main() propagates unchanged.
    sys.exit(main())