"""Device detection and auto-configuration utilities."""
from __future__ import annotations
from typing import TYPE_CHECKING
import torch
if TYPE_CHECKING:
from llm_lab.config import TrainConfig
def get_device() -> torch.device:
    """Pick the best available compute device.

    Returns:
        ``torch.device("cuda")`` when a CUDA GPU is visible to PyTorch,
        otherwise ``torch.device("cpu")``.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")
def detect_gpu_info() -> dict:
    """Return the name and total memory of CUDA device 0.

    Returns:
        ``{"name": str, "memory_gb": float}`` for device 0, or an empty
        dict if no GPU is available.
    """
    if not torch.cuda.is_available():
        return {}
    # Query device 0 explicitly for both fields. The original mixed the
    # *current* device (get_device_name() with no argument) with device 0's
    # properties, which disagree on multi-GPU hosts whenever a non-zero
    # device is current.
    return {
        "name": torch.cuda.get_device_name(0),
        "memory_gb": round(torch.cuda.get_device_properties(0).total_memory / 1e9, 1),
    }
def auto_configure(config: "TrainConfig") -> "TrainConfig":
    """Adjust *config* in place to suit whatever accelerator is present.

    In Colab Pro+ an A100 is not always assigned; when a weaker card
    (T4/V100/L4) shows up, dtype and batch settings are downgraded so the
    run still fits in memory.

    Returns:
        The same TrainConfig instance, mutated with the adjusted settings.
    """
    # CPU fallback: smallest possible footprint, fp32 only.
    if not torch.cuda.is_available():
        print("β οΈ No GPU found! Running in CPU mode (very slow)")
        config.dtype = "float32"
        config.micro_batch_size = 1
        config.gradient_accumulation_steps = 4
        return config

    device_name = torch.cuda.get_device_name()
    mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"\nπ GPU detected: {device_name} ({mem_gb:.1f} GB)")

    # Known cards: (name substring, banner, dtype, micro_batch,
    # grad_accum or None to keep the configured default). Order matters:
    # checked first-match, same precedence as the original if/elif chain.
    known_cards = [
        ("a100", " β A100 detected: using default settings (bf16, batch=4)",
         "bfloat16", 4, None),
        ("v100", " β V100 detected: fp16 mode, reduced batch size",
         "float16", 2, 64),
        ("t4", " β T4 detected: fp16 mode, minimum batch size",
         "float16", 1, 128),
        ("l4", " β L4 detected: bf16 mode, adjusted batch size",
         "bfloat16", 2, 64),
    ]

    lowered = device_name.lower()
    for key, banner, dtype, batch, accum in known_cards:
        if key in lowered:
            print(banner)
            config.dtype = dtype
            config.micro_batch_size = batch
            if accum is not None:
                config.gradient_accumulation_steps = accum
            break
    else:
        # Unrecognized card: size the micro-batch from total memory alone.
        print(f" β Unknown GPU. Adjusting settings based on memory")
        if mem_gb >= 30:
            config.micro_batch_size = 4
        elif mem_gb >= 16:
            config.micro_batch_size = 2
        else:
            config.micro_batch_size = 1
            config.gradient_accumulation_steps = 128

    print(f" β dtype: {config.dtype}")
    print(f" β micro_batch: {config.micro_batch_size}")
    print(f" β grad_accum: {config.gradient_accumulation_steps}")
    print(f" β effective_batch: {config.effective_batch_size}")
    return config