# LLM-1B-Lab / llm_lab/utils/device.py
# Author: Vjeong
# Last commit: fix(device): correct attribute name from total_mem to total_memory (f91d771)
"""Device detection and auto-configuration utilities."""
from __future__ import annotations
from typing import TYPE_CHECKING
import torch
if TYPE_CHECKING:
from llm_lab.config import TrainConfig
def get_device() -> torch.device:
    """Pick the best available compute device.

    Returns:
        ``torch.device("cuda")`` when a CUDA GPU is present, otherwise
        ``torch.device("cpu")``.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")
def detect_gpu_info() -> dict:
    """Returns GPU name and memory information for device 0.

    Returns:
        {"name": str, "memory_gb": float} or an empty dict if no GPU is available
    """
    if not torch.cuda.is_available():
        return {}
    # Query device 0 explicitly for BOTH fields.  The original mixed
    # get_device_name() (current device) with get_device_properties(0)
    # (device 0), which can describe two different GPUs in a multi-GPU
    # process with a non-zero current device.
    props = torch.cuda.get_device_properties(0)
    return {
        "name": torch.cuda.get_device_name(0),
        "memory_gb": round(props.total_memory / 1e9, 1),
    }
def auto_configure(config: "TrainConfig") -> "TrainConfig":
    """Automatically adjusts configuration based on GPU type.

    In Colab Pro+, an A100 is not always assigned.
    If a T4 or V100 is assigned, configuration is automatically adjusted.

    Args:
        config: TrainConfig mutated in place; ``dtype``,
            ``micro_batch_size`` and ``gradient_accumulation_steps``
            may be overwritten depending on the detected GPU.

    Returns:
        Adjusted TrainConfig (the same object that was passed in)
    """
    if not torch.cuda.is_available():
        print("⚠️ No GPU found! Running in CPU mode (very slow)")
        config.dtype = "float32"
        config.micro_batch_size = 1
        config.gradient_accumulation_steps = 4
        return config
    # Query device 0 explicitly so the printed name and the memory figure
    # describe the same GPU (the original mixed the *current* device's name
    # with device 0's properties).
    gpu_label = torch.cuda.get_device_name(0)
    gpu_name = gpu_label.lower()
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"\nπŸ” GPU detected: {gpu_label} ({gpu_mem:.1f} GB)")
    if "a100" in gpu_name:
        # A100 40GB: use default settings (optimal)
        print("  β†’ A100 detected: using default settings (bf16, batch=4)")
        config.dtype = "bfloat16"
        config.micro_batch_size = 4
    elif "v100" in gpu_name:
        # V100 16GB: bf16 not supported, reduce batch size
        print("  β†’ V100 detected: fp16 mode, reduced batch size")
        config.dtype = "float16"
        config.micro_batch_size = 2
        config.gradient_accumulation_steps = 64  # maintain effective batch size
    elif "t4" in gpu_name:
        # T4 16GB: bf16 not supported, smaller batch
        print("  β†’ T4 detected: fp16 mode, minimum batch size")
        config.dtype = "float16"
        config.micro_batch_size = 1
        config.gradient_accumulation_steps = 128
    elif "l4" in gpu_name:
        # L4 24GB: bf16 supported
        print("  β†’ L4 detected: bf16 mode, adjusted batch size")
        config.dtype = "bfloat16"
        config.micro_batch_size = 2
        config.gradient_accumulation_steps = 64
    else:
        # Unknown model: size the micro batch by total memory only.
        # (f-prefix removed from the original print — no placeholders.)
        print("  β†’ Unknown GPU. Adjusting settings based on memory")
        if gpu_mem >= 30:
            config.micro_batch_size = 4
        elif gpu_mem >= 16:
            config.micro_batch_size = 2
        else:
            config.micro_batch_size = 1
        config.gradient_accumulation_steps = 128
    print(f"  β†’ dtype: {config.dtype}")
    print(f"  β†’ micro_batch: {config.micro_batch_size}")
    print(f"  β†’ grad_accum: {config.gradient_accumulation_steps}")
    print(f"  β†’ effective_batch: {config.effective_batch_size}")
    return config