| """Device detection and auto-configuration utilities.""" |
| from __future__ import annotations |
|
|
| from typing import TYPE_CHECKING |
|
|
| import torch |
|
|
| if TYPE_CHECKING: |
| from llm_lab.config import TrainConfig |
|
|
|
|
def get_device() -> torch.device:
    """Pick the compute device to run on.

    Returns:
        ``torch.device("cuda")`` when a CUDA GPU is visible to this
        process, otherwise ``torch.device("cpu")``.
    """
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(backend)
|
|
|
|
def detect_gpu_info() -> dict:
    """Returns GPU name and memory information.

    Returns:
        {"name": str, "memory_gb": float} for CUDA device 0, or an empty
        dict if no GPU is available.
    """
    if not torch.cuda.is_available():
        return {}
    # Query device 0 explicitly for BOTH fields. The previous version mixed
    # get_device_name() (current device) with get_device_properties(0)
    # (device 0), which disagree on multi-GPU hosts when the current
    # device is not 0.
    props = torch.cuda.get_device_properties(0)
    return {
        "name": torch.cuda.get_device_name(0),
        "memory_gb": round(props.total_memory / 1e9, 1),
    }
|
|
|
|
| def auto_configure(config: "TrainConfig") -> "TrainConfig": |
| """Automatically adjusts configuration based on GPU type. |
| |
| In Colab Pro+, an A100 is not always assigned. |
| If a T4 or V100 is assigned, configuration is automatically adjusted. |
| |
| Returns: |
| Adjusted TrainConfig |
| """ |
| if not torch.cuda.is_available(): |
| print("β οΈ No GPU found! Running in CPU mode (very slow)") |
| config.dtype = "float32" |
| config.micro_batch_size = 1 |
| config.gradient_accumulation_steps = 4 |
| return config |
|
|
| gpu_name = torch.cuda.get_device_name().lower() |
| gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9 |
|
|
| print(f"\nπ GPU detected: {torch.cuda.get_device_name()} ({gpu_mem:.1f} GB)") |
|
|
| if "a100" in gpu_name: |
| |
| print(" β A100 detected: using default settings (bf16, batch=4)") |
| config.dtype = "bfloat16" |
| config.micro_batch_size = 4 |
|
|
| elif "v100" in gpu_name: |
| |
| print(" β V100 detected: fp16 mode, reduced batch size") |
| config.dtype = "float16" |
| config.micro_batch_size = 2 |
| config.gradient_accumulation_steps = 64 |
|
|
| elif "t4" in gpu_name: |
| |
| print(" β T4 detected: fp16 mode, minimum batch size") |
| config.dtype = "float16" |
| config.micro_batch_size = 1 |
| config.gradient_accumulation_steps = 128 |
|
|
| elif "l4" in gpu_name: |
| |
| print(" β L4 detected: bf16 mode, adjusted batch size") |
| config.dtype = "bfloat16" |
| config.micro_batch_size = 2 |
| config.gradient_accumulation_steps = 64 |
|
|
| else: |
| print(f" β Unknown GPU. Adjusting settings based on memory") |
| if gpu_mem >= 30: |
| config.micro_batch_size = 4 |
| elif gpu_mem >= 16: |
| config.micro_batch_size = 2 |
| else: |
| config.micro_batch_size = 1 |
| config.gradient_accumulation_steps = 128 |
|
|
| print(f" β dtype: {config.dtype}") |
| print(f" β micro_batch: {config.micro_batch_size}") |
| print(f" β grad_accum: {config.gradient_accumulation_steps}") |
| print(f" β effective_batch: {config.effective_batch_size}") |
|
|
| return config |
|
|