| """Device detection and auto-configuration utilities.""" |
| from __future__ import annotations |
|
|
| from typing import TYPE_CHECKING |
|
|
| import torch |
|
|
| if TYPE_CHECKING: |
| from llm_lab.config import TrainConfig |
|
|
|
|
def get_device() -> torch.device:
    """Pick the compute device to run on.

    Returns:
        ``torch.device("cuda")`` when a CUDA GPU is visible to this
        process, otherwise ``torch.device("cpu")``.
    """
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(backend)
|
|
|
|
def detect_gpu_info() -> dict:
    """Returns GPU name and memory information.

    Returns:
        {"name": str, "memory_gb": float} for CUDA device 0, or an empty
        dict if no GPU is available.
    """
    if not torch.cuda.is_available():
        return {}
    # Query device 0 explicitly for BOTH fields. The previous version mixed
    # get_device_name() (current device) with get_device_properties(0)
    # (device 0), which disagree on multi-GPU hosts when the current
    # device is not 0.
    props = torch.cuda.get_device_properties(0)
    return {
        "name": torch.cuda.get_device_name(0),
        "memory_gb": round(props.total_memory / 1e9, 1),
    }
|
|
|
|
| def auto_configure(config: "TrainConfig") -> "TrainConfig": |
| """Automatically adjusts configuration based on GPU type. |
| |
| In Colab Pro+, an A100 is not always assigned. |
| If a T4 or V100 is assigned, configuration is automatically adjusted. |
| |
| Returns: |
| Adjusted TrainConfig |
| """ |
| if not torch.cuda.is_available(): |
| print("β οΈ No GPU found! Running in CPU mode (very slow)") |
| config.dtype = "float32" |
| config.micro_batch_size = 1 |
| config.gradient_accumulation_steps = 4 |
| return config |
|
|
| gpu_name = torch.cuda.get_device_name().lower() |
| gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9 |
|
|
| print(f"\nπ GPU detected: {torch.cuda.get_device_name()} ({gpu_mem:.1f} GB)") |
|
|
| if "a100" in gpu_name: |
| |
| print(" β A100 detected: using default settings (bf16, batch=4)") |
| config.dtype = "bfloat16" |
| config.micro_batch_size = 4 |
|
|
| elif "v100" in gpu_name: |
| |
| print(" β V100 detected: fp16 mode, reduced batch size") |
| config.dtype = "float16" |
| config.micro_batch_size = 2 |
| config.gradient_accumulation_steps = 64 |
|
|
| elif "t4" in gpu_name: |
| |
| print(" β T4 detected: fp16 mode, minimum batch size") |
| config.dtype = "float16" |
| config.micro_batch_size = 1 |
| config.gradient_accumulation_steps = 128 |
|
|
| elif "l4" in gpu_name: |
| |
| print(" β L4 detected: bf16 mode, adjusted batch size") |
| config.dtype = "bfloat16" |
| config.micro_batch_size = 2 |
| config.gradient_accumulation_steps = 64 |
|
|
| else: |
| print(f" β Unknown GPU. Adjusting settings based on memory") |
| if gpu_mem >= 30: |
| config.micro_batch_size = 4 |
| elif gpu_mem >= 16: |
| config.micro_batch_size = 2 |
| else: |
| config.micro_batch_size = 1 |
| config.gradient_accumulation_steps = 128 |
|
|
| print(f" β dtype: {config.dtype}") |
| print(f" β micro_batch: {config.micro_batch_size}") |
| print(f" β grad_accum: {config.gradient_accumulation_steps}") |
| print(f" β effective_batch: {config.effective_batch_size}") |
|
|
| return config |
|
|