File size: 3,139 Bytes
858e8b2
8a58ffe
 
 
 
 
 
 
 
 
 
 
858e8b2
8a58ffe
 
 
 
858e8b2
8a58ffe
 
858e8b2
8a58ffe
 
 
 
 
f91d771
8a58ffe
 
 
 
858e8b2
8a58ffe
858e8b2
 
8a58ffe
 
858e8b2
8a58ffe
 
858e8b2
8a58ffe
 
 
 
 
 
f91d771
8a58ffe
858e8b2
8a58ffe
 
858e8b2
 
8a58ffe
 
 
 
858e8b2
 
8a58ffe
 
858e8b2
8a58ffe
 
858e8b2
 
8a58ffe
 
 
 
 
858e8b2
 
8a58ffe
 
 
 
 
858e8b2
8a58ffe
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""Device detection and auto-configuration utilities."""
from __future__ import annotations

from typing import TYPE_CHECKING

import torch

if TYPE_CHECKING:
    from llm_lab.config import TrainConfig


def get_device() -> torch.device:
    """Pick the compute device: CUDA when available, CPU otherwise."""
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(backend)


def detect_gpu_info() -> dict:
    """Returns GPU name and memory information for device 0.

    Returns:
        {"name": str, "memory_gb": float} or an empty dict if no GPU is available
    """
    if not torch.cuda.is_available():
        return {}
    # Query device 0 explicitly for both fields so name and memory always
    # describe the same GPU (the original mixed the *current* device's name
    # with device 0's memory, which disagree on multi-GPU hosts).
    props = torch.cuda.get_device_properties(0)
    return {
        "name": torch.cuda.get_device_name(0),
        "memory_gb": round(props.total_memory / 1e9, 1),
    }


def auto_configure(config: "TrainConfig") -> "TrainConfig":
    """Automatically adjusts configuration based on GPU type.

    In Colab Pro+, an A100 is not always assigned.
    If a T4 or V100 is assigned, configuration is automatically adjusted.

    Args:
        config: training configuration; mutated in place.

    Returns:
        Adjusted TrainConfig (the same object passed in)
    """
    if not torch.cuda.is_available():
        print("⚠️ No GPU found! Running in CPU mode (very slow)")
        config.dtype = "float32"  # half precision has no benefit on CPU
        config.micro_batch_size = 1
        config.gradient_accumulation_steps = 4
        return config

    # Query device 0 explicitly so name and memory describe the same GPU.
    gpu_name = torch.cuda.get_device_name(0).lower()
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9

    print(f"\n🔍 GPU detected: {torch.cuda.get_device_name(0)} ({gpu_mem:.1f} GB)")

    if "a100" in gpu_name:
        # A100 40GB: use default settings (optimal)
        print("  → A100 detected: using default settings (bf16, batch=4)")
        config.dtype = "bfloat16"
        config.micro_batch_size = 4

    elif "v100" in gpu_name:
        # V100 16GB: bf16 not supported, reduce batch size
        print("  → V100 detected: fp16 mode, reduced batch size")
        config.dtype = "float16"
        config.micro_batch_size = 2
        config.gradient_accumulation_steps = 64  # maintain effective batch size

    elif "t4" in gpu_name:
        # T4 16GB: bf16 not supported, smaller batch
        print("  → T4 detected: fp16 mode, minimum batch size")
        config.dtype = "float16"
        config.micro_batch_size = 1
        config.gradient_accumulation_steps = 128

    elif "l4" in gpu_name:
        # L4 24GB: bf16 supported
        print("  → L4 detected: bf16 mode, adjusted batch size")
        config.dtype = "bfloat16"
        config.micro_batch_size = 2
        config.gradient_accumulation_steps = 64

    else:
        print("  → Unknown GPU. Adjusting settings based on memory")
        # Fix: the original left config.dtype untouched here, so a bf16
        # default could survive onto hardware without bf16 support.
        # Pick the dtype from actual hardware capability instead.
        config.dtype = "bfloat16" if torch.cuda.is_bf16_supported() else "float16"
        if gpu_mem >= 30:
            config.micro_batch_size = 4
        elif gpu_mem >= 16:
            config.micro_batch_size = 2
        else:
            config.micro_batch_size = 1
            config.gradient_accumulation_steps = 128

    print(f"  → dtype: {config.dtype}")
    print(f"  → micro_batch: {config.micro_batch_size}")
    print(f"  → grad_accum: {config.gradient_accumulation_steps}")
    print(f"  → effective_batch: {config.effective_batch_size}")

    return config