"""Device detection and auto-configuration utilities."""
from __future__ import annotations

from typing import TYPE_CHECKING

import torch

if TYPE_CHECKING:
    from llm_lab.config import TrainConfig


def get_device() -> torch.device:
    """Pick the torch device to run on.

    Returns:
        ``torch.device("cuda")`` when CUDA is available, otherwise ``torch.device("cpu")``.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")


def detect_gpu_info() -> dict:
    """Return the name and total memory of the current CUDA device.

    Both values are read from the same device (the current CUDA device), so they
    always describe the same GPU even on multi-GPU hosts.

    Returns:
        {"name": str, "memory_gb": float} or an empty dict if no GPU is available.
        ``memory_gb`` is the total device memory in decimal gigabytes (bytes / 1e9),
        rounded to one decimal place.
    """
    if not torch.cuda.is_available():
        return {}

    # Query the properties once, for the current device, instead of mixing the
    # current device's name with device 0's memory.
    props = torch.cuda.get_device_properties(torch.cuda.current_device())
    return {
        "name": props.name,
        "memory_gb": round(props.total_memory / 1e9, 1),
    }


def _gpu_name_mentions(gpu_name: str, model: str) -> bool:
    """Return True if *gpu_name* contains *model* as a GPU model token.

    The name is split on non-alphanumeric characters. A token matches when it starts
    with *model* and the character right after it is not a digit, so "a100" matches
    "a100-sxm4-40gb" and "v100" matches "v100s", but "a100" does not match "a1000"
    and "l4" does not match "l40s".
    """
    tokens = "".join(ch if ch.isalnum() else " " for ch in gpu_name.lower()).split()
    return any(
        tok.startswith(model) and not tok[len(model):len(model) + 1].isdigit()
        for tok in tokens
    )


def auto_configure(config: "TrainConfig") -> "TrainConfig":
    """Automatically adjusts configuration based on GPU type.

    In Colab Pro+, an A100 is not always assigned.
    If a T4 or V100 is assigned, configuration is automatically adjusted.

    The passed ``config`` object is modified in place (``dtype``,
    ``micro_batch_size`` and, for most branches, ``gradient_accumulation_steps``)
    and the same object is returned. The chosen settings are printed to stdout.

    Args:
        config: Training configuration to adjust. Must expose ``dtype``,
            ``micro_batch_size``, ``gradient_accumulation_steps`` and a readable
            ``effective_batch_size``.

    Returns:
        Adjusted TrainConfig (the same object that was passed in)
    """
    if not torch.cuda.is_available():
        print("⚠️ No GPU found! Running in CPU mode (very slow)")
        config.dtype = "float32"
        config.micro_batch_size = 1
        config.gradient_accumulation_steps = 4
        return config

    # Read name and memory from the same (current) device so they cannot disagree
    # on multi-GPU hosts.
    props = torch.cuda.get_device_properties(torch.cuda.current_device())
    display_name = props.name
    gpu_name = display_name.lower()
    gpu_mem = props.total_memory / 1e9  # decimal GB

    print(f"\n🔍 GPU detected: {display_name} ({gpu_mem:.1f} GB)")

    if _gpu_name_mentions(gpu_name, "a100"):
        # A100: bf16 with the largest micro batch; gradient accumulation is left
        # at whatever the config already holds.
        print("  → A100 detected: using default settings (bf16, batch=4)")
        config.dtype = "bfloat16"
        config.micro_batch_size = 4

    elif _gpu_name_mentions(gpu_name, "v100"):
        # V100 16GB: bf16 not supported, reduce batch size
        print("  → V100 detected: fp16 mode, reduced batch size")
        config.dtype = "float16"
        config.micro_batch_size = 2
        config.gradient_accumulation_steps = 64  # maintain effective batch size

    elif _gpu_name_mentions(gpu_name, "t4"):
        # T4 16GB: bf16 not supported, smaller batch
        print("  → T4 detected: fp16 mode, minimum batch size")
        config.dtype = "float16"
        config.micro_batch_size = 1
        config.gradient_accumulation_steps = 128

    elif _gpu_name_mentions(gpu_name, "l4"):
        # L4 24GB: bf16 supported
        print("  → L4 detected: bf16 mode, adjusted batch size")
        config.dtype = "bfloat16"
        config.micro_batch_size = 2
        config.gradient_accumulation_steps = 64

    else:
        print("  → Unknown GPU. Adjusting settings based on memory")
        if gpu_mem >= 30:
            config.micro_batch_size = 4
        elif gpu_mem >= 16:
            config.micro_batch_size = 2
            # Same pairing as the V100/L4 branches (micro batch 2 ↔ 64 steps).
            config.gradient_accumulation_steps = 64
        else:
            config.micro_batch_size = 1
            config.gradient_accumulation_steps = 128

        # NVIDIA GPUs have native bf16 from compute capability 8.0 (Ampere) on.
        # On ROCm builds `major` is not assumed to follow NVIDIA's numbering, so
        # the dtype is left untouched there.
        is_rocm = getattr(torch.version, "hip", None) is not None
        if config.dtype == "bfloat16" and not is_rocm and props.major < 8:
            config.dtype = "float16"

    print(f"  → dtype: {config.dtype}")
    print(f"  → micro_batch: {config.micro_batch_size}")
    print(f"  → grad_accum: {config.gradient_accumulation_steps}")
    print(f"  → effective_batch: {config.effective_batch_size}")

    return config