"""
Validation matrix for model, precision, and GPU capabilities
"""

import importlib
from dataclasses import dataclass, field
from typing import Optional, Tuple

import torch
from transformers import AutoTokenizer

PRECISIONS = {"fp32", "fp16", "bf16", "qlora4bit"}


@dataclass
class GpuInfo:
    available: bool
    name: str
    total_bytes: int
    free_bytes: int
    cc_major: int
    cc_minor: int
    bf16_supported: bool
    device_index: int = 0


@dataclass
class MultiGpuInfo:
    """Information about all available GPUs"""
    gpus: list[GpuInfo]
    count: int = field(init=False)
    total_vram_gb: float = field(init=False)

    def __post_init__(self):
        # Derived fields: always computed from `gpus`, so callers cannot
        # hand in values that disagree with the list itself.
        self.count = len(self.gpus)
        self.total_vram_gb = sum(gpu.total_bytes for gpu in self.gpus) / (1024**3)


def get_gpu_info(device: int = 0) -> GpuInfo:
    """Get GPU information and capabilities for a specific device"""
    if not torch.cuda.is_available():
        return GpuInfo(False, "cpu", 0, 0, 0, 0, False, device)
    
    name = torch.cuda.get_device_name(device)
    total = torch.cuda.get_device_properties(device).total_memory
    free = torch.cuda.mem_get_info(device)[0]
    major, minor = torch.cuda.get_device_capability(device)
    # is_bf16_supported() checks the *current* device, so switch to the
    # requested one first (this matters on mixed multi-GPU machines).
    with torch.cuda.device(device):
        bf16_ok = torch.cuda.is_bf16_supported()
    
    return GpuInfo(True, name, total, free, major, minor, bf16_ok, device)


def get_all_gpu_info() -> MultiGpuInfo:
    """Get information about all available GPUs"""
    if not torch.cuda.is_available():
        return MultiGpuInfo([])

    gpus = [get_gpu_info(idx) for idx in range(torch.cuda.device_count())]
    return MultiGpuInfo(gpus)


def has_bitsandbytes() -> bool:
    """Check if bitsandbytes is available"""
    try:
        importlib.import_module("bitsandbytes")
        return True
    except Exception:
        return False


def precision_supported(precision: str, gpu: GpuInfo) -> Tuple[bool, str]:
    """Check if precision is supported on the given GPU"""
    if precision == "bf16" and not gpu.bf16_supported:
        return False, "bf16 is not supported on this GPU. Try fp16."

    if precision == "fp16" and not gpu.available:
        return False, "fp16 training is not supported on CPU. Use fp32."

    if precision == "qlora4bit":
        if not gpu.available:
            return False, "4-bit quantization requires a CUDA GPU; on CPU, use fp32 instead."
        if not has_bitsandbytes():
            return False, "bitsandbytes not installed. `pip install bitsandbytes` or use fp16."
        if (gpu.cc_major, gpu.cc_minor) < (7, 0):
            return False, f"Compute capability {gpu.cc_major}.{gpu.cc_minor} may be insufficient for 4-bit. Use fp16."

    # fp32 works everywhere; every other combination was screened above.
    return True, "ok"


def estimate_model_params(config) -> Optional[int]:
    """Estimate model parameters from config"""
    try:
        hs = int(getattr(config, "hidden_size", 0))
        nl = int(getattr(config, "num_hidden_layers", 0))
        vs = int(getattr(config, "vocab_size", 0))
        
        if hs == 0 or nl == 0 or vs == 0:
            return None
            
        # rough estimate: ~12*hs^2 per layer (4*hs^2 for the attention
        # projections + 8*hs^2 for a 4x-expansion MLP) plus input embeddings
        per_layer = 12 * hs * hs
        total = per_layer * nl + vs * hs
        return total
    except Exception:
        return None
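
# Sanity check for the estimate above, with hypothetical Llama-2-7B-like
# numbers (hidden_size=4096, num_hidden_layers=32, vocab_size=32000):
#   12 * 4096**2 * 32 + 32000 * 4096 ≈ 6.57e9
# versus the real ~6.74e9 parameters. The formula ignores norms, biases, and
# any untied LM head, so it tends to undershoot slightly.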


def bytes_per_param(precision: str) -> float:
    """Get bytes per parameter for given precision"""
    return {
        "fp32": 4,
        "fp16": 2,
        "bf16": 2,
        "qlora4bit": 0.5,  # model weights quantized; activations use higher precision at runtime
    }.get(precision, 2)


def estimate_memory_bytes(params: Optional[int], precision: str, adam: bool, lora: bool) -> Optional[int]:
    """Estimate memory usage in bytes"""
    if params is None:
        return None
    
    base = params * bytes_per_param(precision)
    # optimizer & gradients (very rough; LoRA reduces trainable params a lot)
    overhead = 2.0 if adam else 0.6
    if lora:
        overhead *= 0.3  # fewer trainable params
    
    return int(base * (1.0 + overhead))
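
# Worked example (hypothetical numbers): a 7e9-parameter model in fp16 is
# 7e9 * 2 bytes = 14 GB of weights; full Adam fine-tuning multiplies that by
# (1 + 2.0) = 42 GB, while LoRA cuts the overhead to 0.6, i.e. ~22.4 GB.
# Batch size, sequence length, and activation checkpointing also matter, and
# this estimate deliberately ignores them.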


def tokenizer_ok(model_id_or_path: str) -> Tuple[bool, str]:
    """Check if tokenizer can be loaded and is properly configured"""
    try:
        # trust_remote_code is needed for models that ship custom tokenizer
        # code; only enable it for checkpoints you trust.
        tok = AutoTokenizer.from_pretrained(model_id_or_path, use_fast=True, trust_remote_code=True)
        if tok.pad_token is None:
            # safe default: set pad_token to eos_token
            if getattr(tok, "eos_token", None):
                return True, "No pad_token; will use eos_token for padding."
            return False, "Tokenizer missing pad_token and eos_token."
        return True, "ok"
    except Exception as e:
        return False, f"Tokenizer load failed: {e}"