"""
Model Configuration Registry
Defines metadata for all supported code generation models
"""

from typing import Any, Dict, List, Optional, TypedDict


class ModelConfig(TypedDict):
    """Configuration metadata for a model"""
    hf_path: str
    display_name: str
    architecture: str
    size: str
    num_layers: int
    num_heads: int
    num_kv_heads: Optional[int]  # For GQA models
    vocab_size: int
    context_length: int
    attention_type: str  # "multi_head" or "grouped_query"
    requires_gpu: bool
    min_vram_gb: float
    min_ram_gb: float
    recommended_dtype: str  # "fp16", "bf16", or "fp32"


# Supported models registry
SUPPORTED_MODELS: Dict[str, ModelConfig] = {
    "codegen-350m": {
        "hf_path": "Salesforce/codegen-350M-mono",
        "display_name": "CodeGen 350M",
        "architecture": "gpt_neox",
        "size": "350M",
        "num_layers": 20,
        "num_heads": 16,
        "num_kv_heads": None,  # Standard MHA
        "vocab_size": 51200,
        "context_length": 2048,
        "attention_type": "multi_head",
        "requires_gpu": False,
        "min_vram_gb": 2.0,
        "min_ram_gb": 4.0,
        "recommended_dtype": "fp16"  # fp16 for GPU, fp32 for CPU
    },
    "code-llama-7b": {
        "hf_path": "codellama/CodeLlama-7b-hf",
        "display_name": "Code Llama 7B",
        "architecture": "llama",
        "size": "7B",
        "num_layers": 32,
        "num_heads": 32,
        "num_kv_heads": 32,  # GQA: 32 Q heads, 32 KV heads
        "vocab_size": 32000,
        "context_length": 16384,
        "attention_type": "grouped_query",
        "requires_gpu": True,  # Strongly recommended for usable performance
        "min_vram_gb": 14.0,   # FP16 requires ~14GB VRAM
        "min_ram_gb": 18.0,    # FP16 requires ~18GB RAM for CPU fallback
        "recommended_dtype": "fp16"
    },
    "devstral-small": {
        "hf_path": "mistralai/Devstral-Small-2507",
        "display_name": "Devstral Small 24B",
        "architecture": "mistral",
        "size": "24B",
        "num_layers": 40,
        "num_heads": 32,
        "num_kv_heads": 8,  # GQA: 32 Q heads, 8 KV heads
        "vocab_size": 131072,
        "context_length": 131072,
        "attention_type": "grouped_query",
        "requires_gpu": True,  # BF16 required, GPU strongly recommended
        "min_vram_gb": 48.0,   # BF16 requires ~48GB VRAM
        "min_ram_gb": 96.0,    # BF16 requires ~96GB RAM for CPU fallback
        "recommended_dtype": "bf16"  # Devstral requires bfloat16
    }
}
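

# Rough sanity check for the min_vram_gb figures above: weight memory alone is
# approximately parameter_count * bytes_per_element, so 7B params in fp16 is
# about 14 GB and 24B params in bf16 is about 48 GB. The helper below is an
# illustrative sketch, not part of the registry API; its name and the dtype
# table are assumptions.
_DTYPE_BYTES = {"fp16": 2, "bf16": 2, "fp32": 4}


def estimate_weight_memory_gb(num_params_billions: float, dtype: str) -> float:
    """Approximate GB needed just to hold the model weights in `dtype`."""
    return num_params_billions * _DTYPE_BYTES[dtype]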


def get_model_config(model_id: str) -> Optional[ModelConfig]:
    """
    Get configuration for a specific model

    Args:
        model_id: Model identifier (e.g., "codegen-350m")

    Returns:
        ModelConfig dict or None if model not found
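
    Example (doctest sketch; "unknown-model" is an illustrative id):
        >>> get_model_config("unknown-model") is None
        True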
    """
    return SUPPORTED_MODELS.get(model_id)


def get_available_models(device_type: str = "cpu", available_vram_gb: float = 0) -> List[str]:
    """
    Filter models by hardware constraints

    Args:
        device_type: "cpu", "cuda", or "mps"
        available_vram_gb: Available VRAM in GB (0 skips the VRAM check; pass 0 on CPU)

    Returns:
        List of model IDs that can run on the hardware
    """
    available = []

    for model_id, config in SUPPORTED_MODELS.items():
        # Check if GPU is required but not available
        if config["requires_gpu"] and device_type == "cpu":
            continue

        # Check VRAM requirements
        if device_type in ["cuda", "mps"] and available_vram_gb > 0:
            if available_vram_gb < config["min_vram_gb"]:
                continue

        available.append(model_id)

    return available


def list_all_models() -> List[Dict[str, Any]]:
    """
    List all supported models with their metadata

    Returns:
        List of model info dicts
    """
    models = []
    for model_id, config in SUPPORTED_MODELS.items():
        models.append({
            "id": model_id,
            "name": config["display_name"],
            "size": config["size"],
            "architecture": config["architecture"],
            "attention_type": config["attention_type"],
            "num_layers": config["num_layers"],
            "num_heads": config["num_heads"],
            "requires_gpu": config["requires_gpu"]
        })
    return models
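

# Minimal usage sketch (illustrative only: the hardware values below are
# made-up placeholders, not detected from the running machine).
if __name__ == "__main__":
    for info in list_all_models():
        tier = "GPU" if info["requires_gpu"] else "CPU/GPU"
        print(f"{info['id']}: {info['name']} ({info['size']}, {tier})")

    # Filter for a hypothetical CUDA card with 24 GB of VRAM.
    print("Runnable with 24 GB VRAM:",
          get_available_models(device_type="cuda", available_vram_gb=24.0))

    # get_model_config returns None for unknown ids, so callers should guard.
    assert get_model_config("not-a-model") is None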