"""
Model Configuration Registry
Defines metadata for all supported code generation models
"""

from typing import Any, Dict, List, Optional, TypedDict


class ModelConfig(TypedDict):
    """Configuration metadata for a model"""
    hf_path: str
    display_name: str
    architecture: str
    size: str
    num_layers: int
    num_heads: int
    num_kv_heads: Optional[int]  # For GQA models
    vocab_size: int
    context_length: int
    attention_type: str  # "multi_head" or "grouped_query"
    requires_gpu: bool
    min_vram_gb: float
    min_ram_gb: float
    recommended_dtype: str  # "fp16", "bf16", or "fp32"


# Supported models registry
SUPPORTED_MODELS: Dict[str, ModelConfig] = {
    "codegen-350m": {
        "hf_path": "Salesforce/codegen-350M-mono",
        "display_name": "CodeGen 350M",
        "architecture": "gpt_neox",
        "size": "350M",
        "num_layers": 20,
        "num_heads": 16,
        "num_kv_heads": None,  # Standard MHA
        "vocab_size": 51200,
        "context_length": 2048,
        "attention_type": "multi_head",
        "requires_gpu": False,
        "min_vram_gb": 2.0,
        "min_ram_gb": 4.0,
        "recommended_dtype": "fp16"  # fp16 for GPU, fp32 for CPU
    },
    "code-llama-7b": {
        "hf_path": "codellama/CodeLlama-7b-hf",
        "display_name": "Code Llama 7B",
        "architecture": "llama",
        "size": "7B",
        "num_layers": 32,
        "num_heads": 32,
        "num_kv_heads": 32,  # GQA: 32 Q heads, 32 KV heads
        "vocab_size": 32000,
        "context_length": 16384,
        "attention_type": "grouped_query",
        "requires_gpu": True,  # Strongly recommended for usable performance
        "min_vram_gb": 14.0,   # FP16 requires ~14GB VRAM
        "min_ram_gb": 18.0,    # FP16 requires ~18GB RAM for CPU fallback
        "recommended_dtype": "fp16"
    },
    "devstral-small": {
        "hf_path": "mistralai/Devstral-Small-2507",
        "display_name": "Devstral Small 24B",
        "architecture": "mistral",
        "size": "24B",
        "num_layers": 40,
        "num_heads": 32,
        "num_kv_heads": 8,  # GQA: 32 Q heads, 8 KV heads
        "vocab_size": 131072,
        "context_length": 131072,
        "attention_type": "grouped_query",
        "requires_gpu": True,  # BF16 required, GPU strongly recommended
        "min_vram_gb": 48.0,   # BF16 requires ~48GB VRAM
        "min_ram_gb": 96.0,    # BF16 requires ~96GB RAM for CPU fallback
        "recommended_dtype": "bf16"  # Devstral requires bfloat16
    }
}
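

# Rough sanity check for the min_vram_gb figures above: weight memory alone is
# approximately parameter_count * bytes_per_element, so 7B params in fp16 is
# about 14 GB and 24B params in bf16 is about 48 GB. The helper below is an
# illustrative sketch, not part of the registry API; its name and the dtype
# table are assumptions.
_DTYPE_BYTES = {"fp16": 2, "bf16": 2, "fp32": 4}


def estimate_weight_memory_gb(num_params_billions: float, dtype: str) -> float:
    """Approximate GB needed just to hold the model weights in `dtype`."""
    return num_params_billions * _DTYPE_BYTES[dtype]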


def get_model_config(model_id: str) -> Optional[ModelConfig]:
    """
    Get configuration for a specific model

    Args:
        model_id: Model identifier (e.g., "codegen-350m")

    Returns:
        ModelConfig dict or None if model not found
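
    Example (doctest sketch; "unknown-model" is an illustrative id):
        >>> get_model_config("unknown-model") is None
        True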
    """
    return SUPPORTED_MODELS.get(model_id)


def get_available_models(device_type: str = "cpu", available_vram_gb: float = 0) -> List[str]:
    """
    Filter models by hardware constraints

    Args:
        device_type: "cpu", "cuda", or "mps"
        available_vram_gb: Available VRAM in GB (0 skips the VRAM check; pass 0 on CPU)

    Returns:
        List of model IDs that can run on the hardware
    """
    available = []

    for model_id, config in SUPPORTED_MODELS.items():
        # Check if GPU is required but not available
        if config["requires_gpu"] and device_type == "cpu":
            continue

        # Check VRAM requirements
        if device_type in ["cuda", "mps"] and available_vram_gb > 0:
            if available_vram_gb < config["min_vram_gb"]:
                continue

        available.append(model_id)

    return available


def list_all_models() -> List[Dict[str, Any]]:
    """
    List all supported models with their metadata

    Returns:
        List of model info dicts
    """
    models = []
    for model_id, config in SUPPORTED_MODELS.items():
        models.append({
            "id": model_id,
            "name": config["display_name"],
            "size": config["size"],
            "architecture": config["architecture"],
            "attention_type": config["attention_type"],
            "num_layers": config["num_layers"],
            "num_heads": config["num_heads"],
            "requires_gpu": config["requires_gpu"]
        })
    return models
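

# Minimal usage sketch (illustrative only: the hardware values below are
# made-up placeholders, not detected from the running machine).
if __name__ == "__main__":
    for info in list_all_models():
        tier = "GPU" if info["requires_gpu"] else "CPU/GPU"
        print(f"{info['id']}: {info['name']} ({info['size']}, {tier})")

    # Filter for a hypothetical CUDA card with 24 GB of VRAM.
    print("Runnable with 24 GB VRAM:",
          get_available_models(device_type="cuda", available_vram_gb=24.0))

    # get_model_config returns None for unknown ids, so callers should guard.
    assert get_model_config("not-a-model") is None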