api / backend /model_config.py
gary-boon
Add Code Llama 7B support with hardware-aware filtering and ICL timeout fixes
ed40a9a
raw
history blame
3.47 kB
"""
Model Configuration Registry
Defines metadata for all supported code generation models
"""
from typing import Dict, List, Optional, TypedDict
from dataclasses import dataclass
class ModelConfig(TypedDict):
"""Configuration metadata for a model"""
hf_path: str
display_name: str
architecture: str
size: str
num_layers: int
num_heads: int
num_kv_heads: Optional[int] # For GQA models
vocab_size: int
context_length: int
attention_type: str # "multi_head" or "grouped_query"
requires_gpu: bool
min_vram_gb: float
min_ram_gb: float
# Supported models registry
SUPPORTED_MODELS: Dict[str, ModelConfig] = {
"codegen-350m": {
"hf_path": "Salesforce/codegen-350M-mono",
"display_name": "CodeGen 350M",
"architecture": "gpt_neox",
"size": "350M",
"num_layers": 20,
"num_heads": 16,
"num_kv_heads": None, # Standard MHA
"vocab_size": 51200,
"context_length": 2048,
"attention_type": "multi_head",
"requires_gpu": False,
"min_vram_gb": 2.0,
"min_ram_gb": 4.0
},
"code-llama-7b": {
"hf_path": "codellama/CodeLlama-7b-hf",
"display_name": "Code Llama 7B",
"architecture": "llama",
"size": "7B",
"num_layers": 32,
"num_heads": 32,
"num_kv_heads": 32, # GQA: 32 Q heads, 32 KV heads
"vocab_size": 32000,
"context_length": 16384,
"attention_type": "grouped_query",
"requires_gpu": True, # Strongly recommended for usable performance
"min_vram_gb": 14.0, # FP16 requires ~14GB VRAM
"min_ram_gb": 18.0 # FP16 requires ~18GB RAM for CPU fallback
}
}
def get_model_config(model_id: str) -> Optional[ModelConfig]:
"""
Get configuration for a specific model
Args:
model_id: Model identifier (e.g., "codegen-350m")
Returns:
ModelConfig dict or None if model not found
"""
return SUPPORTED_MODELS.get(model_id)
def get_available_models(device_type: str = "cpu", available_vram_gb: float = 0) -> List[str]:
"""
Filter models by hardware constraints
Args:
device_type: "cpu", "cuda", or "mps"
available_vram_gb: Available VRAM in GB (0 for CPU)
Returns:
List of model IDs that can run on the hardware
"""
available = []
for model_id, config in SUPPORTED_MODELS.items():
# Check if GPU is required but not available
if config["requires_gpu"] and device_type == "cpu":
continue
# Check VRAM requirements
if device_type in ["cuda", "mps"] and available_vram_gb > 0:
if available_vram_gb < config["min_vram_gb"]:
continue
available.append(model_id)
return available
def list_all_models() -> List[Dict[str, any]]:
"""
List all supported models with their metadata
Returns:
List of model info dicts
"""
models = []
for model_id, config in SUPPORTED_MODELS.items():
models.append({
"id": model_id,
"name": config["display_name"],
"size": config["size"],
"architecture": config["architecture"],
"attention_type": config["attention_type"],
"num_layers": config["num_layers"],
"num_heads": config["num_heads"],
"requires_gpu": config["requires_gpu"]
})
return models