Spaces:

visualisable-ai
/

api

Sleeping

api / backend /model_config.py

gary-boon

Add Code Llama 7B support with hardware-aware filtering and ICL timeout fixes

ed40a9a about 1 month ago

3.47 kB

	"""
	Model Configuration Registry
	Defines metadata for all supported code generation models
	"""

	from typing import Dict, List, Optional, TypedDict
	from dataclasses import dataclass


	class ModelConfig(TypedDict):
	"""Configuration metadata for a model"""
	hf_path: str
	display_name: str
	architecture: str
	size: str
	num_layers: int
	num_heads: int
	num_kv_heads: Optional[int] # For GQA models
	vocab_size: int
	context_length: int
	attention_type: str # "multi_head" or "grouped_query"
	requires_gpu: bool
	min_vram_gb: float
	min_ram_gb: float


	# Supported models registry
	SUPPORTED_MODELS: Dict[str, ModelConfig] = {
	"codegen-350m": {
	"hf_path": "Salesforce/codegen-350M-mono",
	"display_name": "CodeGen 350M",
	"architecture": "gpt_neox",
	"size": "350M",
	"num_layers": 20,
	"num_heads": 16,
	"num_kv_heads": None, # Standard MHA
	"vocab_size": 51200,
	"context_length": 2048,
	"attention_type": "multi_head",
	"requires_gpu": False,
	"min_vram_gb": 2.0,
	"min_ram_gb": 4.0
	},
	"code-llama-7b": {
	"hf_path": "codellama/CodeLlama-7b-hf",
	"display_name": "Code Llama 7B",
	"architecture": "llama",
	"size": "7B",
	"num_layers": 32,
	"num_heads": 32,
	"num_kv_heads": 32, # GQA: 32 Q heads, 32 KV heads
	"vocab_size": 32000,
	"context_length": 16384,
	"attention_type": "grouped_query",
	"requires_gpu": True, # Strongly recommended for usable performance
	"min_vram_gb": 14.0, # FP16 requires ~14GB VRAM
	"min_ram_gb": 18.0 # FP16 requires ~18GB RAM for CPU fallback
	}
	}


	def get_model_config(model_id: str) -> Optional[ModelConfig]:
	"""
	Get configuration for a specific model

	Args:
	model_id: Model identifier (e.g., "codegen-350m")

	Returns:
	ModelConfig dict or None if model not found
	"""
	return SUPPORTED_MODELS.get(model_id)


	def get_available_models(device_type: str = "cpu", available_vram_gb: float = 0) -> List[str]:
	"""
	Filter models by hardware constraints

	Args:
	device_type: "cpu", "cuda", or "mps"
	available_vram_gb: Available VRAM in GB (0 for CPU)

	Returns:
	List of model IDs that can run on the hardware
	"""
	available = []

	for model_id, config in SUPPORTED_MODELS.items():
	# Check if GPU is required but not available
	if config["requires_gpu"] and device_type == "cpu":
	continue

	# Check VRAM requirements
	if device_type in ["cuda", "mps"] and available_vram_gb > 0:
	if available_vram_gb < config["min_vram_gb"]:
	continue

	available.append(model_id)

	return available


	def list_all_models() -> List[Dict[str, any]]:
	"""
	List all supported models with their metadata

	Returns:
	List of model info dicts
	"""
	models = []
	for model_id, config in SUPPORTED_MODELS.items():
	models.append({
	"id": model_id,
	"name": config["display_name"],
	"size": config["size"],
	"architecture": config["architecture"],
	"attention_type": config["attention_type"],
	"num_layers": config["num_layers"],
	"num_heads": config["num_heads"],
	"requires_gpu": config["requires_gpu"]
	})
	return models