Spaces:
Build error
Build error
Multi-stage Docker build: stage 1 compiles llama-cpp-python once, and stage 2 reuses the compiled wheels, so the build no longer times out. The first build takes 8–12 minutes; subsequent builds are served from cache.
9d2777a | import os | |
| from llama_cpp import Llama | |
| import requests | |
| from typing import Generator | |
class ModelManager:
    """Download, cache, and run small GGUF chat models via llama-cpp-python.

    Models are declared in ``model_configs`` (HF repo, filename, direct URL,
    and prompt format). GGUF files are cached under ``./models`` and loaded
    lazily; only the models listed in ``critical_models`` are fetched eagerly
    at construction time.
    """

    def __init__(self):
        # Cache of loaded Llama instances, keyed by model id.
        self.models = {}
        self.model_configs = {
            "fast-chat": {
                "repo": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
                "file": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
                "url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf",
                "format": "chatml"
            },
            "tinyllama": {
                "repo": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                "file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
                "url": "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
                "format": "tinyllama"
            },
            "coder": {
                "repo": "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
                "file": "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
                "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
                "format": "chatml"
            }
        }
        self.models_dir = os.path.join(os.getcwd(), "models")
        os.makedirs(self.models_dir, exist_ok=True)
        # Lightweight models fetched at startup so the first request is fast.
        self.critical_models = ["fast-chat"]
        self.auto_download_critical()

    def auto_download_critical(self):
        """Download only critical lightweight models at startup.

        Failures are logged and swallowed on purpose: startup must not crash
        just because one eager download failed; the model will be retried
        lazily on first use via load_model().
        """
        print("Checking for pre-downloaded models...")
        for model_id in self.critical_models:
            try:
                self.download_model(model_id)
                print(f"✓ {model_id} ready")
            except Exception as e:
                print(f"✗ Failed to ensure {model_id}: {e}")

    def download_model(self, model_id: str) -> str:
        """Ensure the GGUF file for *model_id* is on disk; return its path.

        Raises ValueError for an unknown model id, and propagates any
        requests/OS error from the download itself.
        """
        config = self.model_configs.get(model_id)
        if not config:
            raise ValueError(f"Model {model_id} not configured")
        target_path = os.path.join(self.models_dir, config["file"])
        # An existing file above ~50 MB is treated as a completed download.
        # Completed files only ever appear via the atomic rename below, so
        # this check can no longer be fooled by an interrupted transfer.
        if os.path.exists(target_path) and os.path.getsize(target_path) > 50000000:
            return target_path
        print(f"Downloading {model_id}...")
        # Stream to a temp ".part" file first, then atomically rename into
        # place, so a killed process never leaves a truncated file at
        # target_path that a later run would mistake for a complete model.
        part_path = target_path + ".part"
        try:
            # Close the streamed connection deterministically via `with`.
            with requests.get(config["url"], stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(part_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
            os.replace(part_path, target_path)
            print(f"✓ {model_id} downloaded")
            return target_path
        except Exception:
            # Clean up the partial file; bare `raise` keeps the original
            # traceback (unlike the previous `raise e`).
            if os.path.exists(part_path):
                os.remove(part_path)
            raise

    def load_model(self, model_id: str):
        """Return a cached Llama instance for *model_id*, loading it on first use."""
        if model_id in self.models:
            return self.models[model_id]
        path = self.download_model(model_id)
        self.models[model_id] = Llama(
            model_path=path,
            n_ctx=1024,      # small context keeps memory low on CPU Spaces
            n_threads=2,
            verbose=False
        )
        return self.models[model_id]

    def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
        """Render (full_prompt, stop_tokens) in the model's chat template.

        *history* is a list of {"role", "content"} dicts; any role other
        than "user" is treated as "assistant". Unknown formats fall back to
        the raw prompt with a generic stop token.
        """
        fmt = self.model_configs[model_id]["format"]
        if fmt == "chatml":
            full = f"<|im_start|>system\n{system}<|im_end|>\n"
            for msg in history:
                role = "user" if msg["role"] == "user" else "assistant"
                full += f"<|im_start|>{role}\n{msg['content']}<|im_end|>\n"
            full += f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
            return full, ["<|im_end|>", "###", "<|im_start|>", "</s>"]
        elif fmt == "tinyllama":
            full = f"<|system|>\n{system}</s>\n"
            for msg in history:
                role = "user" if msg["role"] == "user" else "assistant"
                full += f"<|{role}|>\n{msg['content']}</s>\n"
            full += f"<|user|>\n{prompt}</s>\n<|assistant|>\n"
            return full, ["</s>", "<|user|>", "<|assistant|>"]
        return prompt, ["</s>"]

    def generate_stream(self, model_id: str, prompt: str, context: list = None, **kwargs) -> Generator[str, None, None]:
        """Yield completion tokens for *prompt*, streaming from the model.

        *context* is an optional chat history (see format_prompt). Supported
        kwargs: max_tokens, temperature, top_p.
        """
        llm = self.load_model(model_id)
        system_text = (
            "You are a helpful AI assistant. "
            "For math, use LaTeX with $ $ for display and \\( \\) for inline."
        )
        full_prompt, stop_tokens = self.format_prompt(model_id, system_text, context or [], prompt)
        params = {
            "max_tokens": kwargs.get("max_tokens", 512),
            "stop": stop_tokens,
            "stream": True,
            "temperature": kwargs.get("temperature", 0.7),
            "top_p": kwargs.get("top_p", 0.95)
        }
        for output in llm(full_prompt, **params):
            yield output["choices"][0]["text"]

    def cleanup(self):
        """Release loaded models and clear the cache."""
        for model in self.models.values():
            # Llama exposes close() in recent llama-cpp-python; guard for
            # older versions that free resources in __del__ instead.
            if hasattr(model, 'close'):
                model.close()
        self.models.clear()