# alpha-core-ai / model_manager.py
# Multi-stage Docker build: Stage 1 compiles llama-cpp-python once,
# Stage 2 reuses compiled wheels (no timeout). First build takes
# 8-12 minutes, then cached. (commit 9d2777a, Sabithulla)
import os
from llama_cpp import Llama
import requests
from typing import Generator
class ModelManager:
    """Download, load, and prompt local GGUF chat models.

    Model files are fetched on demand from Hugging Face into ./models and
    loaded lazily via llama-cpp-python. Lightweight "critical" models are
    ensured at construction time; download failures there are logged but
    do not prevent startup.
    """

    # A complete GGUF file for every configured model is far larger than
    # this; anything smaller on disk is treated as a broken download.
    MIN_MODEL_BYTES = 50000000

    def __init__(self):
        # model_id -> loaded Llama instance (lazy cache, see load_model).
        self.models = {}
        # Static registry: Hugging Face repo, file name, direct download
        # URL, and the prompt template family each model expects.
        self.model_configs = {
            "fast-chat": {
                "repo": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
                "file": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
                "url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf",
                "format": "chatml"
            },
            "tinyllama": {
                "repo": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                "file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
                "url": "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
                "format": "tinyllama"
            },
            "coder": {
                "repo": "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
                "file": "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
                "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
                "format": "chatml"
            }
        }
        self.models_dir = os.path.join(os.getcwd(), "models")
        os.makedirs(self.models_dir, exist_ok=True)
        # Only the smallest model is fetched eagerly at startup.
        self.critical_models = ["fast-chat"]
        self.auto_download_critical()

    def auto_download_critical(self):
        """Download only critical lightweight models at startup.

        Failures are logged rather than raised so the service can still
        start (e.g. offline) and retry on first use via load_model.
        """
        print("Checking for pre-downloaded models...")
        for model_id in self.critical_models:
            try:
                self.download_model(model_id)
                print(f"✓ {model_id} ready")
            except Exception as e:
                print(f"✗ Failed to ensure {model_id}: {e}")

    def download_model(self, model_id: str) -> str:
        """Ensure the GGUF file for *model_id* is on disk; return its path.

        The file is streamed to a temporary ".part" sibling and atomically
        renamed into place with os.replace, so a download interrupted
        without an exception (e.g. the process is killed) can never leave
        a partial file that passes the size check on the next startup.

        Raises:
            ValueError: if *model_id* is not in self.model_configs.
            requests.RequestException: on network or HTTP errors.
        """
        config = self.model_configs.get(model_id)
        if not config:
            raise ValueError(f"Model {model_id} not configured")
        target_path = os.path.join(self.models_dir, config["file"])
        # Size heuristic guards against previously truncated downloads.
        if os.path.exists(target_path) and os.path.getsize(target_path) > self.MIN_MODEL_BYTES:
            return target_path
        print(f"Downloading {model_id}...")
        tmp_path = target_path + ".part"
        try:
            response = requests.get(config["url"], stream=True, timeout=60)
            response.raise_for_status()
            with open(tmp_path, "wb") as f:
                # Stream in 1 MiB chunks to keep memory flat for GB files.
                for chunk in response.iter_content(chunk_size=1024*1024):
                    if chunk:
                        f.write(chunk)
            os.replace(tmp_path, target_path)  # atomic on POSIX and Windows
            print(f"✓ {model_id} downloaded")
            return target_path
        except Exception:
            # Drop the partial temp file so the size check stays honest.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

    def load_model(self, model_id: str):
        """Return a cached Llama instance for *model_id*, loading on first use."""
        if model_id in self.models:
            return self.models[model_id]
        path = self.download_model(model_id)
        self.models[model_id] = Llama(
            model_path=path,
            n_ctx=1024,      # modest context keeps memory low on small hosts
            n_threads=2,
            verbose=False
        )
        return self.models[model_id]

    def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
        """Build the full prompt string plus stop tokens for *model_id*.

        *history* is a list of {"role": ..., "content": ...} dicts; any role
        other than "user" is rendered as "assistant". Returns a tuple
        (prompt_text, stop_token_list). Unknown formats fall back to the
        raw prompt with a generic stop token.
        """
        fmt = self.model_configs[model_id]["format"]
        if fmt == "chatml":
            full = f"<|im_start|>system\n{system}<|im_end|>\n"
            for msg in history:
                role = "user" if msg["role"] == "user" else "assistant"
                full += f"<|im_start|>{role}\n{msg['content']}<|im_end|>\n"
            full += f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
            return full, ["<|im_end|>", "###", "<|im_start|>", "</s>"]
        elif fmt == "tinyllama":
            full = f"<|system|>\n{system}</s>\n"
            for msg in history:
                role = "user" if msg["role"] == "user" else "assistant"
                full += f"<|{role}|>\n{msg['content']}</s>\n"
            full += f"<|user|>\n{prompt}</s>\n<|assistant|>\n"
            return full, ["</s>", "<|user|>", "<|assistant|>"]
        return prompt, ["</s>"]

    def generate_stream(self, model_id: str, prompt: str, context: list = None, **kwargs) -> Generator[str, None, None]:
        """Yield completion tokens for *prompt* as they are generated.

        *context* is prior chat history (see format_prompt); **kwargs may
        override max_tokens, temperature, and top_p.
        """
        llm = self.load_model(model_id)
        system_text = (
            "You are a helpful AI assistant. "
            "For math, use LaTeX with $ $ for display and \\( \\) for inline."
        )
        full_prompt, stop_tokens = self.format_prompt(model_id, system_text, context or [], prompt)
        params = {
            "max_tokens": kwargs.get("max_tokens", 512),
            "stop": stop_tokens,
            "stream": True,
            "temperature": kwargs.get("temperature", 0.7),
            "top_p": kwargs.get("top_p", 0.95)
        }
        for output in llm(full_prompt, **params):
            token = output["choices"][0]["text"]
            yield token

    def cleanup(self):
        """Release all loaded models and clear the cache."""
        for model in self.models.values():
            # Newer llama-cpp-python exposes close(); older versions don't.
            if hasattr(model, 'close'):
                model.close()
        self.models.clear()