import os
from typing import Generator

import requests
from llama_cpp import Llama


class ModelManager:
    """Download, cache, and run local GGUF chat models via llama.cpp.

    Models are stored under ``./models``; the "critical" lightweight models
    are fetched eagerly at construction time so first-token latency is low.
    """

    def __init__(self):
        # model_id -> loaded Llama instance (lazy-populated by load_model)
        self.models = {}
        self.model_configs = {
            "fast-chat": {
                "repo": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
                "file": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
                "url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf",
                "format": "chatml"
            },
            "tinyllama": {
                "repo": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                "file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
                "url": "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
                "format": "tinyllama"
            },
            "coder": {
                "repo": "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
                "file": "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
                "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
                "format": "chatml"
            }
        }
        self.models_dir = os.path.join(os.getcwd(), "models")
        os.makedirs(self.models_dir, exist_ok=True)
        self.critical_models = ["fast-chat"]
        self.auto_download_critical()

    def auto_download_critical(self):
        """Download only critical lightweight models at startup."""
        print("Checking for pre-downloaded models...")
        for model_id in self.critical_models:
            try:
                self.download_model(model_id)
                print(f"✓ {model_id} ready")
            except Exception as e:
                # Best-effort at startup: report and continue so one bad
                # download does not prevent the manager from constructing.
                print(f"✗ Failed to ensure {model_id}: {e}")

    def download_model(self, model_id: str):
        """Ensure the GGUF file for *model_id* exists on disk; return its path.

        Raises ValueError for an unconfigured id. Network/HTTP errors are
        re-raised after any partial download is removed.
        """
        config = self.model_configs.get(model_id)
        if not config:
            raise ValueError(f"Model {model_id} not configured")
        target_path = os.path.join(self.models_dir, config["file"])
        # Heuristic completeness check: anything over ~50 MB is assumed to be
        # a finished download of one of these quantized models.
        if os.path.exists(target_path) and os.path.getsize(target_path) > 50000000:
            return target_path
        print(f"Downloading {model_id}...")
        # Stream into a .part temp file and atomically replace, so a crash
        # mid-download can never leave a truncated file at target_path that
        # would pass the size check above on the next run.
        tmp_path = target_path + ".part"
        try:
            with requests.get(config["url"], stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(tmp_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)
            os.replace(tmp_path, target_path)
            print(f"✓ {model_id} downloaded")
            return target_path
        except Exception:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise  # bare raise preserves the original traceback

    def load_model(self, model_id: str):
        """Return a cached Llama instance for *model_id*, loading on first use."""
        if model_id in self.models:
            return self.models[model_id]
        path = self.download_model(model_id)
        self.models[model_id] = Llama(
            model_path=path,
            n_ctx=1024,
            n_threads=2,
            verbose=False
        )
        return self.models[model_id]

    def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
        """Render (system, history, prompt) into the model's chat template.

        Returns a (full_prompt, stop_tokens) tuple. History entries are dicts
        with "role" and "content" keys; any role other than "user" is treated
        as the assistant.
        """
        fmt = self.model_configs[model_id]["format"]
        if fmt == "chatml":
            full = f"<|im_start|>system\n{system}<|im_end|>\n"
            for msg in history:
                role = "user" if msg["role"] == "user" else "assistant"
                full += f"<|im_start|>{role}\n{msg['content']}<|im_end|>\n"
            full += f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
            # NOTE(review): the original stop lists contained "" — an empty
            # stop string is invalid/matches immediately and would kill
            # generation. "</s>" is the EOS string these chat formats emit;
            # confirm against the model's tokenizer config.
            return full, ["<|im_end|>", "###", "<|im_start|>", "</s>"]
        elif fmt == "tinyllama":
            full = f"<|system|>\n{system}\n"
            for msg in history:
                role = "user" if msg["role"] == "user" else "assistant"
                full += f"<|{role}|>\n{msg['content']}\n"
            full += f"<|user|>\n{prompt}\n<|assistant|>\n"
            return full, ["</s>", "<|user|>", "<|assistant|>"]
        # Unknown format: pass the prompt through raw and stop only on EOS.
        return prompt, ["</s>"]

    def generate_stream(self, model_id: str, prompt: str, context: list = None, **kwargs) -> Generator[str, None, None]:
        """Yield completion text chunks for *prompt* from model *model_id*.

        *context* is an optional chat history (list of role/content dicts).
        Supported kwargs: max_tokens, temperature, top_p.
        """
        llm = self.load_model(model_id)
        system_text = (
            "You are a helpful AI assistant. "
            "For math, use LaTeX with $ $ for display and \\( \\) for inline."
        )
        full_prompt, stop_tokens = self.format_prompt(model_id, system_text, context or [], prompt)
        params = {
            "max_tokens": kwargs.get("max_tokens", 512),
            "stop": stop_tokens,
            "stream": True,
            "temperature": kwargs.get("temperature", 0.7),
            "top_p": kwargs.get("top_p", 0.95)
        }
        for output in llm(full_prompt, **params):
            yield output["choices"][0]["text"]

    def cleanup(self):
        """Release all loaded models and clear the cache."""
        for model in self.models.values():
            # Llama instances may or may not expose close(); guard for both.
            if hasattr(model, 'close'):
                model.close()
        self.models.clear()