| import asyncio |
|
|
| from huggingface_hub import hf_hub_download |
| from llama_cpp import Llama |
|
|
| from src.core.config import settings |
|
|
|
|
class ModelEngine:
    """Load a llama.cpp chat model from the Hugging Face Hub and serve completions.

    Attributes:
        llm: The loaded ``Llama`` instance, or ``None`` if loading failed.
        load_error: The exception raised by the most recent failed load, or
            ``None`` if the last load attempt succeeded.
        lock: An ``asyncio.Lock``. It is not acquired anywhere in this class;
            presumably callers hold it to serialize generation -- confirm
            against the call sites.
    """

    def __init__(self) -> None:
        """Create the engine and try to load the model immediately."""
        self.llm = None
        self.load_error = None
        self.lock = asyncio.Lock()
        self._load_model()

    def _load_model(self) -> None:
        """Download the model file and construct the ``Llama`` instance.

        Loading is best-effort: any failure is printed and remembered in
        ``load_error`` instead of being raised, so the object can still be
        constructed. ``generate`` later surfaces the failure.
        """
        try:
            model_path = hf_hub_download(
                repo_id=settings.REPO_ID, filename=settings.FILENAME
            )
            self.llm = Llama(
                model_path=model_path,
                n_ctx=settings.CONTEXT_SIZE,
                n_threads=settings.N_THREADS,
                n_gpu_layers=settings.N_GPU_LAYERS,
                verbose=True,
            )
        except Exception as e:
            # Keep the exception itself so generate() can chain it as the
            # cause instead of reporting only "Model not loaded".
            self.load_error = e
            print(f"Error loading model: {e}")
        else:
            self.load_error = None

    def generate(self, messages, max_tokens, temperature, stream=True):
        """Run a chat completion on the loaded model.

        Args:
            messages: Chat messages, passed through unchanged to
                ``Llama.create_chat_completion``.
            max_tokens: Token limit; coerced with ``int()``.
            temperature: Sampling temperature; coerced with ``float()``.
            stream: Passed through as the ``stream`` argument.

        Returns:
            Whatever ``Llama.create_chat_completion`` returns for the given
            ``stream`` setting.

        Raises:
            RuntimeError: If no model is loaded. When the load failed, the
                original exception is attached as ``__cause__``.
        """
        if self.llm is None:
            raise RuntimeError("Model not loaded") from self.load_error
        return self.llm.create_chat_completion(
            messages=messages,
            max_tokens=int(max_tokens),
            temperature=float(temperature),
            stream=stream,
        )
|
|
|
|
| |
# Module-level engine instance. Constructing it here runs ModelEngine.__init__,
# which calls _load_model (Hub download + Llama construction), as soon as this
# module is imported.
| engine = ModelEngine() |
|
|