| """ |
| LLM Engine β local Qwen2.5-0.5B-Instruct fallback via llama-cpp-python. |
| |
| This is the bottom layer of the AnveshAI hierarchy: |
| |
| Math β math_engine (instant, rule-based) |
| Knowledge β knowledge_engine (keyword retrieval from knowledge.txt) |
| ββ no match β LLMEngine.generate (Qwen2.5-0.5B) |
| Conversation β conversation_engine (pattern matching from conversation.txt) |
| ββ no match β LLMEngine.generate (Qwen2.5-0.5B) |
| |
| Model: Qwen/Qwen2.5-0.5B-Instruct (Q4_K_M GGUF, ~350 MB) |
| β Best-in-class quality at 0.5B parameters |
| β Runs entirely on CPU via llama.cpp |
| β Downloaded once into ~/.cache/huggingface/ on first use |
| β Loaded LAZILY: the model only loads when first needed, |
| keeping startup instant. |
| """ |
|
|
| MODEL_REPO = "Qwen/Qwen2.5-0.5B-Instruct-GGUF" |
| MODEL_FILE = "qwen2.5-0.5b-instruct-q4_k_m.gguf" |
|
|
| SYSTEM_PROMPT = ( |
| "You are AnveshAI Edge, a helpful offline AI assistant. " |
| "Answer questions thoroughly and completely. Show full working steps " |
| "for math or technical questions. Do not repeat the question back. " |
| "If you are unsure about something, say so clearly." |
| ) |
|
|
| MATH_SYSTEM_PROMPT = ( |
| "You are a mathematics tutor. " |
| "You will be given a VERIFIED ANSWER computed by a symbolic engine. " |
| "That answer is 100% correct β do NOT change it, do NOT recompute it. " |
| "Your ONLY job is to explain, step by step, HOW a student would work through " |
| "the problem and arrive at that exact answer. " |
| "Every step must lead logically toward the verified answer. " |
| "State the verified answer word-for-word at the end of your explanation." |
| ) |
|
|
| MAX_TOKENS = 1024 |
| TEMPERATURE = 0.7 |
| MATH_TEMPERATURE = 0.1 |
| TOP_P = 0.9 |
| N_CTX = 16384 |
|
|
|
|
class LLMEngine:
    """
    Lazy-loading wrapper around Qwen2.5-0.5B-Instruct (GGUF via llama.cpp).

    Usage:
        engine = LLMEngine()
        response = engine.generate("What is photosynthesis?")

    The GGUF model is downloaded from HuggingFace on the first call to
    generate() and cached locally.  Every subsequent call reuses the
    in-memory model — no re-loading.  A load failure is recorded once and
    never retried for the process lifetime; generate() then degrades to a
    readable "unavailable" reply instead of raising.
    """

    def __init__(self, n_threads: int = 4) -> None:
        """
        Prepare lazy state only — nothing heavy happens here.

        Args:
            n_threads: CPU threads llama.cpp may use for inference.
                       Optional; defaults to the previously hard-coded 4.
        """
        self._llm = None               # llama_cpp.Llama instance once loaded
        self._loaded: bool = False     # True after a successful load
        self._failed: bool = False     # True after a failed load (permanent)
        self._fail_reason: str = ""    # human-readable cause of the failure
        self._n_threads: int = n_threads

    def is_available(self) -> bool:
        """True once the model has loaded without error."""
        return self._loaded and not self._failed

    def _load(self) -> None:
        """Download (first run only) and load the GGUF model into memory.

        Idempotent: returns immediately if a load already succeeded or
        already failed.
        """
        if self._loaded or self._failed:
            return

        try:
            print(
                f"\n [LLM] Loading {MODEL_FILE} … "
                "(first run downloads ~350 MB, then cached locally)",
                flush=True,
            )

            # Imported lazily so the rest of the app keeps working even
            # when llama-cpp-python is not installed.
            from llama_cpp import Llama

            self._llm = Llama.from_pretrained(
                repo_id=MODEL_REPO,
                filename=MODEL_FILE,
                n_ctx=N_CTX,
                n_threads=self._n_threads,
                verbose=False,
            )

            self._loaded = True
            print(" [LLM] Qwen2.5-0.5B-Instruct ready\n", flush=True)

        except Exception as exc:
            # Broad on purpose: ImportError, download failures, corrupt
            # GGUF, OOM — all must degrade to a clear "unavailable" answer
            # rather than crash the caller.
            self._failed = True
            self._fail_reason = str(exc)
            print(f" [LLM] Failed to load: {exc}\n", flush=True)

    def generate(
        self,
        user_input: str,
        context: str = "",
        system_prompt: str = "",
        temperature: "float | None" = None,
    ) -> str:
        """
        Generate a response using the local LLM.

        Args:
            user_input    : The user's message or question.
            context       : Optional retrieved text to inject as background.
            system_prompt : Override the default system prompt (e.g. for math).
            temperature   : Sampling temperature; use low values for math.
                            None (the default) means module-level TEMPERATURE.

        Returns:
            The model's reply as a plain string, or a human-readable error
            message if the model is unavailable or generation fails.
        """
        self._load()

        if self._failed:
            return (
                "The local LLM is currently unavailable "
                f"({self._fail_reason}). "
                "Ensure 'llama-cpp-python' is installed and the model "
                "could be downloaded."
            )

        try:
            # Late-bind the default so the sentinel None always picks up
            # the module-level TEMPERATURE at call time.
            if temperature is None:
                temperature = TEMPERATURE

            system_content = system_prompt if system_prompt else SYSTEM_PROMPT
            if context:
                system_content += f"\n\nRelevant background:\n{context}"

            messages = [
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_input},
            ]

            output = self._llm.create_chat_completion(
                messages=messages,
                max_tokens=MAX_TOKENS,
                temperature=temperature,
                top_p=TOP_P,
            )

            # llama.cpp may report None content for an empty generation;
            # normalise to "" so .strip() cannot raise.
            response: str = output["choices"][0]["message"]["content"] or ""
            return response.strip()

        except Exception as exc:
            # Last-resort guard: inference errors become a readable reply
            # rather than an exception that kills the caller.
            return f"LLM generation error: {exc}"