"""
LLM Engine — local Qwen2.5-0.5B-Instruct fallback via llama-cpp-python.

This is the bottom layer of the AnveshAI hierarchy:

    Math         → math_engine (instant, rule-based)
    Knowledge    → knowledge_engine (keyword retrieval from knowledge.txt)
                   └─ no match → LLMEngine.generate (Qwen2.5-0.5B)
    Conversation → conversation_engine (pattern matching from conversation.txt)
                   └─ no match → LLMEngine.generate (Qwen2.5-0.5B)

Model: Qwen/Qwen2.5-0.5B-Instruct (Q4_K_M GGUF, ~350 MB)
  • Best-in-class quality at 0.5B parameters
  • Runs entirely on CPU via llama.cpp
  • Downloaded once into ~/.cache/huggingface/ on first use
  • Loaded LAZILY: the model only loads when first needed,
    keeping startup instant.
"""
# HuggingFace repo and quantized GGUF file downloaded on first use.
MODEL_REPO = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
MODEL_FILE = "qwen2.5-0.5b-instruct-q4_k_m.gguf"

# Default system prompt for general (non-math) questions.
SYSTEM_PROMPT = (
    "You are AnveshAI Edge, a helpful offline AI assistant. "
    "Answer questions thoroughly and completely. Show full working steps "
    "for math or technical questions. Do not repeat the question back. "
    "If you are unsure about something, say so clearly."
)

# System prompt used when the symbolic math engine has already computed a
# verified answer and the LLM only has to explain the steps toward it.
MATH_SYSTEM_PROMPT = (
    "You are a mathematics tutor. "
    "You will be given a VERIFIED ANSWER computed by a symbolic engine. "
    "That answer is 100% correct — do NOT change it, do NOT recompute it. "
    "Your ONLY job is to explain, step by step, HOW a student would work through "
    "the problem and arrive at that exact answer. "
    "Every step must lead logically toward the verified answer. "
    "State the verified answer word-for-word at the end of your explanation."
)

# Sampling / context parameters.
MAX_TOKENS = 1024       # enough for detailed explanations and step-by-step answers
TEMPERATURE = 0.7       # default sampling temperature for general chat
MATH_TEMPERATURE = 0.1  # near-deterministic for math explanations
TOP_P = 0.9             # nucleus-sampling cutoff
N_CTX = 16384           # match model's trained context (supports up to 32768)
class LLMEngine:
    """
    Lazy-loading wrapper around Qwen2.5-0.5B-Instruct (GGUF via llama.cpp).

    Usage:
        engine = LLMEngine()
        response = engine.generate("What is photosynthesis?")

    The GGUF model is downloaded from HuggingFace on the first call to
    generate() and cached locally. Every subsequent call reuses the
    in-memory model — no re-loading.
    """

    def __init__(self) -> None:
        # llama_cpp.Llama instance once loaded; None until _load() succeeds.
        self._llm = None
        self._loaded: bool = False      # model is in memory and usable
        self._failed: bool = False      # a load attempt failed; never retried
        self._fail_reason: str = ""     # human-readable cause of the failure

    def is_available(self) -> bool:
        """True once the model has loaded without error."""
        return self._loaded and not self._failed

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _load(self) -> None:
        """Download (first run only) and load the GGUF model into memory.

        Idempotent: returns immediately when a previous attempt already
        succeeded or failed (failures are permanent for this process).
        """
        if self._loaded or self._failed:
            return
        try:
            print(
                f"\n [LLM] Loading {MODEL_FILE} … "
                "(first run downloads ~350 MB, then cached locally)",
                flush=True,
            )
            # Imported lazily so the rest of the app still works when
            # llama-cpp-python is not installed; the ImportError is caught
            # below and reported through _fail_reason.
            from llama_cpp import Llama

            self._llm = Llama.from_pretrained(
                repo_id=MODEL_REPO,
                filename=MODEL_FILE,
                n_ctx=N_CTX,
                n_threads=4,  # use up to 4 CPU threads
                verbose=False,
            )
            self._loaded = True
            print(" [LLM] Qwen2.5-0.5B-Instruct ready\n", flush=True)
        except Exception as exc:
            # Any failure (missing package, download error, corrupt file)
            # marks the engine unavailable rather than crashing the caller.
            self._failed = True
            self._fail_reason = str(exc)
            print(f" [LLM] Failed to load: {exc}\n", flush=True)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def generate(
        self,
        user_input: str,
        context: str = "",
        system_prompt: str = "",
        temperature: float = TEMPERATURE,
    ) -> str:
        """
        Generate a response using the local LLM.

        Args:
            user_input   : The user's message or question.
            context      : Optional retrieved text to inject as background.
            system_prompt: Override the default system prompt (e.g. for math).
            temperature  : Sampling temperature; use low values for math.

        Returns:
            The model's reply as a plain string; a diagnostic message when
            the model could not be loaded or generation raised.
        """
        self._load()
        if self._failed:
            return (
                "The local LLM is currently unavailable "
                f"({self._fail_reason}). "
                "Ensure 'llama-cpp-python' is installed and the model "
                "could be downloaded."
            )
        try:
            system_content = system_prompt if system_prompt else SYSTEM_PROMPT
            if context:
                # Retrieved background rides along in the system message so
                # the user turn stays exactly what the user typed.
                system_content += f"\n\nRelevant background:\n{context}"
            messages = [
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_input},
            ]
            output = self._llm.create_chat_completion(
                messages=messages,
                max_tokens=MAX_TOKENS,
                temperature=temperature,
                top_p=TOP_P,
            )
            response: str = output["choices"][0]["message"]["content"]
            return response.strip()
        except Exception as exc:
            # Surface the error as text rather than propagating — callers
            # treat any string return as a displayable answer.
            return f"LLM generation error: {exc}"