File size: 5,593 Bytes
5d8fd4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
"""
LLM Engine β€” local Qwen2.5-0.5B-Instruct fallback via llama-cpp-python.

This is the bottom layer of the AnveshAI hierarchy:

    Math          β†’  math_engine         (instant, rule-based)
    Knowledge     β†’  knowledge_engine    (keyword retrieval from knowledge.txt)
      └─ no match β†’  LLMEngine.generate  (Qwen2.5-0.5B)
    Conversation  β†’  conversation_engine (pattern matching from conversation.txt)
      └─ no match β†’  LLMEngine.generate  (Qwen2.5-0.5B)

Model: Qwen/Qwen2.5-0.5B-Instruct  (Q4_K_M GGUF, ~350 MB)
    ─ Best-in-class quality at 0.5B parameters
    ─ Runs entirely on CPU via llama.cpp
    ─ Downloaded once into ~/.cache/huggingface/ on first use
    ─ Loaded LAZILY: the model only loads when first needed,
      keeping startup instant.
"""

# ---------------------------------------------------------------------------
# Model identity: HuggingFace repo and the exact GGUF file pulled from it.
# ---------------------------------------------------------------------------
MODEL_REPO   = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
MODEL_FILE   = "qwen2.5-0.5b-instruct-q4_k_m.gguf"

# Default system prompt for general (non-math) generation.
SYSTEM_PROMPT = (
    "You are AnveshAI Edge, a helpful offline AI assistant. "
    "Answer questions thoroughly and completely. Show full working steps "
    "for math or technical questions. Do not repeat the question back. "
    "If you are unsure about something, say so clearly."
)

# System prompt used when a symbolic math engine has already computed the
# answer: the LLM only narrates the steps, it must not recompute the result.
MATH_SYSTEM_PROMPT = (
    "You are a mathematics tutor. "
    "You will be given a VERIFIED ANSWER computed by a symbolic engine. "
    "That answer is 100% correct β€” do NOT change it, do NOT recompute it. "
    "Your ONLY job is to explain, step by step, HOW a student would work through "
    "the problem and arrive at that exact answer. "
    "Every step must lead logically toward the verified answer. "
    "State the verified answer word-for-word at the end of your explanation."
)

# ---------------------------------------------------------------------------
# Sampling / context defaults.
# ---------------------------------------------------------------------------
MAX_TOKENS  = 1024   # enough for detailed explanations and step-by-step answers
TEMPERATURE = 0.7
MATH_TEMPERATURE = 0.1   # near-deterministic for math explanations
TOP_P       = 0.9
N_CTX       = 16384  # match model's trained context (supports up to 32768)


class LLMEngine:
    """
    Lazy-loading wrapper around Qwen2.5-0.5B-Instruct (GGUF via llama.cpp).

    Usage:
        engine = LLMEngine()
        response = engine.generate("What is photosynthesis?")

    The GGUF model is downloaded from HuggingFace on the first call to
    generate() and cached locally. Every subsequent call reuses the
    in-memory model — no re-loading. A load failure is sticky: it is
    reported once and generate() then degrades to a plain error message
    instead of retrying (and re-failing) on every call.
    """

    def __init__(self, n_threads: int = 4) -> None:
        """
        Args:
            n_threads : CPU threads llama.cpp may use (default 4, matching
                        the previous hard-coded value).
        """
        self._llm = None                    # Llama instance once loaded
        self._n_threads: int = n_threads    # forwarded to Llama.from_pretrained
        self._loaded: bool = False          # True after a successful load
        self._failed: bool = False          # True after a load error (sticky)
        self._fail_reason: str = ""         # human-readable cause of the failure

    def is_available(self) -> bool:
        """True once the model has loaded without error."""
        return self._loaded and not self._failed

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _load(self) -> None:
        """Download (first run only) and load the GGUF model into memory.

        Idempotent: returns immediately if the model is already loaded or
        a previous attempt failed, so the expensive path runs at most once.
        """
        if self._loaded or self._failed:
            return

        try:
            print(
                f"\n  [LLM] Loading {MODEL_FILE} … "
                "(first run downloads ~350 MB, then cached locally)",
                flush=True,
            )

            # Imported lazily so the rest of the application still works
            # when llama-cpp-python is not installed; the ImportError is
            # caught below like any other load failure.
            from llama_cpp import Llama

            self._llm = Llama.from_pretrained(
                repo_id=MODEL_REPO,
                filename=MODEL_FILE,
                n_ctx=N_CTX,
                n_threads=self._n_threads,
                verbose=False,
            )

            self._loaded = True
            print("  [LLM] Qwen2.5-0.5B-Instruct ready\n", flush=True)

        except Exception as exc:
            # Any failure (missing package, download error, corrupt file)
            # is remembered so generate() can degrade gracefully.
            self._failed = True
            self._fail_reason = str(exc)
            print(f"  [LLM] Failed to load: {exc}\n", flush=True)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def generate(
        self,
        user_input: str,
        context: str = "",
        system_prompt: str = "",
        temperature: float = TEMPERATURE,
    ) -> str:
        """
        Generate a response using the local LLM.

        Args:
            user_input    : The user's message or question.
            context       : Optional retrieved text to inject as background.
            system_prompt : Override the default system prompt (e.g. for math).
            temperature   : Sampling temperature; use low values for math.

        Returns:
            The model's reply as a plain string. If the model could not be
            loaded or generation raised, a human-readable error string is
            returned instead — this method never raises.
        """
        self._load()

        if self._failed:
            return (
                "The local LLM is currently unavailable "
                f"({self._fail_reason}). "
                "Ensure 'llama-cpp-python' is installed and the model "
                "could be downloaded."
            )

        try:
            system_content = system_prompt if system_prompt else SYSTEM_PROMPT
            if context:
                system_content += f"\n\nRelevant background:\n{context}"

            messages = [
                {"role": "system", "content": system_content},
                {"role": "user",   "content": user_input},
            ]

            output = self._llm.create_chat_completion(
                messages=messages,
                max_tokens=MAX_TOKENS,
                temperature=temperature,
                top_p=TOP_P,
            )

            # "content" can legitimately be None in chat-completion
            # responses (e.g. when the model emits only a tool call);
            # coerce to "" so .strip() cannot raise and the caller gets
            # an empty string rather than a spurious error message.
            content = output["choices"][0]["message"].get("content") or ""
            return content.strip()

        except Exception as exc:
            # Never let a generation failure crash the caller; surface it
            # as text, consistent with the load-failure path above.
            return f"LLM generation error: {exc}"