import express from 'express';
import { getLlama, LlamaChatSession, ChatMLChatWrapper } from 'node-llama-cpp';
import fs from 'fs';
import path from 'path';
import os from 'os';
import { Readable } from 'stream';
import { pipeline } from 'stream/promises';

const app = express();
app.use(express.json());

// 1. Use the Q8_0 quantization: it preserves the reasoning quality we need.
const MODEL_URL = "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q8_0.gguf";
const MODEL_PATH = path.resolve("./qwen2.5-0.5b-instruct-q8_0.gguf");

/**
 * Ensures the model file exists locally (downloading it from MODEL_URL when
 * missing), loads it, and creates an inference context.
 *
 * The download is streamed into a temporary `.part` file that is renamed to
 * MODEL_PATH only after the transfer finishes, so an interrupted download can
 * never be mistaken for a complete model on the next start.
 *
 * @returns {Promise<object>} The context returned by `model.createContext()`.
 * @throws {Error} If the HTTP response is not OK or carries no body; errors
 *   from streaming, loading the model, or creating the context propagate.
 */
async function setupSystem() {
  if (!fs.existsSync(MODEL_PATH)) {
    console.log("โฌ๏ธ Downloading Q8_0 model...");
    const response = await fetch(MODEL_URL);
    if (!response.ok) {
      throw new Error(`Fetch failed: ${response.statusText}`);
    }
    if (!response.body) {
      throw new Error("Fetch failed: response has no body");
    }

    const partialPath = `${MODEL_PATH}.part`;
    try {
      await pipeline(Readable.fromWeb(response.body), fs.createWriteStream(partialPath));
      // Publish the file under its final name only once it is fully written.
      await fs.promises.rename(partialPath, MODEL_PATH);
    } catch (err) {
      // Remove the incomplete file so the next start retries the download.
      await fs.promises.rm(partialPath, { force: true });
      throw err;
    }
    console.log("โ Download complete!\n");
  }

  console.log("๐ Initializing Engine...");
  const llama = await getLlama();
  const model = await llama.loadModel({ modelPath: MODEL_PATH });

  // A defined SPACE_ID is treated as "running on Hugging Face", where the
  // thread count is fixed at 2. Elsewhere, leave one core free and use
  // between 1 and 4 threads.
  const isHuggingFace = process.env.SPACE_ID !== undefined;
  const optimalThreads = isHuggingFace ? 2 : Math.min(4, Math.max(1, os.cpus().length - 1));

  const context = await model.createContext({
    contextSize: 2048,
    threads: optimalThreads,
    batchSize: 512
  });
  console.log("โ Engine Ready!");
  return context;
}

// 2. TRUE STATEFUL MEMORY
// The server holds the session alive in RAM. We don't rebuild it.
let sharedContext = null; let activeSequence = null; let activeSession = null; function resetMemory() { if (activeSequence) activeSequence.dispose(); activeSequence = sharedContext.getSequence(); activeSession = new LlamaChatSession({ contextSequence: activeSequence, systemPrompt: "You are a helpful, pragmatic assistant.", chatWrapper: new ChatMLChatWrapper() // Enforces Qwen's prompt boundaries so it doesn't get amnesia }); console.log("๐งน Server memory wiped and ready."); } app.get('/', (req, res) => { res.send(`