import express from 'express';
import { getLlama, LlamaChatSession, ChatMLChatWrapper } from 'node-llama-cpp';
import fs from 'fs';
import path from 'path';
import os from 'os';
import { Readable } from 'stream';
import { pipeline } from 'stream/promises';

const app = express();
app.use(express.json());

// 1. Q8_0 quantization is used on purpose: it preserves the reasoning we need.
const MODEL_URL = "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q8_0.gguf";
const MODEL_PATH = path.resolve("./qwen2.5-0.5b-instruct-q8_0.gguf");

/**
 * Downloads the model from MODEL_URL to MODEL_PATH.
 *
 * The bytes are streamed into a sibling "<MODEL_PATH>.part" file that is only
 * renamed to MODEL_PATH after the stream completes, so an interrupted download
 * never leaves a truncated file that a later `fs.existsSync(MODEL_PATH)` check
 * would mistake for a complete model.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the HTTP response is not OK, has no body, or the
 *   stream/rename fails (the partial file is removed before rethrowing).
 */
async function downloadModel() {
  const partialPath = `${MODEL_PATH}.part`;

  const response = await fetch(MODEL_URL);
  if (!response.ok) throw new Error("Fetch failed: " + response.statusText);
  if (!response.body) throw new Error("Fetch failed: empty response body");

  try {
    await pipeline(Readable.fromWeb(response.body), fs.createWriteStream(partialPath));
    await fs.promises.rename(partialPath, MODEL_PATH);
  } catch (error) {
    // Do not leave a half-written file behind.
    await fs.promises.rm(partialPath, { force: true });
    throw error;
  }
}

/**
 * Ensures the model file exists locally (downloading it if needed), loads it
 * and creates the inference context used by the server.
 *
 * Thread count: 2 when the SPACE_ID environment variable is set (treated as
 * running on Hugging Face), otherwise "CPU count minus one" clamped to 1..4.
 *
 * @returns {Promise<object>} The context returned by `model.createContext`.
 * @throws {Error} When the download, model load or context creation fails.
 */
async function setupSystem() {
  if (!fs.existsSync(MODEL_PATH)) {
    console.log("⬇️ Downloading Q8_0 model...");
    await downloadModel();
    console.log("✅ Download complete!\n");
  }

  console.log("🔄 Initializing Engine...");
  const llama = await getLlama();
  const model = await llama.loadModel({ modelPath: MODEL_PATH });

  const isHuggingFace = process.env.SPACE_ID !== undefined;
  const optimalThreads = isHuggingFace
    ? 2
    : Math.min(4, Math.max(1, os.cpus().length - 1));

  const context = await model.createContext({
    contextSize: 2048,
    threads: optimalThreads,
    batchSize: 512,
  });

  console.log("✅ Engine Ready!");
  return context;
}

// 2. TRUE STATEFUL MEMORY
// The server holds the session alive in RAM. We don't rebuild it.
// Process-wide chat state: one context, one sequence, one session.
let sharedContext = null;
let activeSequence = null;
let activeSession = null;

/**
 * Drops the current conversation and starts a fresh chat session.
 *
 * Disposes the active sequence (if any), takes a new sequence from
 * `sharedContext` and builds a new `LlamaChatSession` on top of it.
 * `sharedContext` must already be set when this is called.
 *
 * NOTE(review): this disposes the sequence even if a generation is currently
 * awaiting on the old session — confirm whether callers need to guard that.
 */
function resetMemory() {
  if (activeSequence) activeSequence.dispose();

  activeSequence = sharedContext.getSequence();
  activeSession = new LlamaChatSession({
    contextSequence: activeSequence,
    systemPrompt: "You are a helpful, pragmatic assistant.",
    // Enforces Qwen's prompt boundaries so it doesn't get amnesia
    chatWrapper: new ChatMLChatWrapper(),
  });

  console.log("🧹 Server memory wiped and ready.");
}

// Landing page.
// NOTE(review): the body reads like HTML whose tags were stripped — confirm
// against the intended page before changing it.
app.get('/', (req, res) => {
  res.send(` Qwen Local API

⚡ Qwen 0.5B Server (Stateful RAM)

`);
});

// The Stateful API Route: streams the model's reply as Server-Sent Events.
app.post('/api/chat', async (req, res) => {
  if (!activeSession) return res.status(503).json({ error: "Engine loading" });

  // Only the new message is accepted; the long-lived session carries the
  // conversation so far. `req.body` may be undefined when no JSON was parsed.
  const { message } = req.body ?? {};
  if (typeof message !== 'string' || message.length === 0) {
    return res.status(400).json({ error: "Message is required" });
  }

  res.setHeader('Content-Type', 'text/event-stream');
  res.setHeader('Cache-Control', 'no-transform, no-cache');
  res.setHeader('Connection', 'keep-alive');
  res.setHeader('X-Accel-Buffering', 'no');

  // Stop generating when the client goes away before we finished the response,
  // instead of burning CPU on output nobody will read.
  const abortController = new AbortController();
  res.on('close', () => {
    if (!res.writableEnded) abortController.abort();
  });
  const clientGone = () => abortController.signal.aborted;

  try {
    // 4. INSTANT GENERATION: the prompt is handed to the existing session and
    // chunks are forwarded to the client as they arrive.
    await activeSession.prompt(message, {
      signal: abortController.signal,
      stopOnAbortSignal: true,
      onTextChunk(chunk) {
        if (clientGone()) return;
        res.write(`data: ${JSON.stringify({ text: chunk })}\n\n`);
      },
    });

    if (!clientGone()) res.write(`data: [DONE]\n\n`);
    res.end();
  } catch (error) {
    if (!clientGone()) {
      console.error("Generation error:", error);
      res.write(`data: ${JSON.stringify({ error: error.message || "Generation failed" })}\n\n`);
    }
    res.end();
  }
});

// Wipes the conversation and starts a new session.
app.post('/api/clear', (req, res) => {
  resetMemory();
  res.json({ success: true });
});

const PORT = process.env.PORT || 7860;

// Boot order matters: the engine and the first session are created before the
// server starts listening, so routes never see a missing context.
setupSystem()
  .then((context) => {
    sharedContext = context;
    resetMemory();
    app.listen(PORT, "0.0.0.0", () => {
      console.log(`\n🚀 Stateful API live at port ${PORT}`);
    });
  })
  .catch((err) => {
    console.error("❌ Boot Error:", err);
    process.exit(1);
  });