Spaces:
Running
Running
| import express from 'express'; | |
| import { getLlama, LlamaChatSession, ChatMLChatWrapper } from 'node-llama-cpp'; | |
| import fs from 'fs'; | |
| import path from 'path'; | |
| import os from 'os'; | |
| import { Readable } from 'stream'; | |
| import { pipeline } from 'stream/promises'; | |
// Model source: the Q8_0 quantization of Qwen2.5-0.5B-Instruct in GGUF format.
// Per the original author's note, Q8_0 was chosen because it preserves the
// reasoning quality this app needs.
const MODEL_URL = "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q8_0.gguf";
// Absolute on-disk location of the model file, resolved against the working directory.
const MODEL_PATH = path.resolve("./qwen2.5-0.5b-instruct-q8_0.gguf");

// Express application; incoming request bodies are parsed as JSON.
const app = express();
app.use(express.json());
/**
 * Streams the model file from `url` to `destPath`.
 *
 * The data is written to a sibling `.part` file first and renamed into place
 * only after the stream finishes, so an interrupted download never leaves a
 * truncated file at `destPath` that a later boot would mistake for a model.
 *
 * @param {string} url - HTTP(S) location of the model file.
 * @param {string} destPath - Final path of the model file on disk.
 * @returns {Promise<void>}
 * @throws {Error} When the HTTP request fails, has no body, or the write fails.
 */
async function downloadModel(url, destPath) {
  const partialPath = `${destPath}.part`;

  console.log("⬇️ Downloading Q8_0 model...");
  const response = await fetch(url);
  if (!response.ok) {
    // statusText may be empty, so the numeric status is included as well.
    throw new Error(`Fetch failed: ${response.status} ${response.statusText}`);
  }
  if (!response.body) {
    throw new Error("Fetch failed: response has no body");
  }

  try {
    await pipeline(Readable.fromWeb(response.body), fs.createWriteStream(partialPath));
    await fs.promises.rename(partialPath, destPath);
  } catch (err) {
    // Remove the incomplete file; `force` makes this a no-op if it was never created.
    await fs.promises.rm(partialPath, { force: true });
    throw err;
  }
  console.log("✅ Download complete!\n");
}

/**
 * Makes sure the model file exists at MODEL_PATH (downloading it from
 * MODEL_URL if necessary), loads it, and creates the inference context.
 *
 * Thread count: 2 when the SPACE_ID environment variable is set, otherwise
 * one less than the CPU count, clamped to the range 1..4.
 *
 * @returns {Promise<object>} The context returned by `model.createContext`.
 * @throws {Error} When the download, model load, or context creation fails.
 */
async function setupSystem() {
  if (!fs.existsSync(MODEL_PATH)) {
    await downloadModel(MODEL_URL, MODEL_PATH);
  }

  console.log("🔄 Initializing Engine...");
  const llama = await getLlama();
  const model = await llama.loadModel({ modelPath: MODEL_PATH });

  const isHuggingFace = process.env.SPACE_ID !== undefined;
  const optimalThreads = isHuggingFace ? 2 : Math.min(4, Math.max(1, os.cpus().length - 1));

  const context = await model.createContext({
    contextSize: 2048,
    threads: optimalThreads,
    batchSize: 512
  });
  console.log("✅ Engine Ready!");
  return context;
}
// Server-side conversation state.
// One chat session is kept alive in process memory between requests, so the
// history lives on the server instead of being rebuilt per request.
let sharedContext = null;   // inference context produced at boot
let activeSequence = null;  // sequence currently backing the chat session
let activeSession = null;   // chat session used by the API routes

/**
 * Discards the current conversation and starts a fresh one.
 *
 * Disposes the existing sequence (if there is one), obtains a new sequence
 * from `sharedContext`, and builds a new chat session on top of it.
 */
function resetMemory() {
  activeSequence?.dispose();

  const freshSequence = sharedContext.getSequence();
  // ChatML wrapper: per the original author's note, this keeps Qwen's prompt
  // boundaries intact so the model does not lose track of the conversation.
  const wrapper = new ChatMLChatWrapper();

  activeSequence = freshSequence;
  activeSession = new LlamaChatSession({
    contextSequence: freshSequence,
    systemPrompt: "You are a helpful, pragmatic assistant.",
    chatWrapper: wrapper
  });

  console.log("🧹 Server memory wiped and ready.");
}
// Single-page chat UI served at the root path.
//
// Notes for maintainers:
// - The page lives inside a server-side template literal, so client code must
//   not contain backticks or "${", and a client-side "\n" is written as "\\n".
// - User input and model output are untrusted text. They are inserted as text
//   nodes (never through innerHTML) so markup in a message is displayed rather
//   than rendered or executed. Only fixed, hard-coded labels use innerHTML.
const INDEX_PAGE = `
<!DOCTYPE html>
<html>
<head>
<title>Qwen Local API</title>
<style>
body { font-family: system-ui, sans-serif; max-width: 800px; margin: 2rem auto; padding: 0 1rem; background: #111; color: #eee; }
#chat { height: 60vh; border: 1px solid #333; border-radius: 8px; overflow-y: auto; padding: 1rem; margin-bottom: 1rem; background: #1e1e1e; }
.message { margin-bottom: 1rem; padding: 0.8rem; border-radius: 6px; line-height: 1.4; }
.message .body { white-space: pre-wrap; overflow-wrap: anywhere; }
.user { background: #2d3748; margin-left: 2rem; border: 1px solid #4a5568; }
.bot { background: #222; margin-right: 2rem; border: 1px solid #333; }
form { display: flex; gap: 0.5rem; }
input { flex: 1; padding: 0.8rem; border-radius: 6px; border: 1px solid #444; background: #222; color: white; }
button { padding: 0.8rem 1.5rem; border-radius: 6px; border: none; background: #3182ce; color: white; cursor: pointer; font-weight: bold; }
button:disabled { background: #4a5568; cursor: not-allowed; }
#clear-btn { background: #e53e3e; margin-bottom: 1rem; }
</style>
</head>
<body>
<div style="display: flex; justify-content: space-between; align-items: center;">
<h2>⚡ Qwen 0.5B Server (Stateful RAM)</h2>
<button id="clear-btn">Clear Server Memory</button>
</div>
<div id="chat"></div>
<form id="form">
<input type="text" id="input" placeholder="Type a message..." autocomplete="off" required>
<button type="submit" id="btn">Send</button>
</form>
<script>
const chat = document.getElementById('chat');
const form = document.getElementById('form');
const input = document.getElementById('input');
const btn = document.getElementById('btn');
const clearBtn = document.getElementById('clear-btn');

// Appends a chat bubble and returns its text-only body element.
// labelHtml must be a hard-coded string; it is the only part set via innerHTML.
function addMessage(kind, labelHtml) {
  const bubble = document.createElement('div');
  bubble.className = 'message ' + kind;
  bubble.innerHTML = labelHtml;
  const body = document.createElement('span');
  body.className = 'body';
  bubble.appendChild(body);
  chat.appendChild(bubble);
  chat.scrollTop = chat.scrollHeight;
  return body;
}

// Shows an error line inside the bubble that owns the given body element.
function showError(body, message) {
  const line = document.createElement('div');
  line.style.color = 'red';
  line.textContent = 'Error: ' + message;
  body.parentNode.appendChild(line);
  chat.scrollTop = chat.scrollHeight;
}

// Handles one line of the event stream; lines without a "data: " prefix are ignored.
function handleLine(line, body) {
  const trimmed = line.trim();
  if (!trimmed.startsWith('data: ')) return;
  const data = trimmed.slice(6).trim();
  if (data === '[DONE]') return;
  let parsed;
  try {
    parsed = JSON.parse(data);
  } catch (err) {
    console.error("JSON parse error:", data);
    return;
  }
  if (parsed.error) {
    showError(body, parsed.error);
  }
  if (parsed.text) {
    // append() with a string creates a text node, so markup stays inert.
    body.append(parsed.text);
    chat.scrollTop = chat.scrollHeight;
  }
}

clearBtn.onclick = async () => {
  const notice = document.createElement('div');
  notice.style.cssText = 'text-align: center; margin: 1rem 0;';
  try {
    const res = await fetch('/api/clear', { method: 'POST' });
    if (!res.ok) throw new Error('HTTP ' + res.status);
    notice.style.color = '#ecc94b';
    notice.textContent = '🧹 Server Memory Cleared!';
    chat.replaceChildren(notice);
  } catch (err) {
    // Keep the transcript when the server did not confirm the reset.
    notice.style.color = 'red';
    notice.textContent = 'Error: could not clear server memory (' + err.message + ')';
    chat.appendChild(notice);
    chat.scrollTop = chat.scrollHeight;
  }
};

form.onsubmit = async (e) => {
  e.preventDefault();
  const text = input.value;
  input.value = '';
  addMessage('user', '🤖 <strong>You:</strong> <br>').textContent = text;
  const botBody = addMessage('bot', '⚡ <strong>Bot:</strong> <br>');
  btn.disabled = true;
  try {
    // Only the new message is sent; the server keeps the conversation history.
    const res = await fetch('/api/chat', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ message: text })
    });
    if (!res.ok) {
      // Rejections (400/503) arrive as a JSON body, not as an event stream.
      let detail = 'HTTP ' + res.status;
      try {
        const payload = await res.json();
        if (payload && payload.error) detail = payload.error;
      } catch (parseErr) {
        // Body was not JSON; the status code is the best information available.
      }
      throw new Error(detail);
    }
    const reader = res.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split('\\n');
      // The last piece may be an incomplete line; keep it for the next chunk.
      buffer = lines.pop();
      for (const line of lines) {
        handleLine(line, botBody);
      }
    }
    // Flush any bytes the decoder was holding and process a trailing line.
    buffer += decoder.decode();
    if (buffer) handleLine(buffer, botBody);
  } catch (err) {
    showError(botBody, err.message);
  } finally {
    btn.disabled = false;
    input.focus();
  }
};
</script>
</body>
</html>
`;

// Serves the chat page.
app.get('/', (req, res) => {
  res.send(INDEX_PAGE);
});
/**
 * POST /api/chat — stateful chat endpoint.
 *
 * Request body: `{ "message": string }`. Only the new user message is sent;
 * the conversation history is held by the server-side chat session.
 *
 * Responses:
 * - 503 JSON `{ error }` while no session exists yet.
 * - 400 JSON `{ error }` when `message` is missing, empty, or not a string.
 * - Otherwise a `text/event-stream` of `data: {"text": ...}` events, ending
 *   with `data: [DONE]`, or a `data: {"error": ...}` event on failure.
 */
app.post('/api/chat', async (req, res) => {
  // Capture the session once so a concurrent /api/clear cannot swap the
  // reference between the check and the prompt call.
  const session = activeSession;
  if (!session) return res.status(503).json({ error: "Engine loading" });

  // req.body can be undefined when no JSON body was parsed.
  const { message } = req.body ?? {};
  if (typeof message !== 'string' || message.length === 0) {
    return res.status(400).json({ error: "Message is required" });
  }

  res.setHeader('Content-Type', 'text/event-stream');
  res.setHeader('Cache-Control', 'no-transform, no-cache');
  res.setHeader('Connection', 'keep-alive');
  res.setHeader('X-Accel-Buffering', 'no');

  // Stop generating when the client goes away instead of computing a reply
  // nobody will read. 'close' also fires after a normal end, where aborting
  // is harmless.
  const abortController = new AbortController();
  res.on('close', () => abortController.abort());

  // Writes one event, unless the response is already finished or torn down.
  const sendEvent = (payload) => {
    if (!res.writableEnded && !res.destroyed) {
      res.write(`data: ${payload}\n\n`);
    }
  };

  try {
    await session.prompt(message, {
      signal: abortController.signal,
      stopOnAbortSignal: true,
      onTextChunk(chunk) {
        sendEvent(JSON.stringify({ text: chunk }));
      }
    });
    sendEvent('[DONE]');
  } catch (error) {
    sendEvent(JSON.stringify({ error: error?.message || "Generation failed" }));
  } finally {
    if (!res.writableEnded) res.end();
  }
});
/**
 * POST /api/clear — discards the server-side conversation.
 *
 * Responds with `{ success: true }` once a fresh session is in place, or with
 * HTTP 500 and `{ success: false, error }` if the reset throws, so clients
 * always receive JSON (matching the error style of /api/chat).
 */
app.post('/api/clear', (req, res) => {
  try {
    resetMemory();
    res.json({ success: true });
  } catch (error) {
    console.error("❌ Failed to reset memory:", error);
    res.status(500).json({ success: false, error: error?.message || "Reset failed" });
  }
});
// Port to listen on: the PORT environment variable when set, otherwise 7860.
const PORT = process.env.PORT || 7860;

/**
 * Boot sequence: prepare the engine, create the first chat session, then
 * start accepting HTTP connections on all interfaces. Any failure along the
 * way is logged and terminates the process with exit code 1.
 */
async function boot() {
  try {
    sharedContext = await setupSystem();
    resetMemory();
    const announce = () => {
      console.log(`\n🚀 Stateful API live at port ${PORT}`);
    };
    app.listen(PORT, "0.0.0.0", announce);
  } catch (err) {
    console.error("❌ Boot Error:", err);
    process.exit(1);
  }
}

boot();