// Repository page metadata captured with this file (not part of the program):
//   agentq-core-logics / app(lightmodel).js
//   everydaytok's picture
//   Rename app.js to app(lightmodel).js
//   5ec8baa verified
import express from 'express';
import { getLlama, LlamaChatSession, ChatMLChatWrapper } from 'node-llama-cpp';
import fs from 'fs';
import path from 'path';
import os from 'os';
import { Readable } from 'stream';
import { pipeline } from 'stream/promises';
// Single Express application instance; every route below is registered on it.
const app = express();
// Parse JSON request bodies so handlers can read them from `req.body`.
app.use(express.json());
// 1. Model selection: the Q8_0 quantization of Qwen2.5-0.5B-Instruct.
// Per the original author's note, Q8_0 was chosen on purpose because it
// preserves the reasoning quality this app needs.
// MODEL_URL is the remote GGUF file that is downloaded when no local copy exists.
const MODEL_URL = "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q8_0.gguf";
// MODEL_PATH is the local copy; the relative path is resolved to an absolute
// one against the process's current working directory.
const MODEL_PATH = path.resolve("./qwen2.5-0.5b-instruct-q8_0.gguf");
/**
 * Makes sure the model file exists locally (downloading it if needed), then
 * loads it and creates the inference context used by the chat session.
 *
 * The download is streamed into `<MODEL_PATH>.part` and only renamed to
 * MODEL_PATH once it has completed. Without this, an interrupted download
 * would leave a truncated file that passes the `existsSync` check on every
 * later boot and then fails (or misbehaves) at load time.
 *
 * @returns {Promise<object>} The context returned by `model.createContext()`.
 * @throws {Error} If the download response is not OK or has no body; errors
 *   from streaming, renaming, or engine initialization propagate unchanged.
 */
async function setupSystem() {
  if (!fs.existsSync(MODEL_PATH)) {
    console.log("⬇️ Downloading Q8_0 model...");
    const response = await fetch(MODEL_URL);
    if (!response.ok) throw new Error("Fetch failed: " + response.statusText);
    if (!response.body) throw new Error("Fetch failed: empty response body");

    const partialPath = `${MODEL_PATH}.part`;
    try {
      await pipeline(Readable.fromWeb(response.body), fs.createWriteStream(partialPath));
      // Publish the file under its final name only after a complete download.
      await fs.promises.rename(partialPath, MODEL_PATH);
    } catch (err) {
      // Never leave a half-written file behind; the next boot retries cleanly.
      await fs.promises.rm(partialPath, { force: true });
      throw err;
    }
    console.log("✅ Download complete!\n");
  }

  console.log("🔄 Initializing Engine...");
  const llama = await getLlama();
  const model = await llama.loadModel({ modelPath: MODEL_PATH });

  // When the SPACE_ID environment variable is set (treated here as "running
  // on Hugging Face"), use 2 threads; otherwise leave one core free and use
  // between 1 and 4 threads.
  const isHuggingFace = process.env.SPACE_ID !== undefined;
  const optimalThreads = isHuggingFace ? 2 : Math.min(4, Math.max(1, os.cpus().length - 1));

  const context = await model.createContext({
    contextSize: 2048,
    threads: optimalThreads,
    batchSize: 512
  });
  console.log("✅ Engine Ready!");
  return context;
}
// 2. TRUE STATEFUL MEMORY
// The server holds the session alive in RAM. We don't rebuild it.
// These are module-level singletons: there is one conversation for the whole
// process, so every HTTP client talks to (and can clear) the same history.

// Inference context; stays null until the boot sequence assigns it.
let sharedContext = null;
// Sequence obtained from sharedContext for the current conversation.
let activeSequence = null;
// Chat session bound to activeSequence; null means the engine is not ready yet.
let activeSession = null;
/**
 * Starts a brand-new conversation: disposes the current sequence (if one
 * exists), takes a fresh sequence from `sharedContext`, and replaces
 * `activeSession` with a new chat session bound to it.
 *
 * Requires `sharedContext` to be set before it is called.
 */
function resetMemory() {
  // Release the old sequence first, then ask the context for a new one.
  activeSequence?.dispose();
  activeSequence = sharedContext.getSequence();

  const sessionOptions = {
    contextSequence: activeSequence,
    systemPrompt: "You are a helpful, pragmatic assistant.",
    // ChatML wrapper keeps Qwen's prompt boundaries intact between turns.
    chatWrapper: new ChatMLChatWrapper(),
  };
  activeSession = new LlamaChatSession(sessionOptions);

  console.log("🧹 Server memory wiped and ready.");
}
/**
 * GET / — serves the single-page chat UI.
 *
 * The page posts each new message to /api/chat and renders the streamed
 * `data: {...}` events, and posts to /api/clear to reset the server's memory.
 *
 * Security note: the user's text, the model's output and error strings are
 * all untrusted. They are inserted as DOM text nodes (never via innerHTML),
 * so markup in a message or a model reply cannot run as HTML or script.
 *
 * Escaping note: the page is a template literal, so the client-side newline
 * escape is written with a doubled backslash here to reach the browser as a
 * single escape sequence.
 */
app.get('/', (req, res) => {
  res.send(`
<!DOCTYPE html>
<html>
<head>
<title>Qwen Local API</title>
<style>
body { font-family: system-ui, sans-serif; max-width: 800px; margin: 2rem auto; padding: 0 1rem; background: #111; color: #eee; }
#chat { height: 60vh; border: 1px solid #333; border-radius: 8px; overflow-y: auto; padding: 1rem; margin-bottom: 1rem; background: #1e1e1e; }
.message { margin-bottom: 1rem; padding: 0.8rem; border-radius: 6px; line-height: 1.4; }
.user { background: #2d3748; margin-left: 2rem; border: 1px solid #4a5568; }
.bot { background: #222; margin-right: 2rem; border: 1px solid #333; }
form { display: flex; gap: 0.5rem; }
input { flex: 1; padding: 0.8rem; border-radius: 6px; border: 1px solid #444; background: #222; color: white; }
button { padding: 0.8rem 1.5rem; border-radius: 6px; border: none; background: #3182ce; color: white; cursor: pointer; font-weight: bold; }
button:disabled { background: #4a5568; cursor: not-allowed; }
#clear-btn { background: #e53e3e; margin-bottom: 1rem; }
</style>
</head>
<body>
<div style="display: flex; justify-content: space-between; align-items: center;">
<h2>⚡ Qwen 0.5B Server (Stateful RAM)</h2>
<button id="clear-btn">Clear Server Memory</button>
</div>
<div id="chat"></div>
<form id="form">
<input type="text" id="input" placeholder="Type a message..." autocomplete="off" required>
<button type="submit" id="btn">Send</button>
</form>
<script>
const chat = document.getElementById('chat');
const form = document.getElementById('form');
const input = document.getElementById('input');
const btn = document.getElementById('btn');
const clearBtn = document.getElementById('clear-btn');

// Appends untrusted text as inert text nodes, turning newlines into <br>.
function appendText(parent, text) {
  String(text).split('\\n').forEach((part, index) => {
    if (index > 0) parent.appendChild(document.createElement('br'));
    if (part) parent.appendChild(document.createTextNode(part));
  });
}

// Appends a red "Error: ..." element; the message is set as plain text.
function appendError(parent, tagName, message) {
  parent.appendChild(document.createElement('br'));
  const el = document.createElement(tagName);
  el.style.color = 'red';
  el.textContent = 'Error: ' + message;
  parent.appendChild(el);
}

// Handles one line of the event stream; lines without a data prefix are ignored.
function handleLine(line, botMsg) {
  const trimmed = line.trim();
  if (!trimmed.startsWith('data: ')) return;
  const data = trimmed.slice(6).trim();
  if (data === '[DONE]') return;
  try {
    const parsed = JSON.parse(data);
    if (parsed.error) appendError(botMsg, 'span', parsed.error);
    if (parsed.text) appendText(botMsg, parsed.text);
    chat.scrollTop = chat.scrollHeight;
  } catch (err) {
    console.error("JSON parse error:", data);
  }
}

clearBtn.onclick = async () => {
  await fetch('/api/clear', { method: 'POST' });
  chat.innerHTML = '<div style="color: #ecc94b; text-align: center; margin: 1rem 0;">🧹 Server Memory Cleared!</div>';
};

form.onsubmit = async (e) => {
  e.preventDefault();
  const text = input.value;
  input.value = '';

  // Static header via innerHTML, user-supplied text via a text node.
  const userMsg = document.createElement('div');
  userMsg.className = 'message user';
  userMsg.innerHTML = '🤖 <strong>You:</strong> <br>';
  userMsg.appendChild(document.createTextNode(text));
  chat.appendChild(userMsg);

  const botMsg = document.createElement('div');
  botMsg.className = 'message bot';
  botMsg.innerHTML = '⚡ <strong>Bot:</strong> <br>';
  chat.appendChild(botMsg);
  chat.scrollTop = chat.scrollHeight;
  btn.disabled = true;

  try {
    // 3. CLEAN API: We just send the single new message.
    // The server already knows the history.
    const res = await fetch('/api/chat', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ message: text })
    });

    if (!res.ok) {
      // Failures such as 400/503 arrive as a JSON body, not as a stream.
      let detail = res.statusText || ('HTTP ' + res.status);
      try {
        const payload = await res.json();
        if (payload && payload.error) detail = payload.error;
      } catch (parseErr) {
        // Body was not JSON; keep the status text as the message.
      }
      throw new Error(detail);
    }

    const reader = res.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split('\\n');
      // The last piece may be an incomplete line; keep it for the next chunk.
      buffer = lines.pop();
      for (const line of lines) handleLine(line, botMsg);
    }
    // Flush whatever is left once the stream has ended.
    buffer += decoder.decode();
    if (buffer) handleLine(buffer, botMsg);
  } catch (err) {
    appendError(botMsg, 'em', err.message);
  } finally {
    btn.disabled = false;
    input.focus();
  }
};
</script>
</body>
</html>
`);
});
// The Stateful API Route
/**
 * POST /api/chat — sends one new user message to the shared chat session and
 * streams the reply as `data: <json>` events, ending with `data: [DONE]`.
 *
 * Request body: `{ "message": string }`. Only the new message is sent; the
 * conversation history lives in `activeSession` on the server.
 *
 * Responses:
 *   - 503 JSON `{ error }` while the engine has not finished loading.
 *   - 400 JSON `{ error }` when `message` is missing, empty, or not a string.
 *   - Otherwise a `text/event-stream` of `{ text }` chunks, or a single
 *     `{ error }` event if generation fails.
 *
 * If the client disconnects mid-stream, generation is stopped through an
 * AbortSignal so the model does not keep running for a reply nobody reads.
 */
app.post('/api/chat', async (req, res) => {
  // Capture the session once so a concurrent reset cannot swap it mid-request.
  const session = activeSession;
  if (!session) return res.status(503).json({ error: "Engine loading" });

  // `req.body` can be undefined when no JSON body was parsed; guard it so
  // destructuring cannot throw inside this async handler.
  const { message } = req.body ?? {};
  if (typeof message !== 'string' || message.length === 0) {
    return res.status(400).json({ error: "Message is required" });
  }

  res.setHeader('Content-Type', 'text/event-stream');
  res.setHeader('Cache-Control', 'no-transform, no-cache');
  res.setHeader('Connection', 'keep-alive');
  res.setHeader('X-Accel-Buffering', 'no');

  // 'close' fires both after a normal end and on a dropped connection; only
  // the latter (response not yet ended) should cancel generation.
  const abortController = new AbortController();
  res.on('close', () => {
    if (!res.writableEnded) abortController.abort();
  });

  // Writes one event, skipping the write once the response is gone.
  const sendEvent = (payload) => {
    if (!res.destroyed && !res.writableEnded) res.write(`data: ${payload}\n\n`);
  };

  try {
    // 4. INSTANT GENERATION: Appends to the internal array and generates immediately
    await session.prompt(message, {
      signal: abortController.signal,
      // On abort, stop and keep the partial reply instead of throwing.
      stopOnAbortSignal: true,
      onTextChunk(chunk) {
        sendEvent(JSON.stringify({ text: chunk }));
      }
    });
    sendEvent('[DONE]');
  } catch (error) {
    sendEvent(JSON.stringify({ error: error?.message || "Generation failed" }));
  } finally {
    if (!res.writableEnded) res.end();
  }
});
/**
 * POST /api/clear — resets the server-side conversation via resetMemory()
 * and acknowledges with `{ success: true }`.
 */
function handleClearMemory(_req, res) {
  resetMemory();
  res.json({ success: true });
}
app.post('/api/clear', handleClearMemory);
// Port to listen on: the PORT environment variable when set, otherwise 7860.
const PORT = process.env.PORT || 7860;

/**
 * Boot sequence: load the engine, create the first chat session, then start
 * accepting HTTP connections on all interfaces. Any failure along the way is
 * logged and terminates the process with exit code 1.
 */
async function boot() {
  try {
    sharedContext = await setupSystem();
    resetMemory();
    app.listen(PORT, "0.0.0.0", () => {
      console.log(`\n🚀 Stateful API live at port ${PORT}`);
    });
  } catch (err) {
    console.error("❌ Boot Error:", err);
    process.exit(1);
  }
}

// boot() handles its own errors, so the returned promise needs no handler.
boot();