Spaces:

Duplicated from everydaytok/ai_plugin_server

everydaytok
/

agentq-core-logics

Running

App Files Files Community

agentq-core-logics / app(lightmodel).js

everydaytok's picture

Rename app.js to app(lightmodel).js

5ec8baa verified 13 days ago

history blame contribute delete

8.58 kB

	import express from 'express';
	import { getLlama, LlamaChatSession, ChatMLChatWrapper } from 'node-llama-cpp';
	import fs from 'fs';
	import path from 'path';
	import os from 'os';
	import { Readable } from 'stream';
	import { pipeline } from 'stream/promises';

	// Express application; express.json() parses JSON request bodies so the
	// POST routes below can read `req.body`.
	const app = express();
	app.use(express.json());

	// 1. Model selection: the Q8_0 quantization of Qwen2.5-0.5B-Instruct.
	// Author's note: Q8_0 was chosen deliberately because it preserves the
	// reasoning quality needed here.
	// MODEL_URL is where the GGUF file is downloaded from on first boot.
	const MODEL_URL = "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q8_0.gguf";
	// MODEL_PATH is the local file location, resolved against the current working directory.
	const MODEL_PATH = path.resolve("./qwen2.5-0.5b-instruct-q8_0.gguf");

	/**
	 * Makes sure the model file exists locally (downloading it if necessary),
	 * loads it, and creates the inference context used by the server.
	 *
	 * The download is streamed into a temporary "<MODEL_PATH>.part" file and only
	 * renamed to MODEL_PATH once the transfer has fully succeeded. Otherwise an
	 * interrupted download would leave a truncated file behind that the
	 * `fs.existsSync` check below would accept as a valid model on the next boot.
	 *
	 * @returns {Promise<object>} The context created by `model.createContext`.
	 * @throws {Error} If the download fails, or if loading the model / creating
	 *   the context fails.
	 */
	async function setupSystem() {
		if (!fs.existsSync(MODEL_PATH)) {
			console.log("⬇️ Downloading Q8_0 model...");
			const response = await fetch(MODEL_URL);
			if (!response.ok) {
				throw new Error("Fetch failed: " + response.status + " " + response.statusText);
			}
			if (!response.body) {
				throw new Error("Fetch failed: response has no body");
			}

			const partialPath = MODEL_PATH + ".part";
			try {
				const fileStream = fs.createWriteStream(partialPath);
				await pipeline(Readable.fromWeb(response.body), fileStream);
				// Publish the file under its final name only after a complete transfer.
				fs.renameSync(partialPath, MODEL_PATH);
			} catch (err) {
				// Do not leave a half-written file behind; `force` ignores a missing file.
				fs.rmSync(partialPath, { force: true });
				throw err;
			}
			console.log("✅ Download complete!\n");
		}

		console.log("🔄 Initializing Engine...");
		const llama = await getLlama();
		const model = await llama.loadModel({ modelPath: MODEL_PATH });

		// When SPACE_ID is set (treated here as "running on Hugging Face") use 2
		// threads; otherwise use all cores but one, clamped to the range 1..4.
		const isHuggingFace = process.env.SPACE_ID !== undefined;
		const optimalThreads = isHuggingFace ? 2 : Math.min(4, Math.max(1, os.cpus().length - 1));

		const context = await model.createContext({
			contextSize: 2048,
			threads: optimalThreads,
			batchSize: 512
		});

		console.log("✅ Engine Ready!");
		return context;
	}

	// 2. TRUE STATEFUL MEMORY
	// The server holds the session alive in RAM. We don't rebuild it.
	// All three start as null and are populated during boot.

	// Context returned by setupSystem(); assigned once in the boot sequence.
	let sharedContext = null;
	// Sequence obtained from sharedContext by resetMemory(); disposed and replaced on every reset.
	let activeSequence = null;
	// Chat session built on activeSequence by resetMemory(); /api/chat prompts it
	// and answers 503 while it is still null.
	let activeSession = null;

	/**
	 * Discards the current conversation and starts a fresh one.
	 *
	 * Disposes the sequence currently in use (if any), takes a new sequence from
	 * `sharedContext`, and replaces `activeSession` with a new chat session bound
	 * to that sequence. Expects `sharedContext` to have been assigned already.
	 */
	function resetMemory() {
		const previousSequence = activeSequence;
		if (previousSequence) {
			previousSequence.dispose();
		}

		const freshSequence = sharedContext.getSequence();
		activeSequence = freshSequence;

		const sessionOptions = {
			contextSequence: freshSequence,
			systemPrompt: "You are a helpful, pragmatic assistant.",
			// ChatML wrapper: per the author, this enforces Qwen's prompt
			// boundaries so the model keeps track of the conversation.
			chatWrapper: new ChatMLChatWrapper()
		};
		activeSession = new LlamaChatSession(sessionOptions);

		console.log("🧹 Server memory wiped and ready.");
	}

	/**
	 * GET / — serves the single-page chat UI.
	 *
	 * The page posts each new message to /api/chat and renders the streamed
	 * "data: {...}" lines as they arrive; the "Clear Server Memory" button posts
	 * to /api/clear.
	 *
	 * The user's text, the model output and error messages are untrusted, so the
	 * client script HTML-escapes all of them before inserting them into the DOM.
	 *
	 * NOTE: the page is a template literal, so backslashes meant for the client
	 * script are doubled ('\\n' is served as '\n'); the client code must not
	 * contain backticks or dollar-brace sequences.
	 */
	app.get('/', (req, res) => {
		res.send(`
	<!DOCTYPE html>
	<html>
	<head>
	<title>Qwen Local API</title>
	<style>
	body { font-family: system-ui, sans-serif; max-width: 800px; margin: 2rem auto; padding: 0 1rem; background: #111; color: #eee; }
	#chat { height: 60vh; border: 1px solid #333; border-radius: 8px; overflow-y: auto; padding: 1rem; margin-bottom: 1rem; background: #1e1e1e; }
	.message { margin-bottom: 1rem; padding: 0.8rem; border-radius: 6px; line-height: 1.4; }
	.user { background: #2d3748; margin-left: 2rem; border: 1px solid #4a5568; }
	.bot { background: #222; margin-right: 2rem; border: 1px solid #333; }
	form { display: flex; gap: 0.5rem; }
	input { flex: 1; padding: 0.8rem; border-radius: 6px; border: 1px solid #444; background: #222; color: white; }
	button { padding: 0.8rem 1.5rem; border-radius: 6px; border: none; background: #3182ce; color: white; cursor: pointer; font-weight: bold; }
	button:disabled { background: #4a5568; cursor: not-allowed; }
	#clear-btn { background: #e53e3e; margin-bottom: 1rem; }
	</style>
	</head>
	<body>
	<div style="display: flex; justify-content: space-between; align-items: center;">
	<h2>⚡ Qwen 0.5B Server (Stateful RAM)</h2>
	<button id="clear-btn">Clear Server Memory</button>
	</div>
	<div id="chat"></div>
	<form id="form">
	<input type="text" id="input" placeholder="Type a message..." autocomplete="off" required>
	<button type="submit" id="btn">Send</button>
	</form>

	<script>
	const chat = document.getElementById('chat');
	const form = document.getElementById('form');
	const input = document.getElementById('input');
	const btn = document.getElementById('btn');
	const clearBtn = document.getElementById('clear-btn');

	// Escape untrusted text so it can never be interpreted as markup.
	const escapeHtml = (value) => String(value)
		.replace(/&/g, '&amp;')
		.replace(/</g, '&lt;')
		.replace(/>/g, '&gt;')
		.replace(/"/g, '&quot;')
		.replace(/'/g, '&#39;');

	clearBtn.onclick = async () => {
		try {
			const res = await fetch('/api/clear', { method: 'POST' });
			if (!res.ok) throw new Error('HTTP ' + res.status);
			chat.innerHTML = '<div style="color: #ecc94b; text-align: center; margin: 1rem 0;">🧹 Server Memory Cleared!</div>';
		} catch (err) {
			chat.insertAdjacentHTML('beforeend', '<div style="color: red; text-align: center; margin: 1rem 0;">Failed to clear server memory: ' + escapeHtml(err.message) + '</div>');
		}
	};

	form.onsubmit = async (e) => {
		e.preventDefault();
		const text = input.value;
		input.value = '';

		chat.insertAdjacentHTML('beforeend', '<div class="message user">🤖 <strong>You:</strong> <br>' + escapeHtml(text) + '</div>');

		const botMsg = document.createElement('div');
		botMsg.className = 'message bot';
		botMsg.innerHTML = '⚡ <strong>Bot:</strong> <br>';
		chat.appendChild(botMsg);
		chat.scrollTop = chat.scrollHeight;

		btn.disabled = true;

		try {
			// 3. CLEAN API: We just send the single new message.
			// The server already knows the history.
			const res = await fetch('/api/chat', {
				method: 'POST',
				headers: { 'Content-Type': 'application/json' },
				body: JSON.stringify({ message: text })
			});

			// Non-2xx replies are plain JSON ({ error }), not an event stream.
			if (!res.ok) {
				const failure = await res.json().catch(() => ({}));
				throw new Error(failure.error || ('HTTP ' + res.status));
			}

			const reader = res.body.getReader();
			const decoder = new TextDecoder();
			let buffer = '';

			while (true) {
				const { done, value } = await reader.read();
				if (done) break;

				buffer += decoder.decode(value, { stream: true });
				const lines = buffer.split('\\n');
				// The last piece may be an incomplete line; keep it for the next read.
				buffer = lines.pop();

				for (const line of lines) {
					if (line.trim().startsWith('data: ')) {
						const data = line.trim().slice(6).trim();
						if (data === '[DONE]') continue;
						try {
							const parsed = JSON.parse(data);
							if (parsed.error) {
								botMsg.insertAdjacentHTML('beforeend', '<br><span style="color:red">Error: ' + escapeHtml(parsed.error) + '</span>');
								chat.scrollTop = chat.scrollHeight;
							}
							if (parsed.text) {
								// Append only the new chunk instead of re-parsing the whole bubble.
								botMsg.insertAdjacentHTML('beforeend', escapeHtml(parsed.text).replace(/\\n/g, '<br>'));
								chat.scrollTop = chat.scrollHeight;
							}
						} catch(err) {
							console.error("JSON parse error:", data);
						}
					}
				}
			}
		} catch (err) {
			botMsg.insertAdjacentHTML('beforeend', '<br><em style="color:red">Error: ' + escapeHtml(err.message) + '</em>');
		} finally {
			btn.disabled = false;
			input.focus();
		}
	};
	</script>
	</body>
	</html>
	`);
	});

	// The Stateful API Route
	/**
	 * POST /api/chat — sends one new user message to the shared chat session and
	 * streams the reply as Server-Sent-Events-style lines.
	 *
	 * Request body: { "message": string } — only the new message; the session
	 * held on the server already contains the earlier turns.
	 *
	 * Responses:
	 *  - 503 JSON { error } while the engine has not finished loading.
	 *  - 400 JSON { error } when `message` is missing, empty or not a string.
	 *  - otherwise a text/event-stream of `data: {"text": ...}` chunks, ending
	 *    with `data: [DONE]`, or `data: {"error": ...}` if generation fails.
	 */
	app.post('/api/chat', async (req, res) => {
		if (!activeSession) return res.status(503).json({ error: "Engine loading" });

		// Notice we ONLY take the new message. The backend handles the array looping.
		// `req.body` may be undefined when no JSON body was parsed, so guard the destructuring.
		const { message } = req.body ?? {};
		if (typeof message !== 'string' || !message) {
			return res.status(400).json({ error: "Message is required" });
		}

		res.setHeader('Content-Type', 'text/event-stream');
		res.setHeader('Cache-Control', 'no-transform, no-cache');
		res.setHeader('Connection', 'keep-alive');
		res.setHeader('X-Accel-Buffering', 'no');

		// Stop generating as soon as the client goes away instead of burning CPU
		// on a reply nobody will read. 'close' also fires after a normal end, where
		// aborting is harmless.
		const abortController = new AbortController();
		res.on('close', () => abortController.abort());

		// True while the response can still accept data.
		const canWrite = () => !res.writableEnded && !res.destroyed;

		try {
			// 4. INSTANT GENERATION: Appends to the internal array and generates immediately
			await activeSession.prompt(message, {
				signal: abortController.signal,
				onTextChunk(chunk) {
					if (canWrite()) res.write(`data: ${JSON.stringify({ text: chunk })}\n\n`);
				}
			});

			if (canWrite()) {
				res.write(`data: [DONE]\n\n`);
				res.end();
			}
		} catch (error) {
			// An abort caused by the client disconnecting is expected; log everything else.
			if (!abortController.signal.aborted) {
				console.error("❌ Generation error:", error);
			}
			if (canWrite()) {
				res.write(`data: ${JSON.stringify({ error: error?.message || "Generation failed" })}\n\n`);
				res.end();
			}
		}
	});

	/**
	 * POST /api/clear — wipes the server-side conversation by calling resetMemory().
	 *
	 * Responds with JSON { success: true } on success. If the reset throws, it
	 * responds with a 500 JSON error, matching the JSON error style of /api/chat
	 * instead of falling through to Express's default error page.
	 */
	app.post('/api/clear', (req, res) => {
		try {
			resetMemory();
			res.json({ success: true });
		} catch (error) {
			console.error("❌ Failed to reset memory:", error);
			res.status(500).json({ success: false, error: error?.message || "Reset failed" });
		}
	});

	// Boot sequence: prepare the model/context first, then create the initial
	// chat session, and only then start accepting HTTP requests.
	// `||` (not `??`) is intentional so that an empty PORT variable also falls
	// back to 7860.
	const PORT = process.env.PORT || 7860;
	setupSystem().then(context => {
		sharedContext = context;
		resetMemory();
		app.listen(PORT, "0.0.0.0", () => {
			console.log(`\n🚀 Stateful API live at port ${PORT}`);
		});
	}).catch(err => {
		// Any failure during setup is fatal: report it and exit with a non-zero code.
		console.error("❌ Boot Error:", err);
		process.exit(1);
	});