// llm-cpu-server / server.js
import express from "express";
import { fileURLToPath } from "url";
import path from "path";
import morgan from "morgan";
import os from "os";
import { getLlama, LlamaChatSession } from "node-llama-cpp";
// Reconstruct __dirname: ES modules do not provide it, so derive it from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const app = express();
app.use(express.json());
// 'dev' format: concise colored request logging to stdout.
app.use(morgan('dev'));
// Serve the front-end (if any) from ./public.
app.use(express.static(path.join(__dirname, 'public')));
// NOTE(review): 7860 is the conventional Hugging Face Spaces port — confirm for this deployment.
const PORT = process.env.PORT || 7860;
// Global AI state: populated by initModel(); /generate returns 503 until isModelReady flips true.
let modelInstance;
let contextInstance;
let isModelReady = false;
/* -----------------------
THE INVISIBLE QUEUE
This ensures requests line up perfectly without crashing the context
and without sending "Busy" errors to the user.
----------------------- */
/**
 * Serializes async tasks so they run strictly one at a time, in arrival
 * order. Callers are never told "busy" — their promise simply settles
 * once their turn has come and their task has finished.
 */
class RequestQueue {
  constructor() {
    // Tail of the chain; each new task is appended after the previous one.
    this.queue = Promise.resolve();
  }

  /**
   * Enqueue an async task.
   * @param {() => Promise<*>} task - Function invoked when the queue reaches it.
   * @returns {Promise<*>} Settles with the task's own result or error.
   */
  add(task) {
    return new Promise((resolve, reject) => {
      // Forward the task's outcome to the caller; the chain itself never
      // rejects, so one failed task cannot poison the tasks behind it.
      this.queue = this.queue.then(() =>
        Promise.resolve()
          .then(task)
          .then(resolve, reject)
      );
    });
  }
}
// Single shared queue: every /generate request is serialized through it.
const taskQueue = new RequestQueue();
// Logical CPU count for llama.cpp threading; clamp to at least 1.
const cpuCores = Math.max(1, os.cpus().length);
/**
 * Load the GGUF model from ./models and build the inference context.
 * Sets `isModelReady` on success; until then /generate answers 503.
 * Any load error propagates to the caller (handled in app.listen).
 */
async function initModel() {
  const divider = "-----------------------------------------";
  console.log(divider);
  console.log(`Initializing Llama using ${cpuCores} vCPUs...`);

  const llama = await getLlama();
  const ggufPath = path.join(__dirname, "models", "gemma-3-1b-it-UD-IQ1_S.gguf");

  // CPU-only inference: explicitly disable GPU offloading.
  modelInstance = await llama.loadModel({
    modelPath: ggufPath,
    gpu: false,
  });

  console.log("Creating hyper-optimized context...");
  contextInstance = await modelInstance.createContext({
    contextSize: 4096, // Cap context to save memory and increase speed
    batchSize: 512,
    threads: cpuCores,
    flashAttention: true, // MAJOR speed boost for CPU inference
  });

  isModelReady = true;
  console.log("Model successfully loaded! API is online. 🚀");
  console.log(divider);
}
/* -----------------------
STATELESS API ENDPOINT
----------------------- */
/* -----------------------
   STATELESS API ENDPOINT
   Body: { user_input (required), user_temp, user_inst, user_max_token }
   Responses: 200 {response}, 400 missing input, 503 model booting, 500 on failure.
----------------------- */
app.post("/generate", async (req, res) => {
  if (!isModelReady) {
    return res.status(503).json({ error: "Model is still booting up. Try again in a few seconds." });
  }
  const {
    user_input,
    user_temp = 0.7,
    user_inst = "You are an Wrld-AI assistant. Give short clear answers.",
    user_max_token = 1024
  } = req.body;
  if (!user_input) {
    return res.status(400).json({ error: "Missing required field: user_input" });
  }
  // Coerce the numeric knobs once, with explicit radix and NaN fallbacks,
  // so malformed client values cannot reach the sampler.
  const parsedMax = Number.parseInt(user_max_token, 10);
  const parsedTemp = Number.parseFloat(user_temp);
  const maxTokens = Number.isFinite(parsedMax) ? parsedMax : 1024;
  const temperature = Number.isFinite(parsedTemp) ? parsedTemp : 0.7;
  // Add request to the queue. The user's HTTP request will wait here
  // patiently until the CPU is free to generate the response.
  taskQueue.add(async () => {
    let sequence;
    try {
      // 1. Grab sequence memory ONLY when it is this request's turn
      sequence = contextInstance.getSequence();
      // 2. Create unique, stateless session
      const session = new LlamaChatSession({
        contextSequence: sequence,
        systemPrompt: user_inst
      });
      // 3. Generate response
      const responseText = await session.prompt(user_input, {
        maxTokens,
        temperature,
        topK: 40,
        topP: 0.9,
        repeatPenalty: 1.1
      });
      // Send successful response
      res.json({ response: responseText });
    } catch (err) {
      console.error("Error during generation:", err);
      // Only respond if nothing has been sent yet, to avoid ERR_HTTP_HEADERS_SENT.
      if (!res.headersSent) {
        res.status(500).json({ error: "An internal error occurred during text generation." });
      }
    } finally {
      // 4. INSTANT CLEANUP: Free up the sequence immediately for the next request in the queue
      if (sequence) {
        sequence.dispose();
      }
    }
  }).catch((err) => {
    // Safety net: the task handles its own errors, but never leave the
    // queue promise floating — an unexpected rejection here would
    // otherwise become an unhandled rejection and can crash the process.
    console.error("Unexpected queue failure:", err);
  });
});
/**
 * Start the HTTP listener first, then load the model in the background,
 * so the port is bound immediately. A model-load failure is logged but
 * intentionally does not kill the web server.
 */
const onListening = () => {
  console.log(`✅ Web server is listening on port ${PORT}`);
  initModel().catch((err) => {
    console.error("Critical Failure: Failed to load the AI model.", err);
  });
};
app.listen(PORT, "0.0.0.0", onListening);