// llm-cpu-server / server.js
import express from "express";
import { fileURLToPath } from "url";
import path from "path";
import morgan from "morgan";
import os from "os";
import { getLlama, LlamaChatSession } from "node-llama-cpp";
// Reconstruct __dirname: ES modules do not provide it, so derive it from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const app = express();
app.use(express.json());
// 'dev' format: concise colored request logging to stdout.
app.use(morgan('dev'));
// Serve the front-end (if any) from ./public.
app.use(express.static(path.join(__dirname, 'public')));
// NOTE(review): 7860 is the conventional Hugging Face Spaces port — confirm for this deployment.
const PORT = process.env.PORT || 7860;
// Global AI state: populated by initModel(); /generate returns 503 until isModelReady flips true.
let modelInstance;
let contextInstance;
let isModelReady = false;
/* -----------------------
THE INVISIBLE QUEUE
This ensures requests line up perfectly without crashing the context
and without sending "Busy" errors to the user.
----------------------- */
/**
 * Serializes async tasks so they run strictly one at a time, in arrival
 * order. Callers are never told "busy" — their promise simply settles
 * once their turn has come and their task has finished.
 */
class RequestQueue {
  constructor() {
    // Tail of the chain; each new task is appended after the previous one.
    this.queue = Promise.resolve();
  }

  /**
   * Enqueue an async task.
   * @param {() => Promise<*>} task - Function invoked when the queue reaches it.
   * @returns {Promise<*>} Settles with the task's own result or error.
   */
  add(task) {
    return new Promise((resolve, reject) => {
      // Forward the task's outcome to the caller; the chain itself never
      // rejects, so one failed task cannot poison the tasks behind it.
      this.queue = this.queue.then(() =>
        Promise.resolve()
          .then(task)
          .then(resolve, reject)
      );
    });
  }
}
// Single shared queue: every /generate request is serialized through it.
const taskQueue = new RequestQueue();
// Logical CPU count for llama.cpp threading; clamp to at least 1.
const cpuCores = Math.max(1, os.cpus().length);
/**
 * Load the GGUF model from ./models and build the inference context.
 * Sets `isModelReady` on success; until then /generate answers 503.
 * Any load error propagates to the caller (handled in app.listen).
 */
async function initModel() {
  const divider = "-----------------------------------------";
  console.log(divider);
  console.log(`Initializing Llama using ${cpuCores} vCPUs...`);

  const llama = await getLlama();
  const ggufPath = path.join(__dirname, "models", "gemma-3-1b-it-UD-IQ1_S.gguf");

  // CPU-only inference: explicitly disable GPU offloading.
  modelInstance = await llama.loadModel({
    modelPath: ggufPath,
    gpu: false,
  });

  console.log("Creating hyper-optimized context...");
  contextInstance = await modelInstance.createContext({
    contextSize: 4096, // Cap context to save memory and increase speed
    batchSize: 512,
    threads: cpuCores,
    flashAttention: true, // MAJOR speed boost for CPU inference
  });

  isModelReady = true;
  console.log("Model successfully loaded! API is online. 🚀");
  console.log(divider);
}
/* -----------------------
STATELESS API ENDPOINT
----------------------- */
/* -----------------------
   STATELESS API ENDPOINT
   Body: { user_input (required), user_temp, user_inst, user_max_token }
   Responses: 200 {response}, 400 missing input, 503 model booting, 500 on failure.
----------------------- */
app.post("/generate", async (req, res) => {
  if (!isModelReady) {
    return res.status(503).json({ error: "Model is still booting up. Try again in a few seconds." });
  }
  const {
    user_input,
    user_temp = 0.7,
    user_inst = "You are an Wrld-AI assistant. Give short clear answers.",
    user_max_token = 1024
  } = req.body;
  if (!user_input) {
    return res.status(400).json({ error: "Missing required field: user_input" });
  }
  // Coerce the numeric knobs once, with explicit radix and NaN fallbacks,
  // so malformed client values cannot reach the sampler.
  const parsedMax = Number.parseInt(user_max_token, 10);
  const parsedTemp = Number.parseFloat(user_temp);
  const maxTokens = Number.isFinite(parsedMax) ? parsedMax : 1024;
  const temperature = Number.isFinite(parsedTemp) ? parsedTemp : 0.7;
  // Add request to the queue. The user's HTTP request will wait here
  // patiently until the CPU is free to generate the response.
  taskQueue.add(async () => {
    let sequence;
    try {
      // 1. Grab sequence memory ONLY when it is this request's turn
      sequence = contextInstance.getSequence();
      // 2. Create unique, stateless session
      const session = new LlamaChatSession({
        contextSequence: sequence,
        systemPrompt: user_inst
      });
      // 3. Generate response
      const responseText = await session.prompt(user_input, {
        maxTokens,
        temperature,
        topK: 40,
        topP: 0.9,
        repeatPenalty: 1.1
      });
      // Send successful response
      res.json({ response: responseText });
    } catch (err) {
      console.error("Error during generation:", err);
      // Only respond if nothing has been sent yet, to avoid ERR_HTTP_HEADERS_SENT.
      if (!res.headersSent) {
        res.status(500).json({ error: "An internal error occurred during text generation." });
      }
    } finally {
      // 4. INSTANT CLEANUP: Free up the sequence immediately for the next request in the queue
      if (sequence) {
        sequence.dispose();
      }
    }
  }).catch((err) => {
    // Safety net: the task handles its own errors, but never leave the
    // queue promise floating — an unexpected rejection here would
    // otherwise become an unhandled rejection and can crash the process.
    console.error("Unexpected queue failure:", err);
  });
});
/**
 * Start the HTTP listener first, then load the model in the background,
 * so the port is bound immediately. A model-load failure is logged but
 * intentionally does not kill the web server.
 */
const onListening = () => {
  console.log(`✅ Web server is listening on port ${PORT}`);
  initModel().catch((err) => {
    console.error("Critical Failure: Failed to load the AI model.", err);
  });
};
app.listen(PORT, "0.0.0.0", onListening);