import express from "express";
import { fileURLToPath } from "url";
import path from "path";
import morgan from "morgan";
import os from "os";
import { getLlama, LlamaChatSession } from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const app = express();
app.use(express.json());
app.use(morgan('dev'));
app.use(express.static(path.join(__dirname, 'public')));

const PORT = process.env.PORT || 7860;

// Global AI state. `isModelReady` gates the endpoint until initModel() finishes.
let modelInstance;
let contextInstance;
let isModelReady = false;

// Generation defaults, shared between destructuring fallbacks and input sanitizing.
const DEFAULT_TEMPERATURE = 0.7;
const DEFAULT_MAX_TOKENS = 1024;

/* -----------------------
   THE INVISIBLE QUEUE
   Serializes generation tasks on a single promise chain so only one
   request touches the llama context at a time. Callers simply await
   add(); they are never told "busy" — they just wait their turn.
   ----------------------- */
class RequestQueue {
  constructor() {
    // Tail of the chain; every new task is appended behind it.
    this.queue = Promise.resolve();
  }

  /**
   * Enqueue an async task. Resolves/rejects with the task's own outcome.
   * The internal chain itself never rejects (errors are forwarded to the
   * caller's promise), so one failed task cannot poison the queue.
   * @param {() => Promise<any>} task
   * @returns {Promise<any>}
   */
  add(task) {
    return new Promise((resolve, reject) => {
      this.queue = this.queue.then(async () => {
        try {
          resolve(await task());
        } catch (err) {
          reject(err);
        }
      });
    });
  }
}

const taskQueue = new RequestQueue();
const cpuCores = Math.max(1, os.cpus().length);

/**
 * Load the GGUF model and create a single shared inference context.
 * Sets `isModelReady` once the context is usable.
 */
async function initModel() {
  console.log("-----------------------------------------");
  console.log(`Initializing Llama using ${cpuCores} vCPUs...`);

  const llama = await getLlama();
  const modelLocation = path.join(__dirname, "models", "gemma-3-1b-it-UD-IQ1_S.gguf");

  modelInstance = await llama.loadModel({
    modelPath: modelLocation,
    gpu: false
  });

  console.log("Creating hyper-optimized context...");
  contextInstance = await modelInstance.createContext({
    contextSize: 4096,   // Cap context to save memory and increase speed
    batchSize: 512,
    threads: cpuCores,
    flashAttention: true // MAJOR speed boost for CPU inference
  });

  isModelReady = true;
  // NOTE: was previously split across a line break inside the string
  // literal (a syntax error); rejoined into a single literal.
  console.log("Model successfully loaded! API is online. 🚀");
  console.log("-----------------------------------------");
}

/**
 * Coerce a client-supplied value to a finite number, falling back to
 * `fallback` when the value is missing or not parseable.
 * @param {unknown} value
 * @param {number} fallback
 * @returns {number}
 */
function toFiniteNumber(value, fallback) {
  const n = typeof value === "string" ? Number.parseFloat(value) : Number(value);
  return Number.isFinite(n) ? n : fallback;
}

/* -----------------------
   STATELESS API ENDPOINT
   ----------------------- */
app.post("/generate", async (req, res) => {
  if (!isModelReady) {
    return res.status(503).json({
      error: "Model is still booting up. Try again in a few seconds."
    });
  }

  const {
    user_input,
    user_temp = DEFAULT_TEMPERATURE,
    user_inst = "You are a Wrld-AI assistant. Give short clear answers.",
    user_max_token = DEFAULT_MAX_TOKENS
  } = req.body;

  // Require a non-empty string: a missing or non-string value would
  // otherwise crash inside session.prompt().
  if (typeof user_input !== "string" || user_input.length === 0) {
    return res.status(400).json({ error: "Missing required field: user_input" });
  }

  // Sanitize numeric options up front so NaN never reaches the sampler.
  const maxTokens = Number.isFinite(Number.parseInt(user_max_token, 10))
    ? Number.parseInt(user_max_token, 10)
    : DEFAULT_MAX_TOKENS;
  const temperature = toFiniteNumber(user_temp, DEFAULT_TEMPERATURE);

  // Add the request to the queue. The HTTP request waits here patiently
  // until the CPU is free. Awaiting (instead of fire-and-forget) means a
  // queue-level failure cannot become an unhandled promise rejection.
  try {
    await taskQueue.add(async () => {
      let sequence;
      try {
        // 1. Grab sequence memory ONLY when it is this request's turn.
        sequence = contextInstance.getSequence();

        // 2. Create a unique, stateless session for this request.
        const session = new LlamaChatSession({
          contextSequence: sequence,
          systemPrompt: user_inst
        });

        // 3. Generate the response.
        const responseText = await session.prompt(user_input, {
          maxTokens,
          temperature,
          topK: 40,
          topP: 0.9,
          repeatPenalty: 1.1
        });

        res.json({ response: responseText });
      } catch (err) {
        console.error("Error during generation:", err);
        // Guard against a double-send if the response was already started.
        if (!res.headersSent) {
          res.status(500).json({
            error: "An internal error occurred during text generation."
          });
        }
      } finally {
        // 4. INSTANT CLEANUP: free the sequence immediately so the next
        //    queued request can claim it.
        if (sequence) {
          sequence.dispose();
        }
      }
    });
  } catch (err) {
    // Defensive: task errors are handled inside the task, so this should
    // not fire — but if it does, never leave the client hanging.
    console.error("Unexpected queue failure:", err);
    if (!res.headersSent) {
      res.status(500).json({ error: "An internal error occurred." });
    }
  }
});

app.listen(PORT, "0.0.0.0", () => {
  console.log(`✅ Web server is listening on port ${PORT}`);
  // Model loads in the background; the endpoint returns 503 until ready.
  initModel().catch((err) => {
    console.error("Critical Failure: Failed to load the AI model.", err);
  });
});