Update server.js
server.js
CHANGED
@@ -2,6 +2,7 @@ import express from "express";
 import { fileURLToPath } from "url";
 import path from "path";
 import morgan from "morgan";
+import os from "os";
 import { getLlama, LlamaChatSession } from "node-llama-cpp";

 const __dirname = path.dirname(fileURLToPath(import.meta.url));
@@ -9,106 +10,125 @@ const app = express();

 app.use(express.json());
 app.use(morgan('dev'));
-
-// Serve the UI
 app.use(express.static(path.join(__dirname, 'public')));

 const PORT = process.env.PORT || 7860;

+// Global AI variables
 let modelInstance;
 let contextInstance;
 let isModelReady = false;

 /* -----------------------
-   [old section title not recoverable from the diff view]
+   THE INVISIBLE QUEUE
+   This ensures requests line up perfectly without crashing the context
+   and without sending "Busy" errors to the user.
 ----------------------- */
+class RequestQueue {
+  constructor() {
+    this.queue = Promise.resolve();
+  }
+  add(task) {
+    return new Promise((resolve, reject) => {
+      this.queue = this.queue.then(async () => {
+        try {
+          const result = await task();
+          resolve(result);
+        } catch (err) {
+          reject(err);
+        }
+      });
+    });
+  }
+}
+const taskQueue = new RequestQueue();
+
+const cpuCores = Math.max(1, os.cpus().length);
+
 async function initModel() {
   console.log("-----------------------------------------");
-  console.log(…
+  console.log(`Initializing Llama using ${cpuCores} vCPUs...`);
   const llama = await getLlama();

   const modelLocation = path.join(__dirname, "models", "gemma-3-1b-it-UD-IQ1_S.gguf");

-  console.log(`Loading model into memory: ${modelLocation}`);
   modelInstance = await llama.loadModel({
     modelPath: modelLocation,
     gpu: false
   });

-  console.log("Creating …
+  console.log("Creating hyper-optimized context...");
   contextInstance = await modelInstance.createContext({
+    contextSize: 4096, // Cap context to save memory and increase speed
     batchSize: 512,
-    threads: …
+    threads: cpuCores,
+    flashAttention: true // MAJOR speed boost for CPU inference
   });

   isModelReady = true;
-  console.log("Model successfully loaded …
+  console.log("Model successfully loaded! API is online. 🚀");
   console.log("-----------------------------------------");
 }

 /* -----------------------
-   API
+   STATELESS API ENDPOINT
 ----------------------- */
-
 app.post("/generate", async (req, res) => {
   if (!isModelReady) {
-    return res.status(503).json({
-      error: "The AI model is still loading into memory. Please wait a few seconds and try again."
-    });
+    return res.status(503).json({ error: "Model is still booting up. Try again in a few seconds." });
   }

-  [several removed lines not recoverable from the diff view; the const { … } = req.body destructure opened here]
-    user_inst = "You are an Wrld-Gpt AI assistant. Give short clear answers. Do not make assumptions",
-    user_max_token = 1024
-  } = req.body;
+  const {
+    user_input,
+    user_temp = 0.7,
+    user_inst = "You are an Wrld-AI assistant. Give short clear answers.",
+    user_max_token = 1024
+  } = req.body;

   if (!user_input) {
     return res.status(400).json({ error: "Missing required field: user_input" });
   }

-  // Grab a sequence memory slot for this specific request
-  sequence = contextInstance.getSequence();
-
-  const session = new LlamaChatSession({
-    contextSequence: sequence,
-    systemPrompt: user_inst
-  });
-
-  const responseText = await session.prompt(user_input, {
-    maxTokens: parseInt(user_max_token),
-    temperature: parseFloat(user_temp),
-    topK: 40,
-    topP: 0.9,
-    repeatPenalty: 1.1
-  });
-
-  [several removed lines not recoverable from the diff view]
-  sequence.…
+  // Add request to the queue. The user's HTTP request will wait here
+  // patiently until the CPU is free to generate the response.
+  taskQueue.add(async () => {
+    let sequence;
+    try {
+      // 1. Grab sequence memory ONLY when it is this request's turn
+      sequence = contextInstance.getSequence();
+
+      // 2. Create unique, stateless session
+      const session = new LlamaChatSession({
+        contextSequence: sequence,
+        systemPrompt: user_inst
+      });
+
+      // 3. Generate response
+      const responseText = await session.prompt(user_input, {
+        maxTokens: parseInt(user_max_token),
+        temperature: parseFloat(user_temp),
+        topK: 40,
+        topP: 0.9,
+        repeatPenalty: 1.1
+      });
+
+      // Send successful response
+      res.json({ response: responseText });
+
+    } catch (err) {
+      console.error("Error during generation:", err);
+      res.status(500).json({ error: "An internal error occurred during text generation." });
+    } finally {
+      // 4. INSTANT CLEANUP: Free up the sequence immediately for the next request in the queue
+      if (sequence) {
+        sequence.dispose();
+      }
     }
-  }
+  });
 });

-/* -----------------------
-   STARTUP SEQUENCE
------------------------ */
-
 app.listen(PORT, "0.0.0.0", () => {
   console.log(`✅ Web server is listening on port ${PORT}`);
-  console.log(`⏳ Starting background model load...`);
-
-  //LOAD THE MODEL IN THE BACKGROUND
   initModel().catch(err => {
     console.error("Critical Failure: Failed to load the AI model.", err);
   });
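What the change does: instead of one request at a time grabbing the context directly, every call to /generate is appended to a single promise chain. RequestQueue.add() tacks the task onto this.queue and hands back a separate promise, so tasks run strictly one at a time in arrival order, and a failure rejects only that task's own promise without breaking the chain for later tasks. A standalone sketch of that behavior, reusing the RequestQueue class from the diff (the demo tasks and names are made up for illustration):

const q = new RequestQueue();
const sleep = (ms) => new Promise((r) => setTimeout(r, ms));

// Submitted together, but they run strictly in order:
q.add(async () => { await sleep(50); console.log("task 1 done"); });
q.add(async () => { console.log("task 2 starts only after task 1"); });

// A failing task rejects its own promise; the chain keeps going:
q.add(async () => { throw new Error("boom"); })
  .catch((err) => console.log("task 3 failed alone:", err.message));
q.add(async () => { console.log("task 4 still runs"); });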
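A note on ordering inside the handler: contextInstance.getSequence() is deliberately called inside the queued task, not at the top of the route. A context created as above exposes a small fixed pool of sequences (just one by default, as far as I can tell, since no sequence count is passed to createContext), so grabbing one eagerly while another request is mid-generation would be expected to fail. A comment-only sketch of the hazard the queue avoids, assuming that single-sequence default:

// Hypothetical interleaving WITHOUT the queue (single-sequence context):
//   request A: const seqA = contextInstance.getSequence(); // OK
//   request B: const seqB = contextInstance.getSequence(); // no free sequence -> error
//
// WITH the queue, request B's task does not start until request A's
// finally block has run sequence.dispose(), returning the slot to the pool.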
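Once initModel() finishes and isModelReady flips to true, the endpoint can be smoke-tested like this (a hypothetical client call; the route, port default, field names, and response shape all come from the handler above):

// Node 18+ ESM script or REPL; fetch is built in.
const res = await fetch("http://localhost:7860/generate", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    user_input: "Say hello in five words.", // required
    user_temp: 0.5,                         // optional, defaults to 0.7
    user_max_token: 64                      // optional, defaults to 1024
  })
});
console.log(await res.json()); // { response: "..." } on success, { error: "..." } otherwise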