File size: 4,265 Bytes
d8685b0
05e881a
 
8325cb6
533392b
05e881a
d8685b0
05e881a
d8685b0
0768396
d8685b0
8325cb6
90a79c7
d8685b0
05e881a
d8685b0
533392b
90a79c7
 
533392b
d8685b0
0768396
533392b
 
 
0768396
533392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05e881a
90a79c7
533392b
05e881a
 
90a79c7
 
 
 
8325cb6
05e881a
0768396
533392b
90a79c7
533392b
05e881a
533392b
 
05e881a
 
533392b
 
90a79c7
05e881a
d8685b0
0768396
533392b
0768396
d8685b0
8325cb6
533392b
8325cb6
 
533392b
 
 
 
 
 
d8685b0
533392b
 
 
d8685b0
533392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141facc
533392b
d8685b0
 
8325cb6
 
 
 
05e881a
d8685b0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import express from "express";
import { fileURLToPath } from "url";
import path from "path";
import morgan from "morgan";
import os from "os";
import { getLlama, LlamaChatSession } from "node-llama-cpp";

// ES modules have no __dirname; reconstruct it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const app = express();

// JSON bodies, request logging, and static assets from ./public.
app.use(express.json());
app.use(morgan('dev'));
app.use(express.static(path.join(__dirname, 'public')));

// NOTE(review): 7860 is the conventional Hugging Face Spaces port — confirm deploy target.
const PORT = process.env.PORT || 7860;

// Global AI variables
// Populated by initModel(); every request handler checks isModelReady first.
let modelInstance;
let contextInstance;
let isModelReady = false;

/* -----------------------
   THE INVISIBLE QUEUE
   This ensures requests line up perfectly without crashing the context
   and without sending "Busy" errors to the user.
----------------------- */
/**
 * Serializes async tasks: each added task starts only after every
 * previously added task has settled. Callers get a promise that settles
 * with their own task's outcome, and a rejected task never stalls the
 * tasks queued behind it.
 */
class RequestQueue {
    constructor() {
        // Tail of the chain; new tasks are appended after it.
        this.queue = Promise.resolve();
    }
    /**
     * Enqueue an async task.
     * @param {() => Promise<*>} task - work to run when the queue is free
     * @returns {Promise<*>} resolves/rejects with the task's own result
     */
    add(task) {
        // Chain directly instead of wrapping in `new Promise(...)`
        // (explicit-construction anti-pattern): the caller's promise IS
        // the chained task.
        const result = this.queue.then(() => task());
        // Advance the tail past this task's settlement, swallowing the
        // rejection on the chain only — callers still observe it via
        // `result`, and one failure does not poison the queue.
        this.queue = result.catch(() => {});
        return result;
    }
}
const taskQueue = new RequestQueue();

// Thread count for inference: one per visible core, never below 1
// (os.cpus() can return an empty array on some platforms).
const cpuCores = Math.max(1, os.cpus().length);

/**
 * Load the GGUF model and build the shared inference context.
 * Assigns the module-level modelInstance/contextInstance and flips
 * isModelReady once both are in place. CPU-only: GPU is disabled.
 */
async function initModel() {
    console.log("-----------------------------------------");
    console.log(`Initializing Llama using ${cpuCores} vCPUs...`);

    const llamaRuntime = await getLlama();
    const ggufPath = path.join(__dirname, "models", "gemma-3-1b-it-UD-IQ1_S.gguf");

    modelInstance = await llamaRuntime.loadModel({ modelPath: ggufPath, gpu: false });

    console.log("Creating hyper-optimized context...");
    contextInstance = await modelInstance.createContext({
        contextSize: 4096,   // Cap context to save memory and increase speed
        batchSize: 512,
        threads: cpuCores,
        flashAttention: true // MAJOR speed boost for CPU inference
    });

    isModelReady = true;
    console.log("Model successfully loaded! API is online. 🚀");
    console.log("-----------------------------------------");
}

/* -----------------------
   STATELESS API ENDPOINT
----------------------- */
/**
 * POST /generate
 * Body: { user_input: string, user_temp?: number, user_inst?: string, user_max_token?: number }
 * Replies { response: string } on success; 503 while the model boots,
 * 400 on bad input, 500 on generation failure.
 */
app.post("/generate", async (req, res) => {
    if (!isModelReady) {
        return res.status(503).json({ error: "Model is still booting up. Try again in a few seconds." });
    }

    const {
        user_input,
        user_temp = 0.7,
        user_inst = "You are an Wrld-AI assistant. Give short clear answers.",
        user_max_token = 1024
    } = req.body;

    if (!user_input) {
        return res.status(400).json({ error: "Missing required field: user_input" });
    }

    // Validate numeric knobs up front (radix-10 parse; reject NaN) so bad
    // input fails fast with a 400 instead of misbehaving inside the sampler.
    const maxTokens = Number.parseInt(user_max_token, 10);
    const temperature = Number.parseFloat(user_temp);
    if (!Number.isFinite(maxTokens) || maxTokens <= 0 || !Number.isFinite(temperature)) {
        return res.status(400).json({ error: "user_max_token and user_temp must be valid numbers." });
    }

    // Add request to the queue. The user's HTTP request will wait here 
    // patiently until the CPU is free to generate the response.
    taskQueue.add(async () => {
        let sequence;
        try {
            // 1. Grab sequence memory ONLY when it is this request's turn
            sequence = contextInstance.getSequence();

            // 2. Create unique, stateless session
            const session = new LlamaChatSession({
                contextSequence: sequence,
                systemPrompt: user_inst 
            });

            // 3. Generate response
            const responseText = await session.prompt(user_input, {
                maxTokens,
                temperature,
                topK: 40,
                topP: 0.9,
                repeatPenalty: 1.1
            });

            // Send successful response
            res.json({ response: responseText });

        } catch (err) {
            console.error("Error during generation:", err);
            // Only reply if nothing has been sent yet, avoiding a double-send crash.
            if (!res.headersSent) {
                res.status(500).json({ error: "An internal error occurred during text generation." });
            }
        } finally {
            // 4. INSTANT CLEANUP: Free up the sequence immediately for the next request in the queue
            if (sequence) {
                sequence.dispose();
            }
        }
    }).catch((err) => {
        // The task handles its own errors; this guards the response path
        // itself so a throw there never becomes an unhandled rejection.
        console.error("Unexpected queue failure:", err);
    });
});

// Start accepting HTTP traffic immediately; the model loads in the
// background so static assets can be served while it boots.
app.listen(PORT, "0.0.0.0", () => {
    console.log(`✅ Web server is listening on port ${PORT}`);

    // Fire-and-forget model load: a failure is logged, and the server
    // keeps answering (handlers respond 503 until isModelReady flips).
    const reportLoadFailure = (err) =>
        console.error("Critical Failure: Failed to load the AI model.", err);
    initModel().catch(reportLoadFailure);
});