File size: 4,265 Bytes
d8685b0
05e881a
 
8325cb6
533392b
05e881a
d8685b0
05e881a
d8685b0
0768396
d8685b0
8325cb6
90a79c7
d8685b0
05e881a
d8685b0
533392b
90a79c7
 
533392b
d8685b0
0768396
533392b
 
 
0768396
533392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05e881a
90a79c7
533392b
05e881a
 
90a79c7
 
 
 
8325cb6
05e881a
0768396
533392b
90a79c7
533392b
05e881a
533392b
 
05e881a
 
533392b
 
90a79c7
05e881a
d8685b0
0768396
533392b
0768396
d8685b0
8325cb6
533392b
8325cb6
 
533392b
 
 
 
 
 
d8685b0
533392b
 
 
d8685b0
533392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141facc
533392b
d8685b0
 
8325cb6
 
 
 
05e881a
d8685b0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import express from "express";
import { fileURLToPath } from "url";
import path from "path";
import morgan from "morgan";
import os from "os";
import { getLlama, LlamaChatSession } from "node-llama-cpp";

// ES modules have no __dirname; reconstruct it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const app = express();

// JSON bodies, request logging, and static assets from ./public.
app.use(express.json());
app.use(morgan('dev'));
app.use(express.static(path.join(__dirname, 'public')));

// NOTE(review): 7860 is the conventional Hugging Face Spaces port — confirm deploy target.
const PORT = process.env.PORT || 7860;

// Global AI variables
// Populated by initModel(); every request handler checks isModelReady first.
let modelInstance;
let contextInstance;
let isModelReady = false;

/* -----------------------
   THE INVISIBLE QUEUE
   This ensures requests line up perfectly without crashing the context
   and without sending "Busy" errors to the user.
----------------------- */
/**
 * Serializes async tasks: each added task starts only after every
 * previously added task has settled. Callers get a promise that settles
 * with their own task's outcome, and a rejected task never stalls the
 * tasks queued behind it.
 */
class RequestQueue {
    constructor() {
        // Tail of the chain; new tasks are appended after it.
        this.queue = Promise.resolve();
    }
    /**
     * Enqueue an async task.
     * @param {() => Promise<*>} task - work to run when the queue is free
     * @returns {Promise<*>} resolves/rejects with the task's own result
     */
    add(task) {
        // Chain directly instead of wrapping in `new Promise(...)`
        // (explicit-construction anti-pattern): the caller's promise IS
        // the chained task.
        const result = this.queue.then(() => task());
        // Advance the tail past this task's settlement, swallowing the
        // rejection on the chain only — callers still observe it via
        // `result`, and one failure does not poison the queue.
        this.queue = result.catch(() => {});
        return result;
    }
}
const taskQueue = new RequestQueue();

// Thread count for inference: one per visible core, never below 1
// (os.cpus() can return an empty array on some platforms).
const cpuCores = Math.max(1, os.cpus().length);

/**
 * Load the GGUF model and build the shared inference context.
 * Assigns the module-level modelInstance/contextInstance and flips
 * isModelReady once both are in place. CPU-only: GPU is disabled.
 */
async function initModel() {
    console.log("-----------------------------------------");
    console.log(`Initializing Llama using ${cpuCores} vCPUs...`);

    const llamaRuntime = await getLlama();
    const ggufPath = path.join(__dirname, "models", "gemma-3-1b-it-UD-IQ1_S.gguf");

    modelInstance = await llamaRuntime.loadModel({ modelPath: ggufPath, gpu: false });

    console.log("Creating hyper-optimized context...");
    contextInstance = await modelInstance.createContext({
        contextSize: 4096,   // Cap context to save memory and increase speed
        batchSize: 512,
        threads: cpuCores,
        flashAttention: true // MAJOR speed boost for CPU inference
    });

    isModelReady = true;
    console.log("Model successfully loaded! API is online. 🚀");
    console.log("-----------------------------------------");
}

/* -----------------------
   STATELESS API ENDPOINT
----------------------- */
/**
 * POST /generate
 * Body: { user_input: string, user_temp?: number, user_inst?: string, user_max_token?: number }
 * Replies { response: string } on success; 503 while the model boots,
 * 400 on bad input, 500 on generation failure.
 */
app.post("/generate", async (req, res) => {
    if (!isModelReady) {
        return res.status(503).json({ error: "Model is still booting up. Try again in a few seconds." });
    }

    const {
        user_input,
        user_temp = 0.7,
        user_inst = "You are an Wrld-AI assistant. Give short clear answers.",
        user_max_token = 1024
    } = req.body;

    if (!user_input) {
        return res.status(400).json({ error: "Missing required field: user_input" });
    }

    // Validate numeric knobs up front (radix-10 parse; reject NaN) so bad
    // input fails fast with a 400 instead of misbehaving inside the sampler.
    const maxTokens = Number.parseInt(user_max_token, 10);
    const temperature = Number.parseFloat(user_temp);
    if (!Number.isFinite(maxTokens) || maxTokens <= 0 || !Number.isFinite(temperature)) {
        return res.status(400).json({ error: "user_max_token and user_temp must be valid numbers." });
    }

    // Add request to the queue. The user's HTTP request will wait here 
    // patiently until the CPU is free to generate the response.
    taskQueue.add(async () => {
        let sequence;
        try {
            // 1. Grab sequence memory ONLY when it is this request's turn
            sequence = contextInstance.getSequence();

            // 2. Create unique, stateless session
            const session = new LlamaChatSession({
                contextSequence: sequence,
                systemPrompt: user_inst 
            });

            // 3. Generate response
            const responseText = await session.prompt(user_input, {
                maxTokens,
                temperature,
                topK: 40,
                topP: 0.9,
                repeatPenalty: 1.1
            });

            // Send successful response
            res.json({ response: responseText });

        } catch (err) {
            console.error("Error during generation:", err);
            // Only reply if nothing has been sent yet, avoiding a double-send crash.
            if (!res.headersSent) {
                res.status(500).json({ error: "An internal error occurred during text generation." });
            }
        } finally {
            // 4. INSTANT CLEANUP: Free up the sequence immediately for the next request in the queue
            if (sequence) {
                sequence.dispose();
            }
        }
    }).catch((err) => {
        // The task handles its own errors; this guards the response path
        // itself so a throw there never becomes an unhandled rejection.
        console.error("Unexpected queue failure:", err);
    });
});

// Start accepting HTTP traffic immediately; the model loads in the
// background so static assets can be served while it boots.
app.listen(PORT, "0.0.0.0", () => {
    console.log(`✅ Web server is listening on port ${PORT}`);

    // Fire-and-forget model load: a failure is logged, and the server
    // keeps answering (handlers respond 503 until isModelReady flips).
    const reportLoadFailure = (err) =>
        console.error("Critical Failure: Failed to load the AI model.", err);
    initModel().catch(reportLoadFailure);
});