Update server.js
server.js
CHANGED
@@ -2,6 +2,7 @@ import express from "express";
 import { fileURLToPath } from "url";
 import path from "path";
 import morgan from "morgan";
+import os from "os";
 import { getLlama, LlamaChatSession } from "node-llama-cpp";

 const __dirname = path.dirname(fileURLToPath(import.meta.url));
@@ -9,106 +10,125 @@ const app = express();

 app.use(express.json());
 app.use(morgan('dev'));
-
-// Serve the UI
 app.use(express.static(path.join(__dirname, 'public')));

 const PORT = process.env.PORT || 7860;

+// Global AI variables
 let modelInstance;
 let contextInstance;
 let isModelReady = false;

 /* -----------------------
-   [old section title not recoverable from the diff view]
+   THE INVISIBLE QUEUE
+   This ensures requests line up perfectly without crashing the context
+   and without sending "Busy" errors to the user.
 ----------------------- */
+class RequestQueue {
+  constructor() {
+    this.queue = Promise.resolve();
+  }
+  add(task) {
+    return new Promise((resolve, reject) => {
+      this.queue = this.queue.then(async () => {
+        try {
+          const result = await task();
+          resolve(result);
+        } catch (err) {
+          reject(err);
+        }
+      });
+    });
+  }
+}
+const taskQueue = new RequestQueue();
+
+const cpuCores = Math.max(1, os.cpus().length);
+
 async function initModel() {
   console.log("-----------------------------------------");
-  console.log(…
+  console.log(`Initializing Llama using ${cpuCores} vCPUs...`);
   const llama = await getLlama();

   const modelLocation = path.join(__dirname, "models", "gemma-3-1b-it-UD-IQ1_S.gguf");

-  console.log(`Loading model into memory: ${modelLocation}`);
   modelInstance = await llama.loadModel({
     modelPath: modelLocation,
     gpu: false
   });

-  console.log("Creating …
+  console.log("Creating hyper-optimized context...");
   contextInstance = await modelInstance.createContext({
+    contextSize: 4096, // Cap context to save memory and increase speed
     batchSize: 512,
-    threads: …
+    threads: cpuCores,
+    flashAttention: true // MAJOR speed boost for CPU inference
   });

   isModelReady = true;
-  console.log("Model successfully loaded …
+  console.log("Model successfully loaded! API is online. 🚀");
   console.log("-----------------------------------------");
 }

 /* -----------------------
-   API
+   STATELESS API ENDPOINT
 ----------------------- */
-
 app.post("/generate", async (req, res) => {
   if (!isModelReady) {
-    return res.status(503).json({
-      error: "The AI model is still loading into memory. Please wait a few seconds and try again."
-    });
+    return res.status(503).json({ error: "Model is still booting up. Try again in a few seconds." });
   }

-  [several removed lines not recoverable from the diff view; the const { … } = req.body destructure opened here]
-    user_inst = "You are an Wrld-Gpt AI assistant. Give short clear answers. Do not make assumptions",
-    user_max_token = 1024
-  } = req.body;
+  const {
+    user_input,
+    user_temp = 0.7,
+    user_inst = "You are an Wrld-AI assistant. Give short clear answers.",
+    user_max_token = 1024
+  } = req.body;

   if (!user_input) {
     return res.status(400).json({ error: "Missing required field: user_input" });
   }

-  // Grab a sequence memory slot for this specific request
-  sequence = contextInstance.getSequence();
-
-  const session = new LlamaChatSession({
-    contextSequence: sequence,
-    systemPrompt: user_inst
-  });
-
-  const responseText = await session.prompt(user_input, {
-    maxTokens: parseInt(user_max_token),
-    temperature: parseFloat(user_temp),
-    topK: 40,
-    topP: 0.9,
-    repeatPenalty: 1.1
-  });
-
-  [several removed lines not recoverable from the diff view]
-  sequence.…
+  // Add request to the queue. The user's HTTP request will wait here
+  // patiently until the CPU is free to generate the response.
+  taskQueue.add(async () => {
+    let sequence;
+    try {
+      // 1. Grab sequence memory ONLY when it is this request's turn
+      sequence = contextInstance.getSequence();
+
+      // 2. Create unique, stateless session
+      const session = new LlamaChatSession({
+        contextSequence: sequence,
+        systemPrompt: user_inst
+      });
+
+      // 3. Generate response
+      const responseText = await session.prompt(user_input, {
+        maxTokens: parseInt(user_max_token),
+        temperature: parseFloat(user_temp),
+        topK: 40,
+        topP: 0.9,
+        repeatPenalty: 1.1
+      });
+
+      // Send successful response
+      res.json({ response: responseText });
+
+    } catch (err) {
+      console.error("Error during generation:", err);
+      res.status(500).json({ error: "An internal error occurred during text generation." });
+    } finally {
+      // 4. INSTANT CLEANUP: Free up the sequence immediately for the next request in the queue
+      if (sequence) {
+        sequence.dispose();
+      }
     }
-  }
+  });
 });

-/* -----------------------
-   STARTUP SEQUENCE
------------------------ */
-
 app.listen(PORT, "0.0.0.0", () => {
   console.log(`✅ Web server is listening on port ${PORT}`);
-  console.log(`⏳ Starting background model load...`);
-
-  //LOAD THE MODEL IN THE BACKGROUND
   initModel().catch(err => {
     console.error("Critical Failure: Failed to load the AI model.", err);
   });
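What the change does: instead of one request at a time grabbing the context directly, every call to /generate is appended to a single promise chain. RequestQueue.add() tacks the task onto this.queue and hands back a separate promise, so tasks run strictly one at a time in arrival order, and a failure rejects only that task's own promise without breaking the chain for later tasks. A standalone sketch of that behavior, reusing the RequestQueue class from the diff (the demo tasks and names are made up for illustration):

const q = new RequestQueue();
const sleep = (ms) => new Promise((r) => setTimeout(r, ms));

// Submitted together, but they run strictly in order:
q.add(async () => { await sleep(50); console.log("task 1 done"); });
q.add(async () => { console.log("task 2 starts only after task 1"); });

// A failing task rejects its own promise; the chain keeps going:
q.add(async () => { throw new Error("boom"); })
  .catch((err) => console.log("task 3 failed alone:", err.message));
q.add(async () => { console.log("task 4 still runs"); });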
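A note on ordering inside the handler: contextInstance.getSequence() is deliberately called inside the queued task, not at the top of the route. A context created as above exposes a small fixed pool of sequences (just one by default, as far as I can tell, since no sequence count is passed to createContext), so grabbing one eagerly while another request is mid-generation would be expected to fail. A comment-only sketch of the hazard the queue avoids, assuming that single-sequence default:

// Hypothetical interleaving WITHOUT the queue (single-sequence context):
//   request A: const seqA = contextInstance.getSequence(); // OK
//   request B: const seqB = contextInstance.getSequence(); // no free sequence -> error
//
// WITH the queue, request B's task does not start until request A's
// finally block has run sequence.dispose(), returning the slot to the pool.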
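Once initModel() finishes and isModelReady flips to true, the endpoint can be smoke-tested like this (a hypothetical client call; the route, port default, field names, and response shape all come from the handler above):

// Node 18+ ESM script or REPL; fetch is built in.
const res = await fetch("http://localhost:7860/generate", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    user_input: "Say hello in five words.", // required
    user_temp: 0.5,                         // optional, defaults to 0.7
    user_max_token: 64                      // optional, defaults to 1024
  })
});
console.log(await res.json()); // { response: "..." } on success, { error: "..." } otherwise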