// NOTE: scrape residue removed here (Hugging Face Spaces page chrome,
// git blame hashes, and the line-number gutter were fused into this file).
import express from "express";
import { fileURLToPath } from "url";
import path from "path";
import morgan from "morgan";
import os from "os";
import { getLlama, LlamaChatSession } from "node-llama-cpp";
// ESM has no built-in __dirname; reconstruct it from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const app = express();
// Middleware order matters: body parsing and logging run before static files.
app.use(express.json()); // parse JSON request bodies (used by POST /generate)
app.use(morgan('dev')); // concise per-request logging
app.use(express.static(path.join(__dirname, 'public'))); // serve the front-end
// 7860 default — presumably the Hugging Face Spaces convention; confirm for other hosts.
const PORT = process.env.PORT || 7860;
// Global AI variables
let modelInstance; // loaded GGUF model — assigned by initModel()
let contextInstance; // shared inference context — assigned by initModel()
let isModelReady = false; // flips true once initModel() completes; gates /generate
/* -----------------------
THE INVISIBLE QUEUE
This ensures requests line up perfectly without crashing the context
and without sending "Busy" errors to the user.
----------------------- */
/**
 * Serializes async tasks so only one runs at a time (FIFO).
 *
 * Each call to `add` appends the task to an internal promise chain, so
 * concurrent HTTP requests line up for the single llama context instead
 * of racing it or receiving "busy" errors.
 */
class RequestQueue {
  constructor() {
    // Tail of the chain. Invariant: this promise never rejects, so one
    // failed task cannot wedge every subsequent request.
    this.queue = Promise.resolve();
  }

  /**
   * Enqueue an async task to run after all previously added tasks settle.
   * @param {() => Promise<any>} task - Work to perform when it is this
   *   request's turn; may resolve or reject.
   * @returns {Promise<any>} Settles with the task's own outcome.
   */
  add(task) {
    // Chain directly instead of wrapping in `new Promise` with manual
    // resolve/reject (the explicit-construction anti-pattern): the caller
    // observes the task's real outcome via `result`.
    const result = this.queue.then(() => task());
    // Advance the tail past any rejection so the queue keeps draining.
    this.queue = result.catch(() => {});
    return result;
  }
}
// Single shared queue — all generation work funnels through it.
const taskQueue = new RequestQueue();
// Thread count for inference; fall back to 1 if os.cpus() reports nothing.
const cpuCores = os.cpus().length || 1;
/**
 * Loads the GGUF model from disk and builds the shared inference context.
 * On success, populates the module-level `modelInstance` / `contextInstance`
 * and flips `isModelReady` so /generate starts accepting requests.
 */
async function initModel() {
  const divider = "-----------------------------------------";
  console.log(divider);
  console.log(`Initializing Llama using ${cpuCores} vCPUs...`);

  const llamaRuntime = await getLlama();
  const modelLocation = path.join(__dirname, "models", "gemma-3-1b-it-UD-IQ1_S.gguf");

  // CPU-only inference: GPU offload is disabled explicitly.
  modelInstance = await llamaRuntime.loadModel({
    modelPath: modelLocation,
    gpu: false
  });

  console.log("Creating hyper-optimized context...");
  contextInstance = await modelInstance.createContext({
    contextSize: 4096, // Cap context to save memory and increase speed
    batchSize: 512,
    threads: cpuCores,
    flashAttention: true // MAJOR speed boost for CPU inference
  });

  isModelReady = true;
  console.log("Model successfully loaded! API is online. 🚀");
  console.log(divider);
}
/* -----------------------
STATELESS API ENDPOINT
----------------------- */
/**
 * POST /generate — stateless text generation.
 * Body: { user_input (required), user_temp?, user_inst?, user_max_token? }.
 * Responds 503 while the model boots, 400 on invalid input, 500 on
 * generation failure, otherwise { response: string }.
 */
app.post("/generate", async (req, res) => {
  if (!isModelReady) {
    return res.status(503).json({ error: "Model is still booting up. Try again in a few seconds." });
  }

  const {
    user_input,
    user_temp = 0.7,
    user_inst = "You are an Wrld-AI assistant. Give short clear answers.",
    user_max_token = 1024
  } = req.body;

  if (!user_input) {
    return res.status(400).json({ error: "Missing required field: user_input" });
  }

  // Coerce and validate numeric knobs up front so malformed input fails
  // fast with a 400 instead of surfacing as a 500 deep inside generation.
  const maxTokens = Number.parseInt(user_max_token, 10); // explicit radix
  const temperature = Number.parseFloat(user_temp);
  if (!Number.isFinite(maxTokens) || maxTokens <= 0) {
    return res.status(400).json({ error: "user_max_token must be a positive integer." });
  }
  if (!Number.isFinite(temperature) || temperature < 0) {
    return res.status(400).json({ error: "user_temp must be a non-negative number." });
  }

  // Add request to the queue. The user's HTTP request will wait here
  // patiently until the CPU is free to generate the response. The returned
  // promise is explicitly caught so a failure outside the inner try (e.g.
  // in the finally-cleanup) can never become an unhandled rejection.
  taskQueue.add(async () => {
    let sequence;
    try {
      // 1. Grab sequence memory ONLY when it is this request's turn
      sequence = contextInstance.getSequence();

      // 2. Create unique, stateless session
      const session = new LlamaChatSession({
        contextSequence: sequence,
        systemPrompt: user_inst
      });

      // 3. Generate response
      const responseText = await session.prompt(user_input, {
        maxTokens,
        temperature,
        topK: 40,
        topP: 0.9,
        repeatPenalty: 1.1
      });

      // Send successful response
      res.json({ response: responseText });
    } catch (err) {
      console.error("Error during generation:", err);
      // Guard against a double-send if the failure happened after the
      // response had already started streaming.
      if (!res.headersSent) {
        res.status(500).json({ error: "An internal error occurred during text generation." });
      }
    } finally {
      // 4. INSTANT CLEANUP: Free up the sequence immediately for the next request in the queue
      sequence?.dispose();
    }
  }).catch((err) => {
    console.error("Unexpected queue failure:", err);
  });
});
// Start accepting HTTP traffic immediately; the model loads in the
// background and /generate answers 503 until it is ready.
app.listen(PORT, "0.0.0.0", async () => {
  console.log(`✅ Web server is listening on port ${PORT}`);
  try {
    await initModel();
  } catch (err) {
    // Logged but non-fatal: the web server stays up to serve static assets.
    console.error("Critical Failure: Failed to load the AI model.", err);
  }
});