// Express + node-llama-cpp inference server (CPU-only) with a FIFO request queue.
| import express from "express"; | |
| import { fileURLToPath } from "url"; | |
| import path from "path"; | |
| import morgan from "morgan"; | |
| import os from "os"; | |
| import { getLlama, LlamaChatSession } from "node-llama-cpp"; | |
// ESM has no built-in __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// HTTP app: JSON body parsing, request logging, then static assets.
// Registration order matters: express.json() must run before route handlers.
const app = express();
app.use(express.json());
app.use(morgan('dev'));
app.use(express.static(path.join(__dirname, 'public')));

// Port is supplied by the host environment (e.g. HF Spaces) or defaults to 7860.
const PORT = process.env.PORT || 7860;

// Module-level AI state, populated by initModel(); /generate gates on isModelReady.
let modelInstance;
let contextInstance;
let isModelReady = false;
/* -----------------------
   THE INVISIBLE QUEUE
   Serializes requests so they never crash the shared context
   and the user never sees a "Busy" error — each HTTP request
   simply waits its turn.
   ----------------------- */
/**
 * A FIFO promise queue: tasks passed to add() run strictly one at a time,
 * in submission order. A rejected task propagates its error to that task's
 * caller but does not stall the queue for subsequent tasks.
 *
 * Rewritten without the explicit-Promise-construction anti-pattern:
 * chaining directly off the tail promise is equivalent and simpler.
 */
class RequestQueue {
  constructor() {
    // Tail of the chain; always settles so the queue can never deadlock.
    this.queue = Promise.resolve();
  }

  /**
   * Enqueue an async task.
   * @param {() => Promise<any>} task - Work to run when the queue is free.
   * @returns {Promise<any>} Resolves/rejects with the task's own outcome.
   */
  add(task) {
    const result = this.queue.then(() => task());
    // Swallow the error on the *chain* only — the caller still sees it
    // via `result` — so the next queued task runs regardless.
    this.queue = result.catch(() => {});
    return result;
  }
}
// One shared queue serializes all generation work.
const taskQueue = new RequestQueue();

// Thread count for inference: every reported core, but never fewer than one.
const cpuCores = Math.max(os.cpus().length, 1);
/**
 * Load the GGUF model and create the shared inference context (CPU-only).
 *
 * Generalized: the model path is now an optional parameter, defaulting to
 * the MODEL_PATH env var and then the bundled Gemma file — fully backward
 * compatible with the original zero-argument call.
 *
 * @param {string} [modelPath] - Absolute path to a .gguf model file.
 * Side effects: sets module-level modelInstance/contextInstance and flips
 * isModelReady on success; rejects on load failure (caller handles).
 */
async function initModel(
  modelPath = process.env.MODEL_PATH ??
    path.join(__dirname, "models", "gemma-3-1b-it-UD-IQ1_S.gguf")
) {
  console.log("-----------------------------------------");
  console.log(`Initializing Llama using ${cpuCores} vCPUs...`);

  const llama = await getLlama();

  modelInstance = await llama.loadModel({
    modelPath,
    gpu: false
  });

  console.log("Creating hyper-optimized context...");
  contextInstance = await modelInstance.createContext({
    contextSize: 4096, // Cap context to save memory and increase speed
    batchSize: 512,
    threads: cpuCores,
    flashAttention: true // MAJOR speed boost for CPU inference
  });

  isModelReady = true;
  console.log("Model successfully loaded! API is online. 🚀");
  console.log("-----------------------------------------");
}
/* -----------------------
   STATELESS API ENDPOINT
   ----------------------- */
/**
 * POST /generate — stateless text generation.
 *
 * Body: { user_input (required), user_temp?, user_inst?, user_max_token? }.
 * Responses: 503 while the model is loading, 400 on missing input,
 * 500 on generation failure, otherwise 200 with { response: string }.
 *
 * Fixes: parseInt now has an explicit radix; non-numeric user-supplied
 * sampling params fall back to defaults instead of becoming NaN; the
 * queued promise is no longer floating (guarded with .catch); the 500
 * path checks res.headersSent; grammar fix in the default system prompt.
 */
app.post("/generate", async (req, res) => {
  if (!isModelReady) {
    return res.status(503).json({ error: "Model is still booting up. Try again in a few seconds." });
  }

  const {
    user_input,
    user_temp = 0.7,
    user_inst = "You are a Wrld-AI assistant. Give short clear answers.",
    user_max_token = 1024
  } = req.body;

  if (!user_input) {
    return res.status(400).json({ error: "Missing required field: user_input" });
  }

  // Coerce untrusted sampling params; fall back to defaults on garbage input.
  const parsedMaxTokens = Number.parseInt(user_max_token, 10);
  const parsedTemp = Number.parseFloat(user_temp);
  const maxTokens = Number.isNaN(parsedMaxTokens) ? 1024 : parsedMaxTokens;
  const temperature = Number.isNaN(parsedTemp) ? 0.7 : parsedTemp;

  // Enqueue the work; this HTTP request waits here until the CPU is free.
  taskQueue.add(async () => {
    let sequence;
    try {
      // 1. Claim sequence memory ONLY when it is this request's turn.
      sequence = contextInstance.getSequence();

      // 2. Create a unique, stateless session per request.
      const session = new LlamaChatSession({
        contextSequence: sequence,
        systemPrompt: user_inst
      });

      // 3. Generate the response.
      const responseText = await session.prompt(user_input, {
        maxTokens,
        temperature,
        topK: 40,
        topP: 0.9,
        repeatPenalty: 1.1
      });

      res.json({ response: responseText });
    } catch (err) {
      console.error("Error during generation:", err);
      // Guard: the response may already be (partially) sent.
      if (!res.headersSent) {
        res.status(500).json({ error: "An internal error occurred during text generation." });
      }
    } finally {
      // 4. INSTANT CLEANUP: free the sequence so the next queued request can run.
      if (sequence) {
        sequence.dispose();
      }
    }
  }).catch((err) => {
    // The task handles its own errors; this guards the queue machinery itself
    // so a failure never becomes an unhandled promise rejection.
    console.error("Unexpected queue failure:", err);
  });
});
// Start accepting connections immediately; the model loads in the background
// and /generate answers 503 until isModelReady flips true.
const server = app.listen(PORT, "0.0.0.0", () => {
  console.log(`✅ Web server is listening on port ${PORT}`);
  initModel().catch((err) => {
    console.error("Critical Failure: Failed to load the AI model.", err);
  });
});