tjwrld committed on
Commit
533392b
·
verified ·
1 Parent(s): 141facc

Update server.js

Browse files
Files changed (1) hide show
  1. server.js +80 -60
server.js CHANGED
@@ -2,6 +2,7 @@ import express from "express";
2
  import { fileURLToPath } from "url";
3
  import path from "path";
4
  import morgan from "morgan";
 
5
  import { getLlama, LlamaChatSession } from "node-llama-cpp";
6
 
7
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
@@ -9,106 +10,125 @@ const app = express();
9
 
10
  app.use(express.json());
11
  app.use(morgan('dev'));
12
-
13
- // Serve the UI
14
  app.use(express.static(path.join(__dirname, 'public')));
15
 
16
  const PORT = process.env.PORT || 7860;
17
 
 
18
  let modelInstance;
19
  let contextInstance;
20
- let isModelReady = false; // Flag to track if the AI is ready
21
 
22
  /* -----------------------
23
- LOAD MODEL
 
 
24
  ----------------------- */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  async function initModel() {
26
  console.log("-----------------------------------------");
27
- console.log("Initializing Llama Backend...");
28
  const llama = await getLlama();
29
 
30
  const modelLocation = path.join(__dirname, "models", "gemma-3-1b-it-UD-IQ1_S.gguf");
31
 
32
- console.log(`Loading model into memory: ${modelLocation}`);
33
  modelInstance = await llama.loadModel({
34
  modelPath: modelLocation,
35
  gpu: false
36
  });
37
 
38
- console.log("Creating context sequence...");
39
  contextInstance = await modelInstance.createContext({
 
40
  batchSize: 512,
41
- threads: 6
 
42
  });
43
 
44
- isModelReady = true; // Mark as ready!
45
- console.log("Model successfully loaded and ready for requests! 🚀");
46
  console.log("-----------------------------------------");
47
  }
48
 
49
  /* -----------------------
50
- API ENDPOINTS
51
  ----------------------- */
52
-
53
  app.post("/generate", async (req, res) => {
54
  if (!isModelReady) {
55
- return res.status(503).json({
56
- error: "The AI model is still loading into memory. Please wait a few seconds and try again."
57
- });
58
  }
59
 
60
- let sequence; // Define this here so we can clean it up in the 'finally' block
61
-
62
- try {
63
- const {
64
- user_input,
65
- user_temp = 0.2,
66
- user_inst = "You are an Wrld-Gpt AI assistant. Give short clear answers. Do not make assumptions",
67
- user_max_token = 1024
68
- } = req.body;
69
-
70
- if (!user_input) {
71
- return res.status(400).json({ error: "Missing required field: user_input" });
72
- }
73
-
74
- // Grab a sequence memory slot for this specific request
75
- sequence = contextInstance.getSequence();
76
-
77
- const session = new LlamaChatSession({
78
- contextSequence: sequence,
79
- systemPrompt: user_inst
80
- });
81
-
82
- const responseText = await session.prompt(user_input, {
83
- maxTokens: parseInt(user_max_token),
84
- temperature: parseFloat(user_temp),
85
- topK: 40,
86
- topP: 0.9,
87
- repeatPenalty: 1.1
88
- });
89
 
90
- res.json({ response: responseText });
 
 
91
 
92
- } catch (err) {
93
- console.error("Error during generation:", err);
94
- res.status(500).json({ error: "An internal error occurred during text generation." });
95
- } finally {
96
- // CRITICAL FIX: Always free up the sequence slot when done, even if an error occurs!
97
- if (sequence) {
98
- sequence.dispose();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  }
100
- }
101
  });
102
 
103
- /* -----------------------
104
- STARTUP SEQUENCE
105
- ----------------------- */
106
-
107
  app.listen(PORT, "0.0.0.0", () => {
108
  console.log(`✅ Web server is listening on port ${PORT}`);
109
- console.log(`⏳ Starting background model load...`);
110
-
111
- //LOAD THE MODEL IN THE BACKGROUND
112
  initModel().catch(err => {
113
  console.error("Critical Failure: Failed to load the AI model.", err);
114
  });
 
2
  import { fileURLToPath } from "url";
3
  import path from "path";
4
  import morgan from "morgan";
5
+ import os from "os";
6
  import { getLlama, LlamaChatSession } from "node-llama-cpp";
7
 
8
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
 
10
 
11
  app.use(express.json());
12
  app.use(morgan('dev'));
 
 
13
  app.use(express.static(path.join(__dirname, 'public')));
14
 
15
  const PORT = process.env.PORT || 7860;
16
 
17
+ // Global AI variables
18
  let modelInstance;
19
  let contextInstance;
20
+ let isModelReady = false;
21
 
22
  /* -----------------------
23
+ THE INVISIBLE QUEUE
24
+ This ensures requests line up perfectly without crashing the context
25
+ and without sending "Busy" errors to the user.
26
  ----------------------- */
27
/**
 * Runs asynchronous tasks strictly one after another, in the order they
 * were added. A failing task rejects only its own promise; tasks queued
 * behind it still run.
 */
class RequestQueue {
  constructor() {
    // Tail of the chain: settles once every task added so far has finished.
    this.queue = Promise.resolve();
  }

  /**
   * Schedules `task` to run after all previously added tasks have settled.
   *
   * @param {() => any} task - Function to invoke; may return a value or a promise.
   * @returns {Promise<any>} Settles with the task's result, or rejects with
   *   whatever the task threw or rejected with.
   */
  add(task) {
    return new Promise((resolve, reject) => {
      // Invoking the task inside a promise executor turns a synchronous
      // throw into a rejection, so the tail of the chain itself never rejects.
      const runWhenFree = () =>
        new Promise((started) => started(task())).then(resolve, reject);

      this.queue = this.queue.then(runWhenFree);
    });
  }
}

// Single shared queue instance.
const taskQueue = new RequestQueue();
45
+
46
// Number of inference threads to request. os.availableParallelism() (Node
// 18.14+) is the documented way to size a program's parallelism; older
// runtimes fall back to the logical core count. Clamped to at least 1
// because os.cpus() can return an empty list on some platforms.
const cpuCores = Math.max(1, os.availableParallelism?.() ?? os.cpus().length);
47
+
48
  async function initModel() {
49
  console.log("-----------------------------------------");
50
+ console.log(`Initializing Llama using ${cpuCores} vCPUs...`);
51
  const llama = await getLlama();
52
 
53
  const modelLocation = path.join(__dirname, "models", "gemma-3-1b-it-UD-IQ1_S.gguf");
54
 
 
55
  modelInstance = await llama.loadModel({
56
  modelPath: modelLocation,
57
  gpu: false
58
  });
59
 
60
+ console.log("Creating hyper-optimized context...");
61
  contextInstance = await modelInstance.createContext({
62
+ contextSize: 4096, // Cap context to save memory and increase speed
63
  batchSize: 512,
64
+ threads: cpuCores,
65
+ flashAttention: true // MAJOR speed boost for CPU inference
66
  });
67
 
68
+ isModelReady = true;
69
+ console.log("Model successfully loaded! API is online. 🚀");
70
  console.log("-----------------------------------------");
71
  }
72
 
73
  /* -----------------------
74
+ STATELESS API ENDPOINT
75
  ----------------------- */
 
76
  app.post("/generate", async (req, res) => {
77
  if (!isModelReady) {
78
+ return res.status(503).json({ error: "Model is still booting up. Try again in a few seconds." });
 
 
79
  }
80
 
81
+ const {
82
+ user_input,
83
+ user_temp = 0.7,
84
+ user_inst = "You are an Wrld-AI assistant. Give short clear answers.",
85
+ user_max_token = 1024
86
+ } = req.body;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
+ if (!user_input) {
89
+ return res.status(400).json({ error: "Missing required field: user_input" });
90
+ }
91
 
92
+ // Add request to the queue. The user's HTTP request will wait here
93
+ // patiently until the CPU is free to generate the response.
94
+ taskQueue.add(async () => {
95
+ let sequence;
96
+ try {
97
+ // 1. Grab sequence memory ONLY when it is this request's turn
98
+ sequence = contextInstance.getSequence();
99
+
100
+ // 2. Create unique, stateless session
101
+ const session = new LlamaChatSession({
102
+ contextSequence: sequence,
103
+ systemPrompt: user_inst
104
+ });
105
+
106
+ // 3. Generate response
107
+ const responseText = await session.prompt(user_input, {
108
+ maxTokens: parseInt(user_max_token),
109
+ temperature: parseFloat(user_temp),
110
+ topK: 40,
111
+ topP: 0.9,
112
+ repeatPenalty: 1.1
113
+ });
114
+
115
+ // Send successful response
116
+ res.json({ response: responseText });
117
+
118
+ } catch (err) {
119
+ console.error("Error during generation:", err);
120
+ res.status(500).json({ error: "An internal error occurred during text generation." });
121
+ } finally {
122
+ // 4. INSTANT CLEANUP: Free up the sequence immediately for the next request in the queue
123
+ if (sequence) {
124
+ sequence.dispose();
125
+ }
126
  }
127
+ });
128
  });
129
 
 
 
 
 
130
  app.listen(PORT, "0.0.0.0", () => {
131
  console.log(`✅ Web server is listening on port ${PORT}`);
 
 
 
132
  initModel().catch(err => {
133
  console.error("Critical Failure: Failed to load the AI model.", err);
134
  });