Update server.js
server.js (CHANGED)
import express from "express";
import { fileURLToPath } from "url";
import path from "path";
import morgan from "morgan"; // Useful logging
import { getLlama, LlamaChatSession } from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const app = express();

// Middleware to parse JSON bodies and log requests
app.use(express.json());
app.use(morgan('dev')); // Logs endpoint access to the console

// SERVE THE UI: Tells Express to look for files in the "public" folder
app.use(express.static(path.join(__dirname, 'public')));

// Hugging Face Spaces expects apps to run on port 7860
const PORT = process.env.PORT || 7860;

// Set global instances so the model stays loaded in memory
let modelInstance;
let contextInstance;

/* -----------------------
   LOAD MODEL (ONCE)
----------------------- */
async function initModel() {
  console.log("-----------------------------------------");
  console.log("Initializing Llama Backend...");
  const llama = await getLlama();

  // Path inside the Docker container
  const modelLocation = path.join(__dirname, "models", "gemma-3-1b-it-UD-IQ1_S.gguf");

  console.log(`Loading model into memory: ${modelLocation}`);
  modelInstance = await llama.loadModel({
    modelPath: modelLocation,
    gpu: false // Ensure CPU execution for HF Free tier
  });

  console.log("Creating context sequence...");
  contextInstance = await modelInstance.createContext({
    batchSize: 512,
    threads: 6 // Optimize for available vCPUs
  });

  console.log("Model successfully loaded! 🚀");
  console.log("-----------------------------------------");
}

/* -----------------------
   API ENDPOINTS
----------------------- */

/**
 * Main generation endpoint (The UI calls this internally)
 * Takes 4 inputs in JSON: user_input, user_temp, user_inst, user_max_token
 */
app.post("/generate", async (req, res) => {
  try {
    // Input validation and defaults
    const {
      user_input,
      user_temp = 0.7,
      user_inst = "You are an AI assistant. Give short clear answers.",
      user_max_token = 5120
    } = req.body;

    if (!user_input) {
      return res.status(400).json({ error: "Missing required field: user_input" });
    }

    console.log(`Generating response for: "${user_input.substring(0, 50)}..."`);
    console.log(`Params: temp=${user_temp}, max=${user_max_token}`);

    // We create a new session for each request so it uses the *dynamic* instructions
    // This is safe because it reuses the global context sequence
    const session = new LlamaChatSession({
      contextSequence: contextInstance.getSequence(),
      systemPrompt: user_inst // Apply user provided instructions
    });

    // Generate response using provided temperature and max_token parameters
    const responseText = await session.prompt(user_input, {
      maxTokens: parseInt(user_max_token),
      temperature: parseFloat(user_temp),
      topK: 40,
      repeatPenalty: 1.1
    });

    res.json({ response: responseText });

  } catch (err) {
    console.error("Error during generation:", err);
    res.status(500).json({ error: "An internal error occurred during text generation." });
  }
});

/* -----------------------
   START SERVER
----------------------- */
initModel().then(() => {
  // Listen on 0.0.0.0 for external network routing (like Hugging Face)
  app.listen(PORT, "0.0.0.0", () => {
    console.log(`Server is listening on port ${PORT}`);
    console.log(`Access UI at: http://localhost:${PORT}`);
  });
}).catch(err => {
  console.error("Critical Failure: Failed to initialize the model server.", err);
  process.exit(1);
});
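For reference, a minimal sketch of how a client (for example the static UI served from the public folder) could call the endpoint is shown below. It is only an illustration, not part of this commit: the helper name askModel is hypothetical, the snippet assumes the server is reachable at the same origin, and the values for user_temp, user_inst, and user_max_token are arbitrary examples of the four JSON fields the endpoint accepts.

// Illustrative client call (not part of server.js): POST the four expected
// fields to /generate and read back the { response } payload it returns.
async function askModel(question) {
  const res = await fetch("/generate", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      user_input: question,                 // required
      user_temp: 0.7,                       // optional, defaults to 0.7
      user_inst: "You are an AI assistant. Give short clear answers.", // optional system prompt
      user_max_token: 1024                  // optional, defaults to 5120
    })
  });

  if (!res.ok) {
    // The server responds with { error } on 400 (missing input) and 500 (generation failure)
    const { error } = await res.json();
    throw new Error(error);
  }

  const { response } = await res.json();
  return response;
}

On success the endpoint returns a JSON body of the form { response: "generated text" }, matching the res.json({ response: responseText }) call in the handler above.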