Spaces:
Build error
Update app.py
app.py
CHANGED
@@ -1,102 +1,160 @@
-
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 from llama_cpp import Llama
 from fastapi.middleware.cors import CORSMiddleware
 from huggingface_hub import hf_hub_download
 import logging
-import re
 import threading
 
-#
 logging.basicConfig(level=logging.INFO)
 
-# ---
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-logging.info("Loading model from local path with optimized settings...")
-llm = Llama(
-    model_path=model_path,
-    n_ctx=1024,
-    n_threads=2,
-    n_gpu_layers=0,
-    verbose=True
-)
-logging.info("Model loaded successfully! AI server is ready.")
-except Exception as e:
-    logging.critical(f"CRITICAL ERROR: Failed to load the model. Server will be non-functional. Error: {e}", exc_info=True)
 
-# ---
-app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
 
-# ---
 @app.get("/")
 def get_status():
-    """
     return {
         "status": "AI server is online",
-        "model_loaded":
     }
 
-@app.post("/
-async def
-    """
-
-    bypass the Pydantic 422 validation error.
-    """
     with model_lock:
-
-        logging.error("Chat request received but model is not loaded.")
-        return JSONResponse(status_code=503, content={"response": "The AI model is not available. Please contact support."})
-
         try:
-            #
-
-
-
 
-
-
 
-
-
-                logging.info(f"Handling request with HIGH quality setting (max_tokens={max_tokens}).")
-            else:
-                max_tokens = 200
-                logging.info(f"Handling request with LITE quality setting (max_tokens={max_tokens}).")
 
-
-
-
-
-
-
             )
 
-
-            logging.info("
-
-
         except Exception as e:
-            logging.error(f"
-            return JSONResponse(status_code=500, content={"
+import os
+import uvicorn
+from fastapi import FastAPI
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 from llama_cpp import Llama
 from fastapi.middleware.cors import CORSMiddleware
 from huggingface_hub import hf_hub_download
 import logging
 import threading
 
+# --- Setup ---
 logging.basicConfig(level=logging.INFO)
+app = FastAPI()
+model_lock = threading.Lock()  # Serializes all model access, carried over from the old app for stability
+llm_cache = {}  # Holds the currently loaded model
 
+# --- Model Map ---
+# The frontend can request "light", "medium", or "heavy"
+MODEL_MAP = {
+    "light": {
+        "repo_id": "TheBloke/stablelm-zephyr-3b-GGUF",
+        "filename": "stablelm-zephyr-3b.Q3_K_S.gguf"  # 1.25 GB
+    },
+    "medium": {
+        "repo_id": "TheBloke/stablelm-zephyr-3b-GGUF",
+        "filename": "stablelm-zephyr-3b.Q4_K_M.gguf"  # 1.71 GB
+    },
+    "heavy": {
+        "repo_id": "TheBloke/stablelm-zephyr-3b-GGUF",
+        "filename": "stablelm-zephyr-3b.Q5_K_M.gguf"  # 2.03 GB
+    }
+}
 
+# --- CORS ---
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],  # Allow the GitHub Pages frontend
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
 
+# --- Model Loading Logic ---
+def get_llm_instance(choice: str) -> Llama:
+    """
+    Downloads, loads, and caches a model.
+    Callers must hold model_lock, which makes this thread-safe.
+    """
+    if choice not in MODEL_MAP:
+        logging.error(f"Invalid model choice: {choice}")
+        return None
+
+    # If the model is already loaded, just return it
+    if choice in llm_cache:
+        logging.info(f"Using cached model: {choice}")
+        return llm_cache[choice]
+
+    # Not in the cache: download and load
+    model_info = MODEL_MAP[choice]
+    repo_id = model_info["repo_id"]
+    filename = model_info["filename"]
+
+    try:
+        logging.info(f"Downloading model: {filename} from {repo_id}...")
+        # hf_hub_download caches the file locally, so repeated loads skip the download
+        model_path = hf_hub_download(repo_id=repo_id, filename=filename)
+        logging.info(f"Model downloaded to: {model_path}")
+
+        logging.info("Loading model into memory...")
+        llm = Llama(
+            model_path=model_path,
+            n_ctx=4096,      # Max context window
+            n_threads=2,     # The free HF CPU tier has 2 cores
+            n_gpu_layers=0,  # Force CPU
+            verbose=True
+        )
+
+        llm_cache.clear()        # Evict any previously loaded model to save RAM
+        llm_cache[choice] = llm  # Cache the new model
+        logging.info(f"Model {choice} loaded successfully.")
+        return llm
+
+    except Exception as e:
+        logging.critical(f"Failed to download/load model {filename}. Error: {e}", exc_info=True)
+        return None
+
+# --- API Request Model ---
+class StoryPrompt(BaseModel):
+    prompt: str
+    feedback: str
+    story_memory: str
+    model_choice: str
+
+# --- App Startup Event ---
+@app.on_event("startup")
+async def startup_event():
+    """
+    On startup, acquire the lock and pre-load the default 'light' model.
+    This runs after the build, once the server process starts.
+    """
+    logging.info("Server starting... Acquiring lock to pre-load 'light' model.")
+    with model_lock:
+        get_llm_instance("light")
+    logging.info("Server is ready and the 'light' model is loaded.")
+
+# --- API Endpoints ---
 @app.get("/")
 def get_status():
+    """Health check endpoint."""
+    loaded_model = list(llm_cache.keys())[0] if llm_cache else "None"
     return {
         "status": "AI server is online",
+        "model_loaded": loaded_model
     }
 
+@app.post("/generate")
+async def generate_story(prompt: StoryPrompt):
+    """Main generation endpoint; the lock serializes concurrent requests."""
+    logging.info("Request received. Waiting for model lock...")
     with model_lock:
+        logging.info("Lock acquired. Processing.")
         try:
+            # 1. Get the requested LLM (loading it if needed)
+            llm = get_llm_instance(prompt.model_choice)
+            if llm is None:
+                return JSONResponse(status_code=503, content={"error": "Model failed to load."})
+
+            # 2. Format the prompt (Zephyr chat format)
+            final_prompt = f"""<|user|>
+Story so far:
+{prompt.story_memory}
+
+My new part/instruction:
+{prompt.prompt}
 
+Feedback to apply:
+{prompt.feedback}
 
+Generate the next part of the story.<|endoftext|>
+<|assistant|>"""
 
+            # 3. Generate
+            logging.info(f"Generating with {prompt.model_choice}...")
+            output = llm(
+                final_prompt,
+                max_tokens=512,
+                stop=["<|user|>", "<|endoftext|>"],
+                echo=False
             )
 
+            generated_text = output["choices"][0]["text"].strip()
+            logging.info("Generation complete.")
+
+            # This matches the key the frontend expects
+            return {"story_text": generated_text}
+
         except Exception as e:
+            logging.error(f"Generation error: {e}", exc_info=True)
+            return JSONResponse(status_code=500, content={"error": "An unexpected error occurred."})
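
As a usage sketch (not part of the commit): a minimal Python client for the new /generate endpoint. The route, the four request fields, and the "story_text" response key all come from the diff above; the Space URL is a hypothetical placeholder, and the first request for a given model_choice can be slow while the GGUF file downloads and loads.

import requests

SPACE_URL = "https://your-username-your-space.hf.space"  # hypothetical placeholder, not from the diff

payload = {
    "prompt": "The knight finally opens the sealed door.",
    "feedback": "",
    "story_memory": "",
    "model_choice": "light",  # "light", "medium", or "heavy", per MODEL_MAP
}

# Long timeout: generation on a 2-core CPU Space can take minutes
resp = requests.post(f"{SPACE_URL}/generate", json=payload, timeout=600)
resp.raise_for_status()
print(resp.json()["story_text"])

A GET on / should return something like {"status": "AI server is online", "model_loaded": "light"} once the startup pre-load has finished. One loose end in the new file: os and uvicorn are imported but never used. On Spaces the server is normally launched externally (for example, a Docker CMD such as uvicorn app:app --host 0.0.0.0 --port 7860), so those imports are presumably leftovers or reserved for a local entry point.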