fugthchat committed on
Commit
5ae1757
·
verified ·
1 Parent(s): 14f464e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +165 -63
app.py CHANGED
@@ -1,66 +1,168 @@
1
- from flask import Flask, request, jsonify
2
- from llama_cpp import Llama
3
  import os
4
-
5
- app = Flask(__name__)
6
-
7
- MODEL_URLS = {
8
- "light": "https://huggingface.co/TheBloke/stablelm-zephyr-3b-GGUF/resolve/main/stablelm-zephyr-3b.Q3_K_S.gguf",
9
- "medium": "https://huggingface.co/TheBloke/stablelm-zephyr-3b-GGUF/resolve/main/stablelm-zephyr-3b.Q4_K_M.gguf",
10
- "heavy": "https://huggingface.co/TheBloke/stablelm-zephyr-3b-GGUF/resolve/main/stablelm-zephyr-3b.Q5_0.gguf"
11
- }
12
-
13
- MODEL_PATHS = {
14
- k: f"{k}.gguf" for k in MODEL_URLS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  }
16
 
17
- current_model = None
18
- llm = None
19
-
20
- def ensure_model(model_choice):
21
- global llm, current_model
22
- model_path = MODEL_PATHS[model_choice]
23
- url = MODEL_URLS[model_choice]
24
-
25
- if not os.path.exists(model_path):
26
- print(f"Downloading {model_choice} model...")
27
- os.system(f"wget -O {model_path} {url}")
28
-
29
- if current_model != model_choice:
30
- print(f"Loading {model_choice} model...")
31
- llm = Llama(model_path=model_path, n_ctx=2048, n_threads=4, use_mlock=False)
32
- current_model = model_choice
33
- return llm
34
-
35
-
36
- @app.route("/status")
37
- def status():
38
- return jsonify({"status": "ok" if llm else "not_loaded", "model": current_model})
39
-
40
-
41
- @app.route("/generate", methods=["POST"])
42
- def generate():
43
- data = request.get_json(force=True)
44
- model_choice = data.get("model_choice", "light")
45
- prompt = data.get("prompt", "")
46
- story_memory = data.get("story_memory", "")
47
- feedback = data.get("feedback", "")
48
-
49
- llm = ensure_model(model_choice)
50
-
51
- full_prompt = story_memory + "\n\n" + prompt
52
- if feedback:
53
- full_prompt += f"\n\nUser feedback: {feedback}\n"
54
-
55
- result = llm(full_prompt, max_tokens=512, temperature=0.8)
56
- text = result["choices"][0]["text"].strip()
57
- return jsonify({"response": text})
58
-
59
-
60
- @app.route("/")
61
- def root():
62
- return "StableLM Zephyr GGUF API running!"
63
-
64
-
65
- if __name__ == "__main__":
66
- app.run(host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ from fastapi import FastAPI, Request
3
+ from fastapi.responses import JSONResponse
4
+ from pydantic import BaseModel
5
+ from llama_cpp import Llama
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from huggingface_hub import hf_hub_download
8
+ import logging
9
+ import threading
10
+
11
# Configure root logging so model download/load progress shows in Space logs.
logging.basicConfig(level=logging.INFO)

# --- MODEL MAP ---
# Maps the "light" / "medium" / "heavy" keys sent by the frontend to
# concrete GGUF files on the Hugging Face Hub. All three quantisations
# live in the same repository, so only the filename differs.
_ZEPHYR_REPO = "TheBloke/stablelm-zephyr-3b-GGUF"

MODEL_MAP = {
    "light": {
        "repo_id": _ZEPHYR_REPO,
        "filename": "stablelm-zephyr-3b.Q3_K_S.gguf",  # 1.25 GB
    },
    "medium": {
        "repo_id": _ZEPHYR_REPO,
        "filename": "stablelm-zephyr-3b.Q4_K_M.gguf",  # 1.71 GB
    },
    "heavy": {
        "repo_id": _ZEPHYR_REPO,
        "filename": "stablelm-zephyr-3b.Q5_K_M.gguf",  # 2.03 GB
    },
}
31
 
32
# --- GLOBAL CACHE & LOCK ---
# At most one model is kept resident at a time: llm_cache maps the
# currently-active choice key -> its Llama instance.
llm_cache = {}
# Serialises every load/inference; the llama.cpp handle is not safe to
# share between concurrent requests.
model_lock = threading.Lock()

app = FastAPI()

# --- CORS ---
# Wide-open CORS so the static frontend (e.g. GitHub Pages) can reach us.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
46
+
47
# --- Helper Function to Load Model ---
def get_llm_instance(choice: str) -> Llama:
    """
    Return a ready-to-use Llama instance for *choice* ("light"/"medium"/"heavy").

    Downloads the GGUF file via hf_hub_download on first use, loads it
    CPU-only, and caches it. Only one model is kept in memory at a time.
    Returns None (callers must check) on an invalid choice or on any
    download/load failure.
    """
    if choice not in MODEL_MAP:
        logging.error(f"Invalid model choice: {choice}")
        return None

    # Fast path: requested model is already resident.
    if choice in llm_cache:
        logging.info(f"Using cached model: {choice}")
        return llm_cache[choice]

    model_info = MODEL_MAP[choice]
    repo_id = model_info["repo_id"]
    filename = model_info["filename"]

    try:
        # BUG FIX: these two log messages previously printed the literal
        # text "(unknown)" instead of the model filename.
        logging.info(f"Downloading model: {filename} from {repo_id}")
        model_path = hf_hub_download(repo_id=repo_id, filename=filename)
        logging.info(f"Model downloaded to: {model_path}")

        logging.info("Loading model into memory...")
        llm = Llama(
            model_path=model_path,
            n_ctx=4096,       # max context window
            n_threads=2,      # sized for the free HF CPU tier
            n_gpu_layers=0,   # force CPU-only inference
            verbose=True
        )

        # Evict any previously loaded model before caching the new one —
        # the free tier cannot hold two 3B models in RAM at once.
        llm_cache.clear()
        llm_cache[choice] = llm
        logging.info(f"Model {choice} loaded successfully.")
        return llm

    except Exception as e:
        logging.critical(f"CRITICAL ERROR: Failed to download/load model {filename}. Error: {e}", exc_info=True)
        return None
88
+
89
# --- API Data Models ---
class StoryPrompt(BaseModel):
    """Request body for POST /generate."""
    # The passage the user just wrote or wants continued.
    prompt: str
    # Free-form guidance for the next chapter (may be an empty string).
    feedback: str
    # The accumulated story so far.
    story_memory: str
    # One of the MODEL_MAP keys: "light", "medium", or "heavy".
    model_choice: str
95
+
96
# --- API Endpoints ---

@app.on_event("startup")
async def startup_event():
    """
    Pre-warm the service when the Space boots: download and load the
    'light' model under the shared lock, so the first real request does
    not pay the full cold-start cost.
    """
    logging.info("Server starting up... Acquiring lock to pre-load model.")
    with model_lock:
        get_llm_instance("light")
    logging.info("Server is ready and 'light' model is loaded.")
108
+
109
+ @app.get("/")
110
+ def get_status():
111
+ """
112
+ Health check endpoint.
113
+ This is what your frontend pings.
114
+ """
115
+ loaded_model = list(llm_cache.keys())[0] if llm_cache else "None"
116
+ return {
117
+ "status": "AI server is online",
118
+ "model_loaded": loaded_model,
119
+ "models": list(MODEL_MAP.keys()) # <-- This is the CRUCIAL line for your frontend
120
+ }
121
+
122
+ @app.post("/generate")
123
+ async def generate_story(prompt: StoryPrompt):
124
+ """
125
+ Main generation endpoint.
126
+ Uses the thread lock to ensure stability.
127
+ """
128
+ logging.info("Request received. Waiting to acquire model lock...")
129
+ with model_lock:
130
+ logging.info("Lock acquired. Processing request.")
131
+ try:
132
+ llm = get_llm_instance(prompt.model_choice)
133
+ if llm is None:
134
+ logging.error(f"Failed to get model for choice: {prompt.model_choice}")
135
+ return JSONResponse(status_code=503, content={"error": "The AI model is not available or failed to load."})
136
+
137
+ # Format the prompt (Zephyr/ChatML format)
138
+ final_prompt = f"""<|user|>
139
+ Here is the story so far:
140
+ {prompt.story_memory}
141
+
142
+ Here is the part I just wrote or want to continue from:
143
+ {prompt.prompt}
144
+
145
+ Please use this feedback to guide the next chapter:
146
+ {prompt.feedback}
147
+
148
+ Generate the next part of the story.<|endoftext|>
149
+ <|assistant|>"""
150
+
151
+ logging.info(f"Generating with {prompt.model_choice}...")
152
+ output = llm(
153
+ final_prompt,
154
+ max_tokens=512,
155
+ stop=["<|user|>", "<|endoftext|>"],
156
+ echo=False
157
+ )
158
+
159
+ generated_text = output["choices"][0]["text"].strip()
160
+ logging.info("Generation complete.")
161
+
162
+ return {"story_text": generated_text}
163
+
164
+ except Exception as e:
165
+ logging.error(f"An internal error occurred during generation: {e}", exc_info=True)
166
+ return JSONResponse(status_code=500, content={"error": "An unexpected error occurred."})
167
+ finally:
168
+ logging.info("Releasing model lock.")