Update app.py
app.py CHANGED
@@ -8,12 +8,9 @@ import requests
 from tqdm import tqdm
 
 # --- Configuration ---
-# This is the model the Space will download and use if it's not already present.
-# You can change this to any GGUF model compatible with llama-cpp-python.
 MODEL_NAME = "stablelm-zephyr-3b.Q3_K_S.gguf"
 MODEL_URL = f"https://huggingface.co/TheBloke/stablelm-zephyr-3b-GGUF/resolve/main/{MODEL_NAME}"
 MODEL_PATH = f"./{MODEL_NAME}"
-N_GPU_LAYERS = -1  # -1 to offload all layers to GPU, 0 for CPU only. Adjust if needed.
 N_CTX = 4096  # Context window size.
 
 # --- Model Download ---
@@ -38,46 +35,41 @@ def download_model():
 # --- FastAPI App ---
 app = FastAPI()
 
-# Add CORS middleware to allow requests from your website
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["http://fugthdesign.space", "https://fugthdesign.space"],
+    allow_origins=["http://fugthdesign.space", "https://fugthdesign.space"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
 
-# Load the model on startup
 llm = None
 @app.on_event("startup")
 def load_llm():
     global llm
     if download_model():
         try:
-            print("Loading GGUF model...")
+            print("Loading GGUF model for CPU...")
             llm = Llama(
                 model_path=MODEL_PATH,
                 n_ctx=N_CTX,
-                n_gpu_layers=N_GPU_LAYERS,
+                n_gpu_layers=0,  # ** THIS IS THE KEY CHANGE FOR CPU **
                 verbose=True
             )
-            print("Model loaded successfully!")
+            print("Model loaded successfully on CPU!")
         except Exception as e:
             print(f"Error loading model: {e}")
 
-# Define the structure of the data your frontend will send
 class ChatRequest(BaseModel):
     userInput: str
     persona: str
     localKnowledge: str
 
-# --- API Endpoint ---
 @app.post("/chat")
 async def chat(request: ChatRequest):
     if not llm:
         return {"error": "Model is not loaded or failed to load."}
 
-    # Construct the prompt using the data from the frontend
     system_prompt = request.persona or "You are a helpful AI assistant."
     knowledge_context = f"Use the following context to inform your answer:\n---CONTEXT---\n{request.localKnowledge}\n---END CONTEXT---" if request.localKnowledge else ""
 
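The diff shows only the signature of download_model() in the hunk header; its body is unchanged and outside the context lines. For orientation, a streaming requests-plus-tqdm downloader along the following lines is the usual shape of such a function. This is a hedged sketch reconstructed from the imports and constants visible above, not the committed implementation; the chunk size, timeout, and error handling are assumptions.

```python
import os

import requests
from tqdm import tqdm

MODEL_NAME = "stablelm-zephyr-3b.Q3_K_S.gguf"
MODEL_URL = f"https://huggingface.co/TheBloke/stablelm-zephyr-3b-GGUF/resolve/main/{MODEL_NAME}"
MODEL_PATH = f"./{MODEL_NAME}"


def download_model() -> bool:
    """Fetch the GGUF file if it is not already on disk; return True on success."""
    if os.path.exists(MODEL_PATH):
        print("Model already present, skipping download.")
        return True
    try:
        with requests.get(MODEL_URL, stream=True, timeout=30) as resp:
            resp.raise_for_status()
            total = int(resp.headers.get("content-length", 0))
            # Stream to disk in 1 MiB chunks, showing a progress bar.
            with open(MODEL_PATH, "wb") as f, tqdm(
                total=total, unit="B", unit_scale=True, desc=MODEL_NAME
            ) as bar:
                for chunk in resp.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
                    bar.update(len(chunk))
        return True
    except requests.RequestException as e:
        print(f"Download failed: {e}")
        return False
```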
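With n_gpu_layers=0 the model now runs entirely on CPU, and the /chat endpoint takes the three ChatRequest fields shown in the diff (userInput, persona, localKnowledge). Below is a minimal smoke test against a locally running copy of the Space; the base URL and port 7860 (the Hugging Face Spaces default) are assumptions, not part of this commit.

```python
import requests

BASE_URL = "http://localhost:7860"  # assumption: local run on the default Spaces port

payload = {
    "userInput": "What services does fugthdesign.space offer?",
    "persona": "You are a helpful AI assistant.",
    "localKnowledge": "",  # empty string skips the ---CONTEXT--- block in the prompt
}

resp = requests.post(f"{BASE_URL}/chat", json=payload, timeout=120)
resp.raise_for_status()
# On success this prints the model's reply; if the model failed to load,
# the endpoint returns {"error": "Model is not loaded or failed to load."}.
print(resp.json())
```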