CooLLaMACEO committed on
Commit
111e00f
·
verified ·
1 Parent(s): ca02091

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -27
app.py CHANGED
@@ -1,65 +1,103 @@
1
  import os
 
2
  from fastapi import FastAPI, Request
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from fastapi.responses import JSONResponse
5
  from llama_cpp import Llama
6
 
7
- # 20B Q3_K_M is ~11.5GB. With context, it will hit ~14-15GB RAM.
 
 
 
 
8
  MODEL_PATH = "./models/gpt-oss-20b-Q3_K_M.gguf"
 
9
 
10
- # Initialize Model BEFORE FastAPI starts to ensure it's ready
11
- print("πŸ”₯ Loading 20B Engine (This may take 2-4 minutes)...")
12
- try:
13
- llm = Llama(
14
- model_path=MODEL_PATH,
15
- n_ctx=1024, # Reduced context to stay under 16GB RAM limit
16
- n_threads=2, # HF Free Tier has 2 vCPUs
17
- n_batch=128,
18
- verbose=True
19
- )
20
- print("βœ… Model Loaded Successfully.")
21
- except Exception as e:
22
- print(f"❌ Failed to load model: {e}")
23
- llm = None
 
 
 
 
24
 
25
- app = FastAPI()
 
26
 
27
- # CORS: Allow your GitHub site to talk to this API
28
  app.add_middleware(
29
  CORSMiddleware,
30
- allow_origins=["https://hydrogenclient.github.io"],
31
  allow_credentials=True,
32
  allow_methods=["*"],
33
  allow_headers=["*"],
34
  )
35
 
 
 
 
 
 
36
  @app.get("/")
37
  async def root():
38
  return {"status": "online", "message": "Connect to /chat"}
39
 
 
 
 
 
40
  @app.post("/chat")
41
  async def chat(request: Request):
42
  if llm is None:
43
- return JSONResponse({"error": "Model failed to load on start."}, status_code=500)
44
 
45
  try:
46
  data = await request.json()
47
- user_message = data.get("message", "")
48
 
49
- # GPT-OSS formatting
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  prompt = f"<|system|>You are a helpful AI.<|user|>{user_message}<|assistant|>"
51
 
 
52
  output = llm(
53
  prompt,
54
  max_tokens=256,
55
- stop=["<|user|>", "</s>"],
56
  temperature=0.7
57
  )
58
 
59
- return {"response": output["choices"][0]["text"].strip()}
 
 
60
  except Exception as e:
61
- return JSONResponse({"error": str(e)}, status_code=500)
 
62
 
63
- @app.get("/health")
64
- async def health():
65
- return {"status": "ready" if llm else "initializing"}
 
 
 
1
import logging
import os

from fastapi import FastAPI, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

from llama_cpp import Llama
7
 
8
# --- Logging -------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Model configuration (gpt-oss 20B, Q3_K_M quantization) --------------
MODEL_PATH = "./models/gpt-oss-20b-Q3_K_M.gguf"

# Populated lazily by load_model() at startup; None means "not loaded yet".
llm = None
15
 
16
def load_model() -> None:
    """Load the GGUF model into the module-global ``llm`` (idempotent).

    Called once from the FastAPI startup hook. On any failure ``llm``
    stays ``None`` so ``/chat`` keeps answering 503; the full traceback
    is logged so the cause is visible in the server logs.
    """
    global llm
    if llm is not None:
        # Already loaded — make repeat calls cheap no-ops.
        return

    # Fail fast with a clear message if the model file is missing,
    # instead of a llama_cpp stack trace deep inside the loader.
    if not os.path.isfile(MODEL_PATH):
        logger.error("❌ Model file not found: %s", MODEL_PATH)
        return

    logger.info("🔥 Initializing 20B Engine (Direct I/O Mode)...")
    try:
        # Conservative settings to fit the HF free tier (2 vCPU / 16 GB RAM).
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=1024,       # Crucial: keep context low to avoid OOM crashes
            n_threads=2,      # HF Free tier limit
            n_batch=512,
            use_mmap=False,   # Match your log discovery
            use_mlock=False,
            verbose=True,
        )
        logger.info("✅ Brain Linked! System Online.")
    except Exception:
        llm = None
        # logger.exception records the full traceback, not just str(e).
        logger.exception("❌ Initialization failed")
34
 
35
# --- FastAPI application -------------------------------------------------
app = FastAPI(title="ChatGPT Open-Source 1.0 API")

# --- CORS: the browser frontend (GitHub Pages / local dev) calls this API
# cross-origin.
# NOTE(review): per the CORS spec, a wildcard origin ("*") cannot be
# combined with credentials — browsers reject credentialed responses
# carrying `Access-Control-Allow-Origin: *`, and FastAPI documents that
# explicit origins are required for credentials. This API uses no cookies
# or auth (requests carry only a JSON body), so credentials are disabled.
# If credentials are ever needed, switch to an explicit list, e.g.
# allow_origins=["https://hydrogenclient.github.io"].
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
46
 
47
@app.on_event("startup")
async def startup_event() -> None:
    """Load the model exactly once when the server process boots."""
    load_model()
50
+
51
+ # 5. Routes
52
@app.get("/")
async def root():
    """Liveness banner; points clients at the chat endpoint."""
    payload = {"status": "online", "message": "Connect to /chat"}
    return payload
55
 
56
@app.get("/health")
async def health():
    """Report whether the model has finished loading."""
    state = "ready" if llm else "loading"
    return {"status": state}
59
+
60
@app.post("/chat")
async def chat(request: Request):
    """Answer a single chat message.

    Accepts either ``{"message": "..."}`` or an OpenAI-style
    ``{"messages": [{"role": ..., "content": ...}, ...]}`` body and
    returns ``{"response": "..."}``. Errors come back in the same shape:
    503 while the model is loading, 400 for bad input, 500 on inference
    failure.
    """
    if llm is None:
        # Model still loading (or failed to load) — ask the client to retry.
        return JSONResponse(
            {"response": "I'm still waking up. Try again in 60 seconds."},
            status_code=503,
        )

    # Malformed/non-object JSON is a client error (400), not a server 500.
    try:
        data = await request.json()
    except Exception:
        data = None
    if not isinstance(data, dict):
        return JSONResponse(
            {"response": "I didn't see a message in your request."},
            status_code=400,
        )

    # --- Handle different request formats ---
    # Format A: {"message": "Hello"}
    user_message = data.get("message")

    # Format B: {"messages": [{"role": "user", "content": "Hello"}]}
    if not user_message:
        messages = data.get("messages") or []
        if messages:
            # Take the last message from the conversation list; .get avoids
            # a KeyError-turned-500 when "content" is missing.
            user_message = messages[-1].get("content")

    if not user_message:
        return JSONResponse(
            {"response": "I didn't see a message in your request."},
            status_code=400,
        )

    # --- Formatting for GPT-OSS Architecture ---
    # Note: Your model expects <|user|> and <|assistant|> markers
    prompt = f"<|system|>You are a helpful AI.<|user|>{user_message}<|assistant|>"

    try:
        # llama_cpp inference is synchronous and CPU-bound: run it in a
        # worker thread so it does not block the event loop (otherwise every
        # other request — including /health — stalls for the whole generation).
        output = await run_in_threadpool(
            llm,
            prompt,
            max_tokens=256,
            stop=["<|user|>", "<|system|>", "</s>"],
            temperature=0.7,
        )
        reply = output["choices"][0]["text"].strip()
        return {"response": reply}
    except Exception:
        # logger.exception captures the traceback for the Space logs.
        logger.exception("❌ Inference Error")
        return JSONResponse(
            {"response": "My brain encountered an error processing that."},
            status_code=500,
        )
98
 
99
+ # 6. Entry point for local testing
100
if __name__ == "__main__":
    # Local dev entry point: `python app.py` serves on the HF Spaces port.
    # Equivalent CLI: uvicorn app:app --host 0.0.0.0 --port 7860
    import uvicorn

    host, port = "0.0.0.0", 7860
    uvicorn.run(app, host=host, port=port)