CooLLaMACEO committed on
Commit
ca02091
·
verified ·
1 Parent(s): 0f66a58

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -55
app.py CHANGED
@@ -1,85 +1,65 @@
1
  import os
2
  from fastapi import FastAPI, Request
3
  from fastapi.middleware.cors import CORSMiddleware
4
- from fastapi.responses import HTMLResponse, JSONResponse
5
  from llama_cpp import Llama
6
 
7
- # ==========================================
8
- # 1. AI Model Configuration
9
- # ==========================================
10
  MODEL_PATH = "./models/gpt-oss-20b-Q3_K_M.gguf"
11
 
12
- print("🔥 ChatGPT Open-Source 1.0: Initializing 20B Engine...")
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- # We use a smaller n_ctx (2048) to ensure we don't exceed HF's 16GB RAM
15
- # once the 10.7GB model is fully loaded.
16
- llm = Llama(
17
- model_path=MODEL_PATH,
18
- n_ctx=2048,
19
- n_threads=os.cpu_count(),
20
- n_batch=512,
21
- verbose=True
22
- )
23
-
24
- print("✅ Brain Linked! System Online.")
25
-
26
- # ==========================================
27
- # 2. FastAPI Setup
28
- # ==========================================
29
- app = FastAPI(title="ChatGPT Open-Source 1.0 Backend")
30
 
31
- # Enable CORS so your GitHub site can talk to this Hugging Face Space
32
  app.add_middleware(
33
  CORSMiddleware,
34
- allow_origins=["*"], # Change to your github.io URL for better security late
35
  allow_credentials=True,
36
  allow_methods=["*"],
37
  allow_headers=["*"],
38
  )
39
 
40
- # ==========================================
41
- # 3. Routes
42
- # ==========================================
43
-
44
- @app.get("/", response_class=HTMLResponse)
45
- async def get_ui():
46
- """Serves the local index.html UI"""
47
- if os.path.exists("index.html"):
48
- with open("index.html", "r") as f:
49
- return f.read()
50
- return "<h1>System Online</h1><p>Backend is running, but index.html was not found.</p>"
51
 
52
  @app.post("/chat")
53
  async def chat(request: Request):
54
- """Handles AI Chat Requests"""
 
 
55
  try:
56
  data = await request.json()
57
  user_message = data.get("message", "")
58
-
59
- if not user_message:
60
- return JSONResponse({"response": "I didn't receive a message."}, status_code=400)
61
-
62
- # Formatting for the GPT-OSS model architecture
63
- prompt = f"<|system|>You are ChatGPT Open-Source 1.0, a helpful local AI.<|user|>{user_message}<|assistant|>"
64
-
65
- # Generate response
66
  output = llm(
67
  prompt,
68
- max_tokens=512,
69
- stop=["<|user|>", "<|system|>", "</s>"],
70
  temperature=0.7
71
  )
72
-
73
- reply = output["choices"][0]["text"].strip()
74
- return JSONResponse({"response": reply})
75
-
76
  except Exception as e:
77
- print(f"❌ Error during inference: {e}")
78
- return JSONResponse({"response": "My brain encountered an error processing that."}, status_code=500)
79
 
80
- # ==========================================
81
- # 4. Health Check
82
- # ==========================================
83
  @app.get("/health")
84
  async def health():
85
- return {"status": "ready", "model": "20B-Q3_K_M", "ram_bypass": True}
 
1
  import os
2
  from fastapi import FastAPI, Request
3
  from fastapi.middleware.cors import CORSMiddleware
4
+ from fastapi.responses import JSONResponse
5
  from llama_cpp import Llama
6
 
7
# 20B Q3_K_M is ~11.5GB. With context, it will hit ~14-15GB RAM.
MODEL_PATH = "./models/gpt-oss-20b-Q3_K_M.gguf"

# Initialize Model BEFORE FastAPI starts to ensure it's ready.
print("🔥 Loading 20B Engine (This may take 2-4 minutes)...")
try:
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=1024,    # Reduced context to stay under 16GB RAM limit
        n_threads=2,   # HF Free Tier has 2 vCPUs
        n_batch=128,
        verbose=True,
    )
    print("✅ Model Loaded Successfully.")
except Exception as e:
    # Deliberate best-effort startup: keep the API up so routes can
    # report a clear 500 instead of the whole Space crash-looping.
    print(f"❌ Failed to load model: {e}")
    llm = None

app = FastAPI()

# CORS: Allow your GitHub site to talk to this API
app.add_middleware(
    CORSMiddleware,
    allow_origins=["https://hydrogenclient.github.io"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
35
 
36
@app.get("/")
async def root():
    """Liveness endpoint: confirms the API is up and points clients at /chat."""
    return {"status": "online", "message": "Connect to /chat"}
 
 
 
 
 
 
 
 
39
 
40
@app.post("/chat")
async def chat(request: Request):
    """Handle a chat request.

    Expects a JSON body {"message": str}; returns {"response": str} on
    success or {"error": str} with a 4xx/5xx status on failure.
    """
    # Guard: the model may have failed to load at startup (llm is None then).
    if llm is None:
        return JSONResponse({"error": "Model failed to load on start."}, status_code=500)

    try:
        data = await request.json()
        user_message = data.get("message", "")

        # Reject empty input early instead of prompting the model with an
        # empty user turn (restores the validation the previous revision had).
        if not user_message:
            return JSONResponse({"error": "No message provided."}, status_code=400)

        # GPT-OSS chat-template formatting.
        prompt = f"<|system|>You are a helpful AI.<|user|>{user_message}<|assistant|>"

        output = llm(
            prompt,
            max_tokens=256,
            stop=["<|user|>", "</s>"],
            temperature=0.7,
        )

        return {"response": output["choices"][0]["text"].strip()}
    except Exception as e:
        # Top-level boundary: surface the failure to the client as JSON.
        return JSONResponse({"error": str(e)}, status_code=500)
 
62
 
 
 
 
63
@app.get("/health")
async def health():
    """Readiness probe: "ready" once the model is loaded, else "initializing"."""
    # NOTE(review): llm is set to None on load failure too, so a permanently
    # failed load also reports "initializing" — confirm this is intended.
    return {"status": "ready" if llm else "initializing"}