CooLLaMACEO committed on
Commit
ca02091
·
verified ·
1 Parent(s): 0f66a58

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -55
app.py CHANGED
@@ -1,85 +1,65 @@
1
  import os
2
  from fastapi import FastAPI, Request
3
  from fastapi.middleware.cors import CORSMiddleware
4
- from fastapi.responses import HTMLResponse, JSONResponse
5
  from llama_cpp import Llama
6
 
7
- # ==========================================
8
- # 1. AI Model Configuration
9
- # ==========================================
10
  MODEL_PATH = "./models/gpt-oss-20b-Q3_K_M.gguf"
11
 
12
- print("🔥 ChatGPT Open-Source 1.0: Initializing 20B Engine...")
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- # We use a smaller n_ctx (2048) to ensure we don't exceed HF's 16GB RAM
15
- # once the 10.7GB model is fully loaded.
16
- llm = Llama(
17
- model_path=MODEL_PATH,
18
- n_ctx=2048,
19
- n_threads=os.cpu_count(),
20
- n_batch=512,
21
- verbose=True
22
- )
23
-
24
- print("✅ Brain Linked! System Online.")
25
-
26
- # ==========================================
27
- # 2. FastAPI Setup
28
- # ==========================================
29
- app = FastAPI(title="ChatGPT Open-Source 1.0 Backend")
30
 
31
- # Enable CORS so your GitHub site can talk to this Hugging Face Space
32
  app.add_middleware(
33
  CORSMiddleware,
34
- allow_origins=["*"], # Change to your github.io URL for better security late
35
  allow_credentials=True,
36
  allow_methods=["*"],
37
  allow_headers=["*"],
38
  )
39
 
40
- # ==========================================
41
- # 3. Routes
42
- # ==========================================
43
-
44
- @app.get("/", response_class=HTMLResponse)
45
- async def get_ui():
46
- """Serves the local index.html UI"""
47
- if os.path.exists("index.html"):
48
- with open("index.html", "r") as f:
49
- return f.read()
50
- return "<h1>System Online</h1><p>Backend is running, but index.html was not found.</p>"
51
 
52
  @app.post("/chat")
53
  async def chat(request: Request):
54
- """Handles AI Chat Requests"""
 
 
55
  try:
56
  data = await request.json()
57
  user_message = data.get("message", "")
58
-
59
- if not user_message:
60
- return JSONResponse({"response": "I didn't receive a message."}, status_code=400)
61
-
62
- # Formatting for the GPT-OSS model architecture
63
- prompt = f"<|system|>You are ChatGPT Open-Source 1.0, a helpful local AI.<|user|>{user_message}<|assistant|>"
64
-
65
- # Generate response
66
  output = llm(
67
  prompt,
68
- max_tokens=512,
69
- stop=["<|user|>", "<|system|>", "</s>"],
70
  temperature=0.7
71
  )
72
-
73
- reply = output["choices"][0]["text"].strip()
74
- return JSONResponse({"response": reply})
75
-
76
  except Exception as e:
77
- print(f"❌ Error during inference: {e}")
78
- return JSONResponse({"response": "My brain encountered an error processing that."}, status_code=500)
79
 
80
- # ==========================================
81
- # 4. Health Check
82
- # ==========================================
83
  @app.get("/health")
84
  async def health():
85
- return {"status": "ready", "model": "20B-Q3_K_M", "ram_bypass": True}
 
1
  import os
2
  from fastapi import FastAPI, Request
3
  from fastapi.middleware.cors import CORSMiddleware
4
+ from fastapi.responses import JSONResponse
5
  from llama_cpp import Llama
6
 
7
# 20B Q3_K_M is ~11.5GB. With context, it will hit ~14-15GB RAM.
MODEL_PATH = "./models/gpt-oss-20b-Q3_K_M.gguf"

# Initialize Model BEFORE FastAPI starts to ensure it's ready.
print("🔥 Loading 20B Engine (This may take 2-4 minutes)...")
try:
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=1024,    # Reduced context to stay under 16GB RAM limit
        n_threads=2,   # HF Free Tier has 2 vCPUs
        n_batch=128,
        verbose=True,
    )
    print("✅ Model Loaded Successfully.")
except Exception as e:
    # Deliberate best-effort startup: keep the API up so routes can
    # report a clear 500 instead of the whole Space crash-looping.
    print(f"❌ Failed to load model: {e}")
    llm = None

app = FastAPI()

# CORS: Allow your GitHub site to talk to this API
app.add_middleware(
    CORSMiddleware,
    allow_origins=["https://hydrogenclient.github.io"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
35
 
36
@app.get("/")
async def root():
    """Liveness endpoint: confirms the API is up and points clients at /chat."""
    return {"status": "online", "message": "Connect to /chat"}
 
 
 
 
 
 
 
 
39
 
40
@app.post("/chat")
async def chat(request: Request):
    """Handle a chat request.

    Expects a JSON body {"message": str}; returns {"response": str} on
    success or {"error": str} with a 4xx/5xx status on failure.
    """
    # Guard: the model may have failed to load at startup (llm is None then).
    if llm is None:
        return JSONResponse({"error": "Model failed to load on start."}, status_code=500)

    try:
        data = await request.json()
        user_message = data.get("message", "")

        # Reject empty input early instead of prompting the model with an
        # empty user turn (restores the validation the previous revision had).
        if not user_message:
            return JSONResponse({"error": "No message provided."}, status_code=400)

        # GPT-OSS chat-template formatting.
        prompt = f"<|system|>You are a helpful AI.<|user|>{user_message}<|assistant|>"

        output = llm(
            prompt,
            max_tokens=256,
            stop=["<|user|>", "</s>"],
            temperature=0.7,
        )

        return {"response": output["choices"][0]["text"].strip()}
    except Exception as e:
        # Top-level boundary: surface the failure to the client as JSON.
        return JSONResponse({"error": str(e)}, status_code=500)
 
62
 
 
 
 
63
@app.get("/health")
async def health():
    """Readiness probe: "ready" once the model is loaded, else "initializing"."""
    # NOTE(review): llm is set to None on load failure too, so a permanently
    # failed load also reports "initializing" — confirm this is intended.
    return {"status": "ready" if llm else "initializing"}