CooLLaMACEO committed on
Commit
111e00f
·
verified ·
1 Parent(s): ca02091

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -27
app.py CHANGED
@@ -1,65 +1,103 @@
1
  import os
 
2
  from fastapi import FastAPI, Request
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from fastapi.responses import JSONResponse
5
  from llama_cpp import Llama
6
 
7
- # 20B Q3_K_M is ~11.5GB. With context, it will hit ~14-15GB RAM.
 
 
 
 
8
  MODEL_PATH = "./models/gpt-oss-20b-Q3_K_M.gguf"
 
9
 
10
- # Initialize Model BEFORE FastAPI starts to ensure it's ready
11
- print("πŸ”₯ Loading 20B Engine (This may take 2-4 minutes)...")
12
- try:
13
- llm = Llama(
14
- model_path=MODEL_PATH,
15
- n_ctx=1024, # Reduced context to stay under 16GB RAM limit
16
- n_threads=2, # HF Free Tier has 2 vCPUs
17
- n_batch=128,
18
- verbose=True
19
- )
20
- print("βœ… Model Loaded Successfully.")
21
- except Exception as e:
22
- print(f"❌ Failed to load model: {e}")
23
- llm = None
 
 
 
 
24
 
25
- app = FastAPI()
 
26
 
27
- # CORS: Allow your GitHub site to talk to this API
28
  app.add_middleware(
29
  CORSMiddleware,
30
- allow_origins=["https://hydrogenclient.github.io"],
31
  allow_credentials=True,
32
  allow_methods=["*"],
33
  allow_headers=["*"],
34
  )
35
 
 
 
 
 
 
36
  @app.get("/")
37
  async def root():
38
  return {"status": "online", "message": "Connect to /chat"}
39
 
 
 
 
 
40
  @app.post("/chat")
41
  async def chat(request: Request):
42
  if llm is None:
43
- return JSONResponse({"error": "Model failed to load on start."}, status_code=500)
44
 
45
  try:
46
  data = await request.json()
47
- user_message = data.get("message", "")
48
 
49
- # GPT-OSS formatting
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  prompt = f"<|system|>You are a helpful AI.<|user|>{user_message}<|assistant|>"
51
 
 
52
  output = llm(
53
  prompt,
54
  max_tokens=256,
55
- stop=["<|user|>", "</s>"],
56
  temperature=0.7
57
  )
58
 
59
- return {"response": output["choices"][0]["text"].strip()}
 
 
60
  except Exception as e:
61
- return JSONResponse({"error": str(e)}, status_code=500)
 
62
 
63
- @app.get("/health")
64
- async def health():
65
- return {"status": "ready" if llm else "initializing"}
 
 
 
1
import logging
import os

from fastapi import FastAPI, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

from llama_cpp import Llama
7
 
8
# --- Logging -------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Model configuration (gpt-oss 20B, Q3_K_M quantization) --------------
MODEL_PATH = "./models/gpt-oss-20b-Q3_K_M.gguf"

# Populated lazily by load_model() at startup; None means "not loaded yet".
llm = None
15
 
16
def load_model() -> None:
    """Load the GGUF model into the module-global ``llm`` (idempotent).

    Called once from the FastAPI startup hook. On any failure ``llm``
    stays ``None`` so ``/chat`` keeps answering 503; the full traceback
    is logged so the cause is visible in the server logs.
    """
    global llm
    if llm is not None:
        # Already loaded — make repeat calls cheap no-ops.
        return

    # Fail fast with a clear message if the model file is missing,
    # instead of a llama_cpp stack trace deep inside the loader.
    if not os.path.isfile(MODEL_PATH):
        logger.error("❌ Model file not found: %s", MODEL_PATH)
        return

    logger.info("🔥 Initializing 20B Engine (Direct I/O Mode)...")
    try:
        # Conservative settings to fit the HF free tier (2 vCPU / 16 GB RAM).
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=1024,       # Crucial: keep context low to avoid OOM crashes
            n_threads=2,      # HF Free tier limit
            n_batch=512,
            use_mmap=False,   # Match your log discovery
            use_mlock=False,
            verbose=True,
        )
        logger.info("✅ Brain Linked! System Online.")
    except Exception:
        llm = None
        # logger.exception records the full traceback, not just str(e).
        logger.exception("❌ Initialization failed")
34
 
35
# --- FastAPI application -------------------------------------------------
app = FastAPI(title="ChatGPT Open-Source 1.0 API")

# --- CORS: the browser frontend (GitHub Pages / local dev) calls this API
# cross-origin.
# NOTE(review): per the CORS spec, a wildcard origin ("*") cannot be
# combined with credentials — browsers reject credentialed responses
# carrying `Access-Control-Allow-Origin: *`, and FastAPI documents that
# explicit origins are required for credentials. This API uses no cookies
# or auth (requests carry only a JSON body), so credentials are disabled.
# If credentials are ever needed, switch to an explicit list, e.g.
# allow_origins=["https://hydrogenclient.github.io"].
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
46
 
47
@app.on_event("startup")
async def startup_event() -> None:
    """Load the model exactly once when the server process boots."""
    load_model()
50
+
51
+ # 5. Routes
52
@app.get("/")
async def root():
    """Liveness banner; points clients at the chat endpoint."""
    payload = {"status": "online", "message": "Connect to /chat"}
    return payload
55
 
56
@app.get("/health")
async def health():
    """Report whether the model has finished loading."""
    state = "ready" if llm else "loading"
    return {"status": state}
59
+
60
@app.post("/chat")
async def chat(request: Request):
    """Answer a single chat message.

    Accepts either ``{"message": "..."}`` or an OpenAI-style
    ``{"messages": [{"role": ..., "content": ...}, ...]}`` body and
    returns ``{"response": "..."}``. Errors come back in the same shape:
    503 while the model is loading, 400 for bad input, 500 on inference
    failure.
    """
    if llm is None:
        # Model still loading (or failed to load) — ask the client to retry.
        return JSONResponse(
            {"response": "I'm still waking up. Try again in 60 seconds."},
            status_code=503,
        )

    # Malformed/non-object JSON is a client error (400), not a server 500.
    try:
        data = await request.json()
    except Exception:
        data = None
    if not isinstance(data, dict):
        return JSONResponse(
            {"response": "I didn't see a message in your request."},
            status_code=400,
        )

    # --- Handle different request formats ---
    # Format A: {"message": "Hello"}
    user_message = data.get("message")

    # Format B: {"messages": [{"role": "user", "content": "Hello"}]}
    if not user_message:
        messages = data.get("messages") or []
        if messages:
            # Take the last message from the conversation list; .get avoids
            # a KeyError-turned-500 when "content" is missing.
            user_message = messages[-1].get("content")

    if not user_message:
        return JSONResponse(
            {"response": "I didn't see a message in your request."},
            status_code=400,
        )

    # --- Formatting for GPT-OSS Architecture ---
    # Note: Your model expects <|user|> and <|assistant|> markers
    prompt = f"<|system|>You are a helpful AI.<|user|>{user_message}<|assistant|>"

    try:
        # llama_cpp inference is synchronous and CPU-bound: run it in a
        # worker thread so it does not block the event loop (otherwise every
        # other request — including /health — stalls for the whole generation).
        output = await run_in_threadpool(
            llm,
            prompt,
            max_tokens=256,
            stop=["<|user|>", "<|system|>", "</s>"],
            temperature=0.7,
        )
        reply = output["choices"][0]["text"].strip()
        return {"response": reply}
    except Exception:
        # logger.exception captures the traceback for the Space logs.
        logger.exception("❌ Inference Error")
        return JSONResponse(
            {"response": "My brain encountered an error processing that."},
            status_code=500,
        )
98
 
99
+ # 6. Entry point for local testing
100
if __name__ == "__main__":
    # Local dev entry point: `python app.py` serves on the HF Spaces port.
    # Equivalent CLI: uvicorn app:app --host 0.0.0.0 --port 7860
    import uvicorn

    host, port = "0.0.0.0", 7860
    uvicorn.run(app, host=host, port=port)