CooLLaMACEO committed on
Commit
5e45853
·
verified ·
1 Parent(s): 40f58a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -51
app.py CHANGED
@@ -1,67 +1,85 @@
import os

from fastapi import FastAPI
from llama_cpp import Llama
from pydantic import BaseModel

# --- Configuration -------------------------------------------------------
# Path to the quantized GGUF model file loaded at startup.
MODEL_PATH = "./models/gpt-oss-20b-Q3_K_M.gguf"

# System persona injected ahead of every user turn.
SYSTEM_PROMPT = (
    "You are ChatGPT Open-Source 1.0, a high-performance local AI. "
    "You were built by the open-source community. "
    "You are helpful, witty, and proud to run locally without the internet."
)

# --- Model load (happens once, at import time) ---------------------------
print("🔥 Loading model...")
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=16384,
    n_threads=os.cpu_count(),
    n_batch=256,
    verbose=False,
)
print("✅ Model loaded!")

# --- Web application -----------------------------------------------------
app = FastAPI(title="ChatGPT Open-Source 1.0")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
class ChatRequest(BaseModel):
    """Inbound /chat payload: the user's message text."""

    message: str


class ChatResponse(BaseModel):
    """Outbound /chat payload: the model's reply text."""

    reply: str
 
@app.get("/")
def root():
    """Report service identity and runtime status."""
    info = {
        "name": "ChatGPT Open-Source 1.0",
        "status": "running",
        "model": "gpt-oss-20b-Q3_K_M",
        "offline": True,
    }
    return info
 
@app.post("/chat", response_model=ChatResponse)
def chat(req: ChatRequest):
    """Run one chat turn through the local model and return its reply."""
    # Wrap the user's message in the model's chat template; each section
    # marker is followed by its content on its own line.
    prompt = (
        f"<|system|>\n{SYSTEM_PROMPT}\n"
        f"<|user|>\n{req.message}\n"
        f"<|assistant|>\n"
    )

    # Stop at the next turn marker so the model answers only this turn.
    output = llm(
        prompt,
        max_tokens=512,
        stop=["<|user|>", "<|system|>"],
        temperature=0.7,
    )

    answer = output["choices"][0]["text"].strip()
    return ChatResponse(reply=answer)
 
 
 
 
 
 
 
 
import os

from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, JSONResponse
from llama_cpp import Llama

# ==========================================
# 1. AI Model Configuration
# ==========================================
MODEL_PATH = "./models/gpt-oss-20b-Q3_K_M.gguf"

print("🔥 ChatGPT Open-Source 1.0: Initializing 20B Engine...")

# n_ctx is kept small (2048) so the process stays inside HF's 16GB RAM
# budget once the 10.7GB model file is fully resident.
_LLAMA_OPTS = {
    "n_ctx": 2048,
    "n_threads": os.cpu_count(),
    "n_batch": 512,
    "verbose": True,
}

llm = Llama(model_path=MODEL_PATH, **_LLAMA_OPTS)

print("✅ Brain Linked! System Online.")
# ==========================================
# 2. FastAPI Setup
# ==========================================
app = FastAPI(title="ChatGPT Open-Source 1.0 Backend")

# CORS lets the static front-end (hosted elsewhere, e.g. GitHub Pages)
# call this Space. Tighten allow_origins to the real site URL later.
_CORS_OPTS = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_CORS_OPTS)
# ==========================================
# 3. Routes
# ==========================================

@app.get("/", response_class=HTMLResponse)
async def get_ui():
    """Serve the bundled index.html UI, or a fallback banner if it is missing.

    Returns:
        str: raw HTML, delivered with an HTMLResponse content type.
    """
    if os.path.exists("index.html"):
        # Explicit encoding: the platform default is locale-dependent and can
        # raise UnicodeDecodeError for a UTF-8 page on some hosts.
        with open("index.html", "r", encoding="utf-8") as f:
            return f.read()
    return "<h1>System Online</h1><p>Backend is running, but index.html was not found.</p>"
@app.post("/chat")
async def chat(request: Request):
    """Handle an AI chat request.

    Expects a JSON body of the form {"message": "..."} and answers with
    {"response": "..."}. A malformed body or a missing/empty message yields
    HTTP 400; a failure during inference yields HTTP 500.
    """
    # Parse the body in its own try-block so a malformed request is reported
    # as a client error (400) instead of being swallowed by the inference
    # handler below and misreported as a 500.
    try:
        data = await request.json()
    except Exception:
        return JSONResponse({"response": "I didn't receive a message."}, status_code=400)

    user_message = data.get("message", "")
    if not user_message:
        return JSONResponse({"response": "I didn't receive a message."}, status_code=400)

    try:
        # Formatting for the GPT-OSS model architecture
        prompt = f"<|system|>You are ChatGPT Open-Source 1.0, a helpful local AI.<|user|>{user_message}<|assistant|>"

        # Generate response; stop tokens keep the model from speaking past
        # its own turn.
        output = llm(
            prompt,
            max_tokens=512,
            stop=["<|user|>", "<|system|>", "</s>"],
            temperature=0.7
        )

        reply = output["choices"][0]["text"].strip()
        return JSONResponse({"response": reply})

    except Exception as e:
        # Broad catch is deliberate at this boundary: llama_cpp can raise a
        # variety of runtime errors and the API must always answer.
        print(f"❌ Error during inference: {e}")
        return JSONResponse({"response": "My brain encountered an error processing that."}, status_code=500)
 
 
 
# ==========================================
# 4. Health Check
# ==========================================
@app.get("/health")
async def health():
    """Liveness probe: report that the service and model are up."""
    status_report = {
        "status": "ready",
        "model": "20B-Q3_K_M",
        "ram_bypass": True,
    }
    return status_report