CooLLaMACEO committed on
Commit
049d9d4
·
verified ·
1 Parent(s): 111e00f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -33
app.py CHANGED
@@ -9,23 +9,20 @@ from llama_cpp import Llama
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
- # 2. Model Configuration (20B Q3_K_M)
13
  MODEL_PATH = "./models/gpt-oss-20b-Q3_K_M.gguf"
14
  llm = None
15
 
16
  def load_model():
17
  global llm
18
  if llm is None:
19
- logger.info("🔥 Initializing 20B Engine (Direct I/O Mode)...")
20
  try:
21
- # Using conservative settings to fit in 16GB RAM
22
  llm = Llama(
23
  model_path=MODEL_PATH,
24
- n_ctx=1024, # Crucial: Keep context low to avoid OOM crashes
25
- n_threads=2, # HF Free tier limit
26
- n_batch=512,
27
- use_mmap=False, # Match your log discovery
28
- use_mlock=False,
29
  verbose=True
30
  )
31
  logger.info("✅ Brain Linked! System Online.")
@@ -35,10 +32,10 @@ def load_model():
35
  # 3. FastAPI App Setup
36
  app = FastAPI(title="ChatGPT Open-Source 1.0 API")
37
 
38
- # 4. CORS Setup: Allows GitHub Pages and Local Testing
39
  app.add_middleware(
40
  CORSMiddleware,
41
- allow_origins=["*"], # Change to ["https://hydrogenclient.github.io"] for production
42
  allow_credentials=True,
43
  allow_methods=["*"],
44
  allow_headers=["*"],
@@ -48,56 +45,56 @@ app.add_middleware(
48
  async def startup_event():
49
  load_model()
50
 
51
- # 5. Routes
52
  @app.get("/")
53
  async def root():
54
  return {"status": "online", "message": "Connect to /chat"}
55
 
56
- @app.get("/health")
57
- async def health():
58
- return {"status": "ready" if llm else "loading"}
59
-
60
  @app.post("/chat")
61
  async def chat(request: Request):
62
  if llm is None:
63
- return JSONResponse({"response": "I'm still waking up. Try again in 60 seconds."}, status_code=503)
64
 
65
  try:
66
  data = await request.json()
67
 
68
- # --- Handle different request formats ---
69
- # Format A: {"message": "Hello"}
70
  user_message = data.get("message")
71
-
72
- # Format B: {"messages": [{"role": "user", "content": "Hello"}]}
73
  if not user_message and "messages" in data:
74
- # Take the last message from the conversation list
75
  user_message = data["messages"][-1]["content"]
76
 
77
  if not user_message:
78
- return JSONResponse({"response": "I didn't see a message in your request."}, status_code=400)
79
 
80
- # --- Formatting for GPT-OSS Architecture ---
81
- # Note: Your model expects <|user|> and <|assistant|> markers
82
- prompt = f"<|system|>You are a helpful AI.<|user|>{user_message}<|assistant|>"
 
 
 
 
83
 
84
- # --- Inference ---
85
  output = llm(
86
  prompt,
87
- max_tokens=256,
88
- stop=["<|user|>", "<|system|>", "</s>"],
89
- temperature=0.7
 
 
90
  )
91
 
92
  reply = output["choices"][0]["text"].strip()
 
 
 
 
 
93
  return {"response": reply}
94
 
95
  except Exception as e:
96
- logger.error(f"❌ Inference Error: {e}")
97
- return JSONResponse({"response": "My brain encountered an error processing that."}, status_code=500)
98
 
99
- # 6. Entry point for local testing
100
  if __name__ == "__main__":
101
  import uvicorn
102
- # Local: uvicorn app:app --host 0.0.0.0 --port 7860
103
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
+ # 2. Model Configuration
13
  MODEL_PATH = "./models/gpt-oss-20b-Q3_K_M.gguf"
14
  llm = None
15
 
16
  def load_model():
17
  global llm
18
  if llm is None:
19
+ logger.info("🔥 Initializing 20B Harmony Engine...")
20
  try:
 
21
  llm = Llama(
22
  model_path=MODEL_PATH,
23
+ n_ctx=2048,
24
+ n_threads=2,
25
+ use_mmap=False,
 
 
26
  verbose=True
27
  )
28
  logger.info("✅ Brain Linked! System Online.")
 
32
  # 3. FastAPI App Setup
33
  app = FastAPI(title="ChatGPT Open-Source 1.0 API")
34
 
35
+ # 4. CORS Setup (Crucial for GitHub Pages)
36
  app.add_middleware(
37
  CORSMiddleware,
38
+ allow_origins=["*"],
39
  allow_credentials=True,
40
  allow_methods=["*"],
41
  allow_headers=["*"],
 
45
  async def startup_event():
46
  load_model()
47
 
 
48
  @app.get("/")
49
  async def root():
50
  return {"status": "online", "message": "Connect to /chat"}
51
 
 
 
 
 
52
  @app.post("/chat")
53
  async def chat(request: Request):
54
  if llm is None:
55
+ return JSONResponse({"response": "I'm still waking up..."}, status_code=503)
56
 
57
  try:
58
  data = await request.json()
59
 
60
+ # Handle both simple string and OpenAI list formats
 
61
  user_message = data.get("message")
 
 
62
  if not user_message and "messages" in data:
 
63
  user_message = data["messages"][-1]["content"]
64
 
65
  if not user_message:
66
+ return JSONResponse({"response": "No message received."}, status_code=400)
67
 
68
+ # --- THE HARMONY FIX ---
69
+ # We wrap the message so it stops rambling about code tests.
70
+ prompt = (
71
+ f"<|start|>system<|message|>You are ChatGPT Open-Source 1.0, a helpful assistant.<|end|>\n"
72
+ f"<|start|>user<|message|>{user_message}<|end|>\n"
73
+ f"<|start|>assistant<|message|>"
74
+ )
75
 
76
+ # --- Inference with Stop Tokens ---
77
  output = llm(
78
  prompt,
79
+ max_tokens=512,
80
+ # We tell the model to STOP as soon as it tries to write its own end tokens
81
+ stop=["<|end|>", "<|return|>", "<|start|>", "user:", "assistant:"],
82
+ temperature=0.7,
83
+ repeat_penalty=1.2
84
  )
85
 
86
  reply = output["choices"][0]["text"].strip()
87
+
88
+ # If the model still includes "analysis" channel text, we clean it
89
+ if "<|channel|>final<|message|>" in reply:
90
+ reply = reply.split("<|channel|>final<|message|>")[-1].strip()
91
+
92
  return {"response": reply}
93
 
94
  except Exception as e:
95
+ logger.error(f"❌ Error: {e}")
96
+ return JSONResponse({"response": "Error processing request."}, status_code=500)
97
 
 
98
  if __name__ == "__main__":
99
  import uvicorn
 
100
  uvicorn.run(app, host="0.0.0.0", port=7860)