CooLLaMACEO committed on
Commit
1e27e60
·
verified ·
1 Parent(s): 967e0b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -45
app.py CHANGED
@@ -1,32 +1,32 @@
1
  from fastapi import FastAPI, Request, HTTPException, Depends
2
  from fastapi.middleware.cors import CORSMiddleware
 
3
  from llama_cpp import Llama
4
  import uvicorn
 
5
 
6
  app = FastAPI()
7
 
8
- # 1. ALLOW EXTERNAL CONNECTIONS (CORS)
9
- # This allows your GitHub Pages or local HTML file to talk to the HF Space
10
  app.add_middleware(
11
  CORSMiddleware,
12
- allow_origins=["*"],
13
  allow_credentials=True,
14
  allow_methods=["*"],
15
  allow_headers=["*"],
16
  )
17
 
18
- # Configuration
19
  MY_API_KEY = "my-secret-key-456"
20
- MODEL_PATH = "./model.gguf"
21
 
22
- # 2. INITIALIZE MISTRAL (Optimized for CPU)
23
- # We use 4 threads to utilize the full power of the HF Free Tier CPU
24
  llm = Llama(
25
- model_path=MODEL_PATH,
26
- n_ctx=2048, # Context window
27
- n_threads=4, # Parallel CPU cores
28
- n_batch=512, # Prompt processing speed
29
- verbose=True # Logs performance to the HF console
30
  )
31
 
32
  def verify_key(request: Request):
@@ -36,59 +36,55 @@ def verify_key(request: Request):
36
 
37
  @app.get("/")
38
  def home():
39
- return {
40
- "status": "Online",
41
- "engine": "Mistral-7B-Q8_0",
42
- "brand": "ChatMPT Official"
43
- }
44
 
45
  @app.post("/v1/chat")
46
  async def chat(request: Request, _ = Depends(verify_key)):
47
  try:
48
  body = await request.json()
49
- user_input = body.get("prompt", "")
50
 
51
  if not user_input:
52
- return {"reply": "System: No prompt received."}
53
 
54
- # 3. MISTRAL INSTRUCT FORMAT
55
- # llama-cpp-python adds the <s> automatically.
56
- # [INST] tags guide the model's logic.
57
- system_msg = "You are ChatMPT, a professional AI. Be concise and never repeat your own sentences."
58
  prompt = f"[INST] {system_msg}\n\n{user_input} [/INST]"
59
 
60
- # 4. ANTI-LOOP & SPEED SAMPLING
 
 
 
61
  response = llm(
62
- prompt,
63
- max_tokens=256,
64
- stop=["</s>", "[INST]"], # Hard stops for Mistral
65
- temperature=0.75, # Balanced creativity
66
-
67
- # --- THE "ANTI-LOOP" ARMOR ---
68
- repeat_penalty=1.25, # Heavily discourages repeating phrases
69
- frequency_penalty=0.5, # Discourages using the same word too often
70
- presence_penalty=0.5, # Discourages returning to the same topics
71
-
72
- # --- MIROSTAT 2.0 (High Quality Control) ---
73
- mirostat_mode=2, # 2 is the modern standard for CPU stability
74
- mirostat_tau=5.0, # Higher = more interesting, Lower = more focused
75
- mirostat_eta=0.1 # Learning rate for the AI's "internal mood"
76
  )
77
 
78
  final_reply = response["choices"][0]["text"].strip()
79
 
80
- # 5. BRAND PROTECTION (Safety Filter)
81
- # Force-corrects common hallucinations
82
- hallucinations = ["ChatGPT", "Chat GPT", "OpenAI", "ChatPapers", "ChatPBT"]
 
 
 
 
 
 
 
83
  for item in hallucinations:
84
  final_reply = final_reply.replace(item, "ChatMPT")
85
 
86
- return {"reply": final_reply}
87
 
88
  except Exception as e:
89
- print(f"Error: {str(e)}")
90
- return {"reply": f"System Error: The engine is under heavy load. Please try again."}
91
 
92
  if __name__ == "__main__":
93
- # Standard Hugging Face port
94
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
import os
import threading

import uvicorn
from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from llama_cpp import Llama
7
 
app = FastAPI()

# 1. CORS CONFIGURATION
# Lets a page served from another origin (e.g. GitHub Pages or a local HTML
# file) call this API and read the response.
# NOTE(review): wildcard origins are combined with allow_credentials=True.
# The FastAPI CORS docs advise listing origins explicitly when credentials
# are enabled -- confirm whether the front end really needs credentials.
_CORS_OPTIONS = dict(
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)
19
 
 
# API key for this service. It is read from the MY_API_KEY environment
# variable so the real secret can live in deployment settings instead of the
# repository; the previous hard-coded value stays as the fallback so existing
# clients keep working until the variable is set.
# NOTE(review): presumably compared against the request inside verify_key
# (its body is not visible in this diff) -- confirm, and rotate the fallback.
MY_API_KEY = os.environ.get("MY_API_KEY", "my-secret-key-456")
 
21
 
# 2. LOAD MISTRAL (Optimized for HF CPU)
# The model is loaded once at import time and shared by every request.
_LLM_OPTIONS = {
    "model_path": "./model.gguf",
    "n_ctx": 2048,     # context window
    "n_threads": 4,    # CPU threads used for inference
    "n_batch": 512,    # batch size for prompt processing
    "verbose": True,   # log performance details to the console
}
llm = Llama(**_LLM_OPTIONS)
31
 
32
  def verify_key(request: Request):
 
36
 
@app.get("/")
def home():
    """Return a small status payload: service status, model label and brand."""
    return dict(
        status="Online",
        mode="Mistral-7B-Q8",
        brand="ChatMPT",
    )
 
 
 
 
40
 
# The original handler called the model directly inside the coroutine, which
# blocked the event loop and therefore ran generations strictly one at a time.
# Generation now runs on a worker thread; this lock keeps the one-at-a-time
# behavior for the single shared model object.
_LLM_LOCK = threading.Lock()

# Names the model sometimes uses for itself; each is rewritten to the brand.
_BRAND_NAME = "ChatMPT"
_HALLUCINATED_NAMES = ("ChatGPT", "Chat GPT", "OpenAI", "ChatPapers")


def _generate_reply(prompt: str) -> str:
    """Run one blocking completion and return the stripped text of its first choice."""
    with _LLM_LOCK:
        response = llm(
            prompt,
            max_tokens=512,
            stop=["</s>", "[INST]", "[/INST]"],
            temperature=0.7,
            # Kept slightly low so the model doesn't get 'scared' to talk.
            repeat_penalty=1.1,
            mirostat_mode=2,
        )
    return response["choices"][0]["text"].strip()


@app.post("/v1/chat")
async def chat(request: Request, _ = Depends(verify_key)):
    """Answer one chat prompt.

    Expects a JSON object body with a string ``"prompt"`` field. Every
    response is JSON of the form ``{"reply": <text>}``:

    * 200 with the model's reply (or a nudge when the prompt is blank),
    * 400 when the body is not a JSON object or ``"prompt"`` is not a string,
    * 500 when generation fails for any other reason.
    """
    try:
        try:
            body = await request.json()
        except ValueError:
            # Malformed JSON is a client error, not a server overload.
            body = None

        raw_prompt = body.get("prompt", "") if isinstance(body, dict) else None
        if not isinstance(raw_prompt, str):
            return JSONResponse(
                status_code=400,
                content={"reply": "Invalid request: send a JSON object with a text \"prompt\"."},
            )

        user_input = raw_prompt.strip()
        if not user_input:
            return JSONResponse(content={"reply": "Please enter a message!"})

        # --- MISTRAL INSTRUCT FORMAT ---
        # llama-cpp adds <s> automatically, so we just wrap in [INST]
        system_msg = "You are ChatMPT, a helpful AI. Be direct and concise."
        prompt = f"[INST] {system_msg}\n\n{user_input} [/INST]"

        print(f"\n--- USER PROMPT: {user_input} ---")

        # Generation is CPU-bound and blocking, so keep it off the event loop;
        # otherwise every other request stalls until it finishes.
        final_reply = await run_in_threadpool(_generate_reply, prompt)

        # --- DEBUG LOGGING ---
        # This will show up in the Hugging Face logs
        print(f"--- AI REPLY: {final_reply} ---")

        # --- FALLBACK IF EMPTY ---
        # A reply shorter than two characters (including empty) is treated as no answer.
        if len(final_reply) < 2:
            final_reply = "I'm here! Could you please rephrase that?"

        # Identity Safety
        for item in _HALLUCINATED_NAMES:
            final_reply = final_reply.replace(item, _BRAND_NAME)

        return JSONResponse(content={"reply": final_reply})

    except Exception as e:
        # Top-level boundary: log the failure and still answer in the JSON
        # shape the front end expects.
        print(f"SERVER ERROR: {str(e)}")
        return JSONResponse(status_code=500, content={"reply": "System Error: Brain is overloaded."})
88
 
if __name__ == "__main__":
    # Listen on every interface; 7860 is the standard Hugging Face port.
    _server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **_server_options)