SharmaGroups07 committed on
Commit
61ccc8d
·
verified ·
1 Parent(s): 428ee1f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -9
app.py CHANGED
@@ -2,9 +2,14 @@ from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from llama_cpp import Llama
4
  from huggingface_hub import hf_hub_download
 
5
 
6
  app = FastAPI()
7
 
 
 
 
 
8
  MODEL_REPO = "bartowski/Qwen2.5-3B-Instruct-GGUF"
9
  MODEL_FILE = "Qwen2.5-3B-Instruct-Q4_K_M.gguf"
10
 
@@ -13,33 +18,88 @@ model_path = hf_hub_download(
13
  filename=MODEL_FILE
14
  )
15
 
 
 
 
 
16
  llm = Llama(
17
  model_path=model_path,
18
- n_ctx=2048,
19
- n_threads=2
 
 
 
 
 
 
 
 
 
 
 
 
20
  )
21
 
 
 
 
 
22
  class ChatRequest(BaseModel):
23
  message: str
24
 
 
 
 
 
25
  @app.get("/")
26
  def root():
27
- return {"status": "AI engine running"}
 
 
 
 
28
 
29
  @app.post("/chat")
30
  def chat(req: ChatRequest):
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  output = llm(
32
- f"<|user|>{req.message}<|assistant|>",
33
- max_tokens=512,
34
- temperature=0.7,
 
 
 
 
 
 
35
  top_p=0.9,
36
- repeat_penalty=1.1,
 
 
 
37
  stop=["<|end|>"]
38
  )
39
 
40
- return {"reply": output["choices"][0]["text"]}
 
 
 
 
 
 
41
 
42
- # ⭐ THIS PART WAS MISSING
43
  if __name__ == "__main__":
44
  import uvicorn
45
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
2
  from pydantic import BaseModel
3
  from llama_cpp import Llama
4
  from huggingface_hub import hf_hub_download
5
+ import multiprocessing
6
 
7
  app = FastAPI()
8
 
9
+ # ===============================
10
+ # MODEL CONFIG
11
+ # ===============================
12
+
13
  MODEL_REPO = "bartowski/Qwen2.5-3B-Instruct-GGUF"
14
  MODEL_FILE = "Qwen2.5-3B-Instruct-Q4_K_M.gguf"
15
 
 
18
  filename=MODEL_FILE
19
  )
20
 
21
+ # ===============================
22
+ # LLM INITIALIZATION (OPTIMIZED)
23
+ # ===============================
24
+
25
# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================

import os

# Number of CPUs actually available to this process. On containerized
# hosts (e.g. Hugging Face Spaces) the scheduler affinity mask is often
# smaller than multiprocessing.cpu_count(), which reports the host's
# total logical cores; oversubscribing llama.cpp threads degrades
# throughput. Fall back to cpu_count() on platforms without
# sched_getaffinity (macOS, Windows).
try:
    _n_threads = len(os.sched_getaffinity(0))
except AttributeError:
    _n_threads = multiprocessing.cpu_count()

llm = Llama(
    model_path=model_path,

    # Large context window for long prompts / deep reasoning.
    n_ctx=8192,

    # Use every CPU core the process is actually allowed to run on.
    n_threads=_n_threads,

    # CPU-only inference (no layers offloaded to a GPU).
    n_gpu_layers=0,

    # Prompt-processing batch size: trades memory for throughput.
    n_batch=512,

    # Memory-map the GGUF file instead of reading it fully into RAM.
    use_mmap=True,

    # Lock model pages in RAM to avoid swapping.
    # NOTE(review): mlock can fail or be silently ignored when
    # RLIMIT_MEMLOCK is low (common in containers) — confirm it is
    # honored on the deployment host.
    use_mlock=True,
)
42
 
43
# ===============================
# REQUEST MODEL
# ===============================

class ChatRequest(BaseModel):
    """Request body for POST /chat."""

    # The user's chat message to forward to the model.
    message: str
49
 
50
+ # ===============================
51
+ # HEALTH CHECK
52
+ # ===============================
53
+
54
@app.get("/")
def root():
    """Health-check endpoint: reports that the service is up."""
    status_payload = {"status": "Strategy AI engine running"}
    return status_payload
57
+
58
+ # ===============================
59
+ # CHAT ENDPOINT
60
+ # ===============================
61
 
62
@app.post("/chat")
def chat(req: ChatRequest):
    """Generate a strategy-focused reply to ``req.message``.

    Runs a chat completion against the locally loaded GGUF model and
    returns ``{"reply": <generated text>}``.
    """
    # STRATEGY SPECIALIZED SYSTEM PROMPT
    system_prompt = (
        "You are an elite strategic intelligence AI. "
        "Think step-by-step before answering. "
        "Provide deep analysis, structured reasoning, and clear actionable insights. "
        "Use bullet points, numbered steps, and markdown formatting."
    )

    # BUG FIX: the previous code hand-built the prompt with
    # <|user|>/<|assistant|>/<|end|> markers, which are not Qwen2.5's
    # chat format (Qwen2.5 uses ChatML: <|im_start|>role ... <|im_end|>).
    # create_chat_completion applies the chat template embedded in the
    # GGUF metadata, so the prompt and stop tokens are always correct
    # for the loaded model.
    output = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": req.message},
        ],

        # Longer reasoning output.
        max_tokens=900,

        # Lower randomness for logical thinking.
        temperature=0.35,

        # Stable probability sampling.
        top_p=0.9,

        # Prevent loops.
        repeat_penalty=1.2,
    )

    response_text = output["choices"][0]["message"]["content"].strip()

    return {"reply": response_text}
98
+
99
# ===============================
# LOCAL RUN
# ===============================

# Script entry point: serve the FastAPI app with uvicorn, bound to all
# interfaces. Port 7860 is presumably chosen for Hugging Face Spaces,
# which expects apps on that port — confirm against the deployment config.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)