SharmaGroups07 committed on
Commit
e5cd937
·
verified ·
1 Parent(s): cf95752

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -7
app.py CHANGED
@@ -6,6 +6,10 @@ import multiprocessing
6
 
7
  app = FastAPI()
8
 
 
 
 
 
9
  MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
10
  MODEL_FILE = "Phi-3-mini-4k-instruct-q4.gguf"
11
 
@@ -14,37 +18,81 @@ model_path = hf_hub_download(
14
  filename=MODEL_FILE
15
  )
16
 
 
 
 
 
17
  llm = Llama(
18
  model_path=model_path,
 
 
19
  n_ctx=4096,
 
 
20
  n_threads=multiprocessing.cpu_count(),
21
- n_gpu_layers=0
 
 
 
 
 
 
 
22
  )
23
 
 
 
 
 
24
  class ChatRequest(BaseModel):
25
  message: str
26
 
 
 
 
 
27
  @app.get("/")
28
  def root():
29
- return {"status": "AI engine running"}
 
 
 
 
30
 
31
  @app.post("/chat")
32
  def chat(req: ChatRequest):
33
 
34
- system_prompt = "<|system|>You are a professional AI assistant. Answer clearly, structured, and concisely using markdown formatting.<|end|>"
 
 
 
 
 
 
 
 
35
 
36
  prompt = system_prompt + f"<|user|>{req.message}<|assistant|>"
37
 
 
38
  output = llm(
39
  prompt,
40
- max_tokens=512,
41
- temperature=0.7,
 
42
  top_p=0.9,
43
- repeat_penalty=1.1,
 
44
  stop=["<|end|>"]
45
  )
46
 
47
- return {"reply": output["choices"][0]["text"]}
 
 
 
 
 
 
48
 
49
  if __name__ == "__main__":
50
  import uvicorn
 
# FastAPI application object; the routes below are registered on it.
app = FastAPI()
# --- Model configuration -------------------------------------------
# GGUF quantized build of Phi-3-mini, fetched from the Hugging Face Hub.
MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
MODEL_FILE = "Phi-3-mini-4k-instruct-q4.gguf"
 
 
18
  filename=MODEL_FILE
19
  )
20
 
# --- LLM initialization (CPU-optimized) ----------------------------
# Single llama.cpp model handle, shared by all request handlers.
llm = Llama(
    model_path=model_path,
    n_ctx=4096,                             # context window: speed/memory balance
    n_threads=multiprocessing.cpu_count(),  # saturate every available CPU core
    n_gpu_layers=0,                         # CPU-only inference
    n_batch=512,                            # larger batch -> faster prompt eval
    use_mmap=True,                          # memory-map weights for fast loading
    use_mlock=True,                         # pin weights in RAM to avoid swapping
)
42
 
# --- Request schema ------------------------------------------------
class ChatRequest(BaseModel):
    """JSON body for POST /chat: a single user message."""

    message: str
49
 
# --- Health check --------------------------------------------------
@app.get("/")
def root():
    """Liveness probe: report that the service is up."""
    return {"status": "Speed AI engine running"}
+
# --- Chat endpoint -------------------------------------------------
@app.post("/chat")
def chat(req: ChatRequest):
    """Generate a single markdown-formatted reply to ``req.message``.

    Runs one blocking llama.cpp completion and returns
    ``{"reply": <generated text>}``.
    """
    # System prompt steering the model toward concise markdown answers.
    system_prompt = (
        "<|system|>"
        "You are a high-speed professional AI assistant. "
        "Respond clearly, concisely, and in structured markdown format. "
        "Use bullet points, headings, and emojis when helpful. "
        "Never include conversation history unless asked."
        "<|end|>"
    )

    # BUG FIX: the Phi-3 chat template requires an <|end|> terminator after
    # the user turn. The original template never closed the user turn
    # (f"<|user|>{msg}<|assistant|>"), which can make the model continue the
    # user's text instead of answering as the assistant.
    prompt = system_prompt + f"<|user|>{req.message}<|end|><|assistant|>"

    # Generation settings tuned for latency vs. quality on CPU.
    output = llm(
        prompt,
        max_tokens=400,        # cap length for faster responses
        temperature=0.6,       # lower temperature -> fewer hallucinations
        top_p=0.9,
        repeat_penalty=1.15,   # discourages repetition loops
        stop=["<|end|>"],      # halt at the template's turn terminator
    )

    response_text = output["choices"][0]["text"].strip()
    return {"reply": response_text}
92
+
93
+ # ===============================
94
+ # LOCAL RUN
95
+ # ===============================
96
 
97
  if __name__ == "__main__":
98
  import uvicorn