SharmaGroups07 committed on
Commit
c7135bb
·
verified ·
1 Parent(s): 48b1cf8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -9
app.py CHANGED
@@ -2,9 +2,14 @@ from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from llama_cpp import Llama
4
  from huggingface_hub import hf_hub_download
 
5
 
6
  app = FastAPI()
7
 
 
 
 
 
8
  MODEL_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF"
9
  MODEL_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf"
10
 
@@ -13,33 +18,89 @@ model_path = hf_hub_download(
13
  filename=MODEL_FILE
14
  )
15
 
 
 
 
 
16
  llm = Llama(
17
  model_path=model_path,
18
- n_ctx=2048,
19
- n_threads=2
 
 
 
 
 
 
 
 
 
 
 
 
20
  )
21
 
 
 
 
 
22
  class ChatRequest(BaseModel):
23
  message: str
24
 
 
 
 
 
25
  @app.get("/")
26
  def root():
27
- return {"status": "AI engine running"}
 
 
 
 
28
 
29
  @app.post("/chat")
30
  def chat(req: ChatRequest):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  output = llm(
32
- f"<|user|>{req.message}<|assistant|>",
33
- max_tokens=512,
34
- temperature=0.7,
 
 
 
 
 
 
35
  top_p=0.9,
36
- repeat_penalty=1.1,
 
 
 
37
  stop=["<|end|>"]
38
  )
39
 
40
- return {"reply": output["choices"][0]["text"]}
 
 
 
 
 
 
41
 
42
- # ⭐ THIS PART WAS MISSING
43
  if __name__ == "__main__":
44
  import uvicorn
45
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
2
  from pydantic import BaseModel
3
  from llama_cpp import Llama
4
  from huggingface_hub import hf_hub_download
5
+ import multiprocessing
6
 
7
  app = FastAPI()
8
 
9
# ===============================
# MODEL CONFIG
# ===============================

# Hugging Face repo hosting the quantized GGUF build of Qwen2.5-Coder.
MODEL_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF"

# Q4_K_M quantization: a good quality/size trade-off for CPU-only inference.
MODEL_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf"
15
 
 
18
  filename=MODEL_FILE
19
  )
20
 
21
# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================

llm = Llama(
    model_path=model_path,
    n_ctx=8192,                             # roomy context window for coding tasks
    n_threads=multiprocessing.cpu_count(),  # use every CPU core reported
    n_gpu_layers=0,                         # pure CPU inference, no GPU offload
    n_batch=512,                            # bigger prompt-eval batches = faster ingest
    use_mmap=True,                          # memory-map the weights instead of copying
    use_mlock=True,                         # pin weights in RAM to avoid swap stalls
)
42
 
43
# ===============================
# REQUEST MODEL
# ===============================

class ChatRequest(BaseModel):
    """JSON body for POST /chat: a single user message for the model."""

    # Raw user prompt; forwarded into the LLM prompt template verbatim.
    message: str
49
 
50
# ===============================
# HEALTH CHECK
# ===============================

@app.get("/")
def root():
    """Liveness probe: confirms the API process is up and serving."""
    status_payload = {"status": "Coding AI engine running"}
    return status_payload
57
+
58
# ===============================
# CHAT ENDPOINT
# ===============================

@app.post("/chat")
def chat(req: ChatRequest):
    """Generate a coding-focused reply for the user's message.

    Uses llama-cpp-python's chat-completion API so the chat template
    embedded in the GGUF (Qwen2.5 uses ChatML: <|im_start|>/<|im_end|>)
    is applied automatically. The previous hand-rolled
    "<|user|>...<|assistant|>" prompt with stop token "<|end|>" belongs
    to a different model family; Qwen never emits "<|end|>", so every
    request ran to max_tokens and response quality suffered.

    Returns a JSON object: {"reply": <generated text>}.
    """
    # Coding-specialized system prompt (same instructions as before,
    # minus the wrong special-token wrappers — the template adds its own).
    system_prompt = (
        "You are an elite senior software engineer AI. "
        "Write clean, production-ready code. "
        "Always include comments. "
        "Use best practices, error handling, and optimization. "
        "Format output in proper markdown with code blocks."
    )

    output = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": req.message},
        ],
        max_tokens=800,      # generous budget for code-heavy answers
        temperature=0.4,     # low randomness = more reliable code
        top_p=0.9,           # nucleus sampling for stable generation
        repeat_penalty=1.2,  # damp repetition loops
    )

    # Chat-completion responses carry the text under message.content;
    # guard against a None content before stripping.
    response_text = (output["choices"][0]["message"]["content"] or "").strip()

    return {"reply": response_text}
99
+
100
# ===============================
# LOCAL RUN
# ===============================

if __name__ == "__main__":
    # Lazy import: uvicorn is only needed when launching this file directly.
    import uvicorn

    # 0.0.0.0:7860 is the port Hugging Face Spaces expects to be exposed.
    uvicorn.run(app, host="0.0.0.0", port=7860)