SharmaGroups07 committed on
Commit
e5cd937
·
verified ·
1 Parent(s): cf95752

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -7
app.py CHANGED
@@ -6,6 +6,10 @@ import multiprocessing
6
 
7
  app = FastAPI()
8
 
 
 
 
 
9
  MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
10
  MODEL_FILE = "Phi-3-mini-4k-instruct-q4.gguf"
11
 
@@ -14,37 +18,81 @@ model_path = hf_hub_download(
14
  filename=MODEL_FILE
15
  )
16
 
 
 
 
 
17
  llm = Llama(
18
  model_path=model_path,
 
 
19
  n_ctx=4096,
 
 
20
  n_threads=multiprocessing.cpu_count(),
21
- n_gpu_layers=0
 
 
 
 
 
 
 
22
  )
23
 
 
 
 
 
24
  class ChatRequest(BaseModel):
25
  message: str
26
 
 
 
 
 
27
  @app.get("/")
28
  def root():
29
- return {"status": "AI engine running"}
 
 
 
 
30
 
31
  @app.post("/chat")
32
  def chat(req: ChatRequest):
33
 
34
- system_prompt = "<|system|>You are a professional AI assistant. Answer clearly, structured, and concisely using markdown formatting.<|end|>"
 
 
 
 
 
 
 
 
35
 
36
  prompt = system_prompt + f"<|user|>{req.message}<|assistant|>"
37
 
 
38
  output = llm(
39
  prompt,
40
- max_tokens=512,
41
- temperature=0.7,
 
42
  top_p=0.9,
43
- repeat_penalty=1.1,
 
44
  stop=["<|end|>"]
45
  )
46
 
47
- return {"reply": output["choices"][0]["text"]}
 
 
 
 
 
 
48
 
49
  if __name__ == "__main__":
50
  import uvicorn
 
# FastAPI application object; the routes below are registered on it.
app = FastAPI()
# --- Model configuration -------------------------------------------
# GGUF quantized build of Phi-3-mini, fetched from the Hugging Face Hub.
MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
MODEL_FILE = "Phi-3-mini-4k-instruct-q4.gguf"
 
 
18
  filename=MODEL_FILE
19
  )
20
 
# --- LLM initialization (CPU-optimized) ----------------------------
# Single llama.cpp model handle, shared by all request handlers.
llm = Llama(
    model_path=model_path,
    n_ctx=4096,                             # context window: speed/memory balance
    n_threads=multiprocessing.cpu_count(),  # saturate every available CPU core
    n_gpu_layers=0,                         # CPU-only inference
    n_batch=512,                            # larger batch -> faster prompt eval
    use_mmap=True,                          # memory-map weights for fast loading
    use_mlock=True,                         # pin weights in RAM to avoid swapping
)
42
 
# --- Request schema ------------------------------------------------
class ChatRequest(BaseModel):
    """JSON body for POST /chat: a single user message."""

    message: str
49
 
# --- Health check --------------------------------------------------
@app.get("/")
def root():
    """Liveness probe: report that the service is up."""
    return {"status": "Speed AI engine running"}
+
# --- Chat endpoint -------------------------------------------------
@app.post("/chat")
def chat(req: ChatRequest):
    """Generate a single markdown-formatted reply to ``req.message``.

    Runs one blocking llama.cpp completion and returns
    ``{"reply": <generated text>}``.
    """
    # System prompt steering the model toward concise markdown answers.
    system_prompt = (
        "<|system|>"
        "You are a high-speed professional AI assistant. "
        "Respond clearly, concisely, and in structured markdown format. "
        "Use bullet points, headings, and emojis when helpful. "
        "Never include conversation history unless asked."
        "<|end|>"
    )

    # BUG FIX: the Phi-3 chat template requires an <|end|> terminator after
    # the user turn. The original template never closed the user turn
    # (f"<|user|>{msg}<|assistant|>"), which can make the model continue the
    # user's text instead of answering as the assistant.
    prompt = system_prompt + f"<|user|>{req.message}<|end|><|assistant|>"

    # Generation settings tuned for latency vs. quality on CPU.
    output = llm(
        prompt,
        max_tokens=400,        # cap length for faster responses
        temperature=0.6,       # lower temperature -> fewer hallucinations
        top_p=0.9,
        repeat_penalty=1.15,   # discourages repetition loops
        stop=["<|end|>"],      # halt at the template's turn terminator
    )

    response_text = output["choices"][0]["text"].strip()
    return {"reply": response_text}
92
+
93
+ # ===============================
94
+ # LOCAL RUN
95
+ # ===============================
96
 
97
  if __name__ == "__main__":
98
  import uvicorn