from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import multiprocessing
app = FastAPI()
# ===============================
# MODEL CONFIG
# ===============================
MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
MODEL_FILE = "Phi-3-mini-4k-instruct-q4.gguf"
model_path = hf_hub_download(
repo_id=MODEL_REPO,
filename=MODEL_FILE
)
# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================
llm = Llama(
model_path=model_path,
# Context window (balance speed + memory)
n_ctx=4096,
# Use all CPU cores automatically
n_threads=multiprocessing.cpu_count(),
# CPU inference
n_gpu_layers=0,
# Performance optimizations
n_batch=512, # faster token processing
use_mmap=True, # faster loading
use_mlock=True, # prevents RAM swapping
)
# ===============================
# REQUEST MODEL
# ===============================
class ChatRequest(BaseModel):
message: str
# ===============================
# HEALTH CHECK
# ===============================
@app.get("/")
def root():
return {"status": "Speed AI engine running"}
# ===============================
# CHAT ENDPOINT
# ===============================
@app.post("/chat")
def chat(req: ChatRequest):
# PROFESSIONAL SYSTEM PROMPT
system_prompt = (
"<|system|>"
"You are a high-speed professional AI assistant. "
"Respond clearly, concisely, and in structured markdown format. "
"Use bullet points, headings, and emojis when helpful. "
"Never include conversation history unless asked."
"<|end|>"
)
prompt = system_prompt + f"<|user|>{req.message}<|assistant|>"
# GENERATION SETTINGS (OPTIMIZED BALANCE)
output = llm(
prompt,
max_tokens=400, # faster than 512
temperature=0.6, # less hallucination
top_p=0.9,
repeat_penalty=1.15, # reduces loops
stop=["<|end|>"]
)
response_text = output["choices"][0]["text"].strip()
return {"reply": response_text}
# ===============================
# LOCAL RUN
# ===============================
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860)
|