# Hugging Face Space status (paste residue): Spaces: Running
# Standard library first, then third-party (PEP 8 grouping).
import multiprocessing

from fastapi import FastAPI
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

# FastAPI application instance served by uvicorn below.
app = FastAPI()
# ===============================
# MODEL CONFIG
# ===============================
# GGUF build of Phi-3-mini-4k-instruct hosted on the Hugging Face Hub.
MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
MODEL_FILE = "Phi-3-mini-4k-instruct-q4.gguf"

# Fetch the weights at import time (hf_hub_download reuses its local cache
# on subsequent starts) and remember the resolved path for the LLM loader.
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================
# CPU-only llama.cpp engine, tuned for throughput.
llm = Llama(
    model_path=model_path,
    n_ctx=4096,                              # context window: balance speed + memory
    n_threads=multiprocessing.cpu_count(),   # use all CPU cores automatically
    n_gpu_layers=0,                          # CPU inference only
    n_batch=512,                             # faster token processing
    use_mmap=True,                           # memory-map weights for faster loading
    use_mlock=True,                          # pin weights in RAM, prevents swapping
)
# ===============================
# REQUEST MODEL
# ===============================
class ChatRequest(BaseModel):
    """JSON body accepted by the chat endpoint: one user message string."""

    message: str
# ===============================
# HEALTH CHECK
# ===============================
@app.get("/")
def root():
    """Liveness probe: confirms the API process is up and serving.

    Registered at GET / — without the route decorator this handler was
    unreachable over HTTP.
    """
    return {"status": "Speed AI engine running"}
# ===============================
# CHAT ENDPOINT
# ===============================
@app.post("/chat")
def chat(req: ChatRequest):
    """Generate one assistant reply for ``req.message``.

    Builds a Phi-3 chat-template prompt (system turn + user turn), runs the
    local llama.cpp model, and returns the trimmed completion. Registered at
    POST /chat — without the route decorator this handler was unreachable
    over HTTP.
    """
    # PROFESSIONAL SYSTEM PROMPT (Phi-3 special tokens delimit the turn)
    system_prompt = (
        "<|system|>"
        "You are a high-speed professional AI assistant. "
        "Respond clearly, concisely, and in structured markdown format. "
        "Use bullet points, headings, and emojis when helpful. "
        "Never include conversation history unless asked."
        "<|end|>"
    )
    prompt = system_prompt + f"<|user|>{req.message}<|assistant|>"

    # GENERATION SETTINGS (OPTIMIZED BALANCE)
    output = llm(
        prompt,
        max_tokens=400,        # faster than 512
        temperature=0.6,       # less hallucination
        top_p=0.9,
        repeat_penalty=1.15,   # reduces loops
        stop=["<|end|>"],      # stop at the end-of-turn token
    )

    response_text = output["choices"][0]["text"].strip()
    return {"reply": response_text}
# ===============================
# LOCAL RUN
# ===============================
if __name__ == "__main__":
    # Direct execution: serve on all interfaces at Spaces' default port.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)