"""FastAPI service that answers chat messages with a local Phi-3 mini GGUF model.

At import time the module downloads the model file from the Hugging Face Hub
and loads it through llama.cpp for CPU inference. It exposes two routes:
``GET /`` (health check) and ``POST /chat`` (single-turn completion).
"""

import multiprocessing
import threading

from fastapi import FastAPI
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

app = FastAPI()

# ===============================
# MODEL CONFIG
# ===============================

MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
MODEL_FILE = "Phi-3-mini-4k-instruct-q4.gguf"

# Resolves the model file to a local path, fetching it from the Hub if needed.
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
)

# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================

llm = Llama(
    model_path=model_path,
    # Context window (balance speed + memory)
    n_ctx=4096,
    # Use all CPU cores automatically
    n_threads=multiprocessing.cpu_count(),
    # CPU inference
    n_gpu_layers=0,
    # Performance optimizations
    n_batch=512,  # faster token processing
    use_mmap=True,  # faster loading
    use_mlock=True,  # prevents RAM swapping
)

# FastAPI executes plain ``def`` endpoints in a thread pool, so several
# requests can reach ``llm`` at once. The single ``Llama`` instance is shared
# by all of them, so inference is serialized through this lock.
_llm_lock = threading.Lock()

# ===============================
# PROMPT TEMPLATE
# ===============================

# Instructions sent as the system turn of every request.
SYSTEM_PROMPT = (
    "You are a high-speed professional AI assistant. "
    "Respond clearly, concisely, and in structured markdown format. "
    "Use bullet points, headings, and emojis when helpful. "
    "Never include conversation history unless asked."
)

# Marker that closes a turn; also used as the generation stop sequence.
END_OF_TURN = "<|end|>"


def _build_prompt(message: str) -> str:
    """Return the single-turn Phi-3 prompt for *message*.

    Every turn is closed with ``<|end|>`` and the prompt finishes with an open
    assistant turn so the model generates the reply.
    """
    # NOTE(review): ``message`` is interpolated verbatim, so user text that
    # contains chat markers such as ``<|end|>`` can alter the turn structure.
    # Consider stripping those markers if the endpoint is exposed publicly.
    return (
        f"<|system|>\n{SYSTEM_PROMPT}{END_OF_TURN}\n"
        f"<|user|>\n{message}{END_OF_TURN}\n"
        "<|assistant|>\n"
    )


# ===============================
# REQUEST MODEL
# ===============================

class ChatRequest(BaseModel):
    """Request body for ``POST /chat``."""

    # The user's message, inserted as the user turn of the prompt.
    message: str


# ===============================
# HEALTH CHECK
# ===============================

@app.get("/")
def root():
    """Report that the service is up."""
    return {"status": "Speed AI engine running"}


# ===============================
# CHAT ENDPOINT
# ===============================

@app.post("/chat")
def chat(req: ChatRequest):
    """Generate a reply to ``req.message``.

    Returns a JSON object ``{"reply": <text>}`` holding the stripped text of
    the first completion choice.
    """
    prompt = _build_prompt(req.message)

    # GENERATION SETTINGS (OPTIMIZED BALANCE)
    with _llm_lock:
        output = llm(
            prompt,
            max_tokens=400,  # faster than 512
            temperature=0.6,  # less hallucination
            top_p=0.9,
            repeat_penalty=1.15,  # reduces loops
            stop=[END_OF_TURN],
        )

    response_text = output["choices"][0]["text"].strip()

    return {"reply": response_text}


# ===============================
# LOCAL RUN
# ===============================

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)