"""FastAPI inference server wrapping a local GGUF Qwen2.5-3B model via llama-cpp-python.

On import this module downloads the model file from the Hugging Face Hub
(cached locally by `hf_hub_download`) and loads it fully into RAM — startup
is slow and memory-heavy by design.
"""

import multiprocessing

from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

app = FastAPI()

# ===============================
# MODEL CONFIG
# ===============================
MODEL_REPO = "bartowski/Qwen2.5-3B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-3B-Instruct-Q4_K_M.gguf"

# Downloads on first run, then resolves from the local HF cache.
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
)

# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================
llm = Llama(
    model_path=model_path,
    # Large context for deep reasoning
    n_ctx=8192,
    # Use all CPU cores
    n_threads=multiprocessing.cpu_count(),
    # CPU mode
    n_gpu_layers=0,
    # Performance boost: larger prompt-eval batch
    n_batch=512,
    use_mmap=True,
    # NOTE(review): mlock pins ~2 GB of model weights in RAM; may fail or be
    # a no-op on hosts with a low RLIMIT_MEMLOCK — confirm for the deploy target.
    use_mlock=True,
)


# ===============================
# REQUEST MODEL
# ===============================
class ChatRequest(BaseModel):
    # User's chat message; the only field accepted by POST /chat.
    message: str


# ===============================
# HEALTH CHECK
# ===============================
@app.get("/")
def root():
    """Liveness probe: confirms the process is up and the model loaded."""
    return {"status": "Strategy AI engine running"}


# ===============================
# CHAT ENDPOINT
# ===============================
@app.post("/chat")
def chat(req: ChatRequest):
    """Run one blocking completion against the local model and return the text.

    Synchronous on purpose: llama-cpp inference is CPU-bound, and FastAPI
    runs sync endpoints in a threadpool, keeping the event loop responsive.
    """
    # STRATEGY SPECIALIZED SYSTEM PROMPT
    # NOTE(review): the <|system|>/<|user|>/<|assistant|>/<|end|> markers look
    # like a Phi-3 template, but Qwen2.5 was trained on ChatML
    # (<|im_start|>role ... <|im_end|>). Verify output quality; consider
    # llm.create_chat_completion(), which applies the model's own template.
    system_prompt = (
        "<|system|>"
        "You are an elite strategic intelligence AI. "
        "Think step-by-step before answering. "
        "Provide deep analysis, structured reasoning, and clear actionable insights. "
        "Use bullet points, numbered steps, and markdown formatting."
        "<|end|>"
    )

    prompt = system_prompt + f"<|user|>{req.message}<|assistant|>"

    output = llm(
        prompt,
        # Longer reasoning output
        max_tokens=900,
        # Lower randomness for logical thinking
        temperature=0.35,
        # Stable probability sampling
        top_p=0.9,
        # Prevent loops
        repeat_penalty=1.2,
        stop=["<|end|>"],
    )

    response_text = output["choices"][0]["text"].strip()
    return {"reply": response_text}


# ===============================
# LOCAL RUN
# ===============================
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)