# Spaces: Running  (Hugging Face Spaces status banner — copy/paste residue, not code)
| from fastapi import FastAPI | |
| from pydantic import BaseModel | |
| from llama_cpp import Llama | |
| from huggingface_hub import hf_hub_download | |
| import multiprocessing | |
app = FastAPI()

# ===============================
# MODEL CONFIG
# ===============================
# GGUF quantized build of Qwen2.5-3B-Instruct (Q4_K_M = 4-bit quantization),
# suitable for CPU-only inference via llama.cpp.
MODEL_REPO = "bartowski/Qwen2.5-3B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-3B-Instruct-Q4_K_M.gguf"

# Download the weights (or reuse the local HF cache) at import time.
# NOTE(review): this blocks app startup until the download finishes.
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE
)
# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================
llm = Llama(
    model_path=model_path,
    # Large context window for deep reasoning (8k tokens)
    n_ctx=8192,
    # Use all CPU cores for inference
    n_threads=multiprocessing.cpu_count(),
    # CPU mode: no layers offloaded to a GPU
    n_gpu_layers=0,
    # Prompt-processing batch size (tokens evaluated per batch)
    n_batch=512,
    # Memory-map the weights instead of loading them fully into RAM
    use_mmap=True,
    # Pin the mapped pages so the OS does not swap them out.
    # NOTE(review): mlock can fail or require raised ulimits on some hosts.
    use_mlock=True,
)
# ===============================
# REQUEST MODEL
# ===============================
class ChatRequest(BaseModel):
    """JSON request body for the chat endpoint."""

    # The user's chat message / question.
    message: str
# ===============================
# HEALTH CHECK
# ===============================
# BUG FIX: the function was defined but never registered with FastAPI, so
# GET / returned 404. @app.get("/") wires it up as the health-check route.
@app.get("/")
def root():
    """Health-check endpoint.

    Returns a static status payload so callers can verify the service is
    up without touching the model.
    """
    return {"status": "Strategy AI engine running"}
# ===============================
# CHAT ENDPOINT
# ===============================
# BUG FIX 1: the function was never registered with FastAPI; @app.post
# exposes it at POST /chat.
@app.post("/chat")
def chat(req: ChatRequest):
    """Run one chat turn through the local Qwen2.5 model.

    Args:
        req: Parsed JSON body containing the user's ``message``.

    Returns:
        dict: ``{"reply": <generated text>}``.
    """
    # BUG FIX 2: the original hand-built prompt used
    # <|system|>/<|user|>/<|assistant|>/<|end|> markers, which are NOT
    # Qwen2.5's chat format (Qwen2.5 uses ChatML:
    # <|im_start|>role ... <|im_end|>), so the model saw a malformed prompt
    # and the stop token never matched. create_chat_completion() applies
    # the chat template embedded in the GGUF metadata, so the correct
    # format and stop tokens are used automatically.
    system_prompt = (
        "You are an elite strategic intelligence AI. "
        "Think step-by-step before answering. "
        "Provide deep analysis, structured reasoning, and clear actionable insights. "
        "Use bullet points, numbered steps, and markdown formatting."
    )
    output = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": req.message},
        ],
        # Longer reasoning output
        max_tokens=900,
        # Lower randomness for logical thinking
        temperature=0.35,
        # Stable probability sampling
        top_p=0.9,
        # Prevent loops
        repeat_penalty=1.2,
    )
    # Chat-completion responses carry the text under "message"/"content".
    response_text = output["choices"][0]["message"]["content"].strip()
    return {"reply": response_text}
# ===============================
# LOCAL RUN
# ===============================
if __name__ == "__main__":
    # Imported lazily so the module can also be served by an external ASGI
    # runner without uvicorn being a hard import-time dependency.
    import uvicorn
    # 0.0.0.0 binds all interfaces; 7860 is the standard HF Spaces port.
    uvicorn.run(app, host="0.0.0.0", port=7860)