"""FastAPI inference server wrapping a local GGUF Qwen2.5-3B model via llama-cpp-python.

On import this module downloads the model file from the Hugging Face Hub
(cached locally by `hf_hub_download`) and loads it fully into RAM — startup
is slow and memory-heavy by design.
"""

import multiprocessing

from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

app = FastAPI()

# ===============================
# MODEL CONFIG
# ===============================
MODEL_REPO = "bartowski/Qwen2.5-3B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-3B-Instruct-Q4_K_M.gguf"

# Downloads on first run, then resolves from the local HF cache.
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
)

# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================
llm = Llama(
    model_path=model_path,
    # Large context for deep reasoning
    n_ctx=8192,
    # Use all CPU cores
    n_threads=multiprocessing.cpu_count(),
    # CPU mode
    n_gpu_layers=0,
    # Performance boost: larger prompt-eval batch
    n_batch=512,
    use_mmap=True,
    # NOTE(review): mlock pins ~2 GB of model weights in RAM; may fail or be
    # a no-op on hosts with a low RLIMIT_MEMLOCK — confirm for the deploy target.
    use_mlock=True,
)


# ===============================
# REQUEST MODEL
# ===============================
class ChatRequest(BaseModel):
    # User's chat message; the only field accepted by POST /chat.
    message: str


# ===============================
# HEALTH CHECK
# ===============================
@app.get("/")
def root():
    """Liveness probe: confirms the process is up and the model loaded."""
    return {"status": "Strategy AI engine running"}


# ===============================
# CHAT ENDPOINT
# ===============================
@app.post("/chat")
def chat(req: ChatRequest):
    """Run one blocking completion against the local model and return the text.

    Synchronous on purpose: llama-cpp inference is CPU-bound, and FastAPI
    runs sync endpoints in a threadpool, keeping the event loop responsive.
    """
    # STRATEGY SPECIALIZED SYSTEM PROMPT
    # NOTE(review): the <|system|>/<|user|>/<|assistant|>/<|end|> markers look
    # like a Phi-3 template, but Qwen2.5 was trained on ChatML
    # (<|im_start|>role ... <|im_end|>). Verify output quality; consider
    # llm.create_chat_completion(), which applies the model's own template.
    system_prompt = (
        "<|system|>"
        "You are an elite strategic intelligence AI. "
        "Think step-by-step before answering. "
        "Provide deep analysis, structured reasoning, and clear actionable insights. "
        "Use bullet points, numbered steps, and markdown formatting."
        "<|end|>"
    )

    prompt = system_prompt + f"<|user|>{req.message}<|assistant|>"

    output = llm(
        prompt,
        # Longer reasoning output
        max_tokens=900,
        # Lower randomness for logical thinking
        temperature=0.35,
        # Stable probability sampling
        top_p=0.9,
        # Prevent loops
        repeat_penalty=1.2,
        stop=["<|end|>"],
    )

    response_text = output["choices"][0]["text"].strip()
    return {"reply": response_text}


# ===============================
# LOCAL RUN
# ===============================
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)