from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import multiprocessing

# Application object: the route decorators below register their handlers on
# it, and the __main__ guard at the bottom of the file passes it to uvicorn.
app = FastAPI()

# ===============================
# MODEL CONFIG
# ===============================

# Hugging Face repository id and the GGUF file requested from it.
MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
MODEL_FILE = "Phi-3-mini-4k-instruct-q4.gguf"

# Runs at import time; the result is handed to Llama as its model_path.
model_path = hf_hub_download(filename=MODEL_FILE, repo_id=MODEL_REPO)

# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================

llm = Llama(
    # --- what to load, and how ---
    model_path=model_path,
    use_mmap=True,           # memory-map the model file (faster loading)
    use_mlock=True,          # ask the OS to keep the model in RAM (no swapping)

    # --- where inference runs ---
    n_gpu_layers=0,          # nothing offloaded to a GPU: CPU inference
    n_threads=multiprocessing.cpu_count(),  # one thread per reported CPU

    # --- sizing ---
    n_ctx=4096,              # context window (speed vs. memory trade-off)
    n_batch=512,             # batch size for faster token processing
)

# ===============================
# REQUEST MODEL
# ===============================

# Request body for POST /chat (the `req` parameter of `chat`).
class ChatRequest(BaseModel):
    # Text supplied by the caller; `chat` places it in the <|user|> part of
    # the prompt sent to the model.
    message: str

# ===============================
# HEALTH CHECK
# ===============================

@app.get("/")
def root():
    # Health check: a fixed payload that does not touch the model.
    status_payload = {"status": "Speed AI engine running"}
    return status_payload

# ===============================
# CHAT ENDPOINT
# ===============================

# Instructions placed in the <|system|> turn of every prompt.
_SYSTEM_INSTRUCTIONS = (
    "You are a high-speed professional AI assistant. "
    "Respond clearly, concisely, and in structured markdown format. "
    "Use bullet points, headings, and emojis when helpful. "
    "Never include conversation history unless asked."
)


def _build_prompt(message):
    """Render a single-turn prompt in the Phi-3 instruct chat layout.

    Each turn is a role tag on its own line followed by the text and a
    closing ``<|end|>``. The user turn must be closed as well; without that
    terminator the model sees the assistant tag as part of the user's text.
    The prompt ends on an open ``<|assistant|>`` turn for the model to fill.

    Args:
        message: Raw user text to place in the user turn.

    Returns:
        The full prompt string to pass to the model.
    """
    return (
        f"<|system|>\n{_SYSTEM_INSTRUCTIONS}<|end|>\n"
        f"<|user|>\n{message}<|end|>\n"
        "<|assistant|>\n"
    )


@app.post("/chat")
def chat(req: ChatRequest):
    """Generate one assistant reply for the submitted message.

    Args:
        req: Request body; ``req.message`` becomes the user turn.

    Returns:
        A dict ``{"reply": text}`` where ``text`` is the completion with
        surrounding whitespace stripped.
    """
    # NOTE(review): this is a sync endpoint, which FastAPI runs in a worker
    # thread pool, so overlapping requests can call the shared `llm` at the
    # same time -- confirm that is safe or serialize access.
    # NOTE(review): `req.message` is inserted verbatim; chat markers typed by
    # the caller (e.g. "<|end|>") may be read as control tokens -- verify.
    output = llm(
        _build_prompt(req.message),
        max_tokens=400,        # cap on generated tokens
        temperature=0.6,
        top_p=0.9,
        repeat_penalty=1.15,   # discourages repetition loops
        stop=["<|end|>"],      # cut generation at the end-of-turn marker
    )

    response_text = output["choices"][0]["text"].strip()

    return {"reply": response_text}

# ===============================
# LOCAL RUN
# ===============================

if __name__ == "__main__":
    # Run directly as a script: serve the app with uvicorn, listening on all
    # interfaces at port 7860.
    import uvicorn

    bind_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **bind_options)