"""FastAPI service exposing a local Qwen2.5-Coder GGUF model for coding chat.

Downloads the quantized model from the Hugging Face Hub at import time and
serves a single synchronous /chat endpoint backed by llama-cpp-python
CPU inference.
"""

import multiprocessing

from fastapi import FastAPI
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

app = FastAPI()

# ===============================
# MODEL CONFIG
# ===============================
MODEL_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf"

# Cached locally by huggingface_hub; only hits the network on first run.
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)

# ===============================
# LLM INITIALIZATION (OPTIMIZED)
# ===============================
llm = Llama(
    model_path=model_path,
    n_ctx=8192,                             # larger context for coding tasks
    n_threads=multiprocessing.cpu_count(),  # use all CPU cores
    n_gpu_layers=0,                         # pure CPU inference
    n_batch=512,                            # bigger prompt-eval batches
    use_mmap=True,                          # memory-map the model file
    use_mlock=True,                         # pin pages to avoid swapping
)

# Coding-specialized system prompt (plain text — no template tokens; the
# model's own chat template is applied by create_chat_completion below).
SYSTEM_PROMPT = (
    "You are an elite senior software engineer AI. "
    "Write clean, production-ready code. "
    "Always include comments. "
    "Use best practices, error handling, and optimization. "
    "Format output in proper markdown with code blocks."
)


# ===============================
# REQUEST MODEL
# ===============================
class ChatRequest(BaseModel):
    # Single user turn; no conversation history is kept server-side.
    message: str


# ===============================
# HEALTH CHECK
# ===============================
@app.get("/")
def root():
    """Liveness probe: confirms the service (and loaded model) is up."""
    return {"status": "Coding AI engine running"}


# ===============================
# CHAT ENDPOINT
# ===============================
@app.post("/chat")
def chat(req: ChatRequest):
    """Generate a coding-focused reply for a single user message.

    BUG FIX: the original hand-built prompt used Phi-style control tokens
    (<|system|>/<|user|>/<|assistant|>/<|end|>), which are not the Qwen2.5
    chat format (ChatML <|im_start|>/<|im_end|>) — the model saw them as
    literal text and the stop sequence never matched the real end token.
    create_chat_completion applies the chat template embedded in the GGUF
    metadata, so the special tokens are always correct for the loaded model.
    """
    output = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": req.message},
        ],
        max_tokens=800,      # larger token output for code
        temperature=0.4,     # lower randomness = better code
        top_p=0.9,           # stable generation
        repeat_penalty=1.2,  # prevent repetition loops
    )
    reply = output["choices"][0]["message"]["content"].strip()
    return {"reply": reply}


# ===============================
# LOCAL RUN
# ===============================
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)