"""Gradio chat UI running llama.cpp inference on a GGUF model from the HF Hub."""

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the quantized GGUF model file from the Hugging Face Hub
# (cached locally after the first run).
model_path = hf_hub_download(
    repo_id="astegaras/lora_merged",
    filename="llama-3.2-3b-instruct.Q2_K.gguf",
)

# Load the GGUF model with conservative, CPU-only settings suitable for
# constrained hosts (e.g. free-tier HF Spaces).
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=4,
    n_batch=64,
    n_gpu_layers=0,   # CPU-only inference
    use_mmap=False,   # avoid mmap on hosts where it causes OOM kills
    use_mlock=False,  # do not pin model pages in RAM
    # NOTE(review): `low_vram` was removed from recent llama-cpp-python
    # releases; it is passed through **kwargs and ignored there. Confirm
    # against the pinned llama-cpp-python version before relying on it.
    low_vram=True,
    verbose=False,
)


def chat_fn(message, history):
    """Generate one assistant reply for the Gradio ChatInterface.

    Args:
        message: The new user message (str).
        history: Prior turns. Supports both Gradio history shapes:
            a list of (user, assistant) pairs, or a list of
            {"role": ..., "content": ...} dicts (``type="messages"``).

    Returns:
        The assistant's reply text (str).
    """
    # Rebuild the conversation in OpenAI-style messages for llama.cpp's
    # chat-template handling.
    messages = []
    for turn in history:
        if isinstance(turn, dict):
            # Already in role/content form — pass through unchanged.
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            # (user, assistant) tuple/list pair.
            user, assistant = turn
            messages.append({"role": "user", "content": user})
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=256,
        temperature=0.2,
        top_p=0.5,
    )
    return output["choices"][0]["message"]["content"]


# Gradio UI
chatbot = gr.ChatInterface(
    fn=chat_fn,
    title="Merged Kaggle Model (GGUF)",
    description="Running llama.cpp inference on GGUF model",
)

if __name__ == "__main__":
    # Guard so importing this module (e.g. for tests) doesn't start a server.
    chatbot.launch()