import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Hugging Face Hub coordinates of the GGUF weights loaded below.
# NOTE(review): confirm this repo/file pair exists on the Hub before deploying.
MODEL_REPO = "Qwen/Qwen3-4B-Instruct-GGUF"
MODEL_FILE = "Qwen3-4B-Instruct-Q4_K_M.gguf"

print("Loading model...")

# Fetch the weights from the Hub; the call returns the local file path.
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
)

# Single shared model instance used by every chat request.
llm = Llama(
    model_path=model_path,
    n_ctx=1024,
    n_batch=512,
    # os.cpu_count() returns None when the core count cannot be determined;
    # fall back to 1 so the subtraction never raises TypeError. Otherwise use
    # all cores but one, and never fewer than one thread.
    n_threads=max(1, (os.cpu_count() or 1) - 1),
    verbose=False,
)

# Persona name -> system prompt placed at the start of every conversation.
# Insertion order is the order in which the names are listed to the user.
PERSONAS = {
    "Assistant": "You are a helpful AI assistant.",
    "Programmer": "You are an expert software engineer.",
    "Writer": "You are a creative writer.",
    "Teacher": "You explain concepts clearly and step-by-step.",
}

# Estimated extra tokens the chat template spends on each message's role
# markers. Deliberately generous: the exact template is model-specific.
_TEMPLATE_OVERHEAD_TOKENS = 8


def _history_turns(history):
    """Normalize chat history into a list of turns (lists of message dicts).

    Accepts both ``(user, assistant)`` pairs and ``{"role", "content"}``
    dicts. Entries whose content is not text (``None``, media, ...) are
    skipped because the model can only consume strings.
    """
    turns = []
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            content = entry.get("content")
            if role not in ("user", "assistant") or not isinstance(content, str):
                continue
            # A user message opens a new turn; replies attach to the current one.
            if role == "user" or not turns:
                turns.append([])
            turns[-1].append({"role": role, "content": content})
        else:
            user, assistant = entry
            turn = []
            if isinstance(user, str):
                turn.append({"role": "user", "content": user})
            if isinstance(assistant, str):
                turn.append({"role": "assistant", "content": assistant})
            if turn:
                turns.append(turn)
    return turns


def _estimate_tokens(entry):
    """Return an approximate token cost of one chat message."""
    encoded = entry["content"].encode("utf-8")
    return len(llm.tokenize(encoded, add_bos=False)) + _TEMPLATE_OVERHEAD_TOKENS


def chat(message, history, persona):
    """Generate the assistant's reply to ``message``.

    The prompt is the persona's system prompt, as much of the most recent
    history as fits in the model's context window, and finally ``message``.
    Older turns are dropped first so that a long conversation does not grow
    the prompt past the context size.

    Args:
        message: The new user message.
        history: Earlier conversation, as ``(user, assistant)`` pairs or
            ``{"role", "content"}`` dicts. May be empty or ``None``.
        persona: Key into ``PERSONAS`` selecting the system prompt.

    Returns:
        The text content of the first completion choice.

    Raises:
        KeyError: If ``persona`` is not a key of ``PERSONAS``.

    Note:
        Only history is trimmed. A single ``message`` that is itself larger
        than the context window is still passed through unchanged, and the
        model call may reject it.
    """
    system_entry = {"role": "system", "content": PERSONAS[persona]}
    latest_entry = {"role": "user", "content": message}

    # Keep a quarter of the context free for the reply; the rest is shared by
    # the system prompt, the new message and whatever history still fits.
    context_size = llm.n_ctx()
    budget = (
        context_size
        - context_size // 4
        - _estimate_tokens(system_entry)
        - _estimate_tokens(latest_entry)
    )

    # Walk from the newest turn backwards and stop at the first one that no
    # longer fits, so the retained history is always a contiguous recent tail.
    kept_newest_first = []
    for turn in reversed(_history_turns(history)):
        cost = sum(_estimate_tokens(entry) for entry in turn)
        if cost > budget:
            break
        budget -= cost
        kept_newest_first.append(turn)

    messages = [system_entry]
    for turn in reversed(kept_newest_first):
        messages.extend(turn)
    messages.append(latest_entry)

    output = llm.create_chat_completion(
        messages=messages,
        temperature=0.8,
        top_p=0.95,
        max_tokens=1024,
        repeat_penalty=1.1,
    )

    return output["choices"][0]["message"]["content"]

# Build the UI: persona selector, chat transcript, input box and clear button.
with gr.Blocks() as demo:
    gr.Markdown("# Qwen 4B Chat")

    persona = gr.Dropdown(
        choices=list(PERSONAS),
        value="Assistant",
        label="Persona",
    )
    chatbot = gr.Chatbot(height=600)
    msg = gr.Textbox(placeholder="Type a message...")
    clear = gr.Button("Clear")

    def respond(message, history, persona):
        """Handle a submitted message and return the updated UI state.

        Args:
            message: Text from the input box.
            history: Current chatbot transcript; may be empty or ``None``.
            persona: Persona name selected in the dropdown.

        Returns:
            A ``(textbox_value, transcript)`` pair: an empty string to clear
            the input box, and the transcript with the new exchange appended.
        """
        # Work on a copy so the component's incoming value is never mutated.
        transcript = list(history or [])

        # A blank submission would otherwise run a full generation on an
        # empty prompt and add an empty bubble to the transcript.
        if not message or not message.strip():
            return "", transcript

        answer = chat(message, transcript, persona)
        transcript.append((message, answer))
        return "", transcript

    msg.submit(respond, [msg, chatbot, persona], [msg, chatbot])

    # Reset the transcript to an empty conversation.
    clear.click(lambda: [], outputs=chatbot)

demo.launch()