import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

# ------------------------------------------------------------------
# 1. Model setup
# ------------------------------------------------------------------
MODEL_ID = "michsethowusu/opani-coder_1b-merged-16bit"

print("Loading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

print("Loading model…")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,   # fp16 weights: halves memory vs fp32
    device_map="auto",           # put layers on GPU automatically when one exists
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
print("Model ready!")


# ------------------------------------------------------------------
# 2. Generation helper
# ------------------------------------------------------------------
def generate_response(
    message: str,
    history: list[dict],
    temperature: float,
    top_p: float,
    top_k: int,
    max_tokens: int,
):
    """Stream an assistant reply for *message* given the prior *history*.

    Parameters
    ----------
    message : str
        The newest user message.
    history : list[dict]
        Previous turns in ``{"role": "user"|"assistant", "content": "…"}``
        format (the chat-template "messages" convention).
    temperature, top_p, top_k, max_tokens
        Sampling parameters forwarded to ``model.generate``.

    Yields
    ------
    str
        The accumulated partial assistant reply after each streamed chunk.
    """
    messages = history + [{"role": "user", "content": message}]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Stream decoded text as it is produced; skip the echoed prompt and
    # special tokens so only the assistant's reply comes through.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        do_sample=True,
        streamer=streamer,
    )

    # model.generate() blocks until done, so run it in a worker thread and
    # consume the streamer from this generator as tokens arrive.
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial
    thread.join()
# ------------------------------------------------------------------
# 3. Gradio event helpers
# ------------------------------------------------------------------
def user_submit(user_message, history):
    """Append the user's message to *history* (list of role/content dicts)
    and clear the input textbox."""
    return "", history + [{"role": "user", "content": user_message}]


def bot_respond(history, temperature, top_p, top_k, max_tokens):
    """Stream the assistant's reply into the chat history.

    *history* arrives with the newest user turn as its last element
    (appended by ``user_submit``).  Yields the updated history after each
    streamed chunk so the Chatbot re-renders incrementally.
    """
    user_turn = history[-1]["content"]
    history_before = history[:-1]

    # Fix: the original assigned history[-1] = {assistant turn}, which
    # overwrote the user's just-submitted message.  Append a placeholder
    # assistant turn instead and update its content in place.
    history.append({"role": "assistant", "content": ""})
    for assistant_text in generate_response(
        user_turn, history_before, temperature, top_p, top_k, max_tokens
    ):
        history[-1]["content"] = assistant_text
        yield history


# ------------------------------------------------------------------
# 4. Gradio UI
# ------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🇬🇭 Opani Coder 1B
        A fine-tuned Llama 3.2 1B model (16-bit) for coding assistance in Twi.
        Ask me anything about programming, and I'll help you out!
        """
    )

    chatbot = gr.Chatbot(
        height=500,
        label="Chat History",
        type="messages",  # history is a list of {"role", "content"} dicts
        avatar_images=(
            None,
            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
        ),
    )

    with gr.Row():
        msg = gr.Textbox(
            label="Your Message",
            placeholder="Ask me a coding question…",
            scale=4,
            lines=2,
        )
        submit = gr.Button("Send 🚀", scale=1, variant="primary")

    with gr.Accordion("⚙️ Generation Parameters", open=False):
        gr.Markdown("*Adjust these settings to control the response style*")
        temperature = gr.Slider(0.1, 2.0, 0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, 0.9, step=0.05, label="Top P")
        top_k = gr.Slider(1, 100, 20, step=1, label="Top K")
        max_tokens = gr.Slider(64, 2048, 512, step=64, label="Max Tokens")

    clear = gr.Button("🗑️ Clear Chat")

    # ------------------------------------------------------------------
    # 5. Examples
    # ------------------------------------------------------------------
    gr.Examples(
        examples=[
            ["Meyɛ dɛn na mekyerɛw Python function?"],
            ["Kyerɛkyerɛ nea for loop yɛ"],
            ["Kyerɛw calculator program a ɛnyɛ den"],
            ["Nsonoe bɛn na ɛda list ne tuple ntam?"],
            ["Boa me ma mensiesie saa code yi mu mfomso"],
        ],
        inputs=msg,
        label="Example Questions",
    )

    # ------------------------------------------------------------------
    # 6. Event wiring
    # ------------------------------------------------------------------
    # Submit (Enter key or Send button): first append the user turn
    # synchronously, then stream the bot reply into the chatbot.
    msg.submit(
        user_submit, [msg, chatbot], [msg, chatbot], queue=False
    ).then(
        bot_respond,
        [chatbot, temperature, top_p, top_k, max_tokens],
        chatbot,
    )
    submit.click(
        user_submit, [msg, chatbot], [msg, chatbot], queue=False
    ).then(
        bot_respond,
        [chatbot, temperature, top_p, top_k, max_tokens],
        chatbot,
    )
    clear.click(lambda: None, None, chatbot, queue=False)

    gr.Markdown(
        """
        ---
        ### 💡 Tips for Best Results:
        - **Factual/Technical questions**: temperature 0.3-0.5
        - **Creative coding solutions**: temperature 0.7-1.0
        - **Code generation**: temperature 0.5-0.7

        ### 📝 About This Model
        Fine-tuned Llama 3.2 1B (16-bit full model) for coding assistance in Twi.

        **Model**: [michsethowusu/opani-coder_1b-merged-16bit](https://huggingface.co/michsethowusu/opani-coder_1b-merged-16bit)
        """
    )

# ------------------------------------------------------------------
# 7. Launch
# ------------------------------------------------------------------
if __name__ == "__main__":
    demo.queue().launch()