"""Mini LLM Playground — side-by-side decoding demo.

Loads a small instruct model (Qwen2.5-0.5B-Instruct, CPU-friendly) and serves a
Gradio UI that answers one instruction twice: once with stochastic sampling
(temperature / top-p) and once with deterministic beam search, so the two
decoding strategies can be compared directly.
"""

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Tiny, modern instruct model that can (patiently) run on CPU
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# Load tokenizer + model
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # CPU-safe; on GPU you could use torch.float16/bfloat16
    low_cpu_mem_usage=True,     # helps reduce peak RAM on load
)

# Make sure a pad token exists (avoids warnings on generation)
if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
    tokenizer.pad_token = tokenizer.eos_token

# Wrap with a text-generation pipeline
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Shared hint shown whenever the prompt box is empty/whitespace-only.
_EMPTY_PROMPT_MSG = (
    "Please enter an instruction (e.g., 'Explain why the sky is blue in one short paragraph.')"
)


# --- Decoding functions ---

def generate_sampling(prompt, max_new_tokens=96, temperature=0.6, top_p=0.9,
                      repetition_penalty=1.1, ngram=3):
    """Generate a reply with stochastic (temperature / top-p) sampling.

    Returns the generated text, or a user-facing message string on empty
    input or generation error (the UI renders whatever string comes back).
    """
    if not prompt or not prompt.strip():
        return _EMPTY_PROMPT_MSG
    try:
        # The UI slider allows temperature == 0, which transformers rejects
        # when do_sample=True; treat it as its mathematical limit: greedy.
        sample = float(temperature) > 0.0
        kwargs = dict(
            max_new_tokens=int(max_new_tokens),
            do_sample=sample,
            repetition_penalty=float(repetition_penalty),
            no_repeat_ngram_size=int(ngram),
            return_full_text=False,
        )
        if sample:
            kwargs["temperature"] = float(temperature)
            kwargs["top_p"] = float(top_p)
        out = pipe(prompt.strip(), **kwargs)
        return out[0]["generated_text"]
    except Exception as e:
        return f"⚠️ Sampling error: {e}"


def generate_deterministic(prompt, max_new_tokens=96, num_beams=4,
                           length_penalty=0.9, ngram=3):
    """Generate a reply with deterministic beam search.

    do_sample=False is passed explicitly: instruct checkpoints often ship a
    generation_config with sampling enabled, which would otherwise make this
    "deterministic" path sample. Returns generated text or a message string.
    """
    if not prompt or not prompt.strip():
        return _EMPTY_PROMPT_MSG
    try:
        out = pipe(
            prompt.strip(),
            max_new_tokens=int(max_new_tokens),
            do_sample=False,  # override any sampling defaults in the model's generation_config
            num_beams=int(num_beams),
            early_stopping=True,
            length_penalty=float(length_penalty),
            no_repeat_ngram_size=int(ngram),
            return_full_text=False,
        )
        return out[0]["generated_text"]
    except Exception as e:
        return f"⚠️ Deterministic error: {e}"


def generate_both(prompt, s_max_new=96, s_temp=0.6, s_topp=0.9, s_rep=1.1,
                  s_ngram=3, d_max_new=96, d_beams=4, d_lenpen=0.9, d_ngram=3):
    """Run both decoders on the same prompt; returns (sampling, deterministic)."""
    sampling = generate_sampling(prompt, s_max_new, s_temp, s_topp, s_rep, s_ngram)
    deterministic = generate_deterministic(prompt, d_max_new, d_beams, d_lenpen, d_ngram)
    return sampling, deterministic


with gr.Blocks(fill_height=True, analytics_enabled=False) as demo:
    gr.Markdown(
        "# 🧪 Mini LLM Playground — Side-by-Side Decoding\n"
        "Enter one instruction below. The app generates **two answers** using:\n"
        "- **Sampling** (left): temperature & top-p for creativity\n"
        "- **Deterministic** (right): beam search for stability\n\n"
        "_Tip: keep outputs short on CPU (≤ 96 tokens). This is an educational demo; it may be incorrect._"
    )

    with gr.Row():
        prompt = gr.Textbox(
            label="Instruction",
            lines=4,
            placeholder="Explain in one short paragraph: Why is the sky blue?"
        )

    with gr.Row():
        # Left column: Sampling controls + output
        with gr.Column():
            gr.Markdown("### 🎲 Sampling (temperature / top-p)")
            with gr.Row():
                s_max_new = gr.Slider(32, 192, value=96, step=8, label="Max new tokens")
            with gr.Row():
                s_temp = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="Temperature")
                s_topp = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            with gr.Row():
                s_rep = gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repetition penalty")
                s_ngram = gr.Slider(0, 6, value=3, step=1, label="no_repeat_ngram_size")
            sampling_out = gr.Textbox(label="Sampling output", lines=10)

        # Right column: Deterministic controls + output
        with gr.Column():
            gr.Markdown("### 🧭 Deterministic (beam search)")
            with gr.Row():
                d_max_new = gr.Slider(32, 192, value=96, step=8, label="Max new tokens")
            with gr.Row():
                d_beams = gr.Slider(1, 8, value=4, step=1, label="Num beams")
                d_lenpen = gr.Slider(0.6, 1.4, value=0.9, step=0.05, label="Length penalty")
            with gr.Row():
                d_ngram = gr.Slider(0, 6, value=3, step=1, label="no_repeat_ngram_size")
            deterministic_out = gr.Textbox(label="Deterministic output", lines=10)

    with gr.Row():
        run_both = gr.Button("Generate Both", variant="primary")
        run_left = gr.Button("Generate Left Only (Sampling)")
        run_right = gr.Button("Generate Right Only (Deterministic)")

    # Wire buttons
    run_both.click(
        fn=generate_both,
        inputs=[prompt, s_max_new, s_temp, s_topp, s_rep, s_ngram,
                d_max_new, d_beams, d_lenpen, d_ngram],
        outputs=[sampling_out, deterministic_out]
    )
    run_left.click(
        fn=generate_sampling,
        inputs=[prompt, s_max_new, s_temp, s_topp, s_rep, s_ngram],
        outputs=sampling_out
    )
    run_right.click(
        fn=generate_deterministic,
        inputs=[prompt, d_max_new, d_beams, d_lenpen, d_ngram],
        outputs=deterministic_out
    )

    gr.Markdown(
        "#### Compare & Contrast (discussion prompts)\n"
        "- Which side feels **more factual** or **more concise**?\n"
        "- Which side feels **more varied** or **more creative**?\n"
        "- For a study guide, which would you pick? For brainstorming?\n"
    )

if __name__ == "__main__":
    demo.launch()