import gradio as gr
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# One-time startup work: fetch the pretrained GPT-2 tokenizer and weights so
# every request served afterwards reuses the same in-memory objects.
_CHECKPOINT = "gpt2"

print("Loading GPT-2 Small (124M)...")
tokenizer = GPT2Tokenizer.from_pretrained(_CHECKPOINT)
# nn.Module.eval() returns the module itself, so the switch to inference mode
# (dropout disabled) can be chained directly onto the load call.
model = GPT2LMHeadModel.from_pretrained(_CHECKPOINT).eval()
print("Model loaded on CPU!")

def generate_text(
    prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_k: int,
    top_p: float,
    repetition_penalty: float,
    do_sample: bool,
):
    """Continue ``prompt`` with the module-level GPT-2 model.

    Args:
        prompt: Text to continue. ``None``, empty, or whitespace-only input
            short-circuits with a warning message and never reaches the model.
        max_new_tokens: Upper bound on the number of tokens to generate.
        temperature: Sampling temperature; only forwarded when ``do_sample``
            is true.
        top_k: Top-k cutoff; only forwarded when ``do_sample`` is true.
        top_p: Nucleus-sampling cutoff; only forwarded when ``do_sample`` is
            true.
        repetition_penalty: Forwarded to ``model.generate`` in both sampling
            and greedy modes.
        do_sample: ``True`` to sample, ``False`` for greedy decoding.

    Returns:
        The original ``prompt`` followed by the decoded continuation, or a
        warning string when no usable prompt was supplied.
    """
    if not prompt or not prompt.strip():
        return "⚠️ Please enter a prompt!"

    # NOTE(review): UI sliders may deliver numbers as floats -- confirm against
    # the Gradio version in use. Token counts must be integers either way.
    max_new_tokens = int(max_new_tokens)
    top_k = int(top_k)

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # The model can only address a fixed number of positions. Keep the most
    # recent prompt tokens so that prompt + continuation fit, instead of
    # letting generate() index past the position-embedding table.
    context_size = getattr(model.config, "n_positions", 1024)
    max_new_tokens = min(max_new_tokens, context_size - 1)
    prompt_budget = max(1, context_size - max_new_tokens)
    if input_ids.shape[-1] > prompt_budget:
        input_ids = input_ids[:, -prompt_budget:]
        attention_mask = attention_mask[:, -prompt_budget:]

    # Sampling knobs are meaningless for greedy decoding, so only forward them
    # when sampling is actually enabled.
    sampling_kwargs = {}
    if do_sample:
        sampling_kwargs = {
            "temperature": temperature,
            "top_k": top_k,
            "top_p": top_p,
        }

    with torch.no_grad():
        output = model.generate(
            input_ids,
            # pad == eos here, so the mask cannot be inferred from the ids;
            # pass the tokenizer's mask explicitly.
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
            do_sample=do_sample,
            pad_token_id=tokenizer.eos_token_id,
            **sampling_kwargs,
        )

    # Decode only the newly generated tokens; the caller's prompt is echoed
    # back verbatim rather than round-tripped through the tokenizer.
    generated_ids = output[0][input_ids.shape[-1]:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    return prompt + generated_text


# Gradio UI: prompt and generation settings on the left, model output on the
# right, with example prompts and usage tips underneath.
with gr.Blocks(title="Write with GPT-2") as demo:
    # Page header.
    gr.Markdown(
        """
        # ✍️ Write with GPT-2
        **Model:** [openai-community/gpt2](https://huggingface.co/openai-community/gpt2) &nbsp;|&nbsp;
        **Hardware:** CPU only &nbsp;|&nbsp;
        **Space by:** [Tralalabs](https://huggingface.co/Tralalabs)

        > Classic GPT-2 Small running fully on CPU. Enter a prompt and let it continue!
        """
    )

    with gr.Row():
        # Left column: prompt, collapsible settings, and the trigger button.
        with gr.Column(scale=1):
            prompt_input = gr.Textbox(
                lines=4,
                label="Prompt",
                placeholder="Once upon a time...",
            )

            # Settings start collapsed (open=False).
            with gr.Accordion("⚙️ Generation Settings", open=False):
                max_new_tokens = gr.Slider(
                    label="Max New Tokens",
                    minimum=10,
                    maximum=300,
                    step=10,
                    value=100,
                )
                do_sample = gr.Checkbox(
                    label="Sampling (uncheck for greedy decoding)",
                    value=True,
                )
                temperature = gr.Slider(
                    label="Temperature",
                    minimum=0.1,
                    maximum=2.0,
                    step=0.05,
                    value=0.9,
                )
                top_k = gr.Slider(
                    label="Top-K",
                    minimum=0,
                    maximum=100,
                    step=1,
                    value=50,
                )
                top_p = gr.Slider(
                    label="Top-P (nucleus sampling)",
                    minimum=0.1,
                    maximum=1.0,
                    step=0.01,
                    value=0.95,
                )
                repetition_penalty = gr.Slider(
                    label="Repetition Penalty",
                    minimum=1.0,
                    maximum=2.0,
                    step=0.05,
                    value=1.1,
                )

            generate_btn = gr.Button("🚀 Generate", variant="primary")

        # Right column: read-only box that receives the generated text.
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                interactive=False,
                lines=12,
                label="Generated Text",
            )

    # Shared by the example table and the button handler. The order mirrors
    # the positional parameters of generate_text.
    generation_inputs = [
        prompt_input,
        max_new_tokens,
        temperature,
        top_k,
        top_p,
        repetition_penalty,
        do_sample,
    ]

    # Each row supplies one value per entry of generation_inputs, in order.
    example_rows = [
        ["The future of artificial intelligence is", 120, 0.9, 50, 0.95, 1.1, True],
        ["Once upon a time in a land far away,", 150, 1.0, 40, 0.92, 1.2, True],
        ["Scientists recently discovered that", 100, 0.8, 50, 0.95, 1.1, True],
        ["Dear Claude, I wanted to tell you that", 100, 0.95, 60, 0.9, 1.0, True],
    ]

    gr.Examples(
        examples=example_rows,
        inputs=generation_inputs,
        outputs=output_text,
        fn=generate_text,
        cache_examples=False,
    )

    # Clicking the button runs generation and writes the result to output_text.
    generate_btn.click(
        fn=generate_text,
        inputs=generation_inputs,
        outputs=output_text,
    )

    # Footer with usage hints.
    gr.Markdown(
        """
        ---
        **Tips:**
        - Lower temperature (0.5–0.7) → more focused, coherent text
        - Higher temperature (1.0–1.5) → more creative, unpredictable text
        - Greedy decoding (uncheck Sampling) → always picks the most likely next token
        - Repetition Penalty > 1.0 helps avoid loops
        """
    )

# Launch the Gradio app only when this file is run as a script; importing the
# module builds `demo` but does not start it.
if __name__ == "__main__":
    demo.launch()