"""Mini LLM Playground — side-by-side decoding demo.

Loads a small instruct model (Qwen2.5-0.5B-Instruct, CPU-friendly) and serves a
Gradio UI that answers one instruction twice: once with stochastic sampling
(temperature / top-p) and once with deterministic beam search, so the two
decoding strategies can be compared directly.
"""

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Tiny, modern instruct model that can (patiently) run on CPU
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# Load tokenizer + model
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # CPU-safe; on GPU you could use torch.float16/bfloat16
    low_cpu_mem_usage=True,     # helps reduce peak RAM on load
)

# Make sure a pad token exists (avoids warnings on generation)
if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
    tokenizer.pad_token = tokenizer.eos_token

# Wrap with a text-generation pipeline
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Shared hint shown whenever the prompt box is empty/whitespace-only.
_EMPTY_PROMPT_MSG = (
    "Please enter an instruction (e.g., 'Explain why the sky is blue in one short paragraph.')"
)


# --- Decoding functions ---

def generate_sampling(prompt, max_new_tokens=96, temperature=0.6, top_p=0.9,
                      repetition_penalty=1.1, ngram=3):
    """Generate a reply with stochastic (temperature / top-p) sampling.

    Returns the generated text, or a user-facing message string on empty
    input or generation error (the UI renders whatever string comes back).
    """
    if not prompt or not prompt.strip():
        return _EMPTY_PROMPT_MSG
    try:
        # The UI slider allows temperature == 0, which transformers rejects
        # when do_sample=True; treat it as its mathematical limit: greedy.
        sample = float(temperature) > 0.0
        kwargs = dict(
            max_new_tokens=int(max_new_tokens),
            do_sample=sample,
            repetition_penalty=float(repetition_penalty),
            no_repeat_ngram_size=int(ngram),
            return_full_text=False,
        )
        if sample:
            kwargs["temperature"] = float(temperature)
            kwargs["top_p"] = float(top_p)
        out = pipe(prompt.strip(), **kwargs)
        return out[0]["generated_text"]
    except Exception as e:
        return f"⚠️ Sampling error: {e}"


def generate_deterministic(prompt, max_new_tokens=96, num_beams=4,
                           length_penalty=0.9, ngram=3):
    """Generate a reply with deterministic beam search.

    do_sample=False is passed explicitly: instruct checkpoints often ship a
    generation_config with sampling enabled, which would otherwise make this
    "deterministic" path sample. Returns generated text or a message string.
    """
    if not prompt or not prompt.strip():
        return _EMPTY_PROMPT_MSG
    try:
        out = pipe(
            prompt.strip(),
            max_new_tokens=int(max_new_tokens),
            do_sample=False,  # override any sampling defaults in the model's generation_config
            num_beams=int(num_beams),
            early_stopping=True,
            length_penalty=float(length_penalty),
            no_repeat_ngram_size=int(ngram),
            return_full_text=False,
        )
        return out[0]["generated_text"]
    except Exception as e:
        return f"⚠️ Deterministic error: {e}"


def generate_both(prompt, s_max_new=96, s_temp=0.6, s_topp=0.9, s_rep=1.1,
                  s_ngram=3, d_max_new=96, d_beams=4, d_lenpen=0.9, d_ngram=3):
    """Run both decoders on the same prompt; returns (sampling, deterministic)."""
    sampling = generate_sampling(prompt, s_max_new, s_temp, s_topp, s_rep, s_ngram)
    deterministic = generate_deterministic(prompt, d_max_new, d_beams, d_lenpen, d_ngram)
    return sampling, deterministic


with gr.Blocks(fill_height=True, analytics_enabled=False) as demo:
    gr.Markdown(
        "# 🧪 Mini LLM Playground — Side-by-Side Decoding\n"
        "Enter one instruction below. The app generates **two answers** using:\n"
        "- **Sampling** (left): temperature & top-p for creativity\n"
        "- **Deterministic** (right): beam search for stability\n\n"
        "_Tip: keep outputs short on CPU (≤ 96 tokens). This is an educational demo; it may be incorrect._"
    )

    with gr.Row():
        prompt = gr.Textbox(
            label="Instruction",
            lines=4,
            placeholder="Explain in one short paragraph: Why is the sky blue?"
        )

    with gr.Row():
        # Left column: Sampling controls + output
        with gr.Column():
            gr.Markdown("### 🎲 Sampling (temperature / top-p)")
            with gr.Row():
                s_max_new = gr.Slider(32, 192, value=96, step=8, label="Max new tokens")
            with gr.Row():
                s_temp = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="Temperature")
                s_topp = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            with gr.Row():
                s_rep = gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repetition penalty")
                s_ngram = gr.Slider(0, 6, value=3, step=1, label="no_repeat_ngram_size")
            sampling_out = gr.Textbox(label="Sampling output", lines=10)

        # Right column: Deterministic controls + output
        with gr.Column():
            gr.Markdown("### 🧭 Deterministic (beam search)")
            with gr.Row():
                d_max_new = gr.Slider(32, 192, value=96, step=8, label="Max new tokens")
            with gr.Row():
                d_beams = gr.Slider(1, 8, value=4, step=1, label="Num beams")
                d_lenpen = gr.Slider(0.6, 1.4, value=0.9, step=0.05, label="Length penalty")
            with gr.Row():
                d_ngram = gr.Slider(0, 6, value=3, step=1, label="no_repeat_ngram_size")
            deterministic_out = gr.Textbox(label="Deterministic output", lines=10)

    with gr.Row():
        run_both = gr.Button("Generate Both", variant="primary")
        run_left = gr.Button("Generate Left Only (Sampling)")
        run_right = gr.Button("Generate Right Only (Deterministic)")

    # Wire buttons
    run_both.click(
        fn=generate_both,
        inputs=[prompt, s_max_new, s_temp, s_topp, s_rep, s_ngram,
                d_max_new, d_beams, d_lenpen, d_ngram],
        outputs=[sampling_out, deterministic_out]
    )
    run_left.click(
        fn=generate_sampling,
        inputs=[prompt, s_max_new, s_temp, s_topp, s_rep, s_ngram],
        outputs=sampling_out
    )
    run_right.click(
        fn=generate_deterministic,
        inputs=[prompt, d_max_new, d_beams, d_lenpen, d_ngram],
        outputs=deterministic_out
    )

    gr.Markdown(
        "#### Compare & Contrast (discussion prompts)\n"
        "- Which side feels **more factual** or **more concise**?\n"
        "- Which side feels **more varied** or **more creative**?\n"
        "- For a study guide, which would you pick? For brainstorming?\n"
    )

if __name__ == "__main__":
    demo.launch()