import time
from typing import Dict, Tuple

import gradio as gr
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# =============================================================================
# LLM Decoding Strategy Analyzer
# =============================================================================
# This application demonstrates 5 different text generation decoding strategies
# using GPT-2, allowing users to compare outputs side-by-side.
#
# Research Foundation:
# - Holtzman et al. (2019) "The Curious Case of Neural Text Degeneration"
# https://arxiv.org/abs/1904.09751
# - Meister et al. (2020) "If beam search is the answer, what was the question?"
# https://arxiv.org/abs/2010.02650
# - Basu et al. (2020) "Mirostat: A Neural Text Decoding Algorithm"
# https://arxiv.org/abs/2007.14966
# =============================================================================
# ----- Model Loading -----
# Load GPT-2 once at import time so every request shares the same weights.
print("Loading GPT-2 model...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 ships without a pad token; reuse EOS so generate() can pad safely.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")
model = model.to(device)
model.eval()  # inference only — disables dropout
print(f"✅ Model loaded on {device}")
# ----- Strategy Information -----
def _strategy_entry(name: str, description: str, params: str) -> dict:
    """Build one display-metadata record for a decoding strategy."""
    return {"name": name, "description": description, "params": params}


# UI metadata per strategy: display name, one-line explanation, and the
# generate() parameters the strategy corresponds to. Keys must match the
# strategy identifiers used by the generation functions below.
STRATEGY_INFO = {
    "greedy": _strategy_entry(
        "Greedy Decoding",
        "Always selects the highest probability token. Deterministic but often repetitive.",
        "do_sample=False",
    ),
    "beam": _strategy_entry(
        "Beam Search",
        "Explores multiple hypotheses simultaneously. Deterministic, better than greedy but still conservative.",
        "num_beams=5, no_repeat_ngram_size=2",
    ),
    "top_k": _strategy_entry(
        "Top-K Sampling",
        "Randomly samples from the K most likely tokens. Adds variety but K is fixed regardless of distribution.",
        "top_k=50, temperature=1.0",
    ),
    "top_p": _strategy_entry(
        "Top-P (Nucleus) Sampling",
        "Samples from the smallest set of tokens whose cumulative probability exceeds P. Adapts to distribution shape.",
        "top_p=0.95, temperature=1.0",
    ),
    "temperature": _strategy_entry(
        "Temperature + Top-P",
        "Scales logits before sampling. Lower temperature = more focused. Combined with top-p for quality.",
        "temperature=0.7, top_p=0.95",
    ),
}
# ----- Generation Functions -----
def generate_with_strategy(prompt: str, strategy: str, max_new_tokens: int = 100) -> Tuple[str, float]:
    """Generate a continuation of *prompt* with one named decoding strategy.

    Args:
        prompt: Text for the model to continue.
        strategy: One of "greedy", "beam", "top_k", "top_p", "temperature".
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        Tuple of (generated_text, elapsed_seconds): the continuation only
        (prompt stripped off) and the wall-clock generation time.

    Raises:
        KeyError: If *strategy* is not a recognized strategy name.
    """
    # Keyword arguments forwarded to model.generate() for each strategy.
    strategy_kwargs = {
        "greedy": dict(do_sample=False, num_beams=1),
        "beam": dict(do_sample=False, num_beams=5, early_stopping=True, no_repeat_ngram_size=2),
        "top_k": dict(do_sample=True, top_k=50, top_p=1.0, temperature=1.0),
        "top_p": dict(do_sample=True, top_k=0, top_p=0.95, temperature=1.0),
        "temperature": dict(do_sample=True, top_k=0, top_p=0.95, temperature=0.7),
    }[strategy]

    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = encoded["input_ids"].shape[1]

    # Synchronize around the timed region so asynchronous CUDA kernels are
    # actually counted in the measurement.
    if device.type == "cuda":
        torch.cuda.synchronize()
    tic = time.perf_counter()
    with torch.no_grad():
        sequences = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            **strategy_kwargs,
        )
    if device.type == "cuda":
        torch.cuda.synchronize()
    toc = time.perf_counter()

    # Keep only the newly generated tokens (drop the echoed prompt).
    continuation = tokenizer.decode(sequences[0][prompt_len:], skip_special_tokens=True)
    return continuation, toc - tic
def generate_all_strategies(prompt: str, max_new_tokens: int = 100) -> Dict[str, Dict]:
    """Run every decoding strategy on *prompt* and collect per-strategy stats.

    Args:
        prompt: Text for the model to continue.
        max_new_tokens: Upper bound on tokens generated per strategy.

    Returns:
        Mapping from strategy id to a dict with keys "text", "time" (seconds),
        "tokens", and "tokens_per_second".
    """
    results: Dict[str, Dict] = {}
    for name in ("greedy", "beam", "top_k", "top_p", "temperature"):
        text, elapsed = generate_with_strategy(prompt, name, max_new_tokens)
        # NOTE(review): the count comes from re-encoding the decoded text,
        # which may differ slightly from the raw number of generated tokens.
        n_tokens = len(tokenizer.encode(text))
        results[name] = {
            "text": text,
            "time": elapsed,
            "tokens": n_tokens,
            "tokens_per_second": (n_tokens / elapsed) if elapsed > 0 else 0,
        }
    return results
def run_all_strategies(prompt: str, max_tokens: int) -> tuple:
    """Run all 5 decoding strategies and format outputs for the Gradio UI.

    Args:
        prompt: User-supplied prompt; blank input short-circuits with a warning.
        max_tokens: Max new tokens per strategy (slider value, coerced to int).

    Returns:
        A 6-tuple of Markdown strings: one panel per strategy (greedy, beam,
        top_k, top_p, temperature) followed by a performance-summary table.
        On blank input or error, the same message fills all 6 slots.
    """
    if not prompt.strip():
        # Every output component gets the same warning.
        return ("⚠️ Please enter a prompt.",) * 6
    try:
        results = generate_all_strategies(prompt, max_new_tokens=int(max_tokens))
        outputs = []
        summary_lines = ["| Strategy | Time | Tokens | Speed |", "|---|---|---|---|"]
        for strategy in ["greedy", "beam", "top_k", "top_p", "temperature"]:
            data = results[strategy]
            info = STRATEGY_INFO[strategy]
            # One Markdown panel per strategy: header, params, stats, text.
            output_text = f"**{info['name']}**\n\n"
            output_text += f"Parameters: `{info['params']}`\n\n"
            output_text += (
                f"⏱️ {data['time']:.2f}s | 📝 {data['tokens']} tokens | "
                f"⚡ {data['tokens_per_second']:.1f} tok/s\n\n"
            )
            output_text += "---\n\n"
            output_text += f"{data['text']}"
            outputs.append(output_text)
            summary_lines.append(
                f"| {info['name']} | {data['time']:.2f}s | {data['tokens']} | {data['tokens_per_second']:.1f} tok/s |"
            )
        outputs.append("\n".join(summary_lines))
        return tuple(outputs)
    except Exception as e:  # UI boundary: surface any failure to the user
        return (f"❌ Error: {e}",) * 6
# ----- Gradio Interface -----
# Layout: prompt + slider on top, five strategy panels below, then a
# performance-summary table and a collapsible explanation section.
demo = gr.Blocks(theme=gr.themes.Soft())
with demo:
    gr.Markdown("""
# 🔬 LLM Decoding Strategy Analyzer
Compare 5 text generation decoding strategies side-by-side using GPT-2.
**Research Foundation:**
- Holtzman et al. (2019) - [The Curious Case of Neural Text Degeneration](https://arxiv.org/abs/1904.09751)
- Meister et al. (2020) - [If beam search is the answer, what was the question?](https://arxiv.org/abs/2010.02650)
""")
    with gr.Row():
        with gr.Column(scale=3):
            prompt_input = gr.Textbox(
                label="Enter your prompt",
                placeholder="In a distant galaxy, a lone astronaut discovered",
                lines=2
            )
        with gr.Column(scale=1):
            max_tokens_slider = gr.Slider(
                minimum=20,
                maximum=200,
                value=100,
                step=10,
                label="Max New Tokens"
            )
    generate_btn = gr.Button("🚀 Generate All", variant="primary")
    gr.Examples(
        examples=[
            ["In a distant galaxy, a lone astronaut discovered"],
            ["The secret to happiness is"],
            ["In the year 2050, artificial intelligence"],
            ["She opened the ancient book and read the first line:"],
            ["The most important scientific discovery of the century was"],
        ],
        inputs=prompt_input,
        label="Example Prompts"
    )
    gr.Markdown("---")
    gr.Markdown("## 📊 Generation Results")
    gr.Markdown("### Deterministic Methods")
    with gr.Row():
        greedy_output = gr.Markdown(label="Greedy Decoding")
        beam_output = gr.Markdown(label="Beam Search")
    gr.Markdown("### Stochastic Sampling Methods")
    with gr.Row():
        topk_output = gr.Markdown(label="Top-K Sampling")
        topp_output = gr.Markdown(label="Top-P (Nucleus) Sampling")
        temp_output = gr.Markdown(label="Temperature + Top-P")
    gr.Markdown("### ⏱️ Performance Summary")
    summary_output = gr.Markdown()
    with gr.Accordion("📚 Strategy Explanations", open=False):
        gr.Markdown("""
| Strategy | How It Works | Pros | Cons |
|----------|--------------|------|------|
| **Greedy** | Always picks highest probability token | Fast, deterministic | Repetitive, boring |
| **Beam Search** | Tracks top-k hypotheses simultaneously | More coherent than greedy | Still conservative, slow |
| **Top-K Sampling** | Samples from K most likely tokens | Adds creativity | Fixed K ignores distribution shape |
| **Top-P (Nucleus)** | Samples from smallest set with cumulative prob ≥ p | Adapts to context | Slightly slower |
| **Temperature + Top-P** | Scales logits then applies top-p | Best quality/creativity balance | Requires tuning |
**Key Insight:** Deterministic methods (greedy, beam) maximize probability but produce dull text.
Sampling methods introduce controlled randomness for more human-like output.
""")
    # Wire the button: one click fans out to all five panels plus the summary.
    generate_btn.click(
        fn=run_all_strategies,
        inputs=[prompt_input, max_tokens_slider],
        outputs=[greedy_output, beam_output, topk_output, topp_output, temp_output, summary_output]
    )

if __name__ == "__main__":
    # 0.0.0.0:7860 is the standard Hugging Face Spaces binding.
    demo.launch(server_name="0.0.0.0", server_port=7860)