# Spaces: Sleeping — Hugging Face Spaces status header captured during page
# extraction; kept as a comment so it is not part of the program.
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import gradio as gr
import pandas as pd
from collections import Counter, defaultdict
import os
from huggingface_hub import login

# Hugging Face token from the Space's environment.
# NOTE(review): `api_token` is read but `login(api_token)` is never called;
# confirm whether authentication is actually needed to fetch gpt2-large.
api_token = os.getenv('HF_TOKEN')

# Load pre-trained model and tokenizer once at module import.
model_name = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Prefer the Apple-silicon GPU when present, else fall back to CPU.
# Fix: `torch.has_mps` is deprecated; `torch.backends.mps.is_available()`
# is the supported check.
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model.to(device)
model.eval()
def create_ngrams(tokens, n):
    """Return all consecutive n-grams of *tokens* as a list of tuples."""
    shifted_views = (tokens[offset:] for offset in range(n))
    return list(zip(*shifted_views))
def calculate_probabilities(four_gram_counts, three_gram_counts):
    """Maximum-likelihood estimate P(w4 | w1 w2 w3) = c(w1..w4) / c(w1..w3).

    Returns a nested mapping: 3-gram prefix tuple -> {final token: probability}.
    """
    table = defaultdict(lambda: defaultdict(float))
    for gram, freq in four_gram_counts.items():
        prefix, last = gram[:-1], gram[-1]
        table[prefix][last] = freq / three_gram_counts[prefix]
    return table
def kneser_ney_smoothing(ngram_counts, lower_order_counts, discount=0.75):
    """Interpolated Kneser-Ney smoothed probabilities P(w | context).

    Args:
        ngram_counts: Counter mapping n-gram tuples to frequencies.
        lower_order_counts: Counter mapping (n-1)-gram context tuples to
            frequencies.
        discount: absolute discount subtracted from each observed count.

    Returns:
        Nested mapping: context tuple -> {final token: probability}, covering
        the observed n-grams only.
    """
    # Continuation count of a WORD: in how many distinct n-gram types it
    # appears as the final token. (Bug fixed: the original keyed this Counter
    # by the suffix tuple `ngram[1:]` but looked it up with a single word,
    # so the continuation term was always zero.)
    continuation_counts = Counter()
    # Number of distinct words observed after each context (N1+ in KN terms).
    # (Bug fixed: the original scaled lambda by the size of the whole table
    # instead of the per-context follower count.)
    followers = Counter()
    for ngram in ngram_counts:
        continuation_counts[ngram[-1]] += 1
        followers[ngram[:-1]] += 1
    total_continuations = sum(continuation_counts.values())

    probabilities = defaultdict(lambda: defaultdict(float))
    for ngram, count in ngram_counts.items():
        context = ngram[:-1]
        context_count = lower_order_counts[context]
        discounted_count = max(count - discount, 0)
        # Probability mass reserved for the continuation distribution.
        lambda_factor = (discount / context_count) * followers[context]
        p_continuation = continuation_counts[ngram[-1]] / total_continuations
        probabilities[context][ngram[-1]] = (
            discounted_count / context_count + lambda_factor * p_continuation
        )
    return probabilities
def generate_text_with_probs(initial_context, top_p, max_length, top_k):
    """Generate up to max_length tokens from GPT-2 using top-p (nucleus) sampling.

    Returns (generated_suffix, token_tables): the text produced after
    initial_context, and a list of (label, DataFrame) pairs — one per step —
    showing top-k candidate tokens with their probabilities.

    Uses the module-level `model`, `tokenizer`, and `device`.
    """
    input_ids = tokenizer.encode(initial_context, return_tensors="pt").to(device)
    generated_text = initial_context
    token_tables = []
    token_no = 1
    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_ids=input_ids)
            # Logits for the next position only: shape (1, vocab_size).
            next_token_logits = outputs.logits[:, -1, :]
            # Apply top-p (nucleus) sampling
            sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
            cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
            sorted_indices_to_remove = cumulative_probs > top_p
            # Shift the removal mask right by one so the first token that
            # crosses the top_p threshold is still kept.
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0
            # Mask the removed tokens in-place so softmax renormalises over
            # the surviving nucleus.
            indices_to_remove = sorted_indices[sorted_indices_to_remove]
            next_token_logits[:, indices_to_remove] = -float('Inf')
            probabilities = torch.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probabilities, num_samples=1)
            next_token_prob = probabilities[0, next_token].item()
            next_token_text = tokenizer.decode(next_token.item())
            # Top-k rows use the pre-filter sort order, but report the
            # post-filter (renormalised) probabilities; tokens removed by
            # nucleus filtering therefore show probability 0.
            top_tokens = sorted_indices[0, :top_k]
            top_probs = probabilities[0, top_tokens]
            top_token_probs = [(tokenizer.decode([token.item()]), prob.item()) for token, prob in zip(top_tokens, top_probs)]
            df = pd.DataFrame(top_token_probs, columns=["Token", "Probability"])
            # 1-based row numbering for display.
            df.index = df.index + 1
            token_tables.append((f"{token_no}>> Next token: {next_token_text} (Probability: {next_token_prob:.8f})", df))
            token_no+=1
            input_ids = torch.cat([input_ids, next_token], dim=-1)
            # Stop early on end-of-sequence.
            if next_token.item() == tokenizer.eos_token_id:
                break
    generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    # Return only the newly generated portion, not the prompt.
    return generated_text[len(initial_context):], token_tables
def predict_next_token_ngram(input_text, context_text, max_length):
    """Greedily extend input_text with a 4-gram Kneser-Ney model built from context_text.

    Args:
        input_text: seed text to extend (its last 3 tokens form the first query).
        context_text: corpus used to estimate the 4-gram statistics.
        max_length: maximum number of tokens to append.

    Returns:
        (generated_suffix, token_tables): the text generated beyond input_text
        and a list of (label, DataFrame) top-k prediction tables per step.
    """
    original_text = input_text
    context_tokens = tokenizer.tokenize(context_text)
    four_grams = create_ngrams(context_tokens, 4)
    four_gram_counts = Counter(four_grams)
    three_gram_counts = Counter(gram[:-1] for gram in four_grams)
    # Fix: the unsmoothed MLE table (calculate_probabilities) used to be
    # computed here too, but its result was never read — only the smoothed
    # probabilities drive generation, so the dead call is removed.
    probs = kneser_ney_smoothing(four_gram_counts, three_gram_counts)
    input_tokens = tokenizer.tokenize(input_text)
    token_tables = []
    # Equivalent to the original bound: generation stops once
    # len(input_tokens) reaches its starting length plus max_length.
    target_len = len(input_tokens) + max_length
    if len(input_tokens) >= target_len:
        # Only reachable when max_length <= 0; mirrors the original
        # behaviour of returning the full (unsliced) input text.
        return tokenizer.convert_tokens_to_string(input_tokens), token_tables
    token_no = 1
    while len(input_tokens) < target_len:
        context = tuple(input_tokens[-3:])
        next_token_probs = probs.get(context, {})
        if not next_token_probs:
            break  # unseen 3-gram context: the model cannot extend further
        next_token = max(next_token_probs, key=next_token_probs.get)
        input_tokens.append(next_token)
        top_k = 4
        ranked = sorted(next_token_probs.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
        ranked_df = pd.DataFrame(ranked, columns=["Token", "Probability"])
        ranked_df.index = ranked_df.index + 1  # 1-based numbering for display
        # Render raw BPE tokens as readable text.
        ranked_df["Token"] = ranked_df["Token"].apply(lambda t: tokenizer.convert_tokens_to_string([t]))
        token_tables.append((f"{token_no}>> Next token: {next_token}", ranked_df))
        token_no += 1
    generated_text = tokenizer.convert_tokens_to_string(input_tokens)
    # Return only the newly generated portion, not the seed text.
    return generated_text[len(original_text):], token_tables
def combined_model_predictions(context_text, initial_context, top_p, max_length, top_k):
    """Run the GPT-2 sampler and the n-gram model, bundling both outputs for the UI."""
    llm_text, llm_tables = generate_text_with_probs(initial_context, top_p, max_length, top_k)
    ngram_text, ngram_tables = predict_next_token_ngram(initial_context, context_text, max_length)
    return llm_text, llm_tables, ngram_text, ngram_tables
# Gradio UI: two text inputs (n-gram corpus, LLM prompt) plus sampling
# controls, and paired text/table outputs for each model.
input_widgets = [
    gr.Textbox(lines=4, placeholder="Enter context for N-gram model..."),
    gr.Textbox(lines=2, placeholder="Enter initial context here..."),
    gr.Slider(0, 1, step=0.01, value=0.9, label="Top-p (nucleus) sampling"),
    gr.Slider(1, 100, step=1, value=50, label="Max length"),
    gr.Slider(1, 50, step=1, value=10, label="Top-k"),
]
output_widgets = [
    gr.Textbox(label="Generated Text"),
    gr.Dataframe(label="LLM Token Probabilities"),
    gr.Textbox(label="N-gram Generated Text"),
    gr.Dataframe(label="N-gram Token Predictions"),
]
iface = gr.Interface(
    fn=combined_model_predictions,
    inputs=input_widgets,
    outputs=output_widgets,
    title="Next Token Visualizer (GPT-2-large - 812M param.)",
)
iface.launch()