# transE / app.py
# (Hugging Face Space page header retained as a comment — "amasood's picture /
# Update app.py / bc02f8a verified" was page chrome, not Python code.)
import streamlit as st
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from transformers import AutoModel, AutoTokenizer, pipeline, AutoModelForCausalLM
# ---- Page header ----
APP_TITLE = "πŸš€ Transformer Model Explorer"
APP_BLURB = """
Explore different transformer models, their architectures, tokenization, and attention mechanisms.
"""

st.title(APP_TITLE)
st.markdown(APP_BLURB)

# ---- Model picker ----
# Checkpoints the demo knows how to display; the selection drives every
# section below.
AVAILABLE_MODELS = ["bert-base-uncased", "gpt2", "t5-small", "roberta-base"]
model_name = st.selectbox("Choose a Transformer Model:", AVAILABLE_MODELS)
# Load Tokenizer & Model
st.write(f"Loading model: `{model_name}`...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Display Model Details
st.subheader("πŸ›  Model Details")
st.write(f"Model Type: `{model.config.model_type}`")
# Not every config exposes these attributes (the original guarded only the
# attention-head count but read num_hidden_layers unguarded, which can raise
# AttributeError on configs without that mapping). Guard both consistently.
st.write(f"Number of Layers: `{getattr(model.config, 'num_hidden_layers', 'N/A')}`")
st.write(f"Number of Attention Heads: `{getattr(model.config, 'num_attention_heads', 'N/A')}`")
st.write(f"Total Parameters: `{sum(p.numel() for p in model.parameters())/1e6:.2f}M`")
# Model Size Comparison
st.subheader("πŸ“Š Model Size Comparison")
# Approximate parameter counts (in millions) for the selectable checkpoints.
size_table = pd.DataFrame(
    {
        "Model": ["bert-base-uncased", "gpt2", "t5-small", "roberta-base"],
        "Size (Million Parameters)": [110, 124, 60, 125],
    }
)
st.plotly_chart(
    px.bar(size_table, x="Model", y="Size (Million Parameters)", title="Model Size Comparison")
)
# Tokenization Section
st.subheader("πŸ“ Tokenization Visualization")
# input_text and tokens are reused by the embedding/attention sections below.
input_text = st.text_input("Enter Text:", "Hello, how are you?")
# NOTE: tokenize() returns subword pieces only — special tokens such as
# [CLS]/[SEP] are NOT included (the model input built later does include them).
tokens = tokenizer.tokenize(input_text)
st.write("Tokenized Output:", tokens)
# Token Embeddings Visualization (PCA projection of the last hidden states)
st.subheader("🧩 Token Embeddings Visualization")
with torch.no_grad():
    inputs = tokenizer(input_text, return_tensors="pt")
    outputs = model(**inputs)

if hasattr(outputs, "last_hidden_state"):
    embeddings = outputs.last_hidden_state.squeeze(0).numpy()
    # BUG FIX: the hidden states cover the model's full input, which includes
    # special tokens ([CLS]/[SEP]/<s>/</s>) that tokenizer.tokenize() omits.
    # Trimming both lists to a common length (as before) silently shifted every
    # label by the leading special token. Label each row with the token that
    # was actually fed to the model instead.
    point_labels = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    n_points = min(len(point_labels), embeddings.shape[0])
    embeddings = embeddings[:n_points]
    point_labels = point_labels[:n_points]
    if n_points >= 2:  # PCA(n_components=2) needs at least two samples
        pca = PCA(n_components=2)
        reduced_embeddings = pca.fit_transform(embeddings)
        df_embeddings = pd.DataFrame(reduced_embeddings, columns=["PCA1", "PCA2"])
        df_embeddings["Token"] = point_labels
        fig = px.scatter(df_embeddings, x="PCA1", y="PCA2", text="Token",
                         title="Token Embeddings (PCA Projection)")
        st.plotly_chart(fig)
    else:
        st.info("Enter at least two tokens to project embeddings with PCA.")
# Attention Visualization (for BERT & RoBERTa models)
if "bert" in model_name or "roberta" in model_name:
    st.subheader("πŸ” Attention Map")
    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)
    # Last layer's attention; squeeze the batch dim -> (heads, seq, seq).
    attention = outputs.attentions[-1].squeeze(0).detach().numpy()
    # BUG FIX: the attention matrix spans the model's full input sequence
    # (including special tokens), but the previous tick labels came from
    # tokenizer.tokenize(), which omits them — so axes were mislabeled and
    # could mismatch the matrix size. Derive labels from the actual input ids.
    seq_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    fig, ax = plt.subplots(figsize=(10, 5))
    # attention[0] = first attention head of the last layer.
    sns.heatmap(attention[0], cmap="viridis", xticklabels=seq_tokens, yticklabels=seq_tokens, ax=ax)
    st.pyplot(fig)
# Text Generation Demo (for GPT-like models)
if "gpt" in model_name:
    st.subheader("✍️ Text Generation & Token Probabilities")
    # BUG FIX: with return_tensors=True the pipeline returns token ids under
    # the key "generated_token_ids" — there is no "generated_text" key in that
    # mode, so the lookup below raised KeyError. Request decoded text instead.
    generator = pipeline("text-generation", model=model_name, return_full_text=False)
    generated_output = generator(input_text, max_length=50)
    st.write("Generated Output:", generated_output[0]["generated_text"])

    # Token Probability Visualization: next-token distribution for the prompt.
    model_gen = AutoModelForCausalLM.from_pretrained(model_name)
    with torch.no_grad():
        # Use a separate name so we don't clobber `inputs` used elsewhere.
        gen_inputs = tokenizer(input_text, return_tensors="pt")
        # Logits for the position after the last prompt token.
        logits = model_gen(**gen_inputs).logits[:, -1, :]
    probs = torch.nn.functional.softmax(logits, dim=-1).squeeze().numpy()
    top_tokens = np.argsort(probs)[-10:][::-1]  # Top 10 tokens
    token_probs = {tokenizer.decode([idx]): probs[idx] for idx in top_tokens}
    df_probs = pd.DataFrame(token_probs.items(), columns=["Token", "Probability"])
    fig_prob = px.bar(df_probs, x="Token", y="Probability", title="Top Token Predictions")
    st.plotly_chart(fig_prob)

st.markdown("πŸ’‘ *Explore more about Transformer models!*")