# app.py — commit c7e22f6 (uploaded by ollibolli)
import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.preprocessing import normalize
from umap import UMAP
import os
import json
# Model configuration
MODEL_ID = "openai-community/gpt2"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and tokenizer (module-level side effect: downloads/loads at import time).
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# NOTE(review): `dtype=` is only accepted by recent transformers releases; older
# versions spell this kwarg `torch_dtype=` — confirm against the pinned version.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype=torch.float32).to(device).eval()

# Vocabulary info. `tokenizer.vocab_size` is the base vocabulary; it excludes
# tokens added after pretraining (len(tokenizer) would include them) — presumably
# intentional so ids align with the embedding matrix rows; verify.
vocab_size = tokenizer.vocab_size if tokenizer.vocab_size is not None else len(tokenizer)
vocab_tokens = tokenizer.convert_ids_to_tokens(list(range(vocab_size)))

# Caches populated once at startup by initialize_embeddings().
embeddings_cache = None        # [V, d] float32 input-embedding matrix
umap_projections_cache = None  # [V, 2] float32 2D layout

# Where a precomputed UMAP layout is loaded from / saved to (overridable via env).
UMAP_LAYOUT_PATH = os.environ.get("UMAP_LAYOUT_PATH", os.path.join("assets", "umap_gpt2_cosine.npy"))

# Parameters used both to compute the layout and echoed back to API clients.
umap_params = {
    "n_neighbors": 75,
    "min_dist": 0.15,
    "metric": "cosine",
    "random_state": 0,
    "n_components": 2,
}
def initialize_embeddings():
    """Compute (or load) the token-embedding matrix and its 2D UMAP layout.

    Populates the module-level ``embeddings_cache`` ([V, d] float32 input
    embeddings) and ``umap_projections_cache`` ([V, 2] float32 layout).
    A precomputed layout at ``UMAP_LAYOUT_PATH`` is preferred; on any
    load/shape failure the layout is recomputed from the L2-normalized
    embeddings and saved back (best-effort) for future cold starts.

    Returns:
        tuple: (embeddings_cache, umap_projections_cache)
    """
    global embeddings_cache, umap_projections_cache
    # Detach to CPU float32 once; this matrix is reused by every endpoint.
    embeddings_cache = (
        model.get_input_embeddings().weight.detach().cpu().numpy().astype(np.float32)
    )  # [V, d]
    # Cosine-metric UMAP works on unit-norm rows.
    norm_embeds = normalize(embeddings_cache, norm="l2", axis=1)
    # Fast path: reuse a precomputed layout if it matches the vocabulary.
    try:
        if os.path.isfile(UMAP_LAYOUT_PATH):
            umap_projections_cache = np.load(UMAP_LAYOUT_PATH)
            if (
                umap_projections_cache.shape[0] != norm_embeds.shape[0]
                or umap_projections_cache.shape[1] != 2
            ):
                raise ValueError("Precomputed UMAP layout shape mismatch; recomputing.")
            return embeddings_cache, umap_projections_cache
    except Exception as e:
        # Corrupt or mismatched file: fall through and recompute below.
        print(f"Warning: failed to load precomputed UMAP layout: {e}. Recomputing...")
    # Slow path: project the full vocabulary directly (no PCA pre-step).
    umap_model = UMAP(**umap_params)
    umap_projections_cache = umap_model.fit_transform(norm_embeds).astype(np.float32)  # [V, 2]
    # Best-effort save so the next cold start takes the fast path.
    try:
        layout_dir = os.path.dirname(UMAP_LAYOUT_PATH)
        # BUGFIX: dirname is "" for a bare filename (env override) and
        # os.makedirs("") raises FileNotFoundError, aborting the save.
        if layout_dir:
            os.makedirs(layout_dir, exist_ok=True)
        np.save(UMAP_LAYOUT_PATH, umap_projections_cache)
    except Exception as e:
        print(f"Warning: failed to save UMAP layout to {UMAP_LAYOUT_PATH}: {e}")
    return embeddings_cache, umap_projections_cache
# Compute embeddings/UMAP once at import so the first request is fast.
initialize_embeddings()
def nice_tok(tok: str) -> str:
    """Render a raw BPE/SentencePiece token for display.

    Replaces the GPT-2 space marker ("Ġ") and the SentencePiece space
    marker ("▁") with a literal space.
    """
    for marker in ("Ġ", "▁"):
        tok = tok.replace(marker, " ")
    return tok
@torch.no_grad()
def get_next_token_probs(text: str | None):
    """Return next-token *logits* for the given context.

    Note: despite the name, this returns raw logits; callers apply
    softmax/log_softmax themselves.

    Args:
        text: Context string. If None or empty, a single BOS (or EOS)
            token is fed in to obtain an "unconditional" distribution.

    Returns:
        torch.Tensor: [V] logits for the next token position.

    Raises:
        ValueError: empty context requested but the tokenizer defines
            neither a BOS nor an EOS token.
    """
    if text and len(text) > 0:
        enc = tokenizer(text, return_tensors="pt").to(device)
        out = model(**enc, return_dict=True)
        logits = out.logits[0, -1, :]
    else:
        # BUGFIX: explicit `is None` checks — the previous `bos or eos`
        # expression would skip a valid BOS token id of 0 (falsy).
        bos_id = tokenizer.bos_token_id
        if bos_id is None:
            bos_id = tokenizer.eos_token_id
        if bos_id is None:
            raise ValueError("Tokenizer has neither BOS nor EOS.")
        input_ids = torch.tensor([[bos_id]], device=device)
        out = model(input_ids=input_ids, return_dict=True)
        logits = out.logits[0, -1, :]
    return logits
@torch.no_grad()
def predict_comprehensive(
    text: str,
    text2: str = "",
    top_k: int = 0,
    include_embeddings: bool = False,
    include_layout: bool = False,
    include_unconditional: bool = False,
    use_logprobs: bool = True
):
    """Main API endpoint: next-token distributions plus optional extras.

    Args:
        text: Primary context string.
        text2: Optional second context for side-by-side comparison.
        top_k: If > 0, also return the top-k tokens for the primary context.
        include_embeddings: Include a small sample of the embedding matrix.
        include_layout: Include the full-vocab 2D UMAP projections.
        include_unconditional: Also return the BOS-conditioned distribution.
        use_logprobs: Additionally return log-probabilities.

    Returns:
        dict: JSON-serializable response with model info, the (cleaned and
        raw) vocabulary, full-vocab probabilities, and any requested extras.
        Full-vocab lists are large (~50k floats each).
    """
    result = {
        "model": MODEL_ID,
        "vocab_size": vocab_size,
        "device": device
    }
    # Primary-context distribution over the whole vocabulary.
    # get_next_token_probs returns raw logits; softmax here.
    logits = get_next_token_probs(text)
    probs = torch.softmax(logits, dim=-1).cpu().numpy()
    if use_logprobs:
        log_probs = torch.log_softmax(logits, dim=-1).cpu().numpy()
    else:
        log_probs = None
    # Vocabulary always included so clients can map indices to tokens.
    result["vocab"] = {
        "tokens": [nice_tok(t) for t in vocab_tokens],
        "raw_tokens": vocab_tokens,
        "size": vocab_size
    }
    # Primary context probabilities
    result["probs"] = probs.tolist()
    if log_probs is not None:
        result["logprobs"] = log_probs.tolist()
    # Top-k tokens if requested (k clamped to vocab size).
    if top_k and top_k > 0:
        vals, idxs = torch.topk(torch.from_numpy(probs), k=min(top_k, len(probs)))
        idxs = idxs.tolist()
        vals = vals.tolist()
        result["topk"] = {
            "ids": idxs,
            "tokens": [nice_tok(vocab_tokens[i]) for i in idxs],
            "probs": vals
        }
        if log_probs is not None:
            result["topk"]["logprobs"] = [float(log_probs[i]) for i in idxs]
    # Second text context if provided (for comparison plots).
    if text2 and len(text2) > 0:
        logits2 = get_next_token_probs(text2)
        probs2 = torch.softmax(logits2, dim=-1).cpu().numpy()
        result["probs2"] = probs2.tolist()
        if use_logprobs:
            log_probs2 = torch.log_softmax(logits2, dim=-1).cpu().numpy()
            result["logprobs2"] = log_probs2.tolist()
    # Unconditional (BOS/EOS-seeded) probabilities.
    if include_unconditional:
        logits_uncond = get_next_token_probs(None)
        probs_uncond = torch.softmax(logits_uncond, dim=-1).cpu().numpy()
        result["unconditional_probs"] = probs_uncond.tolist()
        if use_logprobs:
            log_probs_uncond = torch.log_softmax(logits_uncond, dim=-1).cpu().numpy()
            result["unconditional_logprobs"] = log_probs_uncond.tolist()
    # Embeddings sample if requested (full matrix would be too large).
    if include_embeddings:
        # First 10 tokens x first 10 dims only.
        result["embeddings_sample"] = {
            "shape": list(embeddings_cache.shape),
            "first_10_tokens_sample": embeddings_cache[:10, :10].tolist() if embeddings_cache is not None else None
        }
    # UMAP layout if requested (clients should prefer the /layout endpoint
    # and cache the result client-side).
    if include_layout:
        if umap_projections_cache is not None:
            result["layout"] = {
                "method": "umap",
                "umap_params": umap_params,
                "projections": umap_projections_cache.tolist(),
            }
    return result
@torch.no_grad()
def get_embeddings_endpoint(token_ids: str = ""):
    """Return embedding vectors for the requested token ids.

    Args:
        token_ids: Comma-separated token ids (e.g. "0,1,2"). Out-of-range
            ids are silently dropped; empty or unparseable input falls back
            to the first 100 tokens.

    Returns:
        dict: token ids, display tokens, their embedding rows, the embedding
        dimensionality, and the total vocabulary size.
    """
    default_ids = list(range(min(100, vocab_size)))
    if token_ids:
        try:
            ids = [int(x.strip()) for x in token_ids.split(",")]
            ids = [i for i in ids if 0 <= i < vocab_size]
        except ValueError:
            # BUGFIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; only int() parsing can fail here.
            ids = default_ids
    else:
        ids = default_ids
    embeddings_subset = embeddings_cache[ids]
    tokens_subset = [nice_tok(vocab_tokens[i]) for i in ids]
    return {
        "token_ids": ids,
        "tokens": tokens_subset,
        "embeddings": embeddings_subset.tolist(),
        "embedding_dim": embeddings_cache.shape[1],
        "total_vocab_size": vocab_size
    }
# ---------------------------------------------------------------------------
# Gradio UI / API surface. Each `api_name` also exposes a programmatic
# endpoint (/predict, /embeddings, /layout) via the Gradio client API.
# NOTE(review): original indentation was lost in this file; the nesting below
# is reconstructed from the `with` structure — confirm layout against the
# deployed Space.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Token Probability Visualization API") as demo:
    gr.Markdown("""
# Token Probability Visualization API
This API provides token probabilities, embeddings, and a precomputed UMAP layout for visualization.
Model: `openai-community/gpt2`
""")
    with gr.Tab("Comprehensive API"):
        gr.Markdown("### Get token probabilities with optional embeddings and 2D layout")
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Primary Context",
                    value="You are an expert in medieval history.",
                    lines=3
                )
                text2_input = gr.Textbox(
                    label="Secondary Context (optional)",
                    placeholder="Enter second text for comparison",
                    lines=3
                )
                with gr.Row():
                    top_k_slider = gr.Slider(0, 200, step=1, value=20, label="Top-K tokens")
                    include_embeddings = gr.Checkbox(False, label="Include Embeddings Sample")
                    include_layout = gr.Checkbox(False, label="Include 2D Layout (UMAP)")
                    include_unconditional = gr.Checkbox(True, label="Include Unconditional")
                    use_logprobs = gr.Checkbox(True, label="Include Log Probabilities")
                predict_btn = gr.Button("Get Predictions", variant="primary")
            with gr.Column():
                output_json = gr.JSON(label="API Response")
        # Wires the UI to predict_comprehensive; also exposed as /predict.
        predict_btn.click(
            fn=predict_comprehensive,
            inputs=[text_input, text2_input, top_k_slider, include_embeddings,
                    include_layout, include_unconditional, use_logprobs],
            outputs=output_json,
            api_name="predict"
        )
    with gr.Tab("Embeddings API"):
        gr.Markdown("### Get embeddings for specific tokens")
        with gr.Row():
            with gr.Column():
                token_ids_input = gr.Textbox(
                    label="Token IDs (comma-separated)",
                    placeholder="e.g., 0,1,2,3,4 or leave empty for first 100",
                    value="0,1,2,3,4,5,6,7,8,9"
                )
                get_embeddings_btn = gr.Button("Get Embeddings", variant="primary")
            with gr.Column():
                embeddings_output = gr.JSON(label="Embeddings Response")
        # Exposed as /embeddings.
        get_embeddings_btn.click(
            fn=get_embeddings_endpoint,
            inputs=token_ids_input,
            outputs=embeddings_output,
            api_name="embeddings"
        )
    with gr.Tab("Layout API"):
        gr.Markdown("### Get 2D token layout (UMAP) once and cache client-side")
        with gr.Row():
            get_layout_btn = gr.Button("Get UMAP Layout", variant="primary")
            layout_output = gr.JSON(label="UMAP Layout Response")

        def get_layout_endpoint():
            """Return the cached full-vocab UMAP layout plus display tokens."""
            return {
                "method": "umap",
                "umap_params": umap_params,
                "tokens": [nice_tok(t) for t in vocab_tokens],
                "projections": umap_projections_cache.tolist() if umap_projections_cache is not None else None,
            }

        # Exposed as /layout (fetch once, reuse client-side).
        get_layout_btn.click(
            fn=get_layout_endpoint,
            inputs=None,
            outputs=layout_output,
            api_name="layout",
        )
    gr.Markdown("""
## API Usage
### Endpoints:
- `/predict`: Main endpoint for token probabilities with optional embeddings and UMAP
- `/embeddings`: Get embeddings for specific token IDs
- `/layout`: Get 2D UMAP projections (fetch once, reuse client-side)
### Response includes:
- Token probabilities (conditional and unconditional)
- Log probabilities
- UMAP projections for all vocabulary tokens (if requested or via `/layout`)
- Token embeddings (sample or specific)
- Vocabulary mappings
### Use this data to:
- Visualize token probability landscapes
- Compare conditional vs unconditional distributions
- Create embedding visualizations
- Build interactive token explorers
""")

if __name__ == "__main__":
    # Bind all interfaces for container deployment (e.g. HF Spaces default port).
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)