# Hugging Face Space file — uploaded via huggingface_hub by raayraay (commit c5e6036, verified).
import gradio as gr
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
import html
# Load model and tokenizer once at import time so every request reuses them.
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # Half precision on GPU to save memory; float32 on CPU, where fp16 is
    # slow or unsupported.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    # device_map="auto" lets transformers/accelerate place the weights on
    # available GPU(s); None keeps the default CPU placement.
    device_map="auto" if torch.cuda.is_available() else None,
    trust_remote_code=True
)
def calculate_entropy(logits):
    """Return the Shannon entropy of the distribution over `logits`, normalized to [0, 1].

    Args:
        logits: tensor of unnormalized scores over the vocabulary (last dim).
            May contain -inf entries — `generate()` returns *processed*
            scores, so tokens filtered by top-p/top-k warpers are -inf.

    Returns:
        float: entropy / log(vocab_size); 0 = fully certain, 1 = uniform.
    """
    probs = F.softmax(logits, dim=-1)
    log_probs = F.log_softmax(logits, dim=-1)
    # BUGFIX: where a logit is -inf, prob == 0 and log_prob == -inf, so the
    # naive product is 0 * -inf = NaN and the whole entropy became NaN.
    # Those terms contribute 0 to the entropy (lim p->0 of p*log p = 0),
    # so mask them out explicitly.
    plogp = torch.where(probs > 0, probs * log_probs, torch.zeros_like(probs))
    entropy = -torch.sum(plogp, dim=-1)
    # Normalize by the maximum possible entropy, log(vocab_size).
    max_entropy = torch.log(torch.tensor(logits.shape[-1], dtype=torch.float32))
    normalized_entropy = entropy / max_entropy
    return normalized_entropy.item()
def get_top_alternatives(logits, tokenizer, k=3):
    """Return the k most probable tokens as "'text' (prob%)" display strings.

    Tokens that decode to empty or whitespace-only text are skipped, so the
    result may hold fewer than k entries.
    """
    distribution = F.softmax(logits, dim=-1)
    top = torch.topk(distribution, k)
    labels = []
    for probability, candidate_id in zip(top.values.tolist(), top.indices.tolist()):
        decoded = tokenizer.decode([candidate_id]).strip()
        if not decoded:
            continue
        labels.append(f"'{decoded}' ({probability:.1%})")
    return labels
def get_confidence_color(prob, entropy=None):
    """Map a token probability (and optional normalized entropy) to a hex color.

    High entropy (> 0.7) always yields red, regardless of probability;
    otherwise green/yellow/red is chosen by probability thresholds.
    """
    red = "#ef4444"
    # High uncertainty overrides the probability signal entirely.
    if entropy is not None and entropy > 0.7:
        return red
    if prob < 0.4:
        return red
    return "#22c55e" if prob >= 0.7 else "#eab308"
def _render_word_span(word, probs, entropies, alternatives):
    """Render one word as a color-coded <span> with a hover tooltip."""
    avg_prob = sum(probs) / len(probs)
    avg_entropy = sum(entropies) / len(entropies) if entropies else 0
    color = get_confidence_color(avg_prob, avg_entropy)
    tooltip_lines = [f"Confidence: {avg_prob:.1%}", f"Uncertainty: {avg_entropy:.1%}"]
    if alternatives:
        # Show the alternatives considered for the word's last token.
        tooltip_lines.append("Alternatives: " + ", ".join(alternatives[-1][:3]))
    tooltip = " | ".join(tooltip_lines)
    # BUGFIX: escape the tooltip as well as the word — decoded token text can
    # contain quotes or angle brackets that would otherwise break out of the
    # title="..." attribute (HTML injection into the rendered page).
    return (
        f'<span style="background-color: {color}; padding: 1px 3px; '
        f'border-radius: 3px; margin: 1px; cursor: help;" '
        f'title="{html.escape(tooltip)}">{html.escape(word)}</span>'
    )

def generate_with_confidence(message):
    """Generate a chat response and render it as HTML with per-word confidence colors.

    Args:
        message: the user's chat message.

    Returns:
        str: an HTML fragment in which every word is wrapped in a span
        colored by average token confidence, with a tooltip showing
        confidence, uncertainty (entropy), and alternative tokens.
    """
    if not message.strip():
        return "<p style='color: #888;'>Please enter a message.</p>"
    # Build conversation (single turn for simplicity)
    messages = [
        {"role": "system", "content": "You are a helpful assistant. Keep responses concise."},
        {"role": "user", "content": message}
    ]
    # Apply the model's chat template, then tokenize.
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = inputs.to("cuda")
    input_length = inputs["input_ids"].shape[1]
    # Generate, keeping per-step scores so token probabilities can be recovered.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            output_scores=True,
            return_dict_in_generate=True,
            pad_token_id=tokenizer.eos_token_id
        )
    # Keep only the newly generated tokens and their per-step scores.
    generated_ids = outputs.sequences[0][input_length:]
    scores = outputs.scores
    # Accumulate tokens into words; a token starting with whitespace marks a
    # word boundary, at which point the buffered word is flushed to HTML.
    html_parts = []
    current_word = ""
    current_probs = []
    current_entropies = []
    current_alternatives = []
    for token_id, score in zip(generated_ids, scores):
        # Probability the model assigned to the token it actually emitted.
        probs = torch.softmax(score[0], dim=-1)
        token_prob = probs[token_id].item()
        # Entropy of the full distribution (uncertainty measure).
        entropy = calculate_entropy(score[0])
        # What else the model considered at this step.
        alternatives = get_top_alternatives(score[0], tokenizer, k=3)
        token_text = tokenizer.decode([token_id])
        if token_text.startswith(" ") or token_text.startswith("\n"):
            # Flush the buffered word, then start a new one with this token.
            if current_word and current_probs:
                html_parts.append(_render_word_span(
                    current_word, current_probs, current_entropies, current_alternatives))
            current_word = token_text
            current_probs = [token_prob]
            current_entropies = [entropy]
            current_alternatives = [alternatives]
        else:
            # Continuation of the current word (subword token).
            current_word += token_text
            current_probs.append(token_prob)
            current_entropies.append(entropy)
            current_alternatives.append(alternatives)
    # Flush the final word.
    if current_word and current_probs:
        html_parts.append(_render_word_span(
            current_word, current_probs, current_entropies, current_alternatives))
    html_response = "".join(html_parts)
    # Wrap in a styled container.
    result = f"""
<div style="background: #1a1a2e; padding: 20px; border-radius: 12px; font-size: 16px; line-height: 1.8;">
<div style="margin-bottom: 10px; color: #888; font-size: 12px;">AI Response:</div>
<div>{html_response}</div>
</div>
"""
    return result
# Custom CSS: hide the Gradio footer and give the HTML output a minimum height.
css = """
footer {visibility: hidden}
.output-html {
min-height: 200px;
}
"""
# Build interface
with gr.Blocks(css=css, theme=gr.themes.Base(primary_hue="red")) as demo:
    # Title and intro copy.
    gr.Markdown("""
# Hallucination Heatmap
**See exactly where the AI might be lying.**
Every word is color-coded by model confidence. Red words = low confidence = potential hallucination.
Hover over any word to see the exact probability score and alternative tokens.
""")
    with gr.Row():
        msg = gr.Textbox(
            placeholder="Ask me anything...",
            label="Your message",
            scale=7
        )
        submit = gr.Button("Generate", variant="primary", scale=1)
    # Output as HTML component — required so the per-word colored spans render.
    output = gr.HTML(
        value="<div style='padding: 20px; color: #888; text-align: center;'>Response will appear here with confidence highlighting...</div>",
        label="Response"
    )
    # Legend explaining the three confidence colors (matches get_confidence_color).
    gr.HTML("""
<div style="margin-top: 15px; padding: 15px; background: #1a1a2e; border-radius: 8px; font-size: 13px;">
<b>Confidence Legend:</b>
<span style="background-color: #22c55e; padding: 3px 10px; border-radius: 3px; margin-left: 10px;">High (70%+)</span>
<span style="background-color: #eab308; padding: 3px 10px; border-radius: 3px; margin-left: 5px;">Medium (40-70%)</span>
<span style="background-color: #ef4444; padding: 3px 10px; border-radius: 3px; margin-left: 5px;">Low / Uncertain</span>
<br><i style="color: #888; margin-top: 8px; display: block;">Hover over words to see: confidence %, uncertainty (entropy), and alternative tokens the model considered</i>
</div>
""")
    # Clickable example prompts that fill the textbox.
    gr.Examples(
        examples=[
            ["What is the capital of France?"],
            ["Tell me about quantum computing."],
            ["What happened on July 4th, 1776?"],
            ["Explain why the sky is blue."],
        ],
        inputs=msg
    )
    # Event handlers: pressing Enter in the textbox or clicking the button
    # both run generation and write the HTML into `output`.
    msg.submit(generate_with_confidence, msg, output)
    submit.click(generate_with_confidence, msg, output)
    # Footer notes.
    gr.Markdown("""
---
**How it works:** The model outputs a probability distribution over all possible next tokens.
We capture the probability of each selected token and visualize it. Low probability = the model was "unsure" = higher hallucination risk.
Built by Eric Raymond (Purdue University) & Samiksha BC (IU South Bend)
""")
demo.launch()