# app.py — LLM Evaluation Dashboard (Gradio app deployed on Hugging Face Spaces).
import html
import json
import re
import time
from io import StringIO

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from huggingface_hub import InferenceClient
# =============================================================================
# LLM Evaluation Dashboard
# =============================================================================

# Display name -> Hugging Face Hub repo id for every model under evaluation.
# Keys are used throughout the UI (dropdowns, checkbox groups, result rows).
MODELS = {
    "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2",
    "Llama-3.2-3B": "meta-llama/Llama-3.2-3B-Instruct",
    "Llama-3.1-70B": "meta-llama/Llama-3.1-70B-Instruct",
    "Qwen2.5-72B": "Qwen/Qwen2.5-72B-Instruct",
    "Qwen2.5-Coder": "Qwen/Qwen2.5-Coder-32B-Instruct"
}

# Static metadata per model (parameter count, specialization, organization).
# NOTE(review): not referenced by the visible UI code below — presumably kept
# for future display; confirm before removing.
MODEL_INFO = {
    "Mistral-7B": {"params": "7B", "type": "General", "org": "Mistral AI"},
    "Llama-3.2-3B": {"params": "3B", "type": "General", "org": "Meta"},
    "Llama-3.1-70B": {"params": "70B", "type": "General", "org": "Meta"},
    "Qwen2.5-72B": {"params": "72B", "type": "General", "org": "Alibaba"},
    "Qwen2.5-Coder": {"params": "32B", "type": "Code", "org": "Alibaba"}
}

# Evaluation suite: three categories x five tasks. Each task carries the
# prompt sent to the model, the `expected` value, and a `check_type` telling
# check_answer() how to grade the response (substring match, JSON validity,
# line/word counts, etc.).
EVAL_TASKS = {
    "reasoning": {
        "name": "Reasoning (Math)",
        "description": "Tests logical and mathematical reasoning",
        "tasks": [
            {"id": "math_1", "prompt": "A store sells apples for $2 each. If I buy 3 apples and pay with a $10 bill, how much change do I get? Answer with just the number.", "expected": "4", "check_type": "contains"},
            {"id": "math_2", "prompt": "If a train travels at 60 mph for 2.5 hours, how many miles does it travel? Answer with just the number.", "expected": "150", "check_type": "contains"},
            {"id": "math_3", "prompt": "A rectangle has length 8 and width 5. What is its area? Answer with just the number.", "expected": "40", "check_type": "contains"},
            {"id": "logic_1", "prompt": "If all roses are flowers, and some flowers fade quickly, can we conclude that some roses fade quickly? Answer only yes or no.", "expected": "no", "check_type": "contains_lower"},
            {"id": "logic_2", "prompt": "I have a brother. My brother has a brother. How many brothers minimum are in the family? Answer with just the number.", "expected": "2", "check_type": "contains"}
        ]
    },
    "knowledge": {
        "name": "Knowledge (Facts)",
        "description": "Tests factual accuracy",
        "tasks": [
            {"id": "fact_1", "prompt": "What is the chemical symbol for gold? Answer with just the symbol.", "expected": "Au", "check_type": "contains"},
            {"id": "fact_2", "prompt": "In what year did World War 2 end? Answer with just the year.", "expected": "1945", "check_type": "contains"},
            {"id": "fact_3", "prompt": "What planet is known as the Red Planet? Answer with just the name.", "expected": "Mars", "check_type": "contains_lower"},
            {"id": "fact_4", "prompt": "How many sides does a hexagon have? Answer with just the number.", "expected": "6", "check_type": "contains"},
            {"id": "fact_5", "prompt": "What is the capital of Japan? Answer with just the city name.", "expected": "Tokyo", "check_type": "contains_lower"}
        ]
    },
    "instruction": {
        "name": "Instruction Following",
        "description": "Tests ability to follow format instructions",
        "tasks": [
            # For structural checks, `expected` is a marker ("3_lines",
            # "single_digit") or a count rather than a literal substring.
            {"id": "json_1", "prompt": "Return a JSON object with keys name and age for a 25 year old person named Alice. Return ONLY the JSON, no explanation.", "expected": "name", "check_type": "json_valid"},
            {"id": "format_1", "prompt": "List exactly 3 colors, one per line, no numbers or bullets.", "expected": "3_lines", "check_type": "line_count"},
            {"id": "format_2", "prompt": "Write a single sentence of exactly 5 words about cats.", "expected": "5", "check_type": "word_count"},
            {"id": "constraint_1", "prompt": "Name a fruit. Your answer must start with the letter A. Answer with just the fruit name.", "expected": "a", "check_type": "starts_with_lower"},
            {"id": "constraint_2", "prompt": "Give me a number between 1 and 10. Answer with ONLY the number, nothing else.", "expected": "single_digit", "check_type": "is_single_number"}
        ]
    }
}
def query_model(model_id: str, prompt: str, max_tokens: int = 256) -> dict:
    """Send a single-turn chat prompt to a hosted model via the HF Inference API.

    Args:
        model_id: Fully-qualified model repo id (e.g. "mistralai/Mistral-7B-Instruct-v0.2").
        prompt: User message to send as a one-message chat.
        max_tokens: Generation cap forwarded to the API.

    Returns:
        dict with keys:
            "response": model text, or None if the call failed.
            "latency":  elapsed seconds for the call (measured even on error).
            "error":    None on success, otherwise the exception message.
    """
    client = InferenceClient(model=model_id)
    messages = [{"role": "user", "content": prompt}]
    # perf_counter() is monotonic, so latency can't go negative if the system
    # clock is adjusted mid-call (time.time() offers no such guarantee).
    start_time = time.perf_counter()
    try:
        response = client.chat_completion(messages=messages, max_tokens=max_tokens, temperature=0.7)
        return {
            "response": response.choices[0].message.content,
            "latency": time.perf_counter() - start_time,
            "error": None,
        }
    except Exception as e:
        # Broad catch is intentional: any network/API failure is surfaced to
        # the UI as an error result rather than crashing the app.
        return {"response": None, "latency": time.perf_counter() - start_time, "error": str(e)}
def check_answer(response: str, expected: str, check_type: str) -> dict:
    """Grade a single model response against its task's expected answer.

    Args:
        response: Raw model output; None means the API call itself failed.
        expected: Interpretation depends on check_type — a required substring,
            a count encoded as "3_lines" / "5", or an unused marker such as
            "single_digit".
        check_type: One of "contains", "contains_lower", "starts_with_lower",
            "json_valid", "line_count", "word_count", "is_single_number".

    Returns:
        dict with "score" (1 = pass, 0 = fail) and a human-readable
        "explanation" of what was checked.
    """
    if response is None:
        return {"score": 0, "explanation": "No response (error)"}
    response = response.strip()
    if check_type == "contains":
        # Lenient substring match so "The answer is 4" passes for expected "4".
        passed = expected in response
        return {"score": 1 if passed else 0, "explanation": f"Looking for '{expected}' in response"}
    if check_type == "contains_lower":
        passed = expected.lower() in response.lower()
        return {"score": 1 if passed else 0, "explanation": f"Looking for '{expected}' (case-insensitive)"}
    if check_type == "starts_with_lower":
        # Only the first word is checked, so trailing punctuation/explanation
        # does not affect the verdict.
        words = response.split()
        first_word = words[0] if words else ""
        passed = first_word.lower().startswith(expected.lower())
        return {"score": 1 if passed else 0, "explanation": f"Should start with '{expected}'"}
    if check_type == "json_valid":
        # The flat-object regex is sufficient here: the eval prompt asks for a
        # simple {name, age} object, so nested JSON is not expected.
        passed = False
        json_match = re.search(r'\{[^{}]*\}', response)
        if json_match is not None:
            try:
                json.loads(json_match.group())
                passed = True
            except ValueError:  # json.JSONDecodeError is a ValueError subclass
                passed = False
        return {"score": 1 if passed else 0, "explanation": "Checking for valid JSON"}
    if check_type == "line_count":
        # Blank lines are ignored; only non-empty lines count.
        lines = [line for line in response.split("\n") if line.strip()]
        expected_count = int(expected.split("_")[0])
        passed = len(lines) == expected_count
        return {"score": 1 if passed else 0, "explanation": f"Expected {expected_count} lines, got {len(lines)}"}
    if check_type == "word_count":
        words = response.split()
        expected_count = int(expected)
        passed = len(words) == expected_count
        return {"score": 1 if passed else 0, "explanation": f"Expected {expected_count} words, got {len(words)}"}
    if check_type == "is_single_number":
        # Accept a bare 1-10; the <= 3 length bound allows "10" plus at most
        # one stray character (e.g. a trailing period) but rejects sentences.
        numbers = re.findall(r'\b([1-9]|10)\b', response)
        passed = len(numbers) >= 1 and len(response) <= 3
        return {"score": 1 if passed else 0, "explanation": "Should be a single number 1-10"}
    # Defensive fallback: an unrecognized check type always fails with a
    # diagnostic explanation instead of raising.
    return {"score": 0, "explanation": f"Unknown check type: {check_type}"}
# Pre-computed results
# Snapshot of one complete evaluation run (5 models x 15 tasks) embedded as
# CSV so the dashboard renders instantly on startup without calling the
# Inference API. Rows with response == "error" record failed API calls
# (score 0, near-zero latency).
PRECOMPUTED_CSV = """model,category,category_name,task_id,score,latency,response
Mistral-7B,reasoning,Reasoning (Math),math_1,1,0.4,4
Mistral-7B,reasoning,Reasoning (Math),math_2,1,0.2,150
Mistral-7B,reasoning,Reasoning (Math),math_3,1,0.2,40
Mistral-7B,reasoning,Reasoning (Math),logic_1,0,0.2,yes
Mistral-7B,reasoning,Reasoning (Math),logic_2,0,0.2,3
Mistral-7B,knowledge,Knowledge (Facts),fact_1,1,0.3,Au
Mistral-7B,knowledge,Knowledge (Facts),fact_2,1,0.8,1945
Mistral-7B,knowledge,Knowledge (Facts),fact_3,1,0.2,Mars
Mistral-7B,knowledge,Knowledge (Facts),fact_4,1,0.2,6
Mistral-7B,knowledge,Knowledge (Facts),fact_5,1,0.2,Tokyo
Mistral-7B,instruction,Instruction Following,json_1,1,1.9,valid json
Mistral-7B,instruction,Instruction Following,format_1,1,0.3,3 lines
Mistral-7B,instruction,Instruction Following,format_2,0,0.3,6 words
Mistral-7B,instruction,Instruction Following,constraint_1,1,0.2,Apple
Mistral-7B,instruction,Instruction Following,constraint_2,1,0.2,7
Llama-3.2-3B,reasoning,Reasoning (Math),math_1,1,2.2,4
Llama-3.2-3B,reasoning,Reasoning (Math),math_2,1,5.8,150
Llama-3.2-3B,reasoning,Reasoning (Math),math_3,1,3.5,40
Llama-3.2-3B,reasoning,Reasoning (Math),logic_1,0,0.9,yes
Llama-3.2-3B,reasoning,Reasoning (Math),logic_2,0,4.8,3
Llama-3.2-3B,knowledge,Knowledge (Facts),fact_1,1,5.6,Au
Llama-3.2-3B,knowledge,Knowledge (Facts),fact_2,1,2.9,1945
Llama-3.2-3B,knowledge,Knowledge (Facts),fact_3,1,0.8,Mars
Llama-3.2-3B,knowledge,Knowledge (Facts),fact_4,1,2.7,6
Llama-3.2-3B,knowledge,Knowledge (Facts),fact_5,1,3.8,Tokyo
Llama-3.2-3B,instruction,Instruction Following,json_1,0,0.1,error
Llama-3.2-3B,instruction,Instruction Following,format_1,0,0.1,error
Llama-3.2-3B,instruction,Instruction Following,format_2,0,0.1,error
Llama-3.2-3B,instruction,Instruction Following,constraint_1,0,0.1,error
Llama-3.2-3B,instruction,Instruction Following,constraint_2,0,0.1,error
Qwen2.5-72B,reasoning,Reasoning (Math),math_1,0,0.1,error
Qwen2.5-72B,reasoning,Reasoning (Math),math_2,0,0.1,error
Qwen2.5-72B,reasoning,Reasoning (Math),math_3,0,0.1,error
Qwen2.5-72B,reasoning,Reasoning (Math),logic_1,0,0.1,error
Qwen2.5-72B,reasoning,Reasoning (Math),logic_2,0,0.1,error
Qwen2.5-72B,knowledge,Knowledge (Facts),fact_1,0,0.1,error
Qwen2.5-72B,knowledge,Knowledge (Facts),fact_2,1,0.9,1945
Qwen2.5-72B,knowledge,Knowledge (Facts),fact_3,1,1.0,Mars
Qwen2.5-72B,knowledge,Knowledge (Facts),fact_4,1,0.5,6
Qwen2.5-72B,knowledge,Knowledge (Facts),fact_5,1,0.8,Tokyo
Qwen2.5-72B,instruction,Instruction Following,json_1,1,1.2,valid json
Qwen2.5-72B,instruction,Instruction Following,format_1,1,0.9,3 lines
Qwen2.5-72B,instruction,Instruction Following,format_2,1,1.1,5 words
Qwen2.5-72B,instruction,Instruction Following,constraint_1,1,0.7,Apple
Qwen2.5-72B,instruction,Instruction Following,constraint_2,1,1.0,5
Qwen2.5-Coder,reasoning,Reasoning (Math),math_1,1,0.4,4
Qwen2.5-Coder,reasoning,Reasoning (Math),math_2,1,0.4,150
Qwen2.5-Coder,reasoning,Reasoning (Math),math_3,1,0.4,40
Qwen2.5-Coder,reasoning,Reasoning (Math),logic_1,1,0.4,no
Qwen2.5-Coder,reasoning,Reasoning (Math),logic_2,0,0.5,3
Qwen2.5-Coder,knowledge,Knowledge (Facts),fact_1,1,0.4,Au
Qwen2.5-Coder,knowledge,Knowledge (Facts),fact_2,1,0.4,1945
Qwen2.5-Coder,knowledge,Knowledge (Facts),fact_3,1,0.2,Mars
Qwen2.5-Coder,knowledge,Knowledge (Facts),fact_4,1,0.4,6
Qwen2.5-Coder,knowledge,Knowledge (Facts),fact_5,1,0.2,Tokyo
Qwen2.5-Coder,instruction,Instruction Following,json_1,0,0.1,error
Qwen2.5-Coder,instruction,Instruction Following,format_1,0,0.1,error
Qwen2.5-Coder,instruction,Instruction Following,format_2,0,0.1,error
Qwen2.5-Coder,instruction,Instruction Following,constraint_1,0,0.1,error
Qwen2.5-Coder,instruction,Instruction Following,constraint_2,0,0.1,error
Llama-3.1-70B,reasoning,Reasoning (Math),math_1,0,0.04,error
Llama-3.1-70B,reasoning,Reasoning (Math),math_2,0,0.04,error
Llama-3.1-70B,reasoning,Reasoning (Math),math_3,0,0.04,error
Llama-3.1-70B,reasoning,Reasoning (Math),logic_1,0,0.04,error
Llama-3.1-70B,reasoning,Reasoning (Math),logic_2,0,0.04,error
Llama-3.1-70B,knowledge,Knowledge (Facts),fact_1,0,0.04,error
Llama-3.1-70B,knowledge,Knowledge (Facts),fact_2,0,0.04,error
Llama-3.1-70B,knowledge,Knowledge (Facts),fact_3,0,0.04,error
Llama-3.1-70B,knowledge,Knowledge (Facts),fact_4,0,0.04,error
Llama-3.1-70B,knowledge,Knowledge (Facts),fact_5,0,0.04,error
Llama-3.1-70B,instruction,Instruction Following,json_1,0,0.04,error
Llama-3.1-70B,instruction,Instruction Following,format_1,0,0.04,error
Llama-3.1-70B,instruction,Instruction Following,format_2,0,0.04,error
Llama-3.1-70B,instruction,Instruction Following,constraint_1,0,0.04,error
Llama-3.1-70B,instruction,Instruction Following,constraint_2,0,0.04,error"""

# Parse the embedded snapshot once at import time; every chart and table
# below reads from this DataFrame.
EVAL_RESULTS = pd.read_csv(StringIO(PRECOMPUTED_CSV))
def get_summary_stats():
    """Build the HTML summary cards shown at the top of the dashboard.

    Returns:
        An HTML string with three cards: best model by overall accuracy,
        total number of evaluation tasks, and number of models tested.
    """
    model_acc = EVAL_RESULTS.groupby('model')['score'].mean().sort_values(ascending=False)
    best_model = model_acc.index[0]
    best_acc = model_acc.values[0] * 100
    # Derive the counts from EVAL_TASKS so the cards stay correct when tasks
    # or categories are added/removed (these were previously hard-coded).
    total_tasks = sum(len(cat["tasks"]) for cat in EVAL_TASKS.values())
    num_categories = len(EVAL_TASKS)
    html = f"""
<div style="display: flex; gap: 20px; flex-wrap: wrap; justify-content: center; margin-bottom: 20px;">
<div style="background: linear-gradient(135deg, #e8f5e9, #c8e6c9); padding: 20px; border-radius: 12px; flex: 1; min-width: 180px; max-width: 250px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
<h3 style="margin: 0; color: #2e7d32; font-size: 14px;">πŸ† Best Model</h3>
<p style="font-size: 22px; margin: 10px 0; font-weight: bold; color: #1b5e20;">{best_model}</p>
<p style="margin: 0; color: #558b2f; font-size: 13px;">{best_acc:.1f}% accuracy</p>
</div>
<div style="background: linear-gradient(135deg, #e3f2fd, #bbdefb); padding: 20px; border-radius: 12px; flex: 1; min-width: 180px; max-width: 250px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
<h3 style="margin: 0; color: #1565c0; font-size: 14px;">πŸ“Š Evaluation Tasks</h3>
<p style="font-size: 22px; margin: 10px 0; font-weight: bold; color: #0d47a1;">{total_tasks}</p>
<p style="margin: 0; color: #1976d2; font-size: 13px;">Across {num_categories} categories</p>
</div>
<div style="background: linear-gradient(135deg, #fff3e0, #ffe0b2); padding: 20px; border-radius: 12px; flex: 1; min-width: 180px; max-width: 250px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
<h3 style="margin: 0; color: #ef6c00; font-size: 14px;">πŸ€– Models Tested</h3>
<p style="font-size: 22px; margin: 10px 0; font-weight: bold; color: #e65100;">{len(MODELS)}</p>
<p style="margin: 0; color: #f57c00; font-size: 13px;">3B to 72B parameters</p>
</div>
</div>
"""
    return html
def get_accuracy_chart():
    """Build a horizontal bar chart of the overall pass rate per model."""
    accuracy = EVAL_RESULTS.groupby('model')['score'].mean().sort_values(ascending=True)

    def bar_color(frac):
        # Traffic-light scale: red below 30%, amber below 60%, green above.
        if frac < 0.3:
            return '#ef5350'
        if frac < 0.6:
            return '#ffca28'
        return '#66bb6a'

    percentages = accuracy.values * 100
    bars = go.Bar(
        x=percentages,
        y=accuracy.index,
        orientation='h',
        marker_color=[bar_color(frac) for frac in accuracy.values],
        text=[f"{pct:.1f}%" for pct in percentages],
        textposition='inside',
        textfont=dict(color='white', size=14),
    )
    fig = go.Figure(bars)
    fig.update_layout(
        title=dict(text="Overall Accuracy by Model", font=dict(size=16)),
        xaxis_title="Accuracy (%)",
        yaxis_title="",
        height=350,
        margin=dict(l=20, r=20, t=50, b=40),
        xaxis=dict(range=[0, 100]),
    )
    return fig
def get_category_heatmap():
    """Render a model x category heatmap of mean accuracy (0-100%)."""
    # Mean score per (model, category) pair, scaled to percentages; models
    # missing a category show 0 rather than NaN.
    pivot = EVAL_RESULTS.pivot_table(
        values='score', index='model', columns='category_name', aggfunc='mean'
    ).fillna(0) * 100

    grid = pivot.values.tolist()
    cell_labels = [[f"{cell:.0f}%" for cell in row] for row in grid]

    heatmap = go.Heatmap(
        z=grid,
        x=pivot.columns.tolist(),
        y=pivot.index.tolist(),
        colorscale='RdYlGn',
        text=cell_labels,
        texttemplate="%{text}",
        textfont={"size": 14},
        zmin=0,
        zmax=100,
        showscale=True,
    )
    fig = go.Figure(data=heatmap)
    fig.update_layout(
        title=dict(text="Accuracy by Model and Task Category", font=dict(size=16)),
        height=350,
        margin=dict(l=20, r=20, t=50, b=40),
        xaxis=dict(title="", tickangle=0),
        yaxis=dict(title=""),
    )
    return fig
def get_latency_chart():
    """Bar chart of mean response latency per model, successful calls only.

    Failed API calls are recorded in the snapshot with response == "error"
    and near-zero latency; including them would drag a model's average down.
    The previous `latency > 0.05` heuristic still let 0.1s error rows
    through, so filtering is done on the response field instead. Models with
    no successful calls are simply absent from the chart (as before).
    """
    valid_latency = EVAL_RESULTS[EVAL_RESULTS['response'] != 'error']
    latency_by_model = valid_latency.groupby('model')['latency'].mean().sort_values()
    fig = go.Figure(go.Bar(
        x=latency_by_model.index,
        y=latency_by_model.values,
        # Traffic-light colors: green under 1s, amber under 2s, red above.
        marker_color=['#66bb6a' if v < 1 else '#ffca28' if v < 2 else '#ef5350' for v in latency_by_model.values],
        text=[f"{v:.2f}s" for v in latency_by_model.values],
        textposition='outside'
    ))
    fig.update_layout(
        title=dict(text="Average Response Latency", font=dict(size=16)),
        xaxis_title="",
        yaxis_title="Latency (seconds)",
        height=350,
        margin=dict(l=20, r=20, t=50, b=40)
    )
    return fig
def get_detailed_results(model_filter, category_filter):
    """Return a display-ready DataFrame of per-task results.

    Args:
        model_filter: Model display name, or "All" for no filtering.
        category_filter: Category display name, or "All" for no filtering.

    Returns:
        DataFrame with columns Model/Category/Task/Pass/Latency/Response,
        where Pass is rendered as βœ…/❌ and Latency as a "0.00s" string.
    """
    filtered = EVAL_RESULTS
    if model_filter != "All":
        filtered = filtered[filtered['model'] == model_filter]
    if category_filter != "All":
        filtered = filtered[filtered['category_name'] == category_filter]

    table = filtered.loc[:, ['model', 'category_name', 'task_id', 'score', 'latency', 'response']].copy()
    table['score'] = table['score'].map({1: 'βœ…', 0: '❌'})
    table['latency'] = table['latency'].apply(lambda seconds: f"{seconds:.2f}s")
    table.columns = ['Model', 'Category', 'Task', 'Pass', 'Latency', 'Response']
    return table
def run_live_comparison(prompt, model_choices):
    """Query each selected model with the user's prompt and render HTML cards.

    Args:
        prompt: Free-form user prompt; blank input short-circuits with a hint.
        model_choices: Model display names selected in the UI; names not in
            MODELS are silently skipped.

    Returns:
        An HTML string with one colored card per model — green on success,
        red on API error — including the per-call latency.
    """
    if not prompt.strip():
        return "<p style='color: #666;'>Please enter a prompt.</p>"
    if not model_choices:
        return "<p style='color: #666;'>Please select at least one model.</p>"
    results_html = "<div style='display: flex; flex-direction: column; gap: 15px;'>"
    for model_name in model_choices:
        if model_name not in MODELS:
            continue
        result = query_model(MODELS[model_name], prompt, max_tokens=200)
        if result["error"]:
            # Escape the raw API error so it can't inject markup into the page.
            response_text = f"Error: {html.escape(result['error'][:100])}"
            color, border_color, icon = "#ffebee", "#c62828", "❌"
        else:
            # Model output is untrusted text; escape it before embedding in
            # HTML so angle brackets in responses render literally.
            response_text = html.escape(result["response"])
            color, border_color, icon = "#e8f5e9", "#2e7d32", "βœ…"
        results_html += f"""
<div style="background: {color}; padding: 15px; border-radius: 8px; border-left: 4px solid {border_color};">
<h4 style="margin: 0 0 10px 0;">{icon} {model_name} <span style="font-weight: normal; color: #666;">({result['latency']:.2f}s)</span></h4>
<p style="margin: 0; white-space: pre-wrap; font-family: sans-serif;">{response_text}</p>
</div>
"""
    results_html += "</div>"
    return results_html
# Build the dashboard
# Layout: summary cards + static charts (computed once at import time from
# the embedded snapshot), a filterable results table, and a live comparison
# section that calls the Inference API on demand.
with gr.Blocks(title="LLM Evaluation Dashboard", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# πŸ§ͺ LLM Evaluation Dashboard
Compare performance of Large Language Models across reasoning, knowledge, and instruction-following tasks.
""")
    # Summary cards: best model, task count, models tested.
    gr.HTML(get_summary_stats())
    with gr.Row():
        with gr.Column():
            gr.Plot(value=get_accuracy_chart(), label="Accuracy")
        with gr.Column():
            gr.Plot(value=get_latency_chart(), label="Latency")
    with gr.Row():
        gr.Plot(value=get_category_heatmap(), label="Category Breakdown")
    gr.Markdown("---")
    gr.Markdown("## πŸ“‹ Detailed Results")
    with gr.Row():
        # Dropdown choices must match the category_name values in the
        # embedded CSV for filtering to work.
        model_dropdown = gr.Dropdown(choices=["All"] + list(MODELS.keys()), value="All", label="Filter by Model")
        category_dropdown = gr.Dropdown(choices=["All", "Reasoning (Math)", "Knowledge (Facts)", "Instruction Following"], value="All", label="Filter by Category")
    results_table = gr.Dataframe(value=get_detailed_results("All", "All"), label="Evaluation Results")
    # Either dropdown changing re-renders the table with both current filters.
    model_dropdown.change(get_detailed_results, [model_dropdown, category_dropdown], results_table)
    category_dropdown.change(get_detailed_results, [model_dropdown, category_dropdown], results_table)
    gr.Markdown("---")
    gr.Markdown("## πŸ”„ Live Model Comparison")
    gr.Markdown("Test the models with your own prompts!")
    with gr.Row():
        with gr.Column(scale=2):
            live_prompt = gr.Textbox(label="Your Prompt", placeholder="Enter a question or instruction...", lines=3)
        with gr.Column(scale=1):
            live_models = gr.CheckboxGroup(choices=list(MODELS.keys()), value=["Mistral-7B", "Qwen2.5-72B"], label="Select Models")
    live_btn = gr.Button("πŸš€ Compare Models", variant="primary")
    live_output = gr.HTML(label="Comparison Results")
    # The click handler queries the Inference API, so it can take several
    # seconds per selected model.
    live_btn.click(run_live_comparison, [live_prompt, live_models], live_output)
    gr.Markdown("""
---
### πŸ“š About This Evaluation
**Models:** Mistral-7B, Llama-3.2-3B, Llama-3.1-70B, Qwen2.5-72B, Qwen2.5-Coder-32B
**Categories:** Reasoning (math/logic), Knowledge (facts), Instruction Following (format compliance)
*Built as part of an AI/ML Engineering portfolio project.*
""")

# Launch the Gradio server when run as a script (HF Spaces entry point).
if __name__ == "__main__":
    demo.launch()