Spaces:

LaelaZ
/

ai-evaluation-toolkit

Runtime error

App Files Files Community

ai-evaluation-toolkit / app.py

LaelaZ

add AI evaluation toolkit demo

bb1f3f9 13 days ago

raw

history blame contribute delete

21.2 kB

	import re
	from collections import Counter

	import gradio as gr

	# ─────────────────────────────────────────────
	# TAB 1 LOGIC: RLHF Pairwise Rater
	# ─────────────────────────────────────────────

	def check_consistency(
	    prompt, resp_a, resp_b,
	    help_a, harm_a, acc_a, inst_a,
	    help_b, harm_b, acc_b, inst_b,
	    preference, confidence
	):
	    """Check an overall A/B preference against the per-axis scores.

	    Args:
	        prompt: The task text. Accepted so the signature matches the UI
	            inputs; it is not used in the computation.
	        resp_a, resp_b: The two response texts. Both must be non-blank.
	        help_*, harm_*, acc_*, inst_*: Numeric scores for Helpfulness,
	            Harmlessness, Accuracy and Instruction-Following of each response.
	        preference: "A is better", "Tie", or anything else, which is
	            treated as "B is better".
	        confidence: Rater confidence; 1, 2 and 3 map to Low, Medium, High.

	    Returns:
	        A ``(message, score_table)`` tuple of Markdown strings. When either
	        response is blank, a warning message and an empty table string.
	    """
	    # `or ""` keeps a missing (None) textbox value from crashing on .strip().
	    if not (resp_a or "").strip() or not (resp_b or "").strip():
	        return "⚠️ Please enter both responses before checking.", ""

	    axes = {
	        "Helpfulness": (help_a, help_b),
	        "Harmlessness": (harm_a, harm_b),
	        "Accuracy": (acc_a, acc_b),
	        "Instruction-Following": (inst_a, inst_b),
	    }
	    n_axes = len(axes)

	    avg_a = sum(score_a for score_a, _ in axes.values()) / n_axes
	    avg_b = sum(score_b for _, score_b in axes.values()) / n_axes

	    # Axes on which each response strictly outscores the other.
	    a_wins = [(k, va, vb) for k, (va, vb) in axes.items() if va > vb]
	    b_wins = [(k, va, vb) for k, (va, vb) in axes.items() if vb > va]

	    # Sliders can deliver floats; normalise so the label lookup works and the
	    # output reads "2/3" rather than "2.0/3".
	    level = int(confidence)
	    conf_label = {1: "Low", 2: "Medium", 3: "High"}.get(level, "Unknown")

	    # Plain "|" delimiters: a backslash-escaped pipe is a literal character in
	    # Markdown, so it would not produce table cells.
	    rows = [
	        f"| {ax} | {va} | {vb} | {_higher_label(va, vb)} |"
	        for ax, (va, vb) in axes.items()
	    ]
	    rows.append(
	        f"| Average | {avg_a:.2f} | {avg_b:.2f} | {_higher_label(avg_a, avg_b)} |"
	    )
	    score_table = (
	        "### Score Summary\n\n"
	        "| Axis | Response A | Response B | Higher |\n"
	        "|------|-----------|-----------|--------|\n"
	        + "\n".join(rows)
	        + "\n"
	    )

	    if preference == "Tie":
	        gap = abs(avg_a - avg_b)
	        # NOTE(review): the check fires at a gap of 1.0 while the advice text
	        # mentions ~0.5 — confirm which threshold is intended.
	        if gap >= 1.0:
	            msg = (
	                f"⚠️ Possible inconsistency: You selected 'Tie', but the average scores differ by "
	                f"{gap:.2f} points (A avg: {avg_a:.2f}, B avg: {avg_b:.2f}). "
	                f"A tie is most appropriate when averages are within ~0.5 of each other."
	            )
	        else:
	            msg = (
	                f"✅ Consistent: A 'Tie' verdict aligns with close average scores "
	                f"(A: {avg_a:.2f}, B: {avg_b:.2f})."
	            )
	    elif preference == "A is better":
	        if avg_b >= avg_a:
	            detail = ", ".join(f"{k}: A={va} vs B={vb}" for k, va, vb in b_wins) or "none"
	            msg = (
	                f"⚠️ Inconsistency detected: You selected 'A is better' overall, but Response B "
	                f"scored higher on {len(b_wins)}/{n_axes} axes. "
	                f"Axes favouring B: {detail}. "
	                f"Overall averages — A: {avg_a:.2f}, B: {avg_b:.2f}. Consider reviewing your overall verdict."
	            )
	        else:
	            msg = (
	                f"✅ Consistent: 'A is better' aligns with higher per-axis averages "
	                f"(A: {avg_a:.2f} vs B: {avg_b:.2f})."
	            )
	    else:  # B is better
	        if avg_a >= avg_b:
	            detail = ", ".join(f"{k}: A={va} vs B={vb}" for k, va, vb in a_wins) or "none"
	            msg = (
	                f"⚠️ Inconsistency detected: You selected 'B is better' overall, but Response A "
	                f"scored higher on {len(a_wins)}/{n_axes} axes. "
	                f"Axes favouring A: {detail}. "
	                f"Overall averages — A: {avg_a:.2f}, B: {avg_b:.2f}. Consider reviewing your overall verdict."
	            )
	        else:
	            msg = (
	                f"✅ Consistent: 'B is better' aligns with higher per-axis averages "
	                f"(B: {avg_b:.2f} vs A: {avg_a:.2f})."
	            )

	    msg += f"\n\nConfidence: {conf_label} ({level}/3)"

	    return msg, score_table


	def _higher_label(score_a, score_b):
	    """Return the table marker for whichever score is higher, or 'Tie'."""
	    if score_a > score_b:
	        return "A ✓"
	    if score_b > score_a:
	        return "B ✓"
	    return "Tie"


	# ─────────────────────────────────────────────
	# TAB 2 LOGIC: Content Policy Rater
	# ─────────────────────────────────────────────

	# Verdicts ordered from least to most severe; the list index equals the rank.
	VERDICTS = ["PASS", "FLAG", "BLOCK"]
	VERDICT_RANK = {"PASS": 0, "FLAG": 1, "BLOCK": 2}
	VERDICT_EMOJI = {"PASS": "🟢", "FLAG": "🟡", "BLOCK": "🔴"}

	# Policy criteria, in the order their (verdict, reasoning) pairs are passed in.
	CRITERIA = [
	    "Factual Accuracy",
	    "Safety",
	    "Bias",
	    "PII Exposure",
	    "On-Policy",
	    "Clarity",
	]


	def get_overall_verdict(*args):
	    """Aggregate per-criterion verdicts into one PASS / FLAG / BLOCK verdict.

	    Args:
	        *args: Positional layout is ``content``, then one
	            ``(verdict, reasoning)`` pair per entry of ``CRITERIA``, then the
	            aggregation mode. The mode is "worst_wins", "majority", or
	            anything else, which is treated as "weighted".

	    Returns:
	        A ``(header, table)`` tuple of Markdown strings. When the content is
	        blank, a warning message and an empty table string.

	    Raises:
	        KeyError: If a verdict is not one of ``VERDICTS``.
	    """
	    content = args[0]
	    if not (content or "").strip():
	        return "⚠️ Please enter content to evaluate.", ""

	    # Derive positions from CRITERIA instead of hard-coding 6 and 13.
	    n_criteria = len(CRITERIA)
	    verdicts = list(args[1:1 + 2 * n_criteria:2])
	    reasonings = list(args[2:2 + 2 * n_criteria:2])
	    mode = args[1 + 2 * n_criteria]

	    ranks = [VERDICT_RANK[v] for v in verdicts]

	    if mode == "worst_wins":
	        final_rank = max(ranks)
	        explanation = "worst_wins: The overall verdict equals the most severe individual criterion rating."
	    elif mode == "majority":
	        counts = Counter(verdicts)
	        # On equal counts prefer the more severe verdict, so the outcome does
	        # not depend on the order of the criteria.
	        top_verdict, top_count = max(
	            counts.items(), key=lambda item: (item[1], VERDICT_RANK[item[0]])
	        )
	        final_rank = VERDICT_RANK[top_verdict]
	        explanation = (
	            f"majority: The most frequent verdict ({top_verdict}, "
	            f"{top_count}/{n_criteria} criteria) wins."
	        )
	    else:  # weighted
	        weights = {
	            "Factual Accuracy": 2, "Safety": 3, "Bias": 2,
	            "PII Exposure": 3, "On-Policy": 2, "Clarity": 1,
	        }
	        total_w = sum(weights.values())
	        weighted_score = sum(
	            weights[crit] * rank for crit, rank in zip(CRITERIA, ranks)
	        ) / total_w
	        # NOTE(review): one BLOCK on a 3x criterion with everything else PASS
	        # scores 6/13 ≈ 0.46, which is below the FLAG cut-off — confirm that a
	        # lone Safety/PII BLOCK is meant to pass in this mode.
	        final_rank = 0 if weighted_score < 0.5 else (1 if weighted_score < 1.5 else 2)
	        explanation = (
	            f"weighted: Safety and PII carry 3× weight. "
	            f"Weighted score: {weighted_score:.2f} → {VERDICTS[final_rank]}."
	        )

	    final_verdict = VERDICTS[final_rank]
	    emoji = VERDICT_EMOJI[final_verdict]

	    header = f"## {emoji} Overall Verdict: {final_verdict}\n\n{explanation}\n\n"

	    rows = [
	        f"| {crit} | {VERDICT_EMOJI[verdict]} {verdict} | {_table_cell(reason)} |"
	        for crit, verdict, reason in zip(CRITERIA, verdicts, reasonings)
	    ]
	    table = (
	        "### Per-Criterion Breakdown\n\n"
	        "| Criterion | Verdict | Reasoning |\n"
	        "|-----------|---------|----------|\n"
	        + "\n".join(rows)
	        + "\n"
	    )

	    return header, table


	def _table_cell(text):
	    """Make free text safe for a single Markdown table cell ('—' if blank)."""
	    # Collapse newlines/runs of whitespace and escape pipes so user-entered
	    # reasoning cannot split the row or spill into extra columns.
	    flattened = " ".join((text or "").split())
	    return flattened.replace("|", "\\|") or "—"


	# ─────────────────────────────────────────────
	# TAB 3 LOGIC: Observation vs Inference
	# ─────────────────────────────────────────────

	INFERENCE_SIGNALS = [
	    # (signal phrase, example clean alternative)
	    # Longer phrases come before their prefixes ("looks like" before "looks")
	    # so a phrase is reported once, under its most specific signal.
	    ("seems", "Describe exactly what you see, not what it suggests."),
	    ("appears", "Describe exactly what you see, not what it suggests."),
	    ("looks like", "Describe the specific visual or measurable property instead."),
	    ("looks", "Describe the specific visual or measurable property instead."),
	    ("probably", "Remove speculation — state only what is directly observable."),
	    ("unlikely", "Remove speculation — state only what is directly observable."),
	    ("likely", "Remove speculation — state only what is directly observable."),
	    ("might", "Remove hedging — state only what is directly observable."),
	    ("may", "Remove hedging — state only what is directly observable."),
	    ("should", "Avoid prescriptive language in an observation."),
	    ("is bad", "Describe the specific measurable problem, not a judgment."),
	    ("is good", "Describe the specific measurable quality, not a judgment."),
	    ("is wrong", "Describe the exact discrepancy observed."),
	    ("is broken", "Describe what specifically does not work as expected."),
	    ("is inconsistent", "Specify the exact values or positions that differ."),
	    ("unclear", "Describe what specific information is missing or ambiguous."),
	    ("confusing", "Describe the specific element that causes confusion."),
	    ("feels", "Feelings are inferences. Describe the observable trigger instead."),
	    ("indicates", "'Indicates' draws a conclusion. State the raw signal only."),
	    ("suggests", "'Suggests' draws a conclusion. State the raw signal only."),
	    ("implies", "'Implies' draws a conclusion. State the raw signal only."),
	    ("because", "Causal claims belong in the inference, not the observation."),
	]

	# Whole-word/phrase matchers: "may" is found in "it may." but not in "dismay".
	_SIGNAL_PATTERNS = [
	    (sig, tip, re.compile(r"(?<!\w)" + re.escape(sig) + r"(?!\w)"))
	    for sig, tip in INFERENCE_SIGNALS
	]

	# Reasoning vocabulary expected in an inference, including plain verb forms
	# ("suggest", "indicate") and matched on word boundaries so "also" is not "so".
	_REASONING_PATTERN = re.compile(
	    r"(?<!\w)(?:because|therefore|so|thus|likely|probably"
	    r"|indicat(?:e|es|ed|ing)|suggest(?:s|ed|ing)?|mean(?:s|t)?"
	    r"|impl(?:y|ies|ied)|conclud(?:e|es|ed|ing))(?!\w)"
	)

	# Built from explicit "\n" so the Markdown table never picks up source-file
	# indentation, which would turn it into a code block.
	_REFERENCE_EXAMPLES = (
	    "\n---\n\n"
	    "### Reference: Good vs Bad Examples\n\n"
	    "| # | ❌ Contaminated Observation | ✅ Clean Observation |\n"
	    "|---|----------------------------|----------------------|\n"
	    '| 1 | "The button looks inconsistent with the rest of the UI." | "The Save button is 8 px lower than the Cancel button; Save uses Inter 14px, Cancel uses Inter 16px." |\n'
	    '| 2 | "The error message is confusing." | "The error message reads \'Error 403\' with no additional context and no retry option." |\n'
	    '| 3 | "The response seems off-topic." | "The response does not mention the word \'Python\' despite the prompt asking for a Python code example." |\n'
	    "\n"
	    "Rule of thumb: If you can't photograph or measure it, it's probably an inference.\n"
	)


	def _find_inference_signals(text):
	    """Return the (signal, tip) pairs whose phrase occurs in *text*."""
	    remaining = text.lower()
	    found = []
	    for sig, tip, pattern in _SIGNAL_PATTERNS:
	        # Blank out each hit so a shorter signal cannot re-match inside it.
	        remaining, hits = pattern.subn("\0", remaining)
	        if hits:
	            found.append((sig, tip))
	    return found


	def analyze_obs_inf(observation, inference):
	    """Flag inference language in an observation and review the inference.

	    Args:
	        observation: Text that should describe only what was seen or measured.
	        inference: Optional conclusion drawn from the observation. Blank
	            means no inference feedback is added.

	    Returns:
	        A ``(analysis, examples)`` tuple of Markdown strings. When the
	        observation is blank, a warning message and an empty examples string.
	    """
	    if not (observation or "").strip():
	        return "⚠️ Please enter an observation.", ""

	    found = _find_inference_signals(observation)

	    if not found:
	        obs_result = (
	            "✅ Clean observation — specific and factual. "
	            "No inference language detected."
	        )
	    else:
	        # Only the first three signals are listed; the count covers them all.
	        issues = "\n".join(f"- '{sig}' — {tip}" for sig, tip in found[:3])
	        obs_result = (
	            f"⚠️ Observation contains inference language ({len(found)} signal(s) found):\n\n"
	            + issues
	            + "\n\nTip: An observation should answer 'What did you literally see/measure?' — "
	            "no judgments, no causes, no speculation."
	        )

	    inf_result = ""
	    if (inference or "").strip():
	        # An inference is expected to carry reasoning words; flag it if bare.
	        if _REASONING_PATTERN.search(inference.lower()):
	            inf_result = "\n\n✅ Inference — contains reasoning language, which is appropriate here."
	        else:
	            inf_result = (
	                "\n\n💡 Inference tip: Your inference reads like a bare statement. "
	                "Strong inferences explain why — try adding 'because', 'therefore', or 'this suggests'."
	            )

	    return obs_result + inf_result, _REFERENCE_EXAMPLES


	# ─────────────────────────────────────────────
	# BUILD THE GRADIO APP
	# ─────────────────────────────────────────────

	# HTML banner passed to gr.HTML at the top of the app: title, one-line
	# description, and the author's profile links.
	HEADER = """
	<div style="text-align:center; padding: 16px 0 8px 0;">
	<h1 style="font-size:2rem; margin-bottom:4px;">🎯 AI Evaluation Toolkit</h1>
	<p style="color:#666; font-size:0.95rem;">
	Interactive demos of AI training data quality-control workflows.<br>
	Built by <a href="https://github.com/LaelaZorana" target="_blank">Laela Zorana</a> ·
	<a href="https://huggingface.co/LaelaZ" target="_blank">HuggingFace</a> ·
	<a href="https://kaggle.com/laelazorana" target="_blank">Kaggle</a>
	</p>
	</div>
	"""

	# Three-tab UI. Each tab collects inputs, wires one button to its logic
	# function, and offers pre-filled examples for the same inputs.
	with gr.Blocks(title="AI Evaluation Toolkit", theme=gr.themes.Soft()) as demo:
	    gr.HTML(HEADER)

	    with gr.Tabs():

	        # ── TAB 1 ──────────────────────────────────────────────────────────
	        with gr.Tab("⚖️ RLHF Pairwise Rater"):
	            gr.Markdown(
	                "Rate two AI responses on four quality axes, then check whether your overall "
	                "preference is consistent with your per-axis scores."
	            )
	            prompt_box = gr.Textbox(
	                label="Prompt / Task",
	                placeholder="e.g. Explain gradient descent in simple terms.",
	                lines=2,
	            )
	            with gr.Row():
	                resp_a = gr.Textbox(label="Response A", lines=6,
	                                    placeholder="Paste Response A here…")
	                resp_b = gr.Textbox(label="Response B", lines=6,
	                                    placeholder="Paste Response B here…")

	            gr.Markdown("#### Axis Ratings (1 = Poor · 5 = Excellent)")
	            with gr.Row():
	                with gr.Column():
	                    gr.Markdown("Response A")
	                    help_a = gr.Slider(1, 5, value=3, step=1, label="Helpfulness A")
	                    harm_a = gr.Slider(1, 5, value=3, step=1, label="Harmlessness A")
	                    acc_a = gr.Slider(1, 5, value=3, step=1, label="Accuracy A")
	                    inst_a = gr.Slider(1, 5, value=3, step=1, label="Instruction-Following A")
	                with gr.Column():
	                    gr.Markdown("Response B")
	                    help_b = gr.Slider(1, 5, value=3, step=1, label="Helpfulness B")
	                    harm_b = gr.Slider(1, 5, value=3, step=1, label="Harmlessness B")
	                    acc_b = gr.Slider(1, 5, value=3, step=1, label="Accuracy B")
	                    inst_b = gr.Slider(1, 5, value=3, step=1, label="Instruction-Following B")

	            gr.Markdown("#### Overall Judgment")
	            with gr.Row():
	                preference = gr.Radio(
	                    ["A is better", "B is better", "Tie"],
	                    label="Overall Preference",
	                    value="Tie",
	                )
	                confidence = gr.Slider(1, 3, value=2, step=1,
	                                       label="Confidence (1=Low, 2=Medium, 3=High)")

	            check_btn = gr.Button("Check Consistency", variant="primary")

	            consistency_out = gr.Markdown(label="Consistency Check")
	            table_out = gr.Markdown(label="Score Summary")

	            # One list shared by the click handler and the examples so the two
	            # cannot drift out of the order check_consistency expects.
	            rater_inputs = [
	                prompt_box, resp_a, resp_b,
	                help_a, harm_a, acc_a, inst_a,
	                help_b, harm_b, acc_b, inst_b,
	                preference, confidence,
	            ]

	            check_btn.click(
	                check_consistency,
	                inputs=rater_inputs,
	                outputs=[consistency_out, table_out],
	            )

	            gr.Examples(
	                examples=[
	                    [
	                        "Explain gradient descent simply.",
	                        "Gradient descent is an optimization algorithm that minimizes a loss function by iteratively moving in the direction of steepest descent as defined by the negative of the gradient.",
	                        "Imagine you're lost on a foggy mountain and want to reach the valley. Each step you take downhill is gradient descent — you keep moving in whichever direction is steepest until you can't go lower.",
	                        4, 5, 4, 4,
	                        5, 5, 3, 5,
	                        "A is better", 2,
	                    ]
	                ],
	                inputs=rater_inputs,
	                label="Example: spot the inconsistency",
	            )

	        # ── TAB 2 ──────────────────────────────────────────────────────────
	        with gr.Tab("📋 Content Policy Rater"):
	            gr.Markdown(
	                "Score a piece of content against six policy criteria, then aggregate to an "
	                "overall PASS / FLAG / BLOCK verdict using one of three aggregation rules."
	            )
	            content_box = gr.Textbox(
	                label="Content to Evaluate",
	                lines=5,
	                placeholder="Paste the AI-generated content here…",
	            )

	            gr.Markdown("#### Per-Criterion Ratings")

	            # Flat [verdict, reasoning, verdict, reasoning, ...] list in
	            # CRITERIA order — the layout get_overall_verdict unpacks.
	            criterion_inputs = []
	            for crit in CRITERIA:
	                with gr.Row():
	                    verdict_dd = gr.Dropdown(
	                        choices=VERDICTS, value="PASS",
	                        label=f"{crit} — Verdict", scale=1,
	                    )
	                    reason_tb = gr.Textbox(
	                        label=f"{crit} — Reasoning",
	                        placeholder="Brief justification…",
	                        scale=3,
	                    )
	                criterion_inputs.extend([verdict_dd, reason_tb])

	            agg_mode = gr.Radio(
	                ["worst_wins", "majority", "weighted"],
	                label="Aggregation Mode",
	                value="worst_wins",
	                info=(
	                    "worst_wins = most severe criterion wins | "
	                    "majority = most common verdict | "
	                    "weighted = Safety & PII get 3× weight"
	                ),
	            )

	            verdict_btn = gr.Button("Get Overall Verdict", variant="primary")
	            verdict_out = gr.Markdown(label="Overall Verdict")
	            criterion_table = gr.Markdown(label="Per-Criterion Breakdown")

	            policy_inputs = [content_box] + criterion_inputs + [agg_mode]

	            verdict_btn.click(
	                get_overall_verdict,
	                inputs=policy_inputs,
	                outputs=[verdict_out, criterion_table],
	            )

	            gr.Examples(
	                examples=[[
	                    "To make a profit, you should invest in index funds, which historically return ~7% annually after inflation. Past performance doesn't guarantee future results — John Smith at 123 Main St made $200k last year.",
	                    "PASS", "Historically accurate.",
	                    "FLAG", "No direct harm, but financial advice disclaimer missing.",
	                    "PASS", "No biased framing detected.",
	                    "BLOCK", "Contains full name and street address of a real-sounding individual.",
	                    "FLAG", "Financial advice without credentials disclaimer violates policy.",
	                    "PASS", "Clear and readable.",
	                    "worst_wins",
	                ]],
	                inputs=policy_inputs,
	                label="Example: PII forces a BLOCK",
	            )

	        # ── TAB 3 ──────────────────────────────────────────────────────────
	        with gr.Tab("🔬 Observation vs Inference"):
	            gr.Markdown(
	                "Practice keeping observations clean (factual, specific, no conclusions embedded) "
	                "and inferences grounded (explicitly tied to what was observed). "
	                "This discipline is core to high-quality AI evaluation and bug reporting."
	            )
	            with gr.Row():
	                obs_box = gr.Textbox(
	                    label="What did you observe?",
	                    lines=4,
	                    placeholder="e.g. The modal dialog closes immediately after opening without any user interaction.",
	                    scale=1,
	                )
	                inf_box = gr.Textbox(
	                    label="What do you conclude from it? (optional)",
	                    lines=4,
	                    placeholder="e.g. This suggests the dismiss event fires on mount rather than on user action.",
	                    scale=1,
	                )

	            analyze_btn = gr.Button("Analyze", variant="primary")
	            analysis_out = gr.Markdown(label="Analysis")
	            examples_out = gr.Markdown(label="Reference Examples")

	            analyze_btn.click(
	                analyze_obs_inf,
	                inputs=[obs_box, inf_box],
	                outputs=[analysis_out, examples_out],
	            )

	            gr.Examples(
	                examples=[
	                    [
	                        "The button looks inconsistent with the rest of the UI.",
	                        "It probably wasn't designed by the same person.",
	                    ],
	                    [
	                        "The Save button is 8 px lower than the Cancel button; Save uses Inter 14px Bold, Cancel uses Inter 16px Regular.",
	                        "The vertical misalignment and font inconsistency suggest the two buttons were added in separate PRs without a shared spacing token.",
	                    ],
	                    [
	                        "The model's response seems off-topic and confusing.",
	                        "",
	                    ],
	                ],
	                inputs=[obs_box, inf_box],
	                label="Try these examples",
	            )

	# Start the server only when run as a script, so importing this module
	# (e.g. to test the logic functions) does not launch the app.
	if __name__ == "__main__":
	    demo.launch()