"""
Benchmark Builder
Create small, auditable multiple-choice evaluation datasets for LLMs.
"""
import json
import os
import re
import sys
from typing import Dict, List, Tuple
import gradio as gr
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from shared.components import create_footer, create_method_panel, create_premium_hero
try:
from huggingface_hub import InferenceClient
except Exception: # pragma: no cover - optional on local machines
InferenceClient = None
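# One seed item populates the preview on first load; it also seeds the session
# state and is the fallback payload for exports if the state list is ever empty.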
SEED_QUESTIONS = [
{
"question": "Which retrieval signal is strongest when a user query uses rare exact terms?",
"correct_answer": "BM25 lexical matching",
"distractors": [
"Random negative sampling",
"Temperature scaling",
"Decoder-only attention masking",
],
"subject": "Information Retrieval",
"difficulty": "Medium",
"rationale": "BM25 rewards rare exact terms through inverse document frequency.",
}
]
DOMAIN_DISTRACTORS = {
"Machine Learning": [
"dropout regularization",
"batch normalization",
"cosine learning-rate decay",
"gradient clipping",
"early stopping",
"teacher forcing",
],
"Information Retrieval": [
"dense vector search",
"query expansion",
"reciprocal rank fusion",
"cross-encoder reranking",
"metadata filtering",
"semantic chunking",
],
"AI Safety": [
"output filtering",
"least-privilege tool access",
"prompt isolation",
"red-team evaluation",
"policy classification",
"adversarial testing",
],
"Data Engineering": [
"schema validation",
"deduplication",
"entity resolution",
"partition pruning",
"incremental backfills",
"lineage tracking",
],
}
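# Distractor generation is two-stage: ask the hosted model via _hf_generate first,
# then fall back to the curated pools above when no token is configured or the
# model response cannot be parsed into exactly three options.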
def _hf_generate(prompt: str) -> List[str]:
"""Use HF Inference when configured; otherwise return an empty list."""
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
if not token or InferenceClient is None:
return []
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=token)
    try:
        response = client.text_generation(prompt, max_new_tokens=220, temperature=0.35)
    except Exception:  # network or model errors fall back to the local heuristic
        return []
match = re.search(r"\[[\s\S]*\]", response)
if not match:
return []
try:
parsed = json.loads(match.group(0))
except json.JSONDecodeError:
return []
return [str(item).strip() for item in parsed if str(item).strip()][:3]
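# Deterministic fallback: keep pool entries that share no words with the correct
# answer, pad with generic fillers if fewer than three survive, and capitalize
# the options when the stem reads as a which/what question.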
def _fallback_distractors(question: str, correct_answer: str, subject: str) -> List[str]:
pool = DOMAIN_DISTRACTORS.get(subject, DOMAIN_DISTRACTORS["Machine Learning"])
answer_terms = {term.lower() for term in re.findall(r"[A-Za-z][A-Za-z\-]+", correct_answer)}
chosen = []
for candidate in pool:
candidate_terms = {term.lower() for term in re.findall(r"[A-Za-z][A-Za-z\-]+", candidate)}
if candidate.lower() != correct_answer.lower() and not answer_terms.intersection(candidate_terms):
chosen.append(candidate)
if len(chosen) == 3:
break
if len(chosen) < 3:
chosen.extend(["a plausible baseline method", "a purely random heuristic", "a manual review process"])
if re.search(r"\bwhich\b|\bwhat\b", question.lower()):
return [item[:1].upper() + item[1:] for item in chosen[:3]]
return chosen[:3]
def generate_distractors(question: str, correct_answer: str, subject: str, difficulty: str) -> Tuple[List[str], str]:
prompt = f"""
You create benchmark-quality multiple-choice distractors.
Return only a JSON array of exactly 3 short wrong answers.
Subject: {subject}
Difficulty: {difficulty}
Question: {question}
Correct answer: {correct_answer}
Distractors must be plausible, mutually distinct, and not reveal the answer.
"""
generated = _hf_generate(prompt)
source = "HF Inference model" if len(generated) == 3 else "deterministic local heuristic"
distractors = generated if len(generated) == 3 else _fallback_distractors(question, correct_answer, subject)
return distractors, source
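# Lightweight audit: flag duplicate options, answer-term leakage into distractors,
# large length imbalance between choices, and vague or non-question stems.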
def audit_question(question: str, correct_answer: str, distractors: List[str]) -> List[Dict[str, str]]:
options = [correct_answer] + distractors
lower_options = [option.strip().lower() for option in options]
checks = []
checks.append({
"check": "Duplicate options",
"result": "pass" if len(set(lower_options)) == len(lower_options) else "review",
"detail": "All answer choices are unique." if len(set(lower_options)) == len(lower_options) else "At least two choices are identical.",
})
answer_words = set(re.findall(r"[a-zA-Z]{4,}", correct_answer.lower()))
leaked = any(answer_words.intersection(set(re.findall(r"[a-zA-Z]{4,}", d.lower()))) for d in distractors)
checks.append({
"check": "Answer leakage",
"result": "review" if leaked else "pass",
"detail": "A distractor shares key answer terms." if leaked else "Distractors avoid obvious answer words.",
})
lengths = [len(option) for option in options]
balanced = max(lengths) - min(lengths) <= max(18, len(correct_answer))
checks.append({
"check": "Length balance",
"result": "pass" if balanced else "review",
"detail": "Choices have comparable length." if balanced else "One option is much longer or shorter than the rest.",
})
stem_ok = len(question.strip()) >= 24 and question.strip().endswith("?")
checks.append({
"check": "Question stem",
"result": "pass" if stem_ok else "review",
"detail": "Stem is specific and phrased as a question." if stem_ok else "Make the stem more specific and end it with a question mark.",
})
return checks
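# Preview rendering: the correct answer is always listed first (option A) and
# highlighted; exported rows keep correct_answer and distractors as separate
# fields, so downstream consumers can shuffle option order at load time.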
def render_question(question_data: Dict[str, object]) -> str:
letters = "ABCD"
options = [question_data["correct_answer"]] + question_data["distractors"]
option_html = ""
for idx, option in enumerate(options):
is_answer = option == question_data["correct_answer"]
option_html += f"""
<div class="info-card" style="margin:0.55rem 0; border-left:4px solid {'#22c55e' if is_answer else '#e5e7eb'} !important;">
<strong>{letters[idx]}.</strong> {option}
{'<span style="float:right; color:#15803d; font-weight:800;">answer</span>' if is_answer else ''}
</div>
"""
return f"""
<div class="info-card">
<p style="margin:0 0 0.4rem 0; color:#e8935c; font-weight:800;">{question_data['subject']} · {question_data['difficulty']}</p>
<h3 style="margin-top:0;">{question_data['question']}</h3>
{option_html}
<p><strong>Rationale:</strong> {question_data['rationale']}</p>
<p><strong>Distractor source:</strong> {question_data['source']}</p>
</div>
"""
def render_audit(checks: List[Dict[str, str]]) -> str:
lines = ["| Check | Result | Detail |", "|---|---|---|"]
for check in checks:
badge = "Pass" if check["result"] == "pass" else "Review"
lines.append(f"| {check['check']} | {badge} | {check['detail']} |")
return "\n".join(lines)
def add_question(
question: str,
correct_answer: str,
subject: str,
difficulty: str,
rationale: str,
state: List[Dict[str, object]],
):
if not question or not correct_answer:
return state, "Add a question and correct answer first.", "", ""
distractors, source = generate_distractors(question, correct_answer, subject, difficulty)
item = {
"question": question.strip(),
"correct_answer": correct_answer.strip(),
"distractors": distractors,
"subject": subject,
"difficulty": difficulty,
"rationale": rationale.strip() or "Add a short rationale before publishing this benchmark.",
"source": source,
}
next_state = [*state, item]
checks = audit_question(item["question"], item["correct_answer"], item["distractors"])
status = f"Dataset now has {len(next_state)} questions. Review flags before publishing."
return next_state, status, render_question(item), render_audit(checks)
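# Export formats: a JSON bundle (default), JSONL with one record per line, or a
# ready-to-run datasets push script; the dataset name is first reduced to
# Hub-safe characters.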
def export_benchmark(benchmark_name: str, state: List[Dict[str, object]], export_format: str) -> str:
safe_name = re.sub(r"[^a-zA-Z0-9_\-/]", "-", benchmark_name.strip() or "my-eval-benchmark")
rows = state or SEED_QUESTIONS
if export_format == "JSONL":
return "\n".join(json.dumps(row, ensure_ascii=False) for row in rows)
if export_format == "HF Dataset Script":
return f"""from datasets import Dataset
rows = {json.dumps(rows, indent=2)}
dataset = Dataset.from_list(rows)
dataset.push_to_hub("{safe_name}")
"""
return json.dumps({"name": safe_name, "questions": rows, "total": len(rows)}, indent=2)
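# UI layout: hero and method panel, an authoring column on the left, a live
# preview/audit column on the right, then an export row and a short rationale.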
with gr.Blocks(title="Benchmark Builder", theme=gr.themes.Soft()) as app:
state = gr.State(SEED_QUESTIONS)
create_premium_hero(
"Benchmark Builder",
"Design small, inspectable evaluation sets with plausible distractors, quality checks, and Hugging Face Dataset export.",
"📊",
badge="Evaluation Engineering",
highlights=["Optional HF inference", "Distractor audit", "Dataset push script"],
)
create_method_panel({
"Technique": "LLM-assisted benchmark authoring with deterministic guardrails.",
"What it proves": "You understand evaluation data quality, not just prompt generation.",
"HF capability": "Ready to publish as a Dataset and evaluate models on the Hub.",
})
with gr.Row():
with gr.Column(scale=1):
question_input = gr.Textbox(
label="Question",
value="Which retrieval signal is strongest when a user query uses rare exact terms?",
lines=3,
)
answer_input = gr.Textbox(label="Correct answer", value="BM25 lexical matching")
subject = gr.Dropdown(
choices=list(DOMAIN_DISTRACTORS.keys()),
value="Information Retrieval",
label="Subject",
)
difficulty = gr.Radio(["Easy", "Medium", "Hard"], value="Medium", label="Difficulty")
rationale = gr.Textbox(
label="Rationale",
value="BM25 rewards rare exact terms through inverse document frequency.",
lines=2,
)
add_btn = gr.Button("Generate Distractors + Audit", variant="primary")
with gr.Column(scale=1):
status_output = gr.Markdown("Add a question to generate a benchmark-ready item.")
preview_output = gr.HTML(render_question({**SEED_QUESTIONS[0], "source": "seed example"}))
audit_output = gr.Markdown(render_audit(audit_question(
SEED_QUESTIONS[0]["question"],
SEED_QUESTIONS[0]["correct_answer"],
SEED_QUESTIONS[0]["distractors"],
)))
add_btn.click(
add_question,
inputs=[question_input, answer_input, subject, difficulty, rationale, state],
outputs=[state, status_output, preview_output, audit_output],
)
gr.Markdown("## Export")
with gr.Row():
benchmark_name = gr.Textbox(label="Hub dataset name", value="username/retrieval-mini-eval")
export_format = gr.Dropdown(["JSON", "JSONL", "HF Dataset Script"], value="JSON", label="Format")
export_btn = gr.Button("Generate Export", variant="secondary")
export_output = gr.Code(label="Benchmark artifact", language="python", lines=16)
export_btn.click(export_benchmark, inputs=[benchmark_name, state, export_format], outputs=export_output)
gr.Markdown("""
## Why This Is Useful
Evaluation sets fail quietly when distractors are weak or duplicated, or when they reveal the answer. This Space teaches a better workflow: generate candidates, audit them, keep rationales, and publish the result as a versioned Hugging Face Dataset.
""")
create_footer("Benchmark Builder")
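# Consuming an exported JSONL file offline (a minimal sketch; the file name is
# illustrative and not produced by this app):
#
#     from datasets import load_dataset
#     eval_set = load_dataset("json", data_files="retrieval-mini-eval.jsonl", split="train")
#     for row in eval_set:
#         print(row["question"], "->", row["correct_answer"])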
if __name__ == "__main__":
app.launch(server_name="0.0.0.0", server_port=7860)