"""
Benchmark Builder
Create small, auditable multiple-choice evaluation datasets for LLMs.
"""
import json
import os
import re
import sys
from typing import Dict, List, Tuple
import gradio as gr
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from shared.components import create_footer, create_method_panel, create_premium_hero
try:
from huggingface_hub import InferenceClient
except Exception: # pragma: no cover - optional on local machines
InferenceClient = None
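# One seed item populates the preview on first load; it also seeds the session
# state and is the fallback payload for exports if the state list is ever empty.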
SEED_QUESTIONS = [
{
"question": "Which retrieval signal is strongest when a user query uses rare exact terms?",
"correct_answer": "BM25 lexical matching",
"distractors": [
"Random negative sampling",
"Temperature scaling",
"Decoder-only attention masking",
],
"subject": "Information Retrieval",
"difficulty": "Medium",
"rationale": "BM25 rewards rare exact terms through inverse document frequency.",
}
]
DOMAIN_DISTRACTORS = {
"Machine Learning": [
"dropout regularization",
"batch normalization",
"cosine learning-rate decay",
"gradient clipping",
"early stopping",
"teacher forcing",
],
"Information Retrieval": [
"dense vector search",
"query expansion",
"reciprocal rank fusion",
"cross-encoder reranking",
"metadata filtering",
"semantic chunking",
],
"AI Safety": [
"output filtering",
"least-privilege tool access",
"prompt isolation",
"red-team evaluation",
"policy classification",
"adversarial testing",
],
"Data Engineering": [
"schema validation",
"deduplication",
"entity resolution",
"partition pruning",
"incremental backfills",
"lineage tracking",
],
}
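# Distractor generation is two-stage: ask the hosted model via _hf_generate first,
# then fall back to the curated pools above when no token is configured or the
# model response cannot be parsed into exactly three options.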
def _hf_generate(prompt: str) -> List[str]:
"""Use HF Inference when configured; otherwise return an empty list."""
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
if not token or InferenceClient is None:
return []
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=token)
    try:
        response = client.text_generation(prompt, max_new_tokens=220, temperature=0.35)
    except Exception:  # network or model errors fall back to the local heuristic
        return []
match = re.search(r"\[[\s\S]*\]", response)
if not match:
return []
try:
parsed = json.loads(match.group(0))
except json.JSONDecodeError:
return []
return [str(item).strip() for item in parsed if str(item).strip()][:3]
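# Deterministic fallback: keep pool entries that share no words with the correct
# answer, pad with generic fillers if fewer than three survive, and capitalize
# the options when the stem reads as a which/what question.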
def _fallback_distractors(question: str, correct_answer: str, subject: str) -> List[str]:
pool = DOMAIN_DISTRACTORS.get(subject, DOMAIN_DISTRACTORS["Machine Learning"])
answer_terms = {term.lower() for term in re.findall(r"[A-Za-z][A-Za-z\-]+", correct_answer)}
chosen = []
for candidate in pool:
candidate_terms = {term.lower() for term in re.findall(r"[A-Za-z][A-Za-z\-]+", candidate)}
if candidate.lower() != correct_answer.lower() and not answer_terms.intersection(candidate_terms):
chosen.append(candidate)
if len(chosen) == 3:
break
if len(chosen) < 3:
chosen.extend(["a plausible baseline method", "a purely random heuristic", "a manual review process"])
if re.search(r"\bwhich\b|\bwhat\b", question.lower()):
return [item[:1].upper() + item[1:] for item in chosen[:3]]
return chosen[:3]
def generate_distractors(question: str, correct_answer: str, subject: str, difficulty: str) -> Tuple[List[str], str]:
prompt = f"""
You create benchmark-quality multiple-choice distractors.
Return only a JSON array of exactly 3 short wrong answers.
Subject: {subject}
Difficulty: {difficulty}
Question: {question}
Correct answer: {correct_answer}
Distractors must be plausible, mutually distinct, and not reveal the answer.
"""
generated = _hf_generate(prompt)
source = "HF Inference model" if len(generated) == 3 else "deterministic local heuristic"
distractors = generated if len(generated) == 3 else _fallback_distractors(question, correct_answer, subject)
return distractors, source
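# Lightweight audit: flag duplicate options, answer-term leakage into distractors,
# large length imbalance between choices, and vague or non-question stems.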
def audit_question(question: str, correct_answer: str, distractors: List[str]) -> List[Dict[str, str]]:
options = [correct_answer] + distractors
lower_options = [option.strip().lower() for option in options]
checks = []
checks.append({
"check": "Duplicate options",
"result": "pass" if len(set(lower_options)) == len(lower_options) else "review",
"detail": "All answer choices are unique." if len(set(lower_options)) == len(lower_options) else "At least two choices are identical.",
})
answer_words = set(re.findall(r"[a-zA-Z]{4,}", correct_answer.lower()))
leaked = any(answer_words.intersection(set(re.findall(r"[a-zA-Z]{4,}", d.lower()))) for d in distractors)
checks.append({
"check": "Answer leakage",
"result": "review" if leaked else "pass",
"detail": "A distractor shares key answer terms." if leaked else "Distractors avoid obvious answer words.",
})
lengths = [len(option) for option in options]
balanced = max(lengths) - min(lengths) <= max(18, len(correct_answer))
checks.append({
"check": "Length balance",
"result": "pass" if balanced else "review",
"detail": "Choices have comparable length." if balanced else "One option is much longer or shorter than the rest.",
})
stem_ok = len(question.strip()) >= 24 and question.strip().endswith("?")
checks.append({
"check": "Question stem",
"result": "pass" if stem_ok else "review",
"detail": "Stem is specific and phrased as a question." if stem_ok else "Make the stem more specific and end it with a question mark.",
})
return checks
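# Preview rendering: the correct answer is always listed first (option A) and
# highlighted; exported rows keep correct_answer and distractors as separate
# fields, so downstream consumers can shuffle option order at load time.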
def render_question(question_data: Dict[str, object]) -> str:
letters = "ABCD"
options = [question_data["correct_answer"]] + question_data["distractors"]
option_html = ""
for idx, option in enumerate(options):
is_answer = option == question_data["correct_answer"]
option_html += f"""
<div class="info-card" style="margin:0.55rem 0; border-left:4px solid {'#22c55e' if is_answer else '#e5e7eb'} !important;">
<strong>{letters[idx]}.</strong> {option}
{'<span style="float:right; color:#15803d; font-weight:800;">answer</span>' if is_answer else ''}
</div>
"""
return f"""
<div class="info-card">
<p style="margin:0 0 0.4rem 0; color:#e8935c; font-weight:800;">{question_data['subject']} · {question_data['difficulty']}</p>
<h3 style="margin-top:0;">{question_data['question']}</h3>
{option_html}
<p><strong>Rationale:</strong> {question_data['rationale']}</p>
<p><strong>Distractor source:</strong> {question_data['source']}</p>
</div>
"""
def render_audit(checks: List[Dict[str, str]]) -> str:
lines = ["| Check | Result | Detail |", "|---|---|---|"]
for check in checks:
badge = "Pass" if check["result"] == "pass" else "Review"
lines.append(f"| {check['check']} | {badge} | {check['detail']} |")
return "\n".join(lines)
def add_question(
question: str,
correct_answer: str,
subject: str,
difficulty: str,
rationale: str,
state: List[Dict[str, object]],
):
if not question or not correct_answer:
return state, "Add a question and correct answer first.", "", ""
distractors, source = generate_distractors(question, correct_answer, subject, difficulty)
item = {
"question": question.strip(),
"correct_answer": correct_answer.strip(),
"distractors": distractors,
"subject": subject,
"difficulty": difficulty,
"rationale": rationale.strip() or "Add a short rationale before publishing this benchmark.",
"source": source,
}
next_state = [*state, item]
checks = audit_question(item["question"], item["correct_answer"], item["distractors"])
status = f"Dataset now has {len(next_state)} questions. Review flags before publishing."
return next_state, status, render_question(item), render_audit(checks)
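# Export formats: a JSON bundle (default), JSONL with one record per line, or a
# ready-to-run datasets push script; the dataset name is first reduced to
# Hub-safe characters.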
def export_benchmark(benchmark_name: str, state: List[Dict[str, object]], export_format: str) -> str:
safe_name = re.sub(r"[^a-zA-Z0-9_\-/]", "-", benchmark_name.strip() or "my-eval-benchmark")
rows = state or SEED_QUESTIONS
if export_format == "JSONL":
return "\n".join(json.dumps(row, ensure_ascii=False) for row in rows)
if export_format == "HF Dataset Script":
return f"""from datasets import Dataset
rows = {json.dumps(rows, indent=2)}
dataset = Dataset.from_list(rows)
dataset.push_to_hub("{safe_name}")
"""
return json.dumps({"name": safe_name, "questions": rows, "total": len(rows)}, indent=2)
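# UI layout: hero and method panel, an authoring column on the left, a live
# preview/audit column on the right, then an export row and a short rationale.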
with gr.Blocks(title="Benchmark Builder", theme=gr.themes.Soft()) as app:
state = gr.State(SEED_QUESTIONS)
create_premium_hero(
"Benchmark Builder",
"Design small, inspectable evaluation sets with plausible distractors, quality checks, and Hugging Face Dataset export.",
"📊",
badge="Evaluation Engineering",
highlights=["Optional HF inference", "Distractor audit", "Dataset push script"],
)
create_method_panel({
"Technique": "LLM-assisted benchmark authoring with deterministic guardrails.",
"What it proves": "You understand evaluation data quality, not just prompt generation.",
"HF capability": "Ready to publish as a Dataset and evaluate models on the Hub.",
})
with gr.Row():
with gr.Column(scale=1):
question_input = gr.Textbox(
label="Question",
value="Which retrieval signal is strongest when a user query uses rare exact terms?",
lines=3,
)
answer_input = gr.Textbox(label="Correct answer", value="BM25 lexical matching")
subject = gr.Dropdown(
choices=list(DOMAIN_DISTRACTORS.keys()),
value="Information Retrieval",
label="Subject",
)
difficulty = gr.Radio(["Easy", "Medium", "Hard"], value="Medium", label="Difficulty")
rationale = gr.Textbox(
label="Rationale",
value="BM25 rewards rare exact terms through inverse document frequency.",
lines=2,
)
add_btn = gr.Button("Generate Distractors + Audit", variant="primary")
with gr.Column(scale=1):
status_output = gr.Markdown("Add a question to generate a benchmark-ready item.")
preview_output = gr.HTML(render_question({**SEED_QUESTIONS[0], "source": "seed example"}))
audit_output = gr.Markdown(render_audit(audit_question(
SEED_QUESTIONS[0]["question"],
SEED_QUESTIONS[0]["correct_answer"],
SEED_QUESTIONS[0]["distractors"],
)))
add_btn.click(
add_question,
inputs=[question_input, answer_input, subject, difficulty, rationale, state],
outputs=[state, status_output, preview_output, audit_output],
)
gr.Markdown("## Export")
with gr.Row():
benchmark_name = gr.Textbox(label="Hub dataset name", value="username/retrieval-mini-eval")
export_format = gr.Dropdown(["JSON", "JSONL", "HF Dataset Script"], value="JSON", label="Format")
export_btn = gr.Button("Generate Export", variant="secondary")
export_output = gr.Code(label="Benchmark artifact", language="python", lines=16)
export_btn.click(export_benchmark, inputs=[benchmark_name, state, export_format], outputs=export_output)
gr.Markdown("""
## Why This Is Useful
Evaluation sets fail quietly when distractors are weak or duplicated, or when they reveal the answer. This Space teaches a better workflow: generate candidates, audit them, keep rationales, and publish the result as a versioned Hugging Face Dataset.
""")
create_footer("Benchmark Builder")
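# Consuming an exported JSONL file offline (a minimal sketch; the file name is
# illustrative and not produced by this app):
#
#     from datasets import load_dataset
#     eval_set = load_dataset("json", data_files="retrieval-mini-eval.jsonl", split="train")
#     for row in eval_set:
#         print(row["question"], "->", row["correct_answer"])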
if __name__ == "__main__":
app.launch(server_name="0.0.0.0", server_port=7860)