""" Benchmark Builder Create small, auditable multiple-choice evaluation datasets for LLMs. """ import json import os import re import sys from typing import Dict, List, Tuple import gradio as gr sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from shared.components import create_footer, create_method_panel, create_premium_hero try: from huggingface_hub import InferenceClient except Exception: # pragma: no cover - optional on local machines InferenceClient = None SEED_QUESTIONS = [ { "question": "Which retrieval signal is strongest when a user query uses rare exact terms?", "correct_answer": "BM25 lexical matching", "distractors": [ "Random negative sampling", "Temperature scaling", "Decoder-only attention masking", ], "subject": "Information Retrieval", "difficulty": "Medium", "rationale": "BM25 rewards rare exact terms through inverse document frequency.", } ] DOMAIN_DISTRACTORS = { "Machine Learning": [ "dropout regularization", "batch normalization", "cosine learning-rate decay", "gradient clipping", "early stopping", "teacher forcing", ], "Information Retrieval": [ "dense vector search", "query expansion", "reciprocal rank fusion", "cross-encoder reranking", "metadata filtering", "semantic chunking", ], "AI Safety": [ "output filtering", "least-privilege tool access", "prompt isolation", "red-team evaluation", "policy classification", "adversarial testing", ], "Data Engineering": [ "schema validation", "deduplication", "entity resolution", "partition pruning", "incremental backfills", "lineage tracking", ], } def _hf_generate(prompt: str) -> List[str]: """Use HF Inference when configured; otherwise return an empty list.""" token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN") if not token or InferenceClient is None: return [] client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=token) response = client.text_generation(prompt, max_new_tokens=220, temperature=0.35) match = re.search(r"\[[\s\S]*\]", response) if not match: return [] try: parsed = json.loads(match.group(0)) except json.JSONDecodeError: return [] return [str(item).strip() for item in parsed if str(item).strip()][:3] def _fallback_distractors(question: str, correct_answer: str, subject: str) -> List[str]: pool = DOMAIN_DISTRACTORS.get(subject, DOMAIN_DISTRACTORS["Machine Learning"]) answer_terms = {term.lower() for term in re.findall(r"[A-Za-z][A-Za-z\-]+", correct_answer)} chosen = [] for candidate in pool: candidate_terms = {term.lower() for term in re.findall(r"[A-Za-z][A-Za-z\-]+", candidate)} if candidate.lower() != correct_answer.lower() and not answer_terms.intersection(candidate_terms): chosen.append(candidate) if len(chosen) == 3: break if len(chosen) < 3: chosen.extend(["a plausible baseline method", "a purely random heuristic", "a manual review process"]) if re.search(r"\bwhich\b|\bwhat\b", question.lower()): return [item[:1].upper() + item[1:] for item in chosen[:3]] return chosen[:3] def generate_distractors(question: str, correct_answer: str, subject: str, difficulty: str) -> Tuple[List[str], str]: prompt = f""" You create benchmark-quality multiple-choice distractors. Return only a JSON array of exactly 3 short wrong answers. Subject: {subject} Difficulty: {difficulty} Question: {question} Correct answer: {correct_answer} Distractors must be plausible, mutually distinct, and not reveal the answer. 
""" generated = _hf_generate(prompt) source = "HF Inference model" if len(generated) == 3 else "deterministic local heuristic" distractors = generated if len(generated) == 3 else _fallback_distractors(question, correct_answer, subject) return distractors, source def audit_question(question: str, correct_answer: str, distractors: List[str]) -> List[Dict[str, str]]: options = [correct_answer] + distractors lower_options = [option.strip().lower() for option in options] checks = [] checks.append({ "check": "Duplicate options", "result": "pass" if len(set(lower_options)) == len(lower_options) else "review", "detail": "All answer choices are unique." if len(set(lower_options)) == len(lower_options) else "At least two choices are identical.", }) answer_words = set(re.findall(r"[a-zA-Z]{4,}", correct_answer.lower())) leaked = any(answer_words.intersection(set(re.findall(r"[a-zA-Z]{4,}", d.lower()))) for d in distractors) checks.append({ "check": "Answer leakage", "result": "review" if leaked else "pass", "detail": "A distractor shares key answer terms." if leaked else "Distractors avoid obvious answer words.", }) lengths = [len(option) for option in options] balanced = max(lengths) - min(lengths) <= max(18, len(correct_answer)) checks.append({ "check": "Length balance", "result": "pass" if balanced else "review", "detail": "Choices have comparable length." if balanced else "One option is much longer or shorter than the rest.", }) stem_ok = len(question.strip()) >= 24 and question.strip().endswith("?") checks.append({ "check": "Question stem", "result": "pass" if stem_ok else "review", "detail": "Stem is specific and phrased as a question." if stem_ok else "Make the stem more specific and end it with a question mark.", }) return checks def render_question(question_data: Dict[str, object]) -> str: letters = "ABCD" options = [question_data["correct_answer"]] + question_data["distractors"] option_html = "" for idx, option in enumerate(options): is_answer = option == question_data["correct_answer"] option_html += f"""


def render_question(question_data: Dict[str, object]) -> str:
    """Render one benchmark item as a simple HTML card for gr.HTML.

    The markup here is deliberately minimal: plain divs around the subject line,
    question stem, options, rationale, and distractor source.
    """
    letters = "ABCD"
    options = [question_data["correct_answer"]] + question_data["distractors"]
    option_html = ""
    for idx, option in enumerate(options):
        is_answer = option == question_data["correct_answer"]
        option_html += f"""
        <div>{letters[idx]}. {option} {'answer' if is_answer else ''}</div>
        """
    return f"""
    <div>
        <div>{question_data['subject']} · {question_data['difficulty']}</div>
        <div>{question_data['question']}</div>
        {option_html}
        <div>Rationale: {question_data['rationale']}</div>
        <div>Distractor source: {question_data['source']}</div>
    </div>
    """
""" def render_audit(checks: List[Dict[str, str]]) -> str: lines = ["| Check | Result | Detail |", "|---|---|---|"] for check in checks: badge = "Pass" if check["result"] == "pass" else "Review" lines.append(f"| {check['check']} | {badge} | {check['detail']} |") return "\n".join(lines) def add_question( question: str, correct_answer: str, subject: str, difficulty: str, rationale: str, state: List[Dict[str, object]], ): if not question or not correct_answer: return state, "Add a question and correct answer first.", "", "" distractors, source = generate_distractors(question, correct_answer, subject, difficulty) item = { "question": question.strip(), "correct_answer": correct_answer.strip(), "distractors": distractors, "subject": subject, "difficulty": difficulty, "rationale": rationale.strip() or "Add a short rationale before publishing this benchmark.", "source": source, } next_state = [*state, item] checks = audit_question(item["question"], item["correct_answer"], item["distractors"]) status = f"Dataset now has {len(next_state)} questions. Review flags before publishing." return next_state, status, render_question(item), render_audit(checks) def export_benchmark(benchmark_name: str, state: List[Dict[str, object]], export_format: str) -> str: safe_name = re.sub(r"[^a-zA-Z0-9_\-/]", "-", benchmark_name.strip() or "my-eval-benchmark") rows = state or SEED_QUESTIONS if export_format == "JSONL": return "\n".join(json.dumps(row, ensure_ascii=False) for row in rows) if export_format == "HF Dataset Script": return f"""from datasets import Dataset rows = {json.dumps(rows, indent=2)} dataset = Dataset.from_list(rows) dataset.push_to_hub("{safe_name}") """ return json.dumps({"name": safe_name, "questions": rows, "total": len(rows)}, indent=2) with gr.Blocks(title="Benchmark Builder", theme=gr.themes.Soft()) as app: state = gr.State(SEED_QUESTIONS) create_premium_hero( "Benchmark Builder", "Design small, inspectable evaluation sets with plausible distractors, quality checks, and Hugging Face Dataset export.", "๐Ÿ“Š", badge="Evaluation Engineering", highlights=["Optional HF inference", "Distractor audit", "Dataset push script"], ) create_method_panel({ "Technique": "LLM-assisted benchmark authoring with deterministic guardrails.", "What it proves": "You understand evaluation data quality, not just prompt generation.", "HF capability": "Ready to publish as a Dataset and evaluate models on the Hub.", }) with gr.Row(): with gr.Column(scale=1): question_input = gr.Textbox( label="Question", value="Which retrieval signal is strongest when a user query uses rare exact terms?", lines=3, ) answer_input = gr.Textbox(label="Correct answer", value="BM25 lexical matching") subject = gr.Dropdown( choices=list(DOMAIN_DISTRACTORS.keys()), value="Information Retrieval", label="Subject", ) difficulty = gr.Radio(["Easy", "Medium", "Hard"], value="Medium", label="Difficulty") rationale = gr.Textbox( label="Rationale", value="BM25 rewards rare exact terms through inverse document frequency.", lines=2, ) add_btn = gr.Button("Generate Distractors + Audit", variant="primary") with gr.Column(scale=1): status_output = gr.Markdown("Add a question to generate a benchmark-ready item.") preview_output = gr.HTML(render_question({**SEED_QUESTIONS[0], "source": "seed example"})) audit_output = gr.Markdown(render_audit(audit_question( SEED_QUESTIONS[0]["question"], SEED_QUESTIONS[0]["correct_answer"], SEED_QUESTIONS[0]["distractors"], ))) add_btn.click( add_question, inputs=[question_input, answer_input, subject, difficulty, 


with gr.Blocks(title="Benchmark Builder", theme=gr.themes.Soft()) as app:
    state = gr.State(SEED_QUESTIONS)
    create_premium_hero(
        "Benchmark Builder",
        "Design small, inspectable evaluation sets with plausible distractors, quality checks, and Hugging Face Dataset export.",
        "📊",
        badge="Evaluation Engineering",
        highlights=["Optional HF inference", "Distractor audit", "Dataset push script"],
    )
    create_method_panel({
        "Technique": "LLM-assisted benchmark authoring with deterministic guardrails.",
        "What it proves": "You understand evaluation data quality, not just prompt generation.",
        "HF capability": "Ready to publish as a Dataset and evaluate models on the Hub.",
    })

    with gr.Row():
        with gr.Column(scale=1):
            question_input = gr.Textbox(
                label="Question",
                value="Which retrieval signal is strongest when a user query uses rare exact terms?",
                lines=3,
            )
            answer_input = gr.Textbox(label="Correct answer", value="BM25 lexical matching")
            subject = gr.Dropdown(
                choices=list(DOMAIN_DISTRACTORS.keys()),
                value="Information Retrieval",
                label="Subject",
            )
            difficulty = gr.Radio(["Easy", "Medium", "Hard"], value="Medium", label="Difficulty")
            rationale = gr.Textbox(
                label="Rationale",
                value="BM25 rewards rare exact terms through inverse document frequency.",
                lines=2,
            )
            add_btn = gr.Button("Generate Distractors + Audit", variant="primary")
        with gr.Column(scale=1):
            status_output = gr.Markdown("Add a question to generate a benchmark-ready item.")
            preview_output = gr.HTML(render_question({**SEED_QUESTIONS[0], "source": "seed example"}))
            audit_output = gr.Markdown(render_audit(audit_question(
                SEED_QUESTIONS[0]["question"],
                SEED_QUESTIONS[0]["correct_answer"],
                SEED_QUESTIONS[0]["distractors"],
            )))

    add_btn.click(
        add_question,
        inputs=[question_input, answer_input, subject, difficulty, rationale, state],
        outputs=[state, status_output, preview_output, audit_output],
    )

    gr.Markdown("## Export")
    with gr.Row():
        benchmark_name = gr.Textbox(label="Hub dataset name", value="username/retrieval-mini-eval")
        export_format = gr.Dropdown(["JSON", "JSONL", "HF Dataset Script"], value="JSON", label="Format")
        export_btn = gr.Button("Generate Export", variant="secondary")
    export_output = gr.Code(label="Benchmark artifact", language="python", lines=16)
    export_btn.click(export_benchmark, inputs=[benchmark_name, state, export_format], outputs=export_output)

    gr.Markdown("""
## Why This Is Useful

Evaluation sets fail quietly when distractors are weak, duplicated, or reveal the answer.
This Space teaches a better workflow: generate candidates, audit them, keep rationales,
and publish the result as a versioned Hugging Face Dataset.
""")

    create_footer("Benchmark Builder")

if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)
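
# Local run (illustrative; the module filename is assumed to be app.py):
#   python app.py                  # distractors come from the local heuristic
#   HF_TOKEN=hf_... python app.py  # enables model-generated distractors via HF Inference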