| """ | |
| Benchmark Builder | |
| Create small, auditable multiple-choice evaluation datasets for LLMs. | |
| """ | |
| import json | |
| import os | |
| import re | |
| import sys | |
| from typing import Dict, List, Tuple | |
| import gradio as gr | |
| sys.path.append(os.path.join(os.path.dirname(__file__), "..")) | |
| from shared.components import create_footer, create_method_panel, create_premium_hero | |
| try: | |
| from huggingface_hub import InferenceClient | |
| except Exception: # pragma: no cover - optional on local machines | |
| InferenceClient = None | |
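# If huggingface_hub is unavailable or no API token is configured, distractor
# generation degrades to the deterministic DOMAIN_DISTRACTORS pools below
# instead of calling the Inference API.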

SEED_QUESTIONS = [
    {
        "question": "Which retrieval signal is strongest when a user query uses rare exact terms?",
        "correct_answer": "BM25 lexical matching",
        "distractors": [
            "Random negative sampling",
            "Temperature scaling",
            "Decoder-only attention masking",
        ],
        "subject": "Information Retrieval",
        "difficulty": "Medium",
        "rationale": "BM25 rewards rare exact terms through inverse document frequency.",
    }
]

DOMAIN_DISTRACTORS = {
    "Machine Learning": [
        "dropout regularization",
        "batch normalization",
        "cosine learning-rate decay",
        "gradient clipping",
        "early stopping",
        "teacher forcing",
    ],
    "Information Retrieval": [
        "dense vector search",
        "query expansion",
        "reciprocal rank fusion",
        "cross-encoder reranking",
        "metadata filtering",
        "semantic chunking",
    ],
    "AI Safety": [
        "output filtering",
        "least-privilege tool access",
        "prompt isolation",
        "red-team evaluation",
        "policy classification",
        "adversarial testing",
    ],
    "Data Engineering": [
        "schema validation",
        "deduplication",
        "entity resolution",
        "partition pruning",
        "incremental backfills",
        "lineage tracking",
    ],
}
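# Subjects without a dedicated pool fall back to the "Machine Learning" entries
# (see _fallback_distractors below).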

def _hf_generate(prompt: str) -> List[str]:
    """Use HF Inference when configured; otherwise return an empty list."""
    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if not token or InferenceClient is None:
        return []
    client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=token)
    try:
        response = client.text_generation(prompt, max_new_tokens=220, temperature=0.35)
    except Exception:  # network or rate-limit failures: fall back to the local heuristic
        return []
    match = re.search(r"\[[\s\S]*\]", response)
    if not match:
        return []
    try:
        parsed = json.loads(match.group(0))
    except json.JSONDecodeError:
        return []
    return [str(item).strip() for item in parsed if str(item).strip()][:3]

def _fallback_distractors(question: str, correct_answer: str, subject: str) -> List[str]:
    pool = DOMAIN_DISTRACTORS.get(subject, DOMAIN_DISTRACTORS["Machine Learning"])
    answer_terms = {term.lower() for term in re.findall(r"[A-Za-z][A-Za-z\-]+", correct_answer)}
    chosen = []
    for candidate in pool:
        candidate_terms = {term.lower() for term in re.findall(r"[A-Za-z][A-Za-z\-]+", candidate)}
        if candidate.lower() != correct_answer.lower() and not answer_terms.intersection(candidate_terms):
            chosen.append(candidate)
        if len(chosen) == 3:
            break
    if len(chosen) < 3:
        chosen.extend(["a plausible baseline method", "a purely random heuristic", "a manual review process"])
    if re.search(r"\bwhich\b|\bwhat\b", question.lower()):
        return [item[:1].upper() + item[1:] for item in chosen[:3]]
    return chosen[:3]

def generate_distractors(question: str, correct_answer: str, subject: str, difficulty: str) -> Tuple[List[str], str]:
    prompt = f"""
You create benchmark-quality multiple-choice distractors.
Return only a JSON array of exactly 3 short wrong answers.
Subject: {subject}
Difficulty: {difficulty}
Question: {question}
Correct answer: {correct_answer}
Distractors must be plausible, mutually distinct, and not reveal the answer.
"""
    generated = _hf_generate(prompt)
    source = "HF Inference model" if len(generated) == 3 else "deterministic local heuristic"
    distractors = generated if len(generated) == 3 else _fallback_distractors(question, correct_answer, subject)
    return distractors, source
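# Example: with no HF token configured, the seed question resolves through the
# local heuristic and returns roughly
#   (["Dense vector search", "Query expansion", "Reciprocal rank fusion"],
#    "deterministic local heuristic")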

def audit_question(question: str, correct_answer: str, distractors: List[str]) -> List[Dict[str, str]]:
    options = [correct_answer] + distractors
    lower_options = [option.strip().lower() for option in options]
    checks = []
    checks.append({
        "check": "Duplicate options",
        "result": "pass" if len(set(lower_options)) == len(lower_options) else "review",
        "detail": "All answer choices are unique." if len(set(lower_options)) == len(lower_options) else "At least two choices are identical.",
    })
    answer_words = set(re.findall(r"[a-zA-Z]{4,}", correct_answer.lower()))
    leaked = any(answer_words.intersection(set(re.findall(r"[a-zA-Z]{4,}", d.lower()))) for d in distractors)
    checks.append({
        "check": "Answer leakage",
        "result": "review" if leaked else "pass",
        "detail": "A distractor shares key answer terms." if leaked else "Distractors avoid obvious answer words.",
    })
    lengths = [len(option) for option in options]
    balanced = max(lengths) - min(lengths) <= max(18, len(correct_answer))
    checks.append({
        "check": "Length balance",
        "result": "pass" if balanced else "review",
        "detail": "Choices have comparable length." if balanced else "One option is much longer or shorter than the rest.",
    })
    stem_ok = len(question.strip()) >= 24 and question.strip().endswith("?")
    checks.append({
        "check": "Question stem",
        "result": "pass" if stem_ok else "review",
        "detail": "Stem is specific and phrased as a question." if stem_ok else "Make the stem more specific and end it with a question mark.",
    })
    return checks
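# Each audit entry is a plain dict, e.g.:
#   {"check": "Duplicate options", "result": "pass", "detail": "All answer choices are unique."}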

def render_question(question_data: Dict[str, object]) -> str:
    letters = "ABCD"
    options = [question_data["correct_answer"]] + question_data["distractors"]
    option_html = ""
    for idx, option in enumerate(options):
        is_answer = option == question_data["correct_answer"]
        option_html += f"""
        <div class="info-card" style="margin:0.55rem 0; border-left:4px solid {'#22c55e' if is_answer else '#e5e7eb'} !important;">
            <strong>{letters[idx]}.</strong> {option}
            {'<span style="float:right; color:#15803d; font-weight:800;">answer</span>' if is_answer else ''}
        </div>
        """
    return f"""
    <div class="info-card">
        <p style="margin:0 0 0.4rem 0; color:#e8935c; font-weight:800;">{question_data['subject']} · {question_data['difficulty']}</p>
        <h3 style="margin-top:0;">{question_data['question']}</h3>
        {option_html}
        <p><strong>Rationale:</strong> {question_data['rationale']}</p>
        <p><strong>Distractor source:</strong> {question_data['source']}</p>
    </div>
    """

def render_audit(checks: List[Dict[str, str]]) -> str:
    lines = ["| Check | Result | Detail |", "|---|---|---|"]
    for check in checks:
        badge = "Pass" if check["result"] == "pass" else "Review"
        lines.append(f"| {check['check']} | {badge} | {check['detail']} |")
    return "\n".join(lines)

def add_question(
    question: str,
    correct_answer: str,
    subject: str,
    difficulty: str,
    rationale: str,
    state: List[Dict[str, object]],
):
    if not question or not correct_answer:
        return state, "Add a question and correct answer first.", "", ""
    distractors, source = generate_distractors(question, correct_answer, subject, difficulty)
    item = {
        "question": question.strip(),
        "correct_answer": correct_answer.strip(),
        "distractors": distractors,
        "subject": subject,
        "difficulty": difficulty,
        "rationale": rationale.strip() or "Add a short rationale before publishing this benchmark.",
        "source": source,
    }
    next_state = [*state, item]
    checks = audit_question(item["question"], item["correct_answer"], item["distractors"])
    status = f"Dataset now has {len(next_state)} questions. Review flags before publishing."
    return next_state, status, render_question(item), render_audit(checks)

def export_benchmark(benchmark_name: str, state: List[Dict[str, object]], export_format: str) -> str:
    safe_name = re.sub(r"[^a-zA-Z0-9_\-/]", "-", benchmark_name.strip() or "my-eval-benchmark")
    rows = state or SEED_QUESTIONS
    if export_format == "JSONL":
        return "\n".join(json.dumps(row, ensure_ascii=False) for row in rows)
    if export_format == "HF Dataset Script":
        return f"""from datasets import Dataset
rows = {json.dumps(rows, indent=2)}
dataset = Dataset.from_list(rows)
dataset.push_to_hub("{safe_name}")
"""
    return json.dumps({"name": safe_name, "questions": rows, "total": len(rows)}, indent=2)
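# A JSONL export can be loaded back with the `datasets` library (illustrative
# sketch; assumes the artifact was saved locally as "my-eval-benchmark.jsonl"):
#   from datasets import load_dataset
#   eval_set = load_dataset("json", data_files="my-eval-benchmark.jsonl", split="train")
#   print(eval_set[0]["question"])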

with gr.Blocks(title="Benchmark Builder", theme=gr.themes.Soft()) as app:
    state = gr.State(SEED_QUESTIONS)
    create_premium_hero(
        "Benchmark Builder",
        "Design small, inspectable evaluation sets with plausible distractors, quality checks, and Hugging Face Dataset export.",
        "📊",
        badge="Evaluation Engineering",
        highlights=["Optional HF inference", "Distractor audit", "Dataset push script"],
    )
    create_method_panel({
        "Technique": "LLM-assisted benchmark authoring with deterministic guardrails.",
        "What it proves": "You understand evaluation data quality, not just prompt generation.",
        "HF capability": "Ready to publish as a Dataset and evaluate models on the Hub.",
    })
    with gr.Row():
        with gr.Column(scale=1):
            question_input = gr.Textbox(
                label="Question",
                value="Which retrieval signal is strongest when a user query uses rare exact terms?",
                lines=3,
            )
            answer_input = gr.Textbox(label="Correct answer", value="BM25 lexical matching")
            subject = gr.Dropdown(
                choices=list(DOMAIN_DISTRACTORS.keys()),
                value="Information Retrieval",
                label="Subject",
            )
            difficulty = gr.Radio(["Easy", "Medium", "Hard"], value="Medium", label="Difficulty")
            rationale = gr.Textbox(
                label="Rationale",
                value="BM25 rewards rare exact terms through inverse document frequency.",
                lines=2,
            )
            add_btn = gr.Button("Generate Distractors + Audit", variant="primary")
        with gr.Column(scale=1):
            status_output = gr.Markdown("Add a question to generate a benchmark-ready item.")
            preview_output = gr.HTML(render_question({**SEED_QUESTIONS[0], "source": "seed example"}))
            audit_output = gr.Markdown(render_audit(audit_question(
                SEED_QUESTIONS[0]["question"],
                SEED_QUESTIONS[0]["correct_answer"],
                SEED_QUESTIONS[0]["distractors"],
            )))
    add_btn.click(
        add_question,
        inputs=[question_input, answer_input, subject, difficulty, rationale, state],
        outputs=[state, status_output, preview_output, audit_output],
    )
    gr.Markdown("## Export")
    with gr.Row():
        benchmark_name = gr.Textbox(label="Hub dataset name", value="username/retrieval-mini-eval")
        export_format = gr.Dropdown(["JSON", "JSONL", "HF Dataset Script"], value="JSON", label="Format")
    export_btn = gr.Button("Generate Export", variant="secondary")
    export_output = gr.Code(label="Benchmark artifact", language="python", lines=16)
    export_btn.click(export_benchmark, inputs=[benchmark_name, state, export_format], outputs=export_output)
    gr.Markdown("""
## Why This Is Useful
Evaluation sets fail quietly when distractors are weak, duplicated, or reveal the answer. This Space teaches a better workflow: generate candidates, audit them, keep rationales, and publish the result as a versioned Hugging Face Dataset.
""")
    create_footer("Benchmark Builder")


if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)