""" Benchmark Builder Create small, auditable multiple-choice evaluation datasets for LLMs. """ import json import os import re import sys from typing import Dict, List, Tuple import gradio as gr sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from shared.components import create_footer, create_method_panel, create_premium_hero try: from huggingface_hub import InferenceClient except Exception: # pragma: no cover - optional on local machines InferenceClient = None SEED_QUESTIONS = [ { "question": "Which retrieval signal is strongest when a user query uses rare exact terms?", "correct_answer": "BM25 lexical matching", "distractors": [ "Random negative sampling", "Temperature scaling", "Decoder-only attention masking", ], "subject": "Information Retrieval", "difficulty": "Medium", "rationale": "BM25 rewards rare exact terms through inverse document frequency.", } ] DOMAIN_DISTRACTORS = { "Machine Learning": [ "dropout regularization", "batch normalization", "cosine learning-rate decay", "gradient clipping", "early stopping", "teacher forcing", ], "Information Retrieval": [ "dense vector search", "query expansion", "reciprocal rank fusion", "cross-encoder reranking", "metadata filtering", "semantic chunking", ], "AI Safety": [ "output filtering", "least-privilege tool access", "prompt isolation", "red-team evaluation", "policy classification", "adversarial testing", ], "Data Engineering": [ "schema validation", "deduplication", "entity resolution", "partition pruning", "incremental backfills", "lineage tracking", ], } def _hf_generate(prompt: str) -> List[str]: """Use HF Inference when configured; otherwise return an empty list.""" token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN") if not token or InferenceClient is None: return [] client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=token) response = client.text_generation(prompt, max_new_tokens=220, temperature=0.35) match = re.search(r"\[[\s\S]*\]", response) if not match: return [] try: parsed = json.loads(match.group(0)) except json.JSONDecodeError: return [] return [str(item).strip() for item in parsed if str(item).strip()][:3] def _fallback_distractors(question: str, correct_answer: str, subject: str) -> List[str]: pool = DOMAIN_DISTRACTORS.get(subject, DOMAIN_DISTRACTORS["Machine Learning"]) answer_terms = {term.lower() for term in re.findall(r"[A-Za-z][A-Za-z\-]+", correct_answer)} chosen = [] for candidate in pool: candidate_terms = {term.lower() for term in re.findall(r"[A-Za-z][A-Za-z\-]+", candidate)} if candidate.lower() != correct_answer.lower() and not answer_terms.intersection(candidate_terms): chosen.append(candidate) if len(chosen) == 3: break if len(chosen) < 3: chosen.extend(["a plausible baseline method", "a purely random heuristic", "a manual review process"]) if re.search(r"\bwhich\b|\bwhat\b", question.lower()): return [item[:1].upper() + item[1:] for item in chosen[:3]] return chosen[:3] def generate_distractors(question: str, correct_answer: str, subject: str, difficulty: str) -> Tuple[List[str], str]: prompt = f""" You create benchmark-quality multiple-choice distractors. Return only a JSON array of exactly 3 short wrong answers. Subject: {subject} Difficulty: {difficulty} Question: {question} Correct answer: {correct_answer} Distractors must be plausible, mutually distinct, and not reveal the answer. 
""" generated = _hf_generate(prompt) source = "HF Inference model" if len(generated) == 3 else "deterministic local heuristic" distractors = generated if len(generated) == 3 else _fallback_distractors(question, correct_answer, subject) return distractors, source def audit_question(question: str, correct_answer: str, distractors: List[str]) -> List[Dict[str, str]]: options = [correct_answer] + distractors lower_options = [option.strip().lower() for option in options] checks = [] checks.append({ "check": "Duplicate options", "result": "pass" if len(set(lower_options)) == len(lower_options) else "review", "detail": "All answer choices are unique." if len(set(lower_options)) == len(lower_options) else "At least two choices are identical.", }) answer_words = set(re.findall(r"[a-zA-Z]{4,}", correct_answer.lower())) leaked = any(answer_words.intersection(set(re.findall(r"[a-zA-Z]{4,}", d.lower()))) for d in distractors) checks.append({ "check": "Answer leakage", "result": "review" if leaked else "pass", "detail": "A distractor shares key answer terms." if leaked else "Distractors avoid obvious answer words.", }) lengths = [len(option) for option in options] balanced = max(lengths) - min(lengths) <= max(18, len(correct_answer)) checks.append({ "check": "Length balance", "result": "pass" if balanced else "review", "detail": "Choices have comparable length." if balanced else "One option is much longer or shorter than the rest.", }) stem_ok = len(question.strip()) >= 24 and question.strip().endswith("?") checks.append({ "check": "Question stem", "result": "pass" if stem_ok else "review", "detail": "Stem is specific and phrased as a question." if stem_ok else "Make the stem more specific and end it with a question mark.", }) return checks def render_question(question_data: Dict[str, object]) -> str: letters = "ABCD" options = [question_data["correct_answer"]] + question_data["distractors"] option_html = "" for idx, option in enumerate(options): is_answer = option == question_data["correct_answer"] option_html += f"""


def render_question(question_data: Dict[str, object]) -> str:
    letters = "ABCD"
    options = [question_data["correct_answer"]] + question_data["distractors"]
    option_html = ""
    # Placeholder markup: the class names and HTML structure here are a minimal
    # illustrative reconstruction, not a definitive design.
    for idx, option in enumerate(options):
        is_answer = option == question_data["correct_answer"]
        option_html += f"""
        <div class="option{' option-correct' if is_answer else ''}">
            <strong>{letters[idx]}.</strong> {option}
        </div>"""
    return f"""
    <div class="question-card">
        <div class="question-meta">{question_data['subject']} · {question_data['difficulty']}</div>
        <div class="question-stem">{question_data['question']}</div>
        {option_html}
        <div class="question-rationale"><strong>Rationale:</strong> {question_data['rationale']}</div>
        <div class="question-source">Distractor source: {question_data['source']}</div>
    </div>
    """
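

# Minimal smoke test for the generate -> audit -> render pipeline: an
# illustrative sketch, assuming the module imports succeed (gradio and
# shared.components available). With no HF token set, generate_distractors
# falls back to the deterministic local heuristic.
if __name__ == "__main__":
    item = dict(SEED_QUESTIONS[0])
    item["distractors"], item["source"] = generate_distractors(
        item["question"], item["correct_answer"], item["subject"], item["difficulty"]
    )
    for check in audit_question(item["question"], item["correct_answer"], item["distractors"]):
        print(f"{check['check']}: {check['result']} ({check['detail']})")
    print(render_question(item))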