Spaces:

sammoftah
/

benchmark-builder

Sleeping

File size: 12,153 Bytes

c812306

"""
Benchmark Builder
Create small, auditable multiple-choice evaluation datasets for LLMs.
"""

import json
import os
import re
import sys
from typing import Dict, List, Tuple

import gradio as gr

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from shared.components import create_footer, create_method_panel, create_premium_hero


try:
    from huggingface_hub import InferenceClient
except Exception:  # pragma: no cover - optional on local machines
    InferenceClient = None


SEED_QUESTIONS = [
    {
        "question": "Which retrieval signal is strongest when a user query uses rare exact terms?",
        "correct_answer": "BM25 lexical matching",
        "distractors": [
            "Random negative sampling",
            "Temperature scaling",
            "Decoder-only attention masking",
        ],
        "subject": "Information Retrieval",
        "difficulty": "Medium",
        "rationale": "BM25 rewards rare exact terms through inverse document frequency.",
    }
]

DOMAIN_DISTRACTORS = {
    "Machine Learning": [
        "dropout regularization",
        "batch normalization",
        "cosine learning-rate decay",
        "gradient clipping",
        "early stopping",
        "teacher forcing",
    ],
    "Information Retrieval": [
        "dense vector search",
        "query expansion",
        "reciprocal rank fusion",
        "cross-encoder reranking",
        "metadata filtering",
        "semantic chunking",
    ],
    "AI Safety": [
        "output filtering",
        "least-privilege tool access",
        "prompt isolation",
        "red-team evaluation",
        "policy classification",
        "adversarial testing",
    ],
    "Data Engineering": [
        "schema validation",
        "deduplication",
        "entity resolution",
        "partition pruning",
        "incremental backfills",
        "lineage tracking",
    ],
}


def _hf_generate(prompt: str) -> List[str]:
    """Use HF Inference when configured; otherwise return an empty list."""
    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if not token or InferenceClient is None:
        return []

    client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=token)
    response = client.text_generation(prompt, max_new_tokens=220, temperature=0.35)
    match = re.search(r"\[[\s\S]*\]", response)
    if not match:
        return []
    try:
        parsed = json.loads(match.group(0))
    except json.JSONDecodeError:
        return []
    return [str(item).strip() for item in parsed if str(item).strip()][:3]


def _fallback_distractors(question: str, correct_answer: str, subject: str) -> List[str]:
    pool = DOMAIN_DISTRACTORS.get(subject, DOMAIN_DISTRACTORS["Machine Learning"])
    answer_terms = {term.lower() for term in re.findall(r"[A-Za-z][A-Za-z\-]+", correct_answer)}
    chosen = []
    for candidate in pool:
        candidate_terms = {term.lower() for term in re.findall(r"[A-Za-z][A-Za-z\-]+", candidate)}
        if candidate.lower() != correct_answer.lower() and not answer_terms.intersection(candidate_terms):
            chosen.append(candidate)
        if len(chosen) == 3:
            break

    if len(chosen) < 3:
        chosen.extend(["a plausible baseline method", "a purely random heuristic", "a manual review process"])

    if re.search(r"\bwhich\b|\bwhat\b", question.lower()):
        return [item[:1].upper() + item[1:] for item in chosen[:3]]
    return chosen[:3]


def generate_distractors(question: str, correct_answer: str, subject: str, difficulty: str) -> Tuple[List[str], str]:
    prompt = f"""
You create benchmark-quality multiple-choice distractors.
Return only a JSON array of exactly 3 short wrong answers.
Subject: {subject}
Difficulty: {difficulty}
Question: {question}
Correct answer: {correct_answer}
Distractors must be plausible, mutually distinct, and not reveal the answer.
"""
    generated = _hf_generate(prompt)
    source = "HF Inference model" if len(generated) == 3 else "deterministic local heuristic"
    distractors = generated if len(generated) == 3 else _fallback_distractors(question, correct_answer, subject)
    return distractors, source


def audit_question(question: str, correct_answer: str, distractors: List[str]) -> List[Dict[str, str]]:
    options = [correct_answer] + distractors
    lower_options = [option.strip().lower() for option in options]
    checks = []

    checks.append({
        "check": "Duplicate options",
        "result": "pass" if len(set(lower_options)) == len(lower_options) else "review",
        "detail": "All answer choices are unique." if len(set(lower_options)) == len(lower_options) else "At least two choices are identical.",
    })

    answer_words = set(re.findall(r"[a-zA-Z]{4,}", correct_answer.lower()))
    leaked = any(answer_words.intersection(set(re.findall(r"[a-zA-Z]{4,}", d.lower()))) for d in distractors)
    checks.append({
        "check": "Answer leakage",
        "result": "review" if leaked else "pass",
        "detail": "A distractor shares key answer terms." if leaked else "Distractors avoid obvious answer words.",
    })

    lengths = [len(option) for option in options]
    balanced = max(lengths) - min(lengths) <= max(18, len(correct_answer))
    checks.append({
        "check": "Length balance",
        "result": "pass" if balanced else "review",
        "detail": "Choices have comparable length." if balanced else "One option is much longer or shorter than the rest.",
    })

    stem_ok = len(question.strip()) >= 24 and question.strip().endswith("?")
    checks.append({
        "check": "Question stem",
        "result": "pass" if stem_ok else "review",
        "detail": "Stem is specific and phrased as a question." if stem_ok else "Make the stem more specific and end it with a question mark.",
    })
    return checks


def render_question(question_data: Dict[str, object]) -> str:
    letters = "ABCD"
    options = [question_data["correct_answer"]] + question_data["distractors"]
    option_html = ""
    for idx, option in enumerate(options):
        is_answer = option == question_data["correct_answer"]
        option_html += f"""
        <div class="info-card" style="margin:0.55rem 0; border-left:4px solid {'#22c55e' if is_answer else '#e5e7eb'} !important;">
          <strong>{letters[idx]}.</strong> {option}
          {'<span style="float:right; color:#15803d; font-weight:800;">answer</span>' if is_answer else ''}
        </div>
        """

    return f"""
    <div class="info-card">
      <p style="margin:0 0 0.4rem 0; color:#e8935c; font-weight:800;">{question_data['subject']} · {question_data['difficulty']}</p>
      <h3 style="margin-top:0;">{question_data['question']}</h3>
      {option_html}
      <p><strong>Rationale:</strong> {question_data['rationale']}</p>
      <p><strong>Distractor source:</strong> {question_data['source']}</p>
    </div>
    """


def render_audit(checks: List[Dict[str, str]]) -> str:
    lines = ["| Check | Result | Detail |", "|---|---|---|"]
    for check in checks:
        badge = "Pass" if check["result"] == "pass" else "Review"
        lines.append(f"| {check['check']} | {badge} | {check['detail']} |")
    return "\n".join(lines)


def add_question(
    question: str,
    correct_answer: str,
    subject: str,
    difficulty: str,
    rationale: str,
    state: List[Dict[str, object]],
):
    if not question or not correct_answer:
        return state, "Add a question and correct answer first.", "", ""

    distractors, source = generate_distractors(question, correct_answer, subject, difficulty)
    item = {
        "question": question.strip(),
        "correct_answer": correct_answer.strip(),
        "distractors": distractors,
        "subject": subject,
        "difficulty": difficulty,
        "rationale": rationale.strip() or "Add a short rationale before publishing this benchmark.",
        "source": source,
    }
    next_state = [*state, item]
    checks = audit_question(item["question"], item["correct_answer"], item["distractors"])
    status = f"Dataset now has {len(next_state)} questions. Review flags before publishing."
    return next_state, status, render_question(item), render_audit(checks)


def export_benchmark(benchmark_name: str, state: List[Dict[str, object]], export_format: str) -> str:
    safe_name = re.sub(r"[^a-zA-Z0-9_\-/]", "-", benchmark_name.strip() or "my-eval-benchmark")
    rows = state or SEED_QUESTIONS

    if export_format == "JSONL":
        return "\n".join(json.dumps(row, ensure_ascii=False) for row in rows)

    if export_format == "HF Dataset Script":
        return f"""from datasets import Dataset

rows = {json.dumps(rows, indent=2)}
dataset = Dataset.from_list(rows)
dataset.push_to_hub("{safe_name}")
"""

    return json.dumps({"name": safe_name, "questions": rows, "total": len(rows)}, indent=2)


with gr.Blocks(title="Benchmark Builder", theme=gr.themes.Soft()) as app:
    state = gr.State(SEED_QUESTIONS)
    create_premium_hero(
        "Benchmark Builder",
        "Design small, inspectable evaluation sets with plausible distractors, quality checks, and Hugging Face Dataset export.",
        "📊",
        badge="Evaluation Engineering",
        highlights=["Optional HF inference", "Distractor audit", "Dataset push script"],
    )
    create_method_panel({
        "Technique": "LLM-assisted benchmark authoring with deterministic guardrails.",
        "What it proves": "You understand evaluation data quality, not just prompt generation.",
        "HF capability": "Ready to publish as a Dataset and evaluate models on the Hub.",
    })

    with gr.Row():
        with gr.Column(scale=1):
            question_input = gr.Textbox(
                label="Question",
                value="Which retrieval signal is strongest when a user query uses rare exact terms?",
                lines=3,
            )
            answer_input = gr.Textbox(label="Correct answer", value="BM25 lexical matching")
            subject = gr.Dropdown(
                choices=list(DOMAIN_DISTRACTORS.keys()),
                value="Information Retrieval",
                label="Subject",
            )
            difficulty = gr.Radio(["Easy", "Medium", "Hard"], value="Medium", label="Difficulty")
            rationale = gr.Textbox(
                label="Rationale",
                value="BM25 rewards rare exact terms through inverse document frequency.",
                lines=2,
            )
            add_btn = gr.Button("Generate Distractors + Audit", variant="primary")
        with gr.Column(scale=1):
            status_output = gr.Markdown("Add a question to generate a benchmark-ready item.")
            preview_output = gr.HTML(render_question({**SEED_QUESTIONS[0], "source": "seed example"}))
            audit_output = gr.Markdown(render_audit(audit_question(
                SEED_QUESTIONS[0]["question"],
                SEED_QUESTIONS[0]["correct_answer"],
                SEED_QUESTIONS[0]["distractors"],
            )))

    add_btn.click(
        add_question,
        inputs=[question_input, answer_input, subject, difficulty, rationale, state],
        outputs=[state, status_output, preview_output, audit_output],
    )

    gr.Markdown("## Export")
    with gr.Row():
        benchmark_name = gr.Textbox(label="Hub dataset name", value="username/retrieval-mini-eval")
        export_format = gr.Dropdown(["JSON", "JSONL", "HF Dataset Script"], value="JSON", label="Format")
    export_btn = gr.Button("Generate Export", variant="secondary")
    export_output = gr.Code(label="Benchmark artifact", language="python", lines=16)
    export_btn.click(export_benchmark, inputs=[benchmark_name, state, export_format], outputs=export_output)

    gr.Markdown("""
## Why This Is Useful

Evaluation sets fail quietly when distractors are weak, duplicated, or reveal the answer. This Space teaches a better workflow: generate candidates, audit them, keep rationales, and publish the result as a versioned Hugging Face Dataset.
""")
    create_footer("Benchmark Builder")


if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)