Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| import json | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import Any | |
| class BenchmarkSpec: | |
| name: str | |
| family: str | |
| required_fields: tuple[str, ...] | |
| REGISTRY: dict[str, BenchmarkSpec] = { | |
| "MBPP": BenchmarkSpec("MBPP", "coding", ("task_id", "prompt", "tests")), | |
| "HumanEval": BenchmarkSpec("HumanEval", "coding", ("task_id", "prompt", "test")), | |
| "GSM8K": BenchmarkSpec("GSM8K", "reasoning", ("question", "answer")), | |
| "ARC-Challenge": BenchmarkSpec("ARC-Challenge", "reasoning", ("question", "choices", "answer")), | |
| } | |
| def validate_sample(benchmark_name: str, sample: dict[str, Any]) -> None: | |
| spec = REGISTRY[benchmark_name] | |
| for field in spec.required_fields: | |
| if field not in sample: | |
| raise ValueError(f"{benchmark_name} sample missing required field: {field}") | |
| def build_prompt(benchmark_name: str, sample: dict[str, Any]) -> str: | |
| validate_sample(benchmark_name, sample) | |
| if benchmark_name == "MBPP": | |
| tests = sample["tests"] | |
| rendered_tests = "\n".join(str(t) for t in tests) | |
| return ( | |
| "Write a Python function that solves the task below.\n\n" | |
| f"Task:\n{sample['prompt']}\n\n" | |
| f"Tests:\n{rendered_tests}\n" | |
| ) | |
| if benchmark_name == "HumanEval": | |
| return ( | |
| "Complete the following Python function exactly as specified.\n\n" | |
| f"Prompt:\n{sample['prompt']}\n\n" | |
| f"Reference test:\n{sample['test']}\n" | |
| ) | |
| if benchmark_name == "GSM8K": | |
| return f"Solve the following math word problem. Return only the final answer.\n\nQuestion: {sample['question']}\n" | |
| if benchmark_name == "ARC-Challenge": | |
| choices = sample["choices"] | |
| rendered_choices = "\n".join(f"- {choice}" for choice in choices) | |
| return ( | |
| "Answer the following multiple-choice science question. Return only the correct option text or label.\n\n" | |
| f"Question: {sample['question']}\nChoices:\n{rendered_choices}\n" | |
| ) | |
| raise ValueError(f"Unknown benchmark: {benchmark_name}") | |
| def load_cycle_benchmark_suite(path: Path) -> dict[str, dict[str, BenchmarkSpec]]: | |
| payload = json.loads(path.read_text(encoding="utf-8")) | |
| out: dict[str, dict[str, BenchmarkSpec]] = {"coding_benchmarks": {}, "reasoning_benchmarks": {}} | |
| for section in ("coding_benchmarks", "reasoning_benchmarks"): | |
| if section not in payload: | |
| raise ValueError(f"missing benchmark section: {section}") | |
| for slot in ("fast_iteration", "milestone"): | |
| if slot not in payload[section]: | |
| raise ValueError(f"missing benchmark slot: {section}.{slot}") | |
| name = payload[section][slot]["name"] | |
| if name not in REGISTRY: | |
| raise ValueError(f"unsupported benchmark: {name}") | |
| out[section][slot] = REGISTRY[name] | |
| return out | |
| def main() -> int: | |
| path = Path("artifacts/benchmark_suite.cycle1.json") | |
| suite = load_cycle_benchmark_suite(path) | |
| print(json.dumps({k: {slot: spec.name for slot, spec in section.items()} for k, section in suite.items()}, indent=2)) | |
| return 0 | |
| if __name__ == "__main__": | |
| raise SystemExit(main()) | |