/**
 * List of evaluation frameworks supported in the `eval.yaml` file of benchmark datasets.
 */
export const EVALUATION_FRAMEWORKS = {
	"inspect-ai": {
		name: "inspect-ai",
		description: "Inspect AI is an open-source framework for large language model evaluations.",
		url: "https://inspect.aisi.org.uk/",
	},
	"math-arena": {
		name: "math-arena",
		description: "MathArena is a platform for evaluating LLMs on the latest math competitions and olympiads.",
		url: "https://github.com/eth-sri/matharena",
	},
	mteb: {
		name: "mteb",
		description: "MTEB is a multimodal toolbox for evaluating embedding models and retrieval systems.",
		url: "https://github.com/embeddings-benchmark/mteb",
	},
	"olmocr-bench": {
		name: "olmocr-bench",
		description: "olmOCR-Bench is a framework for evaluating the document-level OCR performance of various tools.",
		url: "https://github.com/allenai/olmocr/tree/main/olmocr/bench",
	},
	harbor: {
		name: "harbor",
		description: "Harbor is a framework for evaluating and optimizing agents and language models.",
		url: "https://github.com/laude-institute/harbor",
	},
	archipelago: {
		name: "archipelago",
		description: "Archipelago is a system for running and evaluating AI agents against MCP applications.",
		url: "https://github.com/Mercor-Intelligence/archipelago",
	},
	"swe-bench": {
		name: "swe-bench",
		description: "SWE-bench is a framework for evaluating the performance of LLMs on software engineering tasks.",
		url: "https://github.com/swe-bench/swe-bench",
	},
	"swe-bench-pro": {
		name: "swe-bench-pro",
		description:
			"SWE-Bench Pro is a challenging benchmark for evaluating LLMs and agents on long-horizon software engineering tasks.",
		url: "https://github.com/scaleapi/SWE-bench_Pro-os",
	},
	"nemo-evaluator": {
		name: "nemo-evaluator",
		description:
			"NeMo Evaluator is an open-source platform for robust, reproducible, and scalable evaluation of Large Language Models across 100+ benchmarks.",
		url: "https://github.com/NVIDIA-NeMo/Evaluator",
	},
} as const;
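
// A minimal usage sketch (not part of the original module): because the object is
// declared `as const`, the union of supported framework ids can be derived from its
// keys, and an id read from an `eval.yaml` file can be validated against it.
// `EvaluationFrameworkId` and `getEvaluationFramework` are illustrative names
// assumed here, not an established API.
export type EvaluationFrameworkId = keyof typeof EVALUATION_FRAMEWORKS;

/**
 * Looks up framework metadata for an id read from an `eval.yaml` file,
 * returning `undefined` for unknown frameworks instead of throwing.
 */
export function getEvaluationFramework(
	id: string
): (typeof EVALUATION_FRAMEWORKS)[EvaluationFrameworkId] | undefined {
	return id in EVALUATION_FRAMEWORKS ? EVALUATION_FRAMEWORKS[id as EvaluationFrameworkId] : undefined;
}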