// NOTE(review): the three lines below are Hugging Face file-viewer residue
// (uploader avatar text, commit message, commit hash) that leaked into the
// source during extraction — kept as a comment so the file compiles.
// Mooizz's picture
// Upload folder using huggingface_hub
// 1070765 verified
/**
 * Registry of the Evaluation Frameworks supported in the `eval.yaml` file of
 * benchmark datasets.
 *
 * Each key is the framework's canonical identifier and always matches the
 * entry's `name` property. `as const` keeps the literal (readonly) types for
 * key-level inference, while `satisfies` validates every entry's shape
 * (name/description/url) without widening the inferred type.
 */
export const EVALUATION_FRAMEWORKS = {
	"inspect-ai": {
		name: "inspect-ai",
		description: "Inspect AI is an open-source framework for large language model evaluations.",
		url: "https://inspect.aisi.org.uk/",
	},
	"math-arena": {
		name: "math-arena",
		description: "MathArena is a platform for evaluation of LLMs on latest math competitions and olympiads.",
		url: "https://github.com/eth-sri/matharena",
	},
	mteb: {
		name: "mteb",
		description: "Multimodal toolbox for evaluating embeddings and retrieval systems.",
		url: "https://github.com/embeddings-benchmark/mteb",
	},
	"olmocr-bench": {
		name: "olmocr-bench",
		description: "olmOCR-Bench is a framework for evaluating document-level OCR of various tools.",
		url: "https://github.com/allenai/olmocr/tree/main/olmocr/bench",
	},
	harbor: {
		name: "harbor",
		description: "Harbor is a framework for evaluating and optimizing agents and language models.",
		url: "https://github.com/laude-institute/harbor",
	},
	archipelago: {
		name: "archipelago",
		description: "Archipelago is a system for running and evaluating AI agents against MCP applications.",
		url: "https://github.com/Mercor-Intelligence/archipelago",
	},
	"swe-bench": {
		name: "swe-bench",
		description: "SWE Bench is a framework for evaluating the performance of LLMs on software engineering tasks.",
		url: "https://github.com/swe-bench/swe-bench",
	},
	"swe-bench-pro": {
		name: "swe-bench-pro",
		description:
			"SWE-Bench Pro is a challenging benchmark evaluating LLMs/Agents on long-horizon software engineering tasks.",
		url: "https://github.com/scaleapi/SWE-bench_Pro-os",
	},
	"nemo-evaluator": {
		name: "nemo-evaluator",
		description:
			"NeMo Evaluator is an open-source platform for robust, reproducible, and scalable evaluation of Large Language Models across 100+ benchmarks.",
		url: "https://github.com/NVIDIA-NeMo/Evaluator",
	},
} as const satisfies Record<string, { name: string; description: string; url: string }>;