Mooizz's picture
Upload folder using huggingface_hub
1070765 verified
/**
 * Evaluation frameworks that can be referenced from the `eval.yaml` file of
 * benchmark datasets.
 *
 * Every entry is keyed by the framework identifier and carries:
 * - `name`: the same identifier as the key,
 * - `description`: a one-sentence summary of the framework,
 * - `url`: a link to the framework's home page or repository.
 *
 * All properties (both the outer map and each entry) are read-only and typed
 * as string literals.
 */
export declare const EVALUATION_FRAMEWORKS: Readonly<{
  "inspect-ai": Readonly<{
    name: "inspect-ai";
    description: "Inspect AI is an open-source framework for large language model evaluations.";
    url: "https://inspect.aisi.org.uk/";
  }>;
  "math-arena": Readonly<{
    name: "math-arena";
    description: "MathArena is a platform for evaluation of LLMs on latest math competitions and olympiads.";
    url: "https://github.com/eth-sri/matharena";
  }>;
  mteb: Readonly<{
    name: "mteb";
    description: "Multimodal toolbox for evaluating embeddings and retrieval systems.";
    url: "https://github.com/embeddings-benchmark/mteb";
  }>;
  "olmocr-bench": Readonly<{
    name: "olmocr-bench";
    description: "olmOCR-Bench is a framework for evaluating document-level OCR of various tools.";
    url: "https://github.com/allenai/olmocr/tree/main/olmocr/bench";
  }>;
  harbor: Readonly<{
    name: "harbor";
    description: "Harbor is a framework for evaluating and optimizing agents and language models.";
    url: "https://github.com/laude-institute/harbor";
  }>;
  archipelago: Readonly<{
    name: "archipelago";
    description: "Archipelago is a system for running and evaluating AI agents against MCP applications.";
    url: "https://github.com/Mercor-Intelligence/archipelago";
  }>;
  "swe-bench": Readonly<{
    name: "swe-bench";
    description: "SWE Bench is a framework for evaluating the performance of LLMs on software engineering tasks.";
    url: "https://github.com/swe-bench/swe-bench";
  }>;
  "swe-bench-pro": Readonly<{
    name: "swe-bench-pro";
    description: "SWE-Bench Pro is a challenging benchmark evaluating LLMs/Agents on long-horizon software engineering tasks.";
    url: "https://github.com/scaleapi/SWE-bench_Pro-os";
  }>;
  "nemo-evaluator": Readonly<{
    name: "nemo-evaluator";
    description: "NeMo Evaluator is an open-source platform for robust, reproducible, and scalable evaluation of Large Language Models across 100+ benchmarks.";
    url: "https://github.com/NVIDIA-NeMo/Evaluator";
  }>;
}>;
//# sourceMappingURL=eval.d.ts.map