// Spaces: Paused / Paused — page-scrape residue from the hosting UI, not part of the declaration.
| /** | |
| * List of supported Evaluation Frameworks supported in the `eval.yaml` file in benchmarks datasets. | |
| */ | |
| export declare const EVALUATION_FRAMEWORKS: { | |
| readonly "inspect-ai": { | |
| readonly name: "inspect-ai"; | |
| readonly description: "Inspect AI is an open-source framework for large language model evaluations."; | |
| readonly url: "https://inspect.aisi.org.uk/"; | |
| }; | |
| readonly "math-arena": { | |
| readonly name: "math-arena"; | |
| readonly description: "MathArena is a platform for evaluation of LLMs on latest math competitions and olympiads."; | |
| readonly url: "https://github.com/eth-sri/matharena"; | |
| }; | |
| readonly mteb: { | |
| readonly name: "mteb"; | |
| readonly description: "Multimodal toolbox for evaluating embeddings and retrieval systems."; | |
| readonly url: "https://github.com/embeddings-benchmark/mteb"; | |
| }; | |
| readonly "olmocr-bench": { | |
| readonly name: "olmocr-bench"; | |
| readonly description: "olmOCR-Bench is a framework for evaluating document-level OCR of various tools."; | |
| readonly url: "https://github.com/allenai/olmocr/tree/main/olmocr/bench"; | |
| }; | |
| readonly harbor: { | |
| readonly name: "harbor"; | |
| readonly description: "Harbor is a framework for evaluating and optimizing agents and language models."; | |
| readonly url: "https://github.com/laude-institute/harbor"; | |
| }; | |
| readonly archipelago: { | |
| readonly name: "archipelago"; | |
| readonly description: "Archipelago is a system for running and evaluating AI agents against MCP applications."; | |
| readonly url: "https://github.com/Mercor-Intelligence/archipelago"; | |
| }; | |
| readonly "swe-bench": { | |
| readonly name: "swe-bench"; | |
| readonly description: "SWE Bench is a framework for evaluating the performance of LLMs on software engineering tasks."; | |
| readonly url: "https://github.com/swe-bench/swe-bench"; | |
| }; | |
| readonly "swe-bench-pro": { | |
| readonly name: "swe-bench-pro"; | |
| readonly description: "SWE-Bench Pro is a challenging benchmark evaluating LLMs/Agents on long-horizon software engineering tasks."; | |
| readonly url: "https://github.com/scaleapi/SWE-bench_Pro-os"; | |
| }; | |
| readonly "nemo-evaluator": { | |
| readonly name: "nemo-evaluator"; | |
| readonly description: "NeMo Evaluator is an open-source platform for robust, reproducible, and scalable evaluation of Large Language Models across 100+ benchmarks."; | |
| readonly url: "https://github.com/NVIDIA-NeMo/Evaluator"; | |
| }; | |
| }; | |
| //# sourceMappingURL=eval.d.ts.map |