File size: 2,009 Bytes
1070765
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/**
 * Shape of one evaluation-framework entry in {@link EVALUATION_FRAMEWORKS}.
 */
interface EvaluationFramework {
	/** Framework identifier; by convention it matches the entry's key. */
	name: string;
	/** Short human-readable summary of the framework. */
	description: string;
	/** Homepage or repository URL for the framework. */
	url: string;
}

/**
 * List of supported Evaluation Frameworks supported in the `eval.yaml` file in benchmarks datasets.
 *
 * `as const` keeps each entry's literal types (keys and field values stay
 * narrow), while `satisfies` validates every entry against
 * {@link EvaluationFramework} so a typo'd or missing property is caught at
 * compile time instead of slipping through silently.
 */
export const EVALUATION_FRAMEWORKS = {
	"inspect-ai": {
		name: "inspect-ai",
		description: "Inspect AI is an open-source framework for large language model evaluations.",
		url: "https://inspect.aisi.org.uk/",
	},
	"math-arena": {
		name: "math-arena",
		description: "MathArena is a platform for evaluation of LLMs on latest math competitions and olympiads.",
		url: "https://github.com/eth-sri/matharena",
	},
	mteb: {
		name: "mteb",
		description: "Multimodal toolbox for evaluating embeddings and retrieval systems.",
		url: "https://github.com/embeddings-benchmark/mteb",
	},
	"olmocr-bench": {
		name: "olmocr-bench",
		description: "olmOCR-Bench is a framework for evaluating document-level OCR of various tools.",
		url: "https://github.com/allenai/olmocr/tree/main/olmocr/bench",
	},
	harbor: {
		name: "harbor",
		description: "Harbor is a framework for evaluating and optimizing agents and language models.",
		url: "https://github.com/laude-institute/harbor",
	},
	archipelago: {
		name: "archipelago",
		description: "Archipelago is a system for running and evaluating AI agents against MCP applications.",
		url: "https://github.com/Mercor-Intelligence/archipelago",
	},
	"swe-bench": {
		name: "swe-bench",
		description: "SWE Bench is a framework for evaluating the performance of LLMs on software engineering tasks.",
		url: "https://github.com/swe-bench/swe-bench",
	},
	"swe-bench-pro": {
		name: "swe-bench-pro",
		description:
			"SWE-Bench Pro is a challenging benchmark evaluating LLMs/Agents on long-horizon software engineering tasks.",
		url: "https://github.com/scaleapi/SWE-bench_Pro-os",
	},
	"nemo-evaluator": {
		name: "nemo-evaluator",
		description:
			"NeMo Evaluator is an open-source platform for robust, reproducible, and scalable evaluation of Large Language Models across 100+ benchmarks.",
		url: "https://github.com/NVIDIA-NeMo/Evaluator",
	},
} as const satisfies Record<string, EvaluationFramework>;