/**
 * List of evaluation frameworks supported in the `eval.yaml` file of benchmark datasets.
 */
export const EVALUATION_FRAMEWORKS = {
	"inspect-ai": {
		name: "inspect-ai",
		description: "Inspect AI is an open-source framework for large language model evaluations.",
		url: "https://inspect.aisi.org.uk/",
	},
	"math-arena": {
		name: "math-arena",
		description: "MathArena is a platform for evaluating LLMs on the latest math competitions and olympiads.",
		url: "https://github.com/eth-sri/matharena",
	},
	mteb: {
		name: "mteb",
		description: "MTEB is a multimodal toolbox for evaluating embedding models and retrieval systems.",
		url: "https://github.com/embeddings-benchmark/mteb",
	},
	"olmocr-bench": {
		name: "olmocr-bench",
		description: "olmOCR-Bench is a framework for evaluating the document-level OCR performance of various tools.",
		url: "https://github.com/allenai/olmocr/tree/main/olmocr/bench",
	},
	harbor: {
		name: "harbor",
		description: "Harbor is a framework for evaluating and optimizing agents and language models.",
		url: "https://github.com/laude-institute/harbor",
	},
	archipelago: {
		name: "archipelago",
		description: "Archipelago is a system for running and evaluating AI agents against MCP applications.",
		url: "https://github.com/Mercor-Intelligence/archipelago",
	},
	"swe-bench": {
		name: "swe-bench",
		description: "SWE-bench is a framework for evaluating the performance of LLMs on software engineering tasks.",
		url: "https://github.com/swe-bench/swe-bench",
	},
	"swe-bench-pro": {
		name: "swe-bench-pro",
		description:
			"SWE-Bench Pro is a challenging benchmark for evaluating LLMs and agents on long-horizon software engineering tasks.",
		url: "https://github.com/scaleapi/SWE-bench_Pro-os",
	},
	"nemo-evaluator": {
		name: "nemo-evaluator",
		description:
			"NeMo Evaluator is an open-source platform for robust, reproducible, and scalable evaluation of Large Language Models across 100+ benchmarks.",
		url: "https://github.com/NVIDIA-NeMo/Evaluator",
	},
} as const;
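
// A minimal usage sketch (not part of the original module): because the object is
// declared `as const`, the union of supported framework ids can be derived from its
// keys, and an id read from an `eval.yaml` file can be validated against it.
// `EvaluationFrameworkId` and `getEvaluationFramework` are illustrative names
// assumed here, not an established API.
export type EvaluationFrameworkId = keyof typeof EVALUATION_FRAMEWORKS;

/**
 * Looks up framework metadata for an id read from an `eval.yaml` file,
 * returning `undefined` for unknown frameworks instead of throwing.
 */
export function getEvaluationFramework(
	id: string
): (typeof EVALUATION_FRAMEWORKS)[EvaluationFrameworkId] | undefined {
	return id in EVALUATION_FRAMEWORKS ? EVALUATION_FRAMEWORKS[id as EvaluationFrameworkId] : undefined;
}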