// Spaces: Paused / Paused — page-scrape residue from the hosting UI, not part of the declaration.
| /** | |
| * List of supported Evaluation Frameworks supported in the `eval.yaml` file in benchmarks datasets. | |
| */ | |
| export declare const EVALUATION_FRAMEWORKS: { | |
| readonly "inspect-ai": { | |
| readonly name: "inspect-ai"; | |
| readonly description: "Inspect AI is an open-source framework for large language model evaluations."; | |
| readonly url: "https://inspect.aisi.org.uk/"; | |
| }; | |
| readonly "math-arena": { | |
| readonly name: "math-arena"; | |
| readonly description: "MathArena is a platform for evaluation of LLMs on latest math competitions and olympiads."; | |
| readonly url: "https://github.com/eth-sri/matharena"; | |
| }; | |
| readonly mteb: { | |
| readonly name: "mteb"; | |
| readonly description: "Multimodal toolbox for evaluating embeddings and retrieval systems."; | |
| readonly url: "https://github.com/embeddings-benchmark/mteb"; | |
| }; | |
| readonly "olmocr-bench": { | |
| readonly name: "olmocr-bench"; | |
| readonly description: "olmOCR-Bench is a framework for evaluating document-level OCR of various tools."; | |
| readonly url: "https://github.com/allenai/olmocr/tree/main/olmocr/bench"; | |
| }; | |
| readonly harbor: { | |
| readonly name: "harbor"; | |
| readonly description: "Harbor is a framework for evaluating and optimizing agents and language models."; | |
| readonly url: "https://github.com/laude-institute/harbor"; | |
| }; | |
| readonly archipelago: { | |
| readonly name: "archipelago"; | |
| readonly description: "Archipelago is a system for running and evaluating AI agents against MCP applications."; | |
| readonly url: "https://github.com/Mercor-Intelligence/archipelago"; | |
| }; | |
| readonly "swe-bench": { | |
| readonly name: "swe-bench"; | |
| readonly description: "SWE Bench is a framework for evaluating the performance of LLMs on software engineering tasks."; | |
| readonly url: "https://github.com/swe-bench/swe-bench"; | |
| }; | |
| readonly "swe-bench-pro": { | |
| readonly name: "swe-bench-pro"; | |
| readonly description: "SWE-Bench Pro is a challenging benchmark evaluating LLMs/Agents on long-horizon software engineering tasks."; | |
| readonly url: "https://github.com/scaleapi/SWE-bench_Pro-os"; | |
| }; | |
| readonly "nemo-evaluator": { | |
| readonly name: "nemo-evaluator"; | |
| readonly description: "NeMo Evaluator is an open-source platform for robust, reproducible, and scalable evaluation of Large Language Models across 100+ benchmarks."; | |
| readonly url: "https://github.com/NVIDIA-NeMo/Evaluator"; | |
| }; | |
| }; | |
| //# sourceMappingURL=eval.d.ts.map |