"""
Cortex Benchmark Harness
========================

A self-contained evaluation suite for comparing base LLMs against
Cortex-enhanced versions across standard NLP benchmarks and
Cortex-specific capability tests (memory, hallucination detection, etc.).

Benchmarks:
    Standard: HellaSwag, ARC-Easy, ARC-Challenge, PIQA, WinoGrande, MMLU
    Memory: Passkey Retrieval, Multi-Hop Memory
    Hallucination: HaluEval-QA
"""

# Re-export the package's public API so callers can write
# `from benchmark import BenchmarkRunner` instead of reaching into submodules.
from benchmark.memory_tasks import MultiHopMemory, PasskeyRetrieval
from benchmark.runner import BenchmarkRunner
from benchmark.scoring import generate_and_check, log_likelihood_score
from benchmark.tasks import TASK_REGISTRY, BenchmarkTask

# Explicit public surface of the package (controls `from benchmark import *`
# and documents the supported API).
__all__ = [
    "log_likelihood_score",
    "generate_and_check",
    "TASK_REGISTRY",
    "BenchmarkTask",
    "PasskeyRetrieval",
    "MultiHopMemory",
    "BenchmarkRunner",
]