cortex / benchmark /__init__.py
theapemachine's picture
Add benchmark harness: __init__.py
7dd817a verified
raw
history blame contribute delete
844 Bytes
"""
Cortex Benchmark Harness
========================

A self-contained evaluation suite for comparing base LLMs against
Cortex-enhanced versions across standard NLP benchmarks and
Cortex-specific capability tests (memory, hallucination detection, etc.).

Benchmarks:
    Standard:      HellaSwag, ARC-Easy, ARC-Challenge, PIQA, WinoGrande, MMLU
    Memory:        Passkey Retrieval, Multi-Hop Memory
    Hallucination: HaluEval-QA
"""
from benchmark.scoring import log_likelihood_score, generate_and_check
from benchmark.tasks import TASK_REGISTRY, BenchmarkTask
from benchmark.memory_tasks import PasskeyRetrieval, MultiHopMemory
from benchmark.runner import BenchmarkRunner
# Public API of the package: the names exported by ``from benchmark import *``.
# Every entry is one of the names imported at the top of this module; entries
# are grouped below by the submodule they are imported from.
__all__ = [
    # from benchmark.scoring
    "log_likelihood_score",
    "generate_and_check",
    # from benchmark.tasks
    "TASK_REGISTRY",
    "BenchmarkTask",
    # from benchmark.memory_tasks
    "PasskeyRetrieval",
    "MultiHopMemory",
    # from benchmark.runner
    "BenchmarkRunner",
]