"""
Cortex Benchmark Harness
========================
A self-contained evaluation suite for comparing base LLMs against
Cortex-enhanced versions across standard NLP benchmarks and
Cortex-specific capability tests (memory, hallucination detection, etc.).

Benchmarks:
    Standard: HellaSwag, ARC-Easy, ARC-Challenge, PIQA, WinoGrande, MMLU
    Memory: Passkey Retrieval, Multi-Hop Memory
    Hallucination: HaluEval-QA
"""
from benchmark.scoring import log_likelihood_score, generate_and_check
from benchmark.tasks import TASK_REGISTRY, BenchmarkTask
from benchmark.memory_tasks import PasskeyRetrieval, MultiHopMemory
from benchmark.runner import BenchmarkRunner
# Public API re-exported by this package; entries are grouped by the
# submodule each name is imported from.
__all__ = [
    # benchmark.scoring
    "log_likelihood_score",
    "generate_and_check",
    # benchmark.tasks
    "TASK_REGISTRY",
    "BenchmarkTask",
    # benchmark.memory_tasks
    "PasskeyRetrieval",
    "MultiHopMemory",
    # benchmark.runner
    "BenchmarkRunner",
]