# File size: 844 Bytes

# 7dd817a

"""
Cortex Benchmark Harness
========================

A self-contained evaluation suite for comparing base LLMs against
Cortex-enhanced versions across standard NLP benchmarks and
Cortex-specific capability tests (memory, hallucination detection, etc.).

Benchmarks:
  Standard:       HellaSwag, ARC-Easy, ARC-Challenge, PIQA, WinoGrande, MMLU
  Memory:         Passkey Retrieval, Multi-Hop Memory
  Hallucination:  HaluEval-QA
"""

from benchmark.scoring import log_likelihood_score, generate_and_check
from benchmark.tasks import TASK_REGISTRY, BenchmarkTask
from benchmark.memory_tasks import PasskeyRetrieval, MultiHopMemory
from benchmark.runner import BenchmarkRunner

# Names exported by a star-import of this module. Every entry is one of the
# names imported above; the comments note the module each one comes from.
__all__ = [
    # from benchmark.scoring
    "log_likelihood_score",
    "generate_and_check",
    # from benchmark.tasks
    "TASK_REGISTRY",
    "BenchmarkTask",
    # from benchmark.memory_tasks
    "PasskeyRetrieval",
    "MultiHopMemory",
    # from benchmark.runner
    "BenchmarkRunner",
]