"""Cortex Benchmark Harness.

A self-contained evaluation suite for comparing base LLMs against
Cortex-enhanced versions across standard NLP benchmarks and
Cortex-specific capability tests (memory, hallucination detection, etc.).

Benchmarks:
    Standard:      HellaSwag, ARC-Easy, ARC-Challenge, PIQA, WinoGrande, MMLU
    Memory:        Passkey Retrieval, Multi-Hop Memory
    Hallucination: HaluEval-QA
"""

# Re-export the public API of the benchmark subpackages so callers can
# simply do `from benchmark import BenchmarkRunner`, etc.
from benchmark.scoring import log_likelihood_score, generate_and_check
from benchmark.tasks import TASK_REGISTRY, BenchmarkTask
from benchmark.memory_tasks import PasskeyRetrieval, MultiHopMemory
from benchmark.runner import BenchmarkRunner

# Explicit public interface for `from benchmark import *`.
__all__ = [
    "log_likelihood_score",
    "generate_and_check",
    "TASK_REGISTRY",
    "BenchmarkTask",
    "PasskeyRetrieval",
    "MultiHopMemory",
    "BenchmarkRunner",
]