"""
Cortex Benchmark Harness
========================

A self-contained evaluation suite for comparing base LLMs against
Cortex-enhanced versions across standard NLP benchmarks and
Cortex-specific capability tests (memory, hallucination detection, etc.).

Benchmarks:
    Standard: HellaSwag, ARC-Easy, ARC-Challenge, PIQA, WinoGrande, MMLU
    Memory: Passkey Retrieval, Multi-Hop Memory
    Hallucination: HaluEval-QA
"""

# Re-export the package's public API so callers can write
# `from benchmark import BenchmarkRunner` instead of reaching into submodules.
from benchmark.memory_tasks import MultiHopMemory, PasskeyRetrieval
from benchmark.runner import BenchmarkRunner
from benchmark.scoring import generate_and_check, log_likelihood_score
from benchmark.tasks import TASK_REGISTRY, BenchmarkTask

# Explicit public surface of the package (controls `from benchmark import *`
# and documents the supported API).
__all__ = [
    "log_likelihood_score",
    "generate_and_check",
    "TASK_REGISTRY",
    "BenchmarkTask",
    "PasskeyRetrieval",
    "MultiHopMemory",
    "BenchmarkRunner",
]