File size: 844 Bytes
7dd817a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
"""
Cortex Benchmark Harness
========================

A self-contained evaluation suite for comparing base LLMs against
Cortex-enhanced versions across standard NLP benchmarks and
Cortex-specific capability tests (memory, hallucination detection, etc.).

Benchmarks:
  Standard:  HellaSwag, ARC-Easy, ARC-Challenge, PIQA, WinoGrande, MMLU
  Memory:    Passkey Retrieval, Multi-Hop Memory
  Hallucination: HaluEval-QA
"""

from benchmark.scoring import log_likelihood_score, generate_and_check
from benchmark.tasks import TASK_REGISTRY, BenchmarkTask
from benchmark.memory_tasks import PasskeyRetrieval, MultiHopMemory
from benchmark.runner import BenchmarkRunner

# Public API of this module: exactly the names re-exported by the imports
# above. `__all__` also determines what a wildcard import of this module
# (`from <module> import *`) brings into the importer's namespace.
# Keep this a plain literal list so static tools can read it.
__all__ = [
    # From benchmark.scoring
    "log_likelihood_score",
    "generate_and_check",
    # From benchmark.tasks
    "TASK_REGISTRY",
    "BenchmarkTask",
    # From benchmark.memory_tasks
    "PasskeyRetrieval",
    "MultiHopMemory",
    # From benchmark.runner
    "BenchmarkRunner",
]