File size: 1,050 Bytes
8d18b7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
"""Comprehensive Evaluation and Benchmarking Suite for Zenith Models"""

from .benchmark import BenchmarkSuite, BenchmarkConfig
from .metrics import (
    compute_perplexity,
    compute_accuracy,
    compute_em_score,
    compute_f1_score,
    compute_eq_metrics,
    compute_code_metrics,
    compute_reasoning_metrics,
)
from .eval_datasets import (
    EvaluationDataset,
    load_human_eval,
    load_mbpp,
    load_gsm8k,
    load_math,
    load_truthfulqa,
    load_emotional_bench,
)
from .comparative_eval import ComparativeEvaluator, ModelComparison

# Public names re-exported by ``from <package> import *``.  Built as a
# concatenation of per-submodule groups, in the same order the imports
# above bring them in: benchmark runner, metric helpers, dataset
# loaders, then comparative-evaluation tools.
__all__ = (
    # .benchmark
    ["BenchmarkSuite", "BenchmarkConfig"]
    # .metrics
    + [
        "compute_perplexity",
        "compute_accuracy",
        "compute_em_score",
        "compute_f1_score",
        "compute_eq_metrics",
        "compute_code_metrics",
        "compute_reasoning_metrics",
    ]
    # .eval_datasets
    + [
        "EvaluationDataset",
        "load_human_eval",
        "load_mbpp",
        "load_gsm8k",
        "load_math",
        "load_truthfulqa",
        "load_emotional_bench",
    ]
    # .comparative_eval
    + ["ComparativeEvaluator", "ModelComparison"]
)