File size: 1,897 Bytes
7b4b748
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from __future__ import annotations

import sys
from pathlib import Path

# ROOT is the parent of the directory that contains this file (resolved to an
# absolute path with symlinks followed).
ROOT = Path(__file__).resolve().parents[1]
# Prepend ROOT to sys.path exactly once. Presumably this is what lets the
# project imports below (`api`, `config`, `evaluation`) resolve when the module
# is loaded without the project being installed — confirm against how the
# service is launched.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from api.settings import ApiConfig
from api.schemas import EvalAssistantResult, EvalMetricScore, EvalRunResponse
from config import AppConfig
from evaluation.runner import SafetyEvaluator, format_markdown_report


class EvalService:
    """Run a safety evaluation and reshape its report into an ``EvalRunResponse``."""

    def __init__(self, api_config: ApiConfig, app_config: AppConfig) -> None:
        """Store both configuration objects on the instance.

        Args:
            api_config: API-level configuration; kept as ``self.api_config``
                (not read by any method of this class).
            app_config: Application configuration; handed to ``SafetyEvaluator``
                each time :meth:`run` is called.
        """
        self.app_config = app_config
        self.api_config = api_config

    @staticmethod
    def _to_assistant_result(name, section) -> EvalAssistantResult:
        """Convert one report section into an ``EvalAssistantResult``.

        Each ``(metric, score)`` pair in ``section.metrics`` becomes one
        ``EvalMetricScore``; the percentage is rounded to one decimal place.
        """
        scores = [
            EvalMetricScore(
                metric=metric_name,
                label=entry.label,
                percent=round(entry.percent, 1),
                total=entry.total,
            )
            for metric_name, entry in section.metrics.items()
        ]
        return EvalAssistantResult(
            assistant=name,
            model_id=section.model_id,
            metrics=scores,
        )

    def run(self, *, benchmark_samples: int, seed: int, assistants: list[str]) -> EvalRunResponse:
        """Execute the evaluator and package the outcome for the API.

        Args:
            benchmark_samples: Forwarded unchanged to ``SafetyEvaluator.run``.
            seed: Forwarded unchanged to ``SafetyEvaluator.run``.
            assistants: Assistant labels to evaluate. Only the ``"oss"`` and
                ``"frontier"`` sections of the report are ever included in the
                response, and only when their label appears in this list.

        Returns:
            An ``EvalRunResponse`` carrying the report's ``generated_at`` and
            ``judge_model`` values, the per-assistant results, and the text
            produced by ``format_markdown_report`` for the same report.
        """
        report = SafetyEvaluator(self.app_config).run(
            assistants=assistants,  # type: ignore[arg-type]
            benchmark_samples=benchmark_samples,
            seed=seed,
        )

        # Fixed output order: "oss" first, then "frontier".
        sections = (("oss", report.oss), ("frontier", report.frontier))

        # A section is kept only if it has metrics AND its label was requested;
        # the metrics check is evaluated first.
        collected: list[EvalAssistantResult] = [
            self._to_assistant_result(name, section)
            for name, section in sections
            if section.metrics and name in assistants
        ]

        return EvalRunResponse(
            generated_at=report.generated_at,
            judge_model=report.judge_model,
            results=collected,
            markdown_report=format_markdown_report(report),
        )