Spaces:
Running
Running
File size: 1,897 Bytes
7b4b748 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | from __future__ import annotations
import sys
from pathlib import Path
# Directory two levels above this file; put at the front of sys.path (once)
# so the imports that follow this statement are resolved against it first.
ROOT = Path(__file__).resolve().parents[1]
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)
from api.settings import ApiConfig
from api.schemas import EvalAssistantResult, EvalMetricScore, EvalRunResponse
from config import AppConfig
from evaluation.runner import SafetyEvaluator, format_markdown_report
class EvalService:
    """Run a ``SafetyEvaluator`` pass and package its report as an ``EvalRunResponse``."""

    def __init__(self, api_config: ApiConfig, app_config: AppConfig) -> None:
        """Keep both configuration objects on the instance.

        Args:
            api_config: Stored as ``self.api_config``; not read by the
                methods of this class.
            app_config: Stored as ``self.app_config`` and handed to
                ``SafetyEvaluator`` on every ``run`` call.
        """
        self.api_config = api_config
        self.app_config = app_config

    def run(self, *, benchmark_samples: int, seed: int, assistants: list[str]) -> EvalRunResponse:
        """Execute one evaluation and convert the report to the API schema.

        Args:
            benchmark_samples: Forwarded unchanged to ``SafetyEvaluator.run``.
            seed: Forwarded unchanged to ``SafetyEvaluator.run``.
            assistants: Forwarded to ``SafetyEvaluator.run`` and also used to
                decide which of the ``"oss"`` / ``"frontier"`` report
                sections appear in the response.

        Returns:
            An ``EvalRunResponse`` carrying the report's ``generated_at`` and
            ``judge_model``, one ``EvalAssistantResult`` per included section
            (``"oss"`` before ``"frontier"``), and the markdown produced by
            ``format_markdown_report``.
        """
        report = SafetyEvaluator(self.app_config).run(
            # NOTE(review): the ignore suggests the evaluator declares a
            # narrower type than list[str] — confirm against its signature.
            assistants=assistants,  # type: ignore[arg-type]
            benchmark_samples=benchmark_samples,
            seed=seed,
        )

        sections = (("oss", report.oss), ("frontier", report.frontier))
        # A section is reported only if it has metrics AND was requested.
        results: list[EvalAssistantResult] = [
            self._to_result(name, section)
            for name, section in sections
            if section.metrics and name in assistants
        ]

        return EvalRunResponse(
            generated_at=report.generated_at,
            judge_model=report.judge_model,
            results=results,
            markdown_report=format_markdown_report(report),
        )

    @staticmethod
    def _to_result(name: str, section) -> EvalAssistantResult:
        """Convert one report section into an ``EvalAssistantResult``.

        ``section`` must expose ``metrics`` (a mapping of metric name to a
        score with ``label``, ``percent`` and ``total``) and ``model_id``.
        Percentages are rounded to one decimal place.
        """
        scores = []
        for metric_name, score in section.metrics.items():
            scores.append(
                EvalMetricScore(
                    metric=metric_name,
                    label=score.label,
                    percent=round(score.percent, 1),
                    total=score.total,
                )
            )
        return EvalAssistantResult(
            assistant=name,
            model_id=section.model_id,
            metrics=scores,
        )
|