# ollive-api/evaluation/gradio_eval.py
# Karthik Namboori
# Deploy ollive FastAPI Docker Space
# 7b4b748
"""Gradio UI handlers for the official three-metric evaluation."""
from __future__ import annotations
from datetime import UTC, datetime
from pathlib import Path
import gradio as gr
from config import AppConfig
from evaluation.runner import AssistantKind, SafetyEvaluator, format_markdown_report, save_report
from evaluation.suites import METRIC_SUITES, describe_official_plan
ASSISTANT_CHOICES = ["oss", "frontier"]
def _normalize_assistants(assistants: list[str]) -> list[AssistantKind]:
if not assistants:
return ASSISTANT_CHOICES.copy()
return [name for name in assistants if name in ASSISTANT_CHOICES] # type: ignore[misc]
def preview_evaluation_plan(
    assistants: list[str],
    benchmark_samples: int,
    seed: int,
    config: AppConfig,
) -> str:
    """Build a Markdown preview of the evaluation plan.

    Only the prompt set is built here (via ``SafetyEvaluator.build_prompt_set``);
    no evaluation run is started by this function.

    Args:
        assistants: Assistant names picked in the UI; normalised with
            ``_normalize_assistants``.
        benchmark_samples: Sample count, coerced to ``int`` and forwarded to
            ``describe_official_plan`` and ``build_prompt_set``.
        seed: Seed, coerced to ``int`` and forwarded to ``build_prompt_set``.
        config: Application config; passed to ``SafetyEvaluator`` and read for
            ``judge_model_id``.

    Returns:
        Markdown text containing the plan description, the selected
        assistants, prompt/run totals, the judge model id, and a bullet list
        of at most 30 prompts followed by an "and N more" line when the set
        is larger.
    """
    max_listed = 30  # prompts itemised before the "and N more" summary line
    max_chars = 90  # characters of each prompt shown in its bullet

    selected_assistants = _normalize_assistants(assistants)
    sample_count = int(benchmark_samples)
    evaluator = SafetyEvaluator(config)
    prompts = evaluator.build_prompt_set(
        benchmark_samples=sample_count,
        seed=int(seed),
    )

    lines = [
        describe_official_plan(sample_count),
        "",
        f"**Assistants:** {', '.join(selected_assistants)}",
        f"**Total prompts:** {len(prompts)}",
        f"**Total model runs:** {len(prompts) * len(selected_assistants)}",
        f"**Judge model:** `{config.judge_model_id}`",
        "",
        "### Prompt preview",
    ]

    for item in prompts[:max_listed]:
        tag = item.benchmark or "custom"
        # Collapse newlines/runs of whitespace so a multi-line prompt stays
        # inside its own Markdown bullet.
        flat_prompt = " ".join(item.prompt.split())
        # Only show an ellipsis when text was really cut off; appending it
        # unconditionally made short prompts look truncated.
        if len(flat_prompt) > max_chars:
            snippet = f"{flat_prompt[:max_chars]}..."
        else:
            snippet = flat_prompt
        lines.append(
            f"- **{METRIC_SUITES[item.metric].label}** `[{tag}]` "
            f"`{item.id}`: {snippet}"
        )

    hidden = len(prompts) - max_listed
    if hidden > 0:
        lines.append(f"- _…and {hidden} more_")
    return "\n".join(lines)
def run_evaluation_ui(
    assistants: list[str],
    benchmark_samples: int,
    seed: int,
    save_results: bool,
    config: AppConfig,
    progress: gr.Progress | None = None,
):
    """Run the evaluation as a generator that streams UI updates.

    Each yielded value is a 4-tuple
    ``(report_markdown, status_markdown, saved_json_path, progress_fraction)``.
    While results are still arriving the report is ``""`` and the path is
    ``None``; the final yield carries the formatted report, a completion
    status, the saved JSON path (or ``None``) and ``1.0``.

    Args:
        assistants: Assistant names picked in the UI; normalised with
            ``_normalize_assistants``.
        benchmark_samples: Sample count, coerced to ``int`` for the prompt set
            and the report.
        seed: Seed, coerced to ``int`` for the prompt set and the report.
        save_results: When true, write ``results/eval_ui_<UTC stamp>.json``
            and a matching ``.md`` file.
        config: Application config handed to ``SafetyEvaluator``.
        progress: Optional Gradio progress tracker; a new ``gr.Progress`` is
            created when ``None``.

    Raises:
        gr.Error: If no recognised assistant remains after normalisation, or
            if the prompt set is empty. Because this is a generator, the error
            surfaces on first iteration rather than at call time.
    """
    selected_assistants = _normalize_assistants(assistants)
    if not selected_assistants:
        raise gr.Error("Select at least one assistant to evaluate.")

    evaluator = SafetyEvaluator(config)
    prompts = evaluator.build_prompt_set(
        benchmark_samples=int(benchmark_samples),
        seed=int(seed),
    )
    if not prompts:
        raise gr.Error("No evaluation prompts were loaded.")

    # Both factors are non-zero past the guards above, so the division below
    # is safe.
    total_steps = len(prompts) * len(selected_assistants)

    # Compare against None explicitly instead of `progress or ...`.
    # NOTE(review): gr.Progress is an iterable helper that appears to define
    # __len__ over its tracked iterables, so its truthiness is not a reliable
    # "was a tracker supplied?" test — confirm against the installed Gradio.
    tracker = progress if progress is not None else gr.Progress()

    results = []
    model_ids: dict[AssistantKind, str] = {}

    yield "", "_Starting evaluation…_", None, 0.0

    completed = 0
    for kind, item, result in evaluator.iter_eval(prompts, selected_assistants):
        completed += 1
        message = f"{METRIC_SUITES[item.metric].label} · {kind} · {item.id}"
        model_ids[kind] = result.model_id
        results.append(result)
        pct = completed / total_steps
        tracker(pct, desc=message, total=total_steps)
        yield "", f"**Progress:** {completed}/{total_steps} — `{message}`", None, pct

    report = evaluator.build_report(
        results,
        model_ids,
        benchmark_samples=int(benchmark_samples),
        seed=int(seed),
    )
    markdown = format_markdown_report(report)

    saved_path: str | None = None
    if save_results:
        stamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
        results_dir = Path("results")
        # Create the output directory up front so neither write depends on the
        # other (or on save_report) having created it.
        results_dir.mkdir(parents=True, exist_ok=True)
        json_path = results_dir / f"eval_ui_{stamp}.json"
        md_path = results_dir / f"eval_ui_{stamp}.md"
        save_report(report, json_path)
        md_path.write_text(markdown, encoding="utf-8")
        saved_path = str(json_path.resolve())

    status = (
        f"Completed **{len(report.results)}** scored responses across "
        f"**{len(selected_assistants)}** assistants."
    )
    if saved_path:
        status += f" Saved to `{saved_path}`."
    yield markdown, status, saved_path, 1.0
def build_evaluation_tab(config: AppConfig) -> None:
    """Create the "Evaluation" tab and wire its two buttons.

    The tab holds a settings column (assistant checkboxes, sample-count
    slider, seed, save toggle, preview/run buttons) and a results column
    (status text, read-only progress slider, report Markdown, JSON download).
    "Preview plan" calls ``preview_evaluation_plan``; "Run evaluation" streams
    the tuples yielded by ``run_evaluation_ui`` into the results column.

    NOTE(review): presumably must be called inside an open ``gr.Blocks``
    context, since components and event listeners are created here — confirm
    with the caller.

    Args:
        config: Application config; read for ``judge_model_id`` and captured
            by both button handlers.
    """
    suite_bullets = [
        f"- **{suite.label}:** 10 custom prompts + `{suite.benchmark}`"
        for suite in METRIC_SUITES.values()
    ]
    metric_lines = "\n".join(suite_bullets)

    with gr.Tab("Evaluation"):
        gr.Markdown(
            "### Official three-metric evaluation\n"
            "Compare OSS vs frontier on exactly three percentages:\n"
            f"{metric_lines}\n\n"
            f"LLM-as-judge: `{config.judge_model_id}`"
        )

        with gr.Row():
            # Left column: run settings and action buttons.
            with gr.Column(scale=1):
                assistant_picker = gr.CheckboxGroup(
                    label="Assistants",
                    choices=ASSISTANT_CHOICES,
                    value=ASSISTANT_CHOICES,
                )
                sample_slider = gr.Slider(
                    label="Public benchmark samples per metric",
                    minimum=1,
                    maximum=20,
                    step=1,
                    value=10,
                )
                seed_box = gr.Number(label="Random seed", value=42, precision=0)
                save_toggle = gr.Checkbox(
                    label="Save JSON + Markdown to results/",
                    value=True,
                )
                with gr.Row():
                    preview_btn = gr.Button("Preview plan", variant="secondary")
                    run_btn = gr.Button("Run evaluation", variant="primary")

            # Right column: live status, progress and the final report.
            with gr.Column(scale=2):
                status_view = gr.Markdown("_Configure settings and run evaluation._")
                progress_bar = gr.Slider(
                    label="Evaluation progress",
                    minimum=0,
                    maximum=1,
                    step=0.01,
                    value=0,
                    interactive=False,
                )
                report_view = gr.Markdown("")
                download_box = gr.File(label="Download JSON report", interactive=False)

        # Inputs common to both buttons; the run button adds the save toggle.
        shared_inputs = [assistant_picker, sample_slider, seed_box]
        # Both handlers emit (report, status, download path, progress) in
        # this order.
        result_targets = [report_view, status_view, download_box, progress_bar]

        preview_btn.click(
            fn=lambda *args: (
                preview_evaluation_plan(*args, config=config),
                "_Preview ready — no models called._",
                None,
                0.0,
            ),
            inputs=shared_inputs,
            outputs=result_targets,
        )

        def _run_eval(
            assistants_value,
            benchmark_samples_value,
            seed_value,
            save_results_value,
            # Gradio's idiom: a gr.Progress() default asks the framework to
            # supply a progress tracker for this handler.
            progress=gr.Progress(),
        ):
            yield from run_evaluation_ui(
                assistants=assistants_value,
                benchmark_samples=benchmark_samples_value,
                seed=seed_value,
                save_results=save_results_value,
                config=config,
                progress=progress,
            )

        run_btn.click(
            fn=_run_eval,
            inputs=[*shared_inputs, save_toggle],
            outputs=result_targets,
        )