Spaces:
Running
Running
| """Gradio UI handlers for the official three-metric evaluation.""" | |
| from __future__ import annotations | |
| from datetime import UTC, datetime | |
| from pathlib import Path | |
| import gradio as gr | |
| from config import AppConfig | |
| from evaluation.runner import AssistantKind, SafetyEvaluator, format_markdown_report, save_report | |
| from evaluation.suites import METRIC_SUITES, describe_official_plan | |
# Assistant identifiers offered as checkbox choices in the evaluation tab;
# also the fallback selection when the user picks nothing.
ASSISTANT_CHOICES: list[str] = ["oss", "frontier"]
def _normalize_assistants(assistants: list[str]) -> list[AssistantKind]:
    """Turn the raw UI selection into a list of recognised assistant names.

    A falsy selection (empty list or ``None``) falls back to a fresh copy of
    ``ASSISTANT_CHOICES``. Otherwise names not present in
    ``ASSISTANT_CHOICES`` are dropped and the caller's ordering is kept, so
    the result can be empty when every supplied name is unknown.
    """
    if assistants:
        return [picked for picked in assistants if picked in ASSISTANT_CHOICES]  # type: ignore[misc]
    # Hand back a copy so callers can never mutate the module-level list.
    return list(ASSISTANT_CHOICES)
def preview_evaluation_plan(
    assistants: list[str],
    benchmark_samples: int,
    seed: int,
    config: AppConfig,
) -> str:
    """Build a Markdown summary of the prompts an evaluation run would use.

    The prompt set is built with ``SafetyEvaluator.build_prompt_set`` but the
    evaluation loop itself (``iter_eval``) is not invoked here.

    Args:
        assistants: Assistant names chosen in the UI; passed through
            ``_normalize_assistants`` (an empty selection means all).
        benchmark_samples: Public benchmark samples per metric; coerced with
            ``int`` before use.
        seed: Sampling seed forwarded to ``build_prompt_set``; coerced with
            ``int`` before use.
        config: Application configuration; ``judge_model_id`` is shown in the
            summary and the whole object is handed to ``SafetyEvaluator``.

    Returns:
        Markdown text containing the plan description, the assistant list,
        prompt/run totals, the judge model and a preview of at most 30
        prompts, followed by a "…and N more" line when the set is larger.
    """
    preview_limit = 30  # maximum number of prompts listed in the preview
    snippet_chars = 90  # maximum characters shown per prompt

    sample_count = int(benchmark_samples)
    selected_assistants = _normalize_assistants(assistants)
    evaluator = SafetyEvaluator(config)
    prompts = evaluator.build_prompt_set(
        benchmark_samples=sample_count,
        seed=int(seed),
    )

    lines = [
        describe_official_plan(sample_count),
        "",
        f"**Assistants:** {', '.join(selected_assistants)}",
        f"**Total prompts:** {len(prompts)}",
        f"**Total model runs:** {len(prompts) * len(selected_assistants)}",
        f"**Judge model:** `{config.judge_model_id}`",
        "",
        "### Prompt preview",
    ]

    for item in prompts[:preview_limit]:
        tag = item.benchmark or "custom"
        # Collapse newlines/tabs so a multi-line prompt cannot break out of
        # its Markdown bullet.
        flat_prompt = " ".join(item.prompt.split())
        # Only mark the snippet as truncated when text was actually cut.
        if len(flat_prompt) > snippet_chars:
            snippet = f"{flat_prompt[:snippet_chars]}..."
        else:
            snippet = flat_prompt
        lines.append(
            f"- **{METRIC_SUITES[item.metric].label}** `[{tag}]` "
            f"`{item.id}`: {snippet}"
        )

    hidden = len(prompts) - preview_limit
    if hidden > 0:
        lines.append(f"- _…and {hidden} more_")

    return "\n".join(lines)
def run_evaluation_ui(
    assistants: list[str],
    benchmark_samples: int,
    seed: int,
    save_results: bool,
    config: AppConfig,
    progress: gr.Progress | None = None,
):
    """Run the evaluation as a generator that streams UI updates.

    Args:
        assistants: Assistant names chosen in the UI; passed through
            ``_normalize_assistants``.
        benchmark_samples: Public benchmark samples per metric; coerced with
            ``int`` before use.
        seed: Sampling seed; coerced with ``int`` before use.
        save_results: When true, the report is written under ``results/`` as
            a ``.json`` file (via ``save_report``) and a ``.md`` file.
        config: Application configuration handed to ``SafetyEvaluator``.
        progress: Optional Gradio progress tracker; a new ``gr.Progress`` is
            created when ``None``.

    Yields:
        4-tuples ``(report_markdown, status_markdown, saved_json_path,
        progress_fraction)``. While the evaluation is running the report is
        ``""`` and the path is ``None``; the final tuple carries the rendered
        report, the completion status, the absolute JSON path (or ``None``
        when nothing was saved) and ``1.0``.

    Raises:
        gr.Error: If no recognised assistant remains after normalisation, or
            if the evaluator produced no prompts. Because this is a
            generator, the error surfaces on first iteration, not at call
            time.
    """
    selected_assistants = _normalize_assistants(assistants)
    if not selected_assistants:
        raise gr.Error("Select at least one assistant to evaluate.")

    evaluator = SafetyEvaluator(config)
    sample_count = int(benchmark_samples)
    seed_value = int(seed)
    prompts = evaluator.build_prompt_set(
        benchmark_samples=sample_count,
        seed=seed_value,
    )
    if not prompts:
        raise gr.Error("No evaluation prompts were loaded.")

    total_steps = len(prompts) * len(selected_assistants)
    # Test for None explicitly: gr.Progress is an iterable-style object, so
    # its truthiness is not a reliable "was a tracker supplied" signal.
    # NOTE(review): truth-testing a fresh gr.Progress may even raise — confirm
    # against the installed Gradio version.
    tracker = progress if progress is not None else gr.Progress()

    results = []
    model_ids: dict[AssistantKind, str] = {}

    yield "", "_Starting evaluation…_", None, 0.0

    completed = 0
    for kind, item, result in evaluator.iter_eval(prompts, selected_assistants):
        completed += 1
        message = f"{METRIC_SUITES[item.metric].label} · {kind} · {item.id}"
        model_ids[kind] = result.model_id
        results.append(result)
        pct = completed / total_steps
        tracker(pct, desc=message, total=total_steps)
        yield "", f"**Progress:** {completed}/{total_steps} — `{message}`", None, pct

    report = evaluator.build_report(
        results,
        model_ids,
        benchmark_samples=sample_count,
        seed=seed_value,
    )
    markdown = format_markdown_report(report)

    saved_path: str | None = None
    if save_results:
        stamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
        results_dir = Path("results")
        # Make sure the target directory exists so a completed run is not
        # lost to a missing-directory error at write time.
        results_dir.mkdir(parents=True, exist_ok=True)
        json_path = results_dir / f"eval_ui_{stamp}.json"
        md_path = results_dir / f"eval_ui_{stamp}.md"
        save_report(report, json_path)
        md_path.write_text(markdown, encoding="utf-8")
        saved_path = str(json_path.resolve())

    status = (
        f"Completed **{len(report.results)}** scored responses across "
        f"**{len(selected_assistants)}** assistants."
    )
    if saved_path:
        status += f" Saved to `{saved_path}`."

    yield markdown, status, saved_path, 1.0
def build_evaluation_tab(config: AppConfig) -> None:
    """Create the "Evaluation" tab and connect its two buttons.

    The tab shows an intro listing every suite in ``METRIC_SUITES`` plus the
    judge model, a settings column (assistants, sample count, seed, save
    toggle, action buttons) and a results column (status, progress slider,
    report, JSON download). "Preview plan" calls ``preview_evaluation_plan``;
    "Run evaluation" streams the tuples yielded by ``run_evaluation_ui``.

    Presumably invoked while a ``gr.Blocks`` layout is being built — confirm
    against the caller.

    Args:
        config: Application configuration, captured by both button handlers.
    """
    suite_bullets = [
        f"- **{suite.label}:** 10 custom prompts + `{suite.benchmark}`"
        for suite in METRIC_SUITES.values()
    ]
    metric_lines = "\n".join(suite_bullets)
    intro_text = (
        "### Official three-metric evaluation\n"
        "Compare OSS vs frontier on exactly three percentages:\n"
        f"{metric_lines}\n\n"
        f"LLM-as-judge: `{config.judge_model_id}`"
    )

    with gr.Tab("Evaluation"):
        gr.Markdown(intro_text)

        with gr.Row():
            # Left column: run settings and action buttons.
            with gr.Column(scale=1):
                assistants_input = gr.CheckboxGroup(
                    label="Assistants",
                    choices=ASSISTANT_CHOICES,
                    value=ASSISTANT_CHOICES,
                )
                samples_input = gr.Slider(
                    label="Public benchmark samples per metric",
                    minimum=1,
                    maximum=20,
                    step=1,
                    value=10,
                )
                seed_input = gr.Number(label="Random seed", value=42, precision=0)
                save_input = gr.Checkbox(
                    label="Save JSON + Markdown to results/",
                    value=True,
                )
                with gr.Row():
                    preview_btn = gr.Button("Preview plan", variant="secondary")
                    run_btn = gr.Button("Run evaluation", variant="primary")

            # Right column: live status and the final report.
            with gr.Column(scale=2):
                status_view = gr.Markdown("_Configure settings and run evaluation._")
                progress_view = gr.Slider(
                    label="Evaluation progress",
                    minimum=0,
                    maximum=1,
                    step=0.01,
                    value=0,
                    interactive=False,
                )
                report_view = gr.Markdown("")
                download_view = gr.File(label="Download JSON report", interactive=False)

        shared_inputs = [assistants_input, samples_input, seed_input]
        # Both handlers fill the same four components, in this order.
        result_outputs = [report_view, status_view, download_view, progress_view]

        preview_btn.click(
            fn=lambda *args: (
                preview_evaluation_plan(*args, config=config),
                "_Preview ready — no models called._",
                None,
                0.0,
            ),
            inputs=shared_inputs,
            outputs=result_outputs,
        )

        def _run_eval(
            assistants_value,
            benchmark_samples_value,
            seed_value,
            save_results_value,
            progress=gr.Progress(),
        ):
            # Thin generator wrapper that binds ``config`` and forwards the
            # progress tracker received through the default argument.
            yield from run_evaluation_ui(
                assistants_value,
                benchmark_samples_value,
                seed_value,
                save_results_value,
                config,
                progress,
            )

        run_btn.click(
            fn=_run_eval,
            inputs=[*shared_inputs, save_input],
            outputs=result_outputs,
        )