| """LLM Judge orchestration for the leaderboard backend. |
| |
| Runs the cross-model judge panel on each successfully scored task and |
| merges the resulting LLM points into the algorithmic component scores |
| to produce hybrid totals (28 LLM points + 72 algorithmic points = 100). |
| |
| The judge panel uses 3 judges from different model families with |
| self-exclusion (PoLL, Verga et al. 2024). Individual judge calls are |
| synchronous; we process tasks sequentially to keep the API spend |
| predictable. Provider keys are read from environment variables that |
| must be configured as HuggingFace Space secrets: |
| |
| ANTHROPIC_API_KEY |
| OPENAI_API_KEY |
| GOOGLE_API_KEY |
| DEEPSEEK_API_KEY |
| """ |
|
|
| from __future__ import annotations |
|
|
| import logging |
| from typing import Any |
|
|
| from llm_judge import ( |
| LLMJudgePanel, |
| detect_agent_family, |
| merge_algo_and_judge_scores, |
| split_algo_score, |
| ) |
|
|
| logger = logging.getLogger(__name__) |


def _build_algo_dict(task_result: dict[str, Any]) -> dict[str, float]:
    """Pull per-component algorithmic scores from a task result.

    Prefers 'cpu_scores' (the post-Boltz component scores) and falls back
    to 'final_scores' if those have already been computed. Returns zeros
    for every component when neither key is present.
    """
    if "cpu_scores" in task_result:
        return dict(task_result["cpu_scores"])
    if "final_scores" in task_result:
        return dict(task_result["final_scores"])
    return {
        "approach": 0.0,
        "orchestration": 0.0,
        "quality": 0.0,
        "feasibility": 0.0,
        "novelty": 0.0,
        "diversity": 0.0,
    }


def run_judge_panel(
    per_task_results: dict[str, dict[str, Any]],
    agent_id: str,
    dry_run: bool = False,
    progress_callback=None,
) -> dict[str, dict[str, Any]]:
    """Run the LLM judge panel over every successful task in a submission.

    A 3-judge panel that excludes the agent's own model family is built
    once per submission. Then, for each task with a non-empty design
    output, the function:

    1. Looks up the original task prompt (used to give the panel context).
    2. Runs all judges synchronously and aggregates their scores.
    3. Computes the hybrid component scores by:
       - splitting each algo score into its algo portion (split_algo_score)
       - adding the matching judge LLM portion (merge_algo_and_judge_scores)
    4. Stores both the raw judge results and the final hybrid scores on
       the task.

    The function modifies per_task_results in place and also returns it.

    Args:
        per_task_results: Dict mapping task_id → task result (from the
            dispatcher + Boltz post-eval pipeline).
        agent_id: Agent identifier for self-exclusion (e.g., 'gpt5-tools').
        dry_run: If True, judges return midpoint scores without API calls.
        progress_callback: Optional callable(task_id, i, total).

    Returns:
        The same dict, now augmented with 'judge_scores' and 'hybrid_scores'
        per task and 'hybrid_total' on each successful entry.
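
    Example (a minimal dry-run sketch; 'per_task_results' is assumed to
    come from the dispatcher + Boltz post-eval pipeline, and dry_run=True
    avoids any provider API calls):

        results = run_judge_panel(
            per_task_results, agent_id="gpt5-tools", dry_run=True
        )
        for task_id, task in results.items():
            print(task_id, task.get("hybrid_total"))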
| """ |
    from eval_tasks import get_task

    family = detect_agent_family(agent_id)
    panel = LLMJudgePanel(agent_model_family=family, dry_run=dry_run)
    logger.info(
        f"LLM judge panel for agent '{agent_id}' (family={family}): "
        f"{len(panel.judges)} judges, dry_run={dry_run}"
    )
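
    # Only tasks that completed successfully and produced at least one
    # designed sequence are sent to the judge panel.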
    eligible = [
        tid for tid, r in per_task_results.items()
        if r.get("success") and r.get("sequences")
    ]
    total = len(eligible)

    for i, task_id in enumerate(eligible):
        result = per_task_results[task_id]

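        # Look up the original task prompt; it is passed to the panel as
        # context for the evaluation.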
        task_data = get_task(task_id) or {}
        task_description = task_data.get("prompt_md") or f"BioDesignBench task {task_id}"

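        # Algorithmic metrics passed to the panel alongside the design
        # output: agent-reported metrics merged with the Boltz post-eval
        # metrics when present.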
        algo_metrics = result.get("agent_metrics", {})
        if "boltz_metrics" in result:
            algo_metrics = {**algo_metrics, **result["boltz_metrics"]}

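        # Run all judges synchronously and aggregate; a panel failure on
        # one task is logged and must not abort the rest of the submission.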
        try:
            judge_result = panel.evaluate_sync(
                task_description=task_description,
                tool_call_log=result.get("run_log", []),
                designed_sequences=result.get("sequences", []),
                algorithmic_metrics=algo_metrics,
            )
        except Exception as e:
            logger.error(f"Judge panel failed on {task_id}: {e}")
            judge_result = None

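        # Split each algorithmic component score into its algo portion,
        # then merge in the judge LLM portion to form the hybrid scores.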
        algo_full = _build_algo_dict(result)
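        # Per-component rubric maxima (they sum to 100); each is passed to
        # split_algo_score for its component.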
        rubric_max = {
            "approach": 20, "orchestration": 15, "quality": 35,
            "feasibility": 15, "novelty": 5, "diversity": 10,
        }
        algo_split = {
            comp: split_algo_score(comp, score, rubric_max[comp])
            for comp, score in algo_full.items()
        }

        hybrid = merge_algo_and_judge_scores(algo_split, judge_result)
        hybrid_total = sum(hybrid.values())

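        # Store the raw judge output and the final hybrid scores on the
        # task result.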
| result["judge_scores"] = judge_result |
| result["hybrid_scores"] = hybrid |
| result["hybrid_total"] = round(hybrid_total, 2) |
|
|
| if progress_callback: |
| progress_callback(task_id, i + 1, total) |
|
|
| logger.info( |
| f"[{i+1}/{total}] {task_id}: hybrid={hybrid_total:.1f}" |
| ) |
|
|
| return per_task_results |
|
|