File size: 5,255 Bytes
"""LLM Judge orchestration for the leaderboard backend.

Runs the cross-model judge panel on each successfully scored task and
merges the resulting LLM points into the algorithmic component scores
to produce hybrid totals (28 LLM points + 72 algorithmic points = 100).

The judge panel uses 3 judges from different model families with
self-exclusion (PoLL, Verga et al. 2024). Individual judge calls are
synchronous; we process tasks sequentially to keep the API spend
predictable. Provider keys are read from environment variables that
must be configured as HuggingFace Space secrets:

    ANTHROPIC_API_KEY
    OPENAI_API_KEY
    GOOGLE_API_KEY
    DEEPSEEK_API_KEY
"""
from __future__ import annotations
import logging
from typing import Any
from llm_judge import (
LLMJudgePanel,
detect_agent_family,
merge_algo_and_judge_scores,
split_algo_score,
)
logger = logging.getLogger(__name__)
def _build_algo_dict(task_result: dict[str, Any]) -> dict[str, float]:
"""Pull per-component algo scores from a task result.
Prefers 'cpu_scores' (post-Boltz) but falls back to 'final_scores'
if it has been computed already.
"""
if "cpu_scores" in task_result:
return dict(task_result["cpu_scores"])
if "final_scores" in task_result:
return dict(task_result["final_scores"])
return {
"approach": 0,
"orchestration": 0,
"quality": 0,
"feasibility": 0,
"novelty": 0,
"diversity": 0,
}
def run_judge_panel(
    per_task_results: dict[str, dict[str, Any]],
    agent_id: str,
    dry_run: bool = False,
    progress_callback=None,
) -> dict[str, dict[str, Any]]:
    """Run the LLM judge panel over every successful task in a submission.

    A single panel is built up front from the agent's detected model
    family (passed to LLMJudgePanel as ``agent_model_family``; per the
    module notes this is what drives self-exclusion). Then, for each task
    whose result has truthy 'success' and 'sequences':

      1. Look up the task prompt for judge context, falling back to a
         placeholder description when it cannot be obtained.
      2. Merge 'agent_metrics' and 'boltz_metrics' (the latter wins on
         key clashes) and hand them, with the run log and sequences, to
         ``panel.evaluate_sync``. A failing panel is logged and recorded
         as ``None`` rather than aborting the run.
      3. Reduce each algorithmic component score via ``split_algo_score``
         and combine with the judge result via
         ``merge_algo_and_judge_scores``.
      4. Store 'judge_scores', 'hybrid_scores' and 'hybrid_total'
         (rounded to 2 decimals) on the task result.

    The function modifies ``per_task_results`` in place and also returns it.

    Args:
        per_task_results: Dict mapping task_id to task result.
        agent_id: Agent identifier, used to detect the model family.
        dry_run: Forwarded to LLMJudgePanel (documented upstream as
            "judges return midpoint scores without API calls").
        progress_callback: Optional callable(task_id, i, total), invoked
            after each task with the 1-based count of processed tasks.

    Returns:
        The same dict object, with the three keys above added to every
        eligible task. Ineligible tasks are left untouched.
    """
    # Kept function-local, as in the original, so importing this module
    # does not import eval_tasks.
    from eval_tasks import get_task

    family = detect_agent_family(agent_id)
    panel = LLMJudgePanel(agent_model_family=family, dry_run=dry_run)
    logger.info(
        "LLM judge panel for agent '%s' (family=%s): %d judges, dry_run=%s",
        agent_id, family, len(panel.judges), dry_run,
    )

    # Maximum points per rubric component; loop-invariant, so built once.
    rubric_max = {
        "approach": 20, "orchestration": 15, "quality": 35,
        "feasibility": 15, "novelty": 5, "diversity": 10,
    }

    eligible = [
        tid for tid, r in per_task_results.items()
        if r.get("success") and r.get("sequences")
    ]
    total = len(eligible)

    for done, task_id in enumerate(eligible, start=1):
        result = per_task_results[task_id]

        # Pull task prompt for judge context. If the dataset is not
        # reachable (e.g., dev mode without HF_TOKEN) we still run with
        # a placeholder description rather than aborting the whole run.
        # NOTE(review): whether get_task signals that by returning None or
        # by raising is not visible here, so both cases are handled.
        try:
            task_data = get_task(task_id) or {}
        except Exception:
            logger.warning(
                "Could not load task %s; using placeholder description",
                task_id, exc_info=True,
            )
            task_data = {}
        task_description = (
            task_data.get("prompt_md") or f"BioDesignBench task {task_id}"
        )

        # `or {}` guards against keys that are present but set to None.
        algo_metrics = {
            **(result.get("agent_metrics") or {}),
            **(result.get("boltz_metrics") or {}),
        }

        try:
            judge_result = panel.evaluate_sync(
                task_description=task_description,
                tool_call_log=result.get("run_log", []),
                designed_sequences=result.get("sequences", []),
                algorithmic_metrics=algo_metrics,
            )
        except Exception as e:
            # Best-effort per task: keep going with algo-only scores, but
            # log the traceback so the failure can be diagnosed.
            logger.exception("Judge panel failed on %s: %s", task_id, e)
            judge_result = None

        # Build algo-portion dict (split each component down to its algo max)
        algo_split = {}
        for comp, score in _build_algo_dict(result).items():
            comp_max = rubric_max.get(comp)
            if comp_max is None:
                # A key outside the rubric (e.g. a precomputed aggregate)
                # used to raise KeyError and abort the whole submission.
                logger.warning(
                    "Ignoring non-rubric score component '%s' on %s",
                    comp, task_id,
                )
                continue
            algo_split[comp] = split_algo_score(comp, score, comp_max)

        hybrid = merge_algo_and_judge_scores(algo_split, judge_result)
        hybrid_total = sum(hybrid.values())

        result["judge_scores"] = judge_result
        result["hybrid_scores"] = hybrid
        result["hybrid_total"] = round(hybrid_total, 2)

        if progress_callback:
            progress_callback(task_id, done, total)

        logger.info(
            "[%d/%d] %s: hybrid=%.1f", done, total, task_id, hybrid_total
        )

    return per_task_results
|