File size: 5,255 Bytes
8e08ed6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""LLM Judge orchestration for the leaderboard backend.

Runs the cross-model judge panel on each successfully scored task and
merges the resulting LLM points into the algorithmic component scores
to produce hybrid totals (28 LLM points + 72 algorithmic points = 100).

The judge panel uses 3 judges from different model families with
self-exclusion (PoLL, Verga et al. 2024). Individual judge calls are
synchronous; we process tasks sequentially to keep the API spend
predictable. Provider keys are read from environment variables that
must be configured as HuggingFace Space secrets:

    ANTHROPIC_API_KEY
    OPENAI_API_KEY
    GOOGLE_API_KEY
    DEEPSEEK_API_KEY
"""

from __future__ import annotations

import logging
from typing import Any

from llm_judge import (
    LLMJudgePanel,
    detect_agent_family,
    merge_algo_and_judge_scores,
    split_algo_score,
)

logger = logging.getLogger(__name__)


def _build_algo_dict(task_result: dict[str, Any]) -> dict[str, float]:
    """Pull per-component algo scores from a task result.

    Prefers 'cpu_scores' (post-Boltz) but falls back to 'final_scores'
    if it has been computed already.
    """
    if "cpu_scores" in task_result:
        return dict(task_result["cpu_scores"])
    if "final_scores" in task_result:
        return dict(task_result["final_scores"])
    return {
        "approach": 0,
        "orchestration": 0,
        "quality": 0,
        "feasibility": 0,
        "novelty": 0,
        "diversity": 0,
    }


def run_judge_panel(
    per_task_results: dict[str, dict[str, Any]],
    agent_id: str,
    dry_run: bool = False,
    progress_callback=None,
) -> dict[str, dict[str, Any]]:
    """Run the LLM judge panel over every successful task in a submission.

    A task is eligible when its result has a truthy ``"success"`` and a
    truthy ``"sequences"`` entry. For each eligible task, in order:
      1. Look up the original task prompt (used to give the panel context);
         if the lookup fails or returns nothing, a placeholder description
         is used instead.
      2. Call ``panel.evaluate_sync`` with the prompt, the run log, the
         designed sequences and the merged agent + Boltz metrics. A failure
         here is logged and the task continues with ``judge_result = None``.
      3. Compute the hybrid component scores by:
           - splitting each algo score into its algo-portion (split_algo_score)
           - adding the matching judge LLM-portion (merge_algo_and_judge_scores)
      4. Store the raw judge result and the hybrid scores on the task.

    The panel itself is built once, before the loop, from the agent's model
    family so that the agent's own family can be excluded.

    The function modifies per_task_results in place and also returns it.

    Args:
        per_task_results: Dict mapping task_id → task result (from the
            dispatcher + boltz post-eval pipeline).
        agent_id: Agent identifier for self-exclusion (e.g., 'gpt5-tools').
        dry_run: If True, judges return midpoint scores without API calls.
        progress_callback: Optional callable(task_id, i, total), invoked
            after each task with a 1-based index.

    Returns:
        The same dict, now augmented with 'judge_scores', 'hybrid_scores'
        and 'hybrid_total' on each eligible entry.
    """
    from eval_tasks import get_task

    family = detect_agent_family(agent_id)
    panel = LLMJudgePanel(agent_model_family=family, dry_run=dry_run)
    logger.info(
        f"LLM judge panel for agent '{agent_id}' (family={family}): "
        f"{len(panel.judges)} judges, dry_run={dry_run}"
    )

    # Maximum points per rubric component (sums to 100). Loop-invariant, so
    # it is built once rather than on every task.
    rubric_max = {
        "approach": 20, "orchestration": 15, "quality": 35,
        "feasibility": 15, "novelty": 5, "diversity": 10,
    }

    eligible = [
        tid for tid, r in per_task_results.items()
        if r.get("success") and r.get("sequences")
    ]
    total = len(eligible)

    for position, task_id in enumerate(eligible, start=1):
        result = per_task_results[task_id]

        # Pull task prompt for judge context. If the dataset is not
        # reachable (e.g., dev mode without HF_TOKEN) we still run with
        # a placeholder description rather than aborting the whole run.
        try:
            task_data = get_task(task_id) or {}
        except Exception:
            logger.warning(
                f"Could not load task data for {task_id}; "
                "using placeholder description",
                exc_info=True,
            )
            task_data = {}
        task_description = task_data.get("prompt_md") or f"BioDesignBench task {task_id}"

        # Boltz metrics, when present, override same-named agent metrics.
        algo_metrics = result.get("agent_metrics", {})
        if "boltz_metrics" in result:
            algo_metrics = {**algo_metrics, **result["boltz_metrics"]}

        try:
            judge_result = panel.evaluate_sync(
                task_description=task_description,
                tool_call_log=result.get("run_log", []),
                designed_sequences=result.get("sequences", []),
                algorithmic_metrics=algo_metrics,
            )
        except Exception as e:
            # Best-effort: one failing judge call must not abort the run.
            # logger.exception keeps the traceback for debugging.
            logger.exception(f"Judge panel failed on {task_id}: {e}")
            # NOTE(review): merge_algo_and_judge_scores is then called with
            # None; presumably it tolerates a missing judge result — confirm.
            judge_result = None

        # Build algo-portion dict (split each component down to its algo max)
        algo_full = _build_algo_dict(result)

        # Keys outside the rubric (e.g. an aggregate stored alongside the
        # components) have no maximum to split against; skip them instead
        # of letting a KeyError abort the whole submission.
        unknown = [comp for comp in algo_full if comp not in rubric_max]
        if unknown:
            logger.warning(
                f"Ignoring non-rubric score keys on {task_id}: {unknown}"
            )
        algo_split = {
            comp: split_algo_score(comp, score, rubric_max[comp])
            for comp, score in algo_full.items()
            if comp in rubric_max
        }

        hybrid = merge_algo_and_judge_scores(algo_split, judge_result)
        hybrid_total = sum(hybrid.values())

        result["judge_scores"] = judge_result
        result["hybrid_scores"] = hybrid
        result["hybrid_total"] = round(hybrid_total, 2)

        if progress_callback:
            progress_callback(task_id, position, total)

        logger.info(
            f"[{position}/{total}] {task_id}: hybrid={hybrid_total:.1f}"
        )

    return per_task_results