# Hugging Face Space: RomeroLab-Duke/BioDesignBench-Leaderboard (status: Running)
# File size: 5,255 bytes; commit 8e08ed6

"""LLM Judge orchestration for the leaderboard backend.

Runs the cross-model judge panel on each successfully scored task and
merges the resulting LLM points into the algorithmic component scores
to produce hybrid totals (28 LLM points + 72 algorithmic points = 100).

The judge panel uses 3 judges from different model families with
self-exclusion (PoLL, Verga et al. 2024). Individual judge calls are
synchronous; we process tasks sequentially to keep the API spend
predictable. Provider keys are read from environment variables that
must be configured as HuggingFace Space secrets:

    ANTHROPIC_API_KEY
    OPENAI_API_KEY
    GOOGLE_API_KEY
    DEEPSEEK_API_KEY
"""

from __future__ import annotations

import logging
from typing import Any

from llm_judge import (
    LLMJudgePanel,
    detect_agent_family,
    merge_algo_and_judge_scores,
    split_algo_score,
)

logger = logging.getLogger(__name__)


def _build_algo_dict(task_result: dict[str, Any]) -> dict[str, float]:
    """Return the per-component algorithmic scores stored on a task result.

    The lookup order is 'cpu_scores' first, then 'final_scores'; the first
    key that is present wins and a shallow copy of its mapping is returned,
    so callers can mutate the result without touching the task result.

    When neither key is present, every rubric component is reported as 0.
    """
    # Ordered by priority: the first key found in the task result is used.
    for score_key in ("cpu_scores", "final_scores"):
        if score_key in task_result:
            return dict(task_result[score_key])

    # No scores recorded at all: fall back to an all-zero component table.
    component_names = (
        "approach",
        "orchestration",
        "quality",
        "feasibility",
        "novelty",
        "diversity",
    )
    return dict.fromkeys(component_names, 0)


def run_judge_panel(
    per_task_results: dict[str, dict[str, Any]],
    agent_id: str,
    dry_run: bool = False,
    progress_callback=None,
) -> dict[str, dict[str, Any]]:
    """Run the LLM judge panel over every successful task in a submission.

    A task is eligible when its result has truthy 'success' and truthy
    'sequences' entries. For each eligible task, in dict order:
      1. Look up the task prompt via ``get_task``; if no task data or no
         'prompt_md' is available, a placeholder description is used.
      2. Merge 'agent_metrics' with 'boltz_metrics' (the latter wins on key
         clashes) and hand them to the panel together with the run log and
         the designed sequences.
      3. Reduce every algorithmic component score to its algo portion with
         ``split_algo_score`` and combine it with the judge result through
         ``merge_algo_and_judge_scores``.
      4. Store 'judge_scores', 'hybrid_scores' and 'hybrid_total' (rounded
         to 2 decimals) on the task result.

    A single panel, built once from the agent's detected model family, is
    reused for all tasks. If the panel raises for a task, the failure is
    logged with its traceback, 'judge_scores' is stored as None, and None is
    passed to ``merge_algo_and_judge_scores`` so the remaining tasks still
    run.

    The function modifies per_task_results in place and also returns it.
    Ineligible tasks are left untouched.

    Args:
        per_task_results: Dict mapping task_id to that task's result dict.
        agent_id: Agent identifier used to detect the model family passed
            to the panel (e.g., 'gpt5-tools').
        dry_run: Forwarded to ``LLMJudgePanel``.
        progress_callback: Optional callable(task_id, done, total), invoked
            after each eligible task with a 1-based ``done`` counter.

    Returns:
        The same dict object that was passed in.
    """
    from eval_tasks import get_task

    # Per-component maximum passed to split_algo_score. Loop-invariant, so
    # it is built once rather than once per task.
    rubric_max = {
        "approach": 20, "orchestration": 15, "quality": 35,
        "feasibility": 15, "novelty": 5, "diversity": 10,
    }

    family = detect_agent_family(agent_id)
    panel = LLMJudgePanel(agent_model_family=family, dry_run=dry_run)
    logger.info(
        "LLM judge panel for agent '%s' (family=%s): %s judges, dry_run=%s",
        agent_id, family, len(panel.judges), dry_run,
    )

    eligible = [
        tid for tid, r in per_task_results.items()
        if r.get("success") and r.get("sequences")
    ]
    total = len(eligible)

    for done, task_id in enumerate(eligible, start=1):
        result = per_task_results[task_id]

        # Pull task prompt for judge context. A falsy get_task() result or
        # a missing prompt falls back to a placeholder description rather
        # than aborting the whole run.
        task_data = get_task(task_id) or {}
        task_description = task_data.get("prompt_md") or f"BioDesignBench task {task_id}"

        algo_metrics = result.get("agent_metrics", {})
        if "boltz_metrics" in result:
            algo_metrics = {**algo_metrics, **result["boltz_metrics"]}

        try:
            judge_result = panel.evaluate_sync(
                task_description=task_description,
                tool_call_log=result.get("run_log", []),
                designed_sequences=result.get("sequences", []),
                algorithmic_metrics=algo_metrics,
            )
        except Exception as e:
            # Best-effort: one failing task must not abort the submission.
            # logger.exception keeps the traceback for later debugging.
            logger.exception("Judge panel failed on %s: %s", task_id, e)
            judge_result = None

        # Build algo-portion dict (split each component down to its algo max).
        # Keys outside the rubric (e.g. an aggregate entry) are skipped with
        # a warning instead of raising KeyError and killing the whole run.
        algo_split = {}
        for comp, score in _build_algo_dict(result).items():
            if comp not in rubric_max:
                logger.warning(
                    "Skipping non-rubric score component '%s' on %s",
                    comp, task_id,
                )
                continue
            algo_split[comp] = split_algo_score(comp, score, rubric_max[comp])

        hybrid = merge_algo_and_judge_scores(algo_split, judge_result)
        hybrid_total = sum(hybrid.values())

        result["judge_scores"] = judge_result
        result["hybrid_scores"] = hybrid
        result["hybrid_total"] = round(hybrid_total, 2)

        if progress_callback:
            progress_callback(task_id, done, total)

        logger.info("[%d/%d] %s: hybrid=%.1f", done, total, task_id, hybrid_total)

    return per_task_results