Spaces:

RomeroLab-Duke
/

BioDesignBench-Leaderboard

Running

File size: 6,308 Bytes

8e08ed6

"""Score aggregation and merging for LLM judge panel.

Implements:
- Weighted averaging with outlier downweighting
- Algo + LLM score merging with rubric cap enforcement
- Weight split configuration (72/28 algo-LLM)
"""

from __future__ import annotations

import statistics
from typing import Any

from llm_judge.rubrics import JUDGE_DIMENSIONS


# ---------------------------------------------------------------------------
# Weight split: algo + LLM portions per component (must sum to rubric max)
# ---------------------------------------------------------------------------

# Per-component point budget, one row per rubric component:
# (component, algorithmic points, LLM-judge points).
# "quality" is scored purely algorithmically, so its LLM share is 0.
WEIGHT_SPLIT: dict[str, dict[str, int]] = {
    component: {"algo": algo_points, "llm": llm_points}
    for component, algo_points, llm_points in (
        ("approach", 10, 10),       # 20 total
        ("orchestration", 7, 8),    # 15 total
        ("quality", 35, 0),         # 35 total (no LLM)
        ("feasibility", 10, 5),     # 15 total
        ("novelty", 3, 2),          # 5 total
        ("diversity", 7, 3),        # 10 total
    )
}

# Which rubric component each LLM judge dimension contributes to.
# "quality" has no judge dimension.
_JUDGE_DIM_TO_COMPONENT: dict[str, str] = dict(
    approach_strategy="approach",
    orchestration_reasoning="orchestration",
    bio_feasibility="feasibility",
    novelty_quality="novelty",
    diversity_quality="diversity",
)

# Rubric max per component: the algo and LLM portions added together
# (approach 20, orchestration 15, quality 35, feasibility 15, novelty 5,
# diversity 10).
_RUBRIC_MAX: dict[str, int] = {
    component: portions["algo"] + portions["llm"]
    for component, portions in WEIGHT_SPLIT.items()
}


def aggregate_judge_scores(
    judge_results: list[dict[str, dict[str, Any]]],
) -> dict[str, dict[str, Any]]:
    """Aggregate scores from multiple judges with outlier downweighting.

    With a single judge the scores are passed through (clamped to the
    dimension's valid range). With several judges, for each dimension:

    1. Collect raw scores from all judges
    2. Compute the median
    3. Downweight outliers (>2 points from the median) by 0.5x
    4. Compute the weighted average, clamp it to ``[0, max_score]`` and
       round it to one decimal place

    A judge that did not report a dimension is filled in with a default
    entry: score 0 in the single-judge case, half of ``max_score``
    (integer division) in the multi-judge case.

    Args:
        judge_results: List of per-judge result dicts.
            Each maps dimension_name → {reasoning, score}. ``reasoning``
            is optional and defaults to an empty string.

    Returns:
        Aggregated dict mapping dimension_name → {score, reasoning, raw_scores},
        with one entry per dimension in ``JUDGE_DIMENSIONS``.

    Raises:
        ValueError: If judge_results is empty.
        KeyError: If a reported dimension entry has no ``"score"`` key.
    """
    if not judge_results:
        raise ValueError("aggregate_judge_scores requires at least one judge result")

    if len(judge_results) == 1:
        # Single judge: nothing to average, but still tolerate a missing
        # "reasoning" key and keep the score inside the valid range, exactly
        # as the multi-judge path does.
        only_judge = judge_results[0]
        result = {}
        for dim, info in JUDGE_DIMENSIONS.items():
            entry = only_judge.get(dim, {"score": 0, "reasoning": ""})
            score = float(entry["score"])
            result[dim] = {
                "score": max(0.0, min(score, float(info["max_score"]))),
                "reasoning": entry.get("reasoning", ""),
                "raw_scores": [entry["score"]],
            }
        return result

    aggregated = {}
    for dim, info in JUDGE_DIMENSIONS.items():
        max_score = info["max_score"]
        # A judge that skipped this dimension contributes the midpoint score.
        fallback = {"score": max_score // 2, "reasoning": ""}
        entries = [jr.get(dim, fallback) for jr in judge_results]
        raw_scores = [float(entry["score"]) for entry in entries]
        reasonings = [entry.get("reasoning", "") for entry in entries]

        # Outlier detection: downweight scores >2 points from median
        med = statistics.median(raw_scores)
        weights = [0.5 if abs(s - med) > 2.0 else 1.0 for s in raw_scores]

        # Weighted average. Every weight is 0.5 or 1.0 and there is at least
        # one judge, so the weight total is always positive.
        avg = sum(s * w for s, w in zip(raw_scores, weights)) / sum(weights)

        # Clamp to valid range
        avg = max(0.0, min(avg, float(max_score)))

        aggregated[dim] = {
            "score": round(avg, 1),
            # Judge numbering follows the input order; empty reasonings are
            # skipped but do not shift the numbering.
            "reasoning": " | ".join(
                f"[Judge {i + 1}] {r}" for i, r in enumerate(reasonings) if r
            ),
            "raw_scores": raw_scores,
        }

    return aggregated


def split_algo_score(
    component: str,
    original_score: float,
    original_max: int,
) -> float:
    """Rescale a full-rubric algorithmic score onto its algo-only share.

    In the hybrid scheme an algorithmic score is first computed against
    the component's full rubric max and then shrunk proportionally to the
    points reserved for the algorithmic part in ``WEIGHT_SPLIT``.

    The score is returned untouched when the component is not listed in
    ``WEIGHT_SPLIT`` or when its LLM share is 0 (the algorithmic part
    already owns every point).

    Args:
        component: Rubric component name.
        original_score: Score computed against ``original_max``.
        original_max: Full rubric max the score was computed against.

    Returns:
        ``original_score / original_max * algo_share`` rounded to two
        decimals; ``0.0`` when ``original_max`` is 0; or ``original_score``
        unchanged in the pass-through cases above.
    """
    portions = WEIGHT_SPLIT.get(component)
    if portions is None:
        # Unknown component: nothing to scale against.
        return original_score

    algo_share = portions["algo"]
    if portions["llm"] == 0:
        # Purely algorithmic component keeps its full score.
        return original_score

    if original_max == 0:
        # Avoid dividing by zero on a degenerate rubric.
        return 0.0

    return round(original_score / original_max * algo_share, 2)


def merge_algo_and_judge_scores(
    algo_scores: dict[str, float | int],
    judge_scores: dict[str, dict[str, Any]] | None,
) -> dict[str, float]:
    """Combine algorithmic scores with LLM judge scores per component.

    For every component in ``algo_scores`` the judge dimension mapped to
    it (if any, and if present in ``judge_scores``) is added on top of
    the algorithmic score, and the sum is capped at the component's
    rubric max. Components without an entry in the rubric table are
    capped at 100.

    Args:
        algo_scores: Dict mapping component → algo-portion score.
            These should already be split via split_algo_score().
        judge_scores: Aggregated judge scores (from aggregate_judge_scores).
            None if LLM judge is disabled.

    Returns:
        Dict mapping component → final merged score. When ``judge_scores``
        is None this is a plain copy of ``algo_scores`` with no cap applied.
    """
    if judge_scores is None:
        return dict(algo_scores)

    final_scores = {}
    for component, algo_part in algo_scores.items():
        cap = _RUBRIC_MAX.get(component, 100)

        # First judge dimension that feeds this component, or None.
        dim_name = next(
            (
                dim
                for dim, target in _JUDGE_DIM_TO_COMPONENT.items()
                if target == component
            ),
            None,
        )

        combined = algo_part
        if dim_name and dim_name in judge_scores:
            judge_part = judge_scores[dim_name].get("score", 0)
            # Tolerate a nested {"score": {"score": x}} payload.
            if isinstance(judge_part, dict):
                judge_part = judge_part.get("score", 0)
            combined = algo_part + judge_part

        final_scores[component] = min(combined, cap)

    return final_scores