Jasonkim8652's picture
Phase A: integrate LLM judge panel for hybrid scoring
8e08ed6 verified
"""Score aggregation and merging for LLM judge panel.
Implements:
- Weighted averaging with outlier downweighting
- Algo + LLM score merging with rubric cap enforcement
- Weight split configuration (72/28 algo-LLM)
"""
from __future__ import annotations
import statistics
from typing import Any
from llm_judge.rubrics import JUDGE_DIMENSIONS
# ---------------------------------------------------------------------------
# Point budget per rubric component, divided between the algorithmic scorer
# ("algo") and the LLM judge panel ("llm"). The two shares of a component add
# up to that component's rubric maximum.
# ---------------------------------------------------------------------------
WEIGHT_SPLIT: dict[str, dict[str, int]] = {
    "approach": {"algo": 10, "llm": 10},      # 10 + 10 = 20
    "orchestration": {"algo": 7, "llm": 8},   # 7 + 8 = 15
    "quality": {"algo": 35, "llm": 0},        # 35 + 0 = 35 (algorithmic only)
    "feasibility": {"algo": 10, "llm": 5},    # 10 + 5 = 15
    "novelty": {"algo": 3, "llm": 2},         # 3 + 2 = 5
    "diversity": {"algo": 7, "llm": 3},       # 7 + 3 = 10
}

# Which rubric component each LLM judge dimension contributes to.
# "quality" has no judge dimension because its LLM share is 0.
_JUDGE_DIM_TO_COMPONENT: dict[str, str] = {
    "approach_strategy": "approach",
    "orchestration_reasoning": "orchestration",
    "bio_feasibility": "feasibility",
    "novelty_quality": "novelty",
    "diversity_quality": "diversity",
}

# Rubric maximum per component: the algo share plus the LLM share
# (approach 20, orchestration 15, quality 35, feasibility 15, novelty 5,
# diversity 10).
_RUBRIC_MAX: dict[str, int] = {
    name: shares["algo"] + shares["llm"]
    for name, shares in WEIGHT_SPLIT.items()
}
def aggregate_judge_scores(
    judge_results: list[dict[str, dict[str, Any]]],
) -> dict[str, dict[str, Any]]:
    """Aggregate scores from multiple judges with outlier downweighting.

    A single judge is passed through: its score for each dimension is
    clamped to ``[0, max_score]``, and a dimension it did not score gets 0
    with empty reasoning.

    With two or more judges, for each dimension:
        1. Collect raw scores from all judges; a judge with no score for the
           dimension contributes the midpoint ``max_score // 2``.
        2. Compute the median.
        3. Downweight outliers (>2 points from median) by 0.5x.
        4. Compute the weighted average, clamp it to ``[0, max_score]`` and
           round it to one decimal place.

    Args:
        judge_results: List of per-judge result dicts.
            Each maps dimension_name → {reasoning, score}.

    Returns:
        Aggregated dict mapping dimension_name → {score, reasoning,
        raw_scores}, with one entry per dimension in JUDGE_DIMENSIONS.

    Raises:
        ValueError: If judge_results is empty.
    """
    if not judge_results:
        raise ValueError("aggregate_judge_scores requires at least one judge result")

    if len(judge_results) == 1:
        # Single judge: nothing to average, but still tolerate missing keys
        # and keep the score inside the dimension's valid range, exactly as
        # the multi-judge path does.
        only_judge = judge_results[0]
        result: dict[str, dict[str, Any]] = {}
        for dim, info in JUDGE_DIMENSIONS.items():
            entry = only_judge.get(dim, {})
            raw = entry.get("score", 0)
            clamped = max(0.0, min(float(raw), float(info["max_score"])))
            result[dim] = {
                "score": clamped,
                "reasoning": entry.get("reasoning", ""),
                # Keep the judge's value as reported (pre-clamp) for auditing.
                "raw_scores": [raw],
            }
        return result

    aggregated: dict[str, dict[str, Any]] = {}
    for dim, info in JUDGE_DIMENSIONS.items():
        max_score = info["max_score"]
        # Neutral stand-in for a judge that did not score this dimension.
        midpoint = max_score // 2

        raw_scores: list[float] = []
        reasonings: list[str] = []
        for jr in judge_results:
            entry = jr.get(dim, {})
            raw_scores.append(float(entry.get("score", midpoint)))
            reasonings.append(entry.get("reasoning", ""))

        # Outlier detection: downweight scores >2 points from median
        med = statistics.median(raw_scores)
        weights = [0.5 if abs(s - med) > 2.0 else 1.0 for s in raw_scores]

        # Weighted average. There are at least two judges here and every
        # weight is >= 0.5, so the divisor is always positive.
        avg = sum(s * w for s, w in zip(raw_scores, weights)) / sum(weights)

        # Clamp to valid range
        avg = max(0, min(avg, max_score))

        aggregated[dim] = {
            "score": round(avg, 1),
            # Judges are numbered by position; empty reasonings are skipped
            # without renumbering the remaining judges.
            "reasoning": " | ".join(
                f"[Judge {i+1}] {r}" for i, r in enumerate(reasonings) if r
            ),
            "raw_scores": raw_scores,
        }
    return aggregated
def split_algo_score(
    component: str,
    original_score: float,
    original_max: int,
) -> float:
    """Rescale a full-rubric algorithmic score to its algo-only share.

    In the hybrid system the algorithmic scorer still grades against the
    original rubric maximum (e.g. approach out of 20); this converts that
    grade into the points reserved for the algorithmic portion (e.g. out of
    10) as listed in WEIGHT_SPLIT.

    The score is returned untouched when the component is not listed in
    WEIGHT_SPLIT or when its LLM share is 0 (quality keeps all 35 points).

    Args:
        component: Rubric component name.
        original_score: Score computed against the original max.
        original_max: Original rubric max for this component.

    Returns:
        The score scaled to the algo-only portion, rounded to two decimals;
        0.0 when original_max is 0.
    """
    shares = WEIGHT_SPLIT.get(component)
    if shares is None:
        # Unknown component: nothing to rescale against.
        return original_score

    algo_share, llm_share = shares["algo"], shares["llm"]
    if llm_share == 0:
        # The algorithmic scorer owns the whole component.
        return original_score

    if original_max == 0:
        # Avoid dividing by zero on a degenerate rubric.
        return 0.0

    return round(original_score / original_max * algo_share, 2)
def merge_algo_and_judge_scores(
    algo_scores: dict[str, float | int],
    judge_scores: dict[str, dict[str, Any]] | None,
) -> dict[str, float]:
    """Combine algorithmic and LLM judge scores into final component scores.

    Args:
        algo_scores: Dict mapping component → algo-portion score.
            These should already be split via split_algo_score().
        judge_scores: Aggregated judge scores (from aggregate_judge_scores),
            keyed by judge dimension. None if the LLM judge is disabled.

    Returns:
        Dict mapping component → final score. When judge_scores is None this
        is a plain copy of algo_scores; otherwise each value is the algo
        score plus the matching judge score (if any), capped at the
        component's rubric max (100 for components with no rubric entry).
    """
    if judge_scores is None:
        return dict(algo_scores)

    # Invert the dimension → component table once. setdefault keeps the first
    # dimension listed for a component, matching a first-match scan.
    dim_for_component: dict[str, str] = {}
    for dim_name, comp_name in _JUDGE_DIM_TO_COMPONENT.items():
        dim_for_component.setdefault(comp_name, dim_name)

    final_scores = {}
    for component, algo_part in algo_scores.items():
        combined = algo_part
        dim_name = dim_for_component.get(component)
        if dim_name and dim_name in judge_scores:
            llm_part = judge_scores[dim_name].get("score", 0)
            # Tolerate a nested {"score": {"score": x}} payload.
            if isinstance(llm_part, dict):
                llm_part = llm_part.get("score", 0)
            combined = algo_part + llm_part
        final_scores[component] = min(combined, _RUBRIC_MAX.get(component, 100))
    return final_scores