| """Score aggregation and merging for LLM judge panel. |
| |
| Implements: |
| - Weighted averaging with outlier downweighting |
| - Algo + LLM score merging with rubric cap enforcement |
| - Weight split configuration (72/28 algo-LLM) |
| """ |
|
|
| from __future__ import annotations |
|
|
| import statistics |
| from typing import Any |
|
|
| from llm_judge.rubrics import JUDGE_DIMENSIONS |
|
|
|
|
| |
| |
| |
|
|
# Point budget of each rubric component, divided between the algorithmic
# scorer ("algo") and the LLM judge panel ("llm"). Across all components the
# algo shares sum to 72 and the LLM shares sum to 28.
WEIGHT_SPLIT: dict[str, dict[str, int]] = {
    "approach": {"algo": 10, "llm": 10},
    "orchestration": {"algo": 7, "llm": 8},
    "quality": {"algo": 35, "llm": 0},
    "feasibility": {"algo": 10, "llm": 5},
    "novelty": {"algo": 3, "llm": 2},
    "diversity": {"algo": 7, "llm": 3},
}


# Judge dimension name -> rubric component that dimension contributes to.
# "quality" has no judge dimension; its LLM share in WEIGHT_SPLIT is 0.
_JUDGE_DIM_TO_COMPONENT: dict[str, str] = {
    "approach_strategy": "approach",
    "orchestration_reasoning": "orchestration",
    "bio_feasibility": "feasibility",
    "novelty_quality": "novelty",
    "diversity_quality": "diversity",
}


# Maximum final score per component: the algo share plus the LLM share.
# Derived from WEIGHT_SPLIT rather than written out by hand so the two
# tables cannot drift apart when a weight is retuned.
_RUBRIC_MAX: dict[str, int] = {
    component: shares["algo"] + shares["llm"]
    for component, shares in WEIGHT_SPLIT.items()
}
|
|
|
|
def _judge_entry(
    judge_result: dict[str, dict[str, Any]],
    dim: str,
    default_score: float,
) -> tuple[float, str]:
    """Return ``(score, reasoning)`` for one judge and one dimension.

    A missing dimension, a non-dict entry, a missing ``"score"`` key, or a
    ``None`` score all fall back to ``default_score``. A missing or empty
    reasoning becomes ``""``.
    """
    entry = judge_result.get(dim)
    if not isinstance(entry, dict):
        return float(default_score), ""
    score = entry.get("score")
    if score is None:
        score = default_score
    return float(score), entry.get("reasoning") or ""


def _downweighted_mean(scores: list[float]) -> float:
    """Average ``scores``, counting values >2 points from the median at 0.5x."""
    med = statistics.median(scores)
    weights = [0.5 if abs(s - med) > 2.0 else 1.0 for s in scores]
    # Every weight is at least 0.5 and ``scores`` is non-empty, so the
    # denominator is always positive.
    return sum(s * w for s, w in zip(scores, weights)) / sum(weights)


def aggregate_judge_scores(
    judge_results: list[dict[str, dict[str, Any]]],
) -> dict[str, dict[str, Any]]:
    """Aggregate scores from multiple judges with outlier downweighting.

    For each dimension in ``JUDGE_DIMENSIONS``:
      1. Collect the raw score from every judge.
      2. Compute the median.
      3. Downweight outliers (>2 points from the median) by 0.5x.
      4. Compute the weighted average.
      5. Clamp to ``[0, max_score]`` and round to one decimal.

    With a single judge, steps 2-4 are skipped: that judge's score is used
    directly and its reasoning is returned without a ``[Judge N]`` prefix.
    The clamp and rounding of step 5 still apply, so an out-of-range score
    from a lone judge cannot exceed the dimension maximum.

    A judge that did not score a dimension is imputed ``0`` when it is the
    only judge, and ``max_score // 2`` when it is part of a panel.

    Args:
        judge_results: List of per-judge result dicts.
            Each maps dimension_name → {reasoning, score}.

    Returns:
        Aggregated dict mapping dimension_name → {score, reasoning, raw_scores}.
        ``raw_scores`` holds one float per judge, in input order.

    Raises:
        ValueError: If judge_results is empty.
    """
    if not judge_results:
        raise ValueError("aggregate_judge_scores requires at least one judge result")

    single_judge = len(judge_results) == 1

    aggregated: dict[str, dict[str, Any]] = {}
    for dim, info in JUDGE_DIMENSIONS.items():
        max_score = info["max_score"]
        # Keep the historical imputation for unscored dimensions: nothing for
        # a lone judge, the midpoint for a panel member.
        default_score = 0 if single_judge else max_score // 2

        entries = [_judge_entry(jr, dim, default_score) for jr in judge_results]
        raw_scores = [score for score, _ in entries]
        reasonings = [reasoning for _, reasoning in entries]

        if single_judge:
            combined = raw_scores[0]
            reasoning_text = reasonings[0]
        else:
            combined = _downweighted_mean(raw_scores)
            reasoning_text = " | ".join(
                f"[Judge {i+1}] {r}" for i, r in enumerate(reasonings) if r
            )

        clamped = float(max(0, min(combined, max_score)))
        aggregated[dim] = {
            "score": round(clamped, 1),
            "reasoning": reasoning_text,
            "raw_scores": raw_scores,
        }

    return aggregated
|
|
|
|
def split_algo_score(
    component: str,
    original_score: float,
    original_max: int,
) -> float:
    """Rescale an algorithmic score onto the component's algo-only budget.

    Algorithmic scorers grade against the full rubric maximum (approach is
    scored out of 20, say). In the hybrid system only part of that budget
    belongs to the algorithm (10 of the 20 for approach), so the score is
    shrunk proportionally.

    Components with no LLM share (quality, with its full 35 points) and
    components absent from ``WEIGHT_SPLIT`` are returned unchanged.

    Args:
        component: Rubric component name.
        original_score: Score computed against ``original_max``.
        original_max: Full rubric maximum the score was graded against.

    Returns:
        The score scaled to the algo-only portion, rounded to two decimals;
        ``0.0`` when ``original_max`` is 0; ``original_score`` untouched for
        unknown or algo-only components.
    """
    shares = WEIGHT_SPLIT.get(component)
    if shares is None:
        # Unknown component: nothing to scale against.
        return original_score

    algo_share, llm_share = shares["algo"], shares["llm"]
    if llm_share == 0:
        # The algorithm owns the whole budget, so the score passes through.
        return original_score

    if original_max == 0:
        # Avoid dividing by zero on a degenerate rubric.
        return 0.0

    return round(original_score / original_max * algo_share, 2)
|
|
|
|
def merge_algo_and_judge_scores(
    algo_scores: dict[str, float | int],
    judge_scores: dict[str, dict[str, Any]] | None,
) -> dict[str, float]:
    """Merge algorithmic and LLM judge scores into final component scores.

    Each component's final score is its algo-portion score plus the score of
    the judge dimension mapped to it (if any), capped at the component's
    rubric maximum. The cap is applied on every path, including when the
    LLM judge is disabled, so the result always honours the rubric.

    Args:
        algo_scores: Dict mapping component → algo-portion score.
            These should already be split via split_algo_score().
        judge_scores: Aggregated judge scores (from aggregate_judge_scores).
            None if LLM judge is disabled.

    Returns:
        Dict mapping component → final merged score (capped at rubric max).
        Components not listed in the rubric are capped at 100.
    """
    # Invert the dimension → component table once instead of rescanning it
    # for every component. setdefault keeps the first dimension listed for a
    # component, matching a first-match linear search.
    dim_for_component: dict[str, str] = {}
    for judge_dim, mapped_component in _JUDGE_DIM_TO_COMPONENT.items():
        dim_for_component.setdefault(mapped_component, judge_dim)

    merged = {}
    for component, algo_score in algo_scores.items():
        total = algo_score

        judge_dim = dim_for_component.get(component)
        if judge_scores is not None and judge_dim and judge_dim in judge_scores:
            llm_score = judge_scores[judge_dim].get("score", 0)
            # Tolerate a nested {"score": {"score": x}} payload.
            if isinstance(llm_score, dict):
                llm_score = llm_score.get("score", 0)
            # An explicit None score contributes nothing rather than raising
            # TypeError on the addition.
            if llm_score is not None:
                total = algo_score + llm_score

        merged[component] = min(total, _RUBRIC_MAX.get(component, 100))

    return merged
|
|