SDR-Arena / evaluation /weighted_score.py
behavior-in-the-wild's picture
Deploy SDR-Arena leaderboard
f9e2361 verified
"""
Weighted Coverage Score Calculator.
Computes weighted coverage scores from Likert (0-5) evaluations.
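Formula:
score = Sum(item_scores) / (5.0 * count_items)
Every item carries equal weight, so this is the mean item score divided by the maximum of 5.
The result ranges from 0.0 to 1.0 (multiply by 100 for a percentage).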
Adapted from calculate_weighted_coverage.py.
"""
from __future__ import annotations
from typing import Any, Dict, List, Optional
def calculate_score(coverage_data: Optional[Dict[str, Any]]) -> float:
    """
    Calculate the weighted coverage score from Likert evaluations.

    The score is the sum of all valid item scores divided by the maximum
    attainable total (5 points per valid item).

    Args:
        coverage_data: Dict with "evaluations" key containing list of
            evaluation dicts, each with a "score" field (0-5). An
            evaluation without a "score" key counts as a score of 0.

    Returns:
        The weighted coverage score as a float; it lies between 0.0 and
        1.0 whenever every score is within the 0-5 range (scores are not
        clamped). Returns 0.0 if input is empty or has no valid
        evaluations. Entries that are not mappings, and scores that are
        not real numbers (strings, None, booleans), are skipped and do
        not count towards the denominator.
    """
    # Local import keeps this block self-contained without touching the
    # module-level import section.
    from collections.abc import Mapping

    if not coverage_data:
        return 0.0
    evaluations = coverage_data.get("evaluations", [])
    if not evaluations:
        return 0.0

    total_score = 0
    count = 0
    for item in evaluations:
        # A malformed entry (None, str, ...) has no .get(); skip it rather
        # than letting one bad record abort the whole calculation.
        if not isinstance(item, Mapping):
            continue
        s = item.get("score", 0)
        # bool is a subclass of int, so it must be rejected explicitly;
        # True/False is not a meaningful Likert score.
        if isinstance(s, bool) or not isinstance(s, (int, float)):
            continue
        total_score += s
        count += 1

    if count == 0:
        return 0.0
    # Max score per item is 5
    max_possible = count * 5.0
    return total_score / max_possible
def calculate_score_percentage(coverage_data: Optional[Dict[str, Any]]) -> float:
    """
    Express the weighted coverage score on a percentage scale.

    Args:
        coverage_data: Dict with "evaluations" key; passed unchanged to
            calculate_score.

    Returns:
        The value returned by calculate_score multiplied by 100.0.
    """
    fraction = calculate_score(coverage_data)
    return fraction * 100.0
def aggregate_scores(scores: List[float]) -> float:
    """
    Return the arithmetic mean of the given scores.

    Args:
        scores: List of float scores.

    Returns:
        The mean of the scores, or 0.0 when the list is empty (or falsy).
    """
    return sum(scores) / len(scores) if scores else 0.0
def find_coverage_keys(data: List[Dict]) -> List[str]:
    """Collect every distinct key ending in _coverage_scores, sorted ascending."""
    suffix = "_coverage_scores"
    matches = set()
    for record in data:
        # Iterating a dict yields its keys; keep only the coverage-score ones.
        matches.update(name for name in record if name.endswith(suffix))
    return sorted(matches)