stylsteer-vlm / src /eval /metric.py
abka03's picture
Deploy StyleSteer-VLM demo
e6f24ae verified
"""HM(Style Score, Semantic Score, Fluency Score) β€” Primary metric.
Harmonic mean of the three LLM-judge dimensions.
"""
import logging
from typing import Dict, List, Tuple
import numpy as np
logger = logging.getLogger(__name__)
def harmonic_mean(*values: float) -> float:
    """Compute the harmonic mean of positive values.

    Args:
        *values: Numbers to average; each one is converted with ``float()``.

    Returns:
        ``len(values) / sum(1 / v)`` when every value is strictly positive.
        ``0.0`` when any value is zero or negative (penalises single-axis
        failure), and ``0.0`` when called with no values at all.
    """
    vals = [float(v) for v in values]
    # An empty call used to fall through to 0 / 0 and raise
    # ZeroDivisionError; treat "nothing to average" as a zero score instead.
    if not vals or any(v <= 0 for v in vals):
        return 0.0
    return len(vals) / sum(1.0 / v for v in vals)
def compute_hm(ss: float, sems: float, flu: float) -> float:
    """Combine the three judge dimensions into a single HM score.

    Args:
        ss: Style Score (1–5)
        sems: Semantic Score (1–5)
        flu: Fluency Score (1–5)

    Returns:
        The harmonic mean of the three scores, as computed by
        ``harmonic_mean``.
    """
    judge_axes = (ss, sems, flu)
    return harmonic_mean(*judge_axes)
def compute_hm_batch(scores: List[Dict[str, float]]) -> List[float]:
    """Compute one HM value per score dict, preserving input order.

    Args:
        scores: List of dicts with keys "ss", "sems", "flu"

    Returns:
        List of HM values, one for each entry of ``scores``.
    """
    hm_per_item: List[float] = []
    for entry in scores:
        hm_per_item.append(compute_hm(entry["ss"], entry["sems"], entry["flu"]))
    return hm_per_item
def aggregate_scores(
    all_scores: List[Dict[str, float]],
) -> Dict[str, float]:
    """Summarise per-image scores as mean and standard deviation.

    Args:
        all_scores: List of dicts with keys "ss", "sems", "flu".

    Returns:
        Dict with "<metric>_mean" and "<metric>_std" for ss, sems, flu and
        hm, plus "n" (the number of score dicts). Standard deviations come
        from ``np.std`` with its default arguments. An empty input yields
        zeros for every entry.
    """
    metric_names = ("ss", "sems", "flu", "hm")

    # Nothing to aggregate: report an all-zero summary.
    if not all_scores:
        blank: Dict[str, float] = {}
        for metric in metric_names:
            blank[f"{metric}_mean"] = 0
            blank[f"{metric}_std"] = 0
        blank["n"] = 0
        return blank

    # Collect one column of values per judge dimension, then derive HM.
    columns = {
        metric: [row[metric] for row in all_scores]
        for metric in metric_names[:3]
    }
    columns["hm"] = compute_hm_batch(all_scores)

    summary: Dict[str, float] = {}
    for metric in metric_names:
        column = columns[metric]
        summary[f"{metric}_mean"] = float(np.mean(column))
        summary[f"{metric}_std"] = float(np.std(column))
    summary["n"] = len(all_scores)
    return summary
def check_anomalies(
    results: Dict[str, Dict[str, float]],
    min_hm: float = 1.0,
) -> List[str]:
    """Scan per-method results for suspicious HM values.

    A method whose result dict has no "hm_mean" entry is treated as HM = 0.

    Flags, in this order:
        - Each method whose HM is NaN
        - All methods sharing the same HM (after rounding to 4 decimals),
          when there is more than one method
        - Each method whose non-NaN HM is below ``min_hm`` (this is how an
          HM of 0 gets reported)

    Args:
        results: Mapping of method name to its aggregated score dict.
        min_hm: Threshold below which an HM triggers a warning.

    Returns:
        List of warning messages (empty = all OK)
    """
    hm_by_method = {
        name: entry.get("hm_mean", 0) for name, entry in results.items()
    }

    # NaN scores first.
    messages = [
        f"ANOMALY: {name} has NaN HM score"
        for name, hm in hm_by_method.items()
        if np.isnan(hm)
    ]

    # Identical scores across several methods.
    distinct_hms = {round(hm, 4) for hm in hm_by_method.values()}
    if len(distinct_hms) == 1 and len(hm_by_method) > 1:
        messages.append("ANOMALY: All methods have identical HM scores")

    # Scores under the threshold; NaN was already reported above.
    messages.extend(
        f"WARNING: {name} HM={hm:.2f} < {min_hm}"
        for name, hm in hm_by_method.items()
        if hm < min_hm and not np.isnan(hm)
    )
    return messages