Spaces:
Sleeping
Sleeping
| """HM(Style Score, Semantic Score, Fluency Score) — Primary metric. | |
| Harmonic mean of the three LLM-judge dimensions. | |
| """ | |
| import logging | |
| from typing import Dict, List, Tuple | |
| import numpy as np | |
| logger = logging.getLogger(__name__) | |
def harmonic_mean(*values: float) -> float:
    """Compute the harmonic mean of positive values.

    The harmonic mean is ``n / sum(1 / v)``. It is dominated by the smallest
    input, so a single weak score pulls the result down.

    Args:
        *values: The numbers to combine. Each one is converted with ``float``.

    Returns:
        The harmonic mean of ``values``. Returns 0.0 when no values are given
        or when any value is zero or negative (penalises single-axis failure).
        A NaN input is not caught by the non-positive check and propagates
        to the result.
    """
    floats = [float(v) for v in values]
    # An empty call would otherwise compute 0 / 0 and raise ZeroDivisionError;
    # a non-positive value would divide by zero or give a meaningless mean.
    if not floats or any(v <= 0 for v in floats):
        return 0.0
    return len(floats) / sum(1.0 / v for v in floats)
def compute_hm(ss: float, sems: float, flu: float) -> float:
    """Combine the three judge scores into one HM value.

    This is a thin wrapper that passes the three scores to ``harmonic_mean``.

    Args:
        ss: Style Score (1-5)
        sems: Semantic Score (1-5)
        flu: Fluency Score (1-5)

    Returns:
        Harmonic mean of the three scores.
    """
    judge_scores = (ss, sems, flu)
    return harmonic_mean(*judge_scores)
def compute_hm_batch(scores: List[Dict[str, float]]) -> List[float]:
    """Compute one HM value per score dict, preserving input order.

    Args:
        scores: List of dicts with keys "ss", "sems", "flu"

    Returns:
        List of HM values, one for each entry of ``scores``.

    Raises:
        KeyError: If an entry lacks one of the three required keys.
    """
    hm_values: List[float] = []
    for entry in scores:
        style, semantic, fluency = entry["ss"], entry["sems"], entry["flu"]
        hm_values.append(compute_hm(style, semantic, fluency))
    return hm_values
def aggregate_scores(
    all_scores: List[Dict[str, float]],
) -> Dict[str, float]:
    """Summarise per-image scores as mean and standard deviation.

    Args:
        all_scores: One dict per image with keys "ss", "sems", "flu".

    Returns:
        Dict with "<metric>_mean" and "<metric>_std" for ss, sems, flu and
        hm, plus "n" (the number of entries). Every value is 0 when
        ``all_scores`` is empty. The standard deviation is ``np.std`` with
        its default arguments.
    """
    if not all_scores:
        # Nothing to aggregate: report zeros rather than calling numpy on
        # empty lists.
        zero_keys = (
            "ss_mean", "ss_std",
            "sems_mean", "sems_std",
            "flu_mean", "flu_std",
            "hm_mean", "hm_std",
            "n",
        )
        return dict.fromkeys(zero_keys, 0)

    # (mean key, std key, values) for each reported metric, in output order.
    columns = [
        ("ss_mean", "ss_std", [row["ss"] for row in all_scores]),
        ("sems_mean", "sems_std", [row["sems"] for row in all_scores]),
        ("flu_mean", "flu_std", [row["flu"] for row in all_scores]),
    ]
    columns.append(("hm_mean", "hm_std", compute_hm_batch(all_scores)))

    summary = {}
    for mean_key, std_key, column in columns:
        summary[mean_key] = float(np.mean(column))
        summary[std_key] = float(np.std(column))
    summary["n"] = len(all_scores)
    return summary
def check_anomalies(
    results: Dict[str, Dict[str, float]],
    min_hm: float = 1.0,
) -> List[str]:
    """Inspect per-method results and report suspicious HM values.

    A method whose result has no "hm_mean" entry is treated as HM = 0.

    Reported conditions, in this order:
        - a method whose HM is NaN (one message per method)
        - two or more methods whose HMs are all equal after rounding to
          4 decimals
        - a method whose HM is below ``min_hm`` (one message per method);
          an HM of 0 is only reported through this check

    Args:
        results: Mapping of method name to its aggregated score dict.
        min_hm: Threshold below which an HM triggers a warning.

    Returns:
        List of warning messages (empty = all OK)
    """
    findings = []
    # Resolve each method's HM once; a missing entry falls back to 0.
    scored = [(name, entry.get("hm_mean", 0)) for name, entry in results.items()]

    # NaN scores
    findings.extend(
        f"ANOMALY: {name} has NaN HM score"
        for name, value in scored
        if np.isnan(value)
    )

    # Identical scores across methods (only meaningful with 2+ methods)
    distinct = {round(value, 4) for _, value in scored}
    if len(scored) > 1 and len(distinct) == 1:
        findings.append("ANOMALY: All methods have identical HM scores")

    # Scores under the threshold
    for name, value in scored:
        if value < min_hm and not np.isnan(value):
            findings.append(f"WARNING: {name} HM={value:.2f} < {min_hm}")

    return findings