Spaces:

abka03
/

stylsteer-vlm

Sleeping

App Files Files Community

stylsteer-vlm / src /eval /metric.py

abka03

Deploy StyleSteer-VLM demo

e6f24ae verified 2 months ago

raw

history blame contribute delete

3.22 kB

	"""HM(Style Score, Semantic Score, Fluency Score) — Primary metric.

	Harmonic mean of the three LLM-judge dimensions.
	"""

	import logging
	from typing import Dict, List, Tuple

	import numpy as np

	logger = logging.getLogger(__name__)


	def harmonic_mean(*values: float) -> float:
	    """Compute the harmonic mean of positive values.

	    Args:
	        *values: Scores to combine; each one is converted with ``float()``.

	    Returns:
	        ``n / sum(1 / v)`` over the ``n`` inputs. Returns 0.0 when no values
	        are given, or when any value is zero or negative (penalises
	        single-axis failure). A NaN input is not caught by the ``<= 0``
	        check and therefore propagates to the result.
	    """
	    floats = [float(v) for v in values]
	    # With no inputs the formula would be 0 / 0; report 0.0 instead of
	    # raising ZeroDivisionError, the same value used for a failed axis.
	    if not floats or any(v <= 0 for v in floats):
	        return 0.0
	    return len(floats) / sum(1.0 / v for v in floats)


	def compute_hm(ss: float, sems: float, flu: float) -> float:
	    """Combine the three judge scores into HM(Style, Semantic, Fluency).

	    Args:
	        ss: Style Score (1–5)
	        sems: Semantic Score (1–5)
	        flu: Fluency Score (1–5)

	    Returns:
	        Harmonic mean of the three scores, as computed by ``harmonic_mean``.
	    """
	    judge_scores = (ss, sems, flu)
	    return harmonic_mean(*judge_scores)


	def compute_hm_batch(scores: List[Dict[str, float]]) -> List[float]:
	    """Apply ``compute_hm`` to every score dict in a batch.

	    Args:
	        scores: List of dicts, each providing the keys "ss", "sems", "flu".

	    Returns:
	        One HM value per input dict, in the same order as ``scores``.

	    Raises:
	        KeyError: If a dict is missing one of the three keys.
	    """
	    return [
	        compute_hm(ss=row["ss"], sems=row["sems"], flu=row["flu"])
	        for row in scores
	    ]


	def aggregate_scores(
	    all_scores: List[Dict[str, float]],
	) -> Dict[str, float]:
	    """Summarise a collection of score dicts as means and standard deviations.

	    Args:
	        all_scores: List of dicts, each providing the keys "ss", "sems", "flu".

	    Returns:
	        Dict with "<axis>_mean" and "<axis>_std" for ss, sems, flu and hm,
	        plus "n", the number of score dicts. Every entry is 0 when
	        ``all_scores`` is empty. Standard deviations are ``np.std`` called
	        with its default arguments.
	    """
	    if not all_scores:
	        # Nothing to aggregate: every statistic, and the count, is zero.
	        return dict.fromkeys(
	            (
	                "ss_mean", "ss_std",
	                "sems_mean", "sems_std",
	                "flu_mean", "flu_std",
	                "hm_mean", "hm_std",
	                "n",
	            ),
	            0,
	        )

	    def _mean_std(column) -> Tuple[float, float]:
	        # Convert NumPy scalars to plain Python floats.
	        return float(np.mean(column)), float(np.std(column))

	    # Collect every column before computing any statistic.
	    style = [row["ss"] for row in all_scores]
	    semantic = [row["sems"] for row in all_scores]
	    fluency = [row["flu"] for row in all_scores]
	    harmonic = compute_hm_batch(all_scores)

	    summary: Dict[str, float] = {}
	    summary["ss_mean"], summary["ss_std"] = _mean_std(style)
	    summary["sems_mean"], summary["sems_std"] = _mean_std(semantic)
	    summary["flu_mean"], summary["flu_std"] = _mean_std(fluency)
	    summary["hm_mean"], summary["hm_std"] = _mean_std(harmonic)
	    summary["n"] = len(all_scores)
	    return summary


	def check_anomalies(
	    results: Dict[str, Dict[str, float]],
	    min_hm: float = 1.0,
	) -> List[str]:
	    """Scan per-method results for suspicious HM values.

	    Args:
	        results: Mapping of method name to its aggregated scores. Only the
	            "hm_mean" entry is read; a missing entry is treated as 0.
	        min_hm: Threshold below which a non-NaN HM triggers a warning.

	    Returns:
	        List of warning messages (empty = all OK), in this order:
	        one message per method whose HM is NaN, then a single message if
	        there is more than one method and all HMs agree after rounding to
	        4 decimals, then one message per method whose HM is below
	        ``min_hm``.
	    """
	    # Resolve each method's HM once so all three checks see the same value.
	    hm_by_method = {
	        name: stats.get("hm_mean", 0) for name, stats in results.items()
	    }
	    is_nan = {name: bool(np.isnan(hm)) for name, hm in hm_by_method.items()}

	    # NaN scores are reported first.
	    messages = [
	        f"ANOMALY: {name} has NaN HM score"
	        for name, flagged in is_nan.items()
	        if flagged
	    ]

	    # Identical scores across several methods.
	    distinct = {round(hm, 4) for hm in hm_by_method.values()}
	    if len(hm_by_method) > 1 and len(distinct) == 1:
	        messages.append("ANOMALY: All methods have identical HM scores")

	    # Scores under the threshold; NaN was already reported above.
	    messages.extend(
	        f"WARNING: {name} HM={hm:.2f} < {min_hm}"
	        for name, hm in hm_by_method.items()
	        if not is_nan[name] and hm < min_hm
	    )

	    return messages