Spaces:

ashaibani
/

slipstream-webgpu

Running

App Files Files Community

slipstream-webgpu / src /evaluate.py

ashaibani

Slipstream WebGPU (in-browser agent)

c658ad5 verified 5 days ago

raw

history blame contribute delete

4.05 kB

	"""Evaluation metrics and harness.

	Headline results are reported on REAL held-out projects (Batselier) - never on the
	synthetic generator that trains the fine-tune. See PLAN.md sections 4 and 6.
	"""
	from __future__ import annotations

	import numpy as np

	from . import evm


	# --------------------------------------------------------------------------- #
	# Metrics
	# --------------------------------------------------------------------------- #
	def mae(y_true, y_pred) -> float:
	    """Mean absolute error between `y_true` and `y_pred`."""
	    residual = np.asarray(y_true) - np.asarray(y_pred)
	    abs_residual = np.abs(residual)
	    return float(np.mean(abs_residual))


	def rmse(y_true, y_pred) -> float:
	    """Root mean squared error between `y_true` and `y_pred`."""
	    residual = np.asarray(y_true) - np.asarray(y_pred)
	    mean_sq = np.mean(residual ** 2)
	    return float(np.sqrt(mean_sq))


	def mase(y_true, y_pred, y_train, m: int = 1) -> float:
	    """Mean Absolute Scaled Error (scale-free; primary point metric).

	    The forecast MAE is divided by the in-sample MAE of the seasonal-naive
	    forecast with period `m` (the plain naive forecast when `m` is 1).

	    Args:
	        y_true: Observed values over the forecast horizon.
	        y_pred: Point forecasts aligned with `y_true`.
	        y_train: In-sample series used to compute the scaling term.
	        m: Seasonal period (lag) of the naive benchmark; must be >= 1.

	    Returns:
	        The scaled error, or NaN when `y_train` has no more than `m` points or
	        the naive benchmark error is zero (e.g. a constant series).

	    Raises:
	        ValueError: If `m` is smaller than 1.
	    """
	    if m < 1:
	        raise ValueError("m must be a positive integer")
	    y_train = np.asarray(y_train, float)
	    if len(y_train) <= m:
	        return float("nan")
	    # Lag-m differences y[t] - y[t-m]. `np.diff(..., n=m)` would instead take
	    # the m-th ORDER difference, which only coincides with this for m == 1.
	    denom = np.mean(np.abs(y_train[m:] - y_train[:-m]))
	    if denom == 0:
	        return float("nan")
	    err = np.mean(np.abs(np.asarray(y_true, float) - np.asarray(y_pred, float)))
	    return float(err / denom)


	def pinball(y_true, q_pred, level: float) -> float:
	    """Average pinball (quantile) loss of `q_pred` at quantile `level`."""
	    actual = np.asarray(y_true, float)
	    forecast = np.asarray(q_pred, float)
	    resid = actual - forecast
	    # One candidate weights the residual by `level`, the other by `level - 1`;
	    # the element-wise maximum of the two is the loss.
	    weighted_by_level = level * resid
	    weighted_by_complement = (level - 1.0) * resid
	    loss = np.maximum(weighted_by_level, weighted_by_complement)
	    return float(np.mean(loss))


	def interval_coverage(y_true, lo, hi) -> float:
	    """Fraction of observations inside the closed interval [lo, hi].

	    For a P10-P90 band the target is roughly 0.8.
	    """
	    actual = np.asarray(y_true, float)
	    above_lower = actual >= np.asarray(lo)
	    below_upper = actual <= np.asarray(hi)
	    inside = above_lower & below_upper
	    return float(np.mean(inside))


	# --------------------------------------------------------------------------- #
	# Harness
	# --------------------------------------------------------------------------- #
	def evaluate_on_project(forecast_fn, project, context_frac: float = 0.5) -> dict | None:
	    """Score a forecaster on a single project.

	    The first `context_frac` of the project's EV increments (never fewer than
	    4 periods) is handed to `forecast_fn` as context; the forecast of the
	    remaining periods is scored against the held-out increments. A derived
	    EAC error is reported as well.

	    Args:
	        forecast_fn: Callable invoked as
	            `forecast_fn(obs, horizon, bac=..., planned_periods=...)`; its
	            result is indexed with the keys "q10", "q50" and "q90".
	        project: Object exposing `ev`, `bac` and `planned_finish`.
	        context_frac: Fraction of the increment series used as context.

	    Returns:
	        A dict with keys "mae", "mase", "pinball10", "pinball90", "coverage80"
	        and "eac_ape", or None when no period is left to forecast.
	    """
	    ev_inc = evm.to_increments(project.ev)
	    n = len(ev_inc)
	    k = max(4, int(n * context_frac))
	    horizon = n - k
	    if horizon < 1:
	        return None
	    obs, truth = ev_inc[:k], ev_inc[k:]

	    f = forecast_fn(obs, horizon, bac=project.bac, planned_periods=project.planned_finish)
	    q10, q50, q90 = f["q10"], f["q50"], f["q90"]

	    # Derived final-value (EAC) error: last observed cumulative EV plus the sum
	    # of the median increment forecast. Simplistic; refine with CPI.
	    # NOTE(review): assumes `evm.to_increments` yields one increment per entry
	    # of `project.ev`, so that index k - 1 is the last observed period - confirm.
	    last_cum_ev = project.ev[k - 1]
	    eac_pred = last_cum_ev + float(np.sum(q50))
	    eac_true = project.ev[-1]
	    # Divide by the magnitude so the percentage error can never come out negative.
	    eac_ape = float(abs(eac_pred - eac_true) / abs(eac_true)) if eac_true else np.nan

	    return {
	        "mae": mae(truth, q50),
	        "mase": mase(truth, q50, obs),
	        "pinball10": pinball(truth, q10, 0.1),
	        "pinball90": pinball(truth, q90, 0.9),
	        "coverage80": interval_coverage(truth, q10, q90),
	        "eac_ape": eac_ape,
	    }


	def evaluate_forecaster(forecast_fn, projects, context_frac: float = 0.5):
	    """Average each metric over the projects that could be scored.

	    Projects for which `evaluate_on_project` returns None are left out of the
	    mean.
	    """
	    import pandas as pd

	    scored = []
	    for proj in projects:
	        result = evaluate_on_project(forecast_fn, proj, context_frac)
	        if result is not None:
	            scored.append(result)
	    table = pd.DataFrame(scored)
	    return table.mean(numeric_only=True)


	def benchmark(forecasters: dict, projects, context_frac: float = 0.5):
	    """Run several named forecasters and return a comparison table.

	    Args:
	        forecasters: Mapping of display name to forecast callable.
	        projects: Iterable of projects; every forecaster is scored on all of them.
	        context_frac: Forwarded to `evaluate_forecaster`.

	    Returns:
	        A DataFrame with one row per forecaster name and one column per metric.
	    """
	    import pandas as pd

	    # Materialise once: `projects` is walked once per forecaster, so a generator
	    # would be exhausted after the first one and the rest would see no data.
	    projects = list(projects)
	    out = {name: evaluate_forecaster(fn, projects, context_frac) for name, fn in forecasters.items()}
	    return pd.DataFrame(out).T


	def load_dslib(path: str):
	    """Placeholder loader for the DSLIB (OR&S Ghent) real-project data set.

	    Intended to return a list of `synthetic.Project`-shaped records. The easiest
	    source is the per-project Excel output sheets (PV/EV/AC columns); otherwise
	    convert the `.p2x` files via PMConverter. Keep only 'green' (authentic,
	    complete) projects. See data/README.md.

	    Raises:
	        NotImplementedError: Always; the parser has not been written yet.
	    """
	    message = (
	        "Parse the DSLIB Excel output sheets (or .p2x via PMConverter) into the tidy "
	        "schema (project_id, period, pv, ev, ac, bac, planned_finish). See data/README.md."
	    )
	    raise NotImplementedError(message)