# slipstream-webgpu / src /evaluate.py
# ashaibani's picture
# Slipstream WebGPU (in-browser agent)
# c658ad5 verified
"""Evaluation metrics and harness.
Headline results are reported on REAL held-out projects (Batselier) - never on the
synthetic generator that trains the fine-tune. See PLAN.md sections 4 and 6.
"""
from __future__ import annotations
import numpy as np
from . import evm
# --------------------------------------------------------------------------- #
# Metrics
# --------------------------------------------------------------------------- #
def mae(y_true, y_pred) -> float:
    """Return the mean absolute error between `y_true` and `y_pred`.

    Both arguments are converted with `np.asarray`, so any array-like input
    that NumPy can subtract element-wise (or broadcast) is accepted.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    abs_residual = np.abs(actual - predicted)
    return float(np.mean(abs_residual))
def rmse(y_true, y_pred) -> float:
    """Return the root-mean-square error between `y_true` and `y_pred`.

    Both arguments are converted with `np.asarray` before the element-wise
    residual is squared, averaged, and square-rooted.
    """
    residual = np.asarray(y_true) - np.asarray(y_pred)
    mean_sq = np.mean(residual ** 2)
    return float(np.sqrt(mean_sq))
def mase(y_true, y_pred, y_train, m: int = 1) -> float:
    """Mean Absolute Scaled Error (scale-free; primary point metric).

    The forecast MAE is divided by the in-sample MAE of the (seasonal) naive
    forecast on `y_train`, i.e. the mean of ``|y_train[t] - y_train[t - m]|``.

    Args:
        y_true: Observed values over the forecast horizon.
        y_pred: Point forecasts aligned with `y_true`.
        y_train: In-sample history used to compute the scaling denominator.
        m: Lag (seasonal period) of the naive benchmark; 1 is the plain
            one-step naive forecast.

    Returns:
        The scaled error as a float, or NaN when `y_train` has no more than
        `m` points or the naive benchmark error is exactly zero.

    Raises:
        ValueError: If `m` is smaller than 1.
    """
    if m < 1:
        raise ValueError(f"m must be a positive integer, got {m}")
    y_train = np.asarray(y_train, float)
    if len(y_train) <= m:
        return np.nan
    # Lag-m difference, NOT np.diff(..., n=m): the latter is the m-th ORDER
    # difference, which only coincides with the naive-forecast error for m == 1.
    naive_abs_err = np.abs(y_train[m:] - y_train[:-m])
    denom = np.mean(naive_abs_err)
    if denom == 0:
        return np.nan
    # Numerator is the same quantity mae() computes; it is inlined so this
    # function does not depend on a sibling definition.
    forecast_abs_err = np.abs(np.asarray(y_true) - np.asarray(y_pred))
    return float(np.mean(forecast_abs_err) / denom)
def pinball(y_true, q_pred, level: float) -> float:
    """Return the mean quantile (pinball) loss for one quantile level.

    `y_true` and `q_pred` are cast to float arrays; `level` is the quantile
    the prediction targets (e.g. 0.1 for P10). Under-prediction is weighted
    by `level` and over-prediction by `1 - level`.
    """
    actual = np.asarray(y_true, float)
    quantile = np.asarray(q_pred, float)
    err = actual - quantile
    under_penalty = level * err
    over_penalty = (level - 1.0) * err
    per_point = np.maximum(under_penalty, over_penalty)
    return float(np.mean(per_point))
def interval_coverage(y_true, lo, hi) -> float:
    """Return the fraction of `y_true` inside the closed interval [lo, hi].

    Both bounds are inclusive. For a P10-P90 band the target is roughly 0.8.
    """
    actual = np.asarray(y_true, float)
    lower = np.asarray(lo)
    upper = np.asarray(hi)
    above_lower = actual >= lower
    below_upper = actual <= upper
    return float(np.mean(above_lower & below_upper))
# --------------------------------------------------------------------------- #
# Harness
# --------------------------------------------------------------------------- #
def evaluate_on_project(forecast_fn, project, context_frac: float = 0.5) -> dict | None:
    """Score one forecaster on one project using a context/holdout split.

    The project's EV series is converted to increments via
    `evm.to_increments`. The first ``k = max(4, int(n * context_frac))``
    increments are handed to `forecast_fn` as context, and the remaining
    ``n - k`` increments are the target.

    `forecast_fn` is called as
    ``forecast_fn(obs, horizon, bac=..., planned_periods=...)`` and must return
    a mapping with "q10", "q50" and "q90" entries.

    Returns a dict of metrics ("mae", "mase", "pinball10", "pinball90",
    "coverage80", "eac_ape"), or None when fewer than one period is left to
    forecast.
    """
    increments = evm.to_increments(project.ev)
    total = len(increments)
    split = max(4, int(total * context_frac))
    steps_ahead = total - split
    if steps_ahead < 1:
        return None

    context = increments[:split]
    actual = increments[split:]
    forecast = forecast_fn(
        context,
        steps_ahead,
        bac=project.bac,
        planned_periods=project.planned_finish,
    )

    # Derived final-value error: add the summed median increments to the
    # cumulative EV at the split and compare with the last cumulative EV.
    # This is the simplistic version; no CPI-based cost projection is applied.
    # NOTE(review): assumes increments[i] lines up with project.ev[i] - confirm
    # against evm.to_increments.
    ev_at_split = project.ev[split - 1]
    projected_final = ev_at_split + float(np.sum(forecast["q50"]))
    realised_final = project.ev[-1]

    metrics = {
        "mae": mae(actual, forecast["q50"]),
        "mase": mase(actual, forecast["q50"], context),
        "pinball10": pinball(actual, forecast["q10"], 0.1),
        "pinball90": pinball(actual, forecast["q90"], 0.9),
        "coverage80": interval_coverage(actual, forecast["q10"], forecast["q90"]),
    }
    if realised_final:
        metrics["eac_ape"] = abs(projected_final - realised_final) / realised_final
    else:
        # A zero final value makes the percentage error undefined.
        metrics["eac_ape"] = np.nan
    return metrics
def evaluate_forecaster(forecast_fn, projects, context_frac: float = 0.5):
    """Average the per-project metrics of one forecaster.

    Each project is scored with `evaluate_on_project`; projects for which it
    returns None are left out. The result is the column-wise mean of the
    remaining metric rows, as returned by `DataFrame.mean(numeric_only=True)`.
    """
    import pandas as pd

    scored = []
    for project in projects:
        metrics = evaluate_on_project(forecast_fn, project, context_frac)
        if metrics is not None:
            scored.append(metrics)

    table = pd.DataFrame(scored)
    return table.mean(numeric_only=True)
def benchmark(forecasters: dict, projects, context_frac: float = 0.5):
    """Run several named forecasters and return a comparison table.

    Args:
        forecasters: Mapping of display name to forecast function.
        projects: Iterable of projects; every forecaster is scored on all of
            them.
        context_frac: Fraction of each project used as context, forwarded to
            `evaluate_forecaster`.

    Returns:
        A DataFrame with one row per forecaster name and one column per
        metric.
    """
    import pandas as pd

    # Materialise once: `projects` is walked once per forecaster, so a one-shot
    # iterable (e.g. a generator) would otherwise be exhausted by the first
    # forecaster and leave the rest with nothing to score.
    project_list = list(projects)

    summary = {}
    for label, forecaster in forecasters.items():
        summary[label] = evaluate_forecaster(forecaster, project_list, context_frac)
    return pd.DataFrame(summary).T
def load_dslib(path: str):
    """Placeholder loader for the DSLIB (OR&S Ghent) real-project dataset.

    Intended to return a list of `synthetic.Project`-shaped records. The
    easiest source is the per-project Excel output sheets (PV/EV/AC columns);
    alternatively the `.p2x` files can be converted via PMConverter. Only
    'green' (authentic, complete) projects should be kept. See data/README.md.

    Not implemented yet: every call raises NotImplementedError.
    """
    todo = (
        "Parse the DSLIB Excel output sheets (or .p2x via PMConverter) into the tidy "
        "schema (project_id, period, pv, ev, ac, bac, planned_finish). See data/README.md."
    )
    raise NotImplementedError(todo)