"""Evaluation metrics and harness.

Headline results are reported on REAL held-out projects (Batselier) - never on
the synthetic generator that trains the fine-tune. See PLAN.md sections 4 and 6.
"""
from __future__ import annotations

import numpy as np

from . import evm


# --------------------------------------------------------------------------- #
# Metrics
# --------------------------------------------------------------------------- #
def mae(y_true, y_pred) -> float:
    """Return the mean absolute error between `y_true` and `y_pred`."""
    # Cast to float so unsigned-integer inputs cannot wrap around on subtraction.
    err = np.asarray(y_true, float) - np.asarray(y_pred, float)
    return float(np.mean(np.abs(err)))


def rmse(y_true, y_pred) -> float:
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    err = np.asarray(y_true, float) - np.asarray(y_pred, float)
    return float(np.sqrt(np.mean(err ** 2)))


def mase(y_true, y_pred, y_train, m: int = 1) -> float:
    """Mean Absolute Scaled Error (scale-free; primary point metric).

    The forecast MAE is divided by the in-sample MAE of the (seasonal) naive
    forecast on `y_train`, i.e. the mean of ``|y[t] - y[t - m]|``.

    Args:
        y_true: Observed values over the forecast horizon.
        y_pred: Point forecasts aligned with `y_true`.
        y_train: In-sample history used to compute the naive-forecast scale.
        m: Seasonal period (lag) of the naive forecast; 1 means non-seasonal.

    Returns:
        The scaled error, or NaN when `y_train` has at most `m` points or the
        naive-forecast scale is exactly zero.

    Raises:
        ValueError: If `m` is smaller than 1.
    """
    if m < 1:
        raise ValueError("m must be a positive integer")
    y_train = np.asarray(y_train, float)
    if len(y_train) <= m:
        return np.nan
    # Lag-m difference y[t] - y[t-m]. Note that np.diff(y_train, n=m) is NOT
    # equivalent: it takes the m-th *order* difference, which only coincides
    # with the seasonal-naive error when m == 1.
    denom = np.mean(np.abs(y_train[m:] - y_train[:-m]))
    if denom == 0:
        return np.nan
    return float(mae(y_true, y_pred) / denom)


def pinball(y_true, q_pred, level: float) -> float:
    """Quantile (pinball) loss at a single quantile level.

    Args:
        y_true: Observed values.
        q_pred: Predicted quantile values aligned with `y_true`.
        level: Quantile level of `q_pred` (e.g. 0.1 for P10).

    Returns:
        The mean of ``max(level * e, (level - 1) * e)`` with ``e = y_true - q_pred``.
    """
    y_true = np.asarray(y_true, float)
    q_pred = np.asarray(q_pred, float)
    e = y_true - q_pred
    return float(np.mean(np.maximum(level * e, (level - 1.0) * e)))


def interval_coverage(y_true, lo, hi) -> float:
    """Empirical coverage of the [lo, hi] interval (target ~0.8 for P10-P90).

    Returns the fraction of `y_true` values that satisfy ``lo <= y <= hi``
    (both bounds inclusive).
    """
    y_true = np.asarray(y_true, float)
    lo = np.asarray(lo, float)
    hi = np.asarray(hi, float)
    return float(np.mean((y_true >= lo) & (y_true <= hi)))


# --------------------------------------------------------------------------- #
# Harness
# --------------------------------------------------------------------------- #
def evaluate_on_project(forecast_fn, project, context_frac: float = 0.5) -> dict | None:
    """Condition on the first `context_frac` of a project's EV increments and
    score the forecast of the remainder. Also reports derived EAC error.

    Args:
        forecast_fn: Callable invoked as
            ``forecast_fn(obs, horizon, bac=..., planned_periods=...)``. Its
            result is indexed with the keys ``"q10"``, ``"q50"`` and ``"q90"``.
        project: Record exposing `ev`, `bac` and `planned_finish`.
        context_frac: Fraction of the increment series used as context.

    Returns:
        A dict with the keys ``mae``, ``mase``, ``pinball10``, ``pinball90``,
        ``coverage80`` and ``eac_ape``, or None when fewer than one period is
        left to forecast after the context window.
    """
    # NOTE(review): presumably per-period differences of cumulative EV - confirm
    # against evm.to_increments.
    ev_inc = evm.to_increments(project.ev)
    n = len(ev_inc)
    # The context window is never shorter than 4 increments.
    k = max(4, int(n * context_frac))
    horizon = n - k
    if horizon < 1:
        return None

    obs, truth = ev_inc[:k], ev_inc[k:]
    f = forecast_fn(obs, horizon, bac=project.bac, planned_periods=project.planned_finish)

    # Derived final-value (EAC) error: re-integrate the median EV forecast on top
    # of the last cumulative EV inside the context window. No CPI / AC projection
    # is applied yet.
    # NOTE(review): assumes ev_inc[i] lines up with project.ev[i] - confirm.
    last_cum_ev = project.ev[k - 1]
    eac_pred = last_cum_ev + float(np.sum(f["q50"]))  # simplistic; refine with CPI
    eac_true = project.ev[-1]

    return {
        "mae": mae(truth, f["q50"]),
        # The naive-forecast scale comes from the context window only.
        "mase": mase(truth, f["q50"], obs),
        "pinball10": pinball(truth, f["q10"], 0.1),
        "pinball90": pinball(truth, f["q90"], 0.9),
        "coverage80": interval_coverage(truth, f["q10"], f["q90"]),
        # Undefined (NaN) when the final cumulative EV is zero.
        "eac_ape": abs(eac_pred - eac_true) / eac_true if eac_true else np.nan,
    }


def evaluate_forecaster(forecast_fn, projects, context_frac: float = 0.5):
    """Mean metrics across a set of projects.

    Projects for which `evaluate_on_project` returns None are skipped. Returns
    a pandas Series holding the per-metric mean over the remaining projects.
    """
    import pandas as pd

    rows = [evaluate_on_project(forecast_fn, p, context_frac) for p in projects]
    rows = [r for r in rows if r is not None]
    return pd.DataFrame(rows).mean(numeric_only=True)


def benchmark(forecasters: dict, projects, context_frac: float = 0.5):
    """Run several named forecasters and return a comparison table.

    Args:
        forecasters: Mapping of display name to forecast function.
        projects: Projects passed to `evaluate_forecaster` for every forecaster.
        context_frac: Fraction of each project used as context.

    Returns:
        A pandas DataFrame with one row per forecaster name and one column per
        metric.
    """
    import pandas as pd

    out = {
        name: evaluate_forecaster(fn, projects, context_frac)
        for name, fn in forecasters.items()
    }
    return pd.DataFrame(out).T


def load_dslib(path: str):
    """Load DSLIB (OR&S Ghent) real projects into a list of
    `synthetic.Project`-shaped records.

    Easiest source is the per-project Excel output sheets (PV/EV/AC columns);
    otherwise convert the `.p2x` files via PMConverter. Filter to 'green'
    (authentic, complete) projects. See data/README.md.

    Raises:
        NotImplementedError: Always; the loader has not been written yet.
    """
    raise NotImplementedError(
        "Parse the DSLIB Excel output sheets (or .p2x via PMConverter) into the tidy "
        "schema (project_id, period, pv, ev, ac, bac, planned_finish). See data/README.md."
    )