"""Evaluation metrics and harness.

Headline results are reported on REAL held-out projects (Batselier) - never on the
synthetic generator that trains the fine-tune. See PLAN.md sections 4 and 6.
"""
from __future__ import annotations

import numpy as np

from . import evm


# --------------------------------------------------------------------------- #
# Metrics
# --------------------------------------------------------------------------- #
def mae(y_true, y_pred) -> float:
    """Return the mean absolute error between targets and predictions.

    Both inputs are cast to float before subtracting so that unsigned or
    narrow integer arrays cannot wrap around (e.g. uint8 ``1 - 2`` would
    otherwise become 255). `pinball` casts its inputs the same way.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predictions, broadcastable against `y_true`.

    Returns:
        Mean of ``|y_true - y_pred|`` as a Python float.
    """
    err = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.mean(np.abs(err)))


def rmse(y_true, y_pred) -> float:
    """Return the root-mean-square error between targets and predictions.

    Inputs are cast to float first: in a narrow integer dtype both the
    subtraction and the squaring can overflow silently and give a wrong
    (too small) error.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predictions, broadcastable against `y_true`.

    Returns:
        ``sqrt(mean((y_true - y_pred) ** 2))`` as a Python float.
    """
    err = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean(np.square(err))))


def mase(y_true, y_pred, y_train, m: int = 1) -> float:
    """Mean Absolute Scaled Error (scale-free; primary point metric).

    The forecast MAE is divided by the in-sample MAE of the lag-`m`
    (seasonal) naive forecast on `y_train`, i.e. the mean of
    ``|y_train[t] - y_train[t - m]|``.

    Args:
        y_true: Array-like of observed values over the forecast horizon.
        y_pred: Array-like of point forecasts, broadcastable against `y_true`.
        y_train: 1-D array-like history used to compute the scaling term.
        m: Lag of the naive benchmark (1 = non-seasonal).

    Returns:
        The scaled error as a float, or NaN when `y_train` has no more than
        `m` points or the naive in-sample error is exactly zero.

    Raises:
        ValueError: If `m` is smaller than 1.
    """
    if m < 1:
        raise ValueError("m must be a positive integer")
    y_train = np.asarray(y_train, float)
    if len(y_train) <= m:
        return np.nan
    # Lag-m differences. np.diff(y_train, n=m) would instead take the m-th
    # *order* difference, which only coincides with this for m == 1.
    denom = np.mean(np.abs(y_train[m:] - y_train[:-m]))
    if denom == 0:
        return np.nan
    # Numerator is the plain MAE of the forecast, computed in float.
    err = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.mean(np.abs(err)) / denom)


def pinball(y_true, q_pred, level: float) -> float:
    """Average pinball (quantile) loss of `q_pred` at quantile `level`.

    Args:
        y_true: Array-like of observed values.
        q_pred: Array-like of predicted quantiles, broadcastable against `y_true`.
        level: Quantile level the predictions refer to (e.g. 0.1 or 0.9).

    Returns:
        Mean over all elements of ``max(level * e, (level - 1) * e)`` with
        ``e = y_true - q_pred``, as a Python float.
    """
    residual = np.asarray(y_true, float) - np.asarray(q_pred, float)
    # Under-prediction (residual > 0) is weighted by `level`,
    # over-prediction by `1 - level`; the max picks the active branch.
    under_penalty = level * residual
    over_penalty = (level - 1.0) * residual
    losses = np.maximum(under_penalty, over_penalty)
    return float(np.mean(losses))


def interval_coverage(y_true, lo, hi) -> float:
    """Fraction of observations that fall inside the closed interval [lo, hi].

    For a P10-P90 band the target value is roughly 0.8.

    Args:
        y_true: Array-like of observed values.
        lo: Array-like of lower interval bounds.
        hi: Array-like of upper interval bounds.

    Returns:
        Share of elements with ``lo <= y_true <= hi`` (both ends inclusive).
    """
    observed = np.asarray(y_true, float)
    lower = np.asarray(lo)
    upper = np.asarray(hi)
    above_lower = observed >= lower
    below_upper = observed <= upper
    return float(np.mean(above_lower & below_upper))


# --------------------------------------------------------------------------- #
# Harness
# --------------------------------------------------------------------------- #
def evaluate_on_project(forecast_fn, project, context_frac: float = 0.5) -> dict | None:
    """Score a forecaster on one project using a context/horizon split.

    The project's EV curve is turned into increments with
    `evm.to_increments`. The first ``k = max(4, int(n * context_frac))``
    increments are passed to `forecast_fn` as context; the remaining
    ``n - k`` increments are the targets.

    Args:
        forecast_fn: Callable invoked as
            ``forecast_fn(obs, horizon, bac=..., planned_periods=...)``. Its
            result is indexed with the keys "q10", "q50" and "q90".
        project: Object exposing `ev`, `bac` and `planned_finish`.
        context_frac: Fraction of the increments used as context.

    Returns:
        A dict with the keys "mae", "mase", "pinball10", "pinball90",
        "coverage80" and "eac_ape", or None when no period is left to
        forecast after the context window.
    """
    ev_inc = evm.to_increments(project.ev)
    n = len(ev_inc)
    k = max(4, int(n * context_frac))  # always condition on at least 4 periods
    horizon = n - k
    if horizon < 1:
        return None
    obs, truth = ev_inc[:k], ev_inc[k:]

    f = forecast_fn(obs, horizon, bac=project.bac, planned_periods=project.planned_finish)
    q10, q50, q90 = f["q10"], f["q50"], f["q90"]

    # Derived end-of-project error: re-integrate the median increment forecast
    # on top of the last observed cumulative EV and compare with the final
    # cumulative EV. This is a simplistic stand-in for a cost EAC; no CPI/AC
    # projection is applied yet.
    # NOTE(review): `project.ev[k - 1]` assumes `evm.to_increments` yields one
    # increment per EV entry, so the first k increments sum to ev[k - 1] - confirm.
    last_cum_ev = project.ev[k - 1]
    eac_pred = last_cum_ev + float(np.sum(q50))
    eac_true = project.ev[-1]

    # Absolute percentage error: take abs() of the denominator as well so the
    # metric can never be negative; undefined (NaN) when the true value is 0.
    eac_ape = abs(eac_pred - eac_true) / abs(eac_true) if eac_true else np.nan

    return {
        "mae": mae(truth, q50),
        "mase": mase(truth, q50, obs),
        "pinball10": pinball(truth, q10, 0.1),
        "pinball90": pinball(truth, q90, 0.9),
        "coverage80": interval_coverage(truth, q10, q90),
        "eac_ape": eac_ape,
    }


def evaluate_forecaster(forecast_fn, projects, context_frac: float = 0.5):
    """Average the per-project metrics of one forecaster over `projects`.

    Each project is scored with `evaluate_on_project`; projects for which
    that returns None are left out of the average.

    Args:
        forecast_fn: Forecaster passed through to `evaluate_on_project`.
        projects: Iterable of project records.
        context_frac: Context fraction passed through to `evaluate_on_project`.

    Returns:
        The column-wise mean (numeric columns only) of the per-project
        metric rows, as returned by `DataFrame.mean`.
    """
    import pandas as pd

    scored = []
    for proj in projects:
        row = evaluate_on_project(forecast_fn, proj, context_frac)
        if row is not None:
            scored.append(row)
    frame = pd.DataFrame(scored)
    return frame.mean(numeric_only=True)


def benchmark(forecasters: dict, projects, context_frac: float = 0.5):
    """Evaluate several named forecasters and tabulate their mean metrics.

    Args:
        forecasters: Mapping from a display name to a forecast function.
        projects: Project records handed to `evaluate_forecaster` for every
            forecaster.
        context_frac: Context fraction handed to `evaluate_forecaster`.

    Returns:
        A DataFrame with one row per forecaster name and one column per metric.
    """
    import pandas as pd

    summary = {}
    for label, fn in forecasters.items():
        summary[label] = evaluate_forecaster(fn, projects, context_frac)
    # Forecasters arrive as columns; transpose so each one becomes a row.
    return pd.DataFrame(summary).transpose()


def load_dslib(path: str):
    """Placeholder loader for the DSLIB (OR&S Ghent) real-project dataset.

    Intended to return a list of `synthetic.Project`-shaped records. The
    per-project Excel output sheets (PV/EV/AC columns) are the easiest
    source; alternatively the `.p2x` files can be converted via PMConverter.
    Only 'green' (authentic, complete) projects should be kept.
    See data/README.md.

    Args:
        path: Location of the DSLIB data (currently unused).

    Raises:
        NotImplementedError: Always; the loader has not been written yet.
    """
    message = (
        "Parse the DSLIB Excel output sheets (or .p2x via PMConverter) into the tidy "
        "schema (project_id, period, pv, ev, ac, bac, planned_finish). See data/README.md."
    )
    raise NotImplementedError(message)