"""Load the DSLIB (OR&S Ghent) real-project EVM tracking data.

Reads each per-project 'Tracking Overview' sheet (cumulative PV/EV/AC per tracking
period) from data/DSLIB/Excel/*.xlsx into `synthetic.Project` records, so real and
synthetic projects share one interface for the baselines / forecaster / eval harness.

Only ~117 of the 231 projects carry periodic tracking; series are short (median ~8
periods). Filter with `min_periods`. The `.p2x` files (a subset) hold the same data as
XML but the Excel sheets are easier; the other libraries (ASLIB/RCPLIB/MSLIB/MPLIB/
SSLIB/MMLIB) are artificial RCPSP scheduling sets and are NOT used here.
"""
from __future__ import annotations

import glob
import os
import warnings

import numpy as np

from .synthetic import Project

# Default location of the DSLIB Excel workbooks (a relative path).
DEFAULT_DIR = os.path.join("data", "DSLIB", "Excel")

# Column headers looked up in each workbook's 'Tracking Overview' sheet.
_PV = "Planned Value (PV)"
_EV = "Earned Value (EV)"
_AC = "Actual Cost (AC)"


def load_project(path: str) -> Project | None:
    """Read one DSLIB workbook's 'Tracking Overview' sheet into a `Project`.

    Rows are kept only when PV, EV and AC all parse as numbers; the kept rows are
    renumbered 1..n as tracking periods. BAC is taken as the largest PV, and the
    planned finish is the first kept period whose PV reaches 99.9% of BAC.

    Returns None when the sheet cannot be read, a PV/EV/AC column is absent, fewer
    than two usable rows remain, or BAC is not a finite positive number.
    """
    import pandas as pd

    try:
        sheet = pd.read_excel(path, sheet_name="Tracking Overview", header=1)
    except Exception:
        # Best effort: any read failure is treated as "no tracking data".
        return None

    if not ({_PV, _EV, _AC} <= set(sheet.columns)):
        return None

    # Coerce each column to numbers; anything unparsable becomes NaN.
    series = [pd.to_numeric(sheet[label], errors="coerce") for label in (_PV, _EV, _AC)]
    complete = series[0].notna()
    for column in series[1:]:
        complete = complete & column.notna()
    pv, ev, ac = (column[complete].to_numpy(float) for column in series)

    n = pv.size
    if n < 2:
        return None

    bac = float(np.nanmax(pv))
    if not (np.isfinite(bac) and bac > 0):
        return None

    # Baseline finish (in tracking periods): first period whose planned value has
    # effectively reached BAC; falls back to the last period if none does.
    reached = np.flatnonzero(pv >= 0.999 * bac)
    if reached.size:
        planned_finish = int(reached[0]) + 1
    else:
        planned_finish = n

    stem, _ext = os.path.splitext(os.path.basename(path))
    return Project(
        name=stem,
        period=np.arange(1, n + 1),
        pv=pv,
        ev=ev,
        ac=ac,
        bac=bac,
        planned_finish=planned_finish,
        meta={"source": "DSLIB", "n_track": n},
    )


def load_dslib(excel_dir: str = DEFAULT_DIR, min_periods: int = 8) -> list[Project]:
    """Load all DSLIB projects with at least `min_periods` tracking periods.

    Every `*.xlsx` file directly inside `excel_dir` is parsed with `load_project`,
    in sorted filename order. Workbooks for which `load_project` returns None, or
    whose `meta["n_track"]` is below `min_periods`, are skipped.

    Warnings raised while loading are suppressed, but only for the duration of
    this call: the caller's warning filters are restored on return.
    """
    projects = []
    # catch_warnings() saves the current filters and restores them on exit, so the
    # blanket "ignore" does not leak into the rest of the process.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for workbook in sorted(glob.glob(os.path.join(excel_dir, "*.xlsx"))):
            project = load_project(workbook)
            if project is not None and project.meta["n_track"] >= min_periods:
                projects.append(project)
    return projects