"""Load the DSLIB (OR&S Ghent) real-project EVM tracking data.

Reads each per-project 'Tracking Overview' sheet (cumulative PV/EV/AC per
tracking period) from data/DSLIB/Excel/*.xlsx into `synthetic.Project`
records, so real and synthetic projects share one interface for the
baselines / forecaster / eval harness.

Only ~117 of the 231 projects carry periodic tracking; series are short
(median ~8 periods). Filter with `min_periods`.

The `.p2x` files (a subset) hold the same data as XML but the Excel sheets
are easier; the other libraries (ASLIB/RCPLIB/MSLIB/MPLIB/SSLIB/MMLIB) are
artificial RCPSP scheduling sets and are NOT used here.
"""

from __future__ import annotations

import glob
import os
import warnings

import numpy as np

from .synthetic import Project

# Default location of the workbooks, relative to the current working directory.
DEFAULT_DIR = os.path.join("data", "DSLIB", "Excel")

# Column headers looked up in the 'Tracking Overview' sheet.
_PV, _EV, _AC = "Planned Value (PV)", "Earned Value (EV)", "Actual Cost (AC)"


def load_project(path: str) -> Project | None:
    """Parse one DSLIB Excel workbook's 'Tracking Overview' sheet.

    Column names are taken from the sheet's second row (`header=1`). Rows in
    which any of PV/EV/AC is missing or non-numeric are dropped, and the
    surviving rows are renumbered as periods 1..n.

    Args:
        path: Path to the `.xlsx` workbook.

    Returns:
        A `Project` named after the file (basename without extension), or
        None if the workbook has no usable PV/EV/AC tracking: the sheet
        cannot be read, a required column is absent, fewer than two complete
        rows remain, or the maximum PV is not a finite positive number.
    """
    import pandas as pd

    try:
        df = pd.read_excel(path, sheet_name="Tracking Overview", header=1)
    except Exception:
        # Deliberate best-effort: a workbook that cannot be opened or lacks
        # the sheet is treated as "no tracking data" rather than an error.
        return None

    if not {_PV, _EV, _AC}.issubset(df.columns):
        return None

    # Coerce to numbers; anything unparseable becomes NaN and is masked out.
    pv = pd.to_numeric(df[_PV], errors="coerce")
    ev = pd.to_numeric(df[_EV], errors="coerce")
    ac = pd.to_numeric(df[_AC], errors="coerce")
    mask = pv.notna() & ev.notna() & ac.notna()
    pv, ev, ac = pv[mask].to_numpy(float), ev[mask].to_numpy(float), ac[mask].to_numpy(float)

    n = len(pv)
    if n < 2:
        return None

    # Budget at completion is taken as the largest planned value observed.
    bac = float(np.nanmax(pv))
    if not np.isfinite(bac) or bac <= 0:
        return None

    # Baseline finish (in tracking periods) = first period where planned work
    # is complete, with a 0.1% tolerance on reaching BAC. `np.argmax` on a
    # boolean array yields the first True; +1 converts to a 1-based period.
    hit = pv >= 0.999 * bac
    planned_finish = int(np.argmax(hit)) + 1 if hit.any() else n

    return Project(
        name=os.path.splitext(os.path.basename(path))[0],
        period=np.arange(1, n + 1),
        pv=pv,
        ev=ev,
        ac=ac,
        bac=bac,
        planned_finish=planned_finish,
        meta={"source": "DSLIB", "n_track": n},
    )


def load_dslib(excel_dir: str = DEFAULT_DIR, min_periods: int = 8) -> list[Project]:
    """Load all DSLIB projects with at least `min_periods` tracking periods.

    Every `*.xlsx` file directly inside `excel_dir` is parsed with
    `load_project`, in sorted path order. Workbooks without usable tracking
    data are skipped.

    Args:
        excel_dir: Directory containing the DSLIB Excel workbooks.
        min_periods: Minimum number of complete tracking periods a project
            must have to be included.

    Returns:
        The qualifying projects, ordered by file path. Empty if the
        directory has no matching workbooks.
    """
    out: list[Project] = []
    # Silence warnings only while the workbooks are being read. The previous
    # filter state is restored on exit, so callers' warnings are unaffected.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for f in sorted(glob.glob(os.path.join(excel_dir, "*.xlsx"))):
            p = load_project(f)
            if p is not None and p.meta["n_track"] >= min_periods:
                out.append(p)
    return out