# slipstream-webgpu / src / dslib.py
# ashaibani's picture
# Slipstream WebGPU (in-browser agent)
# c658ad5 verified
"""Load the DSLIB (OR&S Ghent) real-project EVM tracking data.
Reads each per-project 'Tracking Overview' sheet (cumulative PV/EV/AC per tracking
period) from data/DSLIB/Excel/*.xlsx into `synthetic.Project` records, so real and
synthetic projects share one interface for the baselines / forecaster / eval harness.
Only ~117 of the 231 projects carry periodic tracking; series are short (median ~8
periods). Filter with `min_periods`. The `.p2x` files (a subset) hold the same data as
XML, but the Excel sheets are easier to parse; the other libraries (ASLIB/RCPLIB/MSLIB/
MPLIB/SSLIB/MMLIB) are artificial RCPSP scheduling sets and are NOT used here.
"""
from __future__ import annotations
import glob
import os
import warnings
import numpy as np
from .synthetic import Project
# Default location of the DSLIB Excel workbooks (a relative path).
DEFAULT_DIR = os.path.join("data", "DSLIB", "Excel")

# Column headers looked up in each workbook's 'Tracking Overview' sheet.
_PV = "Planned Value (PV)"
_EV = "Earned Value (EV)"
_AC = "Actual Cost (AC)"
def load_project(path: str) -> Project | None:
    """Read one DSLIB workbook and build a `Project` from its tracking sheet.

    The 'Tracking Overview' sheet is read with its second row as the header.
    The PV/EV/AC columns are coerced to numbers, and any row where one of the
    three is not numeric is dropped; the surviving rows are renumbered as
    periods 1..n.

    Returns None when the sheet cannot be read, a required column is missing,
    fewer than two usable rows remain, or the largest PV (used as BAC) is not
    a finite positive number.
    """
    import pandas as pd

    try:
        sheet = pd.read_excel(path, sheet_name="Tracking Overview", header=1)
    except Exception:
        # Unreadable file or missing sheet: treat as "no tracking data".
        return None

    wanted = (_PV, _EV, _AC)
    if set(wanted).difference(sheet.columns):
        return None

    planned, earned, actual = (
        pd.to_numeric(sheet[col], errors="coerce") for col in wanted
    )

    # Keep only the rows where all three measures parsed as numbers.
    complete = planned.notna() & earned.notna() & actual.notna()
    pv = planned[complete].to_numpy(float)
    ev = earned[complete].to_numpy(float)
    ac = actual[complete].to_numpy(float)

    n_periods = len(pv)
    if n_periods < 2:
        return None

    # Budget at completion is taken as the peak of the cumulative PV curve.
    bac = float(np.nanmax(pv))
    if bac <= 0 or not np.isfinite(bac):
        return None

    # Baseline finish = 1-based index of the first period whose PV is within
    # 0.1% of BAC; fall back to the last period if none qualifies.
    reached = np.flatnonzero(pv >= 0.999 * bac)
    if reached.size:
        planned_finish = int(reached[0]) + 1
    else:
        planned_finish = n_periods

    stem, _ext = os.path.splitext(os.path.basename(path))
    return Project(
        name=stem,
        period=np.arange(1, n_periods + 1),
        pv=pv,
        ev=ev,
        ac=ac,
        bac=bac,
        planned_finish=planned_finish,
        meta={"source": "DSLIB", "n_track": n_periods},
    )
def load_dslib(excel_dir: str = DEFAULT_DIR, min_periods: int = 8) -> list[Project]:
    """Load all DSLIB projects with at least `min_periods` tracking periods.

    Every ``*.xlsx`` file directly inside `excel_dir` is passed to
    `load_project`; workbooks that yield None or have fewer than `min_periods`
    tracked periods (``meta["n_track"]``) are skipped. Results follow the
    sorted order of the file paths.

    Warnings raised while parsing are suppressed, but only for the duration of
    this call: the caller's warning filters are restored on return.
    """
    projects = []
    # Scope the suppression with catch_warnings() instead of installing a
    # permanent process-wide "ignore" filter, which would silence unrelated
    # warnings for the rest of the program and stack a new filter per call.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for workbook in sorted(glob.glob(os.path.join(excel_dir, "*.xlsx"))):
            project = load_project(workbook)
            if project is not None and project.meta["n_track"] >= min_periods:
                projects.append(project)
    return projects