# Source: Dynamic-What-if-scheduling / features / feature_engineering.py
# Uploaded by suvradeepp ("Upload 63 files", commit 03e7fda, verified)
"""
features/feature_engineering.py
---------------------------------
Computes 12 engineered features per activity for ML models.
"""
import pandas as pd
import numpy as np
from datetime import datetime
from typing import Optional
# Fallback "today" used by engineer_features when the caller passes none.
REFERENCE_DATE = datetime(2024, 6, 1)

# Base weight of an open issue, keyed by issue category.
# engineer_features falls back to 1.0 for categories not listed here.
ISSUE_SEVERITY_WEIGHTS = {
    "design_change": 3.0,
    "inspection_fail": 2.0,
    "scope_creep": 2.0,
    "weather": 1.0,
    "material_delay": 1.5,
    "labor_shortage": 1.5,
    "equipment_breakdown": 1.0,
    "safety": 0.5,
}

# Factor applied on top of the category weight, keyed by issue severity.
# engineer_features falls back to 1.0 for severities not listed here.
SEVERITY_MULTIPLIER = {
    "low": 0.5,
    "medium": 1.0,
    "high": 1.5,
    "critical": 2.5,
}
def engineer_features(
    activities: pd.DataFrame,
    loader,
    today: Optional[datetime] = None,
) -> pd.DataFrame:
    """
    Compute all 12 features for every activity in the dataframe.

    Parameters
    ----------
    activities : pd.DataFrame - activities to featurize. ``id``, ``status``,
        ``planned_start_date`` and ``planned_end_date`` are read
        unconditionally; ``category`` is read when at least one activity is
        completed. All other columns are optional.
    loader : DataLoader - object exposing ``issues``, ``boq`` and
        ``daily_updates`` dataframes.
    today : datetime - reference date (defaults to REFERENCE_DATE)

    Returns
    -------
    pd.DataFrame with original columns + 12 new feature columns:
    planned_duration, elapsed_days, progress_rate, schedule_variance,
    delay_ratio, issue_count, issue_severity_score, boq_complexity,
    parent_delay, historical_avg_delay, progress_velocity_7d and
    progress_acceleration. The index of ``activities`` is preserved, the
    date columns are converted to Timestamps and ``progress`` is coerced
    to numeric.
    """
    today = pd.Timestamp(REFERENCE_DATE if today is None else today)
    df = activities.copy()

    # -- Ensure date columns are Timestamps ---------------------------------
    for col in ("planned_start_date", "planned_end_date",
                "actual_start_date", "actual_end_date"):
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")

    planned_start = df["planned_start_date"]
    actual_start = _datetime_column(df, "actual_start_date")
    actual_end = _datetime_column(df, "actual_end_date")

    # -- Feature 1: planned_duration (at least one day) ---------------------
    df["planned_duration"] = (
        df["planned_end_date"] - planned_start
    ).dt.days.clip(lower=1)

    # -- Feature 2: elapsed_days --------------------------------------------
    # NaT is truthy, so ``actual or planned`` never falls back; fillna does.
    start = actual_start.fillna(planned_start)
    finished = (df["status"] == "completed") & actual_end.notna()
    end = actual_end.where(finished, today)
    # At least 1 day once a start date is known, 0 when no start date exists.
    df["elapsed_days"] = (
        (end - start).dt.days.clip(lower=1).fillna(0).astype(int)
    )

    # -- Feature 3: progress_rate (% per day) -------------------------------
    progress = df.get("progress", pd.Series(0, index=df.index))
    df["progress"] = pd.to_numeric(progress, errors="coerce").fillna(0)
    # Rows without any start date have elapsed_days == 0; report a rate of 0
    # for them instead of dividing by zero.
    safe_elapsed = df["elapsed_days"].where(df["elapsed_days"] > 0)
    df["progress_rate"] = (
        (df["progress"] / safe_elapsed).fillna(0.0).clip(0, 20)
    )

    # -- Feature 4: schedule_variance (days late at start) ------------------
    if "schedule_variance_days" in df.columns:
        df["schedule_variance"] = pd.to_numeric(
            df["schedule_variance_days"], errors="coerce").fillna(0)
    else:
        df["schedule_variance"] = (
            (actual_start - planned_start).dt.days.fillna(0).astype(int)
        )

    # -- Feature 5: delay_ratio (actual/planned, 1.0 when unknown) ----------
    if "actual_duration_days" in df.columns:
        actual_duration = pd.to_numeric(
            df["actual_duration_days"], errors="coerce")
        df["delay_ratio"] = (
            actual_duration / df["planned_duration"]
        ).where(actual_duration.notna(), 1.0)
    else:
        df["delay_ratio"] = 1.0

    # -- Features 6 & 7: issue_count + issue_severity_score -----------------
    issue_count, issue_score = _open_issue_stats(df["id"], loader.issues)
    df["issue_count"] = issue_count
    df["issue_severity_score"] = issue_score

    # -- Feature 8: boq_complexity ------------------------------------------
    df["boq_complexity"] = _boq_complexity(df["id"], loader.boq)

    # -- Feature 9: parent_delay (binary) -----------------------------------
    df["parent_delay"] = _parent_delay_flags(df)

    # -- Feature 10: historical_avg_delay (by category) ---------------------
    completed = df[df["status"] == "completed"]
    if len(completed) > 0:
        category_mean = completed.groupby("category")["delay_ratio"].mean()
        # map() keeps the caller's index; merge() would silently reset it.
        df["historical_avg_delay"] = (
            df["category"].map(category_mean).fillna(1.0)
        )
    else:
        df["historical_avg_delay"] = 1.0

    # -- Features 11 & 12: progress_velocity_7d + progress_acceleration -----
    velocity = _velocity_by_activity(loader.daily_updates)
    if velocity is None:
        # No usable update history: fall back to the overall progress rate.
        df["progress_velocity_7d"] = df["progress_rate"]
        df["progress_acceleration"] = 0.0
    else:
        pairs = [velocity.get(aid, (0.0, 0.0)) for aid in df["id"]]
        df["progress_velocity_7d"] = pd.Series(
            [vel for vel, _ in pairs], index=df.index, dtype=float)
        df["progress_acceleration"] = pd.Series(
            [acc for _, acc in pairs], index=df.index, dtype=float)

    return df


def _datetime_column(df, name):
    """Return ``df[name]``, or an all-NaT column when *name* is absent."""
    if name in df.columns:
        return df[name]
    return pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")


def _open_issue_stats(ids, issues):
    """Return (open-issue count, weighted severity score) aligned to *ids*."""
    if issues.empty:
        return pd.Series(0, index=ids.index), pd.Series(0.0, index=ids.index)

    open_issues = issues[issues["status"] == "open"]

    def weights(column, table):
        # Missing column, missing value or unknown key all weigh 1.0.
        if column not in open_issues.columns:
            return pd.Series(1.0, index=open_issues.index)
        return open_issues[column].map(table).fillna(1.0)

    scored = pd.DataFrame({
        "activity_id": open_issues["activity_id"],
        "weight": weights("category", ISSUE_SEVERITY_WEIGHTS)
        * weights("severity", SEVERITY_MULTIPLIER),
    })
    grouped = scored.groupby("activity_id", sort=False)["weight"]
    counts = ids.map(grouped.size()).fillna(0).astype(int)
    scores = ids.map(grouped.sum()).fillna(0.0)
    return counts, scores


def _boq_complexity(ids, boq):
    """Return BOQ line count plus 10% of the price/cost margin ratio per id."""
    if boq.empty:
        return pd.Series(0.0, index=ids.index)

    complexity = boq.groupby("activity_id", sort=False).size().astype(float)
    if "total_price" in boq.columns and "total_cost" in boq.columns:
        money = pd.DataFrame({
            "activity_id": boq["activity_id"],
            "margin": boq["total_price"] - boq["total_cost"],
            "cost": boq["total_cost"],
        })
        sums = money.groupby("activity_id", sort=False)[["margin", "cost"]].sum()
        # Floor the denominator at 1 so zero/negative totals cannot blow up.
        variance = sums["margin"] / sums["cost"].clip(lower=1)
        complexity = complexity + variance * 0.1
    return ids.map(complexity).fillna(0.0)


def _parent_delay_flags(df):
    """Return 1 per row whose predecessor started more than 2 days late."""
    if "depends_on" not in df.columns:
        return [0] * len(df)

    # First occurrence wins for duplicated ids; rows without an id are skipped.
    known = df.dropna(subset=["id"]).drop_duplicates(subset="id")
    variance_by_id = dict(zip(known["id"], known["schedule_variance"]))

    def flag(pred_id):
        # Only missing/empty values mean "no predecessor"; id 0 is a real id.
        if pd.isna(pred_id) or (isinstance(pred_id, str) and not pred_id):
            return 0
        return int(variance_by_id.get(pred_id, 0) > 2)

    return [flag(pred_id) for pred_id in df["depends_on"]]


def _velocity_by_activity(updates):
    """Map activity id -> (7-update velocity, acceleration); None if unusable."""
    if updates.empty:
        return None

    updates = updates.copy()
    updates["date"] = pd.to_datetime(updates["date"], errors="coerce")
    if "daily_increment" not in updates.columns:
        if "reported_progress" not in updates.columns:
            return None
        # Derive per-update increments from the cumulative reported progress.
        updates = updates.sort_values(["activity_id", "date"])
        updates["daily_increment"] = (
            updates.groupby("activity_id")["reported_progress"].diff().fillna(0)
        )

    stats = {}
    for activity_id, group in updates.groupby("activity_id", sort=False):
        increments = group.sort_values("date")["daily_increment"]
        n_updates = len(increments)
        vel_14 = increments.tail(14).mean()
        vel_7 = increments.tail(7).mean() if n_updates >= 7 else vel_14
        # Acceleration compares the last 7 updates with the 7 before them.
        prev_7 = increments.iloc[-14:-7].mean() if n_updates >= 14 else vel_7
        stats[activity_id] = (float(vel_7), float(vel_7 - prev_7))
    return stats
# The 12 engineered feature columns, in the order engineer_features adds them.
FEATURE_COLS = [
    "planned_duration",
    "elapsed_days",
    "progress_rate",
    "schedule_variance",
    "delay_ratio",
    "issue_count",
    "issue_severity_score",
    "boq_complexity",
    "parent_delay",
    "historical_avg_delay",
    "progress_velocity_7d",
    "progress_acceleration",
]

# Column used as the regression target by get_ml_ready.
# NOTE(review): this name also appears in FEATURE_COLS - confirm intended.
TARGET_COL = "delay_ratio"

# Categorical columns that get_ml_ready label-encodes when present.
CATEGORY_COLS = ["category", "project_type"]
def get_ml_ready(df: pd.DataFrame):
    """
    Build the training inputs from an engineered-features dataframe.

    Each column of CATEGORY_COLS that is present is label-encoded into a
    ``<column>_enc`` column. Rows are kept only when ``status`` is
    "completed" and the target column is not null.

    Returns a tuple ``(X, y, feat_cols)``: the feature dataframe, the target
    series, and the list of feature column names used for X.
    """
    from sklearn.preprocessing import LabelEncoder

    data = df.copy()

    # Encode categorical columns; a fresh encoder is fitted on every call.
    present_categoricals = [name for name in CATEGORY_COLS if name in data.columns]
    for name in present_categoricals:
        encoder = LabelEncoder()
        data[f"{name}_enc"] = encoder.fit_transform(data[name].astype(str))

    # NOTE(review): TARGET_COL ("delay_ratio") is also listed in FEATURE_COLS,
    # so X contains the target column - confirm whether that is intended.
    candidates = FEATURE_COLS + [f"{name}_enc" for name in present_categoricals]
    feat_cols = [name for name in candidates if name in data.columns]

    has_target = data[TARGET_COL].notna()
    is_completed = data["status"].isin(["completed"])
    selected = data[has_target & is_completed]
    return selected[feat_cols], selected[TARGET_COL], feat_cols