| |
| """Re-implement the DECODE paper training pipeline for local I-BLEND data. |
| |
| The paper pipeline is: |
| 1. Fuse energy, occupancy, calendar, and weather-like environmental features. |
| 2. Align everything to a 10-minute sampling rate. |
| 3. Normalize features with Min-Max scaling. |
| 4. Split chronologically into train/validation/test with a 70:15:15 ratio. |
| 5. Compare LSTM with Linear Regression, Decision Tree, and Random Forest. |
| |
| This local implementation supports two targets: |
| - paper_buildings: 7 building-level series matching the paper. |
| - meters: 9 meter-level series from all_buildings_power.csv. |
| |
| Weather is optional because the local weather file in this workspace starts in |
| 2018, while the energy data ends in 2017. The script detects this and continues. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import os |
| import json |
| import math |
| import sys |
| import warnings |
| from dataclasses import dataclass |
| from pathlib import Path |
|
|
|
|
# Project root: the directory one level above the folder containing this file.
ROOT = Path(__file__).resolve().parents[1]

# The datasets live directly under ROOT when the occupancy folder is found
# there; otherwise they are expected next to ROOT (in its parent directory).
DEFAULT_DATA_MINING_ROOT = (
    ROOT if ROOT.joinpath("IIITD_occupancy_dataset").exists() else ROOT.parent
)

# Both the data root and the energy CSV can be overridden from the environment.
DATA_MINING_ROOT = Path(os.getenv("IBLEND_DATA_ROOT", DEFAULT_DATA_MINING_ROOT))
ENERGY_FILE = Path(
    os.getenv(
        "IBLEND_ENERGY_FILE",
        DATA_MINING_ROOT.joinpath("energy_dataset", "all_buildings_power.csv"),
    )
)

# Each dataset archive unpacks into a folder nested inside a same-named folder.
OCCUPANCY_DIR = DATA_MINING_ROOT.joinpath("IIITD_occupancy_dataset", "IIITD_occupancy_dataset")
CALENDAR_DIR = DATA_MINING_ROOT.joinpath("iiitd_calender_schedule", "iiitd_calender_schedule")
WEATHER_FILE = DATA_MINING_ROOT.joinpath(
    "weather_comparison", "weather_comparison", "IIITD_and_airport_data.csv"
)

# Default location for every artifact the script writes.
OUT_DIR = ROOT.joinpath("decode_reimplementation_outputs")

# All timestamps are converted to (or localized in) this zone.
TZ = "Asia/Kolkata"
|
|
|
|
# Building-level targets: each entry lists the meter columns that are summed
# into the building's series and the code of its occupancy CSV.
PAPER_BUILDINGS = {
    building: {"meters": list(meter_columns), "occupancy": occupancy_code}
    for building, meter_columns, occupancy_code in (
        ("Academic", ("Academic",), "ACB"),
        ("Boys_hostel", ("Boys_main", "Boys_backup"), "BH"),
        ("Girls_hostel", ("Girls_main", "Girls_backup"), "GH"),
        ("Library", ("Library",), "LB"),
        ("Lecture", ("Lecture",), "LCB"),
        ("Dining", ("Mess",), "DB"),
        ("Facilities", ("Facilities",), "SRB"),
    )
}
|
|
# Meter-level targets: every target is a single meter column of the same name,
# paired with the occupancy code of the building that meter belongs to.
METER_TARGETS = {
    meter: {"meters": [meter], "occupancy": occupancy_code}
    for meter, occupancy_code in (
        ("Academic", "ACB"),
        ("Boys_main", "BH"),
        ("Boys_backup", "BH"),
        ("Facilities", "SRB"),
        ("Girls_main", "GH"),
        ("Girls_backup", "GH"),
        ("Lecture", "LCB"),
        ("Library", "LB"),
        ("Mess", "DB"),
    )
}
|
|
|
|
@dataclass
class SplitData:
    """Scaled train/validation/test partitions produced by ``make_ml_split``.

    The ``x_*`` members hold the feature matrices and the ``y_*`` members the
    matching target vectors, all already passed through the Min-Max scalers.
    """

    # Feature matrices, in chronological split order.
    x_train: object
    x_val: object
    x_test: object

    # Target vectors aligned row-for-row with the matrices above.
    y_train: object
    y_val: object
    y_test: object

    # Column names of the feature matrices, in column order.
    feature_names: list[str]

    # Row cap that was applied to the test partition, or None when uncapped.
    test_span_steps: int | None = None
|
|
|
|
def import_stack():
    """Import the numerical/ML stack on demand and return it as a dict.

    numpy, pandas and scikit-learn are mandatory: if any of them cannot be
    imported the process exits with installation instructions. PyTorch,
    LightGBM, statsmodels (ARIMA) and TensorFlow are optional; when one of
    them fails to import, its entries are ``None`` and the failure text is
    stored under the matching ``*_import_error`` key.

    TensorFlow is not attempted when PyTorch imported successfully, unless the
    environment variable ``DECODE_DISABLE_TENSORFLOW`` is set to a value other
    than ``"1"``.

    Returns:
        dict: maps short names (``"np"``, ``"pd"``, ``"Ridge"``, ``"torch"``,
        ``"lgb"``, ``"ARIMA"``, ``"tf"``, ``"keras"``, ...) to the imported
        objects, plus the ``*_import_error`` strings.

    Raises:
        SystemExit: when a mandatory dependency is missing.
    """
    try:
        import numpy as np
        import pandas as pd
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.linear_model import Ridge
        from sklearn.metrics import mean_absolute_error, r2_score
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.tree import DecisionTreeRegressor
    except ImportError as exc:
        # ImportError.name is the module that could not be imported; it is
        # also set for "cannot import name ..." failures, where parsing the
        # message text would not yield a module name. Fall back to the raw
        # message if the attribute is unset.
        missing = exc.name or str(exc)
        raise SystemExit(
            f"Missing dependency: {missing}\n"
            "Install the base training stack with:\n"
            f"  {sys.executable} -m pip install pandas numpy scikit-learn\n"
            "Install LSTM support with either PyTorch or TensorFlow:\n"
            f"  {sys.executable} -m pip install torch\n"
            f"  {sys.executable} -m pip install tensorflow\n"
        ) from exc

    # The optional imports below deliberately catch Exception rather than just
    # ImportError: whatever goes wrong while importing is recorded as text and
    # the pipeline continues without that backend.
    torch_import_error = None
    try:
        import torch
        from torch import nn
        from torch.utils.data import DataLoader, TensorDataset
    except Exception as exc:
        torch = None
        nn = None
        DataLoader = None
        TensorDataset = None
        torch_import_error = f"{type(exc).__name__}: {exc}"

    lgbm_import_error = None
    try:
        import lightgbm as lgb
    except Exception as exc:
        lgb = None
        lgbm_import_error = f"{type(exc).__name__}: {exc}"

    statsmodels_import_error = None
    try:
        from statsmodels.tsa.arima.model import ARIMA
    except Exception as exc:
        ARIMA = None
        statsmodels_import_error = f"{type(exc).__name__}: {exc}"

    tf = None
    keras = None
    tf_import_error = None
    disable_tf_when_torch_available = os.environ.get("DECODE_DISABLE_TENSORFLOW", "1") == "1"
    if disable_tf_when_torch_available and torch is not None:
        # PyTorch already provides the LSTM backend, so TensorFlow is skipped.
        tf_import_error = "disabled because PyTorch is available"
    else:
        try:
            import tensorflow as tf
            from tensorflow import keras
        except Exception as exc:
            tf = None
            keras = None
            tf_import_error = f"{type(exc).__name__}: {exc}"

    return {
        "np": np,
        "pd": pd,
        "RandomForestRegressor": RandomForestRegressor,
        "Ridge": Ridge,
        "DecisionTreeRegressor": DecisionTreeRegressor,
        "MinMaxScaler": MinMaxScaler,
        "mean_absolute_error": mean_absolute_error,
        "r2_score": r2_score,
        "torch": torch,
        "nn": nn,
        "DataLoader": DataLoader,
        "TensorDataset": TensorDataset,
        "torch_import_error": torch_import_error,
        "lgb": lgb,
        "lgbm_import_error": lgbm_import_error,
        "ARIMA": ARIMA,
        "statsmodels_import_error": statsmodels_import_error,
        "tf": tf,
        "keras": keras,
        "tf_import_error": tf_import_error,
    }
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Build the command-line interface and parse ``sys.argv``.

    Options are declared in a table and registered in that order, so the
    ``--help`` listing follows the order of the table.
    """
    cli = argparse.ArgumentParser(description="DECODE paper re-implementation for I-BLEND data.")
    option_table = (
        ("--mode", dict(
            choices=["paper_buildings", "meters"],
            default="paper_buildings",
            help="paper_buildings trains 7 building-level targets; meters trains 9 meter-level targets.",
        )),
        ("--target", dict(
            default="all",
            help="Target name to train, or 'all'. Names depend on --mode.",
        )),
        ("--freq", dict(default="10min", help="Common sampling frequency. Paper uses 10min.")),
        ("--lookback", dict(type=int, default=18, help="LSTM lookback steps. 18 at 10min = 3 hours.")),
        ("--horizon", dict(type=int, default=1, help="Prediction horizon in rows. 1 at 10min = next 10min.")),
        ("--horizon-days", dict(type=float, default=0, help="Prediction horizon in days. Overrides --horizon when > 0.")),
        ("--test-span-days", dict(type=float, default=0, help="Evaluate only this many days from the chronological test split.")),
        ("--epochs", dict(type=int, default=20, help="LSTM epochs. Paper uses 20.")),
        ("--batch-size", dict(type=int, default=64, help="LSTM batch size. Paper uses 64.")),
        ("--rf-trees", dict(type=int, default=500, help="Random Forest trees. Paper tuned to 500.")),
        ("--dl-models", dict(default="lstm,cnn,tcn", help="Comma-separated deep models: lstm,cnn,tcn,none.")),
        ("--include-arima", dict(action="store_true", help="Fit ARIMA(2,1,2). This can be slow on full series.")),
        ("--arima-max-train", dict(type=int, default=20000, help="Maximum recent train points for ARIMA fitting.")),
        ("--max-rows", dict(type=int, default=0, help="Optional cap after preprocessing for quick smoke tests.")),
        ("--include-weather", dict(action="store_true", help="Try to merge local weather data if time ranges overlap.")),
        ("--skip-lstm", dict(action="store_true", help="Only train baseline ML models.")),
        ("--output-dir", dict(default=str(OUT_DIR), help="Output directory.")),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
|
|
|
|
def read_energy_10min(pd, freq: str):
    """Load the meter CSV and return per-interval energy for every meter column.

    The ``timestamp`` column is read as epoch seconds in UTC and converted to
    the project time zone. Readings are averaged per ``freq`` bucket and the
    average is multiplied by the bucket length in hours.
    NOTE(review): the result is named watt-hours, which presumes the raw
    columns are power in watts — confirm against the dataset description.

    Raises:
        FileNotFoundError: if the energy CSV does not exist.
    """
    if not ENERGY_FILE.exists():
        raise FileNotFoundError(f"Missing energy file: {ENERGY_FILE}")

    raw = pd.read_csv(ENERGY_FILE, na_values=["NA", ""])
    utc_stamps = pd.to_datetime(raw["timestamp"], unit="s", utc=True)
    raw["datetime"] = utc_stamps.dt.tz_convert(TZ)
    readings = raw.drop(columns=["timestamp"]).set_index("datetime").sort_index()

    # Average within each bucket, then scale by the bucket length in hours.
    bucket_mean = readings.resample(freq).mean()
    minutes_per_bucket = pd.Timedelta(freq).total_seconds() / 60
    return bucket_mean * minutes_per_bucket / 60
|
|
|
|
def read_occupancy_10min(pd, code: str, freq: str):
    """Load ``<code>.csv`` from the occupancy folder, resampled to ``freq``.

    Returns ``None`` (after a warning) when the file is missing. Otherwise the
    per-bucket means are returned, with gaps in ``occupancy_count`` filled by
    time interpolation followed by forward and backward fill.
    """
    csv_path = OCCUPANCY_DIR / f"{code}.csv"
    if not csv_path.exists():
        warnings.warn(f"Missing occupancy file for {code}: {csv_path}")
        return None

    raw = pd.read_csv(csv_path, na_values=["NA", ""])
    # Timestamps are epoch seconds in UTC; move them to the project zone.
    utc_stamps = pd.to_datetime(raw["timestamp"], unit="s", utc=True)
    raw["datetime"] = utc_stamps.dt.tz_convert(TZ)
    indexed = raw.drop(columns=["timestamp"]).set_index("datetime").sort_index()

    resampled = indexed.resample(freq).mean()
    counts = resampled["occupancy_count"].interpolate(method="time")
    resampled["occupancy_count"] = counts.ffill().bfill()
    return resampled
|
|
|
|
def read_calendar(pd):
    """Combine the yearly calendar CSVs into one row per date.

    Returns ``None`` (after a warning) when no ``calender_year_*.csv`` file is
    found. The result has the columns ``date``, ``working_day`` (int, 0 where
    the value is missing or unparsable), ``activity`` (str, ``"unknown"``
    where missing) and ``activity_code`` (category codes of ``activity``).
    """
    yearly_tables = [
        pd.read_csv(csv_path)
        for csv_path in sorted(CALENDAR_DIR.glob("calender_year_*.csv"))
    ]
    if not yearly_tables:
        warnings.warn(f"No calendar files found in {CALENDAR_DIR}")
        return None

    combined = pd.concat(yearly_tables, ignore_index=True)
    combined["date"] = pd.to_datetime(combined["Date"]).dt.date

    # Keep the first row seen for each date.
    per_day = combined[["date", "working_day", "activity"]].drop_duplicates("date")
    working_flag = pd.to_numeric(per_day["working_day"], errors="coerce")
    per_day["working_day"] = working_flag.fillna(0).astype(int)
    per_day["activity"] = per_day["activity"].fillna("unknown").astype(str)
    per_day["activity_code"] = per_day["activity"].astype("category").cat.codes
    return per_day
|
|
|
|
def read_weather_10min(pd, freq: str):
    """Load the weather CSV and resample it to ``freq``.

    The first column of the file is taken as the timestamp; rows whose
    timestamp cannot be parsed are dropped. A timezone-naive index is
    localized to the project zone. Gaps are filled by time interpolation
    followed by forward and backward fill. Returns ``None`` (after a warning)
    when the file does not exist.
    """
    if not WEATHER_FILE.exists():
        warnings.warn(f"Missing weather file: {WEATHER_FILE}")
        return None

    raw = pd.read_csv(WEATHER_FILE, na_values=["NA", ""])
    raw = raw.rename(columns={raw.columns[0]: "datetime"})
    raw["datetime"] = pd.to_datetime(raw["datetime"], errors="coerce")
    indexed = raw.dropna(subset=["datetime"]).set_index("datetime").sort_index()

    if indexed.index.tz is None:
        indexed.index = indexed.index.tz_localize(TZ)

    resampled = indexed.resample(freq).mean()
    return resampled.interpolate(method="time").ffill().bfill()
|
|
|
|
def horizon_steps_from_args(pd, args) -> int:
    """Return the prediction horizon in rows (at least 1).

    A positive ``args.horizon_days`` takes precedence and is converted to rows
    at ``args.freq``; otherwise ``args.horizon`` is used.
    """
    days = args.horizon_days
    if not (days and days > 0):
        return max(1, args.horizon)
    rows_per_horizon = pd.Timedelta(days=days) / pd.Timedelta(args.freq)
    return max(1, int(round(rows_per_horizon)))
|
|
|
|
def test_span_steps_from_args(pd, args) -> int | None:
    """Return the test-span cap in rows, or ``None`` when no cap was requested.

    A positive ``args.test_span_days`` is converted to rows at ``args.freq``
    and never rounds below 1.
    """
    days = args.test_span_days
    if not (days and days > 0):
        return None
    rows_in_span = pd.Timedelta(days=days) / pd.Timedelta(args.freq)
    return max(1, int(round(rows_in_span)))
|
|
|
|
def describe_steps(pd, steps: int, freq: str) -> str:
    """Render ``steps`` rows at ``freq`` as a human-readable duration.

    Whole days are preferred, then whole hours, and minutes otherwise.
    """
    total_minutes = (pd.Timedelta(freq) * steps).total_seconds() / 60
    for unit_minutes, unit_label in ((1440, "day(s)"), (60, "hour(s)")):
        if total_minutes % unit_minutes == 0:
            return f"{int(total_minutes // unit_minutes)} {unit_label}"
    return f"{total_minutes:g} minute(s)"
|
|
|
|
def add_time_features(pd, df):
    """Return a copy of ``df`` with calendar features derived from its index.

    Adds ``hour``, ``day_of_week``, ``month``, ``time_slot`` (minutes since
    midnight) and sine/cosine encodings of hour, weekday and month. The input
    frame is left untouched.
    """
    out = df.copy()
    stamps = out.index
    out["hour"] = stamps.hour
    out["day_of_week"] = stamps.dayofweek
    out["month"] = stamps.month
    out["time_slot"] = stamps.hour * 60 + stamps.minute

    # (output prefix, source column, length of one full cycle)
    cycles = (("hour", "hour", 24), ("dow", "day_of_week", 7), ("month", "month", 12))
    for prefix, column, period in cycles:
        angle = 2 * math.pi * out[column] / period
        out[f"{prefix}_sin"] = angle.map(math.sin)
        out[f"{prefix}_cos"] = angle.map(math.cos)
    return out
|
|
|
|
def add_historical_features(df):
    """Return a copy of ``df`` with lagged and rolling ``energy_wh`` features.

    Expects the columns ``energy_wh``, ``working_day`` and ``time_slot``.
    Every feature is built from earlier rows only: plain lags of 1, 6, 144 and
    1008 rows, rolling statistics over the series shifted by one row, and the
    previous one to three values seen for the same (working_day, time_slot)
    pair.
    """
    out = df.copy()
    energy = out["energy_wh"]

    for lag in (1, 6, 144, 1008):
        out[f"energy_lag_{lag}"] = energy.shift(lag)

    # Shift by one row first so the window never includes the current value.
    previous = energy.shift(1)
    out["rolling_mean_6"] = previous.rolling(6).mean()
    out["rolling_mean_144"] = previous.rolling(144).mean()
    out["rolling_std_144"] = previous.rolling(144).std()

    # Earlier occurrences of the same time slot on the same kind of day.
    by_day_type_and_slot = out.groupby(["working_day", "time_slot"], sort=False)["energy_wh"]
    for back in (1, 2, 3):
        out[f"same_day_type_lag_{back}"] = by_day_type_and_slot.shift(back)
    return out
|
|
|
|
def build_target_frame(pd, energy_10min, calendar, target_name: str, target_spec: dict, freq: str, include_weather: bool):
    """Assemble the feature frame for one target.

    Fuses, on the index of ``energy_10min``: the summed energy of the target's
    meters, its occupancy series, the calendar columns and (optionally) the
    weather columns, then adds time and historical features.

    Args:
        pd: the pandas module.
        energy_10min: per-meter energy frame indexed by timestamp.
        calendar: per-date calendar frame, or ``None`` when unavailable
            (working days then default to Monday-Friday).
        target_name: label used in error messages.
        target_spec: dict with ``"meters"`` (columns to sum) and
            ``"occupancy"`` (occupancy file code).
        freq: resampling frequency passed to the readers.
        include_weather: when true, try to inner-join the weather data.

    Returns:
        The fused feature frame.

    Raises:
        KeyError: if a meter listed in ``target_spec`` is not a column of
            ``energy_10min``.
    """
    missing_meters = [m for m in target_spec["meters"] if m not in energy_10min.columns]
    if missing_meters:
        raise KeyError(f"{target_name} references missing meter columns: {missing_meters}")

    df = pd.DataFrame(index=energy_10min.index)
    # min_count=1 keeps a row NaN when every contributing meter is NaN,
    # instead of turning it into a misleading 0.
    df["energy_wh"] = energy_10min[target_spec["meters"]].sum(axis=1, min_count=1)

    occ = read_occupancy_10min(pd, target_spec["occupancy"], freq)
    if occ is not None:
        df = df.join(occ[["occupancy_count"]], how="left")
    else:
        df["occupancy_count"] = math.nan

    df["date"] = df.index.date
    if calendar is not None:
        df = df.reset_index().merge(calendar, on="date", how="left").set_index("datetime").sort_index()
    else:
        df["working_day"] = (df.index.dayofweek < 5).astype(int)
        df["activity"] = "unknown"
        df["activity_code"] = 0

    if include_weather:
        weather = read_weather_10min(pd, freq)
        if weather is not None:
            with_weather = df.join(weather, how="inner")
            if with_weather.empty:
                # No overlap: keep the frame built above as-is. This avoids
                # rebuilding it by hand, which broke when the calendar or the
                # occupancy file was unavailable.
                warnings.warn(
                    "Weather data has no overlap with this target after joining. "
                    "Continuing without weather features."
                )
            else:
                if len(with_weather) < len(df):
                    warnings.warn(f"Weather join reduced rows from {len(df)} to {len(with_weather)}.")
                df = with_weather

    df["occupancy_count"] = df["occupancy_count"].interpolate(method="time").ffill().bfill()
    # Dates absent from the calendar fall back to a Monday-Friday working week.
    fallback_working_day = pd.Series((df.index.dayofweek < 5).astype(int), index=df.index)
    df["working_day"] = df["working_day"].fillna(fallback_working_day).astype(int)
    df["activity"] = df["activity"].fillna("unknown").astype(str)
    df["activity_code"] = df["activity_code"].fillna(0).astype(int)
    df = add_time_features(pd, df)
    df = add_historical_features(df)
    return df
|
|
|
|
def make_ml_split(
    np,
    pd,
    MinMaxScaler,
    df,
    horizon: int,
    max_rows: int,
    test_span_steps: int | None = None,
) -> tuple[SplitData, object, object]:
    """Build the scaled chronological 70/15/15 split for one target.

    The target is ``energy_wh`` shifted ``horizon`` rows into the future.
    Every numeric column except ``target`` is a feature. Rows with NaN or
    infinite values are dropped, and both scalers are fitted on the training
    partition only.

    Args:
        np: the numpy module.
        pd: the pandas module.
        MinMaxScaler: scaler class to instantiate for features and target.
        df: feature frame containing at least ``energy_wh``.
        horizon: number of rows between a feature row and its target.
        max_rows: keep only this many most-recent clean rows (0 = no cap).
        test_span_steps: keep only this many leading rows of the test split.

    Returns:
        ``(split, y_scaler, clean)``: the scaled partitions, the fitted target
        scaler, and the unscaled clean frame the partitions were cut from.

    Raises:
        ValueError: if fewer than 100 clean rows remain.
    """
    data = df.copy()
    data["target"] = data["energy_wh"].shift(-horizon)

    non_features = {"target", "date", "activity"}
    feature_names = [
        c for c in data.columns
        if c not in non_features and pd.api.types.is_numeric_dtype(data[c])
    ]
    candidate = data[feature_names + ["target"]].replace([np.inf, -np.inf], np.nan)

    # A feature with no usable value at all (e.g. occupancy when its file is
    # missing) would make dropna() discard every row; leave it out instead.
    unusable = [name for name in feature_names if candidate[name].isna().all()]
    if unusable:
        warnings.warn(f"Dropping feature columns that contain no data: {unusable}")
        feature_names = [name for name in feature_names if name not in unusable]
        candidate = candidate[feature_names + ["target"]]

    clean = candidate.dropna()
    if max_rows and len(clean) > max_rows:
        clean = clean.tail(max_rows)

    n = len(clean)
    if n < 100:
        raise ValueError(f"Not enough clean rows after preprocessing: {n}")
    train_end = int(n * 0.70)
    val_end = int(n * 0.85)

    x_raw = clean[feature_names]
    y_raw = clean[["target"]]

    x_scaler = MinMaxScaler()
    y_scaler = MinMaxScaler()

    # Fit on the training partition only so later data cannot leak into the
    # scaling parameters.
    x_train = x_scaler.fit_transform(x_raw.iloc[:train_end])
    x_val = x_scaler.transform(x_raw.iloc[train_end:val_end])
    x_test = x_scaler.transform(x_raw.iloc[val_end:])

    y_train = y_scaler.fit_transform(y_raw.iloc[:train_end]).ravel()
    y_val = y_scaler.transform(y_raw.iloc[train_end:val_end]).ravel()
    y_test = y_scaler.transform(y_raw.iloc[val_end:]).ravel()

    if test_span_steps is not None:
        x_test = x_test[:test_span_steps]
        y_test = y_test[:test_span_steps]

    split = SplitData(
        x_train=x_train,
        x_val=x_val,
        x_test=x_test,
        y_train=y_train,
        y_val=y_val,
        y_test=y_test,
        feature_names=feature_names,
        test_span_steps=test_span_steps,
    )
    return split, y_scaler, clean
|
|
|
|
def make_lstm_sequences(np, split: SplitData, lookback: int):
    """Turn the tabular split into sliding windows for sequence models.

    The three partitions are stacked back into one chronological array and a
    window of ``lookback`` consecutive rows is cut for every possible end row.
    A window belongs to the partition that contains its last row, so the first
    validation/test windows reach back into the preceding partition.

    Args:
        np: the numpy module.
        split: object exposing ``x_train/x_val/x_test`` and
            ``y_train/y_val/y_test``.
        lookback: window length in rows; must be at least 1.

    Returns:
        ``(x_train, x_val, x_test, y_train, y_val, y_test)`` where each ``x``
        holds windows and each ``y`` the target of the window's last row.

    Raises:
        ValueError: if ``lookback`` is smaller than 1 (such values would
            silently yield empty or ragged windows).
    """
    if lookback < 1:
        raise ValueError(f"lookback must be at least 1, got {lookback}")

    x_all = np.vstack([split.x_train, split.x_val, split.x_test])
    y_all = np.concatenate([split.y_train, split.y_val, split.y_test])
    n_train = len(split.y_train)
    n_val = len(split.y_val)

    # The earliest row that has a full window behind it is lookback - 1.
    window_ends = list(range(lookback - 1, len(y_all)))
    xs = np.asarray([x_all[end - lookback + 1:end + 1] for end in window_ends])
    ys = np.asarray([y_all[end] for end in window_ends])
    end_indices = np.asarray(window_ends)

    train_mask = end_indices < n_train
    val_mask = (end_indices >= n_train) & (end_indices < n_train + n_val)
    test_mask = end_indices >= n_train + n_val

    return (
        xs[train_mask],
        xs[val_mask],
        xs[test_mask],
        ys[train_mask],
        ys[val_mask],
        ys[test_mask],
    )
|
|
|
|
def inverse_metric(np, y_scaler, y_true_scaled, y_pred_scaled, mean_absolute_error, r2_score):
    """Score predictions after undoing the target scaling.

    Both inputs are mapped back through ``y_scaler.inverse_transform`` before
    the metric callables are applied. Returns a dict with the float entries
    ``"mae_wh"`` and ``"r2"``.
    """

    def unscale(values):
        # The scaler works on 2-D input; flatten the result again afterwards.
        column = np.asarray(values).reshape(-1, 1)
        return y_scaler.inverse_transform(column).ravel()

    actual = unscale(y_true_scaled)
    predicted = unscale(y_pred_scaled)
    mae = float(mean_absolute_error(actual, predicted))
    r2 = float(r2_score(actual, predicted))
    return {"mae_wh": mae, "r2": r2}
|
|
|
|
def save_feature_importance(pd, out_dir: Path, target_name: str, model_name: str, feature_names: list[str], importances):
    """Write a model's feature importances to CSV, most important first.

    The file is ``<out_dir>/feature_importance/<target>_<model>_feature_importance.csv``;
    the folder is created when it does not exist.
    """
    destination = out_dir / "feature_importance"
    destination.mkdir(parents=True, exist_ok=True)

    ranking = pd.DataFrame({"feature": feature_names, "importance": importances})
    ranking = ranking.sort_values("importance", ascending=False)

    csv_name = f"{target_name}_{model_name}_feature_importance.csv"
    ranking.to_csv(destination / csv_name, index=False)
|
|
|
|
def train_baselines(stack, split: SplitData, y_scaler, args, out_dir: Path, target_name: str):
    """Fit the tabular baselines and score them on the test partition.

    Ridge regression, a decision tree and a random forest are always trained;
    LightGBM is added when it was importable, and its feature importances are
    saved to CSV. When LightGBM is unavailable, a placeholder row with NaN
    metrics and the import error is appended instead.

    Returns:
        list of result dicts, one per model.
    """
    np = stack["np"]
    pd = stack["pd"]
    lgb = stack["lgb"]

    estimators = [
        ("ridge_regression", stack["Ridge"](alpha=1.0)),
        (
            "decision_tree",
            stack["DecisionTreeRegressor"](max_depth=14, min_samples_split=20, random_state=42),
        ),
        (
            "random_forest",
            stack["RandomForestRegressor"](
                n_estimators=args.rf_trees,
                random_state=42,
                n_jobs=-1,
                min_samples_split=2,
            ),
        ),
    ]
    if lgb is not None:
        estimators.append((
            "lightgbm",
            lgb.LGBMRegressor(
                n_estimators=500,
                learning_rate=0.03,
                num_leaves=31,
                subsample=0.9,
                colsample_bytree=0.9,
                random_state=42,
                n_jobs=-1,
                verbose=-1,
            ),
        ))

    results = []
    for label, estimator in estimators:
        estimator.fit(split.x_train, split.y_train)
        scores = inverse_metric(
            np,
            y_scaler,
            split.y_test,
            estimator.predict(split.x_test),
            stack["mean_absolute_error"],
            stack["r2_score"],
        )
        if label == "lightgbm" and hasattr(estimator, "feature_importances_"):
            save_feature_importance(
                pd, out_dir, target_name, label, split.feature_names, estimator.feature_importances_
            )
            # Record where the importance table was written.
            scores["feature_importance_path"] = str(
                out_dir / "feature_importance" / f"{target_name}_{label}_feature_importance.csv"
            )
        results.append({"model": label, **scores})

    if lgb is None:
        results.append({
            "model": "lightgbm",
            "mae_wh": math.nan,
            "r2": math.nan,
            "note": f"lightgbm_missing: {stack.get('lgbm_import_error')}",
        })
    return results
|
|
|
|
def train_sequence_torch(stack, split: SplitData, y_scaler, args, model_kind: str):
    """Train one PyTorch sequence model and score it on the test windows.

    ``model_kind`` selects the architecture: ``"lstm"``, ``"cnn1d"`` or
    ``"tcn"``. Training minimises L1 loss with RMSprop (lr 1e-3) for at most
    ``args.epochs`` epochs, stops after 4 consecutive epochs without a better
    validation loss, and restores the best weights before testing.

    Returns:
        A one-element list with the result dict. When too few windows are
        available, the metrics are NaN and the note is
        ``"not_enough_sequences"``.

    Raises:
        KeyError: if ``model_kind`` is not one of the supported names.
    """
    np = stack["np"]
    torch = stack["torch"]
    nn = stack["nn"]
    DataLoader = stack["DataLoader"]
    TensorDataset = stack["TensorDataset"]

    # Seed before anything random happens so weight initialisation and batch
    # shuffling are repeatable.
    torch.manual_seed(42)
    np.random.seed(42)

    x_train, x_val, x_test, y_train, y_val, y_test = make_lstm_sequences(np, split, args.lookback)
    too_few_windows = len(x_train) < 100 or len(x_val) < 10 or len(x_test) < 10
    if too_few_windows:
        return [{"model": model_kind, "mae_wh": math.nan, "r2": math.nan, "note": "not_enough_sequences", "backend": "torch"}]

    class DecodeLSTM(nn.Module):
        """Single LSTM layer (32 units) followed by a 32-5-5-1 dense head."""

        def __init__(self, input_size: int):
            super().__init__()
            self.lstm = nn.LSTM(input_size=input_size, hidden_size=32, batch_first=True)
            self.head = nn.Sequential(
                nn.Linear(32, 5),
                nn.ReLU(),
                nn.Linear(5, 5),
                nn.ReLU(),
                nn.Linear(5, 1),
            )

        def forward(self, x):
            # Only the output of the last time step feeds the head.
            sequence_out, _ = self.lstm(x)
            last_step = sequence_out[:, -1, :]
            return self.head(last_step).squeeze(-1)

    class DecodeCNN1D(nn.Module):
        """Two same-length 1-D convolutions, average pooling over time, dense head."""

        def __init__(self, input_size: int):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv1d(input_size, 32, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.Conv1d(32, 32, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.AdaptiveAvgPool1d(1),
            )
            self.head = nn.Sequential(
                nn.Flatten(),
                nn.Linear(32, 16),
                nn.ReLU(),
                nn.Linear(16, 1),
            )

        def forward(self, x):
            # Conv1d wants (batch, channels, time); windows arrive as (batch, time, features).
            channels_first = x.transpose(1, 2)
            return self.head(self.net(channels_first)).squeeze(-1)

    class Chomp1d(nn.Module):
        """Drop the last ``chomp_size`` time steps of a (batch, channels, time) tensor."""

        def __init__(self, chomp_size: int):
            super().__init__()
            self.chomp_size = chomp_size

        def forward(self, x):
            if not self.chomp_size:
                return x
            return x[:, :, :-self.chomp_size].contiguous()

    class DecodeTCN(nn.Module):
        """Two dilated convolutions whose trailing padding is trimmed, plus a dense head."""

        def __init__(self, input_size: int):
            super().__init__()
            # Each convolution pads both ends and Chomp1d removes the right-hand
            # padding, so an output step only sees the current and earlier steps.
            self.net = nn.Sequential(
                nn.Conv1d(input_size, 32, kernel_size=3, padding=2, dilation=1),
                Chomp1d(2),
                nn.ReLU(),
                nn.Conv1d(32, 32, kernel_size=3, padding=4, dilation=2),
                Chomp1d(4),
                nn.ReLU(),
            )
            self.head = nn.Sequential(
                nn.Linear(32, 16),
                nn.ReLU(),
                nn.Linear(16, 1),
            )

        def forward(self, x):
            channels_first = x.transpose(1, 2)
            time_major = self.net(channels_first).transpose(1, 2)
            return self.head(time_major[:, -1, :]).squeeze(-1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    architectures = {
        "lstm": DecodeLSTM,
        "cnn1d": DecodeCNN1D,
        "tcn": DecodeTCN,
    }
    model = architectures[model_kind](input_size=x_train.shape[2]).to(device)
    criterion = nn.L1Loss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3)

    train_ds = TensorDataset(
        torch.tensor(x_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32),
    )
    # The validation set is evaluated in a single forward pass.
    val_x = torch.tensor(x_val, dtype=torch.float32).to(device)
    val_y = torch.tensor(y_val, dtype=torch.float32).to(device)
    train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True)

    patience = 4
    stale_epochs = 0
    best_val_loss = math.inf
    best_state = None
    epochs_run = 0

    for epoch_number in range(1, args.epochs + 1):
        model.train()
        loss_sum = 0.0
        samples_seen = 0
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_x), batch_y)
            batch_loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch figure is a per-sample mean.
            loss_sum += batch_loss.item() * len(batch_y)
            samples_seen += len(batch_y)

        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(val_x), val_y).item()

        epochs_run = epoch_number
        print(f"Epoch {epochs_run}/{args.epochs} - loss: {loss_sum / max(samples_seen, 1):.6f} - val_loss: {val_loss:.6f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Snapshot on the CPU so the copy is independent of later updates.
            best_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}
            stale_epochs = 0
            continue
        stale_epochs += 1
        if stale_epochs >= patience:
            break

    if best_state is not None:
        model.load_state_dict(best_state)

    model.eval()
    test_loader = DataLoader(
        TensorDataset(torch.tensor(x_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32)),
        batch_size=args.batch_size,
        shuffle=False,
    )
    batch_predictions = []
    with torch.no_grad():
        for batch_x, _ in test_loader:
            batch_predictions.append(model(batch_x.to(device)).detach().cpu().numpy())
    predictions = np.concatenate(batch_predictions)

    metrics = inverse_metric(
        np,
        y_scaler,
        y_test,
        predictions,
        stack["mean_absolute_error"],
        stack["r2_score"],
    )
    metrics["epochs_run"] = epochs_run
    metrics["backend"] = "torch"
    metrics["device"] = str(device)
    return [{"model": model_kind, **metrics}]
|
|
|
|
def train_lstm_torch(stack, split: SplitData, y_scaler, args):
    """Train the PyTorch LSTM; thin wrapper around ``train_sequence_torch``."""
    return train_sequence_torch(stack, split, y_scaler, args, model_kind="lstm")
|
|
|
|
def train_cnn1d_torch(stack, split: SplitData, y_scaler, args):
    """Train the PyTorch 1-D CNN; thin wrapper around ``train_sequence_torch``."""
    return train_sequence_torch(stack, split, y_scaler, args, model_kind="cnn1d")
|
|
|
|
def train_tcn_torch(stack, split: SplitData, y_scaler, args):
    """Train the PyTorch TCN; thin wrapper around ``train_sequence_torch``."""
    return train_sequence_torch(stack, split, y_scaler, args, model_kind="tcn")
|
|
|
|
def train_lstm_keras(stack, split: SplitData, y_scaler, args):
    """Train the LSTM with TensorFlow/Keras and score it on the test windows.

    The network is an LSTM layer (32 units) followed by dense layers of 5, 5
    and 1 units, trained with RMSprop on MAE loss. Early stopping watches the
    validation loss with a patience of 4 and restores the best weights.

    Returns:
        A one-element list with the result dict. When too few windows are
        available, the metrics are NaN and the note is
        ``"not_enough_sequences"``.
    """
    np = stack["np"]
    keras = stack["keras"]
    tf = stack["tf"]
    tf.random.set_seed(42)
    np.random.seed(42)

    x_train, x_val, x_test, y_train, y_val, y_test = make_lstm_sequences(np, split, args.lookback)
    too_few_windows = len(x_train) < 100 or len(x_val) < 10 or len(x_test) < 10
    if too_few_windows:
        return [{"model": "lstm", "mae_wh": math.nan, "r2": math.nan, "note": "not_enough_sequences", "backend": "tensorflow"}]

    layers = keras.layers
    window_shape = (x_train.shape[1], x_train.shape[2])
    network = keras.Sequential(
        [
            layers.Input(shape=window_shape),
            layers.LSTM(32),
            layers.Dense(5, activation="relu"),
            layers.Dense(5, activation="relu"),
            layers.Dense(1),
        ]
    )
    network.compile(optimizer=keras.optimizers.RMSprop(), loss="mae")

    stopper = keras.callbacks.EarlyStopping(monitor="val_loss", patience=4, restore_best_weights=True)
    fit_log = network.fit(
        x_train,
        y_train,
        validation_data=(x_val, y_val),
        epochs=args.epochs,
        batch_size=args.batch_size,
        verbose=1,
        callbacks=[stopper],
    )

    predictions = network.predict(x_test, verbose=0).ravel()
    metrics = inverse_metric(
        np,
        y_scaler,
        y_test,
        predictions,
        stack["mean_absolute_error"],
        stack["r2_score"],
    )
    # One loss entry is logged per completed epoch.
    metrics["epochs_run"] = len(fit_log.history["loss"])
    metrics["backend"] = "tensorflow"
    return [{"model": "lstm", **metrics}]
|
|
|
|
def train_lstm(stack, split: SplitData, y_scaler, args):
    """Train the LSTM on whichever backend is available.

    PyTorch is preferred, TensorFlow/Keras is the fallback. When
    ``args.skip_lstm`` is set, or neither backend was importable, a single
    placeholder row with NaN metrics and an explanatory note is returned.
    """

    def placeholder(note):
        return [{"model": "lstm", "mae_wh": math.nan, "r2": math.nan, "note": note}]

    if args.skip_lstm:
        return placeholder("skipped_by_flag")

    if stack["torch"] is not None:
        return train_lstm_torch(stack, split, y_scaler, args)
    if stack["keras"] is not None:
        return train_lstm_keras(stack, split, y_scaler, args)

    torch_error = stack.get("torch_import_error")
    tf_error = stack.get("tf_import_error")
    return placeholder(f"lstm_backend_missing: torch={torch_error}; tensorflow={tf_error}")
|
|
|
|
def train_deep_models(stack, split: SplitData, y_scaler, args):
    """Train every deep model named in ``args.dl_models``.

    The option is a comma-separated list; names are case-insensitive and
    ``cnn`` is accepted as an alias of ``cnn1d``. ``none`` (or
    ``args.skip_lstm``) skips all deep models. The CNN and TCN need PyTorch;
    without it they contribute a placeholder row that carries the import
    error. Results are returned in the order lstm, cnn1d, tcn.
    """
    wanted = set()
    for token in args.dl_models.split(","):
        name = token.strip().lower()
        if name:
            wanted.add(name)

    if "none" in wanted or args.skip_lstm:
        return [{"model": "deep_models", "mae_wh": math.nan, "r2": math.nan, "note": "skipped_by_flag"}]

    results = []
    if "lstm" in wanted:
        results.extend(train_lstm(stack, split, y_scaler, args))

    torch_missing_note = f"torch_missing: {stack.get('torch_import_error')}"

    def unavailable(kind):
        return {"model": kind, "mae_wh": math.nan, "r2": math.nan, "note": torch_missing_note}

    if "cnn" in wanted or "cnn1d" in wanted:
        if stack["torch"] is None:
            results.append(unavailable("cnn1d"))
        else:
            results.extend(train_cnn1d_torch(stack, split, y_scaler, args))

    if "tcn" in wanted:
        if stack["torch"] is None:
            results.append(unavailable("tcn"))
        else:
            results.extend(train_tcn_torch(stack, split, y_scaler, args))
    return results
|
|
|
|
def train_arima(stack, clean, args):
    """Fit ARIMA(2,1,2) on the pre-test energy series and score its forecast.

    The model is fitted on ``energy_wh`` from the train+validation rows
    (optionally only the most recent ``args.arima_max_train`` points) and
    evaluated against the unscaled ``target`` column of the test rows.

    ``target`` at a row is the energy ``horizon`` rows later, while the first
    forecast step is the value right after the history. The forecast is
    therefore extended by ``horizon`` steps and its first ``horizon`` values
    are discarded, so each prediction is compared with the value it actually
    forecasts.
    NOTE(review): this alignment assumes the rows of ``clean`` are evenly
    spaced; rows removed by dropna upstream would shift it — confirm.

    Returns:
        A one-element list with the result dict. Metrics are NaN (with an
        explanatory note) when ARIMA is disabled, statsmodels is missing,
        there are too few points, or fitting fails.
    """
    if not args.include_arima:
        return [{"model": "arima", "mae_wh": math.nan, "r2": math.nan, "note": "skipped_enable_with_include_arima"}]
    if stack["ARIMA"] is None:
        return [{
            "model": "arima",
            "mae_wh": math.nan,
            "r2": math.nan,
            "note": f"statsmodels_missing: {stack.get('statsmodels_import_error')}",
        }]

    np = stack["np"]
    pd = stack["pd"]
    horizon = horizon_steps_from_args(pd, args)

    # Same 85% boundary that make_ml_split uses for the start of the test rows.
    val_end = int(len(clean) * 0.85)
    history = clean["energy_wh"].iloc[:val_end].dropna()
    if args.arima_max_train and len(history) > args.arima_max_train:
        history = history.iloc[-args.arima_max_train:]
    y_true = clean["target"].iloc[val_end:].dropna()
    test_span_steps = test_span_steps_from_args(pd, args)
    if test_span_steps is not None:
        y_true = y_true.iloc[:test_span_steps]
    if len(history) < 50 or len(y_true) < 10:
        return [{"model": "arima", "mae_wh": math.nan, "r2": math.nan, "note": "not_enough_points"}]

    # Fitting is best-effort: any failure is reported in the note instead of
    # aborting the whole run.
    try:
        model = stack["ARIMA"](history, order=(2, 1, 2))
        fitted = model.fit()
        # Forecast far enough ahead to cover the horizon offset, then skip the
        # steps that precede the first test target.
        full_forecast = np.asarray(fitted.forecast(steps=horizon + len(y_true)))
        forecast = full_forecast[horizon:]
        metrics = {
            "mae_wh": float(stack["mean_absolute_error"](y_true, forecast)),
            "r2": float(stack["r2_score"](y_true, forecast)),
            "note": f"order=(2,1,2); train_points={len(history)}",
        }
        return [{"model": "arima", **metrics}]
    except Exception as exc:
        return [{"model": "arima", "mae_wh": math.nan, "r2": math.nan, "note": f"arima_failed: {type(exc).__name__}: {exc}"}]
|
|
|
|
def save_dataset(pd, clean, out_dir: Path, target_name: str):
    """Write the train-ready table to ``<out_dir>/processed`` and return its path.

    The ``processed`` folder is created when missing. ``pd`` is accepted for
    signature symmetry with the other helpers and is not used.
    """
    processed_dir = out_dir / "processed"
    processed_dir.mkdir(parents=True, exist_ok=True)
    csv_path = processed_dir / f"{target_name}_train_ready.csv"
    clean.to_csv(csv_path)
    return csv_path
|
|
|
|
def run_target(stack, args, energy_10min, calendar, target_name: str, target_spec: dict, out_dir: Path):
    """Run the whole pipeline for one target and return its result rows.

    Builds the feature frame, splits and scales it, saves the train-ready
    table, trains the baselines, the deep models and ARIMA, and tags every
    result row with the run settings (target, row/feature counts, horizon,
    lookback and test span).
    """
    pd = stack["pd"]
    np = stack["np"]
    horizon_steps = horizon_steps_from_args(pd, args)
    test_span_steps = test_span_steps_from_args(pd, args)
    horizon_text = describe_steps(pd, horizon_steps, args.freq)
    lookback_text = describe_steps(pd, args.lookback, args.freq)

    print(f"\n=== Target: {target_name} ===")
    print(f"Horizon: {horizon_steps} step(s) = {horizon_text}")
    print(f"Lookback: {args.lookback} step(s) = {lookback_text}")
    if test_span_steps is not None:
        print(f"Test span: {test_span_steps} step(s) = {describe_steps(pd, test_span_steps, args.freq)}")

    frame = build_target_frame(
        pd,
        energy_10min,
        calendar,
        target_name,
        target_spec,
        args.freq,
        args.include_weather,
    )
    split, y_scaler, clean = make_ml_split(
        np,
        pd,
        stack["MinMaxScaler"],
        frame,
        horizon=horizon_steps,
        max_rows=args.max_rows,
        test_span_steps=test_span_steps,
    )
    dataset_path = save_dataset(pd, clean, out_dir, target_name)
    print(f"Rows after feature engineering: {len(clean):,}")
    print(f"Train/val/test: {len(split.y_train):,}/{len(split.y_val):,}/{len(split.y_test):,}")
    print(f"Saved train-ready table: {dataset_path}")

    results = [
        *train_baselines(stack, split, y_scaler, args, out_dir, target_name),
        *train_deep_models(stack, split, y_scaler, args),
        *train_arima(stack, clean, args),
    ]

    # Without an explicit cap, the reported span is the size of the test split.
    reported_span = test_span_steps if test_span_steps is not None else len(split.y_test)
    run_settings = {
        "target": target_name,
        "rows": len(clean),
        "features": len(split.feature_names),
        "horizon_steps": horizon_steps,
        "horizon": horizon_text,
        "lookback_steps": args.lookback,
        "lookback": lookback_text,
        "test_span_steps": reported_span,
        "test_span": describe_steps(pd, reported_span, args.freq),
    }
    for row in results:
        row.update(run_settings)
    return results
|
|
|
|
def _print_run_banner(stack, args) -> None:
    """Print the run header and the availability of each optional backend."""
    print("DECODE re-implementation")
    print(f"Mode: {args.mode}")
    print(f"Frequency: {args.freq}")
    print(f"Python executable: {sys.executable}")

    # (availability label, error label, stack key, import-error key)
    optional_backends = (
        ("PyTorch", "PyTorch", "torch", "torch_import_error"),
        ("TensorFlow", "TensorFlow", "keras", "tf_import_error"),
        ("LightGBM", "LightGBM", "lgb", "lgbm_import_error"),
        ("Statsmodels ARIMA", "Statsmodels", "ARIMA", "statsmodels_import_error"),
    )
    for label, error_label, stack_key, error_key in optional_backends:
        available = stack[stack_key] is not None
        print(f"{label} available: {available}")
        if not available:
            print(f"{error_label} import error: {stack.get(error_key)}")

    if not args.skip_lstm:
        # PyTorch is reported first, TensorFlow/Keras only when it is missing.
        if stack["torch"] is not None:
            backend = "PyTorch"
        elif stack["keras"] is not None:
            backend = "TensorFlow/Keras"
        else:
            backend = "unavailable"
        print(f"LSTM backend: {backend}")


def _build_run_metadata(stack, args, horizon_steps, test_span_steps) -> dict:
    """Collect the run settings, backend availability, and input paths for the JSON run config."""
    return {
        "mode": args.mode,
        "target": args.target,
        "freq": args.freq,
        "lookback": args.lookback,
        "horizon": args.horizon,
        "horizon_days": args.horizon_days,
        "horizon_steps_effective": horizon_steps,
        "test_span_days": args.test_span_days,
        "test_span_steps_effective": test_span_steps,
        "epochs": args.epochs,
        "batch_size": args.batch_size,
        "rf_trees": args.rf_trees,
        "dl_models": args.dl_models,
        "include_arima": args.include_arima,
        "include_weather": args.include_weather,
        "pytorch_available": stack["torch"] is not None,
        "torch_import_error": stack.get("torch_import_error"),
        "lightgbm_available": stack["lgb"] is not None,
        "lightgbm_import_error": stack.get("lgbm_import_error"),
        "statsmodels_available": stack["ARIMA"] is not None,
        "statsmodels_import_error": stack.get("statsmodels_import_error"),
        "tensorflow_available": stack["keras"] is not None,
        "tensorflow_import_error": stack.get("tf_import_error"),
        "energy_file": str(ENERGY_FILE),
        "occupancy_dir": str(OCCUPANCY_DIR),
        "calendar_dir": str(CALENDAR_DIR),
        "weather_file": str(WEATHER_FILE),
    }


def main() -> int:
    """Run the pipeline for every selected target and persist the results.

    Parses the command line, prepares the output directory, runs
    ``run_target`` for each configured target, then writes
    ``results_<mode>_<suffix>.csv`` and ``run_config_<mode>.json`` into the
    output directory and prints a summary table.

    Returns:
        ``0`` on success, intended to be used as the process exit status.

    Raises:
        SystemExit: If ``--target`` names a target that does not exist in the
            selected mode's configuration.
    """
    args = parse_args()
    stack = import_stack()
    pd = stack["pd"]

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "processed").mkdir(exist_ok=True)

    config = PAPER_BUILDINGS if args.mode == "paper_buildings" else METER_TARGETS
    if args.target != "all":
        if args.target not in config:
            raise SystemExit(f"Unknown target {args.target!r}. Available: {', '.join(config)}")
        config = {args.target: config[args.target]}

    _print_run_banner(stack, args)

    energy_10min = read_energy_10min(pd, args.freq)
    calendar = read_calendar(pd)

    all_rows = []
    for target_name, target_spec in config.items():
        all_rows.extend(run_target(stack, args, energy_10min, calendar, target_name, target_spec, out_dir))

    results = pd.DataFrame(all_rows)

    # Resolve the effective windowing once; it feeds both the results file
    # name and the run-config metadata.
    effective_horizon = horizon_steps_from_args(pd, args)
    effective_test_span = test_span_steps_from_args(pd, args)
    result_suffix = f"h{effective_horizon}"
    if effective_test_span is not None:
        result_suffix += f"_ts{effective_test_span}"
    result_path = out_dir / f"results_{args.mode}_{result_suffix}.csv"
    results.to_csv(result_path, index=False)

    metadata = _build_run_metadata(stack, args, effective_horizon, effective_test_span)
    # NOTE(review): the config file name carries only the mode, so runs with a
    # different horizon/test span overwrite it — confirm this is intended.
    (out_dir / f"run_config_{args.mode}.json").write_text(json.dumps(metadata, indent=2), encoding="utf-8")

    print("\n=== Results ===")
    if results.empty:
        # An empty frame has no columns, so sorting it would raise KeyError.
        print("No model results were produced.")
    else:
        # Sort only by the columns that exist: rows may lack a metric column
        # if no trainer reported one.
        sort_keys = [column for column in ("target", "mae_wh") if column in results.columns]
        ordered = results.sort_values(sort_keys) if sort_keys else results
        print(ordered.to_string(index=False))
    print(f"\nSaved results: {result_path}")
    return 0
|
|
|
|
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|