#!/usr/bin/env python3
"""Re-implement the DECODE paper training pipeline for local I-BLEND data.

The paper pipeline is:
1. Fuse energy, occupancy, calendar, and weather-like environmental features.
2. Align everything to a 10-minute sampling rate.
3. Normalize features with Min-Max scaling.
4. Split chronologically into train/validation/test with a 70:15:15 ratio.
5. Compare LSTM with Linear Regression, Decision Tree, and Random Forest.

This local implementation supports two targets:
- paper_buildings: 7 building-level series matching the paper.
- meters: 9 meter-level series from all_buildings_power.csv.

Weather is optional because the local weather file in this workspace starts in
2018, while the energy data ends in 2017. The script detects this and continues.
"""

from __future__ import annotations

import argparse
import os
import json
import math
import sys
import warnings
from dataclasses import dataclass
from pathlib import Path


ROOT = Path(__file__).resolve().parents[1]
DEFAULT_DATA_MINING_ROOT = ROOT if (ROOT / "IIITD_occupancy_dataset").exists() else ROOT.parent
DATA_MINING_ROOT = Path(os.environ.get("IBLEND_DATA_ROOT", DEFAULT_DATA_MINING_ROOT))
ENERGY_FILE = Path(os.environ.get("IBLEND_ENERGY_FILE", DATA_MINING_ROOT / "energy_dataset" / "all_buildings_power.csv"))
OCCUPANCY_DIR = DATA_MINING_ROOT / "IIITD_occupancy_dataset" / "IIITD_occupancy_dataset"
CALENDAR_DIR = DATA_MINING_ROOT / "iiitd_calender_schedule" / "iiitd_calender_schedule"
WEATHER_FILE = DATA_MINING_ROOT / "weather_comparison" / "weather_comparison" / "IIITD_and_airport_data.csv"
OUT_DIR = ROOT / "decode_reimplementation_outputs"
TZ = "Asia/Kolkata"


# Building-level targets: each sums one or more meter columns and is paired
# with the occupancy file code used for that building.
PAPER_BUILDINGS = {
    "Academic": {"meters": ["Academic"], "occupancy": "ACB"},
    "Boys_hostel": {"meters": ["Boys_main", "Boys_backup"], "occupancy": "BH"},
    "Girls_hostel": {"meters": ["Girls_main", "Girls_backup"], "occupancy": "GH"},
    "Library": {"meters": ["Library"], "occupancy": "LB"},
    "Lecture": {"meters": ["Lecture"], "occupancy": "LCB"},
    "Dining": {"meters": ["Mess"], "occupancy": "DB"},
    "Facilities": {"meters": ["Facilities"], "occupancy": "SRB"},
}

# Meter-level targets: one meter column per target.
METER_TARGETS = {
    "Academic": {"meters": ["Academic"], "occupancy": "ACB"},
    "Boys_main": {"meters": ["Boys_main"], "occupancy": "BH"},
    "Boys_backup": {"meters": ["Boys_backup"], "occupancy": "BH"},
    "Facilities": {"meters": ["Facilities"], "occupancy": "SRB"},
    "Girls_main": {"meters": ["Girls_main"], "occupancy": "GH"},
    "Girls_backup": {"meters": ["Girls_backup"], "occupancy": "GH"},
    "Lecture": {"meters": ["Lecture"], "occupancy": "LCB"},
    "Library": {"meters": ["Library"], "occupancy": "LB"},
    "Mess": {"meters": ["Mess"], "occupancy": "DB"},
}


@dataclass
class SplitData:
    """Scaled train/validation/test arrays plus the feature column names."""

    x_train: object
    x_val: object
    x_test: object
    y_train: object
    y_val: object
    y_test: object
    feature_names: list[str]
    test_span_steps: int | None = None


def import_stack():
    """Import the third-party stack lazily and return it as a dict.

    pandas, numpy and scikit-learn are required; a missing one ends the
    program with installation instructions. PyTorch, LightGBM, statsmodels
    and TensorFlow are optional: when an import fails the entry is ``None``
    and the matching ``*_import_error`` entry holds the reason.
    """
    try:
        import numpy as np
        import pandas as pd
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.linear_model import Ridge
        from sklearn.metrics import mean_absolute_error, r2_score
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.tree import DecisionTreeRegressor
    except ImportError as exc:
        # Prefer the structured module name; fall back to parsing the message.
        missing = exc.name or str(exc).split("No module named ")[-1].strip("'")
        raise SystemExit(
            f"Missing dependency: {missing}\n"
            "Install the base training stack with:\n"
            f"  {sys.executable} -m pip install pandas numpy scikit-learn\n"
            "Install LSTM support with either PyTorch or TensorFlow:\n"
            f"  {sys.executable} -m pip install torch\n"
            f"  {sys.executable} -m pip install tensorflow\n"
        ) from exc

    # Optional imports catch Exception (not just ImportError) on purpose:
    # a broken native install can raise other error types at import time.
    torch_import_error = None
    try:
        import torch
        from torch import nn
        from torch.utils.data import DataLoader, TensorDataset
    except Exception as exc:
        torch = None
        nn = None
        DataLoader = None
        TensorDataset = None
        torch_import_error = f"{type(exc).__name__}: {exc}"

    lgbm_import_error = None
    try:
        import lightgbm as lgb
    except Exception as exc:
        lgb = None
        lgbm_import_error = f"{type(exc).__name__}: {exc}"

    statsmodels_import_error = None
    try:
        from statsmodels.tsa.arima.model import ARIMA
    except Exception as exc:
        ARIMA = None
        statsmodels_import_error = f"{type(exc).__name__}: {exc}"

    tf = None
    keras = None
    tf_import_error = None
    disable_tf_when_torch_available = os.environ.get("DECODE_DISABLE_TENSORFLOW", "1") == "1"
    if disable_tf_when_torch_available and torch is not None:
        tf_import_error = "disabled because PyTorch is available"
    else:
        try:
            import tensorflow as tf
            from tensorflow import keras
        except Exception as exc:
            tf = None
            keras = None
            tf_import_error = f"{type(exc).__name__}: {exc}"

    return {
        "np": np,
        "pd": pd,
        "RandomForestRegressor": RandomForestRegressor,
        "Ridge": Ridge,
        "DecisionTreeRegressor": DecisionTreeRegressor,
        "MinMaxScaler": MinMaxScaler,
        "mean_absolute_error": mean_absolute_error,
        "r2_score": r2_score,
        "torch": torch,
        "nn": nn,
        "DataLoader": DataLoader,
        "TensorDataset": TensorDataset,
        "torch_import_error": torch_import_error,
        "lgb": lgb,
        "lgbm_import_error": lgbm_import_error,
        "ARIMA": ARIMA,
        "statsmodels_import_error": statsmodels_import_error,
        "tf": tf,
        "keras": keras,
        "tf_import_error": tf_import_error,
    }


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the training script."""
    parser = argparse.ArgumentParser(description="DECODE paper re-implementation for I-BLEND data.")
    parser.add_argument(
        "--mode",
        choices=["paper_buildings", "meters"],
        default="paper_buildings",
        help="paper_buildings trains 7 building-level targets; meters trains 9 meter-level targets.",
    )
    parser.add_argument(
        "--target",
        default="all",
        help="Target name to train, or 'all'. Names depend on --mode.",
    )
    parser.add_argument("--freq", default="10min", help="Common sampling frequency. Paper uses 10min.")
    parser.add_argument("--lookback", type=int, default=18, help="LSTM lookback steps. 18 at 10min = 3 hours.")
    parser.add_argument("--horizon", type=int, default=1, help="Prediction horizon in rows. 1 at 10min = next 10min.")
    parser.add_argument("--horizon-days", type=float, default=0, help="Prediction horizon in days. Overrides --horizon when > 0.")
    parser.add_argument("--test-span-days", type=float, default=0, help="Evaluate only this many days from the chronological test split.")
    parser.add_argument("--epochs", type=int, default=20, help="LSTM epochs. Paper uses 20.")
    parser.add_argument("--batch-size", type=int, default=64, help="LSTM batch size. Paper uses 64.")
    parser.add_argument("--rf-trees", type=int, default=500, help="Random Forest trees. Paper tuned to 500.")
    parser.add_argument("--dl-models", default="lstm,cnn,tcn", help="Comma-separated deep models: lstm,cnn,tcn,none.")
    parser.add_argument("--include-arima", action="store_true", help="Fit ARIMA(2,1,2). This can be slow on full series.")
    parser.add_argument("--arima-max-train", type=int, default=20000, help="Maximum recent train points for ARIMA fitting.")
    parser.add_argument("--max-rows", type=int, default=0, help="Optional cap after preprocessing for quick smoke tests.")
    parser.add_argument("--include-weather", action="store_true", help="Try to merge local weather data if time ranges overlap.")
    parser.add_argument("--skip-lstm", action="store_true", help="Only train baseline ML models.")
    parser.add_argument("--output-dir", default=str(OUT_DIR), help="Output directory.")
    return parser.parse_args()


def read_energy_10min(pd, freq: str):
    """Load the energy CSV and return per-interval energy in Wh at ``freq``.

    Raises:
        FileNotFoundError: if ``ENERGY_FILE`` does not exist.
    """
    if not ENERGY_FILE.exists():
        raise FileNotFoundError(f"Missing energy file: {ENERGY_FILE}")

    df = pd.read_csv(ENERGY_FILE, na_values=["NA", ""])
    df["datetime"] = pd.to_datetime(df["timestamp"], unit="s", utc=True).dt.tz_convert(TZ)
    df = df.drop(columns=["timestamp"]).set_index("datetime").sort_index()

    # Paper predicts energy in Wh. Original columns are power in W at 1-minute resolution.
    # For a 10-minute interval: Wh = mean(W) * 10 / 60.
    mean_power_w = df.resample(freq).mean()
    interval_minutes = pd.Timedelta(freq).total_seconds() / 60
    energy_wh = mean_power_w * interval_minutes / 60
    return energy_wh


def read_occupancy_10min(pd, code: str, freq: str):
    """Load ``<code>.csv`` from the occupancy directory, resampled to ``freq``.

    Returns ``None`` (after a warning) when the file does not exist. Gaps in
    ``occupancy_count`` are interpolated in time, then forward/back filled.
    """
    path = OCCUPANCY_DIR / f"{code}.csv"
    if not path.exists():
        warnings.warn(f"Missing occupancy file for {code}: {path}")
        return None

    df = pd.read_csv(path, na_values=["NA", ""])
    df["datetime"] = pd.to_datetime(df["timestamp"], unit="s", utc=True).dt.tz_convert(TZ)
    df = df.drop(columns=["timestamp"]).set_index("datetime").sort_index()
    occ = df.resample(freq).mean()
    occ["occupancy_count"] = occ["occupancy_count"].interpolate(method="time").ffill().bfill()
    return occ


def read_calendar(pd):
    """Concatenate the yearly calendar CSVs into one row per date.

    Returns ``None`` (after a warning) when no calendar file is found. The
    result has ``date``, ``working_day``, ``activity`` and ``activity_code``.
    """
    frames = []
    for path in sorted(CALENDAR_DIR.glob("calender_year_*.csv")):
        frames.append(pd.read_csv(path))
    if not frames:
        warnings.warn(f"No calendar files found in {CALENDAR_DIR}")
        return None

    cal = pd.concat(frames, ignore_index=True)
    cal["date"] = pd.to_datetime(cal["Date"]).dt.date
    cal = cal[["date", "working_day", "activity"]].drop_duplicates("date")
    cal["working_day"] = pd.to_numeric(cal["working_day"], errors="coerce").fillna(0).astype(int)
    cal["activity"] = cal["activity"].fillna("unknown").astype(str)
    cal["activity_code"] = cal["activity"].astype("category").cat.codes
    return cal


def read_weather_10min(pd, freq: str):
    """Load the weather CSV and resample its numeric columns to ``freq``.

    The first CSV column is taken as the timestamp. Returns ``None`` (after a
    warning) when the file does not exist.
    """
    if not WEATHER_FILE.exists():
        warnings.warn(f"Missing weather file: {WEATHER_FILE}")
        return None

    df = pd.read_csv(WEATHER_FILE, na_values=["NA", ""])
    first_col = df.columns[0]
    df = df.rename(columns={first_col: "datetime"})
    df["datetime"] = pd.to_datetime(df["datetime"], errors="coerce")
    df = df.dropna(subset=["datetime"]).set_index("datetime").sort_index()
    if df.index.tz is None:
        df.index = df.index.tz_localize(TZ)

    # Averaging only makes sense for numeric columns; recent pandas raises
    # on text columns instead of silently dropping them.
    numeric = df.select_dtypes(include="number")
    weather = numeric.resample(freq).mean().interpolate(method="time").ffill().bfill()
    return weather


def horizon_steps_from_args(pd, args) -> int:
    """Return the forecast horizon in rows (at least 1).

    ``--horizon-days`` takes precedence over ``--horizon`` when positive.
    """
    if args.horizon_days and args.horizon_days > 0:
        freq_delta = pd.Timedelta(args.freq)
        steps = int(round(pd.Timedelta(days=args.horizon_days) / freq_delta))
        return max(1, steps)
    return max(1, args.horizon)


def test_span_steps_from_args(pd, args) -> int | None:
    """Return the evaluation span in rows, or ``None`` when not restricted."""
    if args.test_span_days and args.test_span_days > 0:
        freq_delta = pd.Timedelta(args.freq)
        steps = int(round(pd.Timedelta(days=args.test_span_days) / freq_delta))
        return max(1, steps)
    return None


# The name starts with "test_", so pytest would try to collect this helper
# from any test module that imports it; opt out explicitly.
test_span_steps_from_args.__test__ = False


def describe_steps(pd, steps: int, freq: str) -> str:
    """Format ``steps`` intervals of ``freq`` as whole days, hours or minutes."""
    delta = pd.Timedelta(freq) * steps
    total_minutes = delta.total_seconds() / 60
    if total_minutes % 1440 == 0:
        return f"{int(total_minutes // 1440)} day(s)"
    if total_minutes % 60 == 0:
        return f"{int(total_minutes // 60)} hour(s)"
    return f"{total_minutes:g} minute(s)"


def add_time_features(pd, df):
    """Return a copy of ``df`` with calendar and cyclical time columns.

    ``df`` must have a DatetimeIndex. ``pd`` is accepted for signature
    symmetry with the other helpers and is not used.
    """
    out = df.copy()
    idx = out.index
    out["hour"] = idx.hour
    out["day_of_week"] = idx.dayofweek
    out["month"] = idx.month
    out["time_slot"] = idx.hour * 60 + idx.minute
    out["hour_sin"] = (2 * math.pi * out["hour"] / 24).map(math.sin)
    out["hour_cos"] = (2 * math.pi * out["hour"] / 24).map(math.cos)
    out["dow_sin"] = (2 * math.pi * out["day_of_week"] / 7).map(math.sin)
    out["dow_cos"] = (2 * math.pi * out["day_of_week"] / 7).map(math.cos)
    out["month_sin"] = (2 * math.pi * out["month"] / 12).map(math.sin)
    out["month_cos"] = (2 * math.pi * out["month"] / 12).map(math.cos)
    return out


def add_historical_features(df):
    """Return a copy of ``df`` with lag and rolling features of ``energy_wh``.

    Requires the ``energy_wh``, ``working_day`` and ``time_slot`` columns.
    All rolling statistics are computed on the series shifted by one row so
    the current value never leaks into its own rolling feature.
    """
    out = df.copy()
    out["energy_lag_1"] = out["energy_wh"].shift(1)
    out["energy_lag_6"] = out["energy_wh"].shift(6)
    out["energy_lag_144"] = out["energy_wh"].shift(144)
    out["energy_lag_1008"] = out["energy_wh"].shift(1008)
    out["rolling_mean_6"] = out["energy_wh"].shift(1).rolling(6).mean()
    out["rolling_mean_144"] = out["energy_wh"].shift(1).rolling(144).mean()
    out["rolling_std_144"] = out["energy_wh"].shift(1).rolling(144).std()

    # DECODE-style baseline features: previous three days with same working-day class
    # and same time instant.
    same_type_groups = out.groupby(["working_day", "time_slot"], sort=False)["energy_wh"]
    out["same_day_type_lag_1"] = same_type_groups.shift(1)
    out["same_day_type_lag_2"] = same_type_groups.shift(2)
    out["same_day_type_lag_3"] = same_type_groups.shift(3)
    return out


def _build_base_frame(pd, energy_10min, calendar, target_spec: dict, freq: str):
    """Fuse energy, occupancy and calendar columns for one target (no weather)."""
    df = pd.DataFrame(index=energy_10min.index)
    df["energy_wh"] = energy_10min[target_spec["meters"]].sum(axis=1, min_count=1)

    occ = read_occupancy_10min(pd, target_spec["occupancy"], freq)
    if occ is not None:
        df = df.join(occ[["occupancy_count"]], how="left")
    else:
        df["occupancy_count"] = math.nan

    df["date"] = df.index.date
    if calendar is not None:
        # The merge drops the index, so it is restored from the "datetime"
        # column produced by reset_index().
        df = df.reset_index().merge(calendar, on="date", how="left").set_index("datetime").sort_index()
    else:
        df["working_day"] = (df.index.dayofweek < 5).astype(int)
        df["activity"] = "unknown"
        df["activity_code"] = 0
    return df


def build_target_frame(pd, energy_10min, calendar, target_name: str, target_spec: dict, freq: str, include_weather: bool):
    """Build the full feature frame for one target.

    Args:
        pd: the pandas module.
        energy_10min: per-meter energy frame indexed by a "datetime" index.
        calendar: calendar frame from ``read_calendar`` or ``None``.
        target_name: name used in error messages.
        target_spec: dict with ``meters`` (columns to sum) and ``occupancy``
            (occupancy file code).
        freq: resampling frequency for occupancy and weather.
        include_weather: inner-join weather features when they overlap.

    Returns:
        A frame with energy, occupancy, calendar, optional weather, time and
        historical features.

    Raises:
        KeyError: if a meter named in ``target_spec`` is not a column of
            ``energy_10min``.
    """
    missing_meters = [m for m in target_spec["meters"] if m not in energy_10min.columns]
    if missing_meters:
        raise KeyError(f"{target_name} references missing meter columns: {missing_meters}")

    df = _build_base_frame(pd, energy_10min, calendar, target_spec, freq)

    if include_weather:
        weather = read_weather_10min(pd, freq)
        if weather is not None:
            joined = df.join(weather, how="inner")
            if joined.empty:
                # Keep the weather-free frame instead of rebuilding it, so the
                # missing-occupancy and missing-calendar fallbacks still apply.
                warnings.warn(
                    "Weather data has no overlap with this target after joining. "
                    "Continuing without weather features."
                )
            else:
                if len(joined) < len(df):
                    warnings.warn(f"Weather join reduced rows from {len(df)} to {len(joined)}.")
                df = joined

    df["occupancy_count"] = df["occupancy_count"].interpolate(method="time").ffill().bfill()
    fallback_working_day = pd.Series((df.index.dayofweek < 5).astype(int), index=df.index)
    df["working_day"] = df["working_day"].fillna(fallback_working_day).astype(int)
    df["activity"] = df["activity"].fillna("unknown").astype(str)
    df["activity_code"] = df["activity_code"].fillna(0).astype(int)
    df = add_time_features(pd, df)
    df = add_historical_features(df)
    return df


def make_ml_split(
    np,
    pd,
    MinMaxScaler,
    df,
    horizon: int,
    max_rows: int,
    test_span_steps: int | None = None,
) -> tuple[SplitData, object, object]:
    """Create the scaled chronological 70/15/15 split for one target.

    The target is ``energy_wh`` shifted ``horizon`` rows into the future.
    Scalers are fitted on the training part only.

    Args:
        np, pd: the numpy and pandas modules.
        MinMaxScaler: scaler class exposing ``fit_transform`` / ``transform``.
        df: feature frame from ``build_target_frame``.
        horizon: forecast horizon in rows.
        max_rows: keep only this many most recent clean rows (0 = all).
        test_span_steps: evaluate only the first N test rows when given.

    Returns:
        ``(split, y_scaler, clean)`` where ``clean`` is the unscaled table of
        feature columns plus ``target``.

    Raises:
        ValueError: if fewer than 100 clean rows remain.
    """
    data = df.copy()
    data["target"] = data["energy_wh"].shift(-horizon)

    non_features = {"target", "date", "activity"}
    candidates = [
        c for c in data.columns
        if c not in non_features and pd.api.types.is_numeric_dtype(data[c])
    ]
    # A column with no valid value at all (e.g. occupancy when its file is
    # missing) would make dropna() below discard every row.
    empty_features = [c for c in candidates if not data[c].notna().any()]
    if empty_features:
        warnings.warn(f"Dropping feature columns with no valid values: {empty_features}")
    feature_names = [c for c in candidates if c not in empty_features]

    clean = data[feature_names + ["target"]].replace([np.inf, -np.inf], np.nan).dropna()
    if max_rows and len(clean) > max_rows:
        clean = clean.tail(max_rows)

    n = len(clean)
    if n < 100:
        raise ValueError(f"Not enough clean rows after preprocessing: {n}")

    train_end = int(n * 0.70)
    val_end = int(n * 0.85)

    x_raw = clean[feature_names]
    y_raw = clean[["target"]]

    x_scaler = MinMaxScaler()
    y_scaler = MinMaxScaler()
    x_train = x_scaler.fit_transform(x_raw.iloc[:train_end])
    x_val = x_scaler.transform(x_raw.iloc[train_end:val_end])
    x_test = x_scaler.transform(x_raw.iloc[val_end:])
    y_train = y_scaler.fit_transform(y_raw.iloc[:train_end]).ravel()
    y_val = y_scaler.transform(y_raw.iloc[train_end:val_end]).ravel()
    y_test = y_scaler.transform(y_raw.iloc[val_end:]).ravel()
    if test_span_steps is not None:
        x_test = x_test[:test_span_steps]
        y_test = y_test[:test_span_steps]

    split = SplitData(
        x_train=x_train,
        x_val=x_val,
        x_test=x_test,
        y_train=y_train,
        y_val=y_val,
        y_test=y_test,
        feature_names=feature_names,
        test_span_steps=test_span_steps,
    )
    return split, y_scaler, clean


def make_lstm_sequences(np, split: SplitData, lookback: int):
    """Turn the flat split into sliding windows of ``lookback`` rows.

    Windows are cut from the concatenated train/val/test rows, so early
    validation and test windows reach back into the preceding part. A window
    belongs to the part that contains its last row.

    Returns:
        ``(x_train, x_val, x_test, y_train, y_val, y_test)``.
    """
    x_all = np.vstack([split.x_train, split.x_val, split.x_test])
    y_all = np.concatenate([split.y_train, split.y_val, split.y_test])
    n_train = len(split.y_train)
    n_val = len(split.y_val)

    xs, ys, end_indices = [], [], []
    for end in range(lookback - 1, len(y_all)):
        # Use the sequence ending at row `end` to predict that row's target.
        # The target was already shifted by --horizon during feature engineering,
        # so excluding row `end` here would make sequence models forecast one
        # extra step farther than the baseline models.
        xs.append(x_all[end - lookback + 1:end + 1])
        ys.append(y_all[end])
        end_indices.append(end)

    xs = np.asarray(xs)
    ys = np.asarray(ys)
    end_indices = np.asarray(end_indices)

    train_mask = end_indices < n_train
    val_mask = (end_indices >= n_train) & (end_indices < n_train + n_val)
    test_mask = end_indices >= n_train + n_val
    return (
        xs[train_mask],
        xs[val_mask],
        xs[test_mask],
        ys[train_mask],
        ys[val_mask],
        ys[test_mask],
    )


def inverse_metric(np, y_scaler, y_true_scaled, y_pred_scaled, mean_absolute_error, r2_score):
    """Undo the target scaling and return MAE (Wh) and R2 as floats."""
    y_true = y_scaler.inverse_transform(np.asarray(y_true_scaled).reshape(-1, 1)).ravel()
    y_pred = y_scaler.inverse_transform(np.asarray(y_pred_scaled).reshape(-1, 1)).ravel()
    return {
        "mae_wh": float(mean_absolute_error(y_true, y_pred)),
        "r2": float(r2_score(y_true, y_pred)),
    }


def save_feature_importance(pd, out_dir: Path, target_name: str, model_name: str, feature_names: list[str], importances):
    """Write a feature-importance CSV, sorted from most to least important."""
    importance_dir = out_dir / "feature_importance"
    importance_dir.mkdir(parents=True, exist_ok=True)
    table = pd.DataFrame({"feature": feature_names, "importance": importances})
    table = table.sort_values("importance", ascending=False)
    table.to_csv(importance_dir / f"{target_name}_{model_name}_feature_importance.csv", index=False)


def train_baselines(stack, split: SplitData, y_scaler, args, out_dir: Path, target_name: str):
    """Fit the tabular baselines and return one metrics row per model.

    LightGBM is included when available; otherwise a NaN row records why.
    """
    np = stack["np"]
    pd = stack["pd"]
    models = {
        "ridge_regression": stack["Ridge"](alpha=1.0),
        "decision_tree": stack["DecisionTreeRegressor"](max_depth=14, min_samples_split=20, random_state=42),
        "random_forest": stack["RandomForestRegressor"](
            n_estimators=args.rf_trees,
            random_state=42,
            n_jobs=-1,
            min_samples_split=2,
        ),
    }
    if stack["lgb"] is not None:
        models["lightgbm"] = stack["lgb"].LGBMRegressor(
            n_estimators=500,
            learning_rate=0.03,
            num_leaves=31,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=42,
            n_jobs=-1,
            verbose=-1,
        )

    rows = []
    for name, model in models.items():
        model.fit(split.x_train, split.y_train)
        pred = model.predict(split.x_test)
        metrics = inverse_metric(
            np,
            y_scaler,
            split.y_test,
            pred,
            stack["mean_absolute_error"],
            stack["r2_score"],
        )
        if name == "lightgbm" and hasattr(model, "feature_importances_"):
            save_feature_importance(pd, out_dir, target_name, name, split.feature_names, model.feature_importances_)
            metrics["feature_importance_path"] = str(
                out_dir / "feature_importance" / f"{target_name}_{name}_feature_importance.csv"
            )
        rows.append({"model": name, **metrics})

    if stack["lgb"] is None:
        rows.append({
            "model": "lightgbm",
            "mae_wh": math.nan,
            "r2": math.nan,
            "note": f"lightgbm_missing: {stack.get('lgbm_import_error')}",
        })
    return rows


def train_sequence_torch(stack, split: SplitData, y_scaler, args, model_kind: str):
    """Train one PyTorch sequence model and return a single metrics row.

    ``model_kind`` is one of ``"lstm"``, ``"cnn1d"`` or ``"tcn"``. Training
    uses L1 loss with RMSprop and stops after 4 epochs without validation
    improvement; the best validation weights are restored before testing.
    """
    np = stack["np"]
    torch = stack["torch"]
    nn = stack["nn"]
    DataLoader = stack["DataLoader"]
    TensorDataset = stack["TensorDataset"]
    torch.manual_seed(42)
    np.random.seed(42)

    x_train, x_val, x_test, y_train, y_val, y_test = make_lstm_sequences(np, split, args.lookback)
    if len(x_train) < 100 or len(x_val) < 10 or len(x_test) < 10:
        return [{"model": model_kind, "mae_wh": math.nan, "r2": math.nan, "note": "not_enough_sequences", "backend": "torch"}]

    class DecodeLSTM(nn.Module):
        def __init__(self, input_size: int):
            super().__init__()
            self.lstm = nn.LSTM(input_size=input_size, hidden_size=32, batch_first=True)
            self.head = nn.Sequential(
                nn.Linear(32, 5),
                nn.ReLU(),
                nn.Linear(5, 5),
                nn.ReLU(),
                nn.Linear(5, 1),
            )

        def forward(self, x):
            output, _ = self.lstm(x)
            # Only the last time step feeds the regression head.
            return self.head(output[:, -1, :]).squeeze(-1)

    class DecodeCNN1D(nn.Module):
        def __init__(self, input_size: int):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv1d(input_size, 32, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.Conv1d(32, 32, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.AdaptiveAvgPool1d(1),
            )
            self.head = nn.Sequential(
                nn.Flatten(),
                nn.Linear(32, 16),
                nn.ReLU(),
                nn.Linear(16, 1),
            )

        def forward(self, x):
            # Conv1d expects (batch, channels, time).
            x = x.transpose(1, 2)
            return self.head(self.net(x)).squeeze(-1)

    class Chomp1d(nn.Module):
        def __init__(self, chomp_size: int):
            super().__init__()
            self.chomp_size = chomp_size

        def forward(self, x):
            # Trim the trailing padded steps so the convolution stays causal.
            return x[:, :, :-self.chomp_size].contiguous() if self.chomp_size else x

    class DecodeTCN(nn.Module):
        def __init__(self, input_size: int):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv1d(input_size, 32, kernel_size=3, padding=2, dilation=1),
                Chomp1d(2),
                nn.ReLU(),
                nn.Conv1d(32, 32, kernel_size=3, padding=4, dilation=2),
                Chomp1d(4),
                nn.ReLU(),
            )
            self.head = nn.Sequential(
                nn.Linear(32, 16),
                nn.ReLU(),
                nn.Linear(16, 1),
            )

        def forward(self, x):
            x = x.transpose(1, 2)
            output = self.net(x).transpose(1, 2)
            return self.head(output[:, -1, :]).squeeze(-1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_classes = {
        "lstm": DecodeLSTM,
        "cnn1d": DecodeCNN1D,
        "tcn": DecodeTCN,
    }
    model = model_classes[model_kind](input_size=x_train.shape[2]).to(device)
    criterion = nn.L1Loss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3)

    train_ds = TensorDataset(
        torch.tensor(x_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32),
    )
    val_x = torch.tensor(x_val, dtype=torch.float32).to(device)
    val_y = torch.tensor(y_val, dtype=torch.float32).to(device)
    train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True)

    best_state = None
    best_val_loss = math.inf
    patience = 4
    patience_left = patience
    epochs_run = 0

    for epoch in range(args.epochs):
        model.train()
        train_loss = 0.0
        seen = 0
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            optimizer.zero_grad()
            pred = model(batch_x)
            loss = criterion(pred, batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * len(batch_y)
            seen += len(batch_y)

        model.eval()
        with torch.no_grad():
            val_pred = model(val_x)
            val_loss = criterion(val_pred, val_y).item()

        epochs_run = epoch + 1
        print(f"Epoch {epochs_run}/{args.epochs} - loss: {train_loss / max(seen, 1):.6f} - val_loss: {val_loss:.6f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Snapshot on CPU so the best weights survive later updates.
            best_state = {key: value.detach().cpu().clone() for key, value in model.state_dict().items()}
            patience_left = patience
        else:
            patience_left -= 1
            if patience_left <= 0:
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    model.eval()
    test_loader = DataLoader(
        TensorDataset(torch.tensor(x_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32)),
        batch_size=args.batch_size,
        shuffle=False,
    )
    preds = []
    with torch.no_grad():
        for batch_x, _ in test_loader:
            preds.append(model(batch_x.to(device)).detach().cpu().numpy())
    pred = np.concatenate(preds)

    metrics = inverse_metric(
        np,
        y_scaler,
        y_test,
        pred,
        stack["mean_absolute_error"],
        stack["r2_score"],
    )
    metrics["epochs_run"] = epochs_run
    metrics["backend"] = "torch"
    metrics["device"] = str(device)
    return [{"model": model_kind, **metrics}]


def train_lstm_torch(stack, split: SplitData, y_scaler, args):
    """Train the PyTorch LSTM."""
    return train_sequence_torch(stack, split, y_scaler, args, "lstm")


def train_cnn1d_torch(stack, split: SplitData, y_scaler, args):
    """Train the PyTorch 1D CNN."""
    return train_sequence_torch(stack, split, y_scaler, args, "cnn1d")


def train_tcn_torch(stack, split: SplitData, y_scaler, args):
    """Train the PyTorch TCN."""
    return train_sequence_torch(stack, split, y_scaler, args, "tcn")


def train_lstm_keras(stack, split: SplitData, y_scaler, args):
    """Train the LSTM with TensorFlow/Keras and return one metrics row."""
    np = stack["np"]
    keras = stack["keras"]
    tf = stack["tf"]
    tf.random.set_seed(42)
    np.random.seed(42)

    x_train, x_val, x_test, y_train, y_val, y_test = make_lstm_sequences(np, split, args.lookback)
    if len(x_train) < 100 or len(x_val) < 10 or len(x_test) < 10:
        return [{"model": "lstm", "mae_wh": math.nan, "r2": math.nan, "note": "not_enough_sequences", "backend": "tensorflow"}]

    model = keras.Sequential(
        [
            keras.layers.Input(shape=(x_train.shape[1], x_train.shape[2])),
            keras.layers.LSTM(32),
            keras.layers.Dense(5, activation="relu"),
            keras.layers.Dense(5, activation="relu"),
            keras.layers.Dense(1),
        ]
    )
    model.compile(optimizer=keras.optimizers.RMSprop(), loss="mae")
    early_stop = keras.callbacks.EarlyStopping(monitor="val_loss", patience=4, restore_best_weights=True)
    history = model.fit(
        x_train,
        y_train,
        validation_data=(x_val, y_val),
        epochs=args.epochs,
        batch_size=args.batch_size,
        verbose=1,
        callbacks=[early_stop],
    )
    pred = model.predict(x_test, verbose=0).ravel()
    metrics = inverse_metric(
        np,
        y_scaler,
        y_test,
        pred,
        stack["mean_absolute_error"],
        stack["r2_score"],
    )
    metrics["epochs_run"] = len(history.history["loss"])
    metrics["backend"] = "tensorflow"
    return [{"model": "lstm", **metrics}]


def train_lstm(stack, split: SplitData, y_scaler, args):
    """Train the LSTM with PyTorch if available, otherwise with Keras.

    Returns a NaN row with a note when skipped or when no backend exists.
    """
    if args.skip_lstm:
        return [{"model": "lstm", "mae_wh": math.nan, "r2": math.nan, "note": "skipped_by_flag"}]
    if stack["torch"] is not None:
        return train_lstm_torch(stack, split, y_scaler, args)
    if stack["keras"] is not None:
        return train_lstm_keras(stack, split, y_scaler, args)
    note = (
        "lstm_backend_missing: "
        f"torch={stack.get('torch_import_error')}; "
        f"tensorflow={stack.get('tf_import_error')}"
    )
    return [{"model": "lstm", "mae_wh": math.nan, "r2": math.nan, "note": note}]


def train_deep_models(stack, split: SplitData, y_scaler, args):
    """Train the deep models named in ``--dl-models`` and return their rows.

    ``cnn`` is accepted as an alias of ``cnn1d``. ``none`` or ``--skip-lstm``
    skips every deep model. CNN and TCN need PyTorch.
    """
    requested = {item.strip().lower() for item in args.dl_models.split(",") if item.strip()}
    unknown = sorted(requested - {"lstm", "cnn", "cnn1d", "tcn", "none"})
    if unknown:
        warnings.warn(f"Ignoring unknown --dl-models entries: {', '.join(unknown)}")
    if "none" in requested or args.skip_lstm:
        return [{"model": "deep_models", "mae_wh": math.nan, "r2": math.nan, "note": "skipped_by_flag"}]

    rows = []
    if "lstm" in requested:
        rows.extend(train_lstm(stack, split, y_scaler, args))

    torch_missing_note = f"torch_missing: {stack.get('torch_import_error')}"
    if "cnn" in requested:
        requested.add("cnn1d")
    if "cnn1d" in requested:
        if stack["torch"] is not None:
            rows.extend(train_cnn1d_torch(stack, split, y_scaler, args))
        else:
            rows.append({"model": "cnn1d", "mae_wh": math.nan, "r2": math.nan, "note": torch_missing_note})
    if "tcn" in requested:
        if stack["torch"] is not None:
            rows.extend(train_tcn_torch(stack, split, y_scaler, args))
        else:
            rows.append({"model": "tcn", "mae_wh": math.nan, "r2": math.nan, "note": torch_missing_note})
    return rows


def train_arima(stack, clean, args):
    """Fit ARIMA(2,1,2) on train+validation energy and score it on the test rows.

    Returns one row; it is a NaN row with a note when ARIMA is disabled,
    statsmodels is missing, there are too few points, or fitting fails.
    """
    if not args.include_arima:
        return [{"model": "arima", "mae_wh": math.nan, "r2": math.nan, "note": "skipped_enable_with_include_arima"}]
    if stack["ARIMA"] is None:
        return [{
            "model": "arima",
            "mae_wh": math.nan,
            "r2": math.nan,
            "note": f"statsmodels_missing: {stack.get('statsmodels_import_error')}",
        }]

    n = len(clean)
    val_end = int(n * 0.85)
    history = clean["energy_wh"].iloc[:val_end].dropna()
    if args.arima_max_train and len(history) > args.arima_max_train:
        history = history.iloc[-args.arima_max_train:]

    y_true = clean["target"].iloc[val_end:].dropna()
    test_span_steps = test_span_steps_from_args(stack["pd"], args)
    if test_span_steps is not None:
        y_true = y_true.iloc[:test_span_steps]
    if len(history) < 50 or len(y_true) < 10:
        return [{"model": "arima", "mae_wh": math.nan, "r2": math.nan, "note": "not_enough_points"}]

    horizon = horizon_steps_from_args(stack["pd"], args)
    try:
        model = stack["ARIMA"](history, order=(2, 1, 2))
        fitted = model.fit()
        # `target` on test row i is energy_wh `horizon` rows later, while
        # forecast step 1 is the row right after the history. Skipping the
        # first `horizon` steps lines the two up (by row position; rows
        # removed during cleaning are not accounted for).
        forecast = fitted.forecast(steps=horizon + len(y_true))
        forecast = stack["np"].asarray(forecast)[horizon:]
        actual = y_true.to_numpy()
        metrics = {
            "mae_wh": float(stack["mean_absolute_error"](actual, forecast)),
            "r2": float(stack["r2_score"](actual, forecast)),
            "note": f"order=(2,1,2); train_points={len(history)}",
        }
        return [{"model": "arima", **metrics}]
    except Exception as exc:
        # Deliberately broad: a failed ARIMA fit must not abort the other models.
        return [{"model": "arima", "mae_wh": math.nan, "r2": math.nan, "note": f"arima_failed: {type(exc).__name__}: {exc}"}]


def save_dataset(pd, clean, out_dir: Path, target_name: str):
    """Write the train-ready table to ``processed/`` and return its path."""
    out_path = out_dir / "processed" / f"{target_name}_train_ready.csv"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    clean.to_csv(out_path)
    return out_path


def run_target(stack, args, energy_10min, calendar, target_name: str, target_spec: dict, out_dir: Path):
    """Run feature building, splitting and all models for one target.

    Returns the result rows, each annotated with the run settings.
    """
    pd = stack["pd"]
    np = stack["np"]
    horizon_steps = horizon_steps_from_args(pd, args)
    test_span_steps = test_span_steps_from_args(pd, args)
    print(f"\n=== Target: {target_name} ===")
    print(f"Horizon: {horizon_steps} step(s) = {describe_steps(pd, horizon_steps, args.freq)}")
    print(f"Lookback: {args.lookback} step(s) = {describe_steps(pd, args.lookback, args.freq)}")
    if test_span_steps is not None:
        print(f"Test span: {test_span_steps} step(s) = {describe_steps(pd, test_span_steps, args.freq)}")

    df = build_target_frame(
        pd,
        energy_10min,
        calendar,
        target_name,
        target_spec,
        args.freq,
        args.include_weather,
    )
    split, y_scaler, clean = make_ml_split(
        np,
        pd,
        stack["MinMaxScaler"],
        df,
        horizon=horizon_steps,
        max_rows=args.max_rows,
        test_span_steps=test_span_steps,
    )
    dataset_path = save_dataset(pd, clean, out_dir, target_name)

    print(f"Rows after feature engineering: {len(clean):,}")
    print(f"Train/val/test: {len(split.y_train):,}/{len(split.y_val):,}/{len(split.y_test):,}")
    print(f"Saved train-ready table: {dataset_path}")

    rows = []
    rows.extend(train_baselines(stack, split, y_scaler, args, out_dir, target_name))
    rows.extend(train_deep_models(stack, split, y_scaler, args))
    rows.extend(train_arima(stack, clean, args))

    for row in rows:
        row["target"] = target_name
        row["rows"] = len(clean)
        row["features"] = len(split.feature_names)
        row["horizon_steps"] = horizon_steps
        row["horizon"] = describe_steps(pd, horizon_steps, args.freq)
        row["lookback_steps"] = args.lookback
        row["lookback"] = describe_steps(pd, args.lookback, args.freq)
        row["test_span_steps"] = test_span_steps if test_span_steps is not None else len(split.y_test)
        row["test_span"] = describe_steps(pd, row["test_span_steps"], args.freq)
    return rows


def main() -> int:
    """Entry point: train every selected target and write results and config."""
    args = parse_args()
    stack = import_stack()
    pd = stack["pd"]

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "processed").mkdir(exist_ok=True)

    config = PAPER_BUILDINGS if args.mode == "paper_buildings" else METER_TARGETS
    if args.target != "all":
        if args.target not in config:
            raise SystemExit(f"Unknown target {args.target!r}. Available: {', '.join(config)}")
        config = {args.target: config[args.target]}

    print("DECODE re-implementation")
    print(f"Mode: {args.mode}")
    print(f"Frequency: {args.freq}")
    print(f"Python executable: {sys.executable}")
    print(f"PyTorch available: {stack['torch'] is not None}")
    if stack["torch"] is None:
        print(f"PyTorch import error: {stack.get('torch_import_error')}")
    print(f"TensorFlow available: {stack['keras'] is not None}")
    if stack["keras"] is None:
        print(f"TensorFlow import error: {stack.get('tf_import_error')}")
    print(f"LightGBM available: {stack['lgb'] is not None}")
    if stack["lgb"] is None:
        print(f"LightGBM import error: {stack.get('lgbm_import_error')}")
    print(f"Statsmodels ARIMA available: {stack['ARIMA'] is not None}")
    if stack["ARIMA"] is None:
        print(f"Statsmodels import error: {stack.get('statsmodels_import_error')}")
    if not args.skip_lstm:
        if stack["torch"] is not None:
            print("LSTM backend: PyTorch")
        elif stack["keras"] is not None:
            print("LSTM backend: TensorFlow/Keras")
        else:
            print("LSTM backend: unavailable")

    energy_10min = read_energy_10min(pd, args.freq)
    calendar = read_calendar(pd)

    all_rows = []
    for target_name, target_spec in config.items():
        rows = run_target(stack, args, energy_10min, calendar, target_name, target_spec, out_dir)
        all_rows.extend(rows)

    results = pd.DataFrame(all_rows)
    effective_horizon = horizon_steps_from_args(pd, args)
    effective_test_span = test_span_steps_from_args(pd, args)
    result_suffix = f"h{effective_horizon}"
    if effective_test_span is not None:
        result_suffix += f"_ts{effective_test_span}"
    result_path = out_dir / f"results_{args.mode}_{result_suffix}.csv"
    results.to_csv(result_path, index=False)

    metadata = {
        "mode": args.mode,
        "target": args.target,
        "freq": args.freq,
        "lookback": args.lookback,
        "horizon": args.horizon,
        "horizon_days": args.horizon_days,
        "horizon_steps_effective": effective_horizon,
        "test_span_days": args.test_span_days,
        "test_span_steps_effective": effective_test_span,
        "epochs": args.epochs,
        "batch_size": args.batch_size,
        "rf_trees": args.rf_trees,
        "dl_models": args.dl_models,
        "include_arima": args.include_arima,
        "include_weather": args.include_weather,
        "pytorch_available": stack["torch"] is not None,
        "torch_import_error": stack.get("torch_import_error"),
        "lightgbm_available": stack["lgb"] is not None,
        "lightgbm_import_error": stack.get("lgbm_import_error"),
        "statsmodels_available": stack["ARIMA"] is not None,
        "statsmodels_import_error": stack.get("statsmodels_import_error"),
        "tensorflow_available": stack["keras"] is not None,
        "tensorflow_import_error": stack.get("tf_import_error"),
        "energy_file": str(ENERGY_FILE),
        "occupancy_dir": str(OCCUPANCY_DIR),
        "calendar_dir": str(CALENDAR_DIR),
        "weather_file": str(WEATHER_FILE),
    }
    (out_dir / f"run_config_{args.mode}.json").write_text(json.dumps(metadata, indent=2), encoding="utf-8")

    print("\n=== Results ===")
    print(results.sort_values(["target", "mae_wh"]).to_string(index=False))
    print(f"\nSaved results: {result_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())