decode-iblend-code / scripts /decode_reimplementation.py
HoangTrungNguyen's picture
Upload scripts/decode_reimplementation.py with huggingface_hub
c06d939 verified
Raw
History Blame Contribute Delete
38.3 kB
#!/usr/bin/env python3
"""Re-implement the DECODE paper training pipeline for local I-BLEND data.
The paper pipeline is:
1. Fuse energy, occupancy, calendar, and weather-like environmental features.
2. Align everything to a 10-minute sampling rate.
3. Normalize features with Min-Max scaling.
4. Split chronologically into train/validation/test with a 70:15:15 ratio.
5. Compare LSTM with Linear Regression, Decision Tree, and Random Forest.
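This local implementation swaps plain Linear Regression for Ridge regression and
can additionally train LightGBM, 1-D CNN, TCN, and ARIMA models.
It supports two target modes (--mode):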
- paper_buildings: 7 building-level series matching the paper.
- meters: 9 meter-level series from all_buildings_power.csv.
Weather is optional because the local weather file in this workspace starts in
2018, while the energy data ends in 2017. The script detects this and continues.
"""
from __future__ import annotations
import argparse
import os
import json
import math
import sys
import warnings
from dataclasses import dataclass
from pathlib import Path
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Use ROOT as the data root when the occupancy dataset folder sits directly in it;
# otherwise fall back to ROOT's parent directory.
DEFAULT_DATA_MINING_ROOT = ROOT if (ROOT / "IIITD_occupancy_dataset").exists() else ROOT.parent
# The IBLEND_DATA_ROOT environment variable overrides the detected data root.
DATA_MINING_ROOT = Path(os.environ.get("IBLEND_DATA_ROOT", DEFAULT_DATA_MINING_ROOT))
# The IBLEND_ENERGY_FILE environment variable overrides the energy CSV location.
ENERGY_FILE = Path(os.environ.get("IBLEND_ENERGY_FILE", DATA_MINING_ROOT / "energy_dataset" / "all_buildings_power.csv"))
# Per-building occupancy CSVs are read from here as "<code>.csv".
OCCUPANCY_DIR = DATA_MINING_ROOT / "IIITD_occupancy_dataset" / "IIITD_occupancy_dataset"
# Calendar CSVs matching "calender_year_*.csv" are read from here.
CALENDAR_DIR = DATA_MINING_ROOT / "iiitd_calender_schedule" / "iiitd_calender_schedule"
# Optional weather CSV, only used with --include-weather.
WEATHER_FILE = DATA_MINING_ROOT / "weather_comparison" / "weather_comparison" / "IIITD_and_airport_data.csv"
# Default output directory (can be changed with --output-dir).
OUT_DIR = ROOT / "decode_reimplementation_outputs"
# Time zone that epoch timestamps are converted to and naive weather timestamps are localized to.
TZ = "Asia/Kolkata"
# Targets for --mode paper_buildings.
# Each entry maps a target name to:
#   "meters":    energy columns that are summed into the target series
#   "occupancy": code of the occupancy CSV ("<code>.csv" in OCCUPANCY_DIR)
PAPER_BUILDINGS = {
    "Academic": {"meters": ["Academic"], "occupancy": "ACB"},
    "Boys_hostel": {"meters": ["Boys_main", "Boys_backup"], "occupancy": "BH"},
    "Girls_hostel": {"meters": ["Girls_main", "Girls_backup"], "occupancy": "GH"},
    "Library": {"meters": ["Library"], "occupancy": "LB"},
    "Lecture": {"meters": ["Lecture"], "occupancy": "LCB"},
    "Dining": {"meters": ["Mess"], "occupancy": "DB"},
    "Facilities": {"meters": ["Facilities"], "occupancy": "SRB"},
}
# Targets for --mode meters: one entry per individual meter column.
# Same layout as the building-level table: "meters" lists the energy columns
# to sum (a single column here) and "occupancy" is the occupancy CSV code.
# Main and backup hostel meters share their building's occupancy code.
METER_TARGETS = {
    "Academic": {"meters": ["Academic"], "occupancy": "ACB"},
    "Boys_main": {"meters": ["Boys_main"], "occupancy": "BH"},
    "Boys_backup": {"meters": ["Boys_backup"], "occupancy": "BH"},
    "Facilities": {"meters": ["Facilities"], "occupancy": "SRB"},
    "Girls_main": {"meters": ["Girls_main"], "occupancy": "GH"},
    "Girls_backup": {"meters": ["Girls_backup"], "occupancy": "GH"},
    "Lecture": {"meters": ["Lecture"], "occupancy": "LCB"},
    "Library": {"meters": ["Library"], "occupancy": "LB"},
    "Mess": {"meters": ["Mess"], "occupancy": "DB"},
}
@dataclass
class SplitData:
    """Container for the scaled train/validation/test arrays of one target."""

    # Scaled feature matrices for the three chronological partitions.
    x_train: object
    x_val: object
    x_test: object
    # Scaled target vectors aligned row-for-row with the matrices above.
    y_train: object
    y_val: object
    y_test: object
    # Column names of the feature matrices, in column order.
    feature_names: list[str]
    # Number of leading test rows kept when the test split was truncated; None if untruncated.
    test_span_steps: int | None = None
def import_stack():
    """Import the numerical/ML stack and return it as a dictionary.

    numpy, pandas and scikit-learn are mandatory. PyTorch, LightGBM,
    statsmodels and TensorFlow are optional: when one of them fails to import,
    its entries are set to ``None`` and the failure text is stored under the
    matching ``*_import_error`` key.

    TensorFlow is not imported at all when PyTorch loaded successfully, unless
    the environment variable ``DECODE_DISABLE_TENSORFLOW`` is set to something
    other than ``"1"``.

    Returns:
        dict with the keys ``np``, ``pd``, ``RandomForestRegressor``, ``Ridge``,
        ``DecisionTreeRegressor``, ``MinMaxScaler``, ``mean_absolute_error``,
        ``r2_score``, ``torch``, ``nn``, ``DataLoader``, ``TensorDataset``,
        ``torch_import_error``, ``lgb``, ``lgbm_import_error``, ``ARIMA``,
        ``statsmodels_import_error``, ``tf``, ``keras``, ``tf_import_error``.

    Raises:
        SystemExit: if a mandatory package cannot be imported; the message
            names the missing module and shows the pip commands to install it.
    """

    def _describe(error: BaseException) -> str:
        # Compact "<ExceptionType>: <message>" text kept for diagnostics.
        return f"{type(error).__name__}: {error}"

    try:
        import numpy as np
        import pandas as pd
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.linear_model import Ridge
        from sklearn.metrics import mean_absolute_error, r2_score
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.tree import DecisionTreeRegressor
    except ImportError as exc:
        # ImportError.name holds the module that failed to import. Parsing the
        # message text breaks for messages that do not contain
        # "No module named" (e.g. "cannot import name ..."), so only fall back
        # to the raw message when the attribute is not populated.
        missing = exc.name or str(exc)
        raise SystemExit(
            f"Missing dependency: {missing}\n"
            "Install the base training stack with:\n"
            f" {sys.executable} -m pip install pandas numpy scikit-learn\n"
            "Install LSTM support with either PyTorch or TensorFlow:\n"
            f" {sys.executable} -m pip install torch\n"
            f" {sys.executable} -m pip install tensorflow\n"
        ) from exc

    # Optional back ends: any failure (not only ImportError) is recorded
    # instead of raised, so the script can continue without them.
    torch_import_error = None
    try:
        import torch
        from torch import nn
        from torch.utils.data import DataLoader, TensorDataset
    except Exception as exc:
        torch = None
        nn = None
        DataLoader = None
        TensorDataset = None
        torch_import_error = _describe(exc)

    lgbm_import_error = None
    try:
        import lightgbm as lgb
    except Exception as exc:
        lgb = None
        lgbm_import_error = _describe(exc)

    statsmodels_import_error = None
    try:
        from statsmodels.tsa.arima.model import ARIMA
    except Exception as exc:
        ARIMA = None
        statsmodels_import_error = _describe(exc)

    tf = None
    keras = None
    tf_import_error = None
    disable_tf_when_torch_available = os.environ.get("DECODE_DISABLE_TENSORFLOW", "1") == "1"
    if disable_tf_when_torch_available and torch is not None:
        tf_import_error = "disabled because PyTorch is available"
    else:
        try:
            import tensorflow as tf
            from tensorflow import keras
        except Exception as exc:
            tf = None
            keras = None
            tf_import_error = _describe(exc)

    return {
        "np": np,
        "pd": pd,
        "RandomForestRegressor": RandomForestRegressor,
        "Ridge": Ridge,
        "DecisionTreeRegressor": DecisionTreeRegressor,
        "MinMaxScaler": MinMaxScaler,
        "mean_absolute_error": mean_absolute_error,
        "r2_score": r2_score,
        "torch": torch,
        "nn": nn,
        "DataLoader": DataLoader,
        "TensorDataset": TensorDataset,
        "torch_import_error": torch_import_error,
        "lgb": lgb,
        "lgbm_import_error": lgbm_import_error,
        "ARIMA": ARIMA,
        "statsmodels_import_error": statsmodels_import_error,
        "tf": tf,
        "keras": keras,
        "tf_import_error": tf_import_error,
    }
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the training script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        The populated ``argparse.Namespace``.

    Raises:
        SystemExit: raised by argparse on invalid options, including the
            positivity checks on --lookback, --batch-size and --rf-trees.
    """
    parser = argparse.ArgumentParser(description="DECODE paper re-implementation for I-BLEND data.")
    parser.add_argument(
        "--mode",
        choices=["paper_buildings", "meters"],
        default="paper_buildings",
        help="paper_buildings trains 7 building-level targets; meters trains 9 meter-level targets.",
    )
    parser.add_argument(
        "--target",
        default="all",
        help="Target name to train, or 'all'. Names depend on --mode.",
    )
    parser.add_argument("--freq", default="10min", help="Common sampling frequency. Paper uses 10min.")
    parser.add_argument("--lookback", type=int, default=18, help="LSTM lookback steps. 18 at 10min = 3 hours.")
    parser.add_argument("--horizon", type=int, default=1, help="Prediction horizon in rows. 1 at 10min = next 10min.")
    parser.add_argument("--horizon-days", type=float, default=0, help="Prediction horizon in days. Overrides --horizon when > 0.")
    parser.add_argument("--test-span-days", type=float, default=0, help="Evaluate only this many days from the chronological test split.")
    parser.add_argument("--epochs", type=int, default=20, help="LSTM epochs. Paper uses 20.")
    parser.add_argument("--batch-size", type=int, default=64, help="LSTM batch size. Paper uses 64.")
    parser.add_argument("--rf-trees", type=int, default=500, help="Random Forest trees. Paper tuned to 500.")
    parser.add_argument("--dl-models", default="lstm,cnn,tcn", help="Comma-separated deep models: lstm,cnn,tcn,none.")
    parser.add_argument("--include-arima", action="store_true", help="Fit ARIMA(2,1,2). This can be slow on full series.")
    parser.add_argument("--arima-max-train", type=int, default=20000, help="Maximum recent train points for ARIMA fitting.")
    parser.add_argument("--max-rows", type=int, default=0, help="Optional cap after preprocessing for quick smoke tests.")
    parser.add_argument("--include-weather", action="store_true", help="Try to merge local weather data if time ranges overlap.")
    parser.add_argument("--skip-lstm", action="store_true", help="Only train baseline ML models.")
    parser.add_argument("--output-dir", default=str(OUT_DIR), help="Output directory.")
    args = parser.parse_args(argv)

    # Fail fast on values that would otherwise yield empty sequence windows
    # or crash later, deep inside model construction or data loading.
    for flag, value in (
        ("--lookback", args.lookback),
        ("--batch-size", args.batch_size),
        ("--rf-trees", args.rf_trees),
    ):
        if value < 1:
            parser.error(f"{flag} must be a positive integer (got {value})")
    return args
def read_energy_10min(pd, freq: str):
    """Load the energy CSV and convert it to per-interval energy in Wh.

    Args:
        pd: the pandas module.
        freq: resampling frequency string (e.g. "10min").

    Returns:
        DataFrame indexed by TZ-localized timestamps with one column per
        meter, holding mean power per interval converted to Wh.

    Raises:
        FileNotFoundError: if ENERGY_FILE does not exist.
    """
    if not ENERGY_FILE.exists():
        raise FileNotFoundError(f"Missing energy file: {ENERGY_FILE}")

    raw = pd.read_csv(ENERGY_FILE, na_values=["NA", ""])
    # The "timestamp" column is interpreted as epoch seconds (UTC).
    raw["datetime"] = pd.to_datetime(raw["timestamp"], unit="s", utc=True).dt.tz_convert(TZ)
    power_w = raw.drop(columns=["timestamp"]).set_index("datetime").sort_index()

    # Per the original author's note: the paper predicts energy in Wh while the
    # source columns are power in W at 1-minute resolution, so for one interval
    # Wh = mean(W) * interval_minutes / 60.
    minutes_per_interval = pd.Timedelta(freq).total_seconds() / 60
    return power_w.resample(freq).mean() * minutes_per_interval / 60
def read_occupancy_10min(pd, code: str, freq: str):
    """Load one building's occupancy CSV and resample it to ``freq``.

    Args:
        pd: the pandas module.
        code: occupancy file code; the file read is ``<code>.csv`` in OCCUPANCY_DIR.
        freq: resampling frequency string.

    Returns:
        Resampled DataFrame whose ``occupancy_count`` gaps are filled by time
        interpolation followed by forward and backward fill, or ``None``
        (after a warning) when the file does not exist.
    """
    csv_path = OCCUPANCY_DIR / f"{code}.csv"
    if csv_path.exists():
        raw = pd.read_csv(csv_path, na_values=["NA", ""])
        # The "timestamp" column is interpreted as epoch seconds (UTC).
        raw["datetime"] = pd.to_datetime(raw["timestamp"], unit="s", utc=True).dt.tz_convert(TZ)
        indexed = raw.drop(columns=["timestamp"]).set_index("datetime").sort_index()
        resampled = indexed.resample(freq).mean()
        counts = resampled["occupancy_count"].interpolate(method="time")
        resampled["occupancy_count"] = counts.ffill().bfill()
        return resampled

    warnings.warn(f"Missing occupancy file for {code}: {csv_path}")
    return None
def read_calendar(pd):
    """Load and combine the yearly calendar CSVs from CALENDAR_DIR.

    Args:
        pd: the pandas module.

    Returns:
        DataFrame with one row per date and the columns ``date``,
        ``working_day`` (int, unparsable values become 0), ``activity``
        (str, missing values become "unknown") and ``activity_code``
        (category codes of ``activity``); ``None`` (after a warning) when no
        file matches ``calender_year_*.csv``.
    """
    yearly = [pd.read_csv(csv_path) for csv_path in sorted(CALENDAR_DIR.glob("calender_year_*.csv"))]
    if not yearly:
        warnings.warn(f"No calendar files found in {CALENDAR_DIR}")
        return None

    combined = pd.concat(yearly, ignore_index=True)
    combined["date"] = pd.to_datetime(combined["Date"]).dt.date
    # Keep the first row seen for each calendar date.
    calendar = combined[["date", "working_day", "activity"]].drop_duplicates("date")
    working = pd.to_numeric(calendar["working_day"], errors="coerce")
    calendar["working_day"] = working.fillna(0).astype(int)
    calendar["activity"] = calendar["activity"].fillna("unknown").astype(str)
    calendar["activity_code"] = calendar["activity"].astype("category").cat.codes
    return calendar
def read_weather_10min(pd, freq: str):
    """Load the optional weather CSV and resample it to ``freq``.

    The first CSV column is treated as the timestamp column; rows whose
    timestamp cannot be parsed are dropped.

    Args:
        pd: the pandas module.
        freq: resampling frequency string.

    Returns:
        DataFrame of numeric weather columns indexed by timestamps in TZ, with
        gaps filled by time interpolation followed by forward and backward
        fill; ``None`` (after a warning) when WEATHER_FILE does not exist.
    """
    if not WEATHER_FILE.exists():
        warnings.warn(f"Missing weather file: {WEATHER_FILE}")
        return None

    df = pd.read_csv(WEATHER_FILE, na_values=["NA", ""])
    df = df.rename(columns={df.columns[0]: "datetime"})
    df["datetime"] = pd.to_datetime(df["datetime"], errors="coerce")
    df = df.dropna(subset=["datetime"]).set_index("datetime").sort_index()

    if df.index.tz is None:
        df.index = df.index.tz_localize(TZ)
    else:
        # Timestamps that already carry an offset must be expressed in the
        # same zone as the energy index; otherwise the join yields an index in
        # a different zone and the hour/day features are derived from the
        # wrong local time.
        df.index = df.index.tz_convert(TZ)

    # Averaging only makes sense for numeric columns, and recent pandas
    # versions raise when asked to average text columns.
    numeric = df.select_dtypes(include="number")
    return numeric.resample(freq).mean().interpolate(method="time").ffill().bfill()
def horizon_steps_from_args(pd, args) -> int:
    """Return the effective prediction horizon in rows (always at least 1).

    A positive ``args.horizon_days`` takes precedence and is converted to rows
    of ``args.freq``; otherwise ``args.horizon`` is used.
    """
    days = args.horizon_days
    if not (days and days > 0):
        return max(1, args.horizon)
    rows_per_span = pd.Timedelta(days=days) / pd.Timedelta(args.freq)
    return max(1, int(round(rows_per_span)))
def test_span_steps_from_args(pd, args) -> int | None:
    """Return the requested test span in rows, or ``None`` when not requested.

    Args:
        pd: the pandas module.
        args: namespace providing ``test_span_days`` and ``freq``.

    Returns:
        ``args.test_span_days`` converted to rows of ``args.freq`` (at least
        1) when it is positive; otherwise ``None``, meaning the whole test
        split is evaluated.
    """
    if args.test_span_days and args.test_span_days > 0:
        freq_delta = pd.Timedelta(args.freq)
        steps = int(round(pd.Timedelta(days=args.test_span_days) / freq_delta))
        return max(1, steps)
    return None


# The name starts with "test_", so pytest would collect this helper as a test
# (and fail on the unknown "pd"/"args" fixtures) whenever it is imported into
# a test module. Opt it out of collection explicitly.
test_span_steps_from_args.__test__ = False
def describe_steps(pd, steps: int, freq: str) -> str:
    """Render ``steps`` rows of ``freq`` as a human-readable duration.

    Whole days are preferred, then whole hours, otherwise minutes.
    """
    total_minutes = (pd.Timedelta(freq) * steps).total_seconds() / 60
    for unit_minutes, label in ((1440, "day(s)"), (60, "hour(s)")):
        count, remainder = divmod(total_minutes, unit_minutes)
        if remainder == 0:
            return f"{int(count)} {label}"
    return f"{total_minutes:g} minute(s)"
def add_time_features(pd, df):
    """Return a copy of ``df`` with calendar and cyclic time features added.

    The frame must be indexed by a DatetimeIndex. Added columns: ``hour``,
    ``day_of_week``, ``month``, ``time_slot`` (minutes since midnight) and
    sine/cosine encodings of hour, day of week and month.
    """
    out = df.copy()
    stamps = out.index
    out["hour"] = stamps.hour
    out["day_of_week"] = stamps.dayofweek
    out["month"] = stamps.month
    out["time_slot"] = stamps.hour * 60 + stamps.minute

    # (output prefix, source column, cycle length)
    cycles = (
        ("hour", "hour", 24),
        ("dow", "day_of_week", 7),
        ("month", "month", 12),
    )
    for prefix, source, period in cycles:
        angle = 2 * math.pi * out[source] / period
        out[f"{prefix}_sin"] = angle.map(math.sin)
        out[f"{prefix}_cos"] = angle.map(math.cos)
    return out
def add_historical_features(df):
    """Return a copy of ``df`` with lagged and rolling energy features added.

    Requires the columns ``energy_wh``, ``working_day`` and ``time_slot``.
    All offsets are counted in rows; at the default 10-minute frequency
    6 rows = 1 hour, 144 rows = 1 day and 1008 rows = 1 week.
    """
    out = df.copy()
    energy = out["energy_wh"]

    for lag in (1, 6, 144, 1008):
        out[f"energy_lag_{lag}"] = energy.shift(lag)

    # Rolling statistics are computed on the series shifted by one row, so the
    # window never contains the current row's own value.
    previous = energy.shift(1)
    out["rolling_mean_6"] = previous.rolling(6).mean()
    out["rolling_mean_144"] = previous.rolling(144).mean()
    out["rolling_std_144"] = previous.rolling(144).std()

    # DECODE-style baseline features: the three previous occurrences with the
    # same working-day class and the same time of day.
    by_day_type = out.groupby(["working_day", "time_slot"], sort=False)["energy_wh"]
    for back in (1, 2, 3):
        out[f"same_day_type_lag_{back}"] = by_day_type.shift(back)
    return out
def build_target_frame(pd, energy_10min, calendar, target_name: str, target_spec: dict, freq: str, include_weather: bool):
    """Assemble the feature frame for one target.

    Sums the target's meter columns into ``energy_wh``, joins occupancy,
    calendar and (optionally) weather data, fills gaps in the auxiliary
    columns, then adds time and historical features.

    Args:
        pd: the pandas module.
        energy_10min: per-interval energy frame with one column per meter.
        calendar: calendar frame from ``read_calendar`` or ``None``.
        target_name: name used in error messages.
        target_spec: dict with ``"meters"`` (columns to sum) and
            ``"occupancy"`` (occupancy file code).
        freq: resampling frequency string.
        include_weather: whether to try joining the weather data.

    Returns:
        DataFrame indexed like ``energy_10min`` (restricted to the weather
        overlap when weather was joined successfully).

    Raises:
        KeyError: if ``target_spec`` names a meter column that is missing
            from ``energy_10min``.
    """
    missing_meters = [m for m in target_spec["meters"] if m not in energy_10min.columns]
    if missing_meters:
        raise KeyError(f"{target_name} references missing meter columns: {missing_meters}")

    df = pd.DataFrame(index=energy_10min.index)
    # min_count=1 keeps an interval NaN when every contributing meter is NaN.
    df["energy_wh"] = energy_10min[target_spec["meters"]].sum(axis=1, min_count=1)

    occ = read_occupancy_10min(pd, target_spec["occupancy"], freq)
    if occ is not None:
        df = df.join(occ[["occupancy_count"]], how="left")
    else:
        df["occupancy_count"] = math.nan

    df["date"] = df.index.date
    if calendar is not None:
        df = df.reset_index().merge(calendar, on="date", how="left").set_index("datetime").sort_index()
    else:
        df["working_day"] = (df.index.dayofweek < 5).astype(int)
        df["activity"] = "unknown"
        df["activity_code"] = 0

    if include_weather:
        weather = read_weather_10min(pd, freq)
        if weather is not None:
            # Join into a separate frame so the weather-free frame built above
            # survives unchanged when there is no overlap. Rebuilding it in
            # the fallback path used to lose the occupancy column when the
            # occupancy file was missing and failed when no calendar existed.
            joined = df.join(weather, how="inner")
            if joined.empty:
                warnings.warn(
                    "Weather data has no overlap with this target after joining. "
                    "Continuing without weather features."
                )
            else:
                if len(joined) < len(df):
                    warnings.warn(f"Weather join reduced rows from {len(df)} to {len(joined)}.")
                df = joined

    occupancy = df["occupancy_count"].interpolate(method="time").ffill().bfill()
    if len(occupancy) and occupancy.isna().all():
        # With no occupancy value at all, nothing can be interpolated and the
        # NaN column would make the later dropna() discard every row. Use a
        # constant so training can continue without occupancy information.
        warnings.warn(
            f"No occupancy data available for {target_name}; using a constant 0 occupancy feature."
        )
        occupancy = occupancy.fillna(0.0)
    df["occupancy_count"] = occupancy

    # Dates missing from the calendar fall back to a Monday-Friday rule.
    fallback_working_day = pd.Series((df.index.dayofweek < 5).astype(int), index=df.index)
    df["working_day"] = df["working_day"].fillna(fallback_working_day).astype(int)
    df["activity"] = df["activity"].fillna("unknown").astype(str)
    df["activity_code"] = df["activity_code"].fillna(0).astype(int)

    df = add_time_features(pd, df)
    df = add_historical_features(df)
    return df
def make_ml_split(
    np,
    pd,
    MinMaxScaler,
    df,
    horizon: int,
    max_rows: int,
    test_span_steps: int | None = None,
) -> tuple[SplitData, object, object]:
    """Build the scaled chronological 70/15/15 split for one target frame.

    Args:
        np: the numpy module.
        pd: the pandas module.
        MinMaxScaler: scaler class; one instance is fitted on the training
            features and one on the training targets.
        df: feature frame containing ``energy_wh``.
        horizon: number of rows the target lies ahead of the features.
        max_rows: when non-zero, keep only this many trailing clean rows.
        test_span_steps: when given, keep only this many leading test rows.

    Returns:
        ``(split, y_scaler, clean)``: the SplitData container, the fitted
        target scaler and the cleaned unscaled table (features plus target).

    Raises:
        ValueError: if fewer than 100 clean rows remain.
    """
    frame = df.copy()
    frame["target"] = frame["energy_wh"].shift(-horizon)

    excluded = {"target", "date", "activity"}
    feature_names = [
        column
        for column in frame.columns
        if column not in excluded and pd.api.types.is_numeric_dtype(frame[column])
    ]

    # Treat infinities as missing and drop every incomplete row.
    clean = frame[[*feature_names, "target"]].replace([np.inf, -np.inf], np.nan).dropna()
    if max_rows and len(clean) > max_rows:
        clean = clean.tail(max_rows)

    total = len(clean)
    if total < 100:
        raise ValueError(f"Not enough clean rows after preprocessing: {total}")

    train_end, val_end = int(total * 0.70), int(total * 0.85)
    train_part = slice(None, train_end)
    later_parts = (slice(train_end, val_end), slice(val_end, None))

    features = clean[feature_names]
    targets = clean[["target"]]
    x_scaler = MinMaxScaler()
    y_scaler = MinMaxScaler()

    # Scalers are fitted on the training partition only and then applied to
    # the validation and test partitions.
    x_train = x_scaler.fit_transform(features.iloc[train_part])
    x_val, x_test = (x_scaler.transform(features.iloc[part]) for part in later_parts)
    y_train = y_scaler.fit_transform(targets.iloc[train_part]).ravel()
    y_val, y_test = (y_scaler.transform(targets.iloc[part]).ravel() for part in later_parts)

    if test_span_steps is not None:
        x_test = x_test[:test_span_steps]
        y_test = y_test[:test_span_steps]

    split = SplitData(
        x_train=x_train,
        x_val=x_val,
        x_test=x_test,
        y_train=y_train,
        y_val=y_val,
        y_test=y_test,
        feature_names=feature_names,
        test_span_steps=test_span_steps,
    )
    return split, y_scaler, clean
def make_lstm_sequences(np, split: SplitData, lookback: int):
    """Turn the flat split into sliding windows for the sequence models.

    The three partitions are concatenated, windows of ``lookback`` consecutive
    rows are cut, and each window is assigned to the partition that contains
    its last row.

    Returns:
        ``(x_train, x_val, x_test, y_train, y_val, y_test)`` where each ``x``
        holds windows of ``lookback`` rows and each ``y`` holds the target of
        the window's last row.
    """
    features = np.vstack([split.x_train, split.x_val, split.x_test])
    targets = np.concatenate([split.y_train, split.y_val, split.y_test])
    train_stop = len(split.y_train)
    val_stop = train_stop + len(split.y_val)

    # Each window ends at row `last` and predicts that row's target. The
    # target was already shifted by --horizon during feature engineering, so
    # leaving row `last` out of the window would make the sequence models
    # forecast one step farther than the baseline models.
    last_rows = range(lookback - 1, len(targets))
    windows = np.asarray([features[last - lookback + 1:last + 1] for last in last_rows])
    labels = np.asarray([targets[last] for last in last_rows])
    ends = np.asarray(list(last_rows))

    in_train = ends < train_stop
    in_val = (ends >= train_stop) & (ends < val_stop)
    in_test = ends >= val_stop
    return (
        windows[in_train],
        windows[in_val],
        windows[in_test],
        labels[in_train],
        labels[in_val],
        labels[in_test],
    )
def inverse_metric(np, y_scaler, y_true_scaled, y_pred_scaled, mean_absolute_error, r2_score):
    """Score predictions after mapping them back to the original target scale.

    Args:
        np: the numpy module.
        y_scaler: fitted scaler exposing ``inverse_transform``.
        y_true_scaled: scaled ground-truth values.
        y_pred_scaled: scaled predictions.
        mean_absolute_error: callable ``(y_true, y_pred) -> number``.
        r2_score: callable ``(y_true, y_pred) -> number``.

    Returns:
        dict with the float entries ``mae_wh`` and ``r2``.
    """

    def _unscale(values):
        # The scaler expects a single column, so reshape before inverting.
        column = np.asarray(values).reshape(-1, 1)
        return y_scaler.inverse_transform(column).ravel()

    actual = _unscale(y_true_scaled)
    predicted = _unscale(y_pred_scaled)
    mae = float(mean_absolute_error(actual, predicted))
    r2 = float(r2_score(actual, predicted))
    return {"mae_wh": mae, "r2": r2}
def save_feature_importance(pd, out_dir: Path, target_name: str, model_name: str, feature_names: list[str], importances):
    """Write a feature-importance table, most important feature first.

    The CSV is stored as
    ``<out_dir>/feature_importance/<target_name>_<model_name>_feature_importance.csv``;
    the directory is created when needed.
    """
    destination_dir = out_dir / "feature_importance"
    destination_dir.mkdir(parents=True, exist_ok=True)
    ranked = pd.DataFrame(
        {"feature": feature_names, "importance": importances}
    ).sort_values("importance", ascending=False)
    ranked.to_csv(destination_dir / f"{target_name}_{model_name}_feature_importance.csv", index=False)
def train_baselines(stack, split: SplitData, y_scaler, args, out_dir: Path, target_name: str):
    """Fit the tabular baseline models and score them on the test partition.

    Ridge regression, a decision tree and a random forest are always trained;
    LightGBM is trained when it was imported, and its feature importances are
    saved to CSV. When LightGBM is unavailable a placeholder row with NaN
    metrics is appended instead.

    Returns:
        list of result dicts, one per model, each containing ``model``,
        ``mae_wh`` and ``r2`` (plus ``feature_importance_path`` or ``note``
        where applicable).
    """
    np = stack["np"]
    pd = stack["pd"]
    lgb = stack["lgb"]

    estimators = [
        ("ridge_regression", stack["Ridge"](alpha=1.0)),
        (
            "decision_tree",
            stack["DecisionTreeRegressor"](max_depth=14, min_samples_split=20, random_state=42),
        ),
        (
            "random_forest",
            stack["RandomForestRegressor"](
                n_estimators=args.rf_trees,
                random_state=42,
                n_jobs=-1,
                min_samples_split=2,
            ),
        ),
    ]
    if lgb is not None:
        estimators.append(
            (
                "lightgbm",
                lgb.LGBMRegressor(
                    n_estimators=500,
                    learning_rate=0.03,
                    num_leaves=31,
                    subsample=0.9,
                    colsample_bytree=0.9,
                    random_state=42,
                    n_jobs=-1,
                    verbose=-1,
                ),
            )
        )

    results = []
    for label, estimator in estimators:
        estimator.fit(split.x_train, split.y_train)
        scores = inverse_metric(
            np,
            y_scaler,
            split.y_test,
            estimator.predict(split.x_test),
            stack["mean_absolute_error"],
            stack["r2_score"],
        )
        if label == "lightgbm" and hasattr(estimator, "feature_importances_"):
            save_feature_importance(
                pd, out_dir, target_name, label, split.feature_names, estimator.feature_importances_
            )
            scores["feature_importance_path"] = str(
                out_dir / "feature_importance" / f"{target_name}_{label}_feature_importance.csv"
            )
        results.append({"model": label, **scores})

    if lgb is None:
        # Keep a LightGBM row in the results so the gap is visible.
        results.append({
            "model": "lightgbm",
            "mae_wh": math.nan,
            "r2": math.nan,
            "note": f"lightgbm_missing: {stack.get('lgbm_import_error')}",
        })
    return results
def train_sequence_torch(stack, split: SplitData, y_scaler, args, model_kind: str):
    """Train one PyTorch sequence model and score it on the test windows.

    Args:
        stack: dictionary from ``import_stack``; the torch entries must be
            available.
        split: scaled split; converted to sliding windows of ``args.lookback``
            rows via ``make_lstm_sequences``.
        y_scaler: fitted target scaler used to report metrics on the original
            scale.
        args: namespace providing ``lookback``, ``batch_size`` and ``epochs``.
        model_kind: ``"lstm"``, ``"cnn1d"`` or ``"tcn"``.

    Returns:
        One-element list with the result dict (``model``, ``mae_wh``, ``r2``,
        ``epochs_run``, ``backend``, ``device``), or a NaN placeholder row
        with note ``not_enough_sequences`` when a partition is too small.
    """
    np = stack["np"]
    torch = stack["torch"]
    nn = stack["nn"]
    DataLoader = stack["DataLoader"]
    TensorDataset = stack["TensorDataset"]
    # Fixed seeds for the torch and numpy random generators.
    torch.manual_seed(42)
    np.random.seed(42)
    x_train, x_val, x_test, y_train, y_val, y_test = make_lstm_sequences(np, split, args.lookback)
    # Refuse to train when any partition has too few windows.
    if len(x_train) < 100 or len(x_val) < 10 or len(x_test) < 10:
        return [{"model": model_kind, "mae_wh": math.nan, "r2": math.nan, "note": "not_enough_sequences", "backend": "torch"}]

    class DecodeLSTM(nn.Module):
        """Single LSTM layer (32 units) followed by a 5-5-1 dense head."""

        def __init__(self, input_size: int):
            super().__init__()
            self.lstm = nn.LSTM(input_size=input_size, hidden_size=32, batch_first=True)
            self.head = nn.Sequential(
                nn.Linear(32, 5),
                nn.ReLU(),
                nn.Linear(5, 5),
                nn.ReLU(),
                nn.Linear(5, 1),
            )

        def forward(self, x):
            output, _ = self.lstm(x)
            # Only the output of the last time step feeds the head.
            return self.head(output[:, -1, :]).squeeze(-1)

    class DecodeCNN1D(nn.Module):
        """Two 1-D convolutions, average pooling over time, then a dense head."""

        def __init__(self, input_size: int):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv1d(input_size, 32, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.Conv1d(32, 32, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.AdaptiveAvgPool1d(1),
            )
            self.head = nn.Sequential(
                nn.Flatten(),
                nn.Linear(32, 16),
                nn.ReLU(),
                nn.Linear(16, 1),
            )

        def forward(self, x):
            # Conv1d convolves over the last axis, so swap the last two axes first.
            x = x.transpose(1, 2)
            return self.head(self.net(x)).squeeze(-1)

    class Chomp1d(nn.Module):
        """Drop the trailing ``chomp_size`` positions of the last axis."""

        def __init__(self, chomp_size: int):
            super().__init__()
            self.chomp_size = chomp_size

        def forward(self, x):
            return x[:, :, :-self.chomp_size].contiguous() if self.chomp_size else x

    class DecodeTCN(nn.Module):
        """Two dilated convolutions, each trimmed by Chomp1d, then a dense head."""

        def __init__(self, input_size: int):
            super().__init__()
            # Each Chomp1d removes as many trailing positions as the padding
            # added on one side of the preceding convolution.
            self.net = nn.Sequential(
                nn.Conv1d(input_size, 32, kernel_size=3, padding=2, dilation=1),
                Chomp1d(2),
                nn.ReLU(),
                nn.Conv1d(32, 32, kernel_size=3, padding=4, dilation=2),
                Chomp1d(4),
                nn.ReLU(),
            )
            self.head = nn.Sequential(
                nn.Linear(32, 16),
                nn.ReLU(),
                nn.Linear(16, 1),
            )

        def forward(self, x):
            x = x.transpose(1, 2)
            output = self.net(x).transpose(1, 2)
            # Only the last time step feeds the head.
            return self.head(output[:, -1, :]).squeeze(-1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_classes = {
        "lstm": DecodeLSTM,
        "cnn1d": DecodeCNN1D,
        "tcn": DecodeTCN,
    }
    # An unknown model_kind raises KeyError here.
    model = model_classes[model_kind](input_size=x_train.shape[2]).to(device)
    criterion = nn.L1Loss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3)
    train_ds = TensorDataset(
        torch.tensor(x_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32),
    )
    # The whole validation set is moved to the device and scored in one pass.
    val_x = torch.tensor(x_val, dtype=torch.float32).to(device)
    val_y = torch.tensor(y_val, dtype=torch.float32).to(device)
    train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True)
    # Early stopping: remember the best weights and stop after `patience`
    # consecutive epochs without a lower validation loss.
    best_state = None
    best_val_loss = math.inf
    patience = 4
    patience_left = patience
    epochs_run = 0
    for epoch in range(args.epochs):
        model.train()
        train_loss = 0.0
        seen = 0
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            optimizer.zero_grad()
            pred = model(batch_x)
            loss = criterion(pred, batch_y)
            loss.backward()
            optimizer.step()
            # Weight each batch loss by its size to report a per-sample mean.
            train_loss += loss.item() * len(batch_y)
            seen += len(batch_y)
        model.eval()
        with torch.no_grad():
            val_pred = model(val_x)
            val_loss = criterion(val_pred, val_y).item()
        epochs_run = epoch + 1
        print(f"Epoch {epochs_run}/{args.epochs} - loss: {train_loss / max(seen, 1):.6f} - val_loss: {val_loss:.6f}")
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Keep a detached CPU copy so later updates cannot alter it.
            best_state = {key: value.detach().cpu().clone() for key, value in model.state_dict().items()}
            patience_left = patience
        else:
            patience_left -= 1
            if patience_left <= 0:
                break
    # Restore the weights of the best validation epoch before testing.
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    test_loader = DataLoader(
        TensorDataset(torch.tensor(x_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32)),
        batch_size=args.batch_size,
        shuffle=False,
    )
    preds = []
    with torch.no_grad():
        for batch_x, _ in test_loader:
            preds.append(model(batch_x.to(device)).detach().cpu().numpy())
    pred = np.concatenate(preds)
    metrics = inverse_metric(
        np,
        y_scaler,
        y_test,
        pred,
        stack["mean_absolute_error"],
        stack["r2_score"],
    )
    metrics["epochs_run"] = epochs_run
    metrics["backend"] = "torch"
    metrics["device"] = str(device)
    return [{"model": model_kind, **metrics}]
def train_lstm_torch(stack, split: SplitData, y_scaler, args):
    """Train the PyTorch LSTM; thin wrapper around ``train_sequence_torch``."""
    return train_sequence_torch(stack, split, y_scaler, args, "lstm")
def train_cnn1d_torch(stack, split: SplitData, y_scaler, args):
    """Train the PyTorch 1-D CNN; thin wrapper around ``train_sequence_torch``."""
    return train_sequence_torch(stack, split, y_scaler, args, "cnn1d")
def train_tcn_torch(stack, split: SplitData, y_scaler, args):
    """Train the PyTorch TCN; thin wrapper around ``train_sequence_torch``."""
    return train_sequence_torch(stack, split, y_scaler, args, "tcn")
def train_lstm_keras(stack, split: SplitData, y_scaler, args):
    """Train the LSTM with TensorFlow/Keras and score it on the test windows.

    Used only as the fallback backend when PyTorch is unavailable.

    Returns:
        One-element list with the result dict (``model``, ``mae_wh``, ``r2``,
        ``epochs_run``, ``backend``), or a NaN placeholder row with note
        ``not_enough_sequences`` when a partition is too small.
    """
    np = stack["np"]
    keras = stack["keras"]
    tf = stack["tf"]
    tf.random.set_seed(42)
    np.random.seed(42)

    x_train, x_val, x_test, y_train, y_val, y_test = make_lstm_sequences(np, split, args.lookback)
    too_small = len(x_train) < 100 or len(x_val) < 10 or len(x_test) < 10
    if too_small:
        return [{"model": "lstm", "mae_wh": math.nan, "r2": math.nan, "note": "not_enough_sequences", "backend": "tensorflow"}]

    layers = keras.layers
    # LSTM with 32 units followed by a 5-5-1 dense head.
    network = keras.Sequential(
        [
            layers.Input(shape=(x_train.shape[1], x_train.shape[2])),
            layers.LSTM(32),
            layers.Dense(5, activation="relu"),
            layers.Dense(5, activation="relu"),
            layers.Dense(1),
        ]
    )
    network.compile(optimizer=keras.optimizers.RMSprop(), loss="mae")

    # Stop after 4 epochs without a better validation loss and restore the
    # best weights.
    stopper = keras.callbacks.EarlyStopping(monitor="val_loss", patience=4, restore_best_weights=True)
    fit_log = network.fit(
        x_train,
        y_train,
        validation_data=(x_val, y_val),
        epochs=args.epochs,
        batch_size=args.batch_size,
        verbose=1,
        callbacks=[stopper],
    )

    predicted = network.predict(x_test, verbose=0).ravel()
    scores = inverse_metric(
        np,
        y_scaler,
        y_test,
        predicted,
        stack["mean_absolute_error"],
        stack["r2_score"],
    )
    scores["epochs_run"] = len(fit_log.history["loss"])
    scores["backend"] = "tensorflow"
    return [{"model": "lstm", **scores}]
def train_lstm(stack, split: SplitData, y_scaler, args):
    """Train the LSTM with the first available backend.

    PyTorch is preferred, TensorFlow/Keras is the fallback. When
    ``args.skip_lstm`` is set or no backend could be imported, a NaN
    placeholder row explaining why is returned instead.
    """

    def _placeholder(note: str):
        # Result row for the cases in which no model is trained.
        return [{"model": "lstm", "mae_wh": math.nan, "r2": math.nan, "note": note}]

    if args.skip_lstm:
        return _placeholder("skipped_by_flag")

    if stack["torch"] is not None:
        return train_lstm_torch(stack, split, y_scaler, args)
    if stack["keras"] is not None:
        return train_lstm_keras(stack, split, y_scaler, args)

    torch_error = stack.get("torch_import_error")
    tf_error = stack.get("tf_import_error")
    return _placeholder(f"lstm_backend_missing: torch={torch_error}; tensorflow={tf_error}")
def train_deep_models(stack, split: SplitData, y_scaler, args):
    """Train every deep model requested through ``--dl-models``.

    Args:
        stack: dictionary from ``import_stack``.
        split: scaled split passed on to the individual trainers.
        y_scaler: fitted target scaler passed on to the individual trainers.
        args: namespace providing ``dl_models`` (comma-separated names) and
            ``skip_lstm``.

    Returns:
        List of result dicts. A single ``deep_models`` placeholder row is
        returned when ``none`` is requested or ``--skip-lstm`` is set. The CNN
        and TCN need PyTorch; without it they yield NaN placeholder rows.

    Recognized names are ``lstm``, ``cnn`` (alias of ``cnn1d``), ``cnn1d``,
    ``tcn`` and ``none``. Any other name triggers a warning and is ignored.
    """
    requested = {item.strip().lower() for item in args.dl_models.split(",") if item.strip()}
    if "none" in requested or args.skip_lstm:
        return [{"model": "deep_models", "mae_wh": math.nan, "r2": math.nan, "note": "skipped_by_flag"}]

    # "cnn" is accepted as a shorthand for the "cnn1d" model.
    if "cnn" in requested:
        requested.discard("cnn")
        requested.add("cnn1d")

    # A typo such as "ltsm" would otherwise skip the model without any hint.
    unknown = sorted(requested - {"lstm", "cnn1d", "tcn"})
    if unknown:
        warnings.warn(f"Ignoring unrecognized --dl-models entries: {', '.join(unknown)}")

    rows = []
    if "lstm" in requested:
        rows.extend(train_lstm(stack, split, y_scaler, args))

    torch_missing_note = f"torch_missing: {stack.get('torch_import_error')}"
    if "cnn1d" in requested:
        if stack["torch"] is not None:
            rows.extend(train_cnn1d_torch(stack, split, y_scaler, args))
        else:
            rows.append({"model": "cnn1d", "mae_wh": math.nan, "r2": math.nan, "note": torch_missing_note})
    if "tcn" in requested:
        if stack["torch"] is not None:
            rows.extend(train_tcn_torch(stack, split, y_scaler, args))
        else:
            rows.append({"model": "tcn", "mae_wh": math.nan, "r2": math.nan, "note": torch_missing_note})
    return rows
def train_arima(stack, clean, args):
    """Fit ARIMA(2,1,2) on the pre-test energy series and score its forecast.

    Args:
        stack: dictionary from ``import_stack``.
        clean: cleaned table from ``make_ml_split`` with the columns
            ``energy_wh`` and ``target``.
        args: namespace providing ``include_arima``, ``arima_max_train`` and
            the horizon / test-span options.

    Returns:
        One-element list with the result dict. NaN placeholder rows are
        returned when ARIMA is not enabled, statsmodels is missing, there are
        too few points, or fitting fails (the note carries the reason).
    """
    if not args.include_arima:
        return [{"model": "arima", "mae_wh": math.nan, "r2": math.nan, "note": "skipped_enable_with_include_arima"}]
    if stack["ARIMA"] is None:
        return [{
            "model": "arima",
            "mae_wh": math.nan,
            "r2": math.nan,
            "note": f"statsmodels_missing: {stack.get('statsmodels_import_error')}",
        }]

    np = stack["np"]
    pd = stack["pd"]
    # Same 85% boundary that make_ml_split uses for the start of the test rows.
    val_end = int(len(clean) * 0.85)
    history = clean["energy_wh"].iloc[:val_end].dropna()
    if args.arima_max_train and len(history) > args.arima_max_train:
        history = history.iloc[-args.arima_max_train:]

    y_true = clean["target"].iloc[val_end:].dropna()
    test_span_steps = test_span_steps_from_args(pd, args)
    if test_span_steps is not None:
        y_true = y_true.iloc[:test_span_steps]
    if len(history) < 50 or len(y_true) < 10:
        return [{"model": "arima", "mae_wh": math.nan, "r2": math.nan, "note": "not_enough_points"}]

    horizon = horizon_steps_from_args(pd, args)
    try:
        fitted = stack["ARIMA"](history, order=(2, 1, 2)).fit()
        # The target of test row i is the energy `horizon` rows later, while
        # the first forecast step corresponds to test row 0 itself. Forecast
        # `horizon` extra steps and drop the leading ones so both series refer
        # to the same instants (assuming consecutive rows in `clean`).
        forecast = np.asarray(fitted.forecast(steps=horizon + len(y_true)))[horizon:]
        actual = np.asarray(y_true)
        metrics = {
            "mae_wh": float(stack["mean_absolute_error"](actual, forecast)),
            "r2": float(stack["r2_score"](actual, forecast)),
            "note": f"order=(2,1,2); train_points={len(history)}",
        }
        return [{"model": "arima", **metrics}]
    except Exception as exc:
        # Best effort: a failed fit is reported in the results, not raised.
        return [{"model": "arima", "mae_wh": math.nan, "r2": math.nan, "note": f"arima_failed: {type(exc).__name__}: {exc}"}]
def save_dataset(pd, clean, out_dir: Path, target_name: str):
    """Write the train-ready table to CSV and return the file path.

    The file is stored as ``<out_dir>/processed/<target_name>_train_ready.csv``;
    the directory is created when needed. ``pd`` is accepted for signature
    consistency with the other helpers and is not used.
    """
    processed_dir = out_dir / "processed"
    processed_dir.mkdir(parents=True, exist_ok=True)
    destination = processed_dir / f"{target_name}_train_ready.csv"
    clean.to_csv(destination)
    return destination
def run_target(stack, args, energy_10min, calendar, target_name: str, target_spec: dict, out_dir: Path):
    """Run the full pipeline for one target and return its result rows.

    Builds the feature frame, creates the scaled split, saves the train-ready
    table, trains the baselines, the deep models and ARIMA, and annotates
    every result row with the run settings.
    """
    pd = stack["pd"]
    np = stack["np"]
    freq = args.freq
    horizon_steps = horizon_steps_from_args(pd, args)
    test_span_steps = test_span_steps_from_args(pd, args)
    horizon_text = describe_steps(pd, horizon_steps, freq)
    lookback_text = describe_steps(pd, args.lookback, freq)

    print(f"\n=== Target: {target_name} ===")
    print(f"Horizon: {horizon_steps} step(s) = {horizon_text}")
    print(f"Lookback: {args.lookback} step(s) = {lookback_text}")
    if test_span_steps is not None:
        print(f"Test span: {test_span_steps} step(s) = {describe_steps(pd, test_span_steps, freq)}")

    frame = build_target_frame(
        pd,
        energy_10min,
        calendar,
        target_name,
        target_spec,
        freq,
        args.include_weather,
    )
    split, y_scaler, clean = make_ml_split(
        np,
        pd,
        stack["MinMaxScaler"],
        frame,
        horizon=horizon_steps,
        max_rows=args.max_rows,
        test_span_steps=test_span_steps,
    )
    dataset_path = save_dataset(pd, clean, out_dir, target_name)
    print(f"Rows after feature engineering: {len(clean):,}")
    print(f"Train/val/test: {len(split.y_train):,}/{len(split.y_val):,}/{len(split.y_test):,}")
    print(f"Saved train-ready table: {dataset_path}")

    rows = [
        *train_baselines(stack, split, y_scaler, args, out_dir, target_name),
        *train_deep_models(stack, split, y_scaler, args),
        *train_arima(stack, clean, args),
    ]

    # Without an explicit test span, the whole test partition was evaluated.
    span_steps = test_span_steps if test_span_steps is not None else len(split.y_test)
    run_info = {
        "target": target_name,
        "rows": len(clean),
        "features": len(split.feature_names),
        "horizon_steps": horizon_steps,
        "horizon": horizon_text,
        "lookback_steps": args.lookback,
        "lookback": lookback_text,
        "test_span_steps": span_steps,
        "test_span": describe_steps(pd, span_steps, freq),
    }
    for row in rows:
        row.update(run_info)
    return rows
def main() -> int:
    """Script entry point: train every selected target and save the results.

    Writes ``results_<mode>_h<horizon>[_ts<span>].csv`` and
    ``run_config_<mode>.json`` into the output directory and prints the
    result table.

    Returns:
        0 on success.

    Raises:
        SystemExit: if ``--target`` names a target that does not exist in the
            selected mode.
    """
    args = parse_args()
    stack = import_stack()
    pd = stack["pd"]

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "processed").mkdir(exist_ok=True)

    available = PAPER_BUILDINGS if args.mode == "paper_buildings" else METER_TARGETS
    if args.target == "all":
        config = available
    elif args.target in available:
        config = {args.target: available[args.target]}
    else:
        raise SystemExit(f"Unknown target {args.target!r}. Available: {', '.join(available)}")

    print("DECODE re-implementation")
    print(f"Mode: {args.mode}")
    print(f"Frequency: {args.freq}")
    print(f"Python executable: {sys.executable}")

    # (availability label, stack key, error label, error key)
    optional_parts = (
        ("PyTorch", "torch", "PyTorch", "torch_import_error"),
        ("TensorFlow", "keras", "TensorFlow", "tf_import_error"),
        ("LightGBM", "lgb", "LightGBM", "lgbm_import_error"),
        ("Statsmodels ARIMA", "ARIMA", "Statsmodels", "statsmodels_import_error"),
    )
    for label, key, error_label, error_key in optional_parts:
        loaded = stack[key] is not None
        print(f"{label} available: {loaded}")
        if not loaded:
            print(f"{error_label} import error: {stack.get(error_key)}")

    if not args.skip_lstm:
        if stack["torch"] is not None:
            backend = "PyTorch"
        elif stack["keras"] is not None:
            backend = "TensorFlow/Keras"
        else:
            backend = "unavailable"
        print(f"LSTM backend: {backend}")

    energy_10min = read_energy_10min(pd, args.freq)
    calendar = read_calendar(pd)

    all_rows = [
        row
        for target_name, target_spec in config.items()
        for row in run_target(stack, args, energy_10min, calendar, target_name, target_spec, out_dir)
    ]
    results = pd.DataFrame(all_rows)

    effective_horizon = horizon_steps_from_args(pd, args)
    effective_test_span = test_span_steps_from_args(pd, args)
    result_suffix = f"h{effective_horizon}"
    if effective_test_span is not None:
        result_suffix += f"_ts{effective_test_span}"
    result_path = out_dir / f"results_{args.mode}_{result_suffix}.csv"
    results.to_csv(result_path, index=False)

    # Record the settings and environment of this run next to the results.
    metadata = {
        "mode": args.mode,
        "target": args.target,
        "freq": args.freq,
        "lookback": args.lookback,
        "horizon": args.horizon,
        "horizon_days": args.horizon_days,
        "horizon_steps_effective": effective_horizon,
        "test_span_days": args.test_span_days,
        "test_span_steps_effective": effective_test_span,
        "epochs": args.epochs,
        "batch_size": args.batch_size,
        "rf_trees": args.rf_trees,
        "dl_models": args.dl_models,
        "include_arima": args.include_arima,
        "include_weather": args.include_weather,
        "pytorch_available": stack["torch"] is not None,
        "torch_import_error": stack.get("torch_import_error"),
        "lightgbm_available": stack["lgb"] is not None,
        "lightgbm_import_error": stack.get("lgbm_import_error"),
        "statsmodels_available": stack["ARIMA"] is not None,
        "statsmodels_import_error": stack.get("statsmodels_import_error"),
        "tensorflow_available": stack["keras"] is not None,
        "tensorflow_import_error": stack.get("tf_import_error"),
        "energy_file": str(ENERGY_FILE),
        "occupancy_dir": str(OCCUPANCY_DIR),
        "calendar_dir": str(CALENDAR_DIR),
        "weather_file": str(WEATHER_FILE),
    }
    config_path = out_dir / f"run_config_{args.mode}.json"
    config_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")

    print("\n=== Results ===")
    print(results.sort_values(["target", "mae_wh"]).to_string(index=False))
    print(f"\nSaved results: {result_path}")
    return 0
# Run the pipeline only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())