File size: 14,635 Bytes

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bdb90c6a",
   "metadata": {},
   "source": [
    "# DECODE Paper Re-implementation\n",
    "\n",
    "Notebook wrapper này chạy lại pipeline theo paper DECODE trên dữ liệu local I-BLEND. Logic chính nằm trong `scripts/decode_reimplementation.py`.\n",
    "\n",
    "Pipeline:\n",
    "- Resample về 10 phút.\n",
    "- Convert power W thành interval energy Wh.\n",
    "- Merge occupancy + calendar/schedule.\n",
    "- Tạo historical features, gồm 3 ngày trước cùng loại working/non-working day tại cùng thời điểm.\n",
    "- Min-Max normalization.\n",
    "- Split theo thời gian 70/15/15.\n",
    "- Train Ridge, Decision Tree, Random Forest, LightGBM nếu có, LSTM, 1D-CNN, TCN, và ARIMA nếu bật `--include-arima`.\n",
    "- LSTM/1D-CNN/TCN ưu tiên PyTorch nếu có; LSTM fallback TensorFlow/Keras."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6c2854f",
   "metadata": {},
   "source": [
    "## 0. Download Data And Code From Hugging Face\n",
    "\n",
    "Khi chạy notebook ở môi trường khác, tải **data repo** và **code repo** riêng. Nếu đang chạy local trong workspace này thì có thể bỏ qua cell này; `SCRIPT_PATH` sẽ được set về script local ở cell kế tiếp."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f558695",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run this in a fresh environment after uploading data and code to separate Hugging Face repos.\n",
    "# This notebook intentionally uses subprocess instead of notebook magics, so it also works in Marimo/Colab-like runners.\n",
    "\n",
    "from pathlib import Path\n",
    "import importlib.util\n",
    "import os\n",
    "import shlex\n",
    "import subprocess\n",
    "import sys\n",
    "\n",
    "def run_cmd(cmd, *, env=None):\n",
    "    cmd = [str(x) for x in cmd]\n",
    "    print(\"Running:\", \" \".join(shlex.quote(x) for x in cmd))\n",
    "    return subprocess.run(cmd, check=True, env=env)\n",
    "\n",
    "def pip_install(*packages, extra_args=None):\n",
    "    cmd = [sys.executable, \"-m\", \"pip\", \"install\", \"-q\"]\n",
    "    if extra_args:\n",
    "        cmd.extend(extra_args)\n",
    "    cmd.extend(packages)\n",
    "    run_cmd(cmd)\n",
    "\n",
    "def ensure_package(import_name, pip_name=None, *, extra_args=None):\n",
    "    if importlib.util.find_spec(import_name) is None:\n",
    "        pip_install(pip_name or import_name, extra_args=extra_args)\n",
    "\n",
    "ensure_package(\"huggingface_hub\")\n",
    "ensure_package(\"numpy\")\n",
    "ensure_package(\"pandas\")\n",
    "ensure_package(\"sklearn\", \"scikit-learn\")\n",
    "ensure_package(\"statsmodels\")\n",
    "ensure_package(\"lightgbm\")\n",
    "\n",
    "# GPU PyTorch wheel. The marimo GPU runner currently exposes CUDA 13.0.\n",
    "# If your platform needs a different CUDA build, change cu130 to cu121/cu124.\n",
    "ensure_package(\"torch\", \"torch\", extra_args=[\"--index-url\", \"https://download.pytorch.org/whl/cu130\"])\n",
    "\n",
    "import torch\n",
    "print(\"PyTorch:\", torch.__version__)\n",
    "print(\"CUDA available:\", torch.cuda.is_available())\n",
    "if torch.cuda.is_available():\n",
    "    print(\"GPU:\", torch.cuda.get_device_name(0))\n",
    "\n",
    "HF_DATA_REPO_ID = \"HoangTrungNguyen/decode-iblend-data\"\n",
    "HF_CODE_REPO_ID = \"HoangTrungNguyen/decode-iblend-code\"\n",
    "CODE_REPO_TYPE = \"model\"  # use \"dataset\" if you uploaded code as a dataset repo\n",
    "\n",
    "DATA_DIR = Path(\"hf_data\")\n",
    "CODE_DIR = Path(\"hf_code\")\n",
    "LOCAL_SCRIPT = Path(\"scripts/decode_reimplementation.py\")\n",
    "\n",
    "if HF_DATA_REPO_ID and HF_CODE_REPO_ID:\n",
    "    from huggingface_hub import snapshot_download\n",
    "    snapshot_download(\n",
    "        repo_id=HF_DATA_REPO_ID,\n",
    "        repo_type=\"dataset\",\n",
    "        local_dir=str(DATA_DIR),\n",
    "        local_dir_use_symlinks=False,\n",
    "    )\n",
    "    snapshot_download(\n",
    "        repo_id=HF_CODE_REPO_ID,\n",
    "        repo_type=CODE_REPO_TYPE,\n",
    "        local_dir=str(CODE_DIR),\n",
    "        local_dir_use_symlinks=False,\n",
    "    )\n",
    "    os.environ[\"IBLEND_DATA_ROOT\"] = str(DATA_DIR.resolve())\n",
    "    SCRIPT_PATH = str((CODE_DIR / \"scripts\" / \"decode_reimplementation.py\").resolve())\n",
    "else:\n",
    "    SCRIPT_PATH = str(LOCAL_SCRIPT.resolve())\n",
    "    print(\"Set HF_DATA_REPO_ID and HF_CODE_REPO_ID first if you need to download from Hugging Face.\")\n",
    "\n",
    "OUTPUT_DIR = Path(SCRIPT_PATH).resolve().parents[1] / \"decode_reimplementation_outputs\"\n",
    "\n",
    "TRAIN_ENV = os.environ.copy()\n",
    "TRAIN_ENV[\"DECODE_DISABLE_TENSORFLOW\"] = \"1\"\n",
    "TRAIN_ENV.pop(\"CUDA_VISIBLE_DEVICES\", None)\n",
    "if \"IBLEND_DATA_ROOT\" in os.environ:\n",
    "    TRAIN_ENV[\"IBLEND_DATA_ROOT\"] = os.environ[\"IBLEND_DATA_ROOT\"]\n",
    "\n",
    "print(\"Data root:\", os.environ.get(\"IBLEND_DATA_ROOT\", \"local/default\"))\n",
    "print(\"Script path:\", SCRIPT_PATH)\n",
    "print(\"Output dir:\", OUTPUT_DIR)\n",
    "print(\"Subprocess CUDA_VISIBLE_DEVICES:\", TRAIN_ENV.get(\"CUDA_VISIBLE_DEVICES\", \"all devices\"))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "450521e7",
   "metadata": {},
   "source": [
    "## 1. Check Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83b7b2b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import importlib.util\n",
    "\n",
    "packages = ['numpy', 'pandas', 'sklearn', 'torch', 'lightgbm', 'statsmodels']\n",
    "for package in packages:\n",
    "    status = 'OK' if importlib.util.find_spec(package) else 'MISSING'\n",
    "    print(f'{package}: {status}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "038ac26a",
   "metadata": {},
   "source": [
    "Cell đầu đã tự cài các package cần thiết bằng `subprocess` trong đúng kernel hiện tại. Deep learning chạy bằng PyTorch; nếu CUDA khả dụng, script sẽ tự dùng GPU."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "quick_smoke_md",
   "metadata": {},
   "source": [
    "## 2. Quick Smoke Test\n",
    "\n",
    "Chạy nhanh một building với 20,000 dòng để kiểm tra pipeline, LightGBM feature importance và PyTorch GPU."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "quick_smoke_code",
   "metadata": {},
   "outputs": [],
   "source": [
    "subprocess.run([\n",
    "    sys.executable, SCRIPT_PATH,\n",
    "    \"--mode\", \"paper_buildings\",\n",
    "    \"--target\", \"Academic\",\n",
    "    \"--max-rows\", \"20000\",\n",
    "    \"--epochs\", \"5\",\n",
    "    \"--batch-size\", \"64\",\n",
    "    \"--dl-models\", \"lstm\",\n",
    "], check=True, env=TRAIN_ENV)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "baseline_md",
   "metadata": {},
   "source": [
    "## 3. Baselines + LightGBM Feature Importance\n",
    "\n",
    "Chạy toàn bộ 7 building-level targets với Ridge, Decision Tree, Random Forest và LightGBM. Deep learning được skip ở cell này để lấy baseline và file feature importance trước."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "baseline_code",
   "metadata": {},
   "outputs": [],
   "source": [
    "subprocess.run([\n",
    "    sys.executable, SCRIPT_PATH,\n",
    "    \"--mode\", \"paper_buildings\",\n",
    "    \"--skip-lstm\",\n",
    "], check=True, env=TRAIN_ENV)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "lstm_all_md",
   "metadata": {},
   "source": [
    "## 4. Deep Learning: LSTM For All Buildings\n",
    "\n",
    "Chạy riêng LSTM cho toàn bộ 7 building-level targets. Default `horizon=1` nghĩa là dự đoán 10 phút tới."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "lstm_all_code",
   "metadata": {},
   "outputs": [],
   "source": [
    "subprocess.run([\n",
    "    sys.executable, SCRIPT_PATH,\n",
    "    \"--mode\", \"paper_buildings\",\n",
    "    \"--epochs\", \"20\",\n",
    "    \"--batch-size\", \"64\",\n",
    "    \"--dl-models\", \"lstm\",\n",
    "], check=True, env=TRAIN_ENV)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cnn_all_md",
   "metadata": {},
   "source": [
    "## 5. Deep Learning: 1D-CNN For All Buildings\n",
    "\n",
    "Chạy riêng 1D-CNN cho toàn bộ 7 building-level targets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cnn_all_code",
   "metadata": {},
   "outputs": [],
   "source": [
    "subprocess.run([\n",
    "    sys.executable, SCRIPT_PATH,\n",
    "    \"--mode\", \"paper_buildings\",\n",
    "    \"--epochs\", \"20\",\n",
    "    \"--batch-size\", \"64\",\n",
    "    \"--dl-models\", \"cnn\",\n",
    "], check=True, env=TRAIN_ENV)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "tcn_all_md",
   "metadata": {},
   "source": [
    "## 6. Deep Learning: TCN For All Buildings\n",
    "\n",
    "Chạy riêng TCN cho toàn bộ 7 building-level targets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "tcn_all_code",
   "metadata": {},
   "outputs": [],
   "source": [
    "subprocess.run([\n",
    "    sys.executable, SCRIPT_PATH,\n",
    "    \"--mode\", \"paper_buildings\",\n",
    "    \"--epochs\", \"20\",\n",
    "    \"--batch-size\", \"64\",\n",
    "    \"--dl-models\", \"tcn\",\n",
    "], check=True, env=TRAIN_ENV)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "meters_md",
   "metadata": {},
   "source": [
    "## 7. Optional: Train 9 Local Meter Models\n",
    "\n",
    "Chế độ này train riêng 9 meter trong `all_buildings_power.csv`. Chạy từng model riêng để dễ so sánh và tránh một cell quá dài."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "meters_code",
   "metadata": {},
   "outputs": [],
   "source": [
    "for model_name in [\"lstm\", \"cnn\", \"tcn\"]:\n",
    "    subprocess.run([\n",
    "        sys.executable, SCRIPT_PATH,\n",
    "        \"--mode\", \"meters\",\n",
    "        \"--epochs\", \"20\",\n",
    "        \"--batch-size\", \"64\",\n",
    "        \"--dl-models\", model_name,\n",
    "    ], check=True, env=TRAIN_ENV)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "horizon_md",
   "metadata": {},
   "source": [
    "## 8. Forecast Horizon Examples\n",
    "\n",
    "Mặc định `--horizon 1` nghĩa là dự báo 1 bước tới. Với `freq=10min`, đó là 10 phút tới. Muốn dự báo 1 ngày tới, dùng `--horizon-days 1`. ARIMA có thể chậm nên chỉ bật khi cần bằng `--include-arima`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "horizon_code",
   "metadata": {},
   "outputs": [],
   "source": [
    "subprocess.run([\n",
    "    sys.executable, SCRIPT_PATH,\n",
    "    \"--mode\", \"paper_buildings\",\n",
    "    \"--target\", \"Academic\",\n",
    "    \"--horizon-days\", \"1\",\n",
    "    \"--lookback\", \"144\",\n",
    "    \"--epochs\", \"20\",\n",
    "    \"--batch-size\", \"64\",\n",
    "    \"--dl-models\", \"lstm\",\n",
    "], check=True, env=TRAIN_ENV)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "arima_code",
   "metadata": {},
   "outputs": [],
   "source": [
    "subprocess.run([\n",
    "    sys.executable, SCRIPT_PATH,\n",
    "    \"--mode\", \"paper_buildings\",\n",
    "    \"--target\", \"Academic\",\n",
    "    \"--horizon\", \"6\",\n",
    "    \"--lookback\", \"144\",\n",
    "    \"--skip-lstm\",\n",
    "    \"--include-arima\",\n",
    "    \"--arima-max-train\", \"20000\",\n",
    "], check=True, env=TRAIN_ENV)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "testspan_md",
   "metadata": {},
   "source": [
    "## 9. Paper-style Test Span: 1 Week\n",
    "\n",
    "Giống cách paper báo cáo performance trên các test spans, cell này vẫn split chronological 70/15/15 nhưng chỉ evaluate 1 tuần đầu của test split. Mặc định horizon vẫn là 10 phút tới."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "testspan_dl_code",
   "metadata": {},
   "outputs": [],
   "source": [
    "for model_name in [\"lstm\", \"cnn\", \"tcn\"]:\n",
    "    subprocess.run([\n",
    "        sys.executable, SCRIPT_PATH,\n",
    "        \"--mode\", \"paper_buildings\",\n",
    "        \"--target\", \"Academic\",\n",
    "        \"--test-span-days\", \"7\",\n",
    "        \"--dl-models\", model_name,\n",
    "        \"--epochs\", \"20\",\n",
    "        \"--batch-size\", \"64\",\n",
    "    ], check=True, env=TRAIN_ENV)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "testspan_arima_code",
   "metadata": {},
   "outputs": [],
   "source": [
    "subprocess.run([\n",
    "    sys.executable, SCRIPT_PATH,\n",
    "    \"--mode\", \"paper_buildings\",\n",
    "    \"--target\", \"Academic\",\n",
    "    \"--test-span-days\", \"7\",\n",
    "    \"--skip-lstm\",\n",
    "    \"--include-arima\",\n",
    "    \"--arima-max-train\", \"20000\",\n",
    "], check=True, env=TRAIN_ENV)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5687ab98",
   "metadata": {},
   "source": [
    "## 10. Load Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2f16525",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "\n",
    "out = OUTPUT_DIR\n",
    "for path in sorted(out.glob('results_*.csv')):\n",
    "    print('\n",
    "', path)\n",
    "    results_df = pd.read_csv(path).sort_values(['target', 'model'])\n",
    "    display(results_df)\n",
    "\n",
    "importance_dir = out / 'feature_importance'\n",
    "if importance_dir.exists():\n",
    "    print('\n",
    "Feature importance files:')\n",
    "    for path in sorted(importance_dir.glob('*_lightgbm_feature_importance.csv')):\n",
    "        print(path)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tf_linux",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}