HoangTrungNguyen
/

decode-iblend-code

Model card Files Files and versions

xet

Community

HoangTrungNguyen commited on 5 days ago

Commit

06e1c90

verified ·

1 Parent(s): 626bb3f

Upload scripts/preprocess_and_eda_by_building.py with huggingface_hub

Browse files

Files changed (1) hide show

scripts/preprocess_and_eda_by_building.py +1110 -0

scripts/preprocess_and_eda_by_building.py ADDED Viewed

	@@ -0,0 +1,1110 @@

+#!/usr/bin/env python3
+"""Preprocess campus building energy data and create per-building EDA files.
+The script intentionally uses only the Python standard library so it can run in
+minimal environments. It reads the wide minute-level `all_buildings_power.csv`,
+converts watts to kW, converts UNIX timestamps to Asia/Kolkata time, aggregates
+hourly/daily analysis-ready files, and writes EDA reports for each meter and
+building type.
+"""
+from __future__ import annotations
+import csv
+import math
+import statistics
+from collections import defaultdict
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Iterable
+from zoneinfo import ZoneInfo
+ROOT = Path(__file__).resolve().parents[1]
+DATA_FILE = ROOT / "energy_dataset" / "all_buildings_power.csv"
+TRANSFORMER_POWER_FILE = ROOT / "energy_dataset" / "all_transformer_power.csv"
+OUT_DIR = ROOT / "preprocessed_outputs"
+EDA_DIR = ROOT / "eda_by_building_type"
+FULL_EDA_DIR = ROOT / "eda_energy_full"
+IST = ZoneInfo("Asia/Kolkata")
+METERS = [
+    "Academic",
+    "Boys_main",
+    "Boys_backup",
+    "Facilities",
+    "Girls_main",
+    "Girls_backup",
+    "Lecture",
+    "Library",
+    "Mess",
+]
+METER_META = {
+    "Academic": {
+        "type": "academic",
+        "display": "Academic building",
+        "note": "Main academic block; expected to follow working-day and teaching-hour patterns.",
+    },
+    "Boys_main": {
+        "type": "hostel",
+        "display": "Boys hostel mains",
+        "note": "Main grid supply for boys hostel; residential load should be steadier than academic spaces.",
+    },
+    "Boys_backup": {
+        "type": "hostel",
+        "display": "Boys hostel UPS",
+        "note": "Backup/UPS supply for boys hostel; useful for separating essential residential load.",
+    },
+    "Facilities": {
+        "type": "facilities",
+        "display": "Facilities building",
+        "note": "Campus facilities load; may include operational equipment and irregular maintenance activity.",
+    },
+    "Girls_main": {
+        "type": "hostel",
+        "display": "Girls hostel mains",
+        "note": "Main grid supply for girls hostel; residential load should remain active across weekends.",
+    },
+    "Girls_backup": {
+        "type": "hostel",
+        "display": "Girls hostel UPS",
+        "note": "Backup/UPS supply for girls hostel; currently one of the cleanest meter series.",
+    },
+    "Lecture": {
+        "type": "lecture",
+        "display": "Lecture building",
+        "note": "Lecture/classroom load; expected to be schedule-driven with many low or zero periods.",
+    },
+    "Library": {
+        "type": "library",
+        "display": "Library building",
+        "note": "Library load; expected to reflect opening hours, study periods, and exam-season usage.",
+    },
+    "Mess": {
+        "type": "mess",
+        "display": "Dining/Mess building",
+        "note": "Dining building load; expected to show meal-time equipment peaks.",
+    },
+}
+EDA_ASPECTS = [
+    {
+        "aspect": "Data quality and coverage",
+        "why": "Check whether each building has enough usable observations before comparing consumption.",
+        "outputs": "missing %, observed rows, availability %, zero %, negative %",
+    },
+    {
+        "aspect": "Load magnitude and ranking",
+        "why": "Identify which buildings consume the most power and should be prioritized.",
+        "outputs": "mean kW, median kW, p75/p95 kW, max kW",
+    },
+    {
+        "aspect": "Temporal behavior",
+        "why": "Campus energy is strongly tied to operating hours, class schedules, weekends, and seasons.",
+        "outputs": "hourly profile, weekday profile, monthly profile, daily trend",
+    },
+    {
+        "aspect": "Peak demand",
+        "why": "Peak events drive capacity planning, demand response, and transformer stress.",
+        "outputs": "top 10 peak timestamps, p95 kW, max kW",
+    },
+    {
+        "aspect": "Stability and variability",
+        "why": "Stable loads behave differently from occupancy-driven loads and need different models.",
+        "outputs": "std kW, coefficient of variation, p95/median ratio",
+    },
+    {
+        "aspect": "Building type comparison",
+        "why": "Academic, hostel, library, mess, lecture, and facilities loads represent different use cases.",
+        "outputs": "type-level hourly/daily cleaned files and type-level reports",
+    },
+]
+COMMON_EDA_FLOW = [
+    "1. Validate data coverage and missing/zero/negative values.",
+    "2. Convert timestamp to Asia/Kolkata and power from W to kW.",
+    "3. Aggregate minute data to hourly and daily clean datasets.",
+    "4. Summarize distribution: mean, median, p75, p95, max, variability.",
+    "5. Analyze temporal patterns by hour of day, weekday, month, and daily trend.",
+    "6. Extract peak events for operational review.",
+    "7. Compare the meter with its building type and with all other buildings.",
+]
+TRANSFORMER_COLUMNS = ["transfomer_1", "transfomer_2", "transfomer_3"]
+@dataclass
+class SeriesStats:
+    count: int = 0
+    missing: int = 0
+    zero: int = 0
+    negative: int = 0
+    total_kw: float = 0.0
+    total_sq_kw: float = 0.0
+    min_kw: float | None = None
+    max_kw: float | None = None
+    first_seen: str | None = None
+    last_seen: str | None = None
+    values_for_quantiles: list[float] = field(default_factory=list)
+    def add(self, value: float | None, timestamp: str) -> None:
+        if value is None:
+            self.missing += 1
+            return
+        self.count += 1
+        self.total_kw += value
+        self.total_sq_kw += value * value
+        self.min_kw = value if self.min_kw is None else min(self.min_kw, value)
+        self.max_kw = value if self.max_kw is None else max(self.max_kw, value)
+        if value == 0:
+            self.zero += 1
+        if value < 0:
+            self.negative += 1
+        if self.first_seen is None:
+            self.first_seen = timestamp
+        self.last_seen = timestamp
+        self.values_for_quantiles.append(value)
+    @property
+    def mean_kw(self) -> float:
+        return self.total_kw / self.count if self.count else math.nan
+    @property
+    def std_kw(self) -> float:
+        if self.count <= 1:
+            return math.nan
+        variance = (self.total_sq_kw - (self.total_kw * self.total_kw / self.count)) / (self.count - 1)
+        return math.sqrt(max(variance, 0.0))
+@dataclass
+class Bucket:
+    sum_kw: float = 0.0
+    count: int = 0
+    missing: int = 0
+    def add(self, value: float | None) -> None:
+        if value is None:
+            self.missing += 1
+            return
+        self.sum_kw += value
+        self.count += 1
+    @property
+    def mean_kw(self) -> float:
+        return self.sum_kw / self.count if self.count else math.nan
+    @property
+    def availability_pct(self) -> float:
+        total = self.count + self.missing
+        return self.count / total * 100 if total else math.nan
+def parse_watts(value: str) -> float | None:
+    value = value.strip()
+    if not value or value.upper() == "NA":
+        return None
+    try:
+        watts = float(value)
+    except ValueError:
+        return None
+    return watts / 1000.0
+def parse_timestamp(value: str) -> int:
+    """Read UNIX timestamps stored either as integers or scientific notation."""
+    return int(float(value.strip()))
+def fmt(value: float | int | None, digits: int = 3) -> str:
+    if value is None:
+        return ""
+    if isinstance(value, float) and (math.isnan(value) or math.isinf(value)):
+        return ""
+    return f"{value:.{digits}f}"
+def quantile(sorted_values: list[float], q: float) -> float:
+    if not sorted_values:
+        return math.nan
+    if len(sorted_values) == 1:
+        return sorted_values[0]
+    pos = (len(sorted_values) - 1) * q
+    lower = math.floor(pos)
+    upper = math.ceil(pos)
+    if lower == upper:
+        return sorted_values[int(pos)]
+    return sorted_values[lower] * (upper - pos) + sorted_values[upper] * (pos - lower)
+def ensure_dirs() -> None:
+    for path in [
+        OUT_DIR,
+        EDA_DIR,
+        EDA_DIR / "per_meter",
+        EDA_DIR / "by_type",
+        EDA_DIR / "charts",
+        FULL_EDA_DIR,
+        FULL_EDA_DIR / "charts",
+        FULL_EDA_DIR / "buildings",
+        FULL_EDA_DIR / "building_types",
+    ]:
+        path.mkdir(parents=True, exist_ok=True)
+    for meter in METERS:
+        (FULL_EDA_DIR / "buildings" / meter.lower()).mkdir(parents=True, exist_ok=True)
+    for building_type in sorted({meta["type"] for meta in METER_META.values()}):
+        (FULL_EDA_DIR / "building_types" / building_type).mkdir(parents=True, exist_ok=True)
+def write_csv(path: Path, fieldnames: list[str], rows: Iterable[dict[str, object]]) -> None:
+    with path.open("w", newline="", encoding="utf-8") as file:
+        writer = csv.DictWriter(file, fieldnames=fieldnames)
+        writer.writeheader()
+        for row in rows:
+            writer.writerow(row)
+def svg_line_chart(path: Path, title: str, points: list[tuple[str, float]], y_label: str = "kW") -> None:
+    width, height = 920, 360
+    margin_left, margin_right, margin_top, margin_bottom = 70, 30, 50, 70
+    plot_w = width - margin_left - margin_right
+    plot_h = height - margin_top - margin_bottom
+    values = [v for _, v in points if not math.isnan(v)]
+    if not values:
+        return
+    min_v, max_v = min(values), max(values)
+    if min_v == max_v:
+        min_v -= 1
+        max_v += 1
+    coords = []
+    usable = [(label, value) for label, value in points if not math.isnan(value)]
+    for i, (_, value) in enumerate(usable):
+        x = margin_left + (plot_w * i / max(len(usable) - 1, 1))
+        y = margin_top + plot_h - ((value - min_v) / (max_v - min_v) * plot_h)
+        coords.append(f"{x:.1f},{y:.1f}")
+    tick_labels = []
+    for i in range(0, len(usable), max(1, len(usable) // 8)):
+        label, _ = usable[i]
+        x = margin_left + (plot_w * i / max(len(usable) - 1, 1))
+        tick_labels.append(
+            f'<text x="{x:.1f}" y="{height - 25}" text-anchor="middle" '
+            f'font-size="11" fill="#475569">{label}</text>'
+        )
+    svg = f"""<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">
+  <rect width="100%" height="100%" fill="#ffffff"/>
+  <text x="{margin_left}" y="28" font-size="20" font-family="Arial, sans-serif" font-weight="700" fill="#111827">{title}</text>
+  <line x1="{margin_left}" y1="{margin_top}" x2="{margin_left}" y2="{height - margin_bottom}" stroke="#CBD5E1"/>
+  <line x1="{margin_left}" y1="{height - margin_bottom}" x2="{width - margin_right}" y2="{height - margin_bottom}" stroke="#CBD5E1"/>
+  <text x="18" y="{margin_top + 10}" font-size="12" font-family="Arial, sans-serif" fill="#475569">{fmt(max_v, 1)} {y_label}</text>
+  <text x="18" y="{height - margin_bottom}" font-size="12" font-family="Arial, sans-serif" fill="#475569">{fmt(min_v, 1)} {y_label}</text>
+  <polyline points="{' '.join(coords)}" fill="none" stroke="#0F766E" stroke-width="2.5"/>
+  {''.join(tick_labels)}
+</svg>
+"""
+    path.write_text(svg, encoding="utf-8")
+def svg_bar_chart(path: Path, title: str, bars: list[tuple[str, float]], y_label: str = "kW") -> None:
+    width, height = 920, 420
+    margin_left, margin_right, margin_top, margin_bottom = 70, 30, 55, 110
+    plot_w = width - margin_left - margin_right
+    plot_h = height - margin_top - margin_bottom
+    clean_bars = [(label, value) for label, value in bars if not math.isnan(value)]
+    if not clean_bars:
+        return
+    max_v = max(value for _, value in clean_bars)
+    max_v = max_v if max_v > 0 else 1
+    step = plot_w / len(clean_bars)
+    bar_w = step * 0.62
+    rects = []
+    labels = []
+    for i, (label, value) in enumerate(clean_bars):
+        x = margin_left + i * step + (step - bar_w) / 2
+        bar_h = value / max_v * plot_h
+        y = margin_top + plot_h - bar_h
+        rects.append(
+            f'<rect x="{x:.1f}" y="{y:.1f}" width="{bar_w:.1f}" height="{bar_h:.1f}" '
+            f'fill="#2563EB" rx="3"/>'
+        )
+        labels.append(
+            f'<text x="{x + bar_w / 2:.1f}" y="{height - 72}" text-anchor="end" '
+            f'transform="rotate(-38 {x + bar_w / 2:.1f},{height - 72})" '
+            f'font-size="11" font-family="Arial, sans-serif" fill="#475569">{label}</text>'
+        )
+        labels.append(
+            f'<text x="{x + bar_w / 2:.1f}" y="{y - 6:.1f}" text-anchor="middle" '
+            f'font-size="11" font-family="Arial, sans-serif" fill="#111827">{fmt(value, 1)}</text>'
+        )
+    svg = f"""<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">
+  <rect width="100%" height="100%" fill="#ffffff"/>
+  <text x="{margin_left}" y="30" font-size="20" font-family="Arial, sans-serif" font-weight="700" fill="#111827">{title}</text>
+  <line x1="{margin_left}" y1="{margin_top}" x2="{margin_left}" y2="{height - margin_bottom}" stroke="#CBD5E1"/>
+  <line x1="{margin_left}" y1="{height - margin_bottom}" x2="{width - margin_right}" y2="{height - margin_bottom}" stroke="#CBD5E1"/>
+  <text x="18" y="{margin_top + 10}" font-size="12" font-family="Arial, sans-serif" fill="#475569">{fmt(max_v, 1)} {y_label}</text>
+  {''.join(rects)}
+  {''.join(labels)}
+</svg>
+"""
+    path.write_text(svg, encoding="utf-8")
+def quality_label(missing_pct: float, zero_pct: float) -> str:
+    if missing_pct <= 5 and zero_pct <= 10:
+        return "strong"
+    if missing_pct <= 15 and zero_pct <= 25:
+        return "usable"
+    if missing_pct <= 30:
+        return "limited"
+    return "weak"
+def summarize_transformer_power() -> list[dict[str, object]]:
+    stats = {column: SeriesStats() for column in TRANSFORMER_COLUMNS}
+    if not TRANSFORMER_POWER_FILE.exists():
+        return []
+    with TRANSFORMER_POWER_FILE.open("r", newline="", encoding="utf-8") as file:
+        reader = csv.DictReader(file)
+        for row in reader:
+            dt = datetime.fromtimestamp(parse_timestamp(row["timestamp"]), tz=UTC).astimezone(IST)
+            dt_iso = dt.isoformat()
+            for column in TRANSFORMER_COLUMNS:
+                stats[column].add(parse_watts(row[column]), dt_iso)
+    rows = []
+    for column, item in stats.items():
+        values = sorted(item.values_for_quantiles)
+        total_points = item.count + item.missing
+        rows.append(
+            {
+                "transformer": column,
+                "rows_total": total_points,
+                "observed_rows": item.count,
+                "missing_rows": item.missing,
+                "missing_pct": fmt(item.missing / total_points * 100 if total_points else math.nan),
+                "zero_pct": fmt(item.zero / item.count * 100 if item.count else math.nan),
+                "mean_kw": fmt(item.mean_kw),
+                "median_kw": fmt(quantile(values, 0.50)),
+                "p95_kw": fmt(quantile(values, 0.95)),
+                "max_kw": fmt(item.max_kw),
+                "first_seen_ist": item.first_seen or "",
+                "last_seen_ist": item.last_seen or "",
+            }
+        )
+    return rows
+def write_analysis_plan() -> None:
+    lines = [
+        "# EDA Analysis Plan Before Per-Building EDA",
+        "",
+        "This dataset should be analyzed with the same flow for all 9 building meters so comparisons stay fair.",
+        "",
+        "## Dataset-specific questions",
+        "",
+        "- Which meters have enough coverage to trust?",
+        "- Which buildings consume the most energy on average and during peaks?",
+        "- Which buildings show schedule-driven behavior by hour, weekday, and month?",
+        "- Which loads are stable enough for baseline forecasting?",
+        "- Which meters need caution because of missing data or many zeros?",
+        "- How do building types differ: academic, hostel, facilities, lecture, library, and mess?",
+        "",
+        "## Recommended analysis aspects",
+        "",
+        "| Aspect | Why it matters | Output files/metrics |",
+        "| --- | --- | --- |",
+    ]
+    for item in EDA_ASPECTS:
+        lines.append(f"| {item['aspect']} | {item['why']} | {item['outputs']} |")
+    lines.extend(["", "## Common EDA flow used for every building", ""])
+    lines.extend([f"- {step}" for step in COMMON_EDA_FLOW])
+    lines.extend(
+        [
+            "",
+            "## Practical interpretation",
+            "",
+            "- Use `strong` meters for headline insights and modeling first.",
+            "- Use `usable` meters for comparison after checking missing periods.",
+            "- Treat `limited` meters carefully; avoid overclaiming precise trends.",
+            "- Treat a high zero percentage as a separate operating-state signal, not automatically as clean data.",
+            "",
+            "## Current feature coverage",
+            "",
+            "| Feature | Current status | Notes |",
+            "| --- | --- | --- |",
+            "| Energy consumption | Calculated | Uses the 1-minute building and transformer power CSV files. |",
+            "| Schedule/time pattern | Calculated as proxy | Uses hour-of-day, weekday/weekend, and month from timestamps. No explicit class timetable file is present. |",
+            "| Occupancy | Not calculated yet | No occupancy CSV is present in the current workspace. Add it to join at 10-minute or hourly resolution. |",
+            "| Weather | Not calculated yet | No weather CSV is present in the current workspace. Add it to join by timestamp before correlation/regression. |",
+        ]
+    )
+    content = "\n".join(lines) + "\n"
+    (EDA_DIR / "eda_analysis_plan.md").write_text(content, encoding="utf-8")
+    (FULL_EDA_DIR / "00_eda_scope_and_flow.md").write_text(content, encoding="utf-8")
+def main() -> None:
+    ensure_dirs()
+    write_analysis_plan()
+    stats = {meter: SeriesStats() for meter in METERS}
+    hourly = defaultdict(Bucket)
+    daily = defaultdict(Bucket)
+    hour_profile = defaultdict(Bucket)
+    weekday_profile = defaultdict(Bucket)
+    month_profile = defaultdict(Bucket)
+    type_hourly = defaultdict(Bucket)
+    type_daily = defaultdict(Bucket)
+    top_peaks = {meter: [] for meter in METERS}
+    row_count = 0
+    with DATA_FILE.open("r", newline="", encoding="utf-8") as file:
+        reader = csv.DictReader(file)
+        for row in reader:
+            row_count += 1
+            dt = datetime.fromtimestamp(parse_timestamp(row["timestamp"]), tz=UTC).astimezone(IST)
+            dt_iso = dt.isoformat()
+            hour_key = dt.strftime("%Y-%m-%d %H:00:00%z")
+            day_key = dt.strftime("%Y-%m-%d")
+            month_key = dt.strftime("%Y-%m")
+            weekday_key = str(dt.weekday())
+            hour_of_day = f"{dt.hour:02d}"
+            values_by_type = defaultdict(list)
+            for meter in METERS:
+                value = parse_watts(row[meter])
+                stats[meter].add(value, dt_iso)
+                hourly[(meter, hour_key)].add(value)
+                daily[(meter, day_key)].add(value)
+                hour_profile[(meter, hour_of_day)].add(value)
+                weekday_profile[(meter, weekday_key)].add(value)
+                month_profile[(meter, month_key)].add(value)
+                if value is not None:
+                    values_by_type[METER_META[meter]["type"]].append(value)
+                    peaks = top_peaks[meter]
+                    peaks.append((value, dt_iso))
+                    peaks.sort(key=lambda item: item[0], reverse=True)
+                    del peaks[10:]
+                else:
+                    type_hourly[(METER_META[meter]["type"], hour_key)].add(None)
+                    type_daily[(METER_META[meter]["type"], day_key)].add(None)
+            for building_type, values in values_by_type.items():
+                total_type_kw = sum(values)
+                type_hourly[(building_type, hour_key)].add(total_type_kw)
+                type_daily[(building_type, day_key)].add(total_type_kw)
+    summary_rows = []
+    for meter, item in stats.items():
+        values = sorted(item.values_for_quantiles)
+        total_points = item.count + item.missing
+        summary_rows.append(
+            {
+                "meter": meter,
+                "building_type": METER_META[meter]["type"],
+                "display_name": METER_META[meter]["display"],
+                "building_note": METER_META[meter]["note"],
+                "rows_total": total_points,
+                "observed_rows": item.count,
+                "missing_rows": item.missing,
+                "missing_pct": fmt(item.missing / total_points * 100 if total_points else math.nan),
+                "zero_pct": fmt(item.zero / item.count * 100 if item.count else math.nan),
+                "negative_pct": fmt(item.negative / item.count * 100 if item.count else math.nan),
+                "mean_kw": fmt(item.mean_kw),
+                "std_kw": fmt(item.std_kw),
+                "min_kw": fmt(item.min_kw),
+                "p25_kw": fmt(quantile(values, 0.25)),
+                "median_kw": fmt(quantile(values, 0.50)),
+                "p75_kw": fmt(quantile(values, 0.75)),
+                "p95_kw": fmt(quantile(values, 0.95)),
+                "max_kw": fmt(item.max_kw),
+                "first_seen_ist": item.first_seen or "",
+                "last_seen_ist": item.last_seen or "",
+            }
+        )
+    write_csv(
+        OUT_DIR / "building_preprocessing_summary.csv",
+        list(summary_rows[0].keys()),
+        summary_rows,
+    )
+    write_csv(
+        FULL_EDA_DIR / "01_common_building_power_summary.csv",
+        list(summary_rows[0].keys()),
+        summary_rows,
+    )
+    common_flow_rows = []
+    for row in summary_rows:
+        meter = row["meter"]
+        weekdays = [weekday_profile[(meter, str(day))].mean_kw for day in range(5)]
+        weekends = [weekday_profile[(meter, str(day))].mean_kw for day in range(5, 7)]
+        weekday_mean = statistics.fmean([value for value in weekdays if not math.isnan(value)])
+        weekend_mean = statistics.fmean([value for value in weekends if not math.isnan(value)])
+        mean_kw = float(row["mean_kw"]) if row["mean_kw"] else math.nan
+        median_kw = float(row["median_kw"]) if row["median_kw"] else math.nan
+        std_kw = float(row["std_kw"]) if row["std_kw"] else math.nan
+        p95_kw = float(row["p95_kw"]) if row["p95_kw"] else math.nan
+        missing_pct = float(row["missing_pct"]) if row["missing_pct"] else math.nan
+        zero_pct = float(row["zero_pct"]) if row["zero_pct"] else math.nan
+        cv = std_kw / mean_kw if mean_kw else math.nan
+        p95_to_median = p95_kw / median_kw if median_kw else math.nan
+        common_flow_rows.append(
+            {
+                "meter": meter,
+                "building_type": row["building_type"],
+                "display_name": row["display_name"],
+                "building_note": row["building_note"],
+                "quality_label": quality_label(missing_pct, zero_pct),
+                "missing_pct": row["missing_pct"],
+                "zero_pct": row["zero_pct"],
+                "mean_kw": row["mean_kw"],
+                "median_kw": row["median_kw"],
+                "p95_kw": row["p95_kw"],
+                "max_kw": row["max_kw"],
+                "cv": fmt(cv),
+                "p95_to_median": fmt(p95_to_median),
+                "weekday_mean_kw": fmt(weekday_mean),
+                "weekend_mean_kw": fmt(weekend_mean),
+                "weekend_delta_pct": fmt((weekend_mean - weekday_mean) / weekday_mean * 100 if weekday_mean else math.nan),
+                "first_seen_ist": row["first_seen_ist"],
+                "last_seen_ist": row["last_seen_ist"],
+            }
+        )
+    write_csv(
+        EDA_DIR / "all_meters_common_flow_summary.csv",
+        [
+            "meter",
+            "building_type",
+            "display_name",
+            "building_note",
+            "quality_label",
+            "missing_pct",
+            "zero_pct",
+            "mean_kw",
+            "median_kw",
+            "p95_kw",
+            "max_kw",
+            "cv",
+            "p95_to_median",
+            "weekday_mean_kw",
+            "weekend_mean_kw",
+            "weekend_delta_pct",
+            "first_seen_ist",
+            "last_seen_ist",
+        ],
+        common_flow_rows,
+    )
+    write_csv(
+        FULL_EDA_DIR / "02_common_all_meters_flow_summary.csv",
+        [
+            "meter",
+            "building_type",
+            "display_name",
+            "building_note",
+            "quality_label",
+            "missing_pct",
+            "zero_pct",
+            "mean_kw",
+            "median_kw",
+            "p95_kw",
+            "max_kw",
+            "cv",
+            "p95_to_median",
+            "weekday_mean_kw",
+            "weekend_mean_kw",
+            "weekend_delta_pct",
+            "first_seen_ist",
+            "last_seen_ist",
+        ],
+        common_flow_rows,
+    )
+    transformer_rows = summarize_transformer_power()
+    if transformer_rows:
+        write_csv(
+            FULL_EDA_DIR / "03_common_transformer_power_summary.csv",
+            list(transformer_rows[0].keys()),
+            transformer_rows,
+        )
+        svg_bar_chart(
+            FULL_EDA_DIR / "charts" / "common_transformer_mean_power.svg",
+            "Transformer mean power",
+            [(row["transformer"], float(row["mean_kw"])) for row in transformer_rows if row["mean_kw"]],
+        )
+    ranked_rows = sorted(common_flow_rows, key=lambda row: (float(row["missing_pct"]), float(row["zero_pct"])))
+    svg_bar_chart(
+        FULL_EDA_DIR / "charts" / "common_building_mean_power_ranking.svg",
+        "Building mean power ranking",
+        [(row["meter"], float(row["mean_kw"])) for row in sorted(common_flow_rows, key=lambda item: float(item["mean_kw"]), reverse=True)],
+    )
+    svg_bar_chart(
+        FULL_EDA_DIR / "charts" / "common_building_missing_pct.svg",
+        "Building missing data percentage",
+        [(row["meter"], float(row["missing_pct"])) for row in sorted(common_flow_rows, key=lambda item: float(item["missing_pct"]), reverse=True)],
+        y_label="%",
+    )
+    all_building_lines = [
+        "# All 9 Buildings Common EDA",
+        "",
+        "Each building meter is analyzed with the same flow so the outputs can be compared directly.",
+        "",
+        "## Analysis aspects before EDA",
+        "",
+        "| Aspect | Why it matters |",
+        "| --- | --- |",
+    ]
+    for item in EDA_ASPECTS:
+        all_building_lines.append(f"| {item['aspect']} | {item['why']} |")
+    all_building_lines.extend(["", "## Common flow", ""])
+    all_building_lines.extend([f"- {step}" for step in COMMON_EDA_FLOW])
+    all_building_lines.extend(
+        [
+            "",
+            "## Building annotations",
+            "",
+            "| Meter | Display name | Type | Note |",
+            "| --- | --- | --- | --- |",
+        ]
+    )
+    for row in common_flow_rows:
+        all_building_lines.append(
+            f"| {row['meter']} | {row['display_name']} | {row['building_type']} | {row['building_note']} |"
+        )
+    all_building_lines.extend(
+        [
+            "",
+            "## 9-building comparable summary",
+            "",
+            "| Rank by quality | Meter | Type | Quality | Missing % | Zero % | Mean kW | Median kW | P95 kW | CV | Weekend delta % |",
+            "| ---: | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
+        ]
+    )
+    for rank, row in enumerate(ranked_rows, start=1):
+        all_building_lines.append(
+            f"| {rank} | {row['meter']} | {row['building_type']} | {row['quality_label']} | "
+            f"{row['missing_pct']} | {row['zero_pct']} | {row['mean_kw']} | {row['median_kw']} | "
+            f"{row['p95_kw']} | {row['cv']} | {row['weekend_delta_pct']} |"
+        )
+    all_building_lines.extend(
+        [
+            "",
+            "## Recommended interpretation order",
+            "",
+            "- Start with `quality_label`, `missing_pct`, and `zero_pct`.",
+            "- Use `mean_kw`, `median_kw`, and `p95_kw` for load ranking.",
+            "- Use `cv` and `p95_to_median` for volatility.",
+            "- Use `weekend_delta_pct` for schedule sensitivity.",
+            "- Drill into each `per_meter/*_eda.md` file for the same meter-level flow.",
+        ]
+    )
+    (EDA_DIR / "all_buildings_common_eda.md").write_text("\n".join(all_building_lines) + "\n", encoding="utf-8")
+    (FULL_EDA_DIR / "04_common_all_buildings_eda.md").write_text("\n".join(all_building_lines) + "\n", encoding="utf-8")
+    hourly_rows = []
+    for (meter, hour_key), bucket in sorted(hourly.items(), key=lambda item: (item[0][0], item[0][1])):
+        hourly_rows.append(
+            {
+                "datetime_hour_ist": hour_key,
+                "meter": meter,
+                "building_type": METER_META[meter]["type"],
+                "mean_kw": fmt(bucket.mean_kw),
+                "observed_minutes": bucket.count,
+                "missing_minutes": bucket.missing,
+                "availability_pct": fmt(bucket.availability_pct),
+                "approx_kwh": fmt(bucket.mean_kw if bucket.count else math.nan),
+            }
+        )
+    write_csv(
+        OUT_DIR / "building_power_hourly_clean.csv",
+        [
+            "datetime_hour_ist",
+            "meter",
+            "building_type",
+            "mean_kw",
+            "observed_minutes",
+            "missing_minutes",
+            "availability_pct",
+            "approx_kwh",
+        ],
+        hourly_rows,
+    )
+    daily_rows = []
+    for (meter, day_key), bucket in sorted(daily.items(), key=lambda item: (item[0][0], item[0][1])):
+        daily_rows.append(
+            {
+                "date_ist": day_key,
+                "meter": meter,
+                "building_type": METER_META[meter]["type"],
+                "mean_kw": fmt(bucket.mean_kw),
+                "observed_minutes": bucket.count,
+                "missing_minutes": bucket.missing,
+                "availability_pct": fmt(bucket.availability_pct),
+                "approx_kwh": fmt(bucket.mean_kw * 24 if bucket.count else math.nan),
+            }
+        )
+    write_csv(
+        OUT_DIR / "building_power_daily_clean.csv",
+        [
+            "date_ist",
+            "meter",
+            "building_type",
+            "mean_kw",
+            "observed_minutes",
+            "missing_minutes",
+            "availability_pct",
+            "approx_kwh",
+        ],
+        daily_rows,
+    )
+    type_hourly_rows = []
+    for (building_type, hour_key), bucket in sorted(type_hourly.items(), key=lambda item: (item[0][0], item[0][1])):
+        type_hourly_rows.append(
+            {
+                "datetime_hour_ist": hour_key,
+                "building_type": building_type,
+                "mean_kw": fmt(bucket.mean_kw),
+                "observed_meter_values": bucket.count,
+                "missing_meter_values": bucket.missing,
+                "availability_pct": fmt(bucket.availability_pct),
+            }
+        )
+    write_csv(
+        OUT_DIR / "building_type_hourly_clean.csv",
+        [
+            "datetime_hour_ist",
+            "building_type",
+            "mean_kw",
+            "observed_meter_values",
+            "missing_meter_values",
+            "availability_pct",
+        ],
+        type_hourly_rows,
+    )
+    type_daily_rows = []
+    for (building_type, day_key), bucket in sorted(type_daily.items(), key=lambda item: (item[0][0], item[0][1])):
+        type_daily_rows.append(
+            {
+                "date_ist": day_key,
+                "building_type": building_type,
+                "mean_kw": fmt(bucket.mean_kw),
+                "observed_meter_values": bucket.count,
+                "missing_meter_values": bucket.missing,
+                "availability_pct": fmt(bucket.availability_pct),
+                "approx_kwh": fmt(bucket.mean_kw * 24 if bucket.count else math.nan),
+            }
+        )
+    write_csv(
+        OUT_DIR / "building_type_daily_clean.csv",
+        [
+            "date_ist",
+            "building_type",
+            "mean_kw",
+            "observed_meter_values",
+            "missing_meter_values",
+            "availability_pct",
+            "approx_kwh",
+        ],
+        type_daily_rows,
+    )
+    for meter in METERS:
+        meter_slug = meter.lower()
+        building_dir = FULL_EDA_DIR / "buildings" / meter_slug
+        hourly_profile_rows = []
+        for hour in [f"{i:02d}" for i in range(24)]:
+            bucket = hour_profile[(meter, hour)]
+            hourly_profile_rows.append(
+                {
+                    "hour_ist": hour,
+                    "mean_kw": fmt(bucket.mean_kw),
+                    "observed_points": bucket.count,
+                    "availability_pct": fmt(bucket.availability_pct),
+                }
+            )
+        write_csv(
+            EDA_DIR / "per_meter" / f"{meter_slug}_hourly_profile.csv",
+            ["hour_ist", "mean_kw", "observed_points", "availability_pct"],
+            hourly_profile_rows,
+        )
+        write_csv(
+            building_dir / "hourly_profile.csv",
+            ["hour_ist", "mean_kw", "observed_points", "availability_pct"],
+            hourly_profile_rows,
+        )
+        weekday_rows = []
+        for weekday in range(7):
+            bucket = weekday_profile[(meter, str(weekday))]
+            weekday_rows.append(
+                {
+                    "weekday": weekday,
+                    "weekday_name": ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][weekday],
+                    "mean_kw": fmt(bucket.mean_kw),
+                    "observed_points": bucket.count,
+                    "availability_pct": fmt(bucket.availability_pct),
+                }
+            )
+        write_csv(
+            EDA_DIR / "per_meter" / f"{meter_slug}_weekday_profile.csv",
+            ["weekday", "weekday_name", "mean_kw", "observed_points", "availability_pct"],
+            weekday_rows,
+        )
+        write_csv(
+            building_dir / "weekday_profile.csv",
+            ["weekday", "weekday_name", "mean_kw", "observed_points", "availability_pct"],
+            weekday_rows,
+        )
+        monthly_rows = []
+        for (profile_meter, month), bucket in sorted(month_profile.items()):
+            if profile_meter != meter:
+                continue
+            monthly_rows.append(
+                {
+                    "month_ist": month,
+                    "mean_kw": fmt(bucket.mean_kw),
+                    "observed_points": bucket.count,
+                    "availability_pct": fmt(bucket.availability_pct),
+                }
+            )
+        write_csv(
+            EDA_DIR / "per_meter" / f"{meter_slug}_monthly_profile.csv",
+            ["month_ist", "mean_kw", "observed_points", "availability_pct"],
+            monthly_rows,
+        )
+        write_csv(
+            building_dir / "monthly_profile.csv",
+            ["month_ist", "mean_kw", "observed_points", "availability_pct"],
+            monthly_rows,
+        )
+        peak_rows = [
+            {"rank": index + 1, "datetime_ist": timestamp, "kw": fmt(value)}
+            for index, (value, timestamp) in enumerate(top_peaks[meter])
+        ]
+        write_csv(EDA_DIR / "per_meter" / f"{meter_slug}_top_peaks.csv", ["rank", "datetime_ist", "kw"], peak_rows)
+        write_csv(building_dir / "top_peaks.csv", ["rank", "datetime_ist", "kw"], peak_rows)
+        svg_line_chart(
+            EDA_DIR / "charts" / f"{meter_slug}_hourly_profile.svg",
+            f"{METER_META[meter]['display']} - mean hourly load",
+            [(row["hour_ist"], float(row["mean_kw"]) if row["mean_kw"] else math.nan) for row in hourly_profile_rows],
+        )
+        svg_line_chart(
+            building_dir / "hourly_profile.svg",
+            f"{METER_META[meter]['display']} - mean hourly load",
+            [(row["hour_ist"], float(row["mean_kw"]) if row["mean_kw"] else math.nan) for row in hourly_profile_rows],
+        )
+        summary = next(row for row in summary_rows if row["meter"] == meter)
+        common_summary = next(row for row in common_flow_rows if row["meter"] == meter)
+        common_flow_md = "\n".join([f"- {step}" for step in COMMON_EDA_FLOW])
+        md = f"""# {METER_META[meter]["display"]} EDA
+## Common EDA flow
+{common_flow_md}
+## Building note
+{METER_META[meter]["note"]}
+## Preprocessing notes
+- Source: `energy_dataset/all_buildings_power.csv`
+- Timezone: UNIX timestamp converted to `Asia/Kolkata` (`+05:30`)
+- Unit conversion: watts to kW
+- Missing values: `NA`/blank kept as missing during aggregation
+## Key metrics
+| Metric | Value |
+| --- | ---: |
+| Building type | {summary["building_type"]} |
+| Observed rows | {summary["observed_rows"]} |
+| Missing rows | {summary["missing_rows"]} |
+| Missing % | {summary["missing_pct"]} |
+| Mean kW | {summary["mean_kw"]} |
+| Median kW | {summary["median_kw"]} |
+| P95 kW | {summary["p95_kw"]} |
+| Max kW | {summary["max_kw"]} |
+| Zero % of observed | {summary["zero_pct"]} |
+| Quality label | {common_summary["quality_label"]} |
+| Coefficient of variation | {common_summary["cv"]} |
+| P95 / median | {common_summary["p95_to_median"]} |
+| Weekday mean kW | {common_summary["weekday_mean_kw"]} |
+| Weekend mean kW | {common_summary["weekend_mean_kw"]} |
+| Weekend delta % | {common_summary["weekend_delta_pct"]} |
+## How to read this meter
+- Use missing % and zero % first; these decide how much confidence to put in the patterns.
+- Use mean/median/p95/max to understand normal load versus stress load.
+- Use hourly, weekday, and monthly profiles to separate schedule effects from long-term changes.
+- Use top peaks to inspect unusual operating days or demand-response opportunities.
+## Files
+- `{meter_slug}_hourly_profile.csv`
+- `{meter_slug}_weekday_profile.csv`
+- `{meter_slug}_monthly_profile.csv`
+- `{meter_slug}_top_peaks.csv`
+- `../charts/{meter_slug}_hourly_profile.svg`
+"""
+        (EDA_DIR / "per_meter" / f"{meter_slug}_eda.md").write_text(md, encoding="utf-8")
+        structured_md = md.replace(
+            f"- `{meter_slug}_hourly_profile.csv`\n"
+            f"- `{meter_slug}_weekday_profile.csv`\n"
+            f"- `{meter_slug}_monthly_profile.csv`\n"
+            f"- `{meter_slug}_top_peaks.csv`\n"
+            f"- `../charts/{meter_slug}_hourly_profile.svg`",
+            "- `hourly_profile.csv`\n"
+            "- `weekday_profile.csv`\n"
+            "- `monthly_profile.csv`\n"
+            "- `top_peaks.csv`\n"
+            "- `hourly_profile.svg`",
+        )
+        (building_dir / "README.md").write_text(structured_md, encoding="utf-8")
+    type_summary_rows = []
+    for building_type in sorted({meta["type"] for meta in METER_META.values()}):
+        meters = [meter for meter in METERS if METER_META[meter]["type"] == building_type]
+        selected = [row for row in summary_rows if row["meter"] in meters]
+        observed = sum(int(row["observed_rows"]) for row in selected)
+        missing = sum(int(row["missing_rows"]) for row in selected)
+        mean_values = [float(row["mean_kw"]) for row in selected if row["mean_kw"]]
+        type_summary_rows.append(
+            {
+                "building_type": building_type,
+                "meters": ", ".join(meters),
+                "observed_rows": observed,
+                "missing_rows": missing,
+                "missing_pct": fmt(missing / (missing + observed) * 100 if missing + observed else math.nan),
+                "mean_of_meter_means_kw": fmt(statistics.fmean(mean_values) if mean_values else math.nan),
+            }
+        )
+        type_daily_points = []
+        for row in type_daily_rows:
+            if row["building_type"] == building_type and row["mean_kw"]:
+                type_daily_points.append((row["date_ist"], float(row["mean_kw"])))
+        svg_line_chart(
+            EDA_DIR / "charts" / f"{building_type}_daily_profile.svg",
+            f"{building_type.title()} - daily mean load",
+            type_daily_points,
+        )
+        svg_line_chart(
+            FULL_EDA_DIR / "building_types" / building_type / "daily_profile.svg",
+            f"{building_type.title()} - daily mean load",
+            type_daily_points,
+        )
+        md_lines = [
+            f"# {building_type.title()} Building Type EDA",
+            "",
+            "## Included meters",
+            "",
+            ", ".join(meters),
+            "",
+            "## Meter summary",
+            "",
+            "| Meter | Mean kW | Median kW | P95 kW | Missing % | Max kW |",
+            "| --- | ---: | ---: | ---: | ---: | ---: |",
+        ]
+        for row in selected:
+            md_lines.append(
+                f"| {row['meter']} | {row['mean_kw']} | {row['median_kw']} | {row['p95_kw']} | {row['missing_pct']} | {row['max_kw']} |"
+            )
+        md_lines.extend(
+            [
+                "",
+                "## Files",
+                "",
+                "- `../../preprocessed_outputs/building_type_hourly_clean.csv`",
+                "- `../../preprocessed_outputs/building_type_daily_clean.csv`",
+                f"- `../charts/{building_type}_daily_profile.svg`",
+            ]
+        )
+        (EDA_DIR / "by_type" / f"{building_type}_eda.md").write_text("\n".join(md_lines) + "\n", encoding="utf-8")
+        (FULL_EDA_DIR / "building_types" / building_type / "README.md").write_text(
+            "\n".join(md_lines) + "\n",
+            encoding="utf-8",
+        )
+    write_csv(
+        EDA_DIR / "building_type_summary.csv",
+        ["building_type", "meters", "observed_rows", "missing_rows", "missing_pct", "mean_of_meter_means_kw"],
+        type_summary_rows,
+    )
+    write_csv(
+        FULL_EDA_DIR / "05_common_building_type_summary.csv",
+        ["building_type", "meters", "observed_rows", "missing_rows", "missing_pct", "mean_of_meter_means_kw"],
+        type_summary_rows,
+    )
+    index_lines = [
+        "# Building Energy Preprocessing and EDA Index",
+        "",
+        f"Source rows processed: `{row_count}`",
+        "",
+        "## Start here",
+        "",
+        "- `eda_analysis_plan.md`",
+        "- `all_buildings_common_eda.md`",
+        "- `all_meters_common_flow_summary.csv`",
+        "",
+        "## Preprocessed outputs",
+        "",
+        "- `preprocessed_outputs/building_preprocessing_summary.csv`",
+        "- `preprocessed_outputs/building_power_hourly_clean.csv`",
+        "- `preprocessed_outputs/building_power_daily_clean.csv`",
+        "- `preprocessed_outputs/building_type_hourly_clean.csv`",
+        "- `preprocessed_outputs/building_type_daily_clean.csv`",
+        "",
+        "## Per-meter reports",
+        "",
+    ]
+    for meter in METERS:
+        index_lines.append(f"- `per_meter/{meter.lower()}_eda.md`")
+    index_lines.extend(["", "## Per-building-type reports", ""])
+    for row in type_summary_rows:
+        index_lines.append(f"- `by_type/{row['building_type']}_eda.md`")
+    (EDA_DIR / "README.md").write_text("\n".join(index_lines) + "\n", encoding="utf-8")
+    full_index_lines = [
+        "# Full Energy EDA Output",
+        "",
+        "Common files are kept at this top level. Building-specific EDA is kept inside `buildings/`.",
+        "",
+        "## Common outputs",
+        "",
+        "- `00_eda_scope_and_flow.md`",
+        "- `01_common_building_power_summary.csv`",
+        "- `02_common_all_meters_flow_summary.csv`",
+        "- `03_common_transformer_power_summary.csv`",
+        "- `04_common_all_buildings_eda.md`",
+        "- `05_common_building_type_summary.csv`",
+        "- `charts/common_building_mean_power_ranking.svg`",
+        "- `charts/common_building_missing_pct.svg`",
+        "- `charts/common_transformer_mean_power.svg`",
+        "",
+        "## Per-building outputs",
+        "",
+    ]
+    for meter in METERS:
+        full_index_lines.append(f"- `buildings/{meter.lower()}/README.md`")
+    full_index_lines.extend(["", "## Per-building-type outputs", ""])
+    for row in type_summary_rows:
+        full_index_lines.append(f"- `building_types/{row['building_type']}/README.md`")
+    (FULL_EDA_DIR / "README.md").write_text("\n".join(full_index_lines) + "\n", encoding="utf-8")
+if __name__ == "__main__":
+    main()