HoangTrungNguyen committed on
Commit
06e1c90
·
verified ·
1 Parent(s): 626bb3f

Upload scripts/preprocess_and_eda_by_building.py with huggingface_hub

Browse files
scripts/preprocess_and_eda_by_building.py ADDED
@@ -0,0 +1,1110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Preprocess campus building energy data and create per-building EDA files.
3
+
4
+ The script intentionally uses only the Python standard library so it can run in
5
+ minimal environments. It reads the wide minute-level `all_buildings_power.csv`,
6
+ converts watts to kW, converts UNIX timestamps to Asia/Kolkata time, aggregates
7
+ hourly/daily analysis-ready files, and writes EDA reports for each meter and
8
+ building type.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import csv
14
+ import math
15
+ import statistics
16
+ from collections import defaultdict
17
+ from dataclasses import dataclass, field
18
+ from datetime import UTC, datetime
19
+ from pathlib import Path
20
+ from typing import Iterable
21
+ from zoneinfo import ZoneInfo
22
+
23
+
24
# Project root: the directory one level above the folder holding this script.
ROOT = Path(__file__).resolve().parents[1]

# Input CSV files.
DATA_FILE = ROOT.joinpath("energy_dataset", "all_buildings_power.csv")
TRANSFORMER_POWER_FILE = ROOT.joinpath("energy_dataset", "all_transformer_power.csv")

# Output folders (created by ensure_dirs()).
OUT_DIR = ROOT.joinpath("preprocessed_outputs")
EDA_DIR = ROOT.joinpath("eda_by_building_type")
FULL_EDA_DIR = ROOT.joinpath("eda_energy_full")

# Time zone every timestamp is converted to before bucketing.
IST = ZoneInfo("Asia/Kolkata")
31
+
32
# Meter column names read from the building power CSV, in reporting order.
METERS = [
    "Academic",
    # Boys hostel: main and backup supply.
    "Boys_main", "Boys_backup",
    "Facilities",
    # Girls hostel: main and backup supply.
    "Girls_main", "Girls_backup",
    "Lecture",
    "Library",
    "Mess",
]
43
+
44
# Per-meter metadata keyed by meter column name. Each entry carries the
# building "type" used for grouping, a human-readable "display" name and a
# free-text "note" that is copied into the reports.
METER_META = {
    meter: {"type": building_type, "display": display, "note": note}
    for meter, building_type, display, note in (
        (
            "Academic",
            "academic",
            "Academic building",
            "Main academic block; expected to follow working-day and teaching-hour patterns.",
        ),
        (
            "Boys_main",
            "hostel",
            "Boys hostel mains",
            "Main grid supply for boys hostel; residential load should be steadier than academic spaces.",
        ),
        (
            "Boys_backup",
            "hostel",
            "Boys hostel UPS",
            "Backup/UPS supply for boys hostel; useful for separating essential residential load.",
        ),
        (
            "Facilities",
            "facilities",
            "Facilities building",
            "Campus facilities load; may include operational equipment and irregular maintenance activity.",
        ),
        (
            "Girls_main",
            "hostel",
            "Girls hostel mains",
            "Main grid supply for girls hostel; residential load should remain active across weekends.",
        ),
        (
            "Girls_backup",
            "hostel",
            "Girls hostel UPS",
            "Backup/UPS supply for girls hostel; currently one of the cleanest meter series.",
        ),
        (
            "Lecture",
            "lecture",
            "Lecture building",
            "Lecture/classroom load; expected to be schedule-driven with many low or zero periods.",
        ),
        (
            "Library",
            "library",
            "Library building",
            "Library load; expected to reflect opening hours, study periods, and exam-season usage.",
        ),
        (
            "Mess",
            "mess",
            "Dining/Mess building",
            "Dining building load; expected to show meal-time equipment peaks.",
        ),
    )
}
91
+
92
# Analysis aspects rendered as table rows in the generated Markdown reports:
# what is examined, why it matters, and which metrics/files cover it.
EDA_ASPECTS = [
    {"aspect": aspect, "why": why, "outputs": outputs}
    for aspect, why, outputs in (
        (
            "Data quality and coverage",
            "Check whether each building has enough usable observations before comparing consumption.",
            "missing %, observed rows, availability %, zero %, negative %",
        ),
        (
            "Load magnitude and ranking",
            "Identify which buildings consume the most power and should be prioritized.",
            "mean kW, median kW, p75/p95 kW, max kW",
        ),
        (
            "Temporal behavior",
            "Campus energy is strongly tied to operating hours, class schedules, weekends, and seasons.",
            "hourly profile, weekday profile, monthly profile, daily trend",
        ),
        (
            "Peak demand",
            "Peak events drive capacity planning, demand response, and transformer stress.",
            "top 10 peak timestamps, p95 kW, max kW",
        ),
        (
            "Stability and variability",
            "Stable loads behave differently from occupancy-driven loads and need different models.",
            "std kW, coefficient of variation, p95/median ratio",
        ),
        (
            "Building type comparison",
            "Academic, hostel, library, mess, lecture, and facilities loads represent different use cases.",
            "type-level hourly/daily cleaned files and type-level reports",
        ),
    )
]
124
+
125
# Ordered EDA steps listed in the reports; the "N. " prefix is generated so
# the numbering always matches the order below.
COMMON_EDA_FLOW = [
    f"{number}. {step}"
    for number, step in enumerate(
        (
            "Validate data coverage and missing/zero/negative values.",
            "Convert timestamp to Asia/Kolkata and power from W to kW.",
            "Aggregate minute data to hourly and daily clean datasets.",
            "Summarize distribution: mean, median, p75, p95, max, variability.",
            "Analyze temporal patterns by hour of day, weekday, month, and daily trend.",
            "Extract peak events for operational review.",
            "Compare the meter with its building type and with all other buildings.",
        ),
        start=1,
    )
]
134
+
135
# Column names looked up in the transformer power CSV.
# NOTE(review): the "transfomer" spelling is kept as-is because these strings
# are used as CSV keys; presumably the file header is spelled this way - confirm.
TRANSFORMER_COLUMNS = [f"transfomer_{index}" for index in range(1, 4)]
136
+
137
+
138
+ @dataclass
139
+ class SeriesStats:
140
+ count: int = 0
141
+ missing: int = 0
142
+ zero: int = 0
143
+ negative: int = 0
144
+ total_kw: float = 0.0
145
+ total_sq_kw: float = 0.0
146
+ min_kw: float | None = None
147
+ max_kw: float | None = None
148
+ first_seen: str | None = None
149
+ last_seen: str | None = None
150
+ values_for_quantiles: list[float] = field(default_factory=list)
151
+
152
+ def add(self, value: float | None, timestamp: str) -> None:
153
+ if value is None:
154
+ self.missing += 1
155
+ return
156
+ self.count += 1
157
+ self.total_kw += value
158
+ self.total_sq_kw += value * value
159
+ self.min_kw = value if self.min_kw is None else min(self.min_kw, value)
160
+ self.max_kw = value if self.max_kw is None else max(self.max_kw, value)
161
+ if value == 0:
162
+ self.zero += 1
163
+ if value < 0:
164
+ self.negative += 1
165
+ if self.first_seen is None:
166
+ self.first_seen = timestamp
167
+ self.last_seen = timestamp
168
+ self.values_for_quantiles.append(value)
169
+
170
+ @property
171
+ def mean_kw(self) -> float:
172
+ return self.total_kw / self.count if self.count else math.nan
173
+
174
+ @property
175
+ def std_kw(self) -> float:
176
+ if self.count <= 1:
177
+ return math.nan
178
+ variance = (self.total_sq_kw - (self.total_kw * self.total_kw / self.count)) / (self.count - 1)
179
+ return math.sqrt(max(variance, 0.0))
180
+
181
+
182
+ @dataclass
183
+ class Bucket:
184
+ sum_kw: float = 0.0
185
+ count: int = 0
186
+ missing: int = 0
187
+
188
+ def add(self, value: float | None) -> None:
189
+ if value is None:
190
+ self.missing += 1
191
+ return
192
+ self.sum_kw += value
193
+ self.count += 1
194
+
195
+ @property
196
+ def mean_kw(self) -> float:
197
+ return self.sum_kw / self.count if self.count else math.nan
198
+
199
+ @property
200
+ def availability_pct(self) -> float:
201
+ total = self.count + self.missing
202
+ return self.count / total * 100 if total else math.nan
203
+
204
+
205
+ def parse_watts(value: str) -> float | None:
206
+ value = value.strip()
207
+ if not value or value.upper() == "NA":
208
+ return None
209
+ try:
210
+ watts = float(value)
211
+ except ValueError:
212
+ return None
213
+ return watts / 1000.0
214
+
215
+
216
def parse_timestamp(value: str) -> int:
    """Read a UNIX timestamp stored as an integer or in float/scientific notation.

    Plain integer text is parsed directly so large values are never rounded
    through a float; anything else (``"1.6e9"``, ``"1600000000.0"``) goes
    through ``float`` and is truncated toward zero.

    Raises:
        ValueError: If the text is not numeric (including ``"nan"``).
        OverflowError: If the text is an infinity.
    """
    text = value.strip()
    try:
        return int(text)
    except ValueError:
        return int(float(text))
219
+
220
+
221
+ def fmt(value: float | int | None, digits: int = 3) -> str:
222
+ if value is None:
223
+ return ""
224
+ if isinstance(value, float) and (math.isnan(value) or math.isinf(value)):
225
+ return ""
226
+ return f"{value:.{digits}f}"
227
+
228
+
229
def quantile(sorted_values: list[float], q: float) -> float:
    """Return the ``q``-quantile of pre-sorted data using linear interpolation.

    Args:
        sorted_values: Values in ascending order; the function does not sort.
        q: Quantile in the closed interval [0, 1].

    Returns:
        The interpolated quantile, or NaN for an empty list.

    Raises:
        ValueError: If ``q`` is outside [0, 1] (or NaN). Without this check a
            negative ``q`` would silently index from the end of the list.
    """
    if not 0.0 <= q <= 1.0:
        raise ValueError(f"q must be within [0, 1], got {q!r}")
    if not sorted_values:
        return math.nan
    if len(sorted_values) == 1:
        return sorted_values[0]
    pos = (len(sorted_values) - 1) * q
    lower = math.floor(pos)
    upper = math.ceil(pos)
    if lower == upper:
        # pos lands exactly on an element; no interpolation needed.
        return sorted_values[lower]
    # Weight the two neighbours by their distance from pos.
    return sorted_values[lower] * (upper - pos) + sorted_values[upper] * (pos - lower)
240
+
241
+
242
def ensure_dirs() -> None:
    """Create every output directory the script writes into.

    Covers the fixed output folders plus one folder per meter under
    ``FULL_EDA_DIR/buildings`` and one per building type under
    ``FULL_EDA_DIR/building_types``. Existing directories are left alone.
    """
    buildings_root = FULL_EDA_DIR / "buildings"
    types_root = FULL_EDA_DIR / "building_types"
    building_types = sorted({meta["type"] for meta in METER_META.values()})

    targets = [
        OUT_DIR,
        EDA_DIR,
        EDA_DIR / "per_meter",
        EDA_DIR / "by_type",
        EDA_DIR / "charts",
        FULL_EDA_DIR,
        FULL_EDA_DIR / "charts",
        buildings_root,
        types_root,
    ]
    targets.extend(buildings_root / meter.lower() for meter in METERS)
    targets.extend(types_root / building_type for building_type in building_types)

    for directory in targets:
        directory.mkdir(parents=True, exist_ok=True)
259
+
260
+
261
def write_csv(path: Path, fieldnames: list[str], rows: Iterable[dict[str, object]]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 CSV with a header row.

    Args:
        path: Destination file; it is overwritten if it exists.
        fieldnames: Column order for the header and every row.
        rows: Dictionaries keyed by the names in ``fieldnames``.
    """
    # newline="" lets the csv module control line endings itself.
    with path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=fieldnames)
        out.writeheader()
        out.writerows(rows)
267
+
268
+
269
def svg_line_chart(path: Path, title: str, points: list[tuple[str, float]], y_label: str = "kW") -> None:
    """Write a simple single-series line chart as a standalone SVG file.

    Args:
        path: Destination SVG file.
        title: Chart heading.
        points: ``(label, value)`` pairs in plotting order. Non-finite values
            (NaN, +/-inf) are skipped.
        y_label: Unit text appended to the min/max axis annotations.

    Nothing is written when no finite value remains. Title, tick labels and
    the unit are XML-escaped so characters such as ``&`` or ``<`` cannot
    produce a malformed file.
    """
    # Local import keeps this block self-contained; stdlib only.
    from xml.sax.saxutils import escape

    width, height = 920, 360
    margin_left, margin_right, margin_top, margin_bottom = 70, 30, 50, 70
    plot_w = width - margin_left - margin_right
    plot_h = height - margin_top - margin_bottom

    usable = [(label, value) for label, value in points if math.isfinite(value)]
    if not usable:
        return
    values = [value for _, value in usable]
    min_v, max_v = min(values), max(values)
    if min_v == max_v:
        # Flat series: widen the range so the scale division below is defined.
        min_v -= 1
        max_v += 1

    # Points are spread evenly over the plot width; max(..., 1) guards the
    # single-point case against division by zero.
    last_index = max(len(usable) - 1, 1)

    coords = []
    for i, (_, value) in enumerate(usable):
        x = margin_left + (plot_w * i / last_index)
        y = margin_top + plot_h - ((value - min_v) / (max_v - min_v) * plot_h)
        coords.append(f"{x:.1f},{y:.1f}")

    # Label roughly every eighth of the series to keep the axis readable.
    tick_labels = []
    for i in range(0, len(usable), max(1, len(usable) // 8)):
        label, _ = usable[i]
        x = margin_left + (plot_w * i / last_index)
        tick_labels.append(
            f'<text x="{x:.1f}" y="{height - 25}" text-anchor="middle" '
            f'font-size="11" fill="#475569">{escape(str(label))}</text>'
        )

    safe_title = escape(str(title))
    safe_unit = escape(str(y_label))
    polyline_points = " ".join(coords)
    ticks_markup = "".join(tick_labels)
    baseline_y = height - margin_bottom

    svg = f"""<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">
<rect width="100%" height="100%" fill="#ffffff"/>
<text x="{margin_left}" y="28" font-size="20" font-family="Arial, sans-serif" font-weight="700" fill="#111827">{safe_title}</text>
<line x1="{margin_left}" y1="{margin_top}" x2="{margin_left}" y2="{baseline_y}" stroke="#CBD5E1"/>
<line x1="{margin_left}" y1="{baseline_y}" x2="{width - margin_right}" y2="{baseline_y}" stroke="#CBD5E1"/>
<text x="18" y="{margin_top + 10}" font-size="12" font-family="Arial, sans-serif" fill="#475569">{fmt(max_v, 1)} {safe_unit}</text>
<text x="18" y="{baseline_y}" font-size="12" font-family="Arial, sans-serif" fill="#475569">{fmt(min_v, 1)} {safe_unit}</text>
<polyline points="{polyline_points}" fill="none" stroke="#0F766E" stroke-width="2.5"/>
{ticks_markup}
</svg>
"""
    path.write_text(svg, encoding="utf-8")
310
+
311
+
312
def svg_bar_chart(path: Path, title: str, bars: list[tuple[str, float]], y_label: str = "kW") -> None:
    """Write a simple vertical bar chart as a standalone SVG file.

    Args:
        path: Destination SVG file.
        title: Chart heading.
        bars: ``(label, value)`` pairs in display order. Non-finite values
            (NaN, +/-inf) are skipped.
        y_label: Unit text appended to the maximum-value annotation.

    Nothing is written when no finite value remains. Bars are scaled against
    the largest value; negative values are drawn with zero height because an
    SVG ``<rect>`` may not have a negative height, while their numeric label
    is still shown. Title, labels and unit are XML-escaped.
    """
    # Local import keeps this block self-contained; stdlib only.
    from xml.sax.saxutils import escape

    width, height = 920, 420
    margin_left, margin_right, margin_top, margin_bottom = 70, 30, 55, 110
    plot_w = width - margin_left - margin_right
    plot_h = height - margin_top - margin_bottom

    clean_bars = [(label, value) for label, value in bars if math.isfinite(value)]
    if not clean_bars:
        return

    max_v = max(value for _, value in clean_bars)
    # Avoid dividing by zero (or a negative scale) when nothing is positive.
    max_v = max_v if max_v > 0 else 1
    step = plot_w / len(clean_bars)
    bar_w = step * 0.62
    label_y = height - 72

    rects = []
    labels = []
    for i, (label, value) in enumerate(clean_bars):
        x = margin_left + i * step + (step - bar_w) / 2
        bar_h = max(value, 0.0) / max_v * plot_h
        y = margin_top + plot_h - bar_h
        centre_x = x + bar_w / 2
        rects.append(
            f'<rect x="{x:.1f}" y="{y:.1f}" width="{bar_w:.1f}" height="{bar_h:.1f}" '
            f'fill="#2563EB" rx="3"/>'
        )
        # Category label, rotated so long names do not overlap.
        labels.append(
            f'<text x="{centre_x:.1f}" y="{label_y}" text-anchor="end" '
            f'transform="rotate(-38 {centre_x:.1f},{label_y})" '
            f'font-size="11" font-family="Arial, sans-serif" fill="#475569">{escape(str(label))}</text>'
        )
        # Numeric value just above the bar.
        labels.append(
            f'<text x="{centre_x:.1f}" y="{y - 6:.1f}" text-anchor="middle" '
            f'font-size="11" font-family="Arial, sans-serif" fill="#111827">{fmt(value, 1)}</text>'
        )

    safe_title = escape(str(title))
    safe_unit = escape(str(y_label))
    rects_markup = "".join(rects)
    labels_markup = "".join(labels)
    baseline_y = height - margin_bottom

    svg = f"""<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">
<rect width="100%" height="100%" fill="#ffffff"/>
<text x="{margin_left}" y="30" font-size="20" font-family="Arial, sans-serif" font-weight="700" fill="#111827">{safe_title}</text>
<line x1="{margin_left}" y1="{margin_top}" x2="{margin_left}" y2="{baseline_y}" stroke="#CBD5E1"/>
<line x1="{margin_left}" y1="{baseline_y}" x2="{width - margin_right}" y2="{baseline_y}" stroke="#CBD5E1"/>
<text x="18" y="{margin_top + 10}" font-size="12" font-family="Arial, sans-serif" fill="#475569">{fmt(max_v, 1)} {safe_unit}</text>
{rects_markup}
{labels_markup}
</svg>
"""
    path.write_text(svg, encoding="utf-8")
354
+
355
+
356
def quality_label(missing_pct: float, zero_pct: float) -> str:
    """Grade a meter's data quality from its missing and zero percentages.

    Tiers are checked from best to worst and the first match wins:
    ``strong`` (missing <= 5 and zero <= 10), ``usable`` (missing <= 15 and
    zero <= 25), ``limited`` (missing <= 30, zero share not considered),
    otherwise ``weak``. A NaN percentage fails every comparison it takes
    part in.
    """
    # (label, max missing %, max zero % or None when zeros are not checked)
    tiers = (
        ("strong", 5, 10),
        ("usable", 15, 25),
        ("limited", 30, None),
    )
    for label, missing_cap, zero_cap in tiers:
        zero_ok = zero_cap is None or zero_pct <= zero_cap
        if missing_pct <= missing_cap and zero_ok:
            return label
    return "weak"
364
+
365
+
366
def summarize_transformer_power() -> list[dict[str, object]]:
    """Summarize each transformer column of ``TRANSFORMER_POWER_FILE``.

    Returns:
        One dictionary per entry in ``TRANSFORMER_COLUMNS`` holding row
        counts, missing/zero percentages, mean/median/p95/max in kW and the
        first/last observed timestamps (ISO text in the ``IST`` zone). Numeric
        fields are pre-formatted with ``fmt``. An empty list is returned when
        the file does not exist.
    """
    if not TRANSFORMER_POWER_FILE.exists():
        return []

    per_column = {name: SeriesStats() for name in TRANSFORMER_COLUMNS}
    with TRANSFORMER_POWER_FILE.open("r", newline="", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            epoch = parse_timestamp(record["timestamp"])
            # Interpret the epoch as UTC, then express it in the target zone.
            stamp = datetime.fromtimestamp(epoch, tz=UTC).astimezone(IST).isoformat()
            for name, series in per_column.items():
                series.add(parse_watts(record[name]), stamp)

    summary = []
    for name, series in per_column.items():
        ordered = sorted(series.values_for_quantiles)
        observed = series.count
        total_points = observed + series.missing
        missing_share = series.missing / total_points * 100 if total_points else math.nan
        zero_share = series.zero / observed * 100 if observed else math.nan
        summary.append(
            {
                "transformer": name,
                "rows_total": total_points,
                "observed_rows": observed,
                "missing_rows": series.missing,
                "missing_pct": fmt(missing_share),
                "zero_pct": fmt(zero_share),
                "mean_kw": fmt(series.mean_kw),
                "median_kw": fmt(quantile(ordered, 0.50)),
                "p95_kw": fmt(quantile(ordered, 0.95)),
                "max_kw": fmt(series.max_kw),
                "first_seen_ist": series.first_seen or "",
                "last_seen_ist": series.last_seen or "",
            }
        )
    return summary
399
+
400
+
401
def write_analysis_plan() -> None:
    """Write the Markdown EDA plan to both report folders.

    The document is assembled from a fixed introduction, one table row per
    entry in ``EDA_ASPECTS``, a bullet per step in ``COMMON_EDA_FLOW`` and a
    fixed closing section. The same text is written to
    ``EDA_DIR/eda_analysis_plan.md`` and
    ``FULL_EDA_DIR/00_eda_scope_and_flow.md``.
    """
    intro = [
        "# EDA Analysis Plan Before Per-Building EDA",
        "",
        "This dataset should be analyzed with the same flow for all 9 building meters so comparisons stay fair.",
        "",
        "## Dataset-specific questions",
        "",
        "- Which meters have enough coverage to trust?",
        "- Which buildings consume the most energy on average and during peaks?",
        "- Which buildings show schedule-driven behavior by hour, weekday, and month?",
        "- Which loads are stable enough for baseline forecasting?",
        "- Which meters need caution because of missing data or many zeros?",
        "- How do building types differ: academic, hostel, facilities, lecture, library, and mess?",
        "",
        "## Recommended analysis aspects",
        "",
        "| Aspect | Why it matters | Output files/metrics |",
        "| --- | --- | --- |",
    ]
    aspect_rows = [
        f"| {aspect['aspect']} | {aspect['why']} | {aspect['outputs']} |"
        for aspect in EDA_ASPECTS
    ]
    flow_section = [
        "",
        "## Common EDA flow used for every building",
        "",
        *(f"- {step}" for step in COMMON_EDA_FLOW),
    ]
    closing = [
        "",
        "## Practical interpretation",
        "",
        "- Use `strong` meters for headline insights and modeling first.",
        "- Use `usable` meters for comparison after checking missing periods.",
        "- Treat `limited` meters carefully; avoid overclaiming precise trends.",
        "- Treat a high zero percentage as a separate operating-state signal, not automatically as clean data.",
        "",
        "## Current feature coverage",
        "",
        "| Feature | Current status | Notes |",
        "| --- | --- | --- |",
        "| Energy consumption | Calculated | Uses the 1-minute building and transformer power CSV files. |",
        "| Schedule/time pattern | Calculated as proxy | Uses hour-of-day, weekday/weekend, and month from timestamps. No explicit class timetable file is present. |",
        "| Occupancy | Not calculated yet | No occupancy CSV is present in the current workspace. Add it to join at 10-minute or hourly resolution. |",
        "| Weather | Not calculated yet | No weather CSV is present in the current workspace. Add it to join by timestamp before correlation/regression. |",
    ]

    document = "\n".join([*intro, *aspect_rows, *flow_section, *closing]) + "\n"
    destinations = (
        EDA_DIR / "eda_analysis_plan.md",
        FULL_EDA_DIR / "00_eda_scope_and_flow.md",
    )
    for destination in destinations:
        destination.write_text(document, encoding="utf-8")
448
+
449
+
450
+ def main() -> None:
451
+ ensure_dirs()
452
+ write_analysis_plan()
453
+ stats = {meter: SeriesStats() for meter in METERS}
454
+ hourly = defaultdict(Bucket)
455
+ daily = defaultdict(Bucket)
456
+ hour_profile = defaultdict(Bucket)
457
+ weekday_profile = defaultdict(Bucket)
458
+ month_profile = defaultdict(Bucket)
459
+ type_hourly = defaultdict(Bucket)
460
+ type_daily = defaultdict(Bucket)
461
+ top_peaks = {meter: [] for meter in METERS}
462
+ row_count = 0
463
+
464
+ with DATA_FILE.open("r", newline="", encoding="utf-8") as file:
465
+ reader = csv.DictReader(file)
466
+ for row in reader:
467
+ row_count += 1
468
+ dt = datetime.fromtimestamp(parse_timestamp(row["timestamp"]), tz=UTC).astimezone(IST)
469
+ dt_iso = dt.isoformat()
470
+ hour_key = dt.strftime("%Y-%m-%d %H:00:00%z")
471
+ day_key = dt.strftime("%Y-%m-%d")
472
+ month_key = dt.strftime("%Y-%m")
473
+ weekday_key = str(dt.weekday())
474
+ hour_of_day = f"{dt.hour:02d}"
475
+ values_by_type = defaultdict(list)
476
+
477
+ for meter in METERS:
478
+ value = parse_watts(row[meter])
479
+ stats[meter].add(value, dt_iso)
480
+ hourly[(meter, hour_key)].add(value)
481
+ daily[(meter, day_key)].add(value)
482
+ hour_profile[(meter, hour_of_day)].add(value)
483
+ weekday_profile[(meter, weekday_key)].add(value)
484
+ month_profile[(meter, month_key)].add(value)
485
+ if value is not None:
486
+ values_by_type[METER_META[meter]["type"]].append(value)
487
+ peaks = top_peaks[meter]
488
+ peaks.append((value, dt_iso))
489
+ peaks.sort(key=lambda item: item[0], reverse=True)
490
+ del peaks[10:]
491
+ else:
492
+ type_hourly[(METER_META[meter]["type"], hour_key)].add(None)
493
+ type_daily[(METER_META[meter]["type"], day_key)].add(None)
494
+
495
+ for building_type, values in values_by_type.items():
496
+ total_type_kw = sum(values)
497
+ type_hourly[(building_type, hour_key)].add(total_type_kw)
498
+ type_daily[(building_type, day_key)].add(total_type_kw)
499
+
500
+ summary_rows = []
501
+ for meter, item in stats.items():
502
+ values = sorted(item.values_for_quantiles)
503
+ total_points = item.count + item.missing
504
+ summary_rows.append(
505
+ {
506
+ "meter": meter,
507
+ "building_type": METER_META[meter]["type"],
508
+ "display_name": METER_META[meter]["display"],
509
+ "building_note": METER_META[meter]["note"],
510
+ "rows_total": total_points,
511
+ "observed_rows": item.count,
512
+ "missing_rows": item.missing,
513
+ "missing_pct": fmt(item.missing / total_points * 100 if total_points else math.nan),
514
+ "zero_pct": fmt(item.zero / item.count * 100 if item.count else math.nan),
515
+ "negative_pct": fmt(item.negative / item.count * 100 if item.count else math.nan),
516
+ "mean_kw": fmt(item.mean_kw),
517
+ "std_kw": fmt(item.std_kw),
518
+ "min_kw": fmt(item.min_kw),
519
+ "p25_kw": fmt(quantile(values, 0.25)),
520
+ "median_kw": fmt(quantile(values, 0.50)),
521
+ "p75_kw": fmt(quantile(values, 0.75)),
522
+ "p95_kw": fmt(quantile(values, 0.95)),
523
+ "max_kw": fmt(item.max_kw),
524
+ "first_seen_ist": item.first_seen or "",
525
+ "last_seen_ist": item.last_seen or "",
526
+ }
527
+ )
528
+
529
+ write_csv(
530
+ OUT_DIR / "building_preprocessing_summary.csv",
531
+ list(summary_rows[0].keys()),
532
+ summary_rows,
533
+ )
534
+ write_csv(
535
+ FULL_EDA_DIR / "01_common_building_power_summary.csv",
536
+ list(summary_rows[0].keys()),
537
+ summary_rows,
538
+ )
539
+
540
+ common_flow_rows = []
541
+ for row in summary_rows:
542
+ meter = row["meter"]
543
+ weekdays = [weekday_profile[(meter, str(day))].mean_kw for day in range(5)]
544
+ weekends = [weekday_profile[(meter, str(day))].mean_kw for day in range(5, 7)]
545
+ weekday_mean = statistics.fmean([value for value in weekdays if not math.isnan(value)])
546
+ weekend_mean = statistics.fmean([value for value in weekends if not math.isnan(value)])
547
+ mean_kw = float(row["mean_kw"]) if row["mean_kw"] else math.nan
548
+ median_kw = float(row["median_kw"]) if row["median_kw"] else math.nan
549
+ std_kw = float(row["std_kw"]) if row["std_kw"] else math.nan
550
+ p95_kw = float(row["p95_kw"]) if row["p95_kw"] else math.nan
551
+ missing_pct = float(row["missing_pct"]) if row["missing_pct"] else math.nan
552
+ zero_pct = float(row["zero_pct"]) if row["zero_pct"] else math.nan
553
+ cv = std_kw / mean_kw if mean_kw else math.nan
554
+ p95_to_median = p95_kw / median_kw if median_kw else math.nan
555
+ common_flow_rows.append(
556
+ {
557
+ "meter": meter,
558
+ "building_type": row["building_type"],
559
+ "display_name": row["display_name"],
560
+ "building_note": row["building_note"],
561
+ "quality_label": quality_label(missing_pct, zero_pct),
562
+ "missing_pct": row["missing_pct"],
563
+ "zero_pct": row["zero_pct"],
564
+ "mean_kw": row["mean_kw"],
565
+ "median_kw": row["median_kw"],
566
+ "p95_kw": row["p95_kw"],
567
+ "max_kw": row["max_kw"],
568
+ "cv": fmt(cv),
569
+ "p95_to_median": fmt(p95_to_median),
570
+ "weekday_mean_kw": fmt(weekday_mean),
571
+ "weekend_mean_kw": fmt(weekend_mean),
572
+ "weekend_delta_pct": fmt((weekend_mean - weekday_mean) / weekday_mean * 100 if weekday_mean else math.nan),
573
+ "first_seen_ist": row["first_seen_ist"],
574
+ "last_seen_ist": row["last_seen_ist"],
575
+ }
576
+ )
577
+ write_csv(
578
+ EDA_DIR / "all_meters_common_flow_summary.csv",
579
+ [
580
+ "meter",
581
+ "building_type",
582
+ "display_name",
583
+ "building_note",
584
+ "quality_label",
585
+ "missing_pct",
586
+ "zero_pct",
587
+ "mean_kw",
588
+ "median_kw",
589
+ "p95_kw",
590
+ "max_kw",
591
+ "cv",
592
+ "p95_to_median",
593
+ "weekday_mean_kw",
594
+ "weekend_mean_kw",
595
+ "weekend_delta_pct",
596
+ "first_seen_ist",
597
+ "last_seen_ist",
598
+ ],
599
+ common_flow_rows,
600
+ )
601
+ write_csv(
602
+ FULL_EDA_DIR / "02_common_all_meters_flow_summary.csv",
603
+ [
604
+ "meter",
605
+ "building_type",
606
+ "display_name",
607
+ "building_note",
608
+ "quality_label",
609
+ "missing_pct",
610
+ "zero_pct",
611
+ "mean_kw",
612
+ "median_kw",
613
+ "p95_kw",
614
+ "max_kw",
615
+ "cv",
616
+ "p95_to_median",
617
+ "weekday_mean_kw",
618
+ "weekend_mean_kw",
619
+ "weekend_delta_pct",
620
+ "first_seen_ist",
621
+ "last_seen_ist",
622
+ ],
623
+ common_flow_rows,
624
+ )
625
+
626
+ transformer_rows = summarize_transformer_power()
627
+ if transformer_rows:
628
+ write_csv(
629
+ FULL_EDA_DIR / "03_common_transformer_power_summary.csv",
630
+ list(transformer_rows[0].keys()),
631
+ transformer_rows,
632
+ )
633
+ svg_bar_chart(
634
+ FULL_EDA_DIR / "charts" / "common_transformer_mean_power.svg",
635
+ "Transformer mean power",
636
+ [(row["transformer"], float(row["mean_kw"])) for row in transformer_rows if row["mean_kw"]],
637
+ )
638
+
639
+ ranked_rows = sorted(common_flow_rows, key=lambda row: (float(row["missing_pct"]), float(row["zero_pct"])))
640
+ svg_bar_chart(
641
+ FULL_EDA_DIR / "charts" / "common_building_mean_power_ranking.svg",
642
+ "Building mean power ranking",
643
+ [(row["meter"], float(row["mean_kw"])) for row in sorted(common_flow_rows, key=lambda item: float(item["mean_kw"]), reverse=True)],
644
+ )
645
+ svg_bar_chart(
646
+ FULL_EDA_DIR / "charts" / "common_building_missing_pct.svg",
647
+ "Building missing data percentage",
648
+ [(row["meter"], float(row["missing_pct"])) for row in sorted(common_flow_rows, key=lambda item: float(item["missing_pct"]), reverse=True)],
649
+ y_label="%",
650
+ )
651
+ all_building_lines = [
652
+ "# All 9 Buildings Common EDA",
653
+ "",
654
+ "Each building meter is analyzed with the same flow so the outputs can be compared directly.",
655
+ "",
656
+ "## Analysis aspects before EDA",
657
+ "",
658
+ "| Aspect | Why it matters |",
659
+ "| --- | --- |",
660
+ ]
661
+ for item in EDA_ASPECTS:
662
+ all_building_lines.append(f"| {item['aspect']} | {item['why']} |")
663
+ all_building_lines.extend(["", "## Common flow", ""])
664
+ all_building_lines.extend([f"- {step}" for step in COMMON_EDA_FLOW])
665
+ all_building_lines.extend(
666
+ [
667
+ "",
668
+ "## Building annotations",
669
+ "",
670
+ "| Meter | Display name | Type | Note |",
671
+ "| --- | --- | --- | --- |",
672
+ ]
673
+ )
674
+ for row in common_flow_rows:
675
+ all_building_lines.append(
676
+ f"| {row['meter']} | {row['display_name']} | {row['building_type']} | {row['building_note']} |"
677
+ )
678
+ all_building_lines.extend(
679
+ [
680
+ "",
681
+ "## 9-building comparable summary",
682
+ "",
683
+ "| Rank by quality | Meter | Type | Quality | Missing % | Zero % | Mean kW | Median kW | P95 kW | CV | Weekend delta % |",
684
+ "| ---: | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
685
+ ]
686
+ )
687
+ for rank, row in enumerate(ranked_rows, start=1):
688
+ all_building_lines.append(
689
+ f"| {rank} | {row['meter']} | {row['building_type']} | {row['quality_label']} | "
690
+ f"{row['missing_pct']} | {row['zero_pct']} | {row['mean_kw']} | {row['median_kw']} | "
691
+ f"{row['p95_kw']} | {row['cv']} | {row['weekend_delta_pct']} |"
692
+ )
693
+ all_building_lines.extend(
694
+ [
695
+ "",
696
+ "## Recommended interpretation order",
697
+ "",
698
+ "- Start with `quality_label`, `missing_pct`, and `zero_pct`.",
699
+ "- Use `mean_kw`, `median_kw`, and `p95_kw` for load ranking.",
700
+ "- Use `cv` and `p95_to_median` for volatility.",
701
+ "- Use `weekend_delta_pct` for schedule sensitivity.",
702
+ "- Drill into each `per_meter/*_eda.md` file for the same meter-level flow.",
703
+ ]
704
+ )
705
+ (EDA_DIR / "all_buildings_common_eda.md").write_text("\n".join(all_building_lines) + "\n", encoding="utf-8")
706
+ (FULL_EDA_DIR / "04_common_all_buildings_eda.md").write_text("\n".join(all_building_lines) + "\n", encoding="utf-8")
707
+
708
+ hourly_rows = []
709
+ for (meter, hour_key), bucket in sorted(hourly.items(), key=lambda item: (item[0][0], item[0][1])):
710
+ hourly_rows.append(
711
+ {
712
+ "datetime_hour_ist": hour_key,
713
+ "meter": meter,
714
+ "building_type": METER_META[meter]["type"],
715
+ "mean_kw": fmt(bucket.mean_kw),
716
+ "observed_minutes": bucket.count,
717
+ "missing_minutes": bucket.missing,
718
+ "availability_pct": fmt(bucket.availability_pct),
719
+ "approx_kwh": fmt(bucket.mean_kw if bucket.count else math.nan),
720
+ }
721
+ )
722
+ write_csv(
723
+ OUT_DIR / "building_power_hourly_clean.csv",
724
+ [
725
+ "datetime_hour_ist",
726
+ "meter",
727
+ "building_type",
728
+ "mean_kw",
729
+ "observed_minutes",
730
+ "missing_minutes",
731
+ "availability_pct",
732
+ "approx_kwh",
733
+ ],
734
+ hourly_rows,
735
+ )
736
+
737
+ daily_rows = []
738
+ for (meter, day_key), bucket in sorted(daily.items(), key=lambda item: (item[0][0], item[0][1])):
739
+ daily_rows.append(
740
+ {
741
+ "date_ist": day_key,
742
+ "meter": meter,
743
+ "building_type": METER_META[meter]["type"],
744
+ "mean_kw": fmt(bucket.mean_kw),
745
+ "observed_minutes": bucket.count,
746
+ "missing_minutes": bucket.missing,
747
+ "availability_pct": fmt(bucket.availability_pct),
748
+ "approx_kwh": fmt(bucket.mean_kw * 24 if bucket.count else math.nan),
749
+ }
750
+ )
751
+ write_csv(
752
+ OUT_DIR / "building_power_daily_clean.csv",
753
+ [
754
+ "date_ist",
755
+ "meter",
756
+ "building_type",
757
+ "mean_kw",
758
+ "observed_minutes",
759
+ "missing_minutes",
760
+ "availability_pct",
761
+ "approx_kwh",
762
+ ],
763
+ daily_rows,
764
+ )
765
+
766
+ type_hourly_rows = []
767
+ for (building_type, hour_key), bucket in sorted(type_hourly.items(), key=lambda item: (item[0][0], item[0][1])):
768
+ type_hourly_rows.append(
769
+ {
770
+ "datetime_hour_ist": hour_key,
771
+ "building_type": building_type,
772
+ "mean_kw": fmt(bucket.mean_kw),
773
+ "observed_meter_values": bucket.count,
774
+ "missing_meter_values": bucket.missing,
775
+ "availability_pct": fmt(bucket.availability_pct),
776
+ }
777
+ )
778
+ write_csv(
779
+ OUT_DIR / "building_type_hourly_clean.csv",
780
+ [
781
+ "datetime_hour_ist",
782
+ "building_type",
783
+ "mean_kw",
784
+ "observed_meter_values",
785
+ "missing_meter_values",
786
+ "availability_pct",
787
+ ],
788
+ type_hourly_rows,
789
+ )
790
+
791
+ type_daily_rows = []
792
+ for (building_type, day_key), bucket in sorted(type_daily.items(), key=lambda item: (item[0][0], item[0][1])):
793
+ type_daily_rows.append(
794
+ {
795
+ "date_ist": day_key,
796
+ "building_type": building_type,
797
+ "mean_kw": fmt(bucket.mean_kw),
798
+ "observed_meter_values": bucket.count,
799
+ "missing_meter_values": bucket.missing,
800
+ "availability_pct": fmt(bucket.availability_pct),
801
+ "approx_kwh": fmt(bucket.mean_kw * 24 if bucket.count else math.nan),
802
+ }
803
+ )
804
+ write_csv(
805
+ OUT_DIR / "building_type_daily_clean.csv",
806
+ [
807
+ "date_ist",
808
+ "building_type",
809
+ "mean_kw",
810
+ "observed_meter_values",
811
+ "missing_meter_values",
812
+ "availability_pct",
813
+ "approx_kwh",
814
+ ],
815
+ type_daily_rows,
816
+ )
817
+
818
+ for meter in METERS:
819
+ meter_slug = meter.lower()
820
+ building_dir = FULL_EDA_DIR / "buildings" / meter_slug
821
+ hourly_profile_rows = []
822
+ for hour in [f"{i:02d}" for i in range(24)]:
823
+ bucket = hour_profile[(meter, hour)]
824
+ hourly_profile_rows.append(
825
+ {
826
+ "hour_ist": hour,
827
+ "mean_kw": fmt(bucket.mean_kw),
828
+ "observed_points": bucket.count,
829
+ "availability_pct": fmt(bucket.availability_pct),
830
+ }
831
+ )
832
+ write_csv(
833
+ EDA_DIR / "per_meter" / f"{meter_slug}_hourly_profile.csv",
834
+ ["hour_ist", "mean_kw", "observed_points", "availability_pct"],
835
+ hourly_profile_rows,
836
+ )
837
+ write_csv(
838
+ building_dir / "hourly_profile.csv",
839
+ ["hour_ist", "mean_kw", "observed_points", "availability_pct"],
840
+ hourly_profile_rows,
841
+ )
842
+
843
+ weekday_rows = []
844
+ for weekday in range(7):
845
+ bucket = weekday_profile[(meter, str(weekday))]
846
+ weekday_rows.append(
847
+ {
848
+ "weekday": weekday,
849
+ "weekday_name": ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][weekday],
850
+ "mean_kw": fmt(bucket.mean_kw),
851
+ "observed_points": bucket.count,
852
+ "availability_pct": fmt(bucket.availability_pct),
853
+ }
854
+ )
855
+ write_csv(
856
+ EDA_DIR / "per_meter" / f"{meter_slug}_weekday_profile.csv",
857
+ ["weekday", "weekday_name", "mean_kw", "observed_points", "availability_pct"],
858
+ weekday_rows,
859
+ )
860
+ write_csv(
861
+ building_dir / "weekday_profile.csv",
862
+ ["weekday", "weekday_name", "mean_kw", "observed_points", "availability_pct"],
863
+ weekday_rows,
864
+ )
865
+
866
+ monthly_rows = []
867
+ for (profile_meter, month), bucket in sorted(month_profile.items()):
868
+ if profile_meter != meter:
869
+ continue
870
+ monthly_rows.append(
871
+ {
872
+ "month_ist": month,
873
+ "mean_kw": fmt(bucket.mean_kw),
874
+ "observed_points": bucket.count,
875
+ "availability_pct": fmt(bucket.availability_pct),
876
+ }
877
+ )
878
+ write_csv(
879
+ EDA_DIR / "per_meter" / f"{meter_slug}_monthly_profile.csv",
880
+ ["month_ist", "mean_kw", "observed_points", "availability_pct"],
881
+ monthly_rows,
882
+ )
883
+ write_csv(
884
+ building_dir / "monthly_profile.csv",
885
+ ["month_ist", "mean_kw", "observed_points", "availability_pct"],
886
+ monthly_rows,
887
+ )
888
+
889
+ peak_rows = [
890
+ {"rank": index + 1, "datetime_ist": timestamp, "kw": fmt(value)}
891
+ for index, (value, timestamp) in enumerate(top_peaks[meter])
892
+ ]
893
+ write_csv(EDA_DIR / "per_meter" / f"{meter_slug}_top_peaks.csv", ["rank", "datetime_ist", "kw"], peak_rows)
894
+ write_csv(building_dir / "top_peaks.csv", ["rank", "datetime_ist", "kw"], peak_rows)
895
+ svg_line_chart(
896
+ EDA_DIR / "charts" / f"{meter_slug}_hourly_profile.svg",
897
+ f"{METER_META[meter]['display']} - mean hourly load",
898
+ [(row["hour_ist"], float(row["mean_kw"]) if row["mean_kw"] else math.nan) for row in hourly_profile_rows],
899
+ )
900
+ svg_line_chart(
901
+ building_dir / "hourly_profile.svg",
902
+ f"{METER_META[meter]['display']} - mean hourly load",
903
+ [(row["hour_ist"], float(row["mean_kw"]) if row["mean_kw"] else math.nan) for row in hourly_profile_rows],
904
+ )
905
+
906
+ summary = next(row for row in summary_rows if row["meter"] == meter)
907
+ common_summary = next(row for row in common_flow_rows if row["meter"] == meter)
908
+ common_flow_md = "\n".join([f"- {step}" for step in COMMON_EDA_FLOW])
909
+ md = f"""# {METER_META[meter]["display"]} EDA
910
+
911
+ ## Common EDA flow
912
+
913
+ {common_flow_md}
914
+
915
+ ## Building note
916
+
917
+ {METER_META[meter]["note"]}
918
+
919
+ ## Preprocessing notes
920
+
921
+ - Source: `energy_dataset/all_buildings_power.csv`
922
+ - Timezone: UNIX timestamp converted to `Asia/Kolkata` (`+05:30`)
923
+ - Unit conversion: watts to kW
924
+ - Missing values: `NA`/blank kept as missing during aggregation
925
+
926
+ ## Key metrics
927
+
928
+ | Metric | Value |
929
+ | --- | ---: |
930
+ | Building type | {summary["building_type"]} |
931
+ | Observed rows | {summary["observed_rows"]} |
932
+ | Missing rows | {summary["missing_rows"]} |
933
+ | Missing % | {summary["missing_pct"]} |
934
+ | Mean kW | {summary["mean_kw"]} |
935
+ | Median kW | {summary["median_kw"]} |
936
+ | P95 kW | {summary["p95_kw"]} |
937
+ | Max kW | {summary["max_kw"]} |
938
+ | Zero % of observed | {summary["zero_pct"]} |
939
+ | Quality label | {common_summary["quality_label"]} |
940
+ | Coefficient of variation | {common_summary["cv"]} |
941
+ | P95 / median | {common_summary["p95_to_median"]} |
942
+ | Weekday mean kW | {common_summary["weekday_mean_kw"]} |
943
+ | Weekend mean kW | {common_summary["weekend_mean_kw"]} |
944
+ | Weekend delta % | {common_summary["weekend_delta_pct"]} |
945
+
946
+ ## How to read this meter
947
+
948
+ - Use missing % and zero % first; these decide how much confidence to put in the patterns.
949
+ - Use mean/median/p95/max to understand normal load versus stress load.
950
+ - Use hourly, weekday, and monthly profiles to separate schedule effects from long-term changes.
951
+ - Use top peaks to inspect unusual operating days or demand-response opportunities.
952
+
953
+ ## Files
954
+
955
+ - `{meter_slug}_hourly_profile.csv`
956
+ - `{meter_slug}_weekday_profile.csv`
957
+ - `{meter_slug}_monthly_profile.csv`
958
+ - `{meter_slug}_top_peaks.csv`
959
+ - `../charts/{meter_slug}_hourly_profile.svg`
960
+ """
961
+ (EDA_DIR / "per_meter" / f"{meter_slug}_eda.md").write_text(md, encoding="utf-8")
962
+ structured_md = md.replace(
963
+ f"- `{meter_slug}_hourly_profile.csv`\n"
964
+ f"- `{meter_slug}_weekday_profile.csv`\n"
965
+ f"- `{meter_slug}_monthly_profile.csv`\n"
966
+ f"- `{meter_slug}_top_peaks.csv`\n"
967
+ f"- `../charts/{meter_slug}_hourly_profile.svg`",
968
+ "- `hourly_profile.csv`\n"
969
+ "- `weekday_profile.csv`\n"
970
+ "- `monthly_profile.csv`\n"
971
+ "- `top_peaks.csv`\n"
972
+ "- `hourly_profile.svg`",
973
+ )
974
+ (building_dir / "README.md").write_text(structured_md, encoding="utf-8")
975
+
976
+ type_summary_rows = []
977
+ for building_type in sorted({meta["type"] for meta in METER_META.values()}):
978
+ meters = [meter for meter in METERS if METER_META[meter]["type"] == building_type]
979
+ selected = [row for row in summary_rows if row["meter"] in meters]
980
+ observed = sum(int(row["observed_rows"]) for row in selected)
981
+ missing = sum(int(row["missing_rows"]) for row in selected)
982
+ mean_values = [float(row["mean_kw"]) for row in selected if row["mean_kw"]]
983
+ type_summary_rows.append(
984
+ {
985
+ "building_type": building_type,
986
+ "meters": ", ".join(meters),
987
+ "observed_rows": observed,
988
+ "missing_rows": missing,
989
+ "missing_pct": fmt(missing / (missing + observed) * 100 if missing + observed else math.nan),
990
+ "mean_of_meter_means_kw": fmt(statistics.fmean(mean_values) if mean_values else math.nan),
991
+ }
992
+ )
993
+
994
+ type_daily_points = []
995
+ for row in type_daily_rows:
996
+ if row["building_type"] == building_type and row["mean_kw"]:
997
+ type_daily_points.append((row["date_ist"], float(row["mean_kw"])))
998
+ svg_line_chart(
999
+ EDA_DIR / "charts" / f"{building_type}_daily_profile.svg",
1000
+ f"{building_type.title()} - daily mean load",
1001
+ type_daily_points,
1002
+ )
1003
+ svg_line_chart(
1004
+ FULL_EDA_DIR / "building_types" / building_type / "daily_profile.svg",
1005
+ f"{building_type.title()} - daily mean load",
1006
+ type_daily_points,
1007
+ )
1008
+
1009
+ md_lines = [
1010
+ f"# {building_type.title()} Building Type EDA",
1011
+ "",
1012
+ "## Included meters",
1013
+ "",
1014
+ ", ".join(meters),
1015
+ "",
1016
+ "## Meter summary",
1017
+ "",
1018
+ "| Meter | Mean kW | Median kW | P95 kW | Missing % | Max kW |",
1019
+ "| --- | ---: | ---: | ---: | ---: | ---: |",
1020
+ ]
1021
+ for row in selected:
1022
+ md_lines.append(
1023
+ f"| {row['meter']} | {row['mean_kw']} | {row['median_kw']} | {row['p95_kw']} | {row['missing_pct']} | {row['max_kw']} |"
1024
+ )
1025
+ md_lines.extend(
1026
+ [
1027
+ "",
1028
+ "## Files",
1029
+ "",
1030
+ "- `../../preprocessed_outputs/building_type_hourly_clean.csv`",
1031
+ "- `../../preprocessed_outputs/building_type_daily_clean.csv`",
1032
+ f"- `../charts/{building_type}_daily_profile.svg`",
1033
+ ]
1034
+ )
1035
+ (EDA_DIR / "by_type" / f"{building_type}_eda.md").write_text("\n".join(md_lines) + "\n", encoding="utf-8")
1036
+ (FULL_EDA_DIR / "building_types" / building_type / "README.md").write_text(
1037
+ "\n".join(md_lines) + "\n",
1038
+ encoding="utf-8",
1039
+ )
1040
+
1041
+ write_csv(
1042
+ EDA_DIR / "building_type_summary.csv",
1043
+ ["building_type", "meters", "observed_rows", "missing_rows", "missing_pct", "mean_of_meter_means_kw"],
1044
+ type_summary_rows,
1045
+ )
1046
+ write_csv(
1047
+ FULL_EDA_DIR / "05_common_building_type_summary.csv",
1048
+ ["building_type", "meters", "observed_rows", "missing_rows", "missing_pct", "mean_of_meter_means_kw"],
1049
+ type_summary_rows,
1050
+ )
1051
+
1052
+ index_lines = [
1053
+ "# Building Energy Preprocessing and EDA Index",
1054
+ "",
1055
+ f"Source rows processed: `{row_count}`",
1056
+ "",
1057
+ "## Start here",
1058
+ "",
1059
+ "- `eda_analysis_plan.md`",
1060
+ "- `all_buildings_common_eda.md`",
1061
+ "- `all_meters_common_flow_summary.csv`",
1062
+ "",
1063
+ "## Preprocessed outputs",
1064
+ "",
1065
+ "- `preprocessed_outputs/building_preprocessing_summary.csv`",
1066
+ "- `preprocessed_outputs/building_power_hourly_clean.csv`",
1067
+ "- `preprocessed_outputs/building_power_daily_clean.csv`",
1068
+ "- `preprocessed_outputs/building_type_hourly_clean.csv`",
1069
+ "- `preprocessed_outputs/building_type_daily_clean.csv`",
1070
+ "",
1071
+ "## Per-meter reports",
1072
+ "",
1073
+ ]
1074
+ for meter in METERS:
1075
+ index_lines.append(f"- `per_meter/{meter.lower()}_eda.md`")
1076
+ index_lines.extend(["", "## Per-building-type reports", ""])
1077
+ for row in type_summary_rows:
1078
+ index_lines.append(f"- `by_type/{row['building_type']}_eda.md`")
1079
+ (EDA_DIR / "README.md").write_text("\n".join(index_lines) + "\n", encoding="utf-8")
1080
+
1081
+ full_index_lines = [
1082
+ "# Full Energy EDA Output",
1083
+ "",
1084
+ "Common files are kept at this top level. Building-specific EDA is kept inside `buildings/`.",
1085
+ "",
1086
+ "## Common outputs",
1087
+ "",
1088
+ "- `00_eda_scope_and_flow.md`",
1089
+ "- `01_common_building_power_summary.csv`",
1090
+ "- `02_common_all_meters_flow_summary.csv`",
1091
+ "- `03_common_transformer_power_summary.csv`",
1092
+ "- `04_common_all_buildings_eda.md`",
1093
+ "- `05_common_building_type_summary.csv`",
1094
+ "- `charts/common_building_mean_power_ranking.svg`",
1095
+ "- `charts/common_building_missing_pct.svg`",
1096
+ "- `charts/common_transformer_mean_power.svg`",
1097
+ "",
1098
+ "## Per-building outputs",
1099
+ "",
1100
+ ]
1101
+ for meter in METERS:
1102
+ full_index_lines.append(f"- `buildings/{meter.lower()}/README.md`")
1103
+ full_index_lines.extend(["", "## Per-building-type outputs", ""])
1104
+ for row in type_summary_rows:
1105
+ full_index_lines.append(f"- `building_types/{row['building_type']}/README.md`")
1106
+ (FULL_EDA_DIR / "README.md").write_text("\n".join(full_index_lines) + "\n", encoding="utf-8")
1107
+
1108
+
1109
+ if __name__ == "__main__":  # Script entry point: call main() only when this file is executed directly, not when it is imported.
1110
+ main()