WISE_Energy / src /core /waste_parser.py
ahanbose's picture
Upload 3 files
5a77372 verified
"""
core/waste_parser.py
────────────────────────────────────────────────────────────────────
SPJIMR Waste Analytics β€” Excel Parser
Reads the daily block-wise waste log (Book2.xlsx format) and returns
clean DataFrames for the analytics and gamification pages.
Data format in Excel:
Row with datetime β†’ date header (every 6 rows)
Row 'Waste type' β†’ column headers
Row 'Wet waste' β†’ per-block wet waste kg
Row 'Dry waste' β†’ per-block dry waste kg
Row 'TOTAL (Kg)' β†’ skip (recalculated here)
Blank row β†’ separator
"""
from __future__ import annotations
import io
import logging
from datetime import datetime, date
from pathlib import Path
from typing import Union
import pandas as pd
logger = logging.getLogger(__name__)
# Canonical block/location names as they appear in the spreadsheet
LOCATIONS = [
"A&B Block",
"C&D Block",
"L H Hostel",
"Hostel no -25",
"Hostel no -26",
"Hostel no -27",
"Hostel no -28",
"Hostel no -29",
"Hostel no -30",
"cantean",
"MESS",
]
# Friendly display names
LOCATION_LABELS = {
"A&B Block": "A & B Block",
"C&D Block": "C & D Block",
"L H Hostel": "LH Hostel",
"Hostel no -25": "Hostel 25",
"Hostel no -26": "Hostel 26",
"Hostel no -27": "Hostel 27",
"Hostel no -28": "Hostel 28",
"Hostel no -29": "Hostel 29",
"Hostel no -30": "Hostel 30",
"cantean": "Canteen",
"MESS": "Mess",
}
# Category grouping for gamification scoring
LOCATION_GROUPS = {
"Academic": ["A&B Block", "C&D Block"],
"Hostels": ["L H Hostel", "Hostel no -25", "Hostel no -26",
"Hostel no -27", "Hostel no -28", "Hostel no -29", "Hostel no -30"],
"Dining": ["cantean", "MESS"],
}
def _safe_float(val) -> float:
"""Return float or 0.0 for None / formula strings."""
if val is None:
return 0.0
if isinstance(val, str) and val.startswith("="):
return 0.0
try:
return float(val)
except (TypeError, ValueError):
return 0.0
def parse_waste_excel(source: Union[str, Path, bytes, io.BytesIO]) -> pd.DataFrame:
"""
Parse the waste-log Excel file into a tidy long-form DataFrame.
Parameters
----------
source : file path (str/Path), raw bytes, or BytesIO
Returns
-------
pd.DataFrame with columns:
date datetime.date
location str (one of LOCATIONS)
label str (friendly display name)
group str (Academic | Hostels | Dining)
wet_kg float
dry_kg float
total_kg float
month str (e.g. "Apr 2025")
week int (ISO week number)
"""
if isinstance(source, (str, Path)):
import openpyxl
wb = openpyxl.load_workbook(str(source), data_only=True)
elif isinstance(source, bytes):
import openpyxl
wb = openpyxl.load_workbook(io.BytesIO(source), data_only=True)
else:
import openpyxl
wb = openpyxl.load_workbook(source, data_only=True)
ws = wb.active
rows = list(ws.iter_rows(values_only=True))
records: list[dict] = []
current_date: date | None = None
wet_row: dict | None = None
for raw_row in rows:
cell0 = raw_row[0]
# ── Date header ──────────────────────────────────────────────────
if isinstance(cell0, datetime):
current_date = cell0.date()
wet_row = None
continue
if not current_date:
continue
if not isinstance(cell0, str):
continue
tag = cell0.strip().lower()
# ── Wet waste row ────────────────────────────────────────────────
if tag == "wet waste":
wet_row = {loc: _safe_float(raw_row[i + 1]) for i, loc in enumerate(LOCATIONS)}
continue
# ── Dry waste row β€” complete the record ─────────────────────────
if tag == "dry waste" and wet_row is not None:
dry_row = {loc: _safe_float(raw_row[i + 1]) for i, loc in enumerate(LOCATIONS)}
for loc in LOCATIONS:
w = wet_row.get(loc, 0.0)
d = dry_row.get(loc, 0.0)
records.append(
{
"date": current_date,
"location": loc,
"label": LOCATION_LABELS[loc],
"group": next(
(g for g, locs in LOCATION_GROUPS.items() if loc in locs),
"Other",
),
"wet_kg": w,
"dry_kg": d,
"total_kg": w + d,
}
)
wet_row = None
continue
if not records:
logger.warning("waste_parser: no records parsed from file.")
return pd.DataFrame()
df = pd.DataFrame(records)
df["date"] = pd.to_datetime(df["date"])
df["month"] = df["date"].dt.strftime("%b %Y")
df["week"] = df["date"].dt.isocalendar().week.astype(int)
df["day"] = df["date"].dt.day_name()
logger.info(
"waste_parser: parsed %d records | %d locations | %d days",
len(df), df["location"].nunique(), df["date"].nunique(),
)
return df
# ── Aggregation helpers used by both analytics and gamification pages ─────────
def monthly_summary(df: pd.DataFrame) -> pd.DataFrame:
"""Total wet / dry / total per location per month."""
return (
df.groupby(["month", "location", "label", "group"], as_index=False)
.agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum"))
.sort_values("total_kg", ascending=False)
)
def daily_totals(df: pd.DataFrame) -> pd.DataFrame:
"""Campus-wide daily totals (all locations summed)."""
return (
df.groupby("date", as_index=False)
.agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum"))
.sort_values("date")
)
def block_weekly(df: pd.DataFrame) -> pd.DataFrame:
"""Weekly totals per location."""
return (
df.groupby(["week", "location", "label", "group"], as_index=False)
.agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum"))
)
def wet_dry_ratio(df: pd.DataFrame) -> pd.DataFrame:
"""Wet-to-dry ratio per location (monthly)."""
m = monthly_summary(df)
m["wet_pct"] = (m["wet_kg"] / m["total_kg"].replace(0, float("nan")) * 100).round(1)
m["dry_pct"] = (m["dry_kg"] / m["total_kg"].replace(0, float("nan")) * 100).round(1)
return m