""" core/waste_parser.py ──────────────────────────────────────────────────────────────────── SPJIMR Waste Analytics — Excel Parser Reads the daily block-wise waste log (Book2.xlsx format) and returns clean DataFrames for the analytics and gamification pages. Data format in Excel: Row with datetime → date header (every 6 rows) Row 'Waste type' → column headers Row 'Wet waste' → per-block wet waste kg Row 'Dry waste' → per-block dry waste kg Row 'TOTAL (Kg)' → skip (recalculated here) Blank row → separator """ from __future__ import annotations import io import logging from datetime import datetime, date from pathlib import Path from typing import Union import pandas as pd logger = logging.getLogger(__name__) # Canonical block/location names as they appear in the spreadsheet LOCATIONS = [ "A&B Block", "C&D Block", "L H Hostel", "Hostel no -25", "Hostel no -26", "Hostel no -27", "Hostel no -28", "Hostel no -29", "Hostel no -30", "cantean", "MESS", ] # Friendly display names LOCATION_LABELS = { "A&B Block": "A & B Block", "C&D Block": "C & D Block", "L H Hostel": "LH Hostel", "Hostel no -25": "Hostel 25", "Hostel no -26": "Hostel 26", "Hostel no -27": "Hostel 27", "Hostel no -28": "Hostel 28", "Hostel no -29": "Hostel 29", "Hostel no -30": "Hostel 30", "cantean": "Canteen", "MESS": "Mess", } # Category grouping for gamification scoring LOCATION_GROUPS = { "Academic": ["A&B Block", "C&D Block"], "Hostels": ["L H Hostel", "Hostel no -25", "Hostel no -26", "Hostel no -27", "Hostel no -28", "Hostel no -29", "Hostel no -30"], "Dining": ["cantean", "MESS"], } def _safe_float(val) -> float: """Return float or 0.0 for None / formula strings.""" if val is None: return 0.0 if isinstance(val, str) and val.startswith("="): return 0.0 try: return float(val) except (TypeError, ValueError): return 0.0 def parse_waste_excel(source: Union[str, Path, bytes, io.BytesIO]) -> pd.DataFrame: """ Parse the waste-log Excel file into a tidy long-form DataFrame. Parameters ---------- source : file path (str/Path), raw bytes, or BytesIO Returns ------- pd.DataFrame with columns: date datetime.date location str (one of LOCATIONS) label str (friendly display name) group str (Academic | Hostels | Dining) wet_kg float dry_kg float total_kg float month str (e.g. "Apr 2025") week int (ISO week number) """ if isinstance(source, (str, Path)): import openpyxl wb = openpyxl.load_workbook(str(source), data_only=True) elif isinstance(source, bytes): import openpyxl wb = openpyxl.load_workbook(io.BytesIO(source), data_only=True) else: import openpyxl wb = openpyxl.load_workbook(source, data_only=True) ws = wb.active rows = list(ws.iter_rows(values_only=True)) records: list[dict] = [] current_date: date | None = None wet_row: dict | None = None for raw_row in rows: cell0 = raw_row[0] # ── Date header ────────────────────────────────────────────────── if isinstance(cell0, datetime): current_date = cell0.date() wet_row = None continue if not current_date: continue if not isinstance(cell0, str): continue tag = cell0.strip().lower() # ── Wet waste row ──────────────────────────────────────────────── if tag == "wet waste": wet_row = {loc: _safe_float(raw_row[i + 1]) for i, loc in enumerate(LOCATIONS)} continue # ── Dry waste row — complete the record ───────────────────────── if tag == "dry waste" and wet_row is not None: dry_row = {loc: _safe_float(raw_row[i + 1]) for i, loc in enumerate(LOCATIONS)} for loc in LOCATIONS: w = wet_row.get(loc, 0.0) d = dry_row.get(loc, 0.0) records.append( { "date": current_date, "location": loc, "label": LOCATION_LABELS[loc], "group": next( (g for g, locs in LOCATION_GROUPS.items() if loc in locs), "Other", ), "wet_kg": w, "dry_kg": d, "total_kg": w + d, } ) wet_row = None continue if not records: logger.warning("waste_parser: no records parsed from file.") return pd.DataFrame() df = pd.DataFrame(records) df["date"] = pd.to_datetime(df["date"]) df["month"] = df["date"].dt.strftime("%b %Y") df["week"] = df["date"].dt.isocalendar().week.astype(int) df["day"] = df["date"].dt.day_name() logger.info( "waste_parser: parsed %d records | %d locations | %d days", len(df), df["location"].nunique(), df["date"].nunique(), ) return df # ── Aggregation helpers used by both analytics and gamification pages ───────── def monthly_summary(df: pd.DataFrame) -> pd.DataFrame: """Total wet / dry / total per location per month.""" return ( df.groupby(["month", "location", "label", "group"], as_index=False) .agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum")) .sort_values("total_kg", ascending=False) ) def daily_totals(df: pd.DataFrame) -> pd.DataFrame: """Campus-wide daily totals (all locations summed).""" return ( df.groupby("date", as_index=False) .agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum")) .sort_values("date") ) def block_weekly(df: pd.DataFrame) -> pd.DataFrame: """Weekly totals per location.""" return ( df.groupby(["week", "location", "label", "group"], as_index=False) .agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum")) ) def wet_dry_ratio(df: pd.DataFrame) -> pd.DataFrame: """Wet-to-dry ratio per location (monthly).""" m = monthly_summary(df) m["wet_pct"] = (m["wet_kg"] / m["total_kg"].replace(0, float("nan")) * 100).round(1) m["dry_pct"] = (m["dry_kg"] / m["total_kg"].replace(0, float("nan")) * 100).round(1) return m