Spaces:

ahanbose
/

WISE_Energy

Running

File size: 7,114 Bytes

5a77372

"""
core/waste_parser.py
────────────────────────────────────────────────────────────────────
SPJIMR Waste Analytics — Excel Parser
Reads the daily block-wise waste log (Book2.xlsx format) and returns
clean DataFrames for the analytics and gamification pages.

Data format in Excel:
  Row with datetime  → date header (every 6 rows)
  Row 'Waste type'   → column headers
  Row 'Wet waste'    → per-block wet waste kg
  Row 'Dry waste'    → per-block dry waste kg
  Row 'TOTAL (Kg)'   → skip (recalculated here)
  Blank row          → separator
"""

from __future__ import annotations
import io
import logging
from datetime import datetime, date
from pathlib import Path
from typing import Union

import pandas as pd

logger = logging.getLogger(__name__)

# Canonical block/location names as they appear in the spreadsheet
LOCATIONS = [
    "A&B Block",
    "C&D Block",
    "L H Hostel",
    "Hostel no -25",
    "Hostel no -26",
    "Hostel no -27",
    "Hostel no -28",
    "Hostel no -29",
    "Hostel no -30",
    "cantean",
    "MESS",
]

# Friendly display names
LOCATION_LABELS = {
    "A&B Block":     "A & B Block",
    "C&D Block":     "C & D Block",
    "L H Hostel":    "LH Hostel",
    "Hostel no -25": "Hostel 25",
    "Hostel no -26": "Hostel 26",
    "Hostel no -27": "Hostel 27",
    "Hostel no -28": "Hostel 28",
    "Hostel no -29": "Hostel 29",
    "Hostel no -30": "Hostel 30",
    "cantean":       "Canteen",
    "MESS":          "Mess",
}

# Category grouping for gamification scoring
LOCATION_GROUPS = {
    "Academic":  ["A&B Block", "C&D Block"],
    "Hostels":   ["L H Hostel", "Hostel no -25", "Hostel no -26",
                  "Hostel no -27", "Hostel no -28", "Hostel no -29", "Hostel no -30"],
    "Dining":    ["cantean", "MESS"],
}


def _safe_float(val) -> float:
    """Return float or 0.0 for None / formula strings."""
    if val is None:
        return 0.0
    if isinstance(val, str) and val.startswith("="):
        return 0.0
    try:
        return float(val)
    except (TypeError, ValueError):
        return 0.0


def parse_waste_excel(source: Union[str, Path, bytes, io.BytesIO]) -> pd.DataFrame:
    """
    Parse the waste-log Excel file into a tidy long-form DataFrame.

    Parameters
    ----------
    source : file path (str/Path), raw bytes, or BytesIO

    Returns
    -------
    pd.DataFrame with columns:
        date        datetime.date
        location    str  (one of LOCATIONS)
        label       str  (friendly display name)
        group       str  (Academic | Hostels | Dining)
        wet_kg      float
        dry_kg      float
        total_kg    float
        month       str  (e.g. "Apr 2025")
        week        int  (ISO week number)
    """
    if isinstance(source, (str, Path)):
        import openpyxl
        wb = openpyxl.load_workbook(str(source), data_only=True)
    elif isinstance(source, bytes):
        import openpyxl
        wb = openpyxl.load_workbook(io.BytesIO(source), data_only=True)
    else:
        import openpyxl
        wb = openpyxl.load_workbook(source, data_only=True)

    ws = wb.active
    rows = list(ws.iter_rows(values_only=True))

    records: list[dict] = []
    current_date: date | None = None
    wet_row: dict | None  = None

    for raw_row in rows:
        cell0 = raw_row[0]

        # ── Date header ──────────────────────────────────────────────────
        if isinstance(cell0, datetime):
            current_date = cell0.date()
            wet_row = None
            continue

        if not current_date:
            continue

        if not isinstance(cell0, str):
            continue

        tag = cell0.strip().lower()

        # ── Wet waste row ────────────────────────────────────────────────
        if tag == "wet waste":
            wet_row = {loc: _safe_float(raw_row[i + 1]) for i, loc in enumerate(LOCATIONS)}
            continue

        # ── Dry waste row — complete the record ─────────────────────────
        if tag == "dry waste" and wet_row is not None:
            dry_row = {loc: _safe_float(raw_row[i + 1]) for i, loc in enumerate(LOCATIONS)}

            for loc in LOCATIONS:
                w = wet_row.get(loc, 0.0)
                d = dry_row.get(loc, 0.0)
                records.append(
                    {
                        "date":     current_date,
                        "location": loc,
                        "label":    LOCATION_LABELS[loc],
                        "group":    next(
                            (g for g, locs in LOCATION_GROUPS.items() if loc in locs),
                            "Other",
                        ),
                        "wet_kg":   w,
                        "dry_kg":   d,
                        "total_kg": w + d,
                    }
                )
            wet_row = None
            continue

    if not records:
        logger.warning("waste_parser: no records parsed from file.")
        return pd.DataFrame()

    df = pd.DataFrame(records)
    df["date"]  = pd.to_datetime(df["date"])
    df["month"] = df["date"].dt.strftime("%b %Y")
    df["week"]  = df["date"].dt.isocalendar().week.astype(int)
    df["day"]   = df["date"].dt.day_name()

    logger.info(
        "waste_parser: parsed %d records | %d locations | %d days",
        len(df), df["location"].nunique(), df["date"].nunique(),
    )
    return df


# ── Aggregation helpers used by both analytics and gamification pages ─────────

def monthly_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Total wet / dry / total per location per month."""
    return (
        df.groupby(["month", "location", "label", "group"], as_index=False)
        .agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum"))
        .sort_values("total_kg", ascending=False)
    )


def daily_totals(df: pd.DataFrame) -> pd.DataFrame:
    """Campus-wide daily totals (all locations summed)."""
    return (
        df.groupby("date", as_index=False)
        .agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum"))
        .sort_values("date")
    )


def block_weekly(df: pd.DataFrame) -> pd.DataFrame:
    """Weekly totals per location."""
    return (
        df.groupby(["week", "location", "label", "group"], as_index=False)
        .agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum"))
    )


def wet_dry_ratio(df: pd.DataFrame) -> pd.DataFrame:
    """Wet-to-dry ratio per location (monthly)."""
    m = monthly_summary(df)
    m["wet_pct"] = (m["wet_kg"] / m["total_kg"].replace(0, float("nan")) * 100).round(1)
    m["dry_pct"] = (m["dry_kg"] / m["total_kg"].replace(0, float("nan")) * 100).round(1)
    return m