Spaces:
Running
Running
File size: 7,114 Bytes
5a77372 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 | """
core/waste_parser.py
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
SPJIMR Waste Analytics β Excel Parser
Reads the daily block-wise waste log (Book2.xlsx format) and returns
clean DataFrames for the analytics and gamification pages.
Data format in Excel:
Row with datetime β date header (every 6 rows)
Row 'Waste type' β column headers
Row 'Wet waste' β per-block wet waste kg
Row 'Dry waste' β per-block dry waste kg
Row 'TOTAL (Kg)' β skip (recalculated here)
Blank row β separator
"""
from __future__ import annotations
import io
import logging
from datetime import datetime, date
from pathlib import Path
from typing import Union
import pandas as pd
logger = logging.getLogger(__name__)
# Canonical block/location names as they appear in the spreadsheet
LOCATIONS = [
"A&B Block",
"C&D Block",
"L H Hostel",
"Hostel no -25",
"Hostel no -26",
"Hostel no -27",
"Hostel no -28",
"Hostel no -29",
"Hostel no -30",
"cantean",
"MESS",
]
# Friendly display names
LOCATION_LABELS = {
"A&B Block": "A & B Block",
"C&D Block": "C & D Block",
"L H Hostel": "LH Hostel",
"Hostel no -25": "Hostel 25",
"Hostel no -26": "Hostel 26",
"Hostel no -27": "Hostel 27",
"Hostel no -28": "Hostel 28",
"Hostel no -29": "Hostel 29",
"Hostel no -30": "Hostel 30",
"cantean": "Canteen",
"MESS": "Mess",
}
# Category grouping for gamification scoring
LOCATION_GROUPS = {
"Academic": ["A&B Block", "C&D Block"],
"Hostels": ["L H Hostel", "Hostel no -25", "Hostel no -26",
"Hostel no -27", "Hostel no -28", "Hostel no -29", "Hostel no -30"],
"Dining": ["cantean", "MESS"],
}
def _safe_float(val) -> float:
"""Return float or 0.0 for None / formula strings."""
if val is None:
return 0.0
if isinstance(val, str) and val.startswith("="):
return 0.0
try:
return float(val)
except (TypeError, ValueError):
return 0.0
def parse_waste_excel(source: Union[str, Path, bytes, io.BytesIO]) -> pd.DataFrame:
"""
Parse the waste-log Excel file into a tidy long-form DataFrame.
Parameters
----------
source : file path (str/Path), raw bytes, or BytesIO
Returns
-------
pd.DataFrame with columns:
date datetime.date
location str (one of LOCATIONS)
label str (friendly display name)
group str (Academic | Hostels | Dining)
wet_kg float
dry_kg float
total_kg float
month str (e.g. "Apr 2025")
week int (ISO week number)
"""
if isinstance(source, (str, Path)):
import openpyxl
wb = openpyxl.load_workbook(str(source), data_only=True)
elif isinstance(source, bytes):
import openpyxl
wb = openpyxl.load_workbook(io.BytesIO(source), data_only=True)
else:
import openpyxl
wb = openpyxl.load_workbook(source, data_only=True)
ws = wb.active
rows = list(ws.iter_rows(values_only=True))
records: list[dict] = []
current_date: date | None = None
wet_row: dict | None = None
for raw_row in rows:
cell0 = raw_row[0]
# ββ Date header ββββββββββββββββββββββββββββββββββββββββββββββββββ
if isinstance(cell0, datetime):
current_date = cell0.date()
wet_row = None
continue
if not current_date:
continue
if not isinstance(cell0, str):
continue
tag = cell0.strip().lower()
# ββ Wet waste row ββββββββββββββββββββββββββββββββββββββββββββββββ
if tag == "wet waste":
wet_row = {loc: _safe_float(raw_row[i + 1]) for i, loc in enumerate(LOCATIONS)}
continue
# ββ Dry waste row β complete the record βββββββββββββββββββββββββ
if tag == "dry waste" and wet_row is not None:
dry_row = {loc: _safe_float(raw_row[i + 1]) for i, loc in enumerate(LOCATIONS)}
for loc in LOCATIONS:
w = wet_row.get(loc, 0.0)
d = dry_row.get(loc, 0.0)
records.append(
{
"date": current_date,
"location": loc,
"label": LOCATION_LABELS[loc],
"group": next(
(g for g, locs in LOCATION_GROUPS.items() if loc in locs),
"Other",
),
"wet_kg": w,
"dry_kg": d,
"total_kg": w + d,
}
)
wet_row = None
continue
if not records:
logger.warning("waste_parser: no records parsed from file.")
return pd.DataFrame()
df = pd.DataFrame(records)
df["date"] = pd.to_datetime(df["date"])
df["month"] = df["date"].dt.strftime("%b %Y")
df["week"] = df["date"].dt.isocalendar().week.astype(int)
df["day"] = df["date"].dt.day_name()
logger.info(
"waste_parser: parsed %d records | %d locations | %d days",
len(df), df["location"].nunique(), df["date"].nunique(),
)
return df
# ββ Aggregation helpers used by both analytics and gamification pages βββββββββ
def monthly_summary(df: pd.DataFrame) -> pd.DataFrame:
"""Total wet / dry / total per location per month."""
return (
df.groupby(["month", "location", "label", "group"], as_index=False)
.agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum"))
.sort_values("total_kg", ascending=False)
)
def daily_totals(df: pd.DataFrame) -> pd.DataFrame:
"""Campus-wide daily totals (all locations summed)."""
return (
df.groupby("date", as_index=False)
.agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum"))
.sort_values("date")
)
def block_weekly(df: pd.DataFrame) -> pd.DataFrame:
"""Weekly totals per location."""
return (
df.groupby(["week", "location", "label", "group"], as_index=False)
.agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum"))
)
def wet_dry_ratio(df: pd.DataFrame) -> pd.DataFrame:
"""Wet-to-dry ratio per location (monthly)."""
m = monthly_summary(df)
m["wet_pct"] = (m["wet_kg"] / m["total_kg"].replace(0, float("nan")) * 100).round(1)
m["dry_pct"] = (m["dry_kg"] / m["total_kg"].replace(0, float("nan")) * 100).round(1)
return m
|