File size: 7,114 Bytes
5a77372
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
"""
core/waste_parser.py
────────────────────────────────────────────────────────────────────
SPJIMR Waste Analytics β€” Excel Parser
Reads the daily block-wise waste log (Book2.xlsx format) and returns
clean DataFrames for the analytics and gamification pages.

Data format in Excel:
  Row with datetime  β†’ date header (every 6 rows)
  Row 'Waste type'   β†’ column headers
  Row 'Wet waste'    β†’ per-block wet waste kg
  Row 'Dry waste'    β†’ per-block dry waste kg
  Row 'TOTAL (Kg)'   β†’ skip (recalculated here)
  Blank row          β†’ separator
"""

from __future__ import annotations
import io
import logging
from datetime import datetime, date
from pathlib import Path
from typing import Union

import pandas as pd

logger = logging.getLogger(__name__)

# Canonical block/location names as they appear in the spreadsheet
LOCATIONS = [
    "A&B Block",
    "C&D Block",
    "L H Hostel",
    "Hostel no -25",
    "Hostel no -26",
    "Hostel no -27",
    "Hostel no -28",
    "Hostel no -29",
    "Hostel no -30",
    "cantean",
    "MESS",
]

# Friendly display names
LOCATION_LABELS = {
    "A&B Block":     "A & B Block",
    "C&D Block":     "C & D Block",
    "L H Hostel":    "LH Hostel",
    "Hostel no -25": "Hostel 25",
    "Hostel no -26": "Hostel 26",
    "Hostel no -27": "Hostel 27",
    "Hostel no -28": "Hostel 28",
    "Hostel no -29": "Hostel 29",
    "Hostel no -30": "Hostel 30",
    "cantean":       "Canteen",
    "MESS":          "Mess",
}

# Category grouping for gamification scoring
LOCATION_GROUPS = {
    "Academic":  ["A&B Block", "C&D Block"],
    "Hostels":   ["L H Hostel", "Hostel no -25", "Hostel no -26",
                  "Hostel no -27", "Hostel no -28", "Hostel no -29", "Hostel no -30"],
    "Dining":    ["cantean", "MESS"],
}


def _safe_float(val) -> float:
    """Return float or 0.0 for None / formula strings."""
    if val is None:
        return 0.0
    if isinstance(val, str) and val.startswith("="):
        return 0.0
    try:
        return float(val)
    except (TypeError, ValueError):
        return 0.0


def parse_waste_excel(source: Union[str, Path, bytes, io.BytesIO]) -> pd.DataFrame:
    """
    Parse the waste-log Excel file into a tidy long-form DataFrame.

    Parameters
    ----------
    source : file path (str/Path), raw bytes, or BytesIO

    Returns
    -------
    pd.DataFrame with columns:
        date        datetime.date
        location    str  (one of LOCATIONS)
        label       str  (friendly display name)
        group       str  (Academic | Hostels | Dining)
        wet_kg      float
        dry_kg      float
        total_kg    float
        month       str  (e.g. "Apr 2025")
        week        int  (ISO week number)
    """
    if isinstance(source, (str, Path)):
        import openpyxl
        wb = openpyxl.load_workbook(str(source), data_only=True)
    elif isinstance(source, bytes):
        import openpyxl
        wb = openpyxl.load_workbook(io.BytesIO(source), data_only=True)
    else:
        import openpyxl
        wb = openpyxl.load_workbook(source, data_only=True)

    ws = wb.active
    rows = list(ws.iter_rows(values_only=True))

    records: list[dict] = []
    current_date: date | None = None
    wet_row: dict | None  = None

    for raw_row in rows:
        cell0 = raw_row[0]

        # ── Date header ──────────────────────────────────────────────────
        if isinstance(cell0, datetime):
            current_date = cell0.date()
            wet_row = None
            continue

        if not current_date:
            continue

        if not isinstance(cell0, str):
            continue

        tag = cell0.strip().lower()

        # ── Wet waste row ────────────────────────────────────────────────
        if tag == "wet waste":
            wet_row = {loc: _safe_float(raw_row[i + 1]) for i, loc in enumerate(LOCATIONS)}
            continue

        # ── Dry waste row β€” complete the record ─────────────────────────
        if tag == "dry waste" and wet_row is not None:
            dry_row = {loc: _safe_float(raw_row[i + 1]) for i, loc in enumerate(LOCATIONS)}

            for loc in LOCATIONS:
                w = wet_row.get(loc, 0.0)
                d = dry_row.get(loc, 0.0)
                records.append(
                    {
                        "date":     current_date,
                        "location": loc,
                        "label":    LOCATION_LABELS[loc],
                        "group":    next(
                            (g for g, locs in LOCATION_GROUPS.items() if loc in locs),
                            "Other",
                        ),
                        "wet_kg":   w,
                        "dry_kg":   d,
                        "total_kg": w + d,
                    }
                )
            wet_row = None
            continue

    if not records:
        logger.warning("waste_parser: no records parsed from file.")
        return pd.DataFrame()

    df = pd.DataFrame(records)
    df["date"]  = pd.to_datetime(df["date"])
    df["month"] = df["date"].dt.strftime("%b %Y")
    df["week"]  = df["date"].dt.isocalendar().week.astype(int)
    df["day"]   = df["date"].dt.day_name()

    logger.info(
        "waste_parser: parsed %d records | %d locations | %d days",
        len(df), df["location"].nunique(), df["date"].nunique(),
    )
    return df


# ── Aggregation helpers used by both analytics and gamification pages ─────────

def monthly_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Total wet / dry / total per location per month."""
    return (
        df.groupby(["month", "location", "label", "group"], as_index=False)
        .agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum"))
        .sort_values("total_kg", ascending=False)
    )


def daily_totals(df: pd.DataFrame) -> pd.DataFrame:
    """Campus-wide daily totals (all locations summed)."""
    return (
        df.groupby("date", as_index=False)
        .agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum"))
        .sort_values("date")
    )


def block_weekly(df: pd.DataFrame) -> pd.DataFrame:
    """Weekly totals per location."""
    return (
        df.groupby(["week", "location", "label", "group"], as_index=False)
        .agg(wet_kg=("wet_kg", "sum"), dry_kg=("dry_kg", "sum"), total_kg=("total_kg", "sum"))
    )


def wet_dry_ratio(df: pd.DataFrame) -> pd.DataFrame:
    """Wet-to-dry ratio per location (monthly)."""
    m = monthly_summary(df)
    m["wet_pct"] = (m["wet_kg"] / m["total_kg"].replace(0, float("nan")) * 100).round(1)
    m["dry_pct"] = (m["dry_kg"] / m["total_kg"].replace(0, float("nan")) * 100).round(1)
    return m