File size: 5,254 Bytes
992aa4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
Data ingestion and cleaning for the Major Power Outage dataset.

Assumptions
-----------
* Source: Kaggle "Major Power Outage Risks in the U.S." (Purdue / DOE).
* The CSV is pre-exported; if using the original .xlsx you may need to
  adjust `SKIP_ROWS_IN_RAW` in config.py.
* Rows with missing CUSTOMERS.AFFECTED *and* missing OUTAGE.DURATION are
  dropped because the target cannot be computed.
"""
from __future__ import annotations

import logging
from pathlib import Path

import numpy as np
import pandas as pd

from src.config import (
    CUSTOMERS_AFFECTED_COL,
    IMPACT_QUANTILE,
    OUTAGE_DURATION_COL,
    RAW_DATA_PATH,
    SKIP_ROWS_IN_RAW,
    TARGET_COL,
)

logger = logging.getLogger(__name__)


def load_raw(path: Path | str | None = None, skip_rows: int = SKIP_ROWS_IN_RAW) -> pd.DataFrame:
    """Read the outage CSV into a DataFrame, handling common formatting issues."""
    path = Path(path) if path else RAW_DATA_PATH
    if not path.exists():
        raise FileNotFoundError(
            f"Dataset not found at {path}. Download from Kaggle and place it there."
        )

    suffix = path.suffix.lower()
    if suffix in (".xlsx", ".xls"):
        df = pd.read_excel(path, skiprows=skip_rows)
    else:
        # Try reading with skiprows first; fall back to 0 if it blows up.
        try:
            df = pd.read_csv(path, skiprows=skip_rows)
        except Exception:
            df = pd.read_csv(path)

    # Strip whitespace from column names
    df.columns = df.columns.str.strip()

    # Drop fully empty rows / columns (common artifact of Excel exports)
    df.dropna(how="all", axis=0, inplace=True)
    df.dropna(how="all", axis=1, inplace=True)

    logger.info("Loaded %d rows, %d columns from %s", len(df), len(df.columns), path)
    return df


def clean(df: pd.DataFrame) -> pd.DataFrame:
    """Basic type coercion and null handling."""
    df = df.copy()
    
    # The first row often contains units (e.g. "Megawatt", "mins"). Drop it!
    if df["YEAR"].iloc[0] == "Year" or df["ANOMALY.LEVEL"].iloc[0] == "numeric":
        df = df.iloc[1:].reset_index(drop=True)

    # Coerce numeric columns that may have been read as object
    from src.config import NUMERIC_FEATURES
    
    # We also include extra columns used for feature engineering that might not be in NUMERIC_FEATURES
    num_cols = list(set(NUMERIC_FEATURES + [
        CUSTOMERS_AFFECTED_COL, OUTAGE_DURATION_COL, "DEMAND.LOSS.MW",
        "TOTAL.CUSTOMERS", "RES.SALES", "TOTAL.SALES", "RES.PRICE", "IND.PRICE",
        "POPDEN_URBAN", "POPDEN_RURAL"
    ]))
    for col in num_cols:
        if col in df.columns:
            if df[col].dtype == object and df[col].astype(str).str.contains(',').any():
                df[col] = df[col].astype(str).str.replace(',', '')
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Parse MONTH as string to avoid scikit-learn imputer mixed-type errors
    if "MONTH" in df.columns:
        df["MONTH"] = pd.to_numeric(df["MONTH"], errors="coerce").fillna(-1).astype(int).astype(str)
        df.loc[df["MONTH"] == "-1", "MONTH"] = np.nan

    # Drop rows where we cannot define the target at all
    target_deps = [c for c in [CUSTOMERS_AFFECTED_COL, OUTAGE_DURATION_COL] if c in df.columns]
    df.dropna(subset=target_deps, how="all", inplace=True)
    df.reset_index(drop=True, inplace=True)

    logger.info("After cleaning: %d rows remain", len(df))
    return df


def build_target(df: pd.DataFrame, quantile: float = IMPACT_QUANTILE) -> pd.DataFrame:
    """
    Define binary target: **high_impact**.

    An outage qualifies as high-impact when:
      customers_affected >= Q(quantile)  AND  duration >= Q(quantile)

    This deliberately selects the upper-right quadrant of severity, which
    is the segment an operations center would triage first.

    When either column is missing for a row, we fall back to the single
    available column so we don't lose too many samples.
    """
    df = df.copy()

    cust_thresh = df[CUSTOMERS_AFFECTED_COL].quantile(quantile)
    dur_thresh = df[OUTAGE_DURATION_COL].quantile(quantile)

    has_cust = df[CUSTOMERS_AFFECTED_COL].notna()
    has_dur = df[OUTAGE_DURATION_COL].notna()

    high_cust = df[CUSTOMERS_AFFECTED_COL] >= cust_thresh
    high_dur = df[OUTAGE_DURATION_COL] >= dur_thresh

    # Both present β†’ require both; only one present β†’ use that one
    target = pd.Series(0, index=df.index)
    both = has_cust & has_dur
    target.loc[both] = ((high_cust & high_dur) & both).astype(int).loc[both]
    target.loc[has_cust & ~has_dur] = high_cust.astype(int).loc[has_cust & ~has_dur]
    target.loc[~has_cust & has_dur] = high_dur.astype(int).loc[~has_cust & has_dur]

    df[TARGET_COL] = target.astype(int)

    pos_rate = df[TARGET_COL].mean()
    logger.info(
        "Target built β€” positive rate: %.2f%% (%d / %d) | thresholds: customers>=%.0f, duration>=%.0f min",
        pos_rate * 100,
        df[TARGET_COL].sum(),
        len(df),
        cust_thresh,
        dur_thresh,
    )
    return df


def get_dataset(path: Path | str | None = None) -> pd.DataFrame:
    """End-to-end: load β†’ clean β†’ build target."""
    df = load_raw(path)
    df = clean(df)
    df = build_target(df)
    return df