Spaces:
Running
Running
"""
src.data.loader
===============

Data loading utilities for the NASA PCoE Li-ion Battery Dataset.

This module handles:

- Loading and parsing ``metadata.csv`` (including MATLAB-format date vectors)
- Loading individual cycle CSV files (charge / discharge / impedance)
- Aggregating all discharge or charge cycles into a single DataFrame
- Loading impedance scalar features (Re, Rct) from metadata

Excluded batteries: B0049–B0052 (confirmed software crash / corrupt data).
"""
| from __future__ import annotations | |
| import ast | |
| import re | |
| from datetime import datetime | |
| from pathlib import Path | |
| from typing import Literal | |
| import numpy as np | |
| import pandas as pd | |
# ── Project paths ────────────────────────────────────────────────────────────
# Root of the project: two directory levels above the folder holding this file.
PROJECT_ROOT = Path(__file__).resolve().parents[2]

# Output location for generated artifacts.
ARTIFACTS_DIR = PROJECT_ROOT.joinpath("artifacts")

# Dataset layout: metadata.csv sits next to a ``data`` folder of cycle CSVs.
DATASET_DIR = PROJECT_ROOT.joinpath("cleaned_dataset")
METADATA_PATH = DATASET_DIR.joinpath("metadata.csv")
DATA_DIR = DATASET_DIR.joinpath("data")
# ── Constants ────────────────────────────────────────────────────────────────
# Batteries dropped by default (see the module docstring for the reason).
EXCLUDED_BATTERIES = {"B0049", "B0050", "B0051", "B0052"}

NOMINAL_CAPACITY_AH = 2.0
EOL_30PCT = 1.4  # 30 % fade → 1.4 Ah
EOL_20PCT = 1.6  # 20 % fade → 1.6 Ah

# Battery groups, keyed by the end-of-life threshold that applies to them.
_EOL_30PCT_BATTERIES = (
    "B0005", "B0006", "B0007", "B0018",
    "B0025", "B0026", "B0027", "B0028",
    "B0029", "B0030", "B0031", "B0032",
    "B0041", "B0042", "B0043", "B0044",
    "B0045", "B0046", "B0047", "B0048",
    "B0053", "B0054", "B0055", "B0056",
)
_EOL_20PCT_BATTERIES = (
    "B0033", "B0034", "B0036",
    "B0038", "B0039", "B0040",
)

# Battery ID -> EOL capacity threshold (Ah); 30 % group first, then 20 % group.
BATTERY_EOL_MAP: dict[str, float] = {
    **dict.fromkeys(_EOL_30PCT_BATTERIES, EOL_30PCT),
    **dict.fromkeys(_EOL_20PCT_BATTERIES, EOL_20PCT),
}
# ── MATLAB date-vector parser ────────────────────────────────────────────────
| def _parse_matlab_datevec(s: str) -> datetime | None: | |
| """Parse a MATLAB-style date vector string into a Python datetime. | |
| Handles formats like: | |
| ``[2010. 7. 21. 15. 0. 35.093]`` | |
| ``[2.008e+03, 4.000e+00, 2.000e+00, ...]`` | |
| """ | |
| if not isinstance(s, str) or s.strip() in ("", "[]"): | |
| return None | |
| try: | |
| # Strip brackets and split on comma / whitespace | |
| inner = s.strip().strip("[]") | |
| # Replace multiple spaces / commas with single comma | |
| inner = re.sub(r"[,\s]+", ",", inner.strip()) | |
| parts = [float(x) for x in inner.split(",") if x] | |
| if len(parts) < 6: | |
| return None | |
| yr, mo, dy, hr, mi, sc = parts[:6] | |
| return datetime(int(yr), int(mo), int(dy), int(hr), int(mi), int(sc)) | |
| except (ValueError, OverflowError): | |
| return None | |
# ── Metadata ─────────────────────────────────────────────────────────────────
def load_metadata(
    *,
    exclude_corrupt: bool = True,
    parse_dates: bool = True,
) -> pd.DataFrame:
    """Read ``metadata.csv`` into a DataFrame, optionally filtered and dated.

    Parameters
    ----------
    exclude_corrupt : bool
        When True, rows belonging to the batteries in ``EXCLUDED_BATTERIES``
        (B0049–B0052) are removed and the index is renumbered.
    parse_dates : bool
        When True, a ``datetime`` column is added, parsed from the raw
        ``start_time`` field.

    Returns
    -------
    pd.DataFrame
        One row per test/cycle, with ``Capacity``, ``Re`` and ``Rct``
        converted to numeric values.
    """
    meta = pd.read_csv(METADATA_PATH)

    # Entries that are not numbers (e.g. '[]' or empty strings) become NaN.
    for column in ("Capacity", "Re", "Rct"):
        meta[column] = pd.to_numeric(meta[column], errors="coerce")

    if exclude_corrupt:
        is_corrupt = meta["battery_id"].isin(EXCLUDED_BATTERIES)
        meta = meta[~is_corrupt].reset_index(drop=True)

    if parse_dates:
        meta["datetime"] = meta["start_time"].apply(_parse_matlab_datevec)

    return meta
# ── Individual cycle data ────────────────────────────────────────────────────
def load_cycle_csv(uid: int | str) -> pd.DataFrame:
    """Read the CSV file of one cycle, identified by its UID.

    Parameters
    ----------
    uid : int or str
        The global unique ID. It is converted to an integer and zero-padded
        to five digits to form the file name, e.g. 1 → ``00001.csv``.

    Returns
    -------
    pd.DataFrame
        Raw time-series data for that cycle.

    Raises
    ------
    FileNotFoundError
        If no file with that name exists in ``DATA_DIR``.
    """
    csv_path = DATA_DIR / f"{int(uid):05d}.csv"
    if csv_path.exists():
        return pd.read_csv(csv_path)
    raise FileNotFoundError(f"Cycle CSV not found: {csv_path}")
# ── Aggregated cycle loading ─────────────────────────────────────────────────
def load_all_cycles(
    cycle_type: Literal["discharge", "charge", "impedance"],
    *,
    exclude_corrupt: bool = True,
    max_batteries: int | None = None,
    verbose: bool = True,
) -> pd.DataFrame:
    """Load and concatenate all cycles of a given type across all batteries.

    Adds ``battery_id``, ``test_id``, ``uid`` and ``cycle_number`` (0-based
    per battery for this cycle type) to every cycle's rows, plus ``Capacity``
    for discharge cycles and ``Re`` / ``Rct`` for impedance cycles.

    Parameters
    ----------
    cycle_type : {"discharge", "charge", "impedance"}
        Value matched against the ``type`` column of the metadata.
    exclude_corrupt : bool
        Forwarded to ``load_metadata``.
    max_batteries : int or None
        Limit number of batteries processed (useful for debugging). The
        batteries kept are the first ones in metadata order.
    verbose : bool
        When True, show a ``tqdm`` progress bar. ``tqdm`` is imported only
        in that case, so it is not needed when ``verbose=False``.

    Returns
    -------
    pd.DataFrame
        Concatenated time-series data with metadata columns appended, or an
        empty DataFrame when no cycle file could be loaded. Cycles whose CSV
        file is missing are skipped without an error.
    """
    meta = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=False)
    subset = meta[meta["type"] == cycle_type].copy()

    if max_batteries is not None:
        keep_bats = subset["battery_id"].unique()[:max_batteries]
        subset = subset[subset["battery_id"].isin(keep_bats)]

    # Assign cycle_number per battery within this type. Numbering happens
    # before any file is read, so a missing CSV leaves a gap rather than
    # shifting the numbers of later cycles.
    subset = subset.sort_values(["battery_id", "test_id"]).reset_index(drop=True)
    subset["cycle_number"] = subset.groupby("battery_id").cumcount()

    rows = subset.iterrows()
    if verbose:
        # Imported lazily so that the progress-bar dependency is only
        # required when a progress bar is actually requested.
        from tqdm import tqdm

        rows = tqdm(rows, total=len(subset), desc=f"Loading {cycle_type}")

    frames: list[pd.DataFrame] = []
    for _, row in rows:
        try:
            df = load_cycle_csv(row["uid"])
        except FileNotFoundError:
            # Best effort: metadata may list cycles whose file is absent.
            continue
        df["battery_id"] = row["battery_id"]
        df["test_id"] = row["test_id"]
        df["uid"] = row["uid"]
        df["cycle_number"] = row["cycle_number"]
        if cycle_type == "discharge":
            df["Capacity"] = row["Capacity"]
        if cycle_type == "impedance":
            df["Re"] = row["Re"]
            df["Rct"] = row["Rct"]
        frames.append(df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
def load_discharge_capacities(
    *,
    exclude_corrupt: bool = True,
    drop_zero: bool = True,
) -> pd.DataFrame:
    """Build a compact table of discharge capacity per cycle per battery.

    Columns: ``battery_id``, ``cycle_number``, ``Capacity``,
    ``ambient_temperature``, plus ``datetime`` when the metadata provides it.

    This is much faster than `load_all_cycles("discharge")` because it only
    reads metadata — no individual CSV loading.

    Parameters
    ----------
    exclude_corrupt : bool
        Forwarded to ``load_metadata``.
    drop_zero : bool
        When True, keep only rows whose ``Capacity`` is greater than zero
        (rows with a missing capacity are removed as well).

    Returns
    -------
    pd.DataFrame
        Rows ordered by ``battery_id`` then ``test_id``, with a fresh index.
    """
    meta = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=True)

    discharge = (
        meta[meta["type"] == "discharge"]
        .copy()
        .sort_values(["battery_id", "test_id"])
        .reset_index(drop=True)
    )
    # 0-based position of each discharge test within its battery.
    discharge["cycle_number"] = discharge.groupby("battery_id").cumcount()

    wanted = ["battery_id", "cycle_number", "Capacity", "ambient_temperature"]
    if "datetime" in discharge.columns:
        wanted.append("datetime")
    capacities = discharge[wanted].copy()

    if drop_zero:
        is_positive = capacities["Capacity"] > 0
        capacities = capacities[is_positive].dropna(subset=["Capacity"])

    return capacities.reset_index(drop=True)
def load_impedance_scalars(*, exclude_corrupt: bool = True) -> pd.DataFrame:
    """Return Re and Rct per cycle per battery from impedance tests (metadata only).

    Columns: ``battery_id``, ``cycle_number``, ``Re``, ``Rct``,
    ``ambient_temperature``, plus ``datetime`` when the metadata provides it.
    Rows lacking either ``Re`` or ``Rct`` are dropped.
    """
    meta = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=True)

    impedance = (
        meta[meta["type"] == "impedance"]
        .copy()
        .sort_values(["battery_id", "test_id"])
        .reset_index(drop=True)
    )
    # 0-based position of each impedance test within its battery.
    impedance["cycle_number"] = impedance.groupby("battery_id").cumcount()

    wanted = ["battery_id", "cycle_number", "Re", "Rct", "ambient_temperature"]
    if "datetime" in impedance.columns:
        wanted.append("datetime")

    scalars = impedance[wanted].dropna(subset=["Re", "Rct"])
    return scalars.reset_index(drop=True)
def get_battery_ids(*, exclude_corrupt: bool = True) -> list[str]:
    """List the distinct battery IDs found in the metadata, in sorted order."""
    meta = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=False)
    battery_ids = meta["battery_id"].unique().tolist()
    battery_ids.sort()
    return battery_ids
def get_eol_threshold(battery_id: str) -> float:
    """Look up the EOL capacity threshold for a battery.

    Batteries that are not listed in ``BATTERY_EOL_MAP`` fall back to
    ``EOL_30PCT``.
    """
    try:
        return BATTERY_EOL_MAP[battery_id]
    except KeyError:
        return EOL_30PCT