"""
src.data.loader
===============
Data loading utilities for the NASA PCoE Li-ion Battery Dataset.
This module handles:
- Loading and parsing ``metadata.csv`` (including MATLAB-format date vectors)
- Loading individual cycle CSV files (charge / discharge / impedance)
- Aggregating all discharge or charge cycles into a single DataFrame
- Loading impedance scalar features (Re, Rct) from metadata
Excluded batteries: B0049–B0052 (confirmed software crash / corrupt data).
"""
from __future__ import annotations
import ast
import re
from datetime import datetime
from pathlib import Path
from typing import Literal
import numpy as np
import pandas as pd
# ── Project paths ────────────────────────────────────────────────────────────
# Everything is resolved relative to this file so the package works regardless
# of the current working directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
DATASET_DIR = PROJECT_ROOT / "cleaned_dataset"
METADATA_PATH = DATASET_DIR / "metadata.csv"
DATA_DIR = DATASET_DIR / "data"
ARTIFACTS_DIR = PROJECT_ROOT / "artifacts"
# ── Constants ────────────────────────────────────────────────────────────────
EXCLUDED_BATTERIES = {"B0049", "B0050", "B0051", "B0052"}
NOMINAL_CAPACITY_AH = 2.0
EOL_30PCT = 1.4  # 30 % fade → 1.4 Ah
EOL_20PCT = 1.6  # 20 % fade → 1.6 Ah

# Battery groups mapped to their end-of-life (EOL) capacity threshold in Ah.
# Built with dict.fromkeys instead of explicit for-loops: this is clearer and
# avoids leaking the loop variable (`_bid`) into the module namespace.
BATTERY_EOL_MAP: dict[str, float] = {
    **dict.fromkeys(
        ("B0005", "B0006", "B0007", "B0018",
         "B0025", "B0026", "B0027", "B0028",
         "B0029", "B0030", "B0031", "B0032",
         "B0041", "B0042", "B0043", "B0044",
         "B0045", "B0046", "B0047", "B0048",
         "B0053", "B0054", "B0055", "B0056"),
        EOL_30PCT,
    ),
    **dict.fromkeys(
        ("B0033", "B0034", "B0036",
         "B0038", "B0039", "B0040"),
        EOL_20PCT,
    ),
}
# ── MATLAB date-vector parser ────────────────────────────────────────────────
def _parse_matlab_datevec(s: str) -> datetime | None:
"""Parse a MATLAB-style date vector string into a Python datetime.
Handles formats like:
``[2010. 7. 21. 15. 0. 35.093]``
``[2.008e+03, 4.000e+00, 2.000e+00, ...]``
"""
if not isinstance(s, str) or s.strip() in ("", "[]"):
return None
try:
# Strip brackets and split on comma / whitespace
inner = s.strip().strip("[]")
# Replace multiple spaces / commas with single comma
inner = re.sub(r"[,\s]+", ",", inner.strip())
parts = [float(x) for x in inner.split(",") if x]
if len(parts) < 6:
return None
yr, mo, dy, hr, mi, sc = parts[:6]
return datetime(int(yr), int(mo), int(dy), int(hr), int(mi), int(sc))
except (ValueError, OverflowError):
return None
# ── Metadata ─────────────────────────────────────────────────────────────────
def load_metadata(
    *,
    exclude_corrupt: bool = True,
    parse_dates: bool = True,
) -> pd.DataFrame:
    """Read ``metadata.csv`` into a DataFrame, one row per test/cycle.

    Parameters
    ----------
    exclude_corrupt : bool
        When True, rows belonging to the corrupt batteries B0049–B0052
        are dropped.
    parse_dates : bool
        When True, a ``datetime`` column is derived from the raw
        ``start_time`` field.

    Returns
    -------
    pd.DataFrame
        One row per test/cycle.
    """
    df = pd.read_csv(METADATA_PATH)
    # Coerce to numeric so '[]' and empty strings become NaN instead of raising.
    for col in ("Capacity", "Re", "Rct"):
        df[col] = pd.to_numeric(df[col], errors="coerce")
    if exclude_corrupt:
        keep = ~df["battery_id"].isin(EXCLUDED_BATTERIES)
        df = df[keep].reset_index(drop=True)
    if parse_dates:
        df["datetime"] = df["start_time"].apply(_parse_matlab_datevec)
    return df
# ── Individual cycle data ────────────────────────────────────────────────────
def load_cycle_csv(uid: int | str) -> pd.DataFrame:
    """Read the raw time-series CSV for a single cycle.

    Parameters
    ----------
    uid : int or str
        Global unique ID; e.g. ``1`` maps to ``00001.csv``.

    Returns
    -------
    pd.DataFrame
        Raw time-series data for that cycle.

    Raises
    ------
    FileNotFoundError
        If no CSV file exists for *uid*.
    """
    path = DATA_DIR / f"{int(uid):05d}.csv"
    if not path.exists():
        raise FileNotFoundError(f"Cycle CSV not found: {path}")
    return pd.read_csv(path)
# ── Aggregated cycle loading ─────────────────────────────────────────────────
def load_all_cycles(
    cycle_type: Literal["discharge", "charge", "impedance"],
    *,
    exclude_corrupt: bool = True,
    max_batteries: int | None = None,
    verbose: bool = True,
) -> pd.DataFrame:
    """Concatenate every cycle of *cycle_type* across all batteries.

    Each returned row carries ``battery_id``, ``test_id``, ``uid`` and
    ``cycle_number`` (0-based per battery within this cycle type), plus
    ``Capacity`` for discharge cycles and ``Re``/``Rct`` for impedance.

    Parameters
    ----------
    cycle_type : {"discharge", "charge", "impedance"}
    exclude_corrupt : bool
    max_batteries : int or None
        Cap on the number of batteries processed (handy for debugging).
    verbose : bool

    Returns
    -------
    pd.DataFrame
        Concatenated time-series data with metadata columns appended;
        empty when no cycle files could be read.
    """
    from tqdm import tqdm

    meta = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=False)
    subset = meta[meta["type"] == cycle_type].copy()
    if max_batteries is not None:
        allowed = subset["battery_id"].unique()[:max_batteries]
        subset = subset[subset["battery_id"].isin(allowed)]
    # Number the cycles per battery within this test type.
    subset = subset.sort_values(["battery_id", "test_id"]).reset_index(drop=True)
    subset["cycle_number"] = subset.groupby("battery_id").cumcount()

    rows = subset.iterrows()
    if verbose:
        rows = tqdm(rows, total=len(subset), desc=f"Loading {cycle_type}")

    frames: list[pd.DataFrame] = []
    for _, record in rows:
        try:
            cycle_df = load_cycle_csv(record["uid"])
        except FileNotFoundError:
            # Missing file: skip this cycle rather than abort the whole load.
            continue
        for col in ("battery_id", "test_id", "uid", "cycle_number"):
            cycle_df[col] = record[col]
        if cycle_type == "discharge":
            cycle_df["Capacity"] = record["Capacity"]
        elif cycle_type == "impedance":
            cycle_df["Re"] = record["Re"]
            cycle_df["Rct"] = record["Rct"]
        frames.append(cycle_df)

    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
def load_discharge_capacities(
    *,
    exclude_corrupt: bool = True,
    drop_zero: bool = True,
) -> pd.DataFrame:
    """Per-cycle discharge capacity for every battery, from metadata only.

    Columns: ``battery_id``, ``cycle_number``, ``Capacity``,
    ``ambient_temperature`` (plus ``datetime`` when available).  Much faster
    than ``load_all_cycles("discharge")`` because no per-cycle CSV is read.
    """
    meta = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=True)
    dis = meta[meta["type"] == "discharge"].copy()
    dis = dis.sort_values(["battery_id", "test_id"]).reset_index(drop=True)
    dis["cycle_number"] = dis.groupby("battery_id").cumcount()
    wanted = ["battery_id", "cycle_number", "Capacity", "ambient_temperature"]
    if "datetime" in dis.columns:
        wanted.append("datetime")
    out = dis[wanted].copy()
    if drop_zero:
        # NaN compares False against 0, so dropna also guards missing values.
        out = out[out["Capacity"] > 0].dropna(subset=["Capacity"])
    return out.reset_index(drop=True)
def load_impedance_scalars(*, exclude_corrupt: bool = True) -> pd.DataFrame:
    """Per-cycle Re and Rct for every battery, from impedance-test metadata."""
    meta = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=True)
    imp = meta[meta["type"] == "impedance"].copy()
    imp = imp.sort_values(["battery_id", "test_id"]).reset_index(drop=True)
    imp["cycle_number"] = imp.groupby("battery_id").cumcount()
    wanted = ["battery_id", "cycle_number", "Re", "Rct", "ambient_temperature"]
    if "datetime" in imp.columns:
        wanted.append("datetime")
    scalars = imp[wanted].dropna(subset=["Re", "Rct"])
    return scalars.reset_index(drop=True)
def get_battery_ids(*, exclude_corrupt: bool = True) -> list[str]:
    """Return the sorted list of battery IDs present in the metadata."""
    meta = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=False)
    ids = set(meta["battery_id"])
    return sorted(ids)
def get_eol_threshold(battery_id: str) -> float:
    """Return the EOL capacity threshold (Ah) for *battery_id*.

    Batteries not listed in ``BATTERY_EOL_MAP`` fall back to the
    30 %-fade threshold.
    """
    try:
        return BATTERY_EOL_MAP[battery_id]
    except KeyError:
        return EOL_30PCT