"""
src.data.loader
===============
Data loading utilities for the NASA PCoE Li-ion Battery Dataset.

This module handles:
- Loading and parsing ``metadata.csv`` (including MATLAB-format date vectors)
- Loading individual cycle CSV files (charge / discharge / impedance)
- Aggregating all discharge or charge cycles into a single DataFrame
- Loading impedance scalar features (Re, Rct) from metadata

Excluded batteries: B0049–B0052 (confirmed software crash / corrupt data).
"""

from __future__ import annotations

import ast
import re
from datetime import datetime
from pathlib import Path
from typing import Literal

import numpy as np
import pandas as pd

# ── Project paths ────────────────────────────────────────────────────────────
# Third ancestor of this file's resolved path, i.e. two levels above the
# directory that contains this file. With the module living at
# ``<root>/src/data/loader.py`` (per the module docstring) this is ``<root>``.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
# Root folder of the cleaned dataset (holds the metadata file and data folder).
DATASET_DIR = PROJECT_ROOT / "cleaned_dataset"
# Per-test metadata table read by ``load_metadata``.
METADATA_PATH = DATASET_DIR / "metadata.csv"
# Folder of per-cycle CSV files read by ``load_cycle_csv`` (``00001.csv`` ...).
DATA_DIR = DATASET_DIR / "data"
# Not referenced in the visible part of this module; presumably an output
# location used by other modules -- TODO confirm.
ARTIFACTS_DIR = PROJECT_ROOT / "artifacts"

# ── Constants ────────────────────────────────────────────────────────────────
# Batteries dropped by default when ``exclude_corrupt=True`` is passed to the loaders.
EXCLUDED_BATTERIES = {"B0049", "B0050", "B0051", "B0052"}
NOMINAL_CAPACITY_AH = 2.0
EOL_30PCT = 1.4  # 30 % fade → 1.4 Ah
EOL_20PCT = 1.6  # 20 % fade → 1.6 Ah

# Battery groups with their EOL thresholds.
# The groups are kept as named tuples and the map is built declaratively so
# that no temporary loop variable is left behind in the module namespace.
_EOL_30PCT_BATTERIES = (
    "B0005", "B0006", "B0007", "B0018",
    "B0025", "B0026", "B0027", "B0028",
    "B0029", "B0030", "B0031", "B0032",
    "B0041", "B0042", "B0043", "B0044",
    "B0045", "B0046", "B0047", "B0048",
    "B0053", "B0054", "B0055", "B0056",
)
_EOL_20PCT_BATTERIES = (
    "B0033", "B0034", "B0036",
    "B0038", "B0039", "B0040",
)

# battery_id -> end-of-life capacity threshold in Ah (insertion order:
# the 30 % group first, then the 20 % group).
BATTERY_EOL_MAP: dict[str, float] = {
    **dict.fromkeys(_EOL_30PCT_BATTERIES, EOL_30PCT),
    **dict.fromkeys(_EOL_20PCT_BATTERIES, EOL_20PCT),
}


# ── MATLAB date-vector parser ───────────────────────────────────────────────
def _parse_matlab_datevec(s: str) -> datetime | None:
    """Parse a MATLAB-style date vector string into a Python datetime.

    Handles formats like:
        ``[2010. 7. 21. 15. 0. 35.093]``
        ``[2.008e+03, 4.000e+00, 2.000e+00, ...]``
    """
    if not isinstance(s, str) or s.strip() in ("", "[]"):
        return None
    try:
        # Strip brackets and split on comma / whitespace
        inner = s.strip().strip("[]")
        # Replace multiple spaces / commas with single comma
        inner = re.sub(r"[,\s]+", ",", inner.strip())
        parts = [float(x) for x in inner.split(",") if x]
        if len(parts) < 6:
            return None
        yr, mo, dy, hr, mi, sc = parts[:6]
        return datetime(int(yr), int(mo), int(dy), int(hr), int(mi), int(sc))
    except (ValueError, OverflowError):
        return None


# ── Metadata ─────────────────────────────────────────────────────────────────
def load_metadata(
    *,
    exclude_corrupt: bool = True,
    parse_dates: bool = True,
) -> pd.DataFrame:
    """Read ``metadata.csv`` into a DataFrame, cleaning numeric columns.

    Parameters
    ----------
    exclude_corrupt : bool
        When True, rows whose ``battery_id`` is in ``EXCLUDED_BATTERIES``
        (B0049–B0052) are removed and the index is reset.
    parse_dates : bool
        When True, a ``datetime`` column is added by parsing the raw
        ``start_time`` field; unparseable entries become ``None``.

    Returns
    -------
    pd.DataFrame
        One row per test/cycle, with ``Capacity``, ``Re`` and ``Rct``
        converted to numeric values (unparseable entries become NaN).
    """
    meta = pd.read_csv(METADATA_PATH)

    # Non-numeric placeholders such as '[]' or empty strings turn into NaN.
    for numeric_col in ("Capacity", "Re", "Rct"):
        meta[numeric_col] = pd.to_numeric(meta[numeric_col], errors="coerce")

    if exclude_corrupt:
        is_corrupt = meta["battery_id"].isin(EXCLUDED_BATTERIES)
        meta = meta[~is_corrupt].reset_index(drop=True)

    if parse_dates:
        meta["datetime"] = meta["start_time"].apply(_parse_matlab_datevec)

    return meta


# ── Individual cycle data ────────────────────────────────────────────────────
def load_cycle_csv(uid: int | str) -> pd.DataFrame:
    """Read the raw time-series CSV of one cycle, addressed by its UID.

    Parameters
    ----------
    uid : int or str
        Global unique ID of the cycle. It is converted with ``int`` and
        zero-padded to five digits to form the file name, e.g. 1 →
        ``00001.csv``.

    Returns
    -------
    pd.DataFrame
        Contents of the cycle file, unmodified.

    Raises
    ------
    FileNotFoundError
        If no file for this UID exists under ``DATA_DIR``.
    """
    csv_path = DATA_DIR / f"{int(uid):05d}.csv"
    if csv_path.exists():
        return pd.read_csv(csv_path)
    raise FileNotFoundError(f"Cycle CSV not found: {csv_path}")


# ── Aggregated cycle loading ─────────────────────────────────────────────────
def load_all_cycles(
    cycle_type: Literal["discharge", "charge", "impedance"],
    *,
    exclude_corrupt: bool = True,
    max_batteries: int | None = None,
    verbose: bool = True,
) -> pd.DataFrame:
    """Load and concatenate all cycles of a given type across all batteries.

    Adds ``battery_id``, ``test_id``, ``uid``, ``cycle_number`` (0-based per
    battery for this cycle type), plus ``Capacity`` for discharge cycles and
    ``Re`` / ``Rct`` for impedance cycles.

    ``cycle_number`` is assigned from the metadata before any file is read,
    so a cycle whose CSV is missing leaves a gap in the numbering.

    Parameters
    ----------
    cycle_type : {"discharge", "charge", "impedance"}
        Value matched against the metadata ``type`` column.
    exclude_corrupt : bool
        Forwarded to ``load_metadata``.
    max_batteries : int or None
        Limit number of batteries processed (useful for debugging). The
        first batteries in order of appearance in the metadata are kept.
    verbose : bool
        Show a ``tqdm`` progress bar. ``tqdm`` is only imported when this
        is True, so non-verbose use does not require it.

    Returns
    -------
    pd.DataFrame
        Concatenated time-series data with metadata columns appended, or an
        empty DataFrame when no cycle file could be loaded.

    Warns
    -----
    RuntimeWarning
        Once, after loading, if one or more cycle CSV files were missing
        and therefore skipped.
    """
    import warnings

    meta = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=False)
    subset = meta[meta["type"] == cycle_type].copy()

    if max_batteries is not None:
        keep_bats = subset["battery_id"].unique()[:max_batteries]
        subset = subset[subset["battery_id"].isin(keep_bats)]

    # Assign cycle_number per battery within this type
    subset = subset.sort_values(["battery_id", "test_id"]).reset_index(drop=True)
    subset["cycle_number"] = subset.groupby("battery_id").cumcount()

    rows = subset.iterrows()
    if verbose:
        # Imported lazily: the progress bar is the only thing needing tqdm.
        from tqdm import tqdm

        rows = tqdm(rows, total=len(subset), desc=f"Loading {cycle_type}")

    frames: list[pd.DataFrame] = []
    missing_uids: list = []

    for _, row in rows:
        try:
            df = load_cycle_csv(row["uid"])
        except FileNotFoundError:
            # Best-effort: keep going, but remember what was skipped.
            missing_uids.append(row["uid"])
            continue

        df["battery_id"] = row["battery_id"]
        df["test_id"] = row["test_id"]
        df["uid"] = row["uid"]
        df["cycle_number"] = row["cycle_number"]

        if cycle_type == "discharge":
            df["Capacity"] = row["Capacity"]
        if cycle_type == "impedance":
            df["Re"] = row["Re"]
            df["Rct"] = row["Rct"]

        frames.append(df)

    if missing_uids:
        warnings.warn(
            f"Skipped {len(missing_uids)} {cycle_type} cycle(s) with no CSV "
            f"file (first UIDs: {missing_uids[:5]})",
            RuntimeWarning,
            stacklevel=2,
        )

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def load_discharge_capacities(
    *,
    exclude_corrupt: bool = True,
    drop_zero: bool = True,
) -> pd.DataFrame:
    """Build a compact per-cycle discharge-capacity table from metadata only.

    No individual cycle CSV is read, which makes this much faster than
    `load_all_cycles("discharge")`.

    Parameters
    ----------
    exclude_corrupt : bool
        Forwarded to ``load_metadata``.
    drop_zero : bool
        When True, keep only rows whose ``Capacity`` is strictly positive;
        rows with a missing ``Capacity`` are removed as well.

    Returns
    -------
    pd.DataFrame
        Columns ``battery_id``, ``cycle_number``, ``Capacity``,
        ``ambient_temperature`` and, when present in the metadata frame,
        ``datetime``. ``cycle_number`` is the 0-based discharge index per
        battery (ordered by ``test_id``) and is assigned before filtering,
        so dropped rows leave gaps in the numbering.
    """
    meta = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=True)

    discharge = (
        meta.loc[meta["type"] == "discharge"]
        .copy()
        .sort_values(["battery_id", "test_id"])
        .reset_index(drop=True)
    )
    discharge["cycle_number"] = discharge.groupby("battery_id").cumcount()

    wanted = ["battery_id", "cycle_number", "Capacity", "ambient_temperature"]
    if "datetime" in discharge.columns:
        wanted.append("datetime")
    compact = discharge[wanted].copy()

    if drop_zero:
        is_positive = compact["Capacity"] > 0
        compact = compact[is_positive].dropna(subset=["Capacity"])

    return compact.reset_index(drop=True)


def load_impedance_scalars(*, exclude_corrupt: bool = True) -> pd.DataFrame:
    """Collect the Re / Rct values of impedance tests from metadata only.

    Returns the columns ``battery_id``, ``cycle_number``, ``Re``, ``Rct``,
    ``ambient_temperature`` and, when present, ``datetime``. Rows missing
    either ``Re`` or ``Rct`` are dropped. ``cycle_number`` is the 0-based
    impedance-test index per battery (ordered by ``test_id``), assigned
    before rows are dropped.
    """
    meta = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=True)

    impedance = (
        meta.loc[meta["type"] == "impedance"]
        .copy()
        .sort_values(["battery_id", "test_id"])
        .reset_index(drop=True)
    )
    impedance["cycle_number"] = impedance.groupby("battery_id").cumcount()

    wanted = ["battery_id", "cycle_number", "Re", "Rct", "ambient_temperature"]
    if "datetime" in impedance.columns:
        wanted.append("datetime")

    complete = impedance[wanted].dropna(subset=["Re", "Rct"])
    return complete.reset_index(drop=True)


def get_battery_ids(*, exclude_corrupt: bool = True) -> list[str]:
    """List the distinct battery IDs found in the metadata, in sorted order.

    ``exclude_corrupt`` is forwarded to ``load_metadata``.
    """
    meta = load_metadata(exclude_corrupt=exclude_corrupt, parse_dates=False)
    battery_ids = meta["battery_id"].unique().tolist()
    battery_ids.sort()
    return battery_ids


def get_eol_threshold(battery_id: str) -> float:
    """Look up the end-of-life capacity threshold for ``battery_id``.

    Batteries not listed in ``BATTERY_EOL_MAP`` fall back to ``EOL_30PCT``.
    """
    try:
        return BATTERY_EOL_MAP[battery_id]
    except KeyError:
        return EOL_30PCT