# src/ingestion/features.py
from typing import Dict, Any, List
from collections import defaultdict
import numpy as np

from tools.runner_ai import sec_to_min_km


def running_features(runs: List[Dict[str, Any]], consistency_weeks: int = 12) -> Dict[str, Any]:
    """Aggregate a list of runs into chart-ready series and summary metrics.

    Args:
        runs: Run dicts. Keys read here: ``id``, ``start_time``,
            ``total_distance_m``, ``total_duration_s`` and ``records`` (an
            iterable of dicts that may carry ``hr_bpm``). When ``start_time``
            is set it must provide ``date()`` and ``isocalendar()`` (e.g. a
            ``datetime``).
        consistency_weeks: Maximum number of most recent calendar weeks taken
            into account by the consistency score. Values below 1 are
            treated as 1.

    Returns:
        Dict with:
            - per_run: one summary per run (id, start_time, distance_m,
              duration_s, pace_min_per_km, avg_hr)
            - pace_series / hr_series / dates: flat lists for charts, in
              input order; runs lacking the respective value are skipped
            - weekly_km: "<iso-year>-<iso-week>" -> kilometres in that week
            - consistency_score: 0-100, percentage of calendar weeks with a
              positive distance inside the scoring window
            - pace_trend: least-squares slope of pace over the run index
              (minutes per km per run), or None with fewer than 3 paces
    """
    per_run = []
    pace_series = []
    hr_series = []
    dates = []
    # ISO (year, week) -> metres covered in that week; feeds weekly_km.
    weekly = defaultdict(float)
    # Same totals keyed by a gap-free week counter, so that calendar weeks
    # without any run can be detected when scoring consistency.
    metres_by_week_index = defaultdict(float)

    for r in runs:
        dist_m = r.get("total_distance_m") or 0.0
        dur_s = r.get("total_duration_s") or 0.0
        start_time = r.get("start_time")

        # Average heart rate over the records that carry one (truncated to int).
        hrs = [rec.get("hr_bpm") for rec in (r.get("records") or []) if rec.get("hr_bpm") is not None]
        avg_hr = int(sum(hrs) / len(hrs)) if hrs else None

        pace_min_per_km = None
        if dist_m > 0 and dur_s:
            pace_min_per_km = (dur_s / 60.0) / (dist_m / 1000.0)  # minutes per km

        per_run.append(
            {
                "id": r.get("id"),
                "start_time": start_time,
                "distance_m": dist_m,
                "duration_s": dur_s,
                "pace_min_per_km": pace_min_per_km,
                "avg_hr": avg_hr,
            }
        )

        if pace_min_per_km is not None:
            pace_series.append(pace_min_per_km)
        if avg_hr is not None:
            hr_series.append(avg_hr)
        if start_time:
            run_date = start_time.date()
            dates.append(run_date)
            iso_week = tuple(start_time.isocalendar()[0:2])  # (year, week)
            weekly[iso_week] += dist_m
            # Proleptic ordinal 1 is a Monday, so this counter is constant from
            # Monday to Sunday and grows by exactly 1 per calendar week, even
            # across year boundaries.
            metres_by_week_index[(run_date.toordinal() - 1) // 7] += dist_m

    # Weekly distance in km, keyed "<iso-year>-<iso-week>".
    weekly_km = {f"{y}-{w}": v / 1000.0 for (y, w), v in weekly.items()}

    # Consistency: share of calendar weeks with a positive distance. The window
    # ends at the most recent week with a run and reaches back at most
    # `consistency_weeks` weeks, but never before the first recorded week, so
    # a short history is not penalised for weeks that predate it.
    consistency_score = 0
    if metres_by_week_index:
        max_window = max(1, consistency_weeks)
        latest_week = max(metres_by_week_index)
        earliest_week = min(metres_by_week_index)
        window_start = max(earliest_week, latest_week - (max_window - 1))
        window = range(window_start, latest_week + 1)
        # .get() keeps the defaultdict from growing entries for empty weeks.
        active_weeks = sum(1 for idx in window if metres_by_week_index.get(idx, 0.0) > 0)
        # Integer arithmetic avoids float truncation artefacts.
        consistency_score = (active_weeks * 100) // len(window)

    # Trend: slope of a straight-line least-squares fit of pace against the
    # run index (only when there are enough points for a meaningful fit).
    pace_trend = None
    if len(pace_series) >= 3:
        x = np.arange(len(pace_series))
        y = np.array(pace_series)
        design = np.vstack([x, np.ones(len(x))]).T
        slope, _intercept = np.linalg.lstsq(design, y, rcond=None)[0]
        pace_trend = float(slope)  # minutes per km per run index

    return {
        "per_run": per_run,
        "pace_series": pace_series,
        "hr_series": hr_series,
        "dates": dates,
        "weekly_km": weekly_km,
        "consistency_score": consistency_score,
        "pace_trend": pace_trend,
    }


def compute_per_run_features(run: Dict[str, Any]) -> Dict[str, Any]:
    """
    Compute key features for a single run parsed from GPX/TCX.

    Args:
        run: dict output from parse_gpx_file or parse_tcx_file. Keys read
            here: ``id``, ``sport``, ``start_time``, ``source_path``,
            ``total_distance_m``, ``total_duration_s`` and ``records`` (a
            sequence of dicts that may carry ``time``, ``hr_bpm``,
            ``cadence_rpm`` and ``altitude_m``).

    Returns:
        features dict, or an empty dict when the run has neither records nor
        an aggregate distance/duration. Values that cannot be derived are
        None; elevation gain/loss default to 0.0 and ``hr_zones`` to ``{}``.
    """
    # A missing or explicit-None "records" entry is treated as "no records".
    records = run.get("records") or []
    total_distance_m = run.get("total_distance_m")
    total_duration_s = run.get("total_duration_s")

    # Without records we can still pass aggregate values through (and derive a
    # pace from them), but with nothing at all there is nothing to report.
    if not records and not (total_distance_m or total_duration_s):
        return {}

    # Fall back to the first/last record timestamps when no aggregate duration
    # is given. `records` may legitimately be empty here, so check it first.
    if not total_duration_s and records:
        first_time = records[0].get("time")
        last_time = records[-1].get("time")
        if first_time and last_time:
            total_duration_s = (last_time - first_time).total_seconds()

    # Pace in sec/km
    avg_pace_s_per_km = None
    if total_distance_m and total_distance_m > 0 and total_duration_s:
        avg_pace_s_per_km = total_duration_s / (total_distance_m / 1000)

    # Pace in min/km form, as formatted by sec_to_min_km
    avg_pace_min_per_km = sec_to_min_km(avg_pace_s_per_km) if avg_pace_s_per_km else None

    # Heart rate
    hr_values = [r["hr_bpm"] for r in records if r.get("hr_bpm") is not None]
    avg_hr_bpm = sum(hr_values) / len(hr_values) if hr_values else None
    max_hr_bpm = max(hr_values) if hr_values else None

    # Cadence
    cadence_values = [r["cadence_rpm"] for r in records if r.get("cadence_rpm") is not None]
    avg_cadence_rpm = sum(cadence_values) / len(cadence_values) if cadence_values else None

    # Elevation gain/loss: sum of positive / non-positive steps between
    # consecutive altitude samples (records without altitude are skipped).
    elevation_values = [r["altitude_m"] for r in records if r.get("altitude_m") is not None]
    elevation_gain_m = 0.0
    elevation_loss_m = 0.0
    for previous, current in zip(elevation_values, elevation_values[1:]):
        diff = current - previous
        if diff > 0:
            elevation_gain_m += diff
        else:
            elevation_loss_m += abs(diff)

    # Optional: heart rate zones (example zones), expressed as 50%-90% of the
    # highest heart rate observed in this run and truncated to int.
    hr_zones = {}
    if avg_hr_bpm:
        zones = [(0.5, "Zone1"), (0.6, "Zone2"), (0.7, "Zone3"), (0.8, "Zone4"), (0.9, "Zone5")]
        for fraction, name in zones:
            hr_zones[name] = min(int(max_hr_bpm * fraction), max_hr_bpm)

    return {
        "id": run.get("id"),
        "sport": run.get("sport"),
        "start_time": run.get("start_time"),
        "total_distance_m": total_distance_m,
        "total_duration_s": total_duration_s,
        "avg_pace_s_per_km": avg_pace_s_per_km,
        "avg_pace_min_per_km": avg_pace_min_per_km,
        "avg_hr_bpm": avg_hr_bpm,
        "max_hr_bpm": max_hr_bpm,
        "avg_cadence_rpm": avg_cadence_rpm,
        "elevation_gain_m": elevation_gain_m,
        "elevation_loss_m": elevation_loss_m,
        "hr_zones": hr_zones,
        "source_path": run.get("source_path"),
    }


def compute_features_batch(runs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Compute per-run features for every run in a batch.

    Args:
        runs: run dicts, each passed unchanged to compute_per_run_features

    Returns:
        one features dict per input run, in input order
    """
    batch_features: List[Dict[str, Any]] = []
    for single_run in runs:
        batch_features.append(compute_per_run_features(single_run))
    return batch_features