"""Load the raw Food Delivery dataset and produce a clean ML-ready CSV.

The raw CSV has noisy "(min)" / "conditions" prefixes, NaN-strings, and mixed
types. This module cleans those up and writes a tidy frame to disk.

Usage:
    python -m src.ml.prepare_data
"""

from __future__ import annotations

import sys
from pathlib import Path

import numpy as np
import pandas as pd

sys.path.insert(0, str(Path(__file__).resolve().parents[2]))

from src.config import (  # noqa: E402
    GEO_LAT_BOUNDS,
    GEO_LON_BOUNDS,
    PROCESSED_DIR,
    RAW_DIR,
)


# Known filenames for the raw dataset, listed in the order they are tried.
RAW_CSV_CANDIDATES = [
    RAW_DIR / "food_delivery" / filename
    for filename in ("train.csv", "Food_Delivery_Times.csv")
]
# Where the cleaned frame is written.
OUTPUT_CSV = PROCESSED_DIR / "food_delivery_clean.csv"


def _resolve_raw_csv() -> Path:
    """Locate the raw food-delivery CSV on disk.

    The filenames in ``RAW_CSV_CANDIDATES`` are tried first, in order. If none
    of them exists, the first ``*.csv`` (in sorted path order) found in the
    ``food_delivery`` directory under ``RAW_DIR`` is used instead.

    Returns:
        Path of the CSV file to read.

    Raises:
        FileNotFoundError: If no candidate exists and the directory contains
            no CSV file at all.
    """
    for candidate in RAW_CSV_CANDIDATES:
        if candidate.exists():
            return candidate

    # glob() yields entries in whatever order the filesystem reports them,
    # which can differ between machines; sort so the fallback is reproducible.
    fallbacks = sorted((RAW_DIR / "food_delivery").glob("*.csv"))
    if fallbacks:
        return fallbacks[0]

    raise FileNotFoundError(
        "No food-delivery CSV found. Run 'python scripts/download_data.py --ml'."
    )


def _clean_numeric(s: pd.Series) -> pd.Series:
    """Strip suffixes like '(min) 24' or 'conditions Sunny' -> '24' / 'Sunny'."""
    if s.dtype == object:
        s = s.astype(str).str.replace(r"^\(min\)\s*", "", regex=True)
        s = s.str.replace(r"^conditions\s*", "", regex=True)
        s = s.str.strip()
    return s


def load_raw() -> pd.DataFrame:
    """Read the raw CSV and return it with whitespace-trimmed column names."""
    csv_path = _resolve_raw_csv()
    print(f"[prepare_data] reading {csv_path}")
    frame = pd.read_csv(csv_path)
    # Header cells may carry stray padding; normalise so later lookups by
    # column name match.
    frame.columns = frame.columns.str.strip()
    return frame


def clean(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw food-delivery frame.

    Steps, in order:
      1. strip the noisy prefixes from every object column,
      2. turn NaN-like strings into real missing values,
      3. coerce the known numeric columns (unparseable values become NaN),
      4. build ``order_datetime`` from ``Order_Date`` + ``Time_Orderd``,
      5. drop rows lacking the target or a parseable timestamp,
      6. sanitise the coordinate columns,
      7. label missing ``City`` / ``Festival`` as ``"Unknown"``.

    Columns that are absent from ``df`` are skipped rather than treated as
    errors.

    Args:
        df: Raw frame; it is copied, not modified.

    Returns:
        The cleaned frame with a fresh 0..n-1 index.
    """
    df = df.copy()

    # Strip noisy prefixes
    for col in df.select_dtypes(include="object").columns:
        df[col] = _clean_numeric(df[col])

    # NaN strings. Lowercase "nan" is what a real missing cell looks like once
    # it has been cast to str; without it such cells would survive as text and
    # escape the "Unknown" fill further down.
    df = df.replace({"NaN ": np.nan, "NaN": np.nan, "nan": np.nan, "": np.nan})

    # Coerce numerics
    numeric_cols = [
        "Delivery_person_Age",
        "Delivery_person_Ratings",
        "Restaurant_latitude",
        "Restaurant_longitude",
        "Delivery_location_latitude",
        "Delivery_location_longitude",
        "Vehicle_condition",
        "multiple_deliveries",
        "Time_taken(min)",
    ]
    for c in numeric_cols:
        if c in df.columns:
            # errors="coerce": anything unparseable becomes NaN instead of raising
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Parse order timestamp (date and time are concatenated, then parsed
    # day-first; unparseable combinations become NaT)
    if "Order_Date" in df.columns and "Time_Orderd" in df.columns:
        df["order_datetime"] = pd.to_datetime(
            df["Order_Date"].astype(str) + " " + df["Time_Orderd"].astype(str),
            errors="coerce",
            dayfirst=True,
        )

    # Drop rows without target or timestamp
    target = "Time_taken(min)"
    if target in df.columns:
        df = df.dropna(subset=[target])
    if "order_datetime" in df.columns:
        df = df.dropna(subset=["order_datetime"])

    # Lat/long sanity: the raw data has sign noise (negative coords) and a few
    # near-zero placeholders. Take the magnitude, then null out anything outside
    # the plausible geographic window so haversine distances stay sane.
    lat_cols = ("Restaurant_latitude", "Delivery_location_latitude")
    lon_cols = ("Restaurant_longitude", "Delivery_location_longitude")
    for c in lat_cols + lon_cols:
        if c in df.columns:
            df[c] = df[c].abs()
            df.loc[df[c] < 1, c] = np.nan
    for c in lat_cols:
        if c in df.columns:
            lo, hi = GEO_LAT_BOUNDS
            df.loc[(df[c] < lo) | (df[c] > hi), c] = np.nan
    for c in lon_cols:
        if c in df.columns:
            lo, hi = GEO_LON_BOUNDS
            df.loc[(df[c] < lo) | (df[c] > hi), c] = np.nan

    # Explicit category for missing City / Festival instead of silent imputation
    for c in ("City", "Festival"):
        if c in df.columns:
            df[c] = df[c].fillna("Unknown")

    # Row drops above leave gaps in the index; renumber before handing back.
    df = df.reset_index(drop=True)
    print(f"[prepare_data] clean rows: {len(df):,}")
    return df


def main() -> None:
    """Run the pipeline: load the raw CSV, clean it, write it to ``OUTPUT_CSV``."""
    cleaned = clean(load_raw())

    # The output directory may not exist yet.
    out_dir = OUTPUT_CSV.parent
    out_dir.mkdir(parents=True, exist_ok=True)

    cleaned.to_csv(OUTPUT_CSV, index=False)
    print(f"[prepare_data] wrote {OUTPUT_CSV}")


if __name__ == "__main__":
    main()