"""Load the raw Food Delivery dataset and produce a clean ML-ready CSV. The dataset has noisy "(min)" suffixes, NaN-strings, and mixed types. Here we clean and persist a tidy frame. Usage: python -m src.ml.prepare_data """ from __future__ import annotations import sys from pathlib import Path import numpy as np import pandas as pd sys.path.insert(0, str(Path(__file__).resolve().parents[2])) from src.config import ( # noqa: E402 GEO_LAT_BOUNDS, GEO_LON_BOUNDS, PROCESSED_DIR, RAW_DIR, ) RAW_CSV_CANDIDATES = [ RAW_DIR / "food_delivery" / "train.csv", RAW_DIR / "food_delivery" / "Food_Delivery_Times.csv", ] OUTPUT_CSV = PROCESSED_DIR / "food_delivery_clean.csv" def _resolve_raw_csv() -> Path: for c in RAW_CSV_CANDIDATES: if c.exists(): return c matches = list((RAW_DIR / "food_delivery").glob("*.csv")) if matches: return matches[0] raise FileNotFoundError( "No food-delivery CSV found. Run 'python scripts/download_data.py --ml'." ) def _clean_numeric(s: pd.Series) -> pd.Series: """Strip suffixes like '(min) 24' or 'conditions Sunny' -> '24' / 'Sunny'.""" if s.dtype == object: s = s.astype(str).str.replace(r"^\(min\)\s*", "", regex=True) s = s.str.replace(r"^conditions\s*", "", regex=True) s = s.str.strip() return s def load_raw() -> pd.DataFrame: path = _resolve_raw_csv() print(f"[prepare_data] reading {path}") df = pd.read_csv(path) df.columns = [c.strip() for c in df.columns] return df def clean(df: pd.DataFrame) -> pd.DataFrame: df = df.copy() # Strip noisy prefixes for col in df.select_dtypes(include="object").columns: df[col] = _clean_numeric(df[col]) # NaN strings df = df.replace({"NaN ": np.nan, "NaN": np.nan, "": np.nan}) # Coerce numerics numeric_cols = [ "Delivery_person_Age", "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude", "Delivery_location_latitude", "Delivery_location_longitude", "Vehicle_condition", "multiple_deliveries", "Time_taken(min)", ] for c in numeric_cols: if c in df.columns: df[c] = pd.to_numeric(df[c], errors="coerce") # Parse order timestamp if "Order_Date" in df.columns and "Time_Orderd" in df.columns: df["order_datetime"] = pd.to_datetime( df["Order_Date"].astype(str) + " " + df["Time_Orderd"].astype(str), errors="coerce", dayfirst=True, ) # Drop rows without target or timestamp target = "Time_taken(min)" if target in df.columns: df = df.dropna(subset=[target]) if "order_datetime" in df.columns: df = df.dropna(subset=["order_datetime"]) # Lat/long sanity: the raw data has sign noise (negative coords) and a few # near-zero placeholders. Take the magnitude, then null out anything outside # the plausible geographic window so haversine distances stay sane. lat_cols = ("Restaurant_latitude", "Delivery_location_latitude") lon_cols = ("Restaurant_longitude", "Delivery_location_longitude") for c in lat_cols + lon_cols: if c in df.columns: df[c] = df[c].abs() df.loc[df[c] < 1, c] = np.nan for c in lat_cols: if c in df.columns: lo, hi = GEO_LAT_BOUNDS df.loc[(df[c] < lo) | (df[c] > hi), c] = np.nan for c in lon_cols: if c in df.columns: lo, hi = GEO_LON_BOUNDS df.loc[(df[c] < lo) | (df[c] > hi), c] = np.nan # Explicit category for missing City / Festival instead of silent imputation for c in ("City", "Festival"): if c in df.columns: df[c] = df[c].fillna("Unknown") df = df.reset_index(drop=True) print(f"[prepare_data] clean rows: {len(df):,}") return df def main() -> None: df = load_raw() clean_df = clean(df) OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True) clean_df.to_csv(OUTPUT_CSV, index=False) print(f"[prepare_data] wrote {OUTPUT_CSV}") if __name__ == "__main__": main()