Spaces:
Sleeping
Sleeping
| """Load the raw Food Delivery dataset and produce a clean ML-ready CSV. | |
| The dataset has noisy "(min)" / "conditions" prefixes, NaN-strings, and mixed types. | |
| Here we clean and persist a tidy frame. | |
| Usage: | |
| python -m src.ml.prepare_data | |
| """ | |
| from __future__ import annotations | |
| import sys | |
| from pathlib import Path | |
| import numpy as np | |
| import pandas as pd | |
| sys.path.insert(0, str(Path(__file__).resolve().parents[2])) | |
| from src.config import ( # noqa: E402 | |
| GEO_LAT_BOUNDS, | |
| GEO_LON_BOUNDS, | |
| PROCESSED_DIR, | |
| RAW_DIR, | |
| ) | |
# Known filenames of the raw dataset, probed in this order.
RAW_CSV_CANDIDATES = [
    RAW_DIR / "food_delivery" / filename
    for filename in ("train.csv", "Food_Delivery_Times.csv")
]

# Where the cleaned, ML-ready frame is written.
OUTPUT_CSV = PROCESSED_DIR / "food_delivery_clean.csv"
def _resolve_raw_csv() -> Path:
    """Locate the raw food-delivery CSV on disk.

    The filenames listed in ``RAW_CSV_CANDIDATES`` are tried first, in order.
    If none of them exists, any ``*.csv`` directly inside
    ``RAW_DIR / "food_delivery"`` is accepted as a fallback.

    Returns:
        Path of the CSV file to read.

    Raises:
        FileNotFoundError: If neither a known candidate nor any CSV exists.
    """
    for candidate in RAW_CSV_CANDIDATES:
        if candidate.exists():
            return candidate

    # glob() yields entries in arbitrary, filesystem-dependent order; sort so
    # the fallback selects the same file on every run when several CSVs exist.
    matches = sorted((RAW_DIR / "food_delivery").glob("*.csv"))
    if matches:
        return matches[0]

    raise FileNotFoundError(
        "No food-delivery CSV found. Run 'python scripts/download_data.py --ml'."
    )
def _clean_numeric(s: pd.Series) -> pd.Series:
    """Strip noisy prefixes like '(min) 24' or 'conditions Sunny' -> '24' / 'Sunny'.

    Only object-dtype series are processed; any other dtype is returned
    unchanged. Values are stringified, a leading "(min)" or "conditions"
    marker is removed, and surrounding whitespace is trimmed.

    Missing values (None / NaN) stay missing instead of being turned into the
    literal strings "None" / "nan" by the string conversion.

    Args:
        s: Column to clean.

    Returns:
        The cleaned series, or ``s`` itself when it is not object dtype.
    """
    if s.dtype != object:
        return s

    # Remember which cells were truly missing before astype(str) hides them.
    missing = s.isna()

    cleaned = s.astype(str).str.replace(r"^\(min\)\s*", "", regex=True)
    cleaned = cleaned.str.replace(r"^conditions\s*", "", regex=True)
    cleaned = cleaned.str.strip()

    # Restore real missing values so downstream dropna/fillna can see them.
    return cleaned.mask(missing)
def load_raw() -> pd.DataFrame:
    """Read the raw CSV and return it with whitespace-trimmed column names."""
    raw_path = _resolve_raw_csv()
    print(f"[prepare_data] reading {raw_path}")

    frame = pd.read_csv(raw_path)
    # Header cells may carry stray whitespace; normalise them once here.
    trimmed_names = [name.strip() for name in frame.columns]
    frame.columns = trimmed_names
    return frame
def clean(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw food-delivery frame.

    Steps, in order:
      1. Strip the "(min)" / "conditions" prefixes from object columns.
      2. Turn NaN-like strings into real missing values.
      3. Coerce the known numeric columns (unparseable values become NaN).
      4. Build ``order_datetime`` from ``Order_Date`` + ``Time_Orderd``.
      5. Drop rows lacking the target or a parseable order timestamp.
      6. Take the magnitude of coordinates and null out values below 1 or
         outside ``GEO_LAT_BOUNDS`` / ``GEO_LON_BOUNDS``.
      7. Label missing ``City`` / ``Festival`` as "Unknown".

    Every step is skipped for columns that are not present. The input frame
    is not modified.

    Args:
        df: Raw frame as returned by ``load_raw``.

    Returns:
        A new frame with a fresh 0..n-1 index.
    """
    df = df.copy()

    # Strip noisy prefixes
    for col in df.select_dtypes(include="object").columns:
        df[col] = _clean_numeric(df[col])

    # NaN strings. "nan" is what a real missing cell looks like after being
    # stringified, so it must be mapped back too; otherwise missing
    # City / Festival values survive as the literal category "nan".
    df = df.replace({"NaN ": np.nan, "NaN": np.nan, "": np.nan, "nan": np.nan})

    # Coerce numerics
    numeric_cols = [
        "Delivery_person_Age",
        "Delivery_person_Ratings",
        "Restaurant_latitude",
        "Restaurant_longitude",
        "Delivery_location_latitude",
        "Delivery_location_longitude",
        "Vehicle_condition",
        "multiple_deliveries",
        "Time_taken(min)",
    ]
    for c in numeric_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Parse order timestamp (unparseable combinations become NaT)
    if "Order_Date" in df.columns and "Time_Orderd" in df.columns:
        df["order_datetime"] = pd.to_datetime(
            df["Order_Date"].astype(str) + " " + df["Time_Orderd"].astype(str),
            errors="coerce",
            dayfirst=True,
        )

    # Drop rows without target or timestamp
    target = "Time_taken(min)"
    if target in df.columns:
        df = df.dropna(subset=[target])
    if "order_datetime" in df.columns:
        df = df.dropna(subset=["order_datetime"])

    # Lat/long sanity: the raw data has sign noise (negative coords) and a few
    # near-zero placeholders. Take the magnitude, then null out anything outside
    # the plausible geographic window so haversine distances stay sane.
    lat_cols = ("Restaurant_latitude", "Delivery_location_latitude")
    lon_cols = ("Restaurant_longitude", "Delivery_location_longitude")
    for c in lat_cols + lon_cols:
        if c in df.columns:
            df[c] = df[c].abs()
            df.loc[df[c] < 1, c] = np.nan
    for c in lat_cols:
        if c in df.columns:
            lo, hi = GEO_LAT_BOUNDS
            df.loc[(df[c] < lo) | (df[c] > hi), c] = np.nan
    for c in lon_cols:
        if c in df.columns:
            lo, hi = GEO_LON_BOUNDS
            df.loc[(df[c] < lo) | (df[c] > hi), c] = np.nan

    # Explicit category for missing City / Festival instead of silent imputation
    for c in ("City", "Festival"):
        if c in df.columns:
            df[c] = df[c].fillna("Unknown")

    df = df.reset_index(drop=True)
    print(f"[prepare_data] clean rows: {len(df):,}")
    return df
def main() -> None:
    """Load the raw CSV, clean it, and write the result to ``OUTPUT_CSV``."""
    cleaned = clean(load_raw())

    # Make sure the destination directory exists before writing.
    output_dir = OUTPUT_CSV.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    cleaned.to_csv(OUTPUT_CSV, index=False)
    print(f"[prepare_data] wrote {OUTPUT_CSV}")


if __name__ == "__main__":
    main()