kuechenpassagent / src /ml /prepare_data.py
lederyou's picture
Upload folder using huggingface_hub
db662ea verified
Raw
History Blame Contribute Delete
4.17 kB
"""Load the raw Food Delivery dataset and produce a clean ML-ready CSV.
The dataset has noisy "(min)" / "conditions" value prefixes, NaN-strings, and mixed types.
Here we clean and persist a tidy frame.
Usage:
python -m src.ml.prepare_data
"""
from __future__ import annotations
import sys
from pathlib import Path
import numpy as np
import pandas as pd
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from src.config import ( # noqa: E402
GEO_LAT_BOUNDS,
GEO_LON_BOUNDS,
PROCESSED_DIR,
RAW_DIR,
)
# File names the raw dataset is known to ship under, in lookup priority order.
RAW_CSV_CANDIDATES = [
    RAW_DIR / "food_delivery" / filename
    for filename in ("train.csv", "Food_Delivery_Times.csv")
]

# Destination of the cleaned, ML-ready frame written by main().
OUTPUT_CSV = PROCESSED_DIR / "food_delivery_clean.csv"
def _resolve_raw_csv() -> Path:
    """Locate the raw food-delivery CSV on disk.

    The paths in RAW_CSV_CANDIDATES are tried first, in order. If none of
    them exists, any ``*.csv`` directly inside ``RAW_DIR / "food_delivery"``
    is accepted; the alphabetically first match is returned so that the
    choice is reproducible across machines and runs.

    Returns:
        Path to the CSV file to load.

    Raises:
        FileNotFoundError: if no candidate exists and the fallback directory
            holds no CSV file.
    """
    for candidate in RAW_CSV_CANDIDATES:
        if candidate.exists():
            return candidate

    # glob() yields entries in filesystem order, which is not stable; sort so
    # the same file is picked every time when several CSVs are present.
    fallback = sorted((RAW_DIR / "food_delivery").glob("*.csv"))
    if fallback:
        return fallback[0]

    raise FileNotFoundError(
        "No food-delivery CSV found. Run 'python scripts/download_data.py --ml'."
    )
def _clean_numeric(s: pd.Series) -> pd.Series:
    """Strip the textual prefixes the raw dataset puts in front of values.

    ``'(min) 24'`` becomes ``'24'`` and ``'conditions Sunny'`` becomes
    ``'Sunny'``; the remainder is whitespace-trimmed. Only object-dtype
    Series are processed; a Series of any other dtype is returned unchanged.

    Non-missing values are converted to ``str`` before cleaning. Missing
    values (NaN / None) stay missing in the result instead of being turned
    into the literal text ``"nan"`` / ``"None"``.

    Args:
        s: Column to clean.

    Returns:
        The cleaned Series (or ``s`` itself when it is not object dtype).
    """
    if s.dtype != object:
        return s

    # Remember which cells are missing: astype(str) would stringify them and
    # they would then slip past every later NaN check.
    missing = s.isna()

    text = s.astype(str)
    text = text.str.replace(r"^\(min\)\s*", "", regex=True)
    text = text.str.replace(r"^conditions\s*", "", regex=True)
    text = text.str.strip()

    # Put real NaN back where the input had no value.
    return text.mask(missing)
def load_raw() -> pd.DataFrame:
    """Read the raw food-delivery CSV and return it with trimmed column names.

    The file is located with _resolve_raw_csv(); its path is printed before
    reading. Leading/trailing whitespace is removed from every column label.
    """
    source = _resolve_raw_csv()
    print(f"[prepare_data] reading {source}")
    frame = pd.read_csv(source)
    # Header cells may carry stray whitespace; normalise the labels.
    frame.columns = list(map(str.strip, frame.columns))
    return frame
def clean(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw food-delivery frame.

    The input is not modified. Steps, in order:

    1. Run _clean_numeric over every object-dtype column to drop the textual
       value prefixes.
    2. Convert missing-value markers ("NaN", "nan", empty string) to real NaN.
    3. Coerce the known numeric columns with ``errors="coerce"``.
    4. Build ``order_datetime`` from ``Order_Date`` + ``Time_Orderd``
       (day-first parsing, unparsable values become NaT).
    5. Drop rows that lack the target ``Time_taken(min)`` or the timestamp.
    6. Sanitise coordinates: take the absolute value, then null anything
       below 1 or outside GEO_LAT_BOUNDS / GEO_LON_BOUNDS.
    7. Fill missing ``City`` / ``Festival`` with the explicit label
       ``"Unknown"``.

    Every column-specific step is skipped when the column is absent.

    Args:
        df: Raw frame as returned by load_raw().

    Returns:
        A new frame with a fresh 0..n-1 index.
    """
    df = df.copy()

    # Strip noisy prefixes
    for col in df.select_dtypes(include="object").columns:
        df[col] = _clean_numeric(df[col])

    # NaN strings. "nan" is what a genuinely empty cell looks like once it has
    # been stringified, so it must be treated as missing too; otherwise it
    # survives as a bogus category and escapes the "Unknown" fill below.
    df = df.replace({"NaN ": np.nan, "NaN": np.nan, "nan": np.nan, "": np.nan})

    # Coerce numerics
    numeric_cols = [
        "Delivery_person_Age",
        "Delivery_person_Ratings",
        "Restaurant_latitude",
        "Restaurant_longitude",
        "Delivery_location_latitude",
        "Delivery_location_longitude",
        "Vehicle_condition",
        "multiple_deliveries",
        "Time_taken(min)",
    ]
    for c in numeric_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Parse order timestamp
    if "Order_Date" in df.columns and "Time_Orderd" in df.columns:
        df["order_datetime"] = pd.to_datetime(
            df["Order_Date"].astype(str) + " " + df["Time_Orderd"].astype(str),
            errors="coerce",
            dayfirst=True,
        )

    # Drop rows without target or timestamp
    target = "Time_taken(min)"
    if target in df.columns:
        df = df.dropna(subset=[target])
    if "order_datetime" in df.columns:
        df = df.dropna(subset=["order_datetime"])

    # Lat/long sanity: the raw data has sign noise (negative coords) and a few
    # near-zero placeholders. Take the magnitude, then null out anything outside
    # the plausible geographic window so haversine distances stay sane.
    lat_cols = ("Restaurant_latitude", "Delivery_location_latitude")
    lon_cols = ("Restaurant_longitude", "Delivery_location_longitude")
    for cols, bounds in ((lat_cols, GEO_LAT_BOUNDS), (lon_cols, GEO_LON_BOUNDS)):
        lo, hi = bounds
        for c in cols:
            if c not in df.columns:
                continue
            magnitude = df[c].abs()
            implausible = (magnitude < 1) | (magnitude < lo) | (magnitude > hi)
            # mask() returns a new column, so no in-place write on a frame
            # that was derived from dropna().
            df[c] = magnitude.mask(implausible)

    # Explicit category for missing City / Festival instead of silent imputation
    for c in ("City", "Festival"):
        if c in df.columns:
            df[c] = df[c].fillna("Unknown")

    df = df.reset_index(drop=True)
    print(f"[prepare_data] clean rows: {len(df):,}")
    return df
def main() -> None:
    """Run the pipeline end to end: load the raw CSV, clean it, write OUTPUT_CSV."""
    cleaned = clean(load_raw())

    # Make sure the destination directory exists before writing.
    out_dir = OUTPUT_CSV.parent
    out_dir.mkdir(parents=True, exist_ok=True)

    cleaned.to_csv(OUTPUT_CSV, index=False)
    print(f"[prepare_data] wrote {OUTPUT_CSV}")


if __name__ == "__main__":
    main()