Spaces:

lederyou
/

kuechenpassagent

Sleeping

App Files Files Community

kuechenpassagent / src /ml /prepare_data.py

lederyou

Upload folder using huggingface_hub

db662ea verified about 1 month ago

Raw

History Blame Contribute Delete

4.17 kB

	"""Load the raw Food Delivery dataset and produce a clean ML-ready CSV.

	The dataset has noisy "(min)" prefixes, NaN-strings, and mixed types.
	Here we clean and persist a tidy frame.

	Usage:
	python -m src.ml.prepare_data
	"""

	from __future__ import annotations

	import sys
	from pathlib import Path

	import numpy as np
	import pandas as pd

	sys.path.insert(0, str(Path(__file__).resolve().parents[2]))

	from src.config import ( # noqa: E402
	GEO_LAT_BOUNDS,
	GEO_LON_BOUNDS,
	PROCESSED_DIR,
	RAW_DIR,
	)


	# Folder under RAW_DIR that holds the food-delivery download.
	_FOOD_DELIVERY_DIR = RAW_DIR / "food_delivery"

	# Known file names for the raw CSV, listed in lookup order.
	RAW_CSV_CANDIDATES = [
	    _FOOD_DELIVERY_DIR / file_name
	    for file_name in ("train.csv", "Food_Delivery_Times.csv")
	]

	# Destination of the cleaned frame.
	OUTPUT_CSV = PROCESSED_DIR / "food_delivery_clean.csv"


	def _resolve_raw_csv() -> Path:
	    """Locate the raw food-delivery CSV on disk.

	    The paths in ``RAW_CSV_CANDIDATES`` are tried first, in order. If none of
	    them exists, any ``*.csv`` directly inside ``RAW_DIR / "food_delivery"``
	    is accepted as a fallback.

	    Returns:
	        Path of the CSV to read.

	    Raises:
	        FileNotFoundError: If no candidate exists and the fallback folder
	            contains no CSV file.
	    """
	    for candidate in RAW_CSV_CANDIDATES:
	        if candidate.exists():
	            return candidate

	    # Directory listings come back in no guaranteed order; sort so the same
	    # file is picked on every run when several CSVs are present.
	    fallback = sorted((RAW_DIR / "food_delivery").glob("*.csv"))
	    if fallback:
	        return fallback[0]

	    raise FileNotFoundError(
	        "No food-delivery CSV found. Run 'python scripts/download_data.py --ml'."
	    )


	def _clean_numeric(s: pd.Series) -> pd.Series:
	    """Strip noisy leading labels from a text column.

	    Values such as ``"(min) 24"`` or ``"conditions Sunny"`` become ``"24"`` /
	    ``"Sunny"``, and surrounding whitespace is trimmed. Despite the name the
	    result is still text; numeric coercion is left to the caller.

	    Args:
	        s: Column to clean. Only ``object``-dtype columns are modified.

	    Returns:
	        The cleaned column, or ``s`` itself when it is not ``object`` dtype.
	        Entries that were missing on input are still missing on output.
	    """
	    if s.dtype != object:
	        return s

	    # ``astype(str)`` would turn real missing values into the literal text
	    # "nan"; remember where they were so they can be restored afterwards.
	    was_missing = s.isna()

	    text = s.astype(str)
	    text = text.str.replace(r"^\(min\)\s*", "", regex=True)
	    text = text.str.replace(r"^conditions\s*", "", regex=True)
	    text = text.str.strip()
	    return text.where(~was_missing)


	def load_raw() -> pd.DataFrame:
	    """Read the raw CSV and return it with whitespace-trimmed column names.

	    The file is chosen by ``_resolve_raw_csv`` and its path is echoed to
	    stdout before reading.
	    """
	    source = _resolve_raw_csv()
	    print(f"[prepare_data] reading {source}")
	    frame = pd.read_csv(source)
	    # Header cells may carry stray spaces; normalise them once here.
	    return frame.rename(columns=str.strip)


	def clean(df: pd.DataFrame) -> pd.DataFrame:
	    """Return a tidy, ML-ready copy of the raw food-delivery frame.

	    Steps, in order:

	    1. strip label prefixes and whitespace from text columns,
	    2. turn NaN placeholder strings into real missing values,
	    3. coerce the known numeric columns (unparseable values become NaN),
	    4. build ``order_datetime`` from ``Order_Date`` + ``Time_Orderd``,
	    5. drop rows lacking the target ``Time_taken(min)`` or the timestamp,
	    6. sanitise coordinates (magnitude, placeholder and bounds checks),
	    7. label missing ``City`` / ``Festival`` as ``"Unknown"``.

	    Every column-specific step is skipped when that column is absent.

	    Args:
	        df: Raw frame as returned by ``load_raw``. It is not modified.

	    Returns:
	        A new frame with a fresh ``0..n-1`` index.
	    """
	    df = df.copy()

	    # Strip noisy prefixes
	    for col in df.select_dtypes(include="object").columns:
	        df[col] = _clean_numeric(df[col])

	    # NaN strings. "nan" is what stringifying a real missing value yields, so
	    # it is treated as missing too instead of surviving as a bogus category.
	    df = df.replace({"NaN ": np.nan, "NaN": np.nan, "nan": np.nan, "": np.nan})

	    # Coerce numerics
	    numeric_cols = [
	        "Delivery_person_Age",
	        "Delivery_person_Ratings",
	        "Restaurant_latitude",
	        "Restaurant_longitude",
	        "Delivery_location_latitude",
	        "Delivery_location_longitude",
	        "Vehicle_condition",
	        "multiple_deliveries",
	        "Time_taken(min)",
	    ]
	    for c in numeric_cols:
	        if c in df.columns:
	            df[c] = pd.to_numeric(df[c], errors="coerce")

	    # Parse order timestamp
	    if "Order_Date" in df.columns and "Time_Orderd" in df.columns:
	        df["order_datetime"] = pd.to_datetime(
	            df["Order_Date"].astype(str) + " " + df["Time_Orderd"].astype(str),
	            errors="coerce",
	            dayfirst=True,
	        )

	    # Drop rows without target or timestamp
	    target = "Time_taken(min)"
	    required = [c for c in (target, "order_datetime") if c in df.columns]
	    if required:
	        df = df.dropna(subset=required)

	    # Lat/long sanity: the raw data has sign noise (negative coords) and a few
	    # near-zero placeholders. Take the magnitude, then null out anything outside
	    # the plausible geographic window so haversine distances stay sane.
	    lat_cols = ("Restaurant_latitude", "Delivery_location_latitude")
	    lon_cols = ("Restaurant_longitude", "Delivery_location_longitude")
	    for c in lat_cols + lon_cols:
	        if c in df.columns:
	            magnitude = df[c].abs()
	            # Keep values >= 1; near-zero placeholders (and NaN) become NaN.
	            df[c] = magnitude.where(magnitude >= 1)
	    for c in lat_cols:
	        if c in df.columns:
	            lo, hi = GEO_LAT_BOUNDS
	            # Keep lo <= value <= hi; everything else becomes NaN.
	            df[c] = df[c].where(df[c].between(lo, hi))
	    for c in lon_cols:
	        if c in df.columns:
	            lo, hi = GEO_LON_BOUNDS
	            df[c] = df[c].where(df[c].between(lo, hi))

	    # Explicit category for missing City / Festival instead of silent imputation
	    for c in ("City", "Festival"):
	        if c in df.columns:
	            df[c] = df[c].fillna("Unknown")

	    df = df.reset_index(drop=True)
	    print(f"[prepare_data] clean rows: {len(df):,}")
	    return df


	def main() -> None:
	    """Run the pipeline: load the raw CSV, clean it, write ``OUTPUT_CSV``."""
	    tidy = clean(load_raw())

	    # Make sure the destination folder exists before writing.
	    out_dir = OUTPUT_CSV.parent
	    out_dir.mkdir(parents=True, exist_ok=True)

	    tidy.to_csv(OUTPUT_CSV, index=False)
	    print(f"[prepare_data] wrote {OUTPUT_CSV}")


	# Script entry point (``python -m src.ml.prepare_data``).
	if __name__ == "__main__":
	    main()