import pandas as pd import logging logger = logging.getLogger(__name__) def load_raw_data(train_path: str, store_path: str) -> pd.DataFrame: """Loads and merges the training and store datasets.""" logger.info(f"Loading data from {train_path} and {store_path}") train_df = pd.read_csv(train_path, low_memory=False) store_df = pd.read_csv(store_path) # Merge datasets df = pd.merge(train_df, store_df, on="Store", how="left") logger.info(f"Data merged. Shape: {df.shape}") return df def load_store_data(store_path: str) -> pd.DataFrame: """Loads store metadata used by both training and serving.""" logger.info("Loading store metadata from %s", store_path) return pd.read_csv(store_path) def clean_data(df: pd.DataFrame) -> pd.DataFrame: """Performs basic data cleaning.""" logger.info("Cleaning data...") df = df.copy() # Fill missing CompetitionDistance with a large value if "CompetitionDistance" in df.columns: df["CompetitionDistance"] = df["CompetitionDistance"].fillna(100000) # Convert StateHoliday to numeric if "StateHoliday" in df.columns: df["StateHoliday"] = df["StateHoliday"].astype(str).map({ "0": 0, "a": 1, "b": 2, "c": 3 }).fillna(0).astype(int) # Fill binary promo indicators for col in ["Promo2", "Promo2SinceWeek", "Promo2SinceYear"]: if col in df.columns: df[col] = df[col].fillna(0).astype(int) # Filter out closed stores or zero sales for training if "Sales" in df.columns: df = df[df["Open"] != 0] df = df[df["Sales"] > 0] logger.info(f"Filtered rows with zero sales/closed shops. New shape: {df.shape}") return df