Spaces:
Sleeping
Sleeping
File size: 1,736 Bytes
52cc99a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 | import pandas as pd
import logging
logger = logging.getLogger(__name__)
def load_raw_data(train_path: str, store_path: str) -> pd.DataFrame:
    """Read the training and store CSV files and join them on ``Store``.

    The training file is read with ``low_memory=False``. Every training row
    is kept (left join); store columns are attached where the ``Store`` key
    matches.

    Args:
        train_path: Location of the training CSV.
        store_path: Location of the store CSV.

    Returns:
        The merged frame.
    """
    logger.info(f"Loading data from {train_path} and {store_path}")

    stores = pd.read_csv(store_path)
    sales = pd.read_csv(train_path, low_memory=False)

    # Left join: training rows drive the result, store columns are attached.
    merged = sales.merge(stores, on="Store", how="left")

    logger.info(f"Data merged. Shape: {merged.shape}")
    return merged
def load_store_data(store_path: str) -> pd.DataFrame:
    """Read the store metadata CSV shared by training and serving.

    Args:
        store_path: Location of the store CSV.

    Returns:
        The file contents as a DataFrame, exactly as parsed by
        ``pd.read_csv`` with default options.
    """
    logger.info("Loading store metadata from %s", store_path)
    store_frame = pd.read_csv(store_path)
    return store_frame
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Every step is applied only when the column it needs is present:

    * ``CompetitionDistance``: missing values become 100000.
    * ``StateHoliday``: cast to string, then ``"0"``/``"a"``/``"b"``/``"c"``
      are mapped to 0/1/2/3; any other value becomes 0.
    * ``Promo2``, ``Promo2SinceWeek``, ``Promo2SinceYear``: missing values
      become 0 and the column is cast to ``int``.
    * ``Sales``: rows whose ``Sales`` is not greater than 0 are dropped and,
      when an ``Open`` column exists, rows with ``Open == 0`` as well.
      Frames without ``Sales`` are not filtered.

    Args:
        df: Frame to clean.

    Returns:
        The cleaned frame. Surviving rows keep their original index labels.
    """
    logger.info("Cleaning data...")
    df = df.copy()

    # Fill missing CompetitionDistance with a large value
    if "CompetitionDistance" in df.columns:
        df["CompetitionDistance"] = df["CompetitionDistance"].fillna(100000)

    # Convert StateHoliday to numeric. Casting to str first lets the integer 0
    # and the string "0" hit the same key; unmapped values fall back to 0.
    if "StateHoliday" in df.columns:
        holiday_codes = {"0": 0, "a": 1, "b": 2, "c": 3}
        df["StateHoliday"] = (
            df["StateHoliday"].astype(str).map(holiday_codes).fillna(0).astype(int)
        )

    # Fill missing Promo2 fields with 0 and store them as integers
    for col in ("Promo2", "Promo2SinceWeek", "Promo2SinceYear"):
        if col in df.columns:
            df[col] = df[col].fillna(0).astype(int)

    # Filter out closed stores or zero sales for training
    if "Sales" in df.columns:
        keep = df["Sales"] > 0
        # "Open" is optional: a frame with Sales but no Open column is still
        # filtered on Sales instead of raising KeyError.
        if "Open" in df.columns:
            keep = keep & (df["Open"] != 0)
        df = df[keep]
        logger.info(f"Filtered rows with zero sales/closed shops. New shape: {df.shape}")

    return df
|