File size: 1,736 Bytes
52cc99a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import pandas as pd
import logging

logger = logging.getLogger(__name__)

def load_raw_data(train_path: str, store_path: str) -> pd.DataFrame:
    """Load the training and store CSV files and join them on ``Store``.

    Args:
        train_path: Location of the training CSV, handed to ``pd.read_csv``.
        store_path: Location of the store CSV, handed to ``pd.read_csv``.

    Returns:
        The training rows left-joined with the store columns on ``Store``.
        Training rows without a matching store are kept; their store
        columns hold missing values.
    """
    # Lazy %-style arguments: the message is only formatted when the record
    # is actually emitted, and the rendered text is unchanged.
    logger.info("Loading data from %s and %s", train_path, store_path)

    # low_memory=False makes pandas parse the file in a single pass, so each
    # column's dtype is inferred from all of its values rather than per chunk.
    train_df = pd.read_csv(train_path, low_memory=False)
    store_df = pd.read_csv(store_path)

    # Left join: the training rows drive the result.
    df = pd.merge(train_df, store_df, on="Store", how="left")
    logger.info("Data merged. Shape: %s", df.shape)

    return df


def load_store_data(store_path: str) -> pd.DataFrame:
    """Read the store metadata CSV into a DataFrame.

    Args:
        store_path: Location of the store CSV, handed to ``pd.read_csv``.

    Returns:
        The parsed store table, exactly as ``pd.read_csv`` produces it with
        default options.
    """
    logger.info("Loading store metadata from %s", store_path)
    store_frame = pd.read_csv(store_path)
    return store_frame

def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Every step is applied only when the column it needs is present:

    * ``CompetitionDistance``: missing values are replaced with 100000.
    * ``StateHoliday``: values are stringified and mapped
      ``"0"/"a"/"b"/"c"`` to ``0/1/2/3``; anything else (including missing
      values) becomes 0.
    * ``Promo2``, ``Promo2SinceWeek``, ``Promo2SinceYear``: missing values
      are replaced with 0 and the column is cast to ``int``.
    * If ``Sales`` is present, only rows with ``Sales > 0`` are kept; if
      ``Open`` is also present, rows with ``Open == 0`` are dropped too.

    Args:
        df: Frame to clean.

    Returns:
        A new DataFrame with the steps above applied. Surviving rows keep
        their original index labels.
    """
    logger.info("Cleaning data...")
    df = df.copy()

    # A large sentinel distance stands in for a missing value.
    if "CompetitionDistance" in df.columns:
        df["CompetitionDistance"] = df["CompetitionDistance"].fillna(100000)

    # Stringify first so a column mixing the integer 0 with letter codes
    # maps uniformly; unmapped values fall back to 0.
    if "StateHoliday" in df.columns:
        holiday_codes = {"0": 0, "a": 1, "b": 2, "c": 3}
        df["StateHoliday"] = (
            df["StateHoliday"].astype(str).map(holiday_codes).fillna(0).astype(int)
        )

    # Promo2 fields: missing means 0; cast so the columns are integer typed.
    for col in ("Promo2", "Promo2SinceWeek", "Promo2SinceYear"):
        if col in df.columns:
            df[col] = df[col].fillna(0).astype(int)

    # The row filter is applied only when a Sales column exists.
    if "Sales" in df.columns:
        keep = df["Sales"] > 0
        # "Open" is optional like every other column here; previously a frame
        # with Sales but without Open raised KeyError.
        if "Open" in df.columns:
            keep = keep & (df["Open"] != 0)
        df = df[keep]
        logger.info(
            "Filtered rows with zero sales/closed shops. New shape: %s", df.shape
        )

    return df