Rossmann-Store-Sales / scripts /validate_rossmann_data.py
ymlin105's picture
Initial Production Release: End-to-End Rossmann Sales Forecasting System
ea6f215
import pandas as pd
import numpy as np
import os
import sys
# Add project root to path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.ingest.ingestor import DataIngestorFactory
from src.core.logger import setup_logger
logger = setup_logger(__name__)
def validate_data(train_path):
    """
    Performs data integrity and continuity checks on the Rossmann dataset.

    The dataset is loaded through the "rossmann" ingestor and the following
    are written to the module logger: row count, columns with missing values,
    date continuity for one sample store, number of unique stores, the
    StoreType distribution (when that column exists), descriptive statistics
    of Sales, and the first rows of the monthly mean of Sales.

    Args:
        train_path: Location of the training data; passed unchanged to the
            ingestor's ``ingest`` method.

    Returns:
        True when every check ran to completion. False when the dataset is
        empty or when any step raised an exception (the error is logged with
        its traceback and is not re-raised).
    """
    try:
        # 1. Ingest Data
        factory = DataIngestorFactory()
        ingestor = factory.get_data_ingestor("rossmann")
        df = ingestor.ingest(train_path)
        logger.info(f"Loaded dataset with {len(df)} rows.")

        # Fail explicitly on an empty dataset instead of relying on an
        # IndexError from ``unique()[0]`` further down.
        if df.empty:
            logger.error("Validation failed: dataset is empty.")
            return False

        # 2. Check for missing values
        missing_values = df.isnull().sum()
        logger.info(f"Missing values per column:\n{missing_values[missing_values > 0]}")

        # 3. Check Date Continuity
        # Only the first store is inspected, as a sample, to keep the check cheap.
        # NOTE(review): assumes the ingestor parsed 'Date' as datetime64
        # (``.date()`` is called on the min/max below) -- confirm.
        logger.info("Checking date continuity per store...")
        store_id = df['Store'].unique()[0]
        store_data = df[df['Store'] == store_id].sort_values('Date')

        min_date = store_data['Date'].min()
        max_date = store_data['Date'].max()
        # Every calendar day between the first and last observation.
        expected_range = pd.date_range(start=min_date, end=max_date)
        missing_dates = expected_range.difference(store_data['Date'])

        if len(missing_dates) > 0:
            logger.warning(f"Store {store_id} has {len(missing_dates)} missing dates in range {min_date.date()} to {max_date.date()}")
        else:
            logger.info(f"Store {store_id} has a continuous date range.")

        # 4. Store coverage (the data is keyed by Store x Date): report how
        # many distinct stores are present and, if available, their types.
        num_stores = df['Store'].nunique()
        logger.info(f"Total unique stores: {num_stores}")

        if 'StoreType' in df.columns:
            logger.info(f"Store Types distribution:\n{df['StoreType'].value_counts()}")

        # 5. Sales Statistics
        logger.info(f"Sales Stats:\n{df['Sales'].describe()}")

        # Check for non-stationarity (sample trend): mean Sales per calendar
        # month. The 'ME' (month-end) alias needs pandas >= 2.2.
        monthly_sales = df.set_index('Date').resample('ME')['Sales'].mean()
        logger.info(f"Monthly Avg Sales Trend (first 5 months):\n{monthly_sales.head()}")

        return True
    except Exception as e:
        # Top-level boundary of the validation run: report the failure with
        # its traceback and signal it through the return value.
        logger.exception(f"Validation failed: {e}")
        return False
if __name__ == "__main__":
    # Script entry point: validate data/raw/train.csv and report the outcome
    # both on stdout and through the process exit status (0 = success,
    # 1 = validation failed or input file missing).
    #
    # The relative path is resolved against the current working directory,
    # so the script must be started from a directory containing data/raw/.
    train_csv = os.path.abspath("data/raw/train.csv")

    if not os.path.exists(train_csv):
        print(f"File not found: {train_csv}")
        sys.exit(1)

    if validate_data(train_csv):
        print("Rossmann data validation completed successfully.")
    else:
        print("Rossmann data validation failed.")
        # Non-zero status lets shells / CI pipelines detect the failure.
        sys.exit(1)