# Rossmann-Store-Sales / scripts/generate_submission.py
# Last change: "Fix linting, formatting, and deployment configuration"
# (commit 7b0e417, by ymlin105)
import os
import sys
import pickle
import pandas as pd
import numpy as np
from datetime import datetime
# Add project root to path so `src.*` imports resolve when this script is
# run directly from the scripts/ directory.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from src.pipeline import RossmannPipeline
from src.core import setup_logger

# Module-level logger configured by the project's shared helper.
logger = setup_logger(__name__)
def _predict_open_stores(pipeline, df_open):
    """Run feature engineering and model prediction for open-store rows.

    Args:
        pipeline: RossmannPipeline with a loaded `model` attribute.
        df_open: DataFrame of test rows where Open != 0, including 'Id'.

    Returns:
        DataFrame with columns ['Id', 'Sales'] — log-space predictions
        transformed back to sales units with expm1.
    """
    df_feat = pipeline.run_feature_engineering(df_open)

    # NOTE(review): fitting a fresh LabelEncoder on the test data only
    # reproduces the training encoding if the category sets (and their
    # sorted order) match training exactly — verify against the training
    # pipeline, or persist the fitted encoders alongside the model.
    from sklearn.preprocessing import LabelEncoder

    encoder = LabelEncoder()
    for col in ("StoreType", "Assortment"):
        if col in df_feat.columns:
            df_feat[col] = encoder.fit_transform(df_feat[col].astype(str))

    # Feature set must match the columns used at training time.
    feature_cols = [
        "Store",
        "DayOfWeek",
        "Promo",
        "StateHoliday",
        "SchoolHoliday",
        "Year",
        "Month",
        "Day",
        "IsWeekend",
        "DayOfMonth",
        "CompetitionDistance",
        "CompetitionOpenTime",
        "StoreType",
        "Assortment",
    ] + [c for c in df_feat.columns if "fourier" in c or "easter" in c]

    # Test rows may have gaps (e.g. missing competition info); fill with 0.
    X_test = df_feat[feature_cols].fillna(0)

    # Model was trained on log1p(Sales); invert with expm1.
    y_pred_log = pipeline.model.predict(X_test)
    df_feat["Sales"] = np.expm1(y_pred_log)
    return df_feat[["Id", "Sales"]]


def generate_submission():
    """
    Generates the Kaggle submission file using the production model.

    Reads data/raw/test.csv, loads models/rossmann_production_model.pkl,
    predicts sales for open stores (closed stores get Sales = 0), and
    writes data/output/submission.csv with columns ['Id', 'Sales'].
    Returns early (logging an error) if either input file is missing.
    """
    test_csv = os.path.abspath("data/raw/test.csv")
    model_path = os.path.abspath("models/rossmann_production_model.pkl")

    # Fail fast with clear messages if required inputs are missing.
    if not os.path.exists(test_csv):
        logger.error(f"Test data not found at {test_csv}.")
        return
    if not os.path.exists(model_path):
        logger.error(f"Model not found at {model_path}. Run production training first.")
        return

    logger.info("Initializing Submission Generation...")

    # RossmannPipeline handles ingestion and feature engineering; its
    # ingestor merges store.csv automatically when it sits next to test.csv.
    pipeline = RossmannPipeline(test_csv)

    # Load the pickled production model onto the pipeline.
    with open(model_path, "rb") as f:
        pipeline.model = pickle.load(f)

    logger.info("Ingesting and merging test data...")
    df_test = pipeline.ingestor.ingest(test_csv)

    logger.info("Running feature engineering on test data...")
    # run_feature_engineering filters out 'Open' == 0 rows for the Sales
    # transform, so closed stores are handled separately: their sales are 0.
    df_open = df_test[df_test["Open"] != 0].copy()
    df_closed = df_test[df_test["Open"] == 0].copy()
    logger.info(f"Test data split: {len(df_open)} open, {len(df_closed)} closed.")

    # Build the submission from only the non-empty parts; concatenating
    # empty columns-only frames raises FutureWarnings in recent pandas and
    # can corrupt dtypes.
    parts = []
    if len(df_open) > 0:
        parts.append(_predict_open_stores(pipeline, df_open))
    if len(df_closed) > 0:
        parts.append(pd.DataFrame({"Id": df_closed["Id"], "Sales": 0.0}))

    if parts:
        submission = pd.concat(parts).sort_values("Id")
    else:
        submission = pd.DataFrame(columns=["Id", "Sales"])

    # Final formatting: integer Ids, non-negative sales (vectorized clip
    # instead of a per-element apply).
    submission["Id"] = submission["Id"].astype(int)
    submission["Sales"] = submission["Sales"].clip(lower=0)

    output_path = "data/output/submission.csv"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    submission.to_csv(output_path, index=False)

    logger.info(f"Submission saved to {output_path}")
    logger.info(f"Total records: {len(submission)}")
    print(submission.head())
# Script entry point: build the Kaggle submission when run directly.
if __name__ == "__main__":
    generate_submission()