# Rossmann-Store-Sales / scripts/generate_submission.py
# Last change: "Fix linting, formatting, and deployment configuration"
# (commit 7b0e417, by ymlin105)
import os
import sys
import pickle
import pandas as pd
import numpy as np
from datetime import datetime
# Add project root to path so `src.*` imports resolve when this script is
# run directly from the scripts/ directory.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from src.pipeline import RossmannPipeline
from src.core import setup_logger

# Module-level logger configured by the project's shared helper.
logger = setup_logger(__name__)
def _predict_open_stores(pipeline, df_open):
    """Run feature engineering and model prediction for open-store rows.

    Args:
        pipeline: RossmannPipeline with a loaded `model` attribute.
        df_open: DataFrame of test rows where Open != 0, including 'Id'.

    Returns:
        DataFrame with columns ['Id', 'Sales'] — log-space predictions
        transformed back to sales units with expm1.
    """
    df_feat = pipeline.run_feature_engineering(df_open)

    # NOTE(review): fitting a fresh LabelEncoder on the test data only
    # reproduces the training encoding if the category sets (and their
    # sorted order) match training exactly — verify against the training
    # pipeline, or persist the fitted encoders alongside the model.
    from sklearn.preprocessing import LabelEncoder

    encoder = LabelEncoder()
    for col in ("StoreType", "Assortment"):
        if col in df_feat.columns:
            df_feat[col] = encoder.fit_transform(df_feat[col].astype(str))

    # Feature set must match the columns used at training time.
    feature_cols = [
        "Store",
        "DayOfWeek",
        "Promo",
        "StateHoliday",
        "SchoolHoliday",
        "Year",
        "Month",
        "Day",
        "IsWeekend",
        "DayOfMonth",
        "CompetitionDistance",
        "CompetitionOpenTime",
        "StoreType",
        "Assortment",
    ] + [c for c in df_feat.columns if "fourier" in c or "easter" in c]

    # Test rows may have gaps (e.g. missing competition info); fill with 0.
    X_test = df_feat[feature_cols].fillna(0)

    # Model was trained on log1p(Sales); invert with expm1.
    y_pred_log = pipeline.model.predict(X_test)
    df_feat["Sales"] = np.expm1(y_pred_log)
    return df_feat[["Id", "Sales"]]


def generate_submission():
    """
    Generates the Kaggle submission file using the production model.

    Reads data/raw/test.csv, loads models/rossmann_production_model.pkl,
    predicts sales for open stores (closed stores get Sales = 0), and
    writes data/output/submission.csv with columns ['Id', 'Sales'].
    Returns early (logging an error) if either input file is missing.
    """
    test_csv = os.path.abspath("data/raw/test.csv")
    model_path = os.path.abspath("models/rossmann_production_model.pkl")

    # Fail fast with clear messages if required inputs are missing.
    if not os.path.exists(test_csv):
        logger.error(f"Test data not found at {test_csv}.")
        return
    if not os.path.exists(model_path):
        logger.error(f"Model not found at {model_path}. Run production training first.")
        return

    logger.info("Initializing Submission Generation...")

    # RossmannPipeline handles ingestion and feature engineering; its
    # ingestor merges store.csv automatically when it sits next to test.csv.
    pipeline = RossmannPipeline(test_csv)

    # Load the pickled production model onto the pipeline.
    with open(model_path, "rb") as f:
        pipeline.model = pickle.load(f)

    logger.info("Ingesting and merging test data...")
    df_test = pipeline.ingestor.ingest(test_csv)

    logger.info("Running feature engineering on test data...")
    # run_feature_engineering filters out 'Open' == 0 rows for the Sales
    # transform, so closed stores are handled separately: their sales are 0.
    df_open = df_test[df_test["Open"] != 0].copy()
    df_closed = df_test[df_test["Open"] == 0].copy()
    logger.info(f"Test data split: {len(df_open)} open, {len(df_closed)} closed.")

    # Build the submission from only the non-empty parts; concatenating
    # empty columns-only frames raises FutureWarnings in recent pandas and
    # can corrupt dtypes.
    parts = []
    if len(df_open) > 0:
        parts.append(_predict_open_stores(pipeline, df_open))
    if len(df_closed) > 0:
        parts.append(pd.DataFrame({"Id": df_closed["Id"], "Sales": 0.0}))

    if parts:
        submission = pd.concat(parts).sort_values("Id")
    else:
        submission = pd.DataFrame(columns=["Id", "Sales"])

    # Final formatting: integer Ids, non-negative sales (vectorized clip
    # instead of a per-element apply).
    submission["Id"] = submission["Id"].astype(int)
    submission["Sales"] = submission["Sales"].clip(lower=0)

    output_path = "data/output/submission.csv"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    submission.to_csv(output_path, index=False)

    logger.info(f"Submission saved to {output_path}")
    logger.info(f"Total records: {len(submission)}")
    print(submission.head())
# Script entry point: build the Kaggle submission when run directly.
if __name__ == "__main__":
    generate_submission()