Spaces:
Sleeping
Sleeping
| import os | |
| import sys | |
| import pickle | |
| import pandas as pd | |
| import numpy as np | |
| from datetime import datetime | |
| # Add project root to path | |
| sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) | |
| from src.pipeline import RossmannPipeline | |
| from src.core import setup_logger | |
| logger = setup_logger(__name__) | |
def _predict_open_stores(pipeline, df_open):
    """
    Predicts sales for the test rows of stores that are open.

    Runs the pipeline's feature engineering on ``df_open``, label-encodes the
    categorical store columns, builds the feature matrix and converts the
    model's log-space predictions back to sales with ``expm1``.

    Args:
        pipeline: Object exposing ``run_feature_engineering(df)`` and a
            ``model`` attribute with a ``predict`` method.
        df_open: Test rows whose ``Open`` flag is not 0.

    Returns:
        DataFrame with the columns ``Id`` and ``Sales``.
    """
    # Imported lazily (as the original script did) so scikit-learn is only
    # needed when there is actually something to predict.
    from sklearn.preprocessing import LabelEncoder

    features = pipeline.run_feature_engineering(df_open)

    # NOTE(review): the encoder is re-fitted on the test data. LabelEncoder
    # numbers classes in sorted order, so the codes only agree with training
    # if the same set of categories occurs here and training encoded them the
    # same way -- TODO confirm against the training code, or persist the
    # fitted encoders next to the model.
    encoder = LabelEncoder()
    for col in ("StoreType", "Assortment"):
        if col in features.columns:
            features[col] = encoder.fit_transform(features[col].astype(str))

    # Feature columns (must match training): the fixed base set plus every
    # generated fourier/easter column.
    feature_cols = [
        "Store",
        "DayOfWeek",
        "Promo",
        "StateHoliday",
        "SchoolHoliday",
        "Year",
        "Month",
        "Day",
        "IsWeekend",
        "DayOfMonth",
        "CompetitionDistance",
        "CompetitionOpenTime",
        "StoreType",
        "Assortment",
    ] + [c for c in features.columns if "fourier" in c or "easter" in c]

    # Fill missing values for test data
    X_test = features[feature_cols].fillna(0)

    # Predict in log space and transform back
    features["Sales"] = np.expm1(pipeline.model.predict(X_test))
    return features[["Id", "Sales"]]


def generate_submission():
    """
    Generates the Kaggle submission file using the production model.

    Reads ``data/raw/test.csv`` through the pipeline's ingestor, predicts
    sales for rows whose ``Open`` flag is not 0 (rows with a missing flag
    therefore count as open), assigns 0 sales to closed rows, and writes the
    result sorted by ``Id`` to ``data/output/submission.csv``.

    Returns:
        None. If the test data or the model file is missing, an error is
        logged and nothing is written.
    """
    test_csv = os.path.abspath("data/raw/test.csv")
    model_path = os.path.abspath("models/rossmann_production_model.pkl")

    if not os.path.exists(test_csv):
        logger.error(f"Test data not found at {test_csv}.")
        return
    if not os.path.exists(model_path):
        logger.error(f"Model not found at {model_path}. Run production training first.")
        return

    logger.info("Initializing Submission Generation...")

    # RossmannPipeline handles ingestion and feature engineering; the test
    # data has to be merged with the store data just like the train data.
    pipeline = RossmannPipeline(test_csv)

    # Load production model.
    # SECURITY: unpickling can execute arbitrary code -- only load model
    # files produced by this project's own training run.
    with open(model_path, "rb") as f:
        pipeline.model = pickle.load(f)

    # 1. Ingest and merge test data (the ingestor is expected to merge
    #    store.csv when it sits in the same directory as test.csv).
    logger.info("Ingesting and merging test data...")
    df_test = pipeline.ingestor.ingest(test_csv)

    # 2. Split open and closed stores. Closed stores get Sales = 0 directly
    #    instead of going through feature engineering and the model.
    logger.info("Running feature engineering on test data...")
    df_open = df_test[df_test["Open"] != 0].copy()
    df_closed = df_test[df_test["Open"] == 0].copy()
    logger.info(f"Test data split: {len(df_open)} open, {len(df_closed)} closed.")

    # 3. Predict for open stores. An empty placeholder frame is deliberately
    #    not added when there are none: empty all-object frames interfere
    #    with dtype inference in pd.concat.
    frames = []
    if not df_open.empty:
        frames.append(_predict_open_stores(pipeline, df_open))

    # 4. Handle closed stores (Sales = 0)
    frames.append(pd.DataFrame({"Id": df_closed["Id"], "Sales": 0.0}))

    # 5. Combine and sort
    submission = pd.concat(frames).sort_values("Id")

    # Feature engineering may drop rows; a short submission would be
    # incomplete, so make any mismatch visible.
    if len(submission) != len(df_test):
        logger.warning(
            f"Submission has {len(submission)} rows but test data has "
            f"{len(df_test)} rows."
        )

    # 6. Final formatting and save
    submission["Id"] = submission["Id"].astype(int)
    # Ensure no negative sales (vectorized instead of a per-row lambda)
    submission["Sales"] = submission["Sales"].clip(lower=0)

    output_path = "data/output/submission.csv"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    submission.to_csv(output_path, index=False)

    logger.info(f"Submission saved to {output_path}")
    logger.info(f"Total records: {len(submission)}")
    print(submission.head())
if __name__ == "__main__":
    # Script entry point: build the submission file only when this module is
    # executed directly, not when it is imported.
    generate_submission()