# Rossmann-Store-Sales/scripts/train_production_model.py
# Last commit aa92081 (ymlin105):
#   feat: standardize feature engineering and push new production model
import os
import sys
import pickle
import pandas as pd
from datetime import datetime
# Put the parent of this script's directory on the import path so that the
# project-local `src` package can be imported when this file is run directly.
_script_dir = os.path.dirname(__file__)
_project_root = os.path.abspath(os.path.join(_script_dir, os.pardir))
sys.path.append(_project_root)

from src.pipeline import RossmannPipeline
from src.core import setup_logger

# Module-wide logger, created by the project's setup_logger with this
# module's __name__.
logger = setup_logger(__name__)
def run_production_training(
    train_csv="data/raw/train.csv",
    model_path='models/rossmann_production_model.pkl',
):
    """
    Executes a formal production training run.

    Ingests the raw training CSV, runs the pipeline's feature engineering,
    ordinally encodes ``StoreType`` and ``Assortment``, trains the pipeline's
    model on every available row, pickles the fitted model to ``model_path``
    and records the run through the pipeline's experiment tracker.

    Args:
        train_csv: Path to the raw training CSV. Relative paths are resolved
            against the current working directory.
        model_path: Destination of the pickled model. Its parent directory
            is created if it does not exist.

    Returns:
        None. If ``train_csv`` does not exist, an error is logged and the
        function returns early without training anything.

    Raises:
        KeyError: If the engineered frame lacks any required feature column
            or the ``target`` column.
    """
    train_csv = os.path.abspath(train_csv)
    if not os.path.exists(train_csv):
        logger.error(f"Raw data not found at {train_csv}. Please ensure data is present.")
        return

    logger.info("Initializing Production Training Pipeline...")
    pipeline = RossmannPipeline(train_csv)

    # 1. Ingest Full Dataset
    logger.info("Ingesting full dataset...")
    df_raw = pipeline.ingestor.ingest(train_csv)

    # 2. Feature Engineering
    logger.info("Running feature engineering...")
    df_feat = pipeline.run_feature_engineering(df_raw)

    # 3. Define Final Feature Set
    # Consistent Encoding with app.py: letters map to ordinal codes, and any
    # value outside the mapping (including missing values) becomes 0.
    ordinal_codes = {
        'StoreType': {'a': 1, 'b': 2, 'c': 3, 'd': 4},
        'Assortment': {'a': 1, 'b': 2, 'c': 3},
    }
    for column, codes in ordinal_codes.items():
        if column in df_feat.columns:
            df_feat[column] = df_feat[column].astype(str).map(codes).fillna(0)

    # NOTE(review): 'StateHoliday' is used as-is below; presumably
    # run_feature_engineering already makes it numeric -- confirm.
    base_features = [
        'Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
        'Year', 'Month', 'Day', 'IsWeekend', 'DayOfMonth',
        'CompetitionDistance', 'StoreType', 'Assortment',
    ]
    # Pick up every generated seasonal column by name. str() keeps the scan
    # safe if the frame carries non-string column labels.
    seasonal_features = [
        c for c in df_feat.columns
        if 'fourier' in str(c) or 'easter' in str(c)
    ]
    feature_cols = base_features + seasonal_features

    # Fail early with the exact column names instead of a pandas indexing
    # error raised from inside the selection below.
    missing_cols = [
        c for c in feature_cols + ['target'] if c not in df_feat.columns
    ]
    if missing_cols:
        logger.error(f"Engineered data is missing required columns: {missing_cols}")
        raise KeyError(f"Missing required columns: {missing_cols}")

    # 4. Final Training (using all available data to create the 'Gold' model)
    X = df_feat[feature_cols].fillna(0)
    y = df_feat['target']

    logger.info(f"Training final model on {len(df_feat)} records with {len(feature_cols)} features...")
    pipeline.train(X, y)

    # 5. Export Model & Metadata
    # Derive the directory from model_path so the two can never disagree; a
    # bare filename has no directory component and needs nothing created.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    with open(model_path, 'wb') as f:
        pickle.dump(pipeline.model, f)

    # Log the successful run
    pipeline.tracker.log_experiment(
        name="production_training_run",
        params={
            "feature_count": len(feature_cols),
            "data_size": len(df_feat),
            "model_type": str(type(pipeline.model)),
        },
        metrics={"status": "success"},
    )

    logger.info("--- PRODUCTION TRAINING COMPLETE ---")
    logger.info(f"Model saved to: {model_path}")
    logger.info("Project is now ready for deployment/inference.")
if __name__ == "__main__":
    # Start the training run only when this file is executed as a script;
    # importing the module must not trigger it.
    run_production_training()