# Production training entry point for the Rossmann sales pipeline.
import os
import sys
import pickle
import pandas as pd
from datetime import datetime
# Add project root to path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from src.pipeline import RossmannPipeline
from src.core import setup_logger
logger = setup_logger(__name__)
def run_production_training():
    """Execute a formal production training run.

    Loads the full raw Rossmann training CSV, runs the pipeline's feature
    engineering, encodes categorical store attributes with the same integer
    mapping used at serving time (app.py), trains the final model on all
    available data, pickles it to models/, and logs the run via the
    pipeline's experiment tracker.

    Returns:
        None. Side effects: writes models/rossmann_production_model.pkl
        and records an experiment through ``pipeline.tracker``.
    """
    train_csv = os.path.abspath("data/raw/train.csv")
    if not os.path.exists(train_csv):
        # Fail fast with a clear message instead of erroring deep in ingestion.
        logger.error(
            "Raw data not found at %s. Please ensure data is present.", train_csv
        )
        return

    logger.info("Initializing Production Training Pipeline...")
    pipeline = RossmannPipeline(train_csv)

    # 1. Ingest the full dataset (no train/validation split for the final model).
    logger.info("Ingesting full dataset...")
    df_raw = pipeline.ingestor.ingest(train_csv)

    # 2. Feature engineering (adds Year/Month/Day, fourier/easter columns, etc.).
    logger.info("Running feature engineering...")
    df_feat = pipeline.run_feature_engineering(df_raw)

    # 3. Categorical encoding. These integer maps MUST stay in sync with app.py
    #    so that training and inference see identical feature values.
    #    Unseen categories fall back to 0 via fillna.
    if "StoreType" in df_feat.columns:
        df_feat["StoreType"] = (
            df_feat["StoreType"]
            .astype(str)
            .map({"a": 1, "b": 2, "c": 3, "d": 4})
            .fillna(0)
        )
    if "Assortment" in df_feat.columns:
        df_feat["Assortment"] = (
            df_feat["Assortment"].astype(str).map({"a": 1, "b": 2, "c": 3}).fillna(0)
        )

    # Final feature set: fixed columns plus any generated fourier/easter features.
    feature_cols = [
        "Store",
        "DayOfWeek",
        "Promo",
        "StateHoliday",
        "SchoolHoliday",
        "Year",
        "Month",
        "Day",
        "IsWeekend",
        "DayOfMonth",
        "CompetitionDistance",
        "StoreType",
        "Assortment",
    ] + [c for c in df_feat.columns if "fourier" in c or "easter" in c]

    # 4. Final training on all available data to produce the 'Gold' model.
    X = df_feat[feature_cols].fillna(0)
    y = df_feat["target"]
    logger.info(
        "Training final model on %d records with %d features...",
        len(df_feat),
        len(feature_cols),
    )
    pipeline.train(X, y)

    # 5. Export the trained model and log run metadata.
    os.makedirs("models", exist_ok=True)
    model_path = "models/rossmann_production_model.pkl"
    with open(model_path, "wb") as f:
        pickle.dump(pipeline.model, f)

    pipeline.tracker.log_experiment(
        name="production_training_run",
        params={
            "feature_count": len(feature_cols),
            "data_size": len(df_feat),
            "model_type": str(type(pipeline.model)),
        },
        metrics={"status": "success"},
    )

    logger.info("--- PRODUCTION TRAINING COMPLETE ---")
    logger.info("Model saved to: %s", model_path)
    logger.info("Project is now ready for deployment/inference.")


if __name__ == "__main__":
    run_production_training()