File size: 2,854 Bytes
ea6f215
 
 
 
 
 
 
7b0e417
ea6f215
 
 
 
 
 
7b0e417
ea6f215
 
 
 
 
 
7b0e417
 
 
ea6f215
 
 
 
7b0e417
ea6f215
 
 
7b0e417
ea6f215
 
 
7b0e417
ea6f215
aa92081
7b0e417
 
 
 
 
 
 
 
 
 
 
ea6f215
 
7b0e417
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea6f215
 
7b0e417
 
 
 
 
ea6f215
7b0e417
ea6f215
7b0e417
 
 
ea6f215
7b0e417
ea6f215
 
 
 
 
 
7b0e417
ea6f215
7b0e417
ea6f215
7b0e417
ea6f215
 
 
 
7b0e417
ea6f215
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import sys
import pickle
import pandas as pd
from datetime import datetime

# Add project root to path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from src.pipeline import RossmannPipeline
from src.core import setup_logger

logger = setup_logger(__name__)


def run_production_training():
    """
    Execute a formal production training run.

    Steps: ingest the full raw dataset, run feature engineering, apply the
    same categorical encoding used by app.py, train the final model on all
    available data, pickle it under ``models/``, and log the run through the
    pipeline's experiment tracker.

    Returns:
        None. Exits early (with an error log) if the raw CSV is missing.
    """
    train_csv = os.path.abspath("data/raw/train.csv")
    if not os.path.exists(train_csv):
        logger.error(
            f"Raw data not found at {train_csv}. Please ensure data is present."
        )
        return

    logger.info("Initializing Production Training Pipeline...")
    pipeline = RossmannPipeline(train_csv)

    # 1. Ingest Full Dataset
    logger.info("Ingesting full dataset...")
    df_raw = pipeline.ingestor.ingest(train_csv)

    # 2. Feature Engineering
    logger.info("Running feature engineering...")
    df_feat = pipeline.run_feature_engineering(df_raw)

    # 3. Define Final Feature Set
    # Consistent encoding with app.py; unmapped/unknown categories become 0.
    if "StoreType" in df_feat.columns:
        df_feat["StoreType"] = (
            df_feat["StoreType"]
            .astype(str)
            .map({"a": 1, "b": 2, "c": 3, "d": 4})
            .fillna(0)
        )
    if "Assortment" in df_feat.columns:
        df_feat["Assortment"] = (
            df_feat["Assortment"].astype(str).map({"a": 1, "b": 2, "c": 3}).fillna(0)
        )

    candidate_cols = [
        "Store",
        "DayOfWeek",
        "Promo",
        "StateHoliday",
        "SchoolHoliday",
        "Year",
        "Month",
        "Day",
        "IsWeekend",
        "DayOfMonth",
        "CompetitionDistance",
        "StoreType",
        "Assortment",
    ] + [c for c in df_feat.columns if "fourier" in c or "easter" in c]

    # Fix: the encodings above are guarded by column-existence checks, but the
    # original selection was not — any missing expected column (e.g. StoreType)
    # raised KeyError at df_feat[feature_cols]. Select only columns present.
    feature_cols = [c for c in candidate_cols if c in df_feat.columns]

    # 4. Final Training (using all available data to create the 'Gold' model)
    # NaNs are imputed with 0, matching the encoding convention above.
    X = df_feat[feature_cols].fillna(0)
    y = df_feat["target"]

    logger.info(
        f"Training final model on {len(df_feat)} records with {len(feature_cols)} features..."
    )
    pipeline.train(X, y)

    # 5. Export Model & Metadata
    os.makedirs("models", exist_ok=True)
    model_path = "models/rossmann_production_model.pkl"
    with open(model_path, "wb") as f:
        pickle.dump(pipeline.model, f)

    # Log the successful run
    pipeline.tracker.log_experiment(
        name="production_training_run",
        params={
            "feature_count": len(feature_cols),
            "data_size": len(df_feat),
            "model_type": str(type(pipeline.model)),
        },
        metrics={"status": "success"},
    )

    logger.info("--- PRODUCTION TRAINING COMPLETE ---")
    logger.info(f"Model saved to: {model_path}")
    logger.info("Project is now ready for deployment/inference.")


if __name__ == "__main__":
    # Script entry point: run the full production training workflow.
    run_production_training()