| import pandas as pd |
| import numpy as np |
| from sklearn.model_selection import train_test_split |
| from sklearn.metrics import roc_auc_score |
| from sklearn.preprocessing import StandardScaler |
| from sklearn.linear_model import LogisticRegression |
| import mlflow |
| import joblib |
|
|
|
|
| def load_data(): |
| df = pd.read_csv("customer_marketing_data.csv") |
|
|
| |
| if df.shape[0] == 0: |
| raise pd.errors.EmptyDataError("Empty dataset") |
| if "converted" not in df.columns: |
| raise KeyError("Target missing") |
|
|
| return df |
|
|
|
|
| def feature_engineering(df): |
| df["engagement_score"] = ( |
| df["app_logins_last30"] |
| + df["emails_opened_last30"] |
| + df["feature_usage_last30"] |
| ) |
|
|
| features = [ |
| "tenure_months", |
| "vehicle_age", |
| "engagement_score", |
| "service_visits_last12" |
| ] |
|
|
| X = df[features] |
| y = df["converted"] |
|
|
| return X, y |
|
|
|
|
| def train_model(X, y): |
| scaler = StandardScaler() |
| X_scaled = scaler.fit_transform(X) |
|
|
| X_train, X_test, y_train, y_test = train_test_split( |
| X_scaled, y, test_size=0.2, random_state=42 |
| ) |
|
|
| model = LogisticRegression() |
| model.fit(X_train, y_train) |
|
|
| preds = model.predict_proba(X_test)[:, 1] |
| auc = roc_auc_score(y_test, preds) |
|
|
| print("Validation AUC:", auc) |
|
|
| return model, scaler, auc |
|
|
| def log_model(model, scaler, auc): |
| with mlflow.start_run(): |
| mlflow.log_metric("auc", auc) |
| mlflow.sklearn.log_model(model, "conversion_model") |
| joblib.dump(scaler, "scaler.pkl") |
| mlflow.log_artifact("scaler.pkl") |
|
|
|
|
| def score_customers(model, scaler): |
| new_data = pd.read_csv("daily_customer_scoring_data.csv") |
|
|
| new_data["engagement_score"] = ( |
| new_data["app_logins_last30"] |
| + new_data["emails_opened_last30"] |
| + new_data["feature_usage_last30"] |
| ) |
|
|
| features = [ |
| "tenure_months", |
| "vehicle_age", |
| "engagement_score", |
| "service_visits_last12" |
| ] |
|
|
| X = scaler.transform(new_data[features]) |
|
|
| new_data["conversion_probability"] = model.predict_proba(X)[:, 1] |
|
|
| new_data.to_csv("scored_output.csv", index=False) |
|
|
|
|
| def monitoring_check(scored_df): |
| avg_score = scored_df["conversion_probability"].mean() |
|
|
| if avg_score < 0.05: |
| raise ValueError("Prediction distribution abnormal — investigate pipeline") |
|
|
|
|
| if __name__ == "__main__": |
| df = load_data() |
| X, y = feature_engineering(df) |
|
|
| model, scaler, auc = train_model(X, y) |
| log_model(model, scaler, auc) |
|
|
| score_customers(model, scaler) |