# conversion-probability-prediction / predictionpipeline.py
# Source-page header (not code): Ramanuj-Sarkar's picture | Upload predictionpipeline.py | c1fcfd8 verified
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import mlflow
import joblib
def load_data(path="customer_marketing_data.csv"):
    """Load the training dataset and run basic data-quality checks.

    Args:
        path: Source of the training CSV, passed straight to
            ``pandas.read_csv`` (a file path or a file-like object).
            Defaults to ``"customer_marketing_data.csv"`` so existing
            zero-argument callers keep working.

    Returns:
        The loaded ``DataFrame``; it has at least one row and contains a
        ``converted`` column.

    Raises:
        pandas.errors.EmptyDataError: If the parsed data has no rows.
        KeyError: If the ``converted`` target column is missing.
    """
    df = pd.read_csv(path)
    # Simple data quality checks
    if len(df) == 0:
        raise pd.errors.EmptyDataError("Empty dataset")
    if "converted" not in df.columns:
        raise KeyError("Target missing")
    return df
def feature_engineering(df):
    """Build the model feature matrix and the target vector.

    ``engagement_score`` is the sum of the three 30-day activity columns
    (``app_logins_last30``, ``emails_opened_last30``,
    ``feature_usage_last30``). It is computed on a copy, so the caller's
    DataFrame is left unmodified.

    Args:
        df: Raw data containing the activity columns above plus
            ``tenure_months``, ``vehicle_age``, ``service_visits_last12``
            and ``converted``.

    Returns:
        Tuple ``(X, y)``: ``X`` is a DataFrame with the four model features
        in a fixed order, ``y`` is the ``converted`` column.

    Raises:
        KeyError: If any required column is missing from ``df``.
    """
    engagement_score = (
        df["app_logins_last30"]
        + df["emails_opened_last30"]
        + df["feature_usage_last30"]
    )
    features = [
        "tenure_months",
        "vehicle_age",
        "engagement_score",
        "service_visits_last12",
    ]
    # assign() returns a new frame, so the derived column never leaks back
    # into the DataFrame owned by the caller.
    enriched = df.assign(engagement_score=engagement_score)
    X = enriched[features]
    y = df["converted"]
    return X, y
def train_model(X, y):
    """Fit a standardized logistic-regression model and report hold-out AUC.

    The data is split 80/20 (``random_state=42``) *before* scaling, and the
    scaler is fitted on the training split only. Fitting it on the full
    dataset would let hold-out statistics leak into preprocessing and bias
    the reported validation AUC.

    Args:
        X: Feature matrix.
        y: Target labels aligned with ``X``.

    Returns:
        Tuple ``(model, scaler, auc)``: the fitted ``LogisticRegression``,
        the ``StandardScaler`` fitted on the training split, and the ROC AUC
        measured on the hold-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    scaler = StandardScaler()
    # Learn mean/variance from the training rows only; the hold-out rows are
    # merely transformed with those statistics.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model = LogisticRegression()
    model.fit(X_train_scaled, y_train)
    preds = model.predict_proba(X_test_scaled)[:, 1]
    auc = roc_auc_score(y_test, preds)
    print("Validation AUC:", auc)
    return model, scaler, auc
def log_model(model, scaler, auc):
    """Record the metric, the model and the scaler in a single MLflow run.

    Args:
        model: Fitted model, logged under the name ``conversion_model``.
        scaler: Fitted scaler, pickled to ``scaler.pkl`` in the working
            directory and then attached to the run as an artifact.
        auc: Validation AUC, logged as the ``auc`` metric.
    """
    scaler_file = "scaler.pkl"
    with mlflow.start_run():
        mlflow.log_metric("auc", auc)
        mlflow.sklearn.log_model(model, "conversion_model")
        # The scaler is stored next to the model so scoring jobs can apply
        # the same preprocessing.
        joblib.dump(scaler, scaler_file)
        mlflow.log_artifact(scaler_file)
def score_customers(
    model,
    scaler,
    *,
    input_path="daily_customer_scoring_data.csv",
    output_path="scored_output.csv",
):
    """Score a batch of customers and persist the results.

    Rebuilds ``engagement_score`` the same way as training, scales the four
    model features with ``scaler``, and stores the positive-class probability
    in a new ``conversion_probability`` column.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        scaler: Fitted transformer exposing ``transform``.
        input_path: Source of the scoring CSV, passed to ``pandas.read_csv``.
            Defaults to the original hard-coded file name.
        output_path: Destination for the scored CSV, passed to
            ``DataFrame.to_csv``. Defaults to the original hard-coded file
            name.

    Returns:
        The scored ``DataFrame`` (input columns plus ``engagement_score`` and
        ``conversion_probability``), so callers can hand it directly to a
        monitoring step instead of re-reading the output file.

    Raises:
        KeyError: If a required input column is missing.
    """
    new_data = pd.read_csv(input_path)
    new_data["engagement_score"] = (
        new_data["app_logins_last30"]
        + new_data["emails_opened_last30"]
        + new_data["feature_usage_last30"]
    )
    # Must stay in the same order as the features used at training time.
    features = [
        "tenure_months",
        "vehicle_age",
        "engagement_score",
        "service_visits_last12",
    ]
    X = scaler.transform(new_data[features])
    # Column 1 of predict_proba is the probability of the positive class.
    new_data["conversion_probability"] = model.predict_proba(X)[:, 1]
    new_data.to_csv(output_path, index=False)
    return new_data
def monitoring_check(scored_df, min_avg_score=0.05):
    """Fail loudly when the scored batch looks abnormal.

    Args:
        scored_df: DataFrame with a ``conversion_probability`` column.
        min_avg_score: Lowest acceptable mean predicted probability.
            Defaults to ``0.05``, the original hard-coded threshold.

    Raises:
        ValueError: If the mean probability is below ``min_avg_score`` or is
            undefined (no rows, or every score is NaN).
        KeyError: If ``conversion_probability`` is missing.
    """
    avg_score = scored_df["conversion_probability"].mean()
    # NaN compares False against any number, so it must be tested explicitly;
    # otherwise an empty or all-NaN scoring run would pass this check.
    if pd.isna(avg_score) or avg_score < min_avg_score:
        raise ValueError("Prediction distribution abnormal — investigate pipeline")
def main():
    """Run the pipeline end to end: load, featurize, train, log, score, monitor."""
    df = load_data()
    X, y = feature_engineering(df)
    model, scaler, auc = train_model(X, y)
    log_model(model, scaler, auc)
    score_customers(model, scaler)
    # score_customers persists its results to scored_output.csv; read them
    # back so the distribution guard actually runs on every scoring batch.
    monitoring_check(pd.read_csv("scored_output.csv"))


if __name__ == "__main__":
    main()