Ramanuj-Sarkar commited on
Commit
c1fcfd8
·
verified ·
1 Parent(s): 93965f1

Upload predictionpipeline.py

Browse files

This is the prediction pipeline of the model.

Files changed (1) hide show
  1. predictionpipeline.py +106 -0
predictionpipeline.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import roc_auc_score
5
+ from sklearn.preprocessing import StandardScaler
6
+ from sklearn.linear_model import LogisticRegression
7
+ import mlflow
8
+ import joblib
9
+
10
+
11
+ def load_data():
12
+ df = pd.read_csv("customer_marketing_data.csv")
13
+
14
+ # Simple data quality checks
15
+ if df.shape[0] == 0:
16
+ raise pd.errors.EmptyDataError("Empty dataset")
17
+ if "converted" not in df.columns:
18
+ raise KeyError("Target missing")
19
+
20
+ return df
21
+
22
+
23
+ def feature_engineering(df):
24
+ df["engagement_score"] = (
25
+ df["app_logins_last30"]
26
+ + df["emails_opened_last30"]
27
+ + df["feature_usage_last30"]
28
+ )
29
+
30
+ features = [
31
+ "tenure_months",
32
+ "vehicle_age",
33
+ "engagement_score",
34
+ "service_visits_last12"
35
+ ]
36
+
37
+ X = df[features]
38
+ y = df["converted"]
39
+
40
+ return X, y
41
+
42
+
43
+ def train_model(X, y):
44
+ scaler = StandardScaler()
45
+ X_scaled = scaler.fit_transform(X)
46
+
47
+ X_train, X_test, y_train, y_test = train_test_split(
48
+ X_scaled, y, test_size=0.2, random_state=42
49
+ )
50
+
51
+ model = LogisticRegression()
52
+ model.fit(X_train, y_train)
53
+
54
+ preds = model.predict_proba(X_test)[:, 1]
55
+ auc = roc_auc_score(y_test, preds)
56
+
57
+ print("Validation AUC:", auc)
58
+
59
+ return model, scaler, auc
60
+
61
+ def log_model(model, scaler, auc):
62
+ with mlflow.start_run():
63
+ mlflow.log_metric("auc", auc)
64
+ mlflow.sklearn.log_model(model, "conversion_model")
65
+ joblib.dump(scaler, "scaler.pkl")
66
+ mlflow.log_artifact("scaler.pkl")
67
+
68
+
69
+ def score_customers(model, scaler):
70
+ new_data = pd.read_csv("daily_customer_scoring_data.csv")
71
+
72
+ new_data["engagement_score"] = (
73
+ new_data["app_logins_last30"]
74
+ + new_data["emails_opened_last30"]
75
+ + new_data["feature_usage_last30"]
76
+ )
77
+
78
+ features = [
79
+ "tenure_months",
80
+ "vehicle_age",
81
+ "engagement_score",
82
+ "service_visits_last12"
83
+ ]
84
+
85
+ X = scaler.transform(new_data[features])
86
+
87
+ new_data["conversion_probability"] = model.predict_proba(X)[:, 1]
88
+
89
+ new_data.to_csv("scored_output.csv", index=False)
90
+
91
+
92
+ def monitoring_check(scored_df):
93
+ avg_score = scored_df["conversion_probability"].mean()
94
+
95
+ if avg_score < 0.05:
96
+ raise ValueError("Prediction distribution abnormal — investigate pipeline")
97
+
98
+
99
+ if __name__ == "__main__":
100
+ df = load_data()
101
+ X, y = feature_engineering(df)
102
+
103
+ model, scaler, auc = train_model(X, y)
104
+ log_model(model, scaler, auc)
105
+
106
+ score_customers(model, scaler)