SPG_ML / personalization /train_ctr_model.py
meetmendapara's picture
Added Personalization Models
5059de5
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
import pickle
import os
from feature_engineering import FeatureEngineer
class CTRModelTrainer:
def __init__(self):
self.feature_engineer = FeatureEngineer()
self.model = None
def prepare_training_data(self):
"""Prepare training data from notification feedback"""
conn = self.feature_engineer.conn
# If no database connection, generate synthetic data
if conn is None:
print("Database not available. Generating synthetic notification data...")
return self._generate_synthetic_ctr_data()
query = """
SELECT
nf.clicked::int as label,
COALESCE(uf.recency_score, 0.0) as recency_score,
COALESCE(uf.frequency_score, 0.0) as frequency_score,
COALESCE(pp.conscientiousness, 0.5) as conscientiousness,
COALESCE(pp.openness, 0.5) as openness,
CASE WHEN sn.notification_type = 'reminder' THEN 1 ELSE 0 END as is_reminder,
CASE WHEN sn.notification_type = 'milestone' THEN 1 ELSE 0 END as is_milestone,
COALESCE(sn.priority_score, 0.5) as priority_score,
EXTRACT(HOUR FROM sn.sent_at) / 24.0 as time_of_day
FROM notification_feedback nf
JOIN smart_notifications sn ON sn.id = nf.notification_id
LEFT JOIN user_features uf ON uf.user_id = nf.user_id
LEFT JOIN personality_profiles pp ON pp.user_id = nf.user_id
WHERE sn.sent_at IS NOT NULL
"""
try:
df = pd.read_sql(query, conn)
except Exception as e:
print(f"Database query failed ({e}). Generating synthetic data instead.")
return self._generate_synthetic_ctr_data()
if df.empty:
print("No notification feedback data available. Generating synthetic data...")
return self._generate_synthetic_ctr_data()
X = df.drop('label', axis=1).fillna(0.5)
y = df['label']
return X, y
def train(self):
"""Train CTR prediction model"""
print("Preparing training data...")
X, y = self.prepare_training_data()
if X is None or len(X) < 100:
print(f"Not enough training data for CTR model (need 100+, have {len(X) if X is not None else 0})")
return
print(f"Training on {len(X)} samples")
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# Train model
self.model = GradientBoostingClassifier(
n_estimators=100,
max_depth=3,
learning_rate=0.1,
random_state=42
)
self.model.fit(X_train, y_train)
# Evaluate
self.evaluate(X_test, y_test)
# Save model
os.makedirs('models/personalization', exist_ok=True)
with open('models/personalization/notification_ctr_model.pkl', 'wb') as f:
pickle.dump(self.model, f)
print("CTR model saved successfully")
def evaluate(self, X_test, y_test):
"""Evaluate model performance"""
# Predictions
y_pred = self.model.predict(X_test)
y_pred_proba = self.model.predict_proba(X_test)[:, 1]
# Metrics
auc = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
print(f"AUC: {auc:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")
# Feature importance
feature_names = X_test.columns
importances = self.model.feature_importances_
print("\nFeature Importance:")
for name, importance in sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True):
print(f" {name}: {importance:.3f}")
def _generate_synthetic_ctr_data(self):
"""Generate synthetic notification click-through data"""
np.random.seed(42)
n_samples = 3000
rows = []
for _ in range(n_samples):
# Generate features
recency = np.random.uniform(0, 1)
frequency = np.random.uniform(0, 1)
conscientiousness = np.random.beta(2, 2)
openness = np.random.beta(2, 2)
neuroticism = np.random.beta(2, 2)
is_reminder = np.random.choice([0, 1], p=[0.6, 0.4])
is_milestone = np.random.choice([0, 1], p=[0.8, 0.2])
priority_score = np.random.uniform(0.1, 1.0)
time_of_day = np.random.uniform(0, 1)
# Predict probability of click based on features
p_click = (
0.15 + 0.2 * conscientiousness + 0.1 * openness +
0.15 * priority_score + 0.05 * (1 - abs(time_of_day - 0.5)) -
0.1 * (frequency > 0.8) + 0.05 * is_milestone
)
p_click = np.clip(p_click, 0.02, 0.95)
clicked = int(np.random.random() < p_click)
rows.append({
'label': clicked,
'recency_score': recency,
'frequency_score': frequency,
'conscientiousness': conscientiousness,
'openness': openness,
'neuroticism': neuroticism,
'is_reminder': is_reminder,
'is_milestone': is_milestone,
'priority_score': priority_score,
'time_of_day': time_of_day,
})
df = pd.DataFrame(rows)
print(f"Generated {len(df)} synthetic notification samples")
X = df.drop('label', axis=1).fillna(0.5)
y = df['label']
return X, y
if __name__ == "__main__":
trainer = CTRModelTrainer()
trainer.train()