import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

from feature_engineering import FeatureEngineer

# Default location of the pickled model written by CTRModelTrainer.train().
DEFAULT_MODEL_PATH = 'models/personalization/notification_ctr_model.pkl'

# train() refuses to fit on fewer rows than this.
MIN_TRAINING_SAMPLES = 100


class CTRModelTrainer:
    """Train, evaluate and persist a notification click-through-rate model.

    Training rows come from the database connection exposed by
    ``FeatureEngineer`` (its ``conn`` attribute); when that connection is
    missing, the query fails, or it returns no labeled rows, synthetic data
    is generated instead.
    """

    def __init__(self):
        self.feature_engineer = FeatureEngineer()
        # Fitted GradientBoostingClassifier; stays None until train() succeeds.
        self.model = None

    def prepare_training_data(self):
        """Build the feature matrix and click labels for training.

        Returns:
            A ``(X, y)`` tuple: ``X`` is a DataFrame of features with missing
            values filled with 0.5, ``y`` is the integer click label.
            Falls back to ``_generate_synthetic_ctr_data()`` when there is no
            connection, the query raises, or no labeled rows come back.
        """
        conn = self.feature_engineer.conn

        # If no database connection, generate synthetic data
        if conn is None:
            print("Database not available. Generating synthetic notification data...")
            return self._generate_synthetic_ctr_data()

        query = """
            SELECT
                nf.clicked::int as label,
                COALESCE(uf.recency_score, 0.0) as recency_score,
                COALESCE(uf.frequency_score, 0.0) as frequency_score,
                COALESCE(pp.conscientiousness, 0.5) as conscientiousness,
                COALESCE(pp.openness, 0.5) as openness,
                CASE WHEN sn.notification_type = 'reminder' THEN 1 ELSE 0 END as is_reminder,
                CASE WHEN sn.notification_type = 'milestone' THEN 1 ELSE 0 END as is_milestone,
                COALESCE(sn.priority_score, 0.5) as priority_score,
                EXTRACT(HOUR FROM sn.sent_at) / 24.0 as time_of_day
            FROM notification_feedback nf
            JOIN smart_notifications sn ON sn.id = nf.notification_id
            LEFT JOIN user_features uf ON uf.user_id = nf.user_id
            LEFT JOIN personality_profiles pp ON pp.user_id = nf.user_id
            WHERE sn.sent_at IS NOT NULL
        """

        try:
            df = pd.read_sql(query, conn)
        except Exception as e:
            # Deliberate best-effort: any query failure falls back to synthetic data.
            print(f"Database query failed ({e}). Generating synthetic data instead.")
            return self._generate_synthetic_ctr_data()

        # A NULL `clicked` value would surface as NaN in the label column and
        # break fitting, so rows without a label are discarded up front.
        df = df.dropna(subset=['label'])

        if df.empty:
            print("No notification feedback data available. Generating synthetic data...")
            return self._generate_synthetic_ctr_data()

        X = df.drop('label', axis=1).fillna(0.5)
        y = df['label'].astype(int)
        return X, y

    def train(self, model_path=DEFAULT_MODEL_PATH):
        """Train the CTR model, print evaluation metrics and pickle the model.

        Args:
            model_path: Destination file for the pickled model. Missing parent
                directories are created.

        Returns:
            None. Training is skipped (with a printed message) when there are
            fewer than ``MIN_TRAINING_SAMPLES`` rows or the labels contain a
            single class.
        """
        print("Preparing training data...")
        X, y = self.prepare_training_data()

        if X is None or len(X) < MIN_TRAINING_SAMPLES:
            print(f"Not enough training data for CTR model (need {MIN_TRAINING_SAMPLES}+, have {len(X) if X is not None else 0})")
            return

        class_counts = pd.Series(y).value_counts()
        if len(class_counts) < 2:
            # A classifier cannot be fitted on labels that never vary.
            print("Training labels contain a single class; cannot train CTR model")
            return

        print(f"Training on {len(X)} samples")

        # Stratified splitting needs at least two rows of every class;
        # otherwise fall back to a plain random split instead of raising.
        stratify = y if class_counts.min() >= 2 else None

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=stratify
        )

        # Train model
        self.model = GradientBoostingClassifier(
            n_estimators=100,
            max_depth=3,
            learning_rate=0.1,
            random_state=42
        )
        self.model.fit(X_train, y_train)

        # Evaluate
        self.evaluate(X_test, y_test)

        # Save model
        model_dir = os.path.dirname(model_path)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        with open(model_path, 'wb') as f:
            pickle.dump(self.model, f)

        print("CTR model saved successfully")

    def evaluate(self, X_test, y_test):
        """Print AUC, precision, recall, F1 and feature importances.

        Args:
            X_test: Held-out feature DataFrame (its columns name the features).
            y_test: Held-out labels aligned with ``X_test``.

        Returns:
            Dict with keys ``auc``, ``precision``, ``recall`` and ``f1``.
            ``auc`` is None when ``y_test`` contains a single class, because
            ROC AUC is undefined in that case.
        """
        # Predictions
        y_pred = self.model.predict(X_test)
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]

        # Metrics
        if np.unique(y_test).size < 2:
            # roc_auc_score raises ValueError on single-class ground truth.
            auc = None
        else:
            auc = roc_auc_score(y_test, y_pred_proba)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        if auc is None:
            print("AUC: n/a (test labels contain a single class)")
        else:
            print(f"AUC: {auc:.3f}")
        print(f"Precision: {precision:.3f}")
        print(f"Recall: {recall:.3f}")
        print(f"F1 Score: {f1:.3f}")

        # Feature importance, most important first
        feature_names = X_test.columns
        importances = self.model.feature_importances_
        ranked = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)

        print("\nFeature Importance:")
        for name, importance in ranked:
            print(f"  {name}: {importance:.3f}")

        return {'auc': auc, 'precision': precision, 'recall': recall, 'f1': f1}

    def _generate_synthetic_ctr_data(self, n_samples=3000, seed=42):
        """Generate synthetic notification click-through data.

        Args:
            n_samples: Number of rows to generate.
            seed: Seed for the private random generator, so repeated calls
                return identical data without reseeding NumPy's global RNG.

        Returns:
            A ``(X, y)`` tuple shaped like the output of
            ``prepare_training_data``.
        """
        # NOTE(review): the synthetic frame carries a `neuroticism` column that
        # the SQL feature set in prepare_training_data() does not select, so a
        # model fitted on synthetic data expects 9 features versus 8 for real
        # data. Confirm which schema the serving code uses before aligning them.
        rng = np.random.RandomState(seed)

        rows = []
        for _ in range(n_samples):
            # Generate features (draw order is kept fixed so that the output
            # for a given seed stays reproducible).
            recency = rng.uniform(0, 1)
            frequency = rng.uniform(0, 1)
            conscientiousness = rng.beta(2, 2)
            openness = rng.beta(2, 2)
            neuroticism = rng.beta(2, 2)
            is_reminder = rng.choice([0, 1], p=[0.6, 0.4])
            is_milestone = rng.choice([0, 1], p=[0.8, 0.2])
            priority_score = rng.uniform(0.1, 1.0)
            time_of_day = rng.uniform(0, 1)

            # Predict probability of click based on features
            p_click = (
                0.15
                + 0.2 * conscientiousness
                + 0.1 * openness
                + 0.15 * priority_score
                + 0.05 * (1 - abs(time_of_day - 0.5))
                - 0.1 * (frequency > 0.8)
                + 0.05 * is_milestone
            )
            p_click = np.clip(p_click, 0.02, 0.95)
            clicked = int(rng.random_sample() < p_click)

            rows.append({
                'label': clicked,
                'recency_score': recency,
                'frequency_score': frequency,
                'conscientiousness': conscientiousness,
                'openness': openness,
                'neuroticism': neuroticism,
                'is_reminder': is_reminder,
                'is_milestone': is_milestone,
                'priority_score': priority_score,
                'time_of_day': time_of_day,
            })

        df = pd.DataFrame(rows)
        print(f"Generated {len(df)} synthetic notification samples")

        X = df.drop('label', axis=1).fillna(0.5)
        y = df['label']
        return X, y


if __name__ == "__main__":
    trainer = CTRModelTrainer()
    trainer.train()