Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sklearn.ensemble import GradientBoostingClassifier | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score | |
| import pickle | |
| import os | |
| from feature_engineering import FeatureEngineer | |
| class CTRModelTrainer: | |
| def __init__(self): | |
| self.feature_engineer = FeatureEngineer() | |
| self.model = None | |
| def prepare_training_data(self): | |
| """Prepare training data from notification feedback""" | |
| conn = self.feature_engineer.conn | |
| # If no database connection, generate synthetic data | |
| if conn is None: | |
| print("Database not available. Generating synthetic notification data...") | |
| return self._generate_synthetic_ctr_data() | |
| query = """ | |
| SELECT | |
| nf.clicked::int as label, | |
| COALESCE(uf.recency_score, 0.0) as recency_score, | |
| COALESCE(uf.frequency_score, 0.0) as frequency_score, | |
| COALESCE(pp.conscientiousness, 0.5) as conscientiousness, | |
| COALESCE(pp.openness, 0.5) as openness, | |
| CASE WHEN sn.notification_type = 'reminder' THEN 1 ELSE 0 END as is_reminder, | |
| CASE WHEN sn.notification_type = 'milestone' THEN 1 ELSE 0 END as is_milestone, | |
| COALESCE(sn.priority_score, 0.5) as priority_score, | |
| EXTRACT(HOUR FROM sn.sent_at) / 24.0 as time_of_day | |
| FROM notification_feedback nf | |
| JOIN smart_notifications sn ON sn.id = nf.notification_id | |
| LEFT JOIN user_features uf ON uf.user_id = nf.user_id | |
| LEFT JOIN personality_profiles pp ON pp.user_id = nf.user_id | |
| WHERE sn.sent_at IS NOT NULL | |
| """ | |
| try: | |
| df = pd.read_sql(query, conn) | |
| except Exception as e: | |
| print(f"Database query failed ({e}). Generating synthetic data instead.") | |
| return self._generate_synthetic_ctr_data() | |
| if df.empty: | |
| print("No notification feedback data available. Generating synthetic data...") | |
| return self._generate_synthetic_ctr_data() | |
| X = df.drop('label', axis=1).fillna(0.5) | |
| y = df['label'] | |
| return X, y | |
| def train(self): | |
| """Train CTR prediction model""" | |
| print("Preparing training data...") | |
| X, y = self.prepare_training_data() | |
| if X is None or len(X) < 100: | |
| print(f"Not enough training data for CTR model (need 100+, have {len(X) if X is not None else 0})") | |
| return | |
| print(f"Training on {len(X)} samples") | |
| # Split data | |
| X_train, X_test, y_train, y_test = train_test_split( | |
| X, y, test_size=0.2, random_state=42, stratify=y | |
| ) | |
| # Train model | |
| self.model = GradientBoostingClassifier( | |
| n_estimators=100, | |
| max_depth=3, | |
| learning_rate=0.1, | |
| random_state=42 | |
| ) | |
| self.model.fit(X_train, y_train) | |
| # Evaluate | |
| self.evaluate(X_test, y_test) | |
| # Save model | |
| os.makedirs('models/personalization', exist_ok=True) | |
| with open('models/personalization/notification_ctr_model.pkl', 'wb') as f: | |
| pickle.dump(self.model, f) | |
| print("CTR model saved successfully") | |
| def evaluate(self, X_test, y_test): | |
| """Evaluate model performance""" | |
| # Predictions | |
| y_pred = self.model.predict(X_test) | |
| y_pred_proba = self.model.predict_proba(X_test)[:, 1] | |
| # Metrics | |
| auc = roc_auc_score(y_test, y_pred_proba) | |
| precision = precision_score(y_test, y_pred, zero_division=0) | |
| recall = recall_score(y_test, y_pred, zero_division=0) | |
| f1 = f1_score(y_test, y_pred, zero_division=0) | |
| print(f"AUC: {auc:.3f}") | |
| print(f"Precision: {precision:.3f}") | |
| print(f"Recall: {recall:.3f}") | |
| print(f"F1 Score: {f1:.3f}") | |
| # Feature importance | |
| feature_names = X_test.columns | |
| importances = self.model.feature_importances_ | |
| print("\nFeature Importance:") | |
| for name, importance in sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True): | |
| print(f" {name}: {importance:.3f}") | |
| def _generate_synthetic_ctr_data(self): | |
| """Generate synthetic notification click-through data""" | |
| np.random.seed(42) | |
| n_samples = 3000 | |
| rows = [] | |
| for _ in range(n_samples): | |
| # Generate features | |
| recency = np.random.uniform(0, 1) | |
| frequency = np.random.uniform(0, 1) | |
| conscientiousness = np.random.beta(2, 2) | |
| openness = np.random.beta(2, 2) | |
| neuroticism = np.random.beta(2, 2) | |
| is_reminder = np.random.choice([0, 1], p=[0.6, 0.4]) | |
| is_milestone = np.random.choice([0, 1], p=[0.8, 0.2]) | |
| priority_score = np.random.uniform(0.1, 1.0) | |
| time_of_day = np.random.uniform(0, 1) | |
| # Predict probability of click based on features | |
| p_click = ( | |
| 0.15 + 0.2 * conscientiousness + 0.1 * openness + | |
| 0.15 * priority_score + 0.05 * (1 - abs(time_of_day - 0.5)) - | |
| 0.1 * (frequency > 0.8) + 0.05 * is_milestone | |
| ) | |
| p_click = np.clip(p_click, 0.02, 0.95) | |
| clicked = int(np.random.random() < p_click) | |
| rows.append({ | |
| 'label': clicked, | |
| 'recency_score': recency, | |
| 'frequency_score': frequency, | |
| 'conscientiousness': conscientiousness, | |
| 'openness': openness, | |
| 'neuroticism': neuroticism, | |
| 'is_reminder': is_reminder, | |
| 'is_milestone': is_milestone, | |
| 'priority_score': priority_score, | |
| 'time_of_day': time_of_day, | |
| }) | |
| df = pd.DataFrame(rows) | |
| print(f"Generated {len(df)} synthetic notification samples") | |
| X = df.drop('label', axis=1).fillna(0.5) | |
| y = df['label'] | |
| return X, y | |
| if __name__ == "__main__": | |
| trainer = CTRModelTrainer() | |
| trainer.train() | |