"""
Quick ML Model Training (Minimal Dependencies)
Train models with minimal dependencies - just pandas, numpy, sklearn, joblib
"""
import json
import os
import warnings

import joblib
import numpy as np
import pandas as pd

# Silence noisy library warnings before the sklearn imports below.
warnings.filterwarnings('ignore')

from sklearn.ensemble import (
    GradientBoostingClassifier, GradientBoostingRegressor,
    RandomForestClassifier, RandomForestRegressor
)
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    f1_score, mean_absolute_error, mean_squared_error,
    r2_score, roc_auc_score
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def prepare_data_for_churn(data):
    """Prepare features for churn prediction.

    Builds a numeric feature matrix from the master table: drops identifier,
    date, and leakage-prone columns, one-hot encodes known categoricals,
    sanitizes inf/NaN values, performs a stratified 80/20 split, and
    standard-scales the features (scaler fit on train only).

    Args:
        data: DataFrame containing a binary 'has_churned' column plus
            feature columns. Assumed one row per customer — TODO confirm.

    Returns:
        (X_train_scaled, X_test_scaled, y_train, y_test, scaler)
    """
    print("\n[*] Preparing Churn Prediction Data...")
    df = data.copy()
    y = df['has_churned']
    # Drop non-feature columns (IDs, dates, and churn-related leakage).
    drop_cols = ['customer_id', 'has_churned', 'churn_date', 'churn_reason',
                 'signup_date', 'contract_end_date', 'last_service_date',
                 'value_segment', 'lifecycle_stage', 'plan_type']
    # Categorical columns are one-hot encoded only if actually present.
    cat_cols = [c for c in ['value_segment', 'lifecycle_stage', 'plan_type'] if c in df.columns]
    X_num = df.drop(columns=drop_cols + cat_cols, errors='ignore')
    if cat_cols:
        X_cat = pd.get_dummies(df[cat_cols], drop_first=True)
        X = pd.concat([X_num, X_cat], axis=1)
    else:
        X = X_num
    # Keep only numeric columns; sanitize infinities and missing values.
    # Fix: reassign instead of inplace-mutating the select_dtypes result
    # (inplace on a possibly-copied frame is a chained-assignment hazard).
    X = X.select_dtypes(include=[np.number])
    X = X.replace([np.inf, -np.inf], 0).fillna(0)
    # Stratify so the churn rate is comparable across train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)
    # Fit the scaler on the training split only to avoid test-set leakage.
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
    print(f" β Train: {len(X_train):,} | Test: {len(X_test):,} | Features: {X.shape[1]}")
    print(f" β Churn rate: {y_train.mean()*100:.1f}% (train), {y_test.mean()*100:.1f}% (test)")
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler
def train_churn_model(X_train, X_test, y_train, y_test):
    """Fit several churn classifiers and return the one with the best AUC.

    Trains logistic regression, random forest, and gradient boosting
    candidates, prints per-model metrics, a classification report for the
    winner, and (when the winner exposes feature_importances_) its top
    feature importances.

    Returns:
        (best_model, metrics_dict_for_best_model, importance_df_or_None)
    """
    print("\n[*] Training Churn Models...")
    candidates = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
        'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced', n_jobs=-1),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)
    }
    results = {}
    best_model, best_name, best_auc = None, None, 0
    for model_name, clf in candidates.items():
        print(f" - Training {model_name}...", end=' ')
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        churn_probs = clf.predict_proba(X_test)[:, 1]
        acc = accuracy_score(y_test, predictions)
        f1 = f1_score(y_test, predictions)
        auc = roc_auc_score(y_test, churn_probs)
        results[model_name] = {'accuracy': acc, 'f1_score': f1, 'roc_auc': auc}
        print(f"Acc: {acc:.3f}, F1: {f1:.3f}, AUC: {auc:.3f}")
        # Track the winner by AUC, the primary selection metric here.
        if auc > best_auc:
            best_auc, best_model, best_name = auc, clf, model_name
    print(f"\n β Best Model: {best_name} (AUC: {best_auc:.4f})")
    # Detailed per-class breakdown for the winning classifier.
    best_predictions = best_model.predict(X_test)
    print("\n Classification Report:")
    print(classification_report(y_test, best_predictions,
                                target_names=['No Churn', 'Churn'], zero_division=0))
    # Only tree ensembles expose feature_importances_.
    importance_df = None
    if hasattr(best_model, 'feature_importances_'):
        importance_df = pd.DataFrame({
            'feature': X_train.columns,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)
        print("\n Top 10 Features:")
        for _, row in importance_df.head(10).iterrows():
            print(f" {row['feature'][:40]:40s}: {row['importance']:.4f}")
    return best_model, results[best_name], importance_df
def prepare_data_for_ltv(data):
    """Prepare features for LTV (lifetime value) regression.

    Derives the LTV target, restricts to active (non-churned) customers,
    builds a numeric feature matrix, and returns scaled train/test splits.

    Args:
        data: DataFrame with 'has_churned' plus either 'estimated_ltv' or
            both 'arpu' and 'tenure_months' for deriving the target.

    Returns:
        (X_train_scaled, X_test_scaled, y_train, y_test, scaler), or a
        tuple of five Nones when the target cannot be computed.
    """
    print("\n[*] Preparing LTV Prediction Data...")
    df = data.copy()
    # Target: use the precomputed LTV if present, otherwise approximate it.
    if 'estimated_ltv' in df.columns:
        y = df['estimated_ltv']
    elif 'arpu' in df.columns and 'tenure_months' in df.columns:
        # 0.85 appears to be a discount factor on ARPU x tenure — TODO confirm.
        y = df['arpu'] * df['tenure_months'] * 0.85
    else:
        print(" β Cannot calculate LTV - missing columns")
        return None, None, None, None, None
    # LTV is only modeled for customers who are still active.
    active = df['has_churned'] == 0
    df = df[active].copy()
    y = y[active]
    # Drop IDs, dates, categoricals (handled below), and target-leaking columns.
    drop_cols = ['customer_id', 'has_churned', 'churn_date', 'churn_reason',
                 'signup_date', 'contract_end_date', 'last_service_date',
                 'value_segment', 'lifecycle_stage', 'plan_type',
                 'estimated_ltv', 'total_revenue']
    cat_cols = [c for c in ['value_segment', 'lifecycle_stage', 'plan_type'] if c in df.columns]
    X_num = df.drop(columns=drop_cols + cat_cols, errors='ignore')
    if cat_cols:
        X_cat = pd.get_dummies(df[cat_cols], drop_first=True)
        X = pd.concat([X_num, X_cat], axis=1)
    else:
        X = X_num
    # Keep numerics only; sanitize inf/NaN. Fix: reassign instead of
    # inplace-mutating a possibly-copied frame (chained-assignment hazard).
    X = X.select_dtypes(include=[np.number])
    X = X.replace([np.inf, -np.inf], 0).fillna(0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Fit the scaler on train only to avoid test-set leakage.
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
    print(f" β Train: {len(X_train):,} | Test: {len(X_test):,} | Features: {X.shape[1]}")
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler
def train_ltv_model(X_train, X_test, y_train, y_test):
    """Fit several LTV regressors and return the one with the best R^2.

    Trains ridge, random forest, and gradient boosting candidates, prints
    per-model metrics, and (when the winner exposes feature_importances_)
    its top feature importances.

    Returns:
        (best_model, metrics_dict_for_best_model, importance_df_or_None)
    """
    print("\n[*] Training LTV Models...")
    models = {
        'Ridge Regression': Ridge(alpha=1.0, random_state=42),
        'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
    }
    results = {}
    # Fix: R^2 is unbounded below, so a finite sentinel like -999 could in
    # principle beat every model and leave best_model as None; -inf is safe.
    best_r2 = -np.inf
    best_model = None
    best_name = None
    for name, model in models.items():
        print(f" - Training {name}...", end=' ')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        results[name] = {'rmse': rmse, 'mae': mae, 'r2': r2}
        print(f"RΒ²: {r2:.3f}, RMSE: ${rmse:,.0f}, MAE: ${mae:,.0f}")
        if r2 > best_r2:
            best_r2 = r2
            best_model = model
            best_name = name
    print(f"\n β Best Model: {best_name} (RΒ²: {best_r2:.4f})")
    # Only tree ensembles expose feature_importances_; Ridge does not.
    importance_df = None
    if hasattr(best_model, 'feature_importances_'):
        importance_df = pd.DataFrame({
            'feature': X_train.columns,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)
        print("\n Top 10 Features:")
        for idx, row in importance_df.head(10).iterrows():
            print(f" {row['feature'][:40]:40s}: {row['importance']:.4f}")
    return best_model, results[best_name], importance_df
def _train_and_save_churn(data):
    """Train the churn model, persist its artifacts, and return its metrics."""
    print("\n" + "=" * 80)
    print(" CHURN PREDICTION MODEL")
    print("=" * 80)
    X_train, X_test, y_train, y_test, scaler_churn = prepare_data_for_churn(data)
    churn_model, churn_metrics, churn_importance = train_churn_model(X_train, X_test, y_train, y_test)
    # Persist the model and its scaler side by side for inference.
    joblib.dump(churn_model, 'data/models/churn_model.pkl')
    joblib.dump(scaler_churn, 'data/models/churn_scaler.pkl')
    print("\n β Saved: data/models/churn_model.pkl")
    print(" β Saved: data/models/churn_scaler.pkl")
    if churn_importance is not None:
        churn_importance.to_csv('data/models/churn_feature_importance.csv', index=False)
        print(" β Saved: data/models/churn_feature_importance.csv")
    return churn_metrics


def _train_and_save_ltv(data):
    """Train the LTV model and persist its artifacts.

    Returns the metrics dict, or None when the LTV target columns are missing.
    """
    print("\n" + "=" * 80)
    print(" CUSTOMER LIFETIME VALUE MODEL")
    print("=" * 80)
    ltv_data = prepare_data_for_ltv(data)
    # prepare_data_for_ltv returns five Nones when the target can't be built.
    if ltv_data[0] is None:
        return None
    X_train, X_test, y_train, y_test, scaler_ltv = ltv_data
    ltv_model, ltv_metrics, ltv_importance = train_ltv_model(X_train, X_test, y_train, y_test)
    joblib.dump(ltv_model, 'data/models/ltv_model.pkl')
    joblib.dump(scaler_ltv, 'data/models/ltv_scaler.pkl')
    print("\n β Saved: data/models/ltv_model.pkl")
    print(" β Saved: data/models/ltv_scaler.pkl")
    if ltv_importance is not None:
        ltv_importance.to_csv('data/models/ltv_feature_importance.csv', index=False)
        print(" β Saved: data/models/ltv_feature_importance.csv")
    return ltv_metrics


def _print_summary(all_metrics):
    """Print the final training summary for every trained model."""
    print("\n" + "=" * 80)
    print(" β TRAINING COMPLETE!")
    print("=" * 80)
    print("\nTrained Models:")
    for model_name in all_metrics.keys():
        print(f" β {model_name.upper()} Prediction Model")
    print("\nModel Performance:")
    if 'churn' in all_metrics:
        print(f" Churn: AUC = {all_metrics['churn']['roc_auc']:.4f}, F1 = {all_metrics['churn']['f1_score']:.4f}")
    if 'ltv' in all_metrics:
        print(f" LTV: RΒ² = {all_metrics['ltv']['r2']:.4f}, RMSE = ${all_metrics['ltv']['rmse']:,.0f}")
    print("\nFiles Saved in: data/models/")
    print(" - *_model.pkl (trained models)")
    print(" - *_scaler.pkl (feature scalers)")
    print(" - *_feature_importance.csv (feature rankings)")
    print(" - model_metrics.json (performance metrics)")
    print("=" * 80)


def main():
    """Main training pipeline.

    Loads the engineered feature table, trains the churn and LTV models,
    saves all artifacts under data/models/, and prints a summary. Exits
    early with a hint when the feature table has not been generated yet.
    """
    print("\n" + "=" * 80)
    print(" TELECOM ML MODEL TRAINING")
    print("=" * 80)
    # Bail out early if feature engineering has not been run.
    data_path = 'data/processed/master_feature_table.csv'
    if not os.path.exists(data_path):
        print(f"\nβ ERROR: {data_path} not found!")
        print(" Run: python run_feature_engineering.py")
        return
    print(f"\n[*] Loading data from {data_path}...")
    data = pd.read_csv(data_path)
    print(f" β Loaded {len(data):,} records, {len(data.columns)} columns")
    os.makedirs('data/models', exist_ok=True)
    all_metrics = {}
    all_metrics['churn'] = _train_and_save_churn(data)
    ltv_metrics = _train_and_save_ltv(data)
    if ltv_metrics is not None:
        all_metrics['ltv'] = ltv_metrics
    # Persist all model metrics together for downstream reporting.
    with open('data/models/model_metrics.json', 'w') as f:
        json.dump(all_metrics, f, indent=2)
    print("\n β Saved: data/models/model_metrics.json")
    _print_summary(all_metrics)
| if __name__ == "__main__": | |
| main() | |