"""
Batch Customer Scoring Script
Score all customers with churn risk and LTV predictions
"""
import pandas as pd
import numpy as np
import joblib
import warnings

warnings.filterwarnings('ignore')
def load_models(model_dir='data/models'):
    """Load the trained churn/LTV models and their matching scalers.

    Parameters
    ----------
    model_dir : str, optional
        Directory holding the pickled artifacts. The default reproduces
        the original hard-coded 'data/models' paths exactly.

    Returns
    -------
    dict
        Keys 'churn_model', 'churn_scaler', 'ltv_model', 'ltv_scaler'
        mapped to the unpickled objects.
    """
    print("[*] Loading trained models...")
    # Each artifact is stored as '<name>.pkl' under model_dir, so the dict
    # can be built from the name list instead of four copy-pasted lines.
    artifact_names = ['churn_model', 'churn_scaler', 'ltv_model', 'ltv_scaler']
    models = {name: joblib.load(f'{model_dir}/{name}.pkl')
              for name in artifact_names}
    print(" β All models loaded successfully\n")
    return models
def prepare_features(df):
    """Build the numeric feature matrix used by both models.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw customer table; may contain identifier, label, date and
        categorical columns in addition to numeric features.

    Returns
    -------
    pandas.DataFrame
        Numeric-only feature columns with +/-inf and NaN replaced by 0.
    """
    # Identifier, label, date and categorical columns are not model inputs.
    # (The categorical names were previously listed twice — once here and
    # once in a separate cat_cols list — which was redundant.)
    drop_cols = ['customer_id', 'has_churned', 'churn_date', 'churn_reason',
                 'signup_date', 'contract_end_date', 'last_service_date',
                 'value_segment', 'lifecycle_stage', 'plan_type']
    X = df.drop(columns=drop_cols, errors='ignore')
    # NOTE(review): the original computed pd.get_dummies() on the categorical
    # columns but never used the result (dead code, removed). In production
    # the dummy columns would need to be aligned with the training schema;
    # this script deliberately uses numeric features only.
    X = X.select_dtypes(include=[np.number])
    # Guard the scalers/models against invalid values: clamp infinities and
    # missing entries to 0 in a single non-mutating chain.
    X = X.replace([np.inf, -np.inf], 0).fillna(0)
    return X
def _categorize_risk(churn_proba):
    """Map churn probabilities to 'Low' (<0.3), 'Medium' (<0.6) or 'High'."""
    return ['Low' if p < 0.3 else 'Medium' if p < 0.6 else 'High'
            for p in churn_proba]


def _aligned_scaled(X, scaler):
    """Select the scaler's training-time feature columns (in training order)
    and return the scaled matrix. Raises KeyError if X is missing a feature."""
    return scaler.transform(X[list(scaler.feature_names_in_)])


def _assign_value_tiers(results):
    """Add 'value_tier' (Bronze/Silver/Gold LTV terciles, active customers
    only; 'N/A' otherwise) and the 'priority_customer' flag (Gold AND High
    churn risk). Mutates and returns `results`."""
    results['value_tier'] = 'N/A'
    active_mask = results['is_active']
    active_ltv = results.loc[active_mask, 'predicted_ltv']
    if len(active_ltv) > 0:
        # Tercile boundaries are computed from active customers only so
        # zeroed-out churned customers don't skew the tiers.
        percentiles = active_ltv.quantile([0.33, 0.67]).values
        results.loc[active_mask, 'value_tier'] = pd.cut(
            active_ltv,
            bins=[0, percentiles[0], percentiles[1], float('inf')],
            labels=['Bronze', 'Silver', 'Gold']
        )
    results['priority_customer'] = (
        (results['value_tier'] == 'Gold') &
        (results['churn_risk_level'] == 'High')
    )
    return results


def _print_summary(results):
    """Print aggregate scoring statistics to stdout."""
    print("=" * 80)
    print(" SCORING SUMMARY")
    print("=" * 80)
    print(f"\nTotal Customers Scored: {len(results):,}")
    print(f"Active Customers: {results['is_active'].sum():,}")
    print(f"Churned Customers: {(~results['is_active']).sum():,}")
    print("\n--- Churn Risk Distribution ---")
    print(results['churn_risk_level'].value_counts().to_string())
    print(f"\nAverage Churn Probability: {results['churn_probability'].mean():.1%}")
    print("\n--- Value Tier Distribution (Active Only) ---")
    active_results = results[results['is_active']]
    print(active_results['value_tier'].value_counts().to_string())
    print("\n--- Lifetime Value Stats (Active Only) ---")
    print(f"Total Predicted LTV: ${active_results['predicted_ltv'].sum():,.0f}")
    print(f"Average LTV: ${active_results['predicted_ltv'].mean():,.0f}")
    print(f"Median LTV: ${active_results['predicted_ltv'].median():,.0f}")
    print(f"Max LTV: ${active_results['predicted_ltv'].max():,.0f}")
    print("\n--- Priority Customers ---")
    priority = results[results['priority_customer']]
    print(f"High-Value, High-Risk Customers: {len(priority):,}")
    if len(priority) > 0:
        print(f"At-Risk Revenue: ${priority['predicted_ltv'].sum():,.0f}")


def _save_outputs(results, df):
    """Write the full score table; also write the priority subset (joined
    back to the raw customer rows for context) when it is non-empty."""
    output_path = 'data/processed/customer_scores.csv'
    results.to_csv(output_path, index=False)
    print(f"\nβ Saved results to: {output_path}")
    priority = results[results['priority_customer']]
    if len(priority) > 0:
        priority_path = 'data/processed/priority_customers.csv'
        # Merge with original data to give analysts full per-customer context.
        priority_full = df[df['customer_id'].isin(priority['customer_id'])].copy()
        priority_full = priority_full.merge(results, on='customer_id', how='left')
        priority_full.to_csv(priority_path, index=False)
        print(f"β Saved priority customers to: {priority_path}")


def _print_top_customers(results):
    """Print the ten highest churn-risk rows and, among active customers,
    the ten highest predicted-LTV rows."""
    print("\n" + "=" * 80)
    print(" TOP 10 HIGH-RISK CUSTOMERS")
    print("=" * 80)
    top_risk = results.nlargest(10, 'churn_probability')
    print(top_risk[['customer_id', 'churn_probability', 'churn_risk_level',
                    'predicted_ltv', 'value_tier']].to_string(index=False))
    print("\n" + "=" * 80)
    print(" TOP 10 VALUE CUSTOMERS")
    print("=" * 80)
    top_value = results[results['is_active']].nlargest(10, 'predicted_ltv')
    print(top_value[['customer_id', 'predicted_ltv', 'churn_probability',
                     'churn_risk_level', 'value_tier']].to_string(index=False))


def score_customers(data_path='data/processed/master_feature_table.csv'):
    """Score every customer with churn-risk and LTV predictions.

    Loads the pickled models/scalers, reads the master feature table,
    predicts churn probability for all customers and LTV for active ones
    (churned customers get LTV 0), assigns value tiers and a priority
    flag, prints a summary, and writes the results to
    data/processed/customer_scores.csv (plus priority_customers.csv
    when any priority customers exist).

    Parameters
    ----------
    data_path : str, optional
        CSV with one row per customer; must include 'customer_id' and
        'has_churned' columns plus the model feature columns.

    Returns
    -------
    pandas.DataFrame
        One row per customer with scores, tiers and flags.
    """
    print("=" * 80)
    print(" CUSTOMER BATCH SCORING")
    print("=" * 80)
    models = load_models()

    print(f"[*] Loading customer data from {data_path}...")
    df = pd.read_csv(data_path)
    print(f" β Loaded {len(df):,} customers\n")

    customer_ids = df['customer_id']
    print("[*] Preparing features...")
    X = prepare_features(df)
    print(f" β {X.shape[1]} features prepared\n")

    # === CHURN PREDICTIONS ===
    print("[*] Predicting churn risk...")
    X_churn_scaled = _aligned_scaled(X, models['churn_scaler'])
    churn_proba = models['churn_model'].predict_proba(X_churn_scaled)[:, 1]
    churn_pred = models['churn_model'].predict(X_churn_scaled)
    risk_levels = _categorize_risk(churn_proba)
    print(" β Churn predictions complete\n")

    # === LTV PREDICTIONS ===
    print("[*] Predicting customer lifetime value...")
    active_idx = df['has_churned'] == 0
    X_ltv_scaled = _aligned_scaled(X, models['ltv_scaler'])
    ltv_pred = models['ltv_model'].predict(X_ltv_scaled)
    # Churned customers have no future value. Index the numpy array with a
    # positional mask (the previous Series-based mask was only correct for
    # a default RangeIndex).
    ltv_pred[~active_idx.to_numpy()] = 0
    print(" β LTV predictions complete\n")

    # === CREATE RESULTS ===
    print("[*] Creating results dataframe...")
    results = pd.DataFrame({
        'customer_id': customer_ids,
        'churn_probability': churn_proba,
        'churn_prediction': churn_pred,
        'churn_risk_level': risk_levels,
        'predicted_ltv': ltv_pred,
        'is_active': active_idx
    })
    results = _assign_value_tiers(results)
    print(" β Results prepared\n")

    _print_summary(results)
    _save_outputs(results, df)
    _print_top_customers(results)

    print("\n" + "=" * 80)
    print(" SCORING COMPLETE!")
    print("=" * 80)
    print("\nNext Actions:")
    print(" 1. Review priority_customers.csv for immediate retention efforts")
    print(" 2. Segment customers by value_tier for differentiated service")
    print(" 3. Create targeted campaigns for high-risk customers")
    print(" 4. Monitor scores over time to track changes")
    print("=" * 80)
    return results
# Script entry point: run the full batch-scoring pipeline only when executed
# directly, so the module stays importable without side effects.
if __name__ == "__main__":
    results = score_customers()