# pranit_churn_application / score_customers.py
# rajkhanke's picture
# Upload 14 files
# 292c00b verified
"""
Batch Customer Scoring Script
Score all customers with churn risk and LTV predictions
"""
import pandas as pd
import numpy as np
import joblib
import warnings
# Install a process-wide "ignore" filter: from this point on, every warning
# raised by any library used in this script is suppressed.
# NOTE(review): this also hides deprecation/convergence warnings that could
# matter when debugging — confirm the blanket filter is intended.
warnings.filterwarnings('ignore')
def load_models(model_dir='data/models'):
    """Load the trained churn and LTV models together with their scalers.

    Args:
        model_dir: Directory that contains ``churn_model.pkl``,
            ``churn_scaler.pkl``, ``ltv_model.pkl`` and ``ltv_scaler.pkl``.
            Defaults to ``'data/models'``, the location this script has
            always read from, so existing callers are unaffected.

    Returns:
        dict: The loaded objects keyed by ``'churn_model'``,
        ``'churn_scaler'``, ``'ltv_model'`` and ``'ltv_scaler'``.

    Any exception raised by ``joblib.load`` (for example when an artifact is
    missing) is not caught here and propagates to the caller.
    """
    import os  # local import: keeps this function self-contained

    print("[*] Loading trained models...")

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # Only point model_dir at artifacts produced by a trusted training run.
    artifact_names = ('churn_model', 'churn_scaler', 'ltv_model', 'ltv_scaler')
    models = {
        name: joblib.load(os.path.join(model_dir, name + '.pkl'))
        for name in artifact_names
    }

    print(" βœ“ All models loaded successfully\n")
    return models
def prepare_features(df):
    """Build the numeric feature matrix used for prediction.

    The columns named in the exclusion list below are removed (names that
    are absent from ``df`` are ignored), every remaining non-numeric column
    is discarded, and infinite or missing values are replaced with 0.
    ``df`` itself is not modified.

    Args:
        df: pandas DataFrame holding the customer feature table.

    Returns:
        pandas.DataFrame: A new frame containing only numeric columns, with
        no ``inf``, ``-inf`` or ``NaN`` values.
    """
    # Non-feature columns: the identifier, the churn outcome fields, raw
    # date columns, and the categorical columns.
    # NOTE: the categoricals ('value_segment', 'lifecycle_stage',
    # 'plan_type') are dropped rather than one-hot encoded. Encoding them
    # here would require reproducing exactly the dummy columns used at
    # training time, so only numeric features are passed to the models.
    excluded_cols = [
        'customer_id', 'has_churned', 'churn_date', 'churn_reason',
        'signup_date', 'contract_end_date', 'last_service_date',
        'value_segment', 'lifecycle_stage', 'plan_type',
    ]

    numeric = (
        df.drop(columns=excluded_cols, errors='ignore')
        .select_dtypes(include=[np.number])
    )

    # Replace +/-inf first, then NaN, so the result is entirely finite.
    return numeric.replace([np.inf, -np.inf], 0).fillna(0)
def _scale_model_features(X, scaler):
    """Select the columns ``scaler`` was fitted on (in that order) and scale them."""
    # feature_names_in_ is read from the fitted scaler; indexing X with it
    # raises KeyError if any of those columns is missing from X.
    return scaler.transform(X[list(scaler.feature_names_in_)])


def _categorize_churn_risk(churn_proba):
    """Label each probability 'Low' (< 0.3), 'Medium' (< 0.6) or 'High' (otherwise)."""
    churn_proba = np.asarray(churn_proba)
    return np.select(
        [churn_proba < 0.3, churn_proba < 0.6],
        ['Low', 'Medium'],
        default='High',
    )


def _assign_value_tiers(predicted_ltv, is_active):
    """Return one tier per customer: Bronze/Silver/Gold for active rows, 'N/A' otherwise."""
    ltv = np.asarray(predicted_ltv, dtype=float)
    active = np.asarray(is_active, dtype=bool)
    tiers = np.full(ltv.shape, 'N/A', dtype=object)
    if not active.any():
        return tiers

    active_ltv = ltv[active]
    # 33rd / 67th percentiles of the active customers' predicted LTV.
    low_cut, high_cut = pd.Series(active_ltv).quantile([0.33, 0.67]).to_numpy()

    # Plain threshold comparisons instead of pd.cut with bins starting at 0:
    # that form left active customers with a predicted LTV <= 0 without a
    # tier and raised ValueError whenever the bin edges were not strictly
    # increasing (33rd percentile <= 0, or both percentiles equal).
    # A NaN prediction cannot be ranked and keeps 'N/A'.
    tiers[active] = np.select(
        [np.isnan(active_ltv), active_ltv <= low_cut, active_ltv <= high_cut],
        ['N/A', 'Bronze', 'Silver'],
        default='Gold',
    )
    return tiers


def _print_scoring_summary(results, priority):
    """Print the aggregate scoring statistics to stdout."""
    print("=" * 80)
    print(" SCORING SUMMARY")
    print("=" * 80)
    print(f"\nTotal Customers Scored: {len(results):,}")
    print(f"Active Customers: {results['is_active'].sum():,}")
    print(f"Churned Customers: {(~results['is_active']).sum():,}")

    print("\n--- Churn Risk Distribution ---")
    print(results['churn_risk_level'].value_counts().to_string())
    print(f"\nAverage Churn Probability: {results['churn_probability'].mean():.1%}")

    print("\n--- Value Tier Distribution (Active Only) ---")
    active_results = results[results['is_active']]
    print(active_results['value_tier'].value_counts().to_string())

    print("\n--- Lifetime Value Stats (Active Only) ---")
    print(f"Total Predicted LTV: ${active_results['predicted_ltv'].sum():,.0f}")
    print(f"Average LTV: ${active_results['predicted_ltv'].mean():,.0f}")
    print(f"Median LTV: ${active_results['predicted_ltv'].median():,.0f}")
    print(f"Max LTV: ${active_results['predicted_ltv'].max():,.0f}")

    print("\n--- Priority Customers ---")
    print(f"High-Value, High-Risk Customers: {len(priority):,}")
    if len(priority) > 0:
        print(f"At-Risk Revenue: ${priority['predicted_ltv'].sum():,.0f}")


def _print_top_customers(results):
    """Print the ten highest-risk customers and the ten highest-LTV active customers."""
    print("\n" + "=" * 80)
    print(" TOP 10 HIGH-RISK CUSTOMERS")
    print("=" * 80)
    top_risk = results.nlargest(10, 'churn_probability')
    print(top_risk[['customer_id', 'churn_probability', 'churn_risk_level',
                    'predicted_ltv', 'value_tier']].to_string(index=False))

    print("\n" + "=" * 80)
    print(" TOP 10 VALUE CUSTOMERS")
    print("=" * 80)
    top_value = results[results['is_active']].nlargest(10, 'predicted_ltv')
    print(top_value[['customer_id', 'predicted_ltv', 'churn_probability',
                     'churn_risk_level', 'value_tier']].to_string(index=False))


def score_customers(data_path='data/processed/master_feature_table.csv'):
    """Score all customers with churn and LTV predictions.

    Loads the models via ``load_models()``, reads the customer table from
    ``data_path``, builds features with ``prepare_features()``, predicts
    churn probability and lifetime value, derives risk levels, value tiers
    and a priority flag, prints a report, and writes the results to disk.

    Args:
        data_path: CSV file with one row per customer. It must contain the
            ``customer_id`` and ``has_churned`` columns plus every feature
            column the two scalers were fitted on.

    Returns:
        pandas.DataFrame: One row per customer with the columns
        ``customer_id``, ``churn_probability``, ``churn_prediction``,
        ``churn_risk_level``, ``predicted_ltv``, ``is_active``,
        ``value_tier`` and ``priority_customer``.

    Raises:
        KeyError: If ``customer_id`` or ``has_churned`` is missing from the
            data, or a feature expected by a scaler is not available.

    Side effects:
        Writes ``data/processed/customer_scores.csv`` and, when at least one
        priority customer exists, ``data/processed/priority_customers.csv``.
    """
    print("=" * 80)
    print(" CUSTOMER BATCH SCORING")
    print("=" * 80)

    models = load_models()

    print(f"[*] Loading customer data from {data_path}...")
    df = pd.read_csv(data_path)
    print(f" βœ“ Loaded {len(df):,} customers\n")

    customer_ids = df['customer_id']

    print("[*] Preparing features...")
    X = prepare_features(df)
    print(f" βœ“ {X.shape[1]} features prepared\n")

    # === CHURN PREDICTIONS ===
    print("[*] Predicting churn risk...")
    X_churn_scaled = _scale_model_features(X, models['churn_scaler'])
    # Column 1 of predict_proba is used as the churn probability
    # (presumably the positive/"churned" class — verify against training).
    churn_proba = models['churn_model'].predict_proba(X_churn_scaled)[:, 1]
    churn_pred = models['churn_model'].predict(X_churn_scaled)
    risk_levels = _categorize_churn_risk(churn_proba)
    print(" βœ“ Churn predictions complete\n")

    # === LTV PREDICTIONS ===
    print("[*] Predicting customer lifetime value...")
    is_active = df['has_churned'] == 0
    X_ltv_scaled = _scale_model_features(X, models['ltv_scaler'])
    ltv_pred = models['ltv_model'].predict(X_ltv_scaled)
    # Set LTV to 0 for churned customers. An explicit numpy mask is used
    # because ltv_pred is a numpy array, not a pandas object.
    ltv_pred[~is_active.to_numpy()] = 0
    print(" βœ“ LTV predictions complete\n")

    # === CREATE RESULTS ===
    print("[*] Creating results dataframe...")
    results = pd.DataFrame({
        'customer_id': customer_ids,
        'churn_probability': churn_proba,
        'churn_prediction': churn_pred,
        'churn_risk_level': risk_levels,
        'predicted_ltv': ltv_pred,
        'is_active': is_active,
    })
    results['value_tier'] = _assign_value_tiers(
        results['predicted_ltv'], results['is_active']
    )
    # Priority flag: High-value + High-risk
    results['priority_customer'] = (
        (results['value_tier'] == 'Gold')
        & (results['churn_risk_level'] == 'High')
    )
    print(" βœ“ Results prepared\n")

    # === SUMMARY STATISTICS ===
    priority = results[results['priority_customer']]
    _print_scoring_summary(results, priority)

    # === SAVE RESULTS ===
    output_path = 'data/processed/customer_scores.csv'
    results.to_csv(output_path, index=False)
    print(f"\nβœ“ Saved results to: {output_path}")

    # === SAVE PRIORITY CUSTOMERS ===
    if len(priority) > 0:
        priority_path = 'data/processed/priority_customers.csv'
        # Merge with original data to get more context
        priority_full = df[df['customer_id'].isin(priority['customer_id'])].copy()
        priority_full = priority_full.merge(results, on='customer_id', how='left')
        priority_full.to_csv(priority_path, index=False)
        print(f"βœ“ Saved priority customers to: {priority_path}")

    _print_top_customers(results)

    print("\n" + "=" * 80)
    print(" SCORING COMPLETE!")
    print("=" * 80)
    print("\nNext Actions:")
    print(" 1. Review priority_customers.csv for immediate retention efforts")
    print(" 2. Segment customers by value_tier for differentiated service")
    print(" 3. Create targeted campaigns for high-risk customers")
    print(" 4. Monitor scores over time to track changes")
    print("=" * 80)

    return results
# Script entry point: when this file is executed directly (not imported),
# run the batch scoring with the default data path and keep the returned
# DataFrame in the module-level name `results`.
if __name__ == "__main__":
    results = score_customers()