"""
Batch Customer Scoring Script
Score all customers with churn risk and LTV predictions
"""
import pandas as pd
import numpy as np
import joblib
import warnings

warnings.filterwarnings('ignore')
def load_models(model_dir='data/models'):
    """Load the trained churn/LTV models and their matching scalers.

    Parameters
    ----------
    model_dir : str, optional
        Directory holding the pickled artifacts. The default reproduces
        the original hard-coded 'data/models' paths exactly.

    Returns
    -------
    dict
        Keys 'churn_model', 'churn_scaler', 'ltv_model', 'ltv_scaler'
        mapped to the unpickled objects.
    """
    print("[*] Loading trained models...")
    # Each artifact is stored as '<name>.pkl' under model_dir, so the dict
    # can be built from the name list instead of four copy-pasted lines.
    artifact_names = ['churn_model', 'churn_scaler', 'ltv_model', 'ltv_scaler']
    models = {name: joblib.load(f'{model_dir}/{name}.pkl')
              for name in artifact_names}
    print(" β All models loaded successfully\n")
    return models
def prepare_features(df):
    """Build the numeric feature matrix used by both models.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw customer table; may contain identifier, label, date and
        categorical columns in addition to numeric features.

    Returns
    -------
    pandas.DataFrame
        Numeric-only feature columns with +/-inf and NaN replaced by 0.
    """
    # Identifier, label, date and categorical columns are not model inputs.
    # (The categorical names were previously listed twice — once here and
    # once in a separate cat_cols list — which was redundant.)
    drop_cols = ['customer_id', 'has_churned', 'churn_date', 'churn_reason',
                 'signup_date', 'contract_end_date', 'last_service_date',
                 'value_segment', 'lifecycle_stage', 'plan_type']
    X = df.drop(columns=drop_cols, errors='ignore')
    # NOTE(review): the original computed pd.get_dummies() on the categorical
    # columns but never used the result (dead code, removed). In production
    # the dummy columns would need to be aligned with the training schema;
    # this script deliberately uses numeric features only.
    X = X.select_dtypes(include=[np.number])
    # Guard the scalers/models against invalid values: clamp infinities and
    # missing entries to 0 in a single non-mutating chain.
    X = X.replace([np.inf, -np.inf], 0).fillna(0)
    return X
def _categorize_risk(churn_proba):
    """Map churn probabilities to 'Low' (<0.3), 'Medium' (<0.6) or 'High'."""
    return ['Low' if p < 0.3 else 'Medium' if p < 0.6 else 'High'
            for p in churn_proba]


def _aligned_scaled(X, scaler):
    """Select the scaler's training-time feature columns (in training order)
    and return the scaled matrix. Raises KeyError if X is missing a feature."""
    return scaler.transform(X[list(scaler.feature_names_in_)])


def _assign_value_tiers(results):
    """Add 'value_tier' (Bronze/Silver/Gold LTV terciles, active customers
    only; 'N/A' otherwise) and the 'priority_customer' flag (Gold AND High
    churn risk). Mutates and returns `results`."""
    results['value_tier'] = 'N/A'
    active_mask = results['is_active']
    active_ltv = results.loc[active_mask, 'predicted_ltv']
    if len(active_ltv) > 0:
        # Tercile boundaries are computed from active customers only so
        # zeroed-out churned customers don't skew the tiers.
        percentiles = active_ltv.quantile([0.33, 0.67]).values
        results.loc[active_mask, 'value_tier'] = pd.cut(
            active_ltv,
            bins=[0, percentiles[0], percentiles[1], float('inf')],
            labels=['Bronze', 'Silver', 'Gold']
        )
    results['priority_customer'] = (
        (results['value_tier'] == 'Gold') &
        (results['churn_risk_level'] == 'High')
    )
    return results


def _print_summary(results):
    """Print aggregate scoring statistics to stdout."""
    print("=" * 80)
    print(" SCORING SUMMARY")
    print("=" * 80)
    print(f"\nTotal Customers Scored: {len(results):,}")
    print(f"Active Customers: {results['is_active'].sum():,}")
    print(f"Churned Customers: {(~results['is_active']).sum():,}")
    print("\n--- Churn Risk Distribution ---")
    print(results['churn_risk_level'].value_counts().to_string())
    print(f"\nAverage Churn Probability: {results['churn_probability'].mean():.1%}")
    print("\n--- Value Tier Distribution (Active Only) ---")
    active_results = results[results['is_active']]
    print(active_results['value_tier'].value_counts().to_string())
    print("\n--- Lifetime Value Stats (Active Only) ---")
    print(f"Total Predicted LTV: ${active_results['predicted_ltv'].sum():,.0f}")
    print(f"Average LTV: ${active_results['predicted_ltv'].mean():,.0f}")
    print(f"Median LTV: ${active_results['predicted_ltv'].median():,.0f}")
    print(f"Max LTV: ${active_results['predicted_ltv'].max():,.0f}")
    print("\n--- Priority Customers ---")
    priority = results[results['priority_customer']]
    print(f"High-Value, High-Risk Customers: {len(priority):,}")
    if len(priority) > 0:
        print(f"At-Risk Revenue: ${priority['predicted_ltv'].sum():,.0f}")


def _save_outputs(results, df):
    """Write the full score table; also write the priority subset (joined
    back to the raw customer rows for context) when it is non-empty."""
    output_path = 'data/processed/customer_scores.csv'
    results.to_csv(output_path, index=False)
    print(f"\nβ Saved results to: {output_path}")
    priority = results[results['priority_customer']]
    if len(priority) > 0:
        priority_path = 'data/processed/priority_customers.csv'
        # Merge with original data to give analysts full per-customer context.
        priority_full = df[df['customer_id'].isin(priority['customer_id'])].copy()
        priority_full = priority_full.merge(results, on='customer_id', how='left')
        priority_full.to_csv(priority_path, index=False)
        print(f"β Saved priority customers to: {priority_path}")


def _print_top_customers(results):
    """Print the ten highest churn-risk rows and, among active customers,
    the ten highest predicted-LTV rows."""
    print("\n" + "=" * 80)
    print(" TOP 10 HIGH-RISK CUSTOMERS")
    print("=" * 80)
    top_risk = results.nlargest(10, 'churn_probability')
    print(top_risk[['customer_id', 'churn_probability', 'churn_risk_level',
                    'predicted_ltv', 'value_tier']].to_string(index=False))
    print("\n" + "=" * 80)
    print(" TOP 10 VALUE CUSTOMERS")
    print("=" * 80)
    top_value = results[results['is_active']].nlargest(10, 'predicted_ltv')
    print(top_value[['customer_id', 'predicted_ltv', 'churn_probability',
                     'churn_risk_level', 'value_tier']].to_string(index=False))


def score_customers(data_path='data/processed/master_feature_table.csv'):
    """Score every customer with churn-risk and LTV predictions.

    Loads the pickled models/scalers, reads the master feature table,
    predicts churn probability for all customers and LTV for active ones
    (churned customers get LTV 0), assigns value tiers and a priority
    flag, prints a summary, and writes the results to
    data/processed/customer_scores.csv (plus priority_customers.csv
    when any priority customers exist).

    Parameters
    ----------
    data_path : str, optional
        CSV with one row per customer; must include 'customer_id' and
        'has_churned' columns plus the model feature columns.

    Returns
    -------
    pandas.DataFrame
        One row per customer with scores, tiers and flags.
    """
    print("=" * 80)
    print(" CUSTOMER BATCH SCORING")
    print("=" * 80)
    models = load_models()

    print(f"[*] Loading customer data from {data_path}...")
    df = pd.read_csv(data_path)
    print(f" β Loaded {len(df):,} customers\n")

    customer_ids = df['customer_id']
    print("[*] Preparing features...")
    X = prepare_features(df)
    print(f" β {X.shape[1]} features prepared\n")

    # === CHURN PREDICTIONS ===
    print("[*] Predicting churn risk...")
    X_churn_scaled = _aligned_scaled(X, models['churn_scaler'])
    churn_proba = models['churn_model'].predict_proba(X_churn_scaled)[:, 1]
    churn_pred = models['churn_model'].predict(X_churn_scaled)
    risk_levels = _categorize_risk(churn_proba)
    print(" β Churn predictions complete\n")

    # === LTV PREDICTIONS ===
    print("[*] Predicting customer lifetime value...")
    active_idx = df['has_churned'] == 0
    X_ltv_scaled = _aligned_scaled(X, models['ltv_scaler'])
    ltv_pred = models['ltv_model'].predict(X_ltv_scaled)
    # Churned customers have no future value. Index the numpy array with a
    # positional mask (the previous Series-based mask was only correct for
    # a default RangeIndex).
    ltv_pred[~active_idx.to_numpy()] = 0
    print(" β LTV predictions complete\n")

    # === CREATE RESULTS ===
    print("[*] Creating results dataframe...")
    results = pd.DataFrame({
        'customer_id': customer_ids,
        'churn_probability': churn_proba,
        'churn_prediction': churn_pred,
        'churn_risk_level': risk_levels,
        'predicted_ltv': ltv_pred,
        'is_active': active_idx
    })
    results = _assign_value_tiers(results)
    print(" β Results prepared\n")

    _print_summary(results)
    _save_outputs(results, df)
    _print_top_customers(results)

    print("\n" + "=" * 80)
    print(" SCORING COMPLETE!")
    print("=" * 80)
    print("\nNext Actions:")
    print(" 1. Review priority_customers.csv for immediate retention efforts")
    print(" 2. Segment customers by value_tier for differentiated service")
    print(" 3. Create targeted campaigns for high-risk customers")
    print(" 4. Monitor scores over time to track changes")
    print("=" * 80)
    return results
# Script entry point: run the full batch-scoring pipeline only when executed
# directly, so the module stays importable without side effects.
if __name__ == "__main__":
    results = score_customers()