# pranit_churn_application / train_models.py
# Uploaded by rajkhanke ("Upload 14 files", commit 292c00b, verified)
"""
Quick ML Model Training (Minimal Dependencies)
Train models with minimal dependencies - just pandas, numpy, sklearn, joblib
"""
import pandas as pd
import numpy as np
import warnings
import joblib
import json
import os
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
classification_report, confusion_matrix, roc_auc_score,
f1_score, accuracy_score, mean_squared_error,
mean_absolute_error, r2_score
)
from sklearn.ensemble import (
RandomForestClassifier, GradientBoostingClassifier,
RandomForestRegressor, GradientBoostingRegressor
)
from sklearn.linear_model import LogisticRegression, Ridge
def prepare_data_for_churn(data):
    """Build scaled train/test matrices for churn classification.

    Parameters
    ----------
    data : pd.DataFrame
        Master feature table; must contain a binary 'has_churned' target
        column plus the feature columns.

    Returns
    -------
    tuple
        (X_train_scaled, X_test_scaled, y_train, y_test, scaler) — the X
        frames are standardized DataFrames and ``scaler`` is the fitted
        StandardScaler needed to transform future inputs.
    """
    print("\n[*] Preparing Churn Prediction Data...")
    df = data.copy()
    y = df['has_churned']
    # Identifier, target, leakage, and raw-date columns must never be features.
    drop_cols = ['customer_id', 'has_churned', 'churn_date', 'churn_reason',
                 'signup_date', 'contract_end_date', 'last_service_date',
                 'value_segment', 'lifecycle_stage', 'plan_type']
    # One-hot encode only the categorical columns actually present.
    cat_cols = [c for c in ['value_segment', 'lifecycle_stage', 'plan_type'] if c in df.columns]
    X_num = df.drop(columns=drop_cols + cat_cols, errors='ignore')
    if cat_cols:
        # FIX: dtype=float — pandas >= 2.0 emits bool dummies by default, and
        # the select_dtypes(np.number) filter below would silently drop them,
        # discarding every one-hot feature.
        X_cat = pd.get_dummies(df[cat_cols], drop_first=True, dtype=float)
        X = pd.concat([X_num, X_cat], axis=1)
    else:
        X = X_num
    # Keep numeric columns only and sanitize non-finite / missing values.
    X = X.select_dtypes(include=[np.number])
    X = X.replace([np.inf, -np.inf], 0).fillna(0)
    # Stratify so train and test preserve the overall churn rate.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)
    # Fit the scaler on train only to avoid test-set leakage.
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),
                                  columns=X_train.columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                                 columns=X_test.columns, index=X_test.index)
    print(f" βœ“ Train: {len(X_train):,} | Test: {len(X_test):,} | Features: {X.shape[1]}")
    print(f" βœ“ Churn rate: {y_train.mean()*100:.1f}% (train), {y_test.mean()*100:.1f}% (test)")
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler
def train_churn_model(X_train, X_test, y_train, y_test):
    """Train three churn classifiers and return the best one by ROC-AUC.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
        Scaled feature matrices from prepare_data_for_churn.
    y_train, y_test : pd.Series
        Binary churn labels aligned with the matrices.

    Returns
    -------
    tuple
        (best_model, metrics_dict, importance_df) where metrics_dict holds
        accuracy / f1_score / roc_auc for the winning model and
        importance_df is a sorted feature-importance table (None for models
        without ``feature_importances_``, e.g. logistic regression).
    """
    print("\n[*] Training Churn Models...")
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
        'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced', n_jobs=-1),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)
    }
    results = {}
    # FIX: -inf sentinel guarantees a model is always selected; the previous
    # ``best_auc = 0`` left best_model as None (and crashed below) in the
    # degenerate case where every model scored exactly 0.0 AUC.
    best_auc = float('-inf')
    best_model = None
    best_name = None
    for name, model in models.items():
        print(f" - Training {name}...", end=' ')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Probability of the positive (churn) class drives the AUC ranking.
        y_proba = model.predict_proba(X_test)[:, 1]
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_proba)
        results[name] = {'accuracy': acc, 'f1_score': f1, 'roc_auc': auc}
        print(f"Acc: {acc:.3f}, F1: {f1:.3f}, AUC: {auc:.3f}")
        if auc > best_auc:
            best_auc = auc
            best_model = model
            best_name = name
    print(f"\n βœ“ Best Model: {best_name} (AUC: {best_auc:.4f})")
    # Detailed metrics for the winning model only.
    y_pred = best_model.predict(X_test)
    print("\n Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn'], zero_division=0))
    # Feature importance is only available for the tree-based models.
    importance_df = None
    if hasattr(best_model, 'feature_importances_'):
        importance_df = pd.DataFrame({
            'feature': X_train.columns,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)
        print("\n Top 10 Features:")
        for idx, row in importance_df.head(10).iterrows():
            print(f" {row['feature'][:40]:40s}: {row['importance']:.4f}")
    return best_model, results[best_name], importance_df
def prepare_data_for_ltv(data):
    """Build scaled train/test matrices for LTV regression (active customers only).

    Parameters
    ----------
    data : pd.DataFrame
        Master feature table. Needs either an 'estimated_ltv' column or both
        'arpu' and 'tenure_months' to derive the target, plus 'has_churned'
        to filter to active customers.

    Returns
    -------
    tuple
        (X_train_scaled, X_test_scaled, y_train, y_test, scaler), or a tuple
        of five Nones when the LTV target cannot be derived.
    """
    print("\n[*] Preparing LTV Prediction Data...")
    df = data.copy()
    # Use the precomputed LTV if present, otherwise approximate it.
    if 'estimated_ltv' in df.columns:
        y = df['estimated_ltv']
    elif 'arpu' in df.columns and 'tenure_months' in df.columns:
        # 0.85 discounts revenue to a conservative lifetime-value estimate.
        y = df['arpu'] * df['tenure_months'] * 0.85
    else:
        print(" ⚠ Cannot calculate LTV - missing columns")
        return None, None, None, None, None
    # LTV is only modeled for customers who have not churned.
    active = df['has_churned'] == 0
    df = df[active].copy()
    y = y[active]
    # Identifier / target / leakage columns (incl. the LTV source columns).
    drop_cols = ['customer_id', 'has_churned', 'churn_date', 'churn_reason',
                 'signup_date', 'contract_end_date', 'last_service_date',
                 'value_segment', 'lifecycle_stage', 'plan_type',
                 'estimated_ltv', 'total_revenue']
    cat_cols = [c for c in ['value_segment', 'lifecycle_stage', 'plan_type'] if c in df.columns]
    X_num = df.drop(columns=drop_cols + cat_cols, errors='ignore')
    if cat_cols:
        # FIX: dtype=float — pandas >= 2.0 emits bool dummies by default and
        # select_dtypes(np.number) below would silently drop every one of them.
        X_cat = pd.get_dummies(df[cat_cols], drop_first=True, dtype=float)
        X = pd.concat([X_num, X_cat], axis=1)
    else:
        X = X_num
    # Numeric-only features with non-finite / missing values zeroed out.
    X = X.select_dtypes(include=[np.number])
    X = X.replace([np.inf, -np.inf], 0).fillna(0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Fit the scaler on train only to avoid test-set leakage.
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),
                                  columns=X_train.columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                                 columns=X_test.columns, index=X_test.index)
    print(f" βœ“ Train: {len(X_train):,} | Test: {len(X_test):,} | Features: {X.shape[1]}")
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler
def train_ltv_model(X_train, X_test, y_train, y_test):
    """Train three LTV regressors and return the best one by RΒ².

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
        Scaled feature matrices from prepare_data_for_ltv.
    y_train, y_test : pd.Series
        Continuous LTV targets aligned with the matrices.

    Returns
    -------
    tuple
        (best_model, metrics_dict, importance_df) where metrics_dict holds
        rmse / mae / r2 for the winning model and importance_df is a sorted
        feature-importance table (None for models without
        ``feature_importances_``, e.g. Ridge).
    """
    print("\n[*] Training LTV Models...")
    models = {
        'Ridge Regression': Ridge(alpha=1.0, random_state=42),
        'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
    }
    results = {}
    # FIX: -inf sentinel (was the magic number -999) — RΒ² is unbounded below,
    # so a sufficiently bad fit could otherwise leave best_model as None.
    best_r2 = float('-inf')
    best_model = None
    best_name = None
    for name, model in models.items():
        print(f" - Training {name}...", end=' ')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        results[name] = {'rmse': rmse, 'mae': mae, 'r2': r2}
        print(f"RΒ²: {r2:.3f}, RMSE: ${rmse:,.0f}, MAE: ${mae:,.0f}")
        if r2 > best_r2:
            best_r2 = r2
            best_model = model
            best_name = name
    print(f"\n βœ“ Best Model: {best_name} (RΒ²: {best_r2:.4f})")
    # Feature importance is only available for the tree-based models.
    importance_df = None
    if hasattr(best_model, 'feature_importances_'):
        importance_df = pd.DataFrame({
            'feature': X_train.columns,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)
        print("\n Top 10 Features:")
        for idx, row in importance_df.head(10).iterrows():
            print(f" {row['feature'][:40]:40s}: {row['importance']:.4f}")
    return best_model, results[best_name], importance_df
def main() -> None:
    """Main training pipeline.

    Loads the engineered feature table, trains and persists the churn
    classification model and (when the target is derivable) the LTV
    regression model, then writes all metrics to
    data/models/model_metrics.json. Exits early with a hint if the
    feature table has not been generated yet.
    """
    print("\n" + "=" * 80)
    print(" TELECOM ML MODEL TRAINING")
    print("=" * 80)
    # Guard: the feature-engineering step must have run first.
    data_path = 'data/processed/master_feature_table.csv'
    if not os.path.exists(data_path):
        print(f"\n⚠ ERROR: {data_path} not found!")
        print(" Run: python run_feature_engineering.py")
        return
    # Load the full engineered feature table.
    print(f"\n[*] Loading data from {data_path}...")
    data = pd.read_csv(data_path)
    print(f" βœ“ Loaded {len(data):,} records, {len(data.columns)} columns")
    # All model artifacts are written under data/models/.
    os.makedirs('data/models', exist_ok=True)
    all_metrics = {}
    # ==================== CHURN MODEL ====================
    print("\n" + "=" * 80)
    print(" CHURN PREDICTION MODEL")
    print("=" * 80)
    X_train, X_test, y_train, y_test, scaler_churn = prepare_data_for_churn(data)
    churn_model, churn_metrics, churn_importance = train_churn_model(X_train, X_test, y_train, y_test)
    # Persist the model together with its scaler — both are needed at inference.
    joblib.dump(churn_model, 'data/models/churn_model.pkl')
    joblib.dump(scaler_churn, 'data/models/churn_scaler.pkl')
    print("\n βœ“ Saved: data/models/churn_model.pkl")
    print(" βœ“ Saved: data/models/churn_scaler.pkl")
    # Importance is None for non-tree best models (e.g. logistic regression).
    if churn_importance is not None:
        churn_importance.to_csv('data/models/churn_feature_importance.csv', index=False)
        print(" βœ“ Saved: data/models/churn_feature_importance.csv")
    all_metrics['churn'] = churn_metrics
    # ==================== LTV MODEL ====================
    print("\n" + "=" * 80)
    print(" CUSTOMER LIFETIME VALUE MODEL")
    print("=" * 80)
    # prepare_data_for_ltv returns five Nones when the target can't be derived.
    ltv_data = prepare_data_for_ltv(data)
    if ltv_data[0] is not None:
        X_train, X_test, y_train, y_test, scaler_ltv = ltv_data
        ltv_model, ltv_metrics, ltv_importance = train_ltv_model(X_train, X_test, y_train, y_test)
        # Persist the LTV model with its matching scaler.
        joblib.dump(ltv_model, 'data/models/ltv_model.pkl')
        joblib.dump(scaler_ltv, 'data/models/ltv_scaler.pkl')
        print("\n βœ“ Saved: data/models/ltv_model.pkl")
        print(" βœ“ Saved: data/models/ltv_scaler.pkl")
        if ltv_importance is not None:
            ltv_importance.to_csv('data/models/ltv_feature_importance.csv', index=False)
            print(" βœ“ Saved: data/models/ltv_feature_importance.csv")
        all_metrics['ltv'] = ltv_metrics
    # ==================== SAVE METRICS ====================
    # One JSON file aggregates the metrics of every model that was trained.
    with open('data/models/model_metrics.json', 'w') as f:
        json.dump(all_metrics, f, indent=2)
    print("\n βœ“ Saved: data/models/model_metrics.json")
    # ==================== SUMMARY ====================
    print("\n" + "=" * 80)
    print(" βœ“ TRAINING COMPLETE!")
    print("=" * 80)
    print("\nTrained Models:")
    for model_name in all_metrics.keys():
        print(f" βœ“ {model_name.upper()} Prediction Model")
    print("\nModel Performance:")
    if 'churn' in all_metrics:
        print(f" Churn: AUC = {all_metrics['churn']['roc_auc']:.4f}, F1 = {all_metrics['churn']['f1_score']:.4f}")
    if 'ltv' in all_metrics:
        print(f" LTV: RΒ² = {all_metrics['ltv']['r2']:.4f}, RMSE = ${all_metrics['ltv']['rmse']:,.0f}")
    print("\nFiles Saved in: data/models/")
    print(" - *_model.pkl (trained models)")
    print(" - *_scaler.pkl (feature scalers)")
    print(" - *_feature_importance.csv (feature rankings)")
    print(" - model_metrics.json (performance metrics)")
    print("=" * 80)
# Script entry point.
if __name__ == "__main__":
    main()