Spaces:
Sleeping
Sleeping
File size: 6,207 Bytes
2fc252e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
# Display the app's page title.
st.title("Customer Churn Prediction")
# Data loading and preprocessing
@st.cache_data
def load_and_preprocess_data(file_path):
    """Load the churn CSV and return an encoded, scaled DataFrame.

    Steps, in order:
      1. Read the CSV at ``file_path``.
      2. Coerce ``TotalCharges`` to numeric; values that cannot be parsed
         become NaN.
      3. Drop every row that contains a missing value.
      4. Label-encode the listed categorical columns (``Churn`` included).
      5. Standardize ``tenure``, ``MonthlyCharges`` and ``TotalCharges``.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (path or file-like).

    Returns:
        The preprocessed ``pandas.DataFrame``.
    """
    # The frame must be read here: ``file_path`` is the only input, and the
    # function previously used ``df`` without ever assigning it.
    df = pd.read_csv(file_path)

    # errors='coerce' turns unparseable entries into NaN so that the dropna
    # below removes those rows.
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df.dropna(inplace=True)

    categorical_cols = (
        'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
        'InternetService', 'OnlineSecurity', 'OnlineBackup',
        'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
        'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn',
    )
    for col in categorical_cols:
        # A fresh encoder per column: each column gets its own integer codes.
        df[col] = LabelEncoder().fit_transform(df[col])

    # NOTE: the scaler is fitted on every remaining row, i.e. before any
    # train/test split the caller performs on the returned frame.
    numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
    df[numerical_cols] = StandardScaler().fit_transform(df[numerical_cols])

    # NOTE(review): columns not listed above are returned unchanged. If the
    # CSV carries a non-numeric identifier column it will reach the models
    # as-is -- confirm against the dataset schema.
    return df
# Dataset source. The uploader variant below can yield None, which is why the
# path is checked before any loading happens.
# file_path = st.file_uploader("Upload CSV file", type="csv")
file_path = "./WA_Fn-UseC_-Telco-Customer-Churn.csv"
if file_path is not None:
    df = load_and_preprocess_data(file_path)

    # Features are every column except the target.
    y = df['Churn']
    X = df.drop(columns='Churn')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Model training and evaluation reuse fitted estimators that are kept in
    # the Streamlit session state; create the store on first run.
    if 'models' not in st.session_state:
        st.session_state.models = {}
def train_and_evaluate(model_name, model, X_train, y_train, X_test, y_test):
    """Fit (or reuse) a classifier and score it on the test split.

    The estimator is fitted only the first time ``model_name`` is seen; the
    fitted object is stored in ``st.session_state.models`` and reused on
    later calls with the same name.

    Args:
        model_name: Key under which the fitted estimator is stored.
        model: Estimator to fit when nothing is stored under ``model_name``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.

    Returns:
        ``(accuracy, report, cm, model, fpr, tpr, roc_auc)`` where ``report``
        is the classification report as a dict and ``cm`` the confusion
        matrix. ``fpr``, ``tpr`` and ``roc_auc`` are ``None`` when the
        estimator has no ``predict_proba``.
    """
    fitted_models = st.session_state.models
    if model_name in fitted_models:
        model = fitted_models[model_name]
    else:
        model.fit(X_train, y_train)
        fitted_models[model_name] = model

    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, output_dict=True)
    cm = confusion_matrix(y_test, predictions)

    # ROC curve and AUC need scores, so they are only computed for
    # estimators that expose predict_proba.
    fpr = tpr = roc_auc = None
    if hasattr(model, "predict_proba"):
        positive_scores = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, positive_scores)
        roc_auc = auc(fpr, tpr)

    return accuracy, report, cm, model, fpr, tpr, roc_auc
# Candidate classifiers keyed by the name shown in the UI. Every estimator
# that accepts a seed uses the same one.
_seed = 42
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=_seed),
    "Random Forest": RandomForestClassifier(random_state=_seed),
    "Gradient Boosting": GradientBoostingClassifier(random_state=_seed),
    "AdaBoost": AdaBoostClassifier(random_state=_seed),
    # probability=True so the SVM exposes predict_proba for the ROC curve.
    "SVM": SVC(probability=True, random_state=_seed),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=_seed),
    "Naive Bayes": GaussianNB(),
}
# Tabs for Comparison
# Same guard as the data-loading step: everything below needs the train/test
# split, which only exists when a dataset path is configured.
if file_path is not None:
    tabs = ["Model Comparison", "Individual Model Performance"]
    selected_tab = st.sidebar.radio("Select Tab", tabs)

    if selected_tab == "Model Comparison":
        st.subheader("Model Comparison")

        # Evaluate every model exactly once and keep both the accuracy row
        # and the ROC data, instead of re-running prediction for the plot.
        results = []
        roc_curves = []
        for model_name, model in models.items():
            accuracy, _, _, _, fpr, tpr, roc_auc = train_and_evaluate(
                model_name, model, X_train, y_train, X_test, y_test
            )
            results.append([model_name, accuracy])
            if fpr is not None and tpr is not None and roc_auc is not None:
                roc_curves.append((model_name, fpr, tpr, roc_auc))

        results_df = pd.DataFrame(results, columns=["Model", "Accuracy"])
        # Best model first.
        st.dataframe(results_df.sort_values(by="Accuracy", ascending=False))

        # Combined ROC curve plot
        fig, ax = plt.subplots()
        for model_name, fpr, tpr, roc_auc in roc_curves:
            ax.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')
        ax.plot([0, 1], [0, 1], 'k--')  # Dashed diagonal
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC Curves')
        ax.legend()
        st.pyplot(fig)
        # Streamlit re-executes the script on every interaction; close the
        # figure so pyplot does not keep accumulating open figures.
        plt.close(fig)

    elif selected_tab == "Individual Model Performance":
        model_name = st.selectbox("Select Model", list(models.keys()))
        accuracy, report, cm, trained_model, fpr, tpr, roc_auc = train_and_evaluate(
            model_name, models[model_name], X_train, y_train, X_test, y_test
        )

        st.subheader(f"{model_name} Performance")
        st.write(f"Accuracy: {accuracy:.4f}")
        report_df = pd.DataFrame(report).transpose()
        st.dataframe(report_df)

        # Confusion matrix heatmap; labels are set on the explicit axes
        # rather than through pyplot's implicit "current axes".
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True Label")
        st.pyplot(fig)
        plt.close(fig)

        # Only estimators exposing feature_importances_ get this table.
        if hasattr(trained_model, "feature_importances_"):
            importances = trained_model.feature_importances_
            feature_importance_df = pd.DataFrame(
                {'Feature': X.columns, 'Importance': importances}
            )
            feature_importance_df = feature_importance_df.sort_values(
                by='Importance', ascending=False
            )
            st.write("Feature Importance:")
            st.dataframe(feature_importance_df)

        # ROC data is None for models without predict_proba.
        if fpr is not None and tpr is not None and roc_auc is not None:
            fig, ax = plt.subplots()
            ax.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')
            ax.plot([0, 1], [0, 1], 'k--')
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title('ROC Curve')
            ax.legend()
            st.pyplot(fig)
            plt.close(fig)
else:
    st.write("Please upload a CSV file to begin.")