Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import StandardScaler, LabelEncoder | |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.svm import SVC | |
| from sklearn.neighbors import KNeighborsClassifier | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.naive_bayes import GaussianNB | |
| from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
st.title("Customer Churn Prediction")


# Data Loading and Preprocessing
def load_and_preprocess_data(file_path):
    """Load the churn CSV and return a numeric, model-ready DataFrame.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame in which ``TotalCharges`` is numeric, rows containing
        missing values have been dropped, the categorical columns (including
        the ``Churn`` target) are label-encoded to integers, and ``tenure``,
        ``MonthlyCharges`` and ``TotalCharges`` are standardized.
    """
    categorical_cols = [
        'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
        'InternetService', 'OnlineSecurity', 'OnlineBackup',
        'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
        'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn',
    ]
    numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

    # Read the data here instead of relying on a module-level ``df``: the
    # function previously ignored ``file_path`` and mutated an undefined global.
    df = pd.read_csv(file_path)

    # NOTE(review): the Telco churn CSV presumably carries a free-text
    # ``customerID`` column; it is not encoded below, so it is dropped to keep
    # non-numeric values out of the feature matrix. ``errors='ignore'`` makes
    # this a no-op when the column is absent - confirm against the data file.
    df = df.drop(columns=['customerID'], errors='ignore')

    # Unparseable charge values become NaN and are removed with any other
    # incomplete rows.
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df.dropna(inplace=True)

    # A fresh encoder per column: each column has its own label vocabulary.
    for col in categorical_cols:
        df[col] = LabelEncoder().fit_transform(df[col])

    # The scaler is fit on the full dataset, i.e. before the caller performs
    # its train/test split.
    df[numerical_cols] = StandardScaler().fit_transform(df[numerical_cols])
    return df
# To let users supply their own data, swap the hard-coded path for:
# file_path = st.file_uploader("Upload CSV file", type="csv")
file_path = "./WA_Fn-UseC_-Telco-Customer-Churn.csv"


def train_and_evaluate(model_name, model, X_train, y_train, X_test, y_test):
    """Fit (or reuse) a model and compute its evaluation metrics.

    A fitted model is stored in ``st.session_state.models`` under
    ``model_name`` so that Streamlit reruns reuse it instead of refitting.

    Args:
        model_name: Key under which the fitted model is cached.
        model: Unfitted estimator; ignored when a cached model exists.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for all metrics.

    Returns:
        Tuple ``(accuracy, report, cm, model, fpr, tpr, roc_auc)`` where
        ``report`` is the classification report as a dict and ``cm`` the
        confusion matrix. ``fpr``, ``tpr`` and ``roc_auc`` are ``None`` when
        the model has no ``predict_proba`` method.
    """
    cache = st.session_state.models
    if model_name not in cache:
        model.fit(X_train, y_train)
        cache[model_name] = model
    else:
        model = cache[model_name]

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)

    # ROC curve and AUC need class probabilities; models without them get None.
    fpr = tpr = roc_auc = None
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)
    return accuracy, report, cm, model, fpr, tpr, roc_auc


def _render_roc_plot(curves, title):
    """Plot one ROC line per ``(label, fpr, tpr, roc_auc)`` entry and show it."""
    fig, ax = plt.subplots()
    for label, fpr, tpr, roc_auc in curves:
        ax.plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], 'k--')  # Dashed diagonal
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    ax.legend()
    st.pyplot(fig)
    # Streamlit reruns the script on every interaction; close the figure so
    # pyplot's global registry does not grow without bound.
    plt.close(fig)


if file_path is not None:
    df = load_and_preprocess_data(file_path)
    X = df.drop('Churn', axis=1)
    y = df['Churn']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fitted models live in session state so they survive script reruns.
    if 'models' not in st.session_state:
        st.session_state.models = {}

    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "AdaBoost": AdaBoostClassifier(random_state=42),
        "SVM": SVC(probability=True, random_state=42),  # probability=True for ROC Curve
        "K-Nearest Neighbors": KNeighborsClassifier(),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Naive Bayes": GaussianNB(),
    }

    # Tabs for Comparison
    tabs = ["Model Comparison", "Individual Model Performance"]
    selected_tab = st.sidebar.radio("Select Tab", tabs)

    if selected_tab == "Model Comparison":
        st.subheader("Model Comparison")

        # Evaluate each model once and keep both the accuracy and the ROC
        # data, rather than running a second evaluation pass for the plot.
        results = []
        roc_curves = []
        for model_name, model in models.items():
            accuracy, _, _, _, fpr, tpr, roc_auc = train_and_evaluate(
                model_name, model, X_train, y_train, X_test, y_test
            )
            results.append([model_name, accuracy])
            if roc_auc is not None:
                roc_curves.append((model_name, fpr, tpr, roc_auc))

        results_df = pd.DataFrame(results, columns=["Model", "Accuracy"])
        st.dataframe(results_df.sort_values(by="Accuracy", ascending=False))  # Sort by accuracy

        # Combined ROC Curve Plot
        _render_roc_plot(roc_curves, 'ROC Curves')

    elif selected_tab == "Individual Model Performance":
        model_name = st.selectbox("Select Model", list(models.keys()))
        accuracy, report, cm, trained_model, fpr, tpr, roc_auc = train_and_evaluate(
            model_name, models[model_name], X_train, y_train, X_test, y_test
        )

        st.subheader(f"{model_name} Performance")
        st.write(f"Accuracy: {accuracy:.4f}")
        st.dataframe(pd.DataFrame(report).transpose())

        # Confusion matrix heatmap, labelled through the explicit Axes.
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True Label")
        st.pyplot(fig)
        plt.close(fig)

        # Only tree-based estimators expose feature_importances_.
        if hasattr(trained_model, "feature_importances_"):
            feature_importance_df = pd.DataFrame(
                {'Feature': X.columns, 'Importance': trained_model.feature_importances_}
            ).sort_values(by='Importance', ascending=False)
            st.write("Feature Importance:")
            st.dataframe(feature_importance_df)

        if roc_auc is not None:
            _render_roc_plot([(model_name, fpr, tpr, roc_auc)], 'ROC Curve')
else:
    st.write("Please upload a CSV file to begin.")