Spaces:
Running
Running
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import streamlit as st
from plotly.subplots import make_subplots
from scipy.stats import boxcox
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             roc_curve, roc_auc_score, confusion_matrix,
                             precision_recall_curve, average_precision_score)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Configuration: opt out of Streamlit telemetry before the app starts.
os.environ["STREAMLIT_BROWSER_GATHER_USAGE_STATS"] = "false"
os.environ["STREAMLIT_METRICS_ENABLED"] = "false"
# set_page_config must be the first Streamlit command and may run only once.
st.set_page_config(page_title="Advanced Logistic Regression", layout="wide")
def load_data():
    """Render a file uploader and return the parsed dataset.

    Supports CSV, delimited text and Excel uploads. On success, shows a
    preview plus a summary expander and returns the DataFrame; returns
    None when nothing has been uploaded yet or parsing fails.
    """
    uploaded_data = st.file_uploader('π Upload Data File', type=['csv', 'txt', 'xlsx', 'xls'])
    if uploaded_data is None:
        return None
    try:
        name = uploaded_data.name.lower()
        if uploaded_data.type == 'text/plain':
            delimiter = st.radio('Select delimiter (separator)', [',', '\t', '|', ' ', 'Auto Detect'])
            if delimiter == 'Auto Detect':
                # sep=None with the python engine lets pandas sniff the delimiter
                df = pd.read_csv(uploaded_data, sep=None, engine='python')
            else:
                df = pd.read_csv(uploaded_data, sep=delimiter)
        elif name.endswith(('.xlsx', '.xls')) or uploaded_data.type in (
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                'application/vnd.ms-excel'):
            df = pd.read_excel(uploaded_data)
        else:
            # Browsers report inconsistent MIME types for CSV uploads
            # ('text/csv', 'application/csv', 'application/octet-stream', ...).
            # Fall back to a plain CSV parse instead of leaving df unbound,
            # which previously raised a NameError for unrecognized types.
            df = pd.read_csv(uploaded_data)

        # Basic data quality check
        st.write('### π Dataset Preview')
        st.dataframe(df.head())
        with st.expander("π Data Summary"):
            st.write("**Data Types:**")
            st.dataframe(df.dtypes.astype(str))
            st.write("**Descriptive Statistics:**")
            st.dataframe(df.describe())
            st.write("**Missing Values:**")
            st.dataframe(df.isnull().sum().rename("Missing Count"))
        return df
    except Exception as e:
        st.error(f"Error loading file: {str(e)}")
        return None
def calculate_vif(X):
    """Compute variance inflation factors for the numeric columns of X.

    Parameters
    ----------
    X : pd.DataFrame
        Candidate predictor matrix; non-numeric columns are ignored and
        rows containing missing values are dropped.

    Returns
    -------
    pd.DataFrame or None
        Columns ``Feature``, ``VIF`` and ``Severity`` sorted by VIF
        descending, or ``None`` when fewer than two usable columns remain
        (VIF is undefined for a single regressor) or no rows survive.
    """
    X = X.select_dtypes(include=[np.number]).dropna()
    # Guard the empty frame *before* touching iloc[0] below, which would
    # otherwise raise IndexError when every row had a missing value.
    if X.empty:
        return None
    # Drop constant columns: their VIF is infinite/undefined.
    X = X.loc[:, (X != X.iloc[0]).any()]
    if X.shape[1] < 2:
        return None
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    # Conventional rule of thumb: VIF > 10 serious, 5-10 moderate collinearity.
    vif_data["Severity"] = np.where(vif_data["VIF"] > 10, "High",
                                    np.where(vif_data["VIF"] > 5, "Moderate", "Low"))
    return vif_data.sort_values("VIF", ascending=False)
def plot_roc_pr_curves(y_true, y_pred_prob):
    """Return a two-panel plotly figure: ROC curve (left) and
    precision-recall curve (right), with AUC / average precision in the
    subplot titles.

    Parameters are the true binary labels and the predicted positive-class
    probabilities; the figure is built but not rendered here.
    """
    fpr, tpr, _ = roc_curve(y_true, y_pred_prob)
    precision, recall, _ = precision_recall_curve(y_true, y_pred_prob)
    auc_value = roc_auc_score(y_true, y_pred_prob)
    ap_value = average_precision_score(y_true, y_pred_prob)

    panel_titles = (f"ROC Curve (AUC = {auc_value:.2f})",
                    f"Precision-Recall Curve (AP = {ap_value:.2f})")
    fig = make_subplots(rows=1, cols=2, subplot_titles=panel_titles)

    # Left panel: ROC trace plus the chance diagonal for reference.
    roc_trace = go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC Curve')
    fig.add_trace(roc_trace, row=1, col=1)
    fig.add_shape(type="line", x0=0, y0=0, x1=1, y1=1,
                  line=dict(color="black", dash="dash"), row=1, col=1)

    # Right panel: precision plotted against recall.
    pr_trace = go.Scatter(x=recall, y=precision, mode='lines',
                          name='Precision-Recall')
    fig.add_trace(pr_trace, row=1, col=2)

    fig.update_layout(height=500,
                      showlegend=False,
                      template='plotly_white',
                      xaxis_title="False Positive Rate",
                      yaxis_title="True Positive Rate",
                      xaxis2_title="Recall",
                      yaxis2_title="Precision",
                      margin=dict(l=50, r=50, b=50, t=50))
    return fig
def main():
    """Drive the Streamlit app: upload, clean, fit, evaluate, predict."""
    st.title('π Advanced Logistic Regression Analysis')
    st.markdown("""
    This tool provides comprehensive logistic regression analysis with diagnostics and visualizations.
    Upload your data, select variables, and explore the results!
    """)
    df = load_data()
    if df is None:
        return

    # ------------------------- Data cleaning -------------------------
    st.sidebar.header("Data Cleaning Options")
    if df.isnull().sum().sum() > 0:
        st.sidebar.warning("β οΈ Dataset contains missing values")
        impute_method = st.sidebar.selectbox(
            "Imputation method",
            ['Fill with mean', 'Fill with median', 'Fill with mode', 'Drop rows']
        )
        # mean()/median() are only defined for numeric columns; without
        # numeric_only=True these raise TypeError on mixed-type frames
        # (pandas >= 2.0). Mode is defined for any dtype.
        if impute_method == 'Fill with mean':
            df.fillna(df.mean(numeric_only=True), inplace=True)
        elif impute_method == 'Fill with median':
            df.fillna(df.median(numeric_only=True), inplace=True)
        elif impute_method == 'Fill with mode':
            df.fillna(df.mode().iloc[0], inplace=True)
        elif impute_method == 'Drop rows':
            df.dropna(inplace=True)
    else:
        st.sidebar.info("No missing values detected")

    outlier_handling = st.sidebar.selectbox(
        "Handle outliers",
        ['None', 'Winsorize', 'Remove outliers']
    )
    # Apply the selected outlier strategy to numeric columns only.
    # (The original UI offered this option but never acted on it.)
    num_cols = df.select_dtypes(include=[np.number]).columns
    if outlier_handling == 'Winsorize' and len(num_cols) > 0:
        # Clip to the 1st/99th percentile of each numeric column.
        lo = df[num_cols].quantile(0.01)
        hi = df[num_cols].quantile(0.99)
        df[num_cols] = df[num_cols].clip(lower=lo, upper=hi, axis=1)
    elif outlier_handling == 'Remove outliers' and len(num_cols) > 0:
        # Drop rows falling outside 1.5 * IQR on any numeric column.
        q1 = df[num_cols].quantile(0.25)
        q3 = df[num_cols].quantile(0.75)
        iqr = q3 - q1
        outside = ((df[num_cols] < (q1 - 1.5 * iqr)) |
                   (df[num_cols] > (q3 + 1.5 * iqr))).any(axis=1)
        df = df[~outside]

    # ----------------------- Variable selection -----------------------
    st.header("Variable Selection")
    col1, col2 = st.columns(2)
    with col1:
        predictors = st.multiselect(
            'π― Select Predictor Variables',
            [col for col in df.columns if df[col].nunique() > 1],
            help="Select multiple features for multiple regression"
        )
    with col2:
        target = st.selectbox(
            'π Select Binary Target Variable',
            [col for col in df.columns if col not in predictors]
        )
    if not predictors or not target:
        st.warning("Please select at least one predictor and a target variable")
        st.stop()

    # Check that the target really is binary.
    unique_values = df[target].nunique()
    if unique_values != 2:
        st.error(f"Target variable must have exactly 2 unique values (has {unique_values}). "
                 f"Unique values found: {df[target].unique()}")
        st.stop()

    X = df[predictors].copy()
    y = df[target]
    # sm.Logit requires a numeric 0/1 response; labels such as 'yes'/'no'
    # would crash the fit. Encode them and tell the user the mapping.
    if not set(pd.unique(y)).issubset({0, 1}):
        codes, labels = pd.factorize(y)
        y = pd.Series(codes, index=df.index, name=target)
        st.info(f"Target encoded: {labels[0]} -> 0, {labels[1]} -> 1")
    # Keep an untouched copy: the prediction form below asks for values on
    # the ORIGINAL (untransformed) scale, so its defaults must not come
    # from the transformed X.
    X_original = X.copy()

    # ----------------------- Data transformations ---------------------
    st.header("Data Transformations")
    transformations = st.multiselect(
        "Apply transformations to improve model performance",
        ['log', 'sqrt', 'boxcox'],
        help="Log and sqrt help with right-skewed data. Box-Cox requires positive values."
    )
    # Remember the fitted Box-Cox lambdas so the *same* transform (not a
    # refit) can be applied to new prediction inputs later.
    boxcox_lambdas = {}
    for trans in transformations:
        if trans == 'log':
            X = np.log1p(X)
        elif trans == 'sqrt':
            X = np.sqrt(X)
        elif trans == 'boxcox':
            for col in X.columns:
                if (X[col] > 0).all():
                    X[col], boxcox_lambdas[col] = boxcox(X[col])

    # ----------------------- Model configuration ----------------------
    st.header("Model Configuration")
    col1, col2 = st.columns(2)
    with col1:
        test_size = st.slider('Test set size (%)', 10, 50, 20, 5) / 100
        random_state = st.number_input('Random seed', 0, 1000, 42)
    with col2:
        scale_data = st.checkbox("Standardize features", True)
        cv_folds = st.selectbox("Cross-validation folds", [3, 5, 10], 2)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Reset indices so features and target stay positionally aligned.
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    if scale_data:
        # Fit the scaler on the training split only to avoid leakage.
        scaler = StandardScaler()
        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

    # statsmodels does not add an intercept automatically.
    X_train_const = sm.add_constant(X_train)
    X_test_const = sm.add_constant(X_test)

    try:
        model_sm = sm.Logit(y_train, X_train_const).fit(disp=0)
    except Exception as e:
        st.error(f"Model failed to converge: {str(e)}")
        if "perfectly predicted" in str(e):
            st.error("Solution: Check for features that perfectly predict the outcome")
        elif "indices" in str(e):
            st.error("Solution: This should be fixed by the index reset above")
        else:
            st.error("Try reducing the number of features or increasing the sample size")
        st.stop()

    # A sklearn twin model, used only for cross-validation scoring.
    model_sk = LogisticRegression().fit(X_train, y_train)
    cv_scores = cross_val_score(model_sk, X_train, y_train,
                                cv=cv_folds, scoring='accuracy')

    # Test-set predictions at the conventional 0.5 threshold.
    y_pred_prob = model_sm.predict(X_test_const)
    y_pred = (y_pred_prob > 0.5).astype(int)

    # ----------------------- Performance metrics ----------------------
    st.header("Model Performance")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Accuracy", f"{accuracy_score(y_test, y_pred):.3f}")
    with col2:
        st.metric("ROC AUC", f"{roc_auc_score(y_test, y_pred_prob):.3f}")
    with col3:
        st.metric("CV Accuracy (Mean)", f"{np.mean(cv_scores):.3f}")
    with col4:
        st.metric("Log-Likelihood", f"{model_sm.llf:.1f}")
    st.markdown("---")

    # Actual vs predicted probability strip plot.
    vis_df = pd.DataFrame({
        "Actual": y_test,
        "Predicted Probability": y_pred_prob,
        "Predicted Class": y_pred
    })
    fig_avp = px.strip(vis_df, x="Actual", y="Predicted Probability",
                       color="Actual", stripmode="overlay",
                       title="Actual vs Predicted Probability",
                       labels={"Actual": "Actual Class",
                               "Predicted Probability": "Predicted Probability"})
    fig_avp.add_hline(y=0.5, line_dash="dot", line_color="red")
    st.plotly_chart(fig_avp, use_container_width=True)

    # ROC and precision-recall curves.
    st.plotly_chart(plot_roc_pr_curves(y_test, y_pred_prob),
                    use_container_width=True)

    if len(predictors) > 1:
        st.subheader("Feature Importance")
        # params[0] is the intercept; odds ratio = exp(coefficient).
        odds_ratios = pd.DataFrame({
            'Feature': X_train.columns,
            'Odds Ratio': np.exp(model_sm.params[1:]),
            'Coefficient': model_sm.params[1:]
        }).sort_values('Odds Ratio', ascending=False)
        fig_coef = px.bar(odds_ratios, x='Feature', y='Odds Ratio',
                          color='Coefficient',
                          color_continuous_scale='RdBu',
                          title='Feature Importance (Odds Ratios)')
        st.plotly_chart(fig_coef, use_container_width=True)

    # ----------------------- Diagnostics ------------------------------
    st.header("Model Diagnostics")
    with st.expander("Classification Report"):
        report = classification_report(y_test, y_pred, output_dict=True)
        report_df = pd.DataFrame(report).T
        st.dataframe(report_df.style.format({
            "precision": "{:.2f}",
            "recall": "{:.2f}",
            "f1-score": "{:.2f}",
            "support": "{:.0f}"
        }))
    with st.expander("Confusion Matrix"):
        cm = confusion_matrix(y_test, y_pred)
        fig_cm = px.imshow(cm, text_auto=True,
                           labels=dict(x="Predicted", y="Actual"),
                           x=['Negative', 'Positive'],
                           y=['Negative', 'Positive'])
        st.plotly_chart(fig_cm, use_container_width=True)
    with st.expander("Multicollinearity Check"):
        vif_data = calculate_vif(X_train)
        if vif_data is not None:
            fig_vif = px.bar(vif_data, x='Feature', y='VIF', color='Severity',
                             color_discrete_map={'High': 'red', 'Moderate': 'orange', 'Low': 'green'},
                             title='Variance Inflation Factors (VIF)')
            st.plotly_chart(fig_vif, use_container_width=True)
            high_vif = vif_data[vif_data['VIF'] > 10]
            if not high_vif.empty:
                st.warning("High multicollinearity detected in these features:")
                st.dataframe(high_vif)
        else:
            st.info("Not enough features to calculate VIF")

    # ----------------------- Model summary ----------------------------
    st.header("Model Summary")
    with st.expander("Detailed Summary"):
        st.write(model_sm.summary())

    # ----------------------- Prediction interface ---------------------
    st.header("Make Predictions")
    st.markdown("Enter values for prediction (using original scale):")
    input_values = {}
    cols = st.columns(min(3, len(predictors)))
    for i, predictor in enumerate(predictors):
        with cols[i % len(cols)]:
            # Defaults come from the UNtransformed data. A zero/NaN std
            # would make number_input's step invalid, hence the floor.
            spread = float(X_original[predictor].std())
            step = spread / 10 if spread > 0 else 0.01
            input_values[predictor] = st.number_input(
                predictor,
                value=float(X_original[predictor].median()),
                step=step
            )
    if st.button("Predict"):
        input_df = pd.DataFrame([input_values])
        # Replay the training-time transformations on the new point.
        for trans in transformations:
            if trans == 'log':
                input_df = np.log1p(input_df)
            elif trans == 'sqrt':
                input_df = np.sqrt(input_df)
            elif trans == 'boxcox':
                for col in input_df.columns:
                    if col in boxcox_lambdas:
                        # Reuse the training lambda: re-fitting Box-Cox on
                        # a single observation is undefined and would use
                        # a different transform than the model saw.
                        input_df[col] = boxcox(input_df[col], lmbda=boxcox_lambdas[col])
        if scale_data:
            input_df = pd.DataFrame(scaler.transform(input_df), columns=input_df.columns)
        # Add the intercept column and predict.
        input_df = sm.add_constant(input_df, has_constant='add')
        pred_prob = float(model_sm.predict(input_df)[0])
        pred_class = int(pred_prob > 0.5)
        st.success(f"**Predicted Probability:** {pred_prob:.4f}")
        st.success(f"**Predicted Class:** {pred_class}")
        # Human-readable interpretation of the thresholded prediction.
        if pred_prob > 0.5:
            st.info(f"The model predicts class 1 with {pred_prob:.1%} confidence")
        else:
            st.info(f"The model predicts class 0 with {1-pred_prob:.1%} confidence")
if __name__ == '__main__':
    # set_page_config already ran once at import time above; Streamlit
    # permits exactly one call per session, so a second call here would
    # raise StreamlitAPIException. Just launch the app.
    main()