"""Streamlit app for interactive logistic regression analysis.

Upload a dataset, choose predictors and a binary target, optionally clean and
transform the data, then fit a logistic regression, inspect diagnostics
(ROC/PR curves, VIF, confusion matrix) and make single-row predictions.
"""

import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import streamlit as st
from plotly.subplots import make_subplots
from scipy.stats import boxcox  # BUG FIX: was used below without being imported
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, roc_curve,
                             roc_auc_score, confusion_matrix,
                             precision_recall_curve, average_precision_score)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Configuration: disable Streamlit telemetry, set up the page once.
# NOTE: st.set_page_config must be the first Streamlit command and may only be
# called once per run (a second call raises StreamlitAPIException).
os.environ["STREAMLIT_BROWSER_GATHER_USAGE_STATS"] = "false"
os.environ["STREAMLIT_METRICS_ENABLED"] = "false"
st.set_page_config(page_title="Advanced Logistic Regression", layout="wide")


def load_data():
    """Load data with improved error handling and data type detection.

    Returns the uploaded file as a DataFrame (also rendering a preview and
    summary), or None if nothing was uploaded or loading failed.
    """
    uploaded_data = st.file_uploader('📂 Upload Data File',
                                     type=['csv', 'txt', 'xlsx', 'xls'])
    if uploaded_data is not None:
        try:
            if uploaded_data.type == 'text/plain':
                delimiter = st.radio('Select delimiter (separator)',
                                     [',', '\t', '|', ' ', 'Auto Detect'])
                if delimiter == 'Auto Detect':
                    # sep=None + python engine lets pandas sniff the delimiter
                    df = pd.read_csv(uploaded_data, sep=None, engine='python')
                else:
                    df = pd.read_csv(uploaded_data, sep=delimiter)
            elif uploaded_data.type == 'text/csv':
                df = pd.read_csv(uploaded_data)
            elif uploaded_data.type in ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                                        'application/vnd.ms-excel']:
                df = pd.read_excel(uploaded_data)
            else:
                # BUG FIX: an unrecognised MIME type previously left ``df``
                # unbound and surfaced as an UnboundLocalError.
                st.error(f"Unsupported file type: {uploaded_data.type}")
                return None

            # Basic data quality check
            st.write('### 🔍 Dataset Preview')
            st.dataframe(df.head())

            # Show data summary
            with st.expander("📊 Data Summary"):
                st.write("**Data Types:**")
                st.dataframe(df.dtypes.astype(str))
                st.write("**Descriptive Statistics:**")
                st.dataframe(df.describe())
                st.write("**Missing Values:**")
                st.dataframe(df.isnull().sum().rename("Missing Count"))

            return df
        except Exception as e:
            st.error(f"Error loading file: {str(e)}")
            return None
    return None


@st.cache_data
def calculate_vif(X):
    """Calculate variance inflation factors for the numeric columns of X.

    Constant columns and rows with NaN are dropped first.  Returns a DataFrame
    sorted by VIF (descending) with a Low/Moderate/High severity label, or
    None when fewer than two usable features remain.
    """
    X = X.select_dtypes(include=[np.number]).dropna()
    # Drop constant columns: VIF is undefined for zero-variance features.
    X = X.loc[:, (X != X.iloc[0]).any()]
    if X.shape[1] < 2:
        return None
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                       for i in range(X.shape[1])]
    # Conventional thresholds: VIF > 10 is serious, > 5 worth watching.
    vif_data["Severity"] = np.where(vif_data["VIF"] > 10, "High",
                                    np.where(vif_data["VIF"] > 5, "Moderate", "Low"))
    return vif_data.sort_values("VIF", ascending=False)


def plot_roc_pr_curves(y_true, y_pred_prob):
    """Plot ROC and Precision-Recall curves side by side.

    Returns a plotly Figure with AUC / average precision in the subplot
    titles and a diagonal chance line on the ROC panel.
    """
    # ROC Curve
    fpr, tpr, _ = roc_curve(y_true, y_pred_prob)
    roc_auc = roc_auc_score(y_true, y_pred_prob)

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_true, y_pred_prob)
    avg_precision = average_precision_score(y_true, y_pred_prob)

    fig = make_subplots(rows=1, cols=2, subplot_titles=(
        f"ROC Curve (AUC = {roc_auc:.2f})",
        f"Precision-Recall Curve (AP = {avg_precision:.2f})"
    ))

    # ROC Curve
    fig.add_trace(
        go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC Curve'),
        row=1, col=1
    )
    # Diagonal = random-classifier baseline.
    fig.add_shape(type="line", x0=0, y0=0, x1=1, y1=1,
                  line=dict(color="black", dash="dash"), row=1, col=1)

    # Precision-Recall Curve
    fig.add_trace(
        go.Scatter(x=recall, y=precision, mode='lines', name='Precision-Recall'),
        row=1, col=2
    )

    fig.update_layout(
        height=500,
        showlegend=False,
        template='plotly_white',
        xaxis_title="False Positive Rate",
        yaxis_title="True Positive Rate",
        xaxis2_title="Recall",
        yaxis2_title="Precision",
        margin=dict(l=50, r=50, b=50, t=50)
    )
    return fig


def main():
    """Drive the full upload → clean → fit → diagnose → predict workflow."""
    st.title('📊 Advanced Logistic Regression Analysis')
    st.markdown("""
    This tool provides comprehensive logistic regression analysis with diagnostics and visualizations.
    Upload your data, select variables, and explore the results!
    """)

    df = load_data()

    if df is not None:
        # ---------------- Data Cleaning Section ----------------
        st.sidebar.header("Data Cleaning Options")
        if df.isnull().sum().sum() > 0:
            st.sidebar.warning("⚠️ Dataset contains missing values")
            impute_method = st.sidebar.selectbox(
                "Imputation method",
                ['Fill with mean', 'Fill with median', 'Fill with mode', 'Drop rows']
            )
            # BUG FIX: numeric_only=True — df.mean()/df.median() raise
            # TypeError on mixed-type frames in pandas >= 2.0.  Non-numeric
            # columns are not touched by mean/median imputation.
            if impute_method == 'Fill with mean':
                df.fillna(df.mean(numeric_only=True), inplace=True)
            elif impute_method == 'Fill with median':
                df.fillna(df.median(numeric_only=True), inplace=True)
            elif impute_method == 'Fill with mode':
                df.fillna(df.mode().iloc[0], inplace=True)
            elif impute_method == 'Drop rows':
                df.dropna(inplace=True)
        else:
            st.sidebar.info("No missing values detected")

        # Other cleaning options
        outlier_handling = st.sidebar.selectbox(
            "Handle outliers",
            ['None', 'Winsorize', 'Remove outliers']
        )
        # BUG FIX: the selected outlier strategy was previously never applied.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if outlier_handling == 'Winsorize':
            # Clip each numeric column to its 1st/99th percentiles.
            for col in numeric_cols:
                lower, upper = df[col].quantile([0.01, 0.99])
                df[col] = df[col].clip(lower, upper)
        elif outlier_handling == 'Remove outliers':
            # Drop rows outside 1.5 * IQR on any numeric column.
            for col in numeric_cols:
                q1, q3 = df[col].quantile([0.25, 0.75])
                iqr = q3 - q1
                df = df[(df[col] >= q1 - 1.5 * iqr) & (df[col] <= q3 + 1.5 * iqr)]

        # ---------------- Variable Selection ----------------
        st.header("Variable Selection")
        col1, col2 = st.columns(2)
        with col1:
            predictors = st.multiselect(
                '🎯 Select Predictor Variables',
                # Only numeric, non-constant columns: sm.Logit cannot fit
                # string-typed features.
                [col for col in df.columns
                 if df[col].nunique() > 1 and pd.api.types.is_numeric_dtype(df[col])],
                help="Select multiple features for multiple regression"
            )
        with col2:
            target = st.selectbox(
                '📌 Select Binary Target Variable',
                [col for col in df.columns if col not in predictors]
            )

        if not predictors or not target:
            st.warning("Please select at least one predictor and a target variable")
            st.stop()

        # Check if target is binary
        unique_values = df[target].nunique()
        if unique_values != 2:
            st.error(f"Target variable must have exactly 2 unique values (has {unique_values}). "
                     f"Unique values found: {df[target].unique()}")
            st.stop()

        X = df[predictors].copy()  # copy: transformations below mutate columns
        y = df[target]
        # BUG FIX: encode non-0/1 targets (e.g. 'yes'/'no') as 0/1 so that
        # sm.Logit does not fail on non-numeric labels.
        labels = sorted(y.unique(), key=str)
        if set(labels) != {0, 1}:
            y = y.map({labels[0]: 0, labels[1]: 1})
            st.info(f"Target encoded as 0 = {labels[0]}, 1 = {labels[1]}")

        # ---------------- Data Transformation Section ----------------
        st.header("Data Transformations")
        transformations = st.multiselect(
            "Apply transformations to improve model performance",
            ['log', 'sqrt', 'boxcox'],
            help="Log and sqrt help with right-skewed data. Box-Cox requires positive values."
        )

        # Fitted Box-Cox lambdas, keyed by column; reused at prediction time.
        boxcox_lambdas = {}
        if transformations:
            for trans in transformations:
                if trans == 'log':
                    X = np.log1p(X)
                elif trans == 'sqrt':
                    X = np.sqrt(X)
                elif trans == 'boxcox':
                    for col in X.columns:
                        if (X[col] > 0).all():
                            # BUG FIX: the fitted lambda was discarded, so the
                            # prediction path refit boxcox on a single row,
                            # which crashes.  Keep it for reuse below.
                            X[col], boxcox_lambdas[col] = boxcox(X[col] + 1e-6)

        # ---------------- Model Configuration ----------------
        st.header("Model Configuration")
        col1, col2 = st.columns(2)
        with col1:
            test_size = st.slider('Test set size (%)', 10, 50, 20, 5) / 100
            random_state = st.number_input('Random seed', 0, 1000, 42)
        with col2:
            scale_data = st.checkbox("Standardize features", True)
            cv_folds = st.selectbox("Cross-validation folds", [3, 5, 10], 2)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Reset indices to ensure alignment between X and y after the split.
        X_train = X_train.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)

        # Standardize if requested
        if scale_data:
            scaler = StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
            X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        # Add constant after resetting indices.  has_constant='add' forces the
        # intercept column even when a split produces a constant feature,
        # keeping the design-matrix shape consistent with the prediction path.
        X_train_const = sm.add_constant(X_train, has_constant='add')
        X_test_const = sm.add_constant(X_test, has_constant='add')

        # Fit model with error handling
        try:
            model_sm = sm.Logit(y_train, X_train_const).fit(disp=0)
        except Exception as e:
            st.error(f"Model failed to converge: {str(e)}")
            if "perfectly predicted" in str(e):
                st.error("Solution: Check for features that perfectly predict the outcome")
            elif "indices" in str(e):
                st.error("Solution: This should be fixed by the index reset above")
            else:
                st.error("Try reducing the number of features or increasing the sample size")
            st.stop()

        # Parallel sklearn model, used only for cross-validation scoring.
        model_sk = LogisticRegression().fit(X_train, y_train)
        cv_scores = cross_val_score(model_sk, X_train, y_train,
                                    cv=cv_folds, scoring='accuracy')

        # Predictions (statsmodels returns probabilities; threshold at 0.5).
        y_pred_prob = model_sm.predict(X_test_const)
        y_pred = (y_pred_prob > 0.5).astype(int)

        # ---------------- Performance Metrics ----------------
        st.header("Model Performance")
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Accuracy", f"{accuracy_score(y_test, y_pred):.3f}")
        with col2:
            st.metric("ROC AUC", f"{roc_auc_score(y_test, y_pred_prob):.3f}")
        with col3:
            st.metric("CV Accuracy (Mean)", f"{np.mean(cv_scores):.3f}")
        with col4:
            st.metric("Log-Likelihood", f"{model_sm.llf:.1f}")

        st.markdown("---")

        # Actual vs Predicted Probability Plot
        vis_df = pd.DataFrame({
            "Actual": y_test,
            "Predicted Probability": y_pred_prob,
            "Predicted Class": y_pred
        })
        fig_avp = px.strip(vis_df, x="Actual", y="Predicted Probability",
                           color="Actual", stripmode="overlay",
                           title="Actual vs Predicted Probability",
                           labels={"Actual": "Actual Class",
                                   "Predicted Probability": "Predicted Probability"})
        fig_avp.add_hline(y=0.5, line_dash="dot", line_color="red")
        st.plotly_chart(fig_avp, use_container_width=True)

        # ROC and PR Curves
        st.plotly_chart(plot_roc_pr_curves(y_test, y_pred_prob),
                        use_container_width=True)

        # Feature Importance
        if len(predictors) > 1:
            st.subheader("Feature Importance")
            # Drop the intercept explicitly rather than slicing positionally.
            coefs = model_sm.params.drop('const')
            odds_ratios = pd.DataFrame({
                'Feature': coefs.index,
                'Odds Ratio': np.exp(coefs.values),
                'Coefficient': coefs.values
            }).sort_values('Odds Ratio', ascending=False)

            fig_coef = px.bar(odds_ratios, x='Feature', y='Odds Ratio',
                              color='Coefficient', color_continuous_scale='RdBu',
                              title='Feature Importance (Odds Ratios)')
            st.plotly_chart(fig_coef, use_container_width=True)

        # ---------------- Diagnostic Plots ----------------
        st.header("Model Diagnostics")

        with st.expander("Classification Report"):
            report = classification_report(y_test, y_pred, output_dict=True)
            report_df = pd.DataFrame(report).T
            st.dataframe(report_df.style.format({
                "precision": "{:.2f}",
                "recall": "{:.2f}",
                "f1-score": "{:.2f}",
                "support": "{:.0f}"
            }))

        with st.expander("Confusion Matrix"):
            cm = confusion_matrix(y_test, y_pred)
            fig_cm = px.imshow(cm,
                               text_auto=True,
                               labels=dict(x="Predicted", y="Actual"),
                               x=['Negative', 'Positive'],
                               y=['Negative', 'Positive'])
            st.plotly_chart(fig_cm, use_container_width=True)

        with st.expander("Multicollinearity Check"):
            vif_data = calculate_vif(X_train)
            if vif_data is not None:
                fig_vif = px.bar(vif_data, x='Feature', y='VIF', color='Severity',
                                 color_discrete_map={'High': 'red',
                                                     'Moderate': 'orange',
                                                     'Low': 'green'},
                                 title='Variance Inflation Factors (VIF)')
                st.plotly_chart(fig_vif, use_container_width=True)

                high_vif = vif_data[vif_data['VIF'] > 10]
                if not high_vif.empty:
                    st.warning("High multicollinearity detected in these features:")
                    st.dataframe(high_vif)
            else:
                st.info("Not enough features to calculate VIF")

        # Model Summary
        st.header("Model Summary")
        with st.expander("Detailed Summary"):
            st.write(model_sm.summary())

        # ---------------- Prediction Interface ----------------
        st.header("Make Predictions")
        st.markdown("Enter values for prediction (using original scale):")

        input_values = {}
        cols = st.columns(min(3, len(predictors)))
        for i, predictor in enumerate(predictors):
            with cols[i % len(cols)]:
                # BUG FIX: defaults come from the ORIGINAL-scale data (df),
                # not the transformed X — transformations are re-applied to
                # the entered values below.
                input_values[predictor] = st.number_input(
                    predictor,
                    value=float(df[predictor].median()),
                    step=float(df[predictor].std() / 10)
                )

        if st.button("Predict"):
            input_df = pd.DataFrame([input_values])

            # Apply the same transformations as used during training.
            if transformations:
                for trans in transformations:
                    if trans == 'log':
                        input_df = np.log1p(input_df)
                    elif trans == 'sqrt':
                        input_df = np.sqrt(input_df)
                    elif trans == 'boxcox':
                        # Reuse the training lambdas; refitting boxcox on a
                        # single row is impossible.
                        for col in input_df.columns:
                            if col in boxcox_lambdas:
                                input_df[col] = boxcox(input_df[col] + 1e-6,
                                                       lmbda=boxcox_lambdas[col])

            # Standardize if needed
            if scale_data:
                input_df = pd.DataFrame(scaler.transform(input_df),
                                        columns=input_df.columns)

            # Add constant and predict
            input_df = sm.add_constant(input_df, has_constant='add')
            pred_prob = model_sm.predict(input_df)[0]
            pred_class = int(pred_prob > 0.5)

            st.success(f"**Predicted Probability:** {pred_prob:.4f}")
            st.success(f"**Predicted Class:** {pred_class}")

            # Show prediction interpretation
            if pred_prob > 0.5:
                st.info(f"The model predicts class 1 with {pred_prob:.1%} confidence")
            else:
                st.info(f"The model predicts class 0 with {1-pred_prob:.1%} confidence")


if __name__ == '__main__':
    # BUG FIX: st.set_page_config is already called at import time above;
    # calling it a second time here raised StreamlitAPIException.
    main()