# logistic-regression / src/streamlit_app.py
# Streamlit app: interactive logistic regression with diagnostics.
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import streamlit as st
from plotly.subplots import make_subplots
from scipy.stats import boxcox
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             roc_curve, roc_auc_score, confusion_matrix,
                             precision_recall_curve, average_precision_score)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Configuration
os.environ["STREAMLIT_BROWSER_GATHER_USAGE_STATS"] = "false"
os.environ["STREAMLIT_METRICS_ENABLED"] = "false"
st.set_page_config(page_title="Advanced Logistic Regression", layout="wide")
def load_data():
"""Load data with improved error handling and data type detection"""
uploaded_data = st.file_uploader('πŸ“‚ Upload Data File', type=['csv', 'txt', 'xlsx', 'xls'])
if uploaded_data is not None:
try:
if uploaded_data.type == 'text/plain':
delimiter = st.radio('Select delimiter (separator)', [',', '\t', '|', ' ', 'Auto Detect'])
if delimiter == 'Auto Detect':
df = pd.read_csv(uploaded_data, sep=None, engine='python')
else:
df = pd.read_csv(uploaded_data, sep=delimiter)
elif uploaded_data.type == 'text/csv':
df = pd.read_csv(uploaded_data)
elif uploaded_data.type in ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'application/vnd.ms-excel']:
df = pd.read_excel(uploaded_data)
# Basic data quality check
st.write('### πŸ” Dataset Preview')
st.dataframe(df.head())
# Show data summary
with st.expander("πŸ“Š Data Summary"):
st.write("**Data Types:**")
st.dataframe(df.dtypes.astype(str))
st.write("**Descriptive Statistics:**")
st.dataframe(df.describe())
st.write("**Missing Values:**")
st.dataframe(df.isnull().sum().rename("Missing Count"))
return df
except Exception as e:
st.error(f"Error loading file: {str(e)}")
return None
return None
@st.cache_data
def calculate_vif(X):
"""Calculate VIF with improved handling"""
X = X.select_dtypes(include=[np.number]).dropna()
X = X.loc[:, (X != X.iloc[0]).any()]
if X.shape[1] < 2:
return None
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif_data["Severity"] = np.where(vif_data["VIF"] > 10, "High",
np.where(vif_data["VIF"] > 5, "Moderate", "Low"))
return vif_data.sort_values("VIF", ascending=False)
def plot_roc_pr_curves(y_true, y_pred_prob):
"""Plot ROC and Precision-Recall curves side by side"""
# ROC Curve
fpr, tpr, _ = roc_curve(y_true, y_pred_prob)
roc_auc = roc_auc_score(y_true, y_pred_prob)
# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_true, y_pred_prob)
avg_precision = average_precision_score(y_true, y_pred_prob)
fig = make_subplots(rows=1, cols=2,
subplot_titles=(
f"ROC Curve (AUC = {roc_auc:.2f})",
f"Precision-Recall Curve (AP = {avg_precision:.2f})"
))
# ROC Curve
fig.add_trace(
go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC Curve'),
row=1, col=1
)
fig.add_shape(type="line", x0=0, y0=0, x1=1, y1=1,
line=dict(color="black", dash="dash"),
row=1, col=1)
# Precision-Recall Curve
fig.add_trace(
go.Scatter(x=recall, y=precision, mode='lines', name='Precision-Recall'),
row=1, col=2
)
fig.update_layout(
height=500,
showlegend=False,
template='plotly_white',
xaxis_title="False Positive Rate",
yaxis_title="True Positive Rate",
xaxis2_title="Recall",
yaxis2_title="Precision",
margin=dict(l=50, r=50, b=50, t=50)
)
return fig
def main():
st.title('πŸ“Š Advanced Logistic Regression Analysis')
st.markdown("""
This tool provides comprehensive logistic regression analysis with diagnostics and visualizations.
Upload your data, select variables, and explore the results!
""")
df = load_data()
if df is not None:
# Data Cleaning Section
st.sidebar.header("Data Cleaning Options")
if df.isnull().sum().sum() > 0:
st.sidebar.warning("⚠️ Dataset contains missing values")
impute_method = st.sidebar.selectbox(
"Imputation method",
['Fill with mean', 'Fill with median', 'Fill with mode', 'Drop rows']
)
if impute_method == 'Fill with mean':
df.fillna(df.mean(), inplace=True)
elif impute_method == 'Fill with median':
df.fillna(df.median(), inplace=True)
elif impute_method == 'Fill with mode':
df.fillna(df.mode().iloc[0], inplace=True)
elif impute_method == 'Drop rows':
df.dropna(inplace=True)
else:
st.sidebar.info("No missing values detected")
# Other cleaning options
outlier_handling = st.sidebar.selectbox(
"Handle outliers",
['None', 'Winsorize', 'Remove outliers']
)
# Variable Selection
st.header("Variable Selection")
col1, col2 = st.columns(2)
with col1:
predictors = st.multiselect(
'🎯 Select Predictor Variables',
[col for col in df.columns if df[col].nunique() > 1],
help="Select multiple features for multiple regression"
)
with col2:
target = st.selectbox(
'πŸ“Œ Select Binary Target Variable',
[col for col in df.columns if col not in predictors]
)
if not predictors or not target:
st.warning("Please select at least one predictor and a target variable")
st.stop()
# Check if target is binary
unique_values = df[target].nunique()
if unique_values != 2:
st.error(f"Target variable must have exactly 2 unique values (has {unique_values}). "
f"Unique values found: {df[target].unique()}")
st.stop()
X = df[predictors]
y = df[target]
# Data Transformation Section
st.header("Data Transformations")
transformations = st.multiselect(
"Apply transformations to improve model performance",
['log', 'sqrt', 'boxcox'],
help="Log and sqrt help with right-skewed data. Box-Cox requires positive values."
)
if transformations:
for trans in transformations:
if trans == 'log':
X = np.log1p(X)
elif trans == 'sqrt':
X = np.sqrt(X)
elif trans == 'boxcox':
for col in X.columns:
if (X[col] > 0).all():
X[col], _ = boxcox(X[col] + 1e-6)
# Model Configuration
st.header("Model Configuration")
col1, col2 = st.columns(2)
with col1:
test_size = st.slider('Test set size (%)', 10, 50, 20, 5)/100
random_state = st.number_input('Random seed', 0, 1000, 42)
with col2:
scale_data = st.checkbox("Standardize features", True)
cv_folds = st.selectbox("Cross-validation folds", [3, 5, 10], 2)
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=test_size, random_state=random_state
)
# Reset indices to ensure alignment
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
# Standardize if requested
if scale_data:
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
# Add constant after resetting indices
X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)
# Fit model with error handling
try:
model_sm = sm.Logit(y_train, X_train_const).fit(disp=0)
except Exception as e:
st.error(f"Model failed to converge: {str(e)}")
if "perfectly predicted" in str(e):
st.error("Solution: Check for features that perfectly predict the outcome")
elif "indices" in str(e):
st.error("Solution: This should be fixed by the index reset above")
else:
st.error("Try reducing the number of features or increasing the sample size")
st.stop()
model_sk = LogisticRegression().fit(X_train, y_train)
# Cross-validation
cv_scores = cross_val_score(model_sk, X_train, y_train,
cv=cv_folds, scoring='accuracy')
# Predictions
y_pred_prob = model_sm.predict(X_test_const)
y_pred = (y_pred_prob > 0.5).astype(int)
y_train_pred = model_sm.predict(X_train_const)
# Performance Metrics
st.header("Model Performance")
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Accuracy", f"{accuracy_score(y_test, y_pred):.3f}")
with col2:
st.metric("ROC AUC", f"{roc_auc_score(y_test, y_pred_prob):.3f}")
with col3:
st.metric("CV Accuracy (Mean)", f"{np.mean(cv_scores):.3f}")
with col4:
st.metric("Log-Likelihood", f"{model_sm.llf:.1f}")
st.markdown("---")
# Actual vs Predicted Probability Plot
vis_df = pd.DataFrame({
"Actual": y_test,
"Predicted Probability": y_pred_prob,
"Predicted Class": y_pred
})
fig_avp = px.strip(vis_df, x="Actual", y="Predicted Probability",
color="Actual", stripmode="overlay",
title="Actual vs Predicted Probability",
labels={"Actual":"Actual Class",
"Predicted Probability":"Predicted Probability"})
fig_avp.add_hline(y=0.5, line_dash="dot", line_color="red")
st.plotly_chart(fig_avp, use_container_width=True)
# ROC and PR Curves
st.plotly_chart(plot_roc_pr_curves(y_test, y_pred_prob),
use_container_width=True)
# Feature Importance
if len(predictors) > 1:
st.subheader("Feature Importance")
odds_ratios = pd.DataFrame({
'Feature': X_train.columns,
'Odds Ratio': np.exp(model_sm.params[1:]),
'Coefficient': model_sm.params[1:]
}).sort_values('Odds Ratio', ascending=False)
fig_coef = px.bar(odds_ratios, x='Feature', y='Odds Ratio',
color='Coefficient',
color_continuous_scale='RdBu',
title='Feature Importance (Odds Ratios)')
st.plotly_chart(fig_coef, use_container_width=True)
# Diagnostic Plots
st.header("Model Diagnostics")
with st.expander("Classification Report"):
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).T
st.dataframe(report_df.style.format({
"precision": "{:.2f}",
"recall": "{:.2f}",
"f1-score": "{:.2f}",
"support": "{:.0f}"
}))
with st.expander("Confusion Matrix"):
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm,
index=['Actual Negative', 'Actual Positive'],
columns=['Predicted Negative', 'Predicted Positive'])
fig_cm = px.imshow(cm, text_auto=True,
labels=dict(x="Predicted", y="Actual"),
x=['Negative', 'Positive'],
y=['Negative', 'Positive'])
st.plotly_chart(fig_cm, use_container_width=True)
with st.expander("Multicollinearity Check"):
vif_data = calculate_vif(X_train)
if vif_data is not None:
fig_vif = px.bar(vif_data, x='Feature', y='VIF', color='Severity',
color_discrete_map={'High': 'red', 'Moderate': 'orange', 'Low': 'green'},
title='Variance Inflation Factors (VIF)')
st.plotly_chart(fig_vif, use_container_width=True)
high_vif = vif_data[vif_data['VIF'] > 10]
if not high_vif.empty:
st.warning("High multicollinearity detected in these features:")
st.dataframe(high_vif)
else:
st.info("Not enough features to calculate VIF")
# Model Summary
st.header("Model Summary")
with st.expander("Detailed Summary"):
st.write(model_sm.summary())
# Prediction Interface
st.header("Make Predictions")
st.markdown("Enter values for prediction (using original scale):")
input_values = {}
cols = st.columns(min(3, len(predictors)))
for i, predictor in enumerate(predictors):
with cols[i % len(cols)]:
input_values[predictor] = st.number_input(
predictor,
value=float(X[predictor].median()),
step=float(X[predictor].std()/10)
)
if st.button("Predict"):
input_df = pd.DataFrame([input_values])
# Apply transformations if needed
if transformations:
for trans in transformations:
if trans == 'log':
input_df = np.log1p(input_df)
elif trans == 'sqrt':
input_df = np.sqrt(input_df)
elif trans == 'boxcox':
for col in input_df.columns:
if (input_df[col] > 0).all():
input_df[col], _ = boxcox(input_df[col] + 1e-6)
# Standardize if needed
if scale_data:
input_df = pd.DataFrame(scaler.transform(input_df), columns=input_df.columns)
# Add constant and predict
input_df = sm.add_constant(input_df, has_constant='add')
pred_prob = model_sm.predict(input_df)[0]
pred_class = int(pred_prob > 0.5)
st.success(f"**Predicted Probability:** {pred_prob:.4f}")
st.success(f"**Predicted Class:** {pred_class}")
# Show prediction interpretation
if pred_prob > 0.5:
st.info(f"The model predicts class 1 with {pred_prob:.1%} confidence")
else:
st.info(f"The model predicts class 0 with {1-pred_prob:.1%} confidence")
if __name__ == '__main__':
st.set_page_config(page_title="Logistic Regression Analysis", layout="wide")
main()