File size: 16,368 Bytes
a9ef1a8
1c339fe
 
 
7296462
2f884d6
1c339fe
 
7296462
 
 
 
 
 
11878e3
1c339fe
7296462
da7d299
 
7296462
da7d299
7296462
 
 
1c339fe
7296462
 
 
 
1c339fe
7296462
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c339fe
 
 
 
 
7296462
1c339fe
 
 
 
7296462
1c339fe
 
 
7296462
 
 
 
 
 
 
 
 
 
 
 
 
 
2f884d6
 
 
 
 
7296462
 
 
 
 
 
 
2f884d6
 
7296462
 
 
 
 
 
 
 
 
 
 
 
 
 
2f884d6
 
7296462
 
1c339fe
 
7296462
 
 
 
 
 
 
1c339fe
 
7296462
 
c7d32bb
7296462
 
 
 
 
 
 
 
 
 
 
 
 
 
c7d32bb
 
 
 
 
 
 
 
7296462
 
 
1c339fe
 
7296462
 
 
 
 
1c339fe
7296462
 
 
 
 
 
 
 
1c339fe
7296462
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123ec7d
 
 
 
 
 
7296462
 
 
 
 
 
123ec7d
7296462
 
 
123ec7d
7296462
 
 
 
123ec7d
 
 
 
 
 
7296462
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a9ef1a8
1c339fe
7296462
1c339fe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import (accuracy_score, classification_report, 
                           roc_curve, roc_auc_score, confusion_matrix, 
                           precision_recall_curve, average_precision_score)
from sklearn.linear_model import LogisticRegression
import os

# Configuration: set both Streamlit environment flags to "false" in one
# update, then declare the page title and the wide layout.
os.environ.update({
    "STREAMLIT_BROWSER_GATHER_USAGE_STATS": "false",
    "STREAMLIT_METRICS_ENABLED": "false",
})
st.set_page_config(layout="wide", page_title="Advanced Logistic Regression")

def load_data():
    """Render the upload widget, parse the uploaded file and preview it.

    The parser is chosen from the file extension rather than from the
    browser-reported MIME type: the MIME type varies between browsers and
    operating systems, and an unrecognised value previously left ``df``
    unassigned, which surfaced as a confusing "referenced before assignment"
    error.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when no file has been
        uploaded yet, the extension is not supported, or parsing/previewing
        failed (in which case an error is shown in the app).
    """
    uploaded_data = st.file_uploader('📂 Upload Data File', type=['csv', 'txt', 'xlsx', 'xls'])
    if uploaded_data is None:
        return None

    extension = os.path.splitext(uploaded_data.name)[1].lower()
    try:
        if extension == '.txt':
            delimiter = st.radio('Select delimiter (separator)', [',', '\t', '|', ' ', 'Auto Detect'])
            if delimiter == 'Auto Detect':
                # sep=None makes the python engine sniff the delimiter.
                df = pd.read_csv(uploaded_data, sep=None, engine='python')
            else:
                df = pd.read_csv(uploaded_data, sep=delimiter)
        elif extension == '.csv':
            df = pd.read_csv(uploaded_data)
        elif extension in ('.xlsx', '.xls'):
            df = pd.read_excel(uploaded_data)
        else:
            st.error(f"Unsupported file type: {extension or uploaded_data.type}")
            return None

        # Basic data quality check
        st.write('### 🔍 Dataset Preview')
        st.dataframe(df.head())

        # Show data summary
        with st.expander("📊 Data Summary"):
            st.write("**Data Types:**")
            st.dataframe(df.dtypes.astype(str))
            st.write("**Descriptive Statistics:**")
            st.dataframe(df.describe())
            st.write("**Missing Values:**")
            st.dataframe(df.isnull().sum().rename("Missing Count"))

        return df
    except Exception as e:
        # UI boundary: report any parsing/preview failure to the user instead
        # of crashing the app.
        st.error(f"Error loading file: {str(e)}")
        return None

@st.cache_data
def calculate_vif(X):
    """Compute variance inflation factors for the numeric columns of ``X``.

    Rows with missing values and columns holding a single value are dropped
    first. An intercept column is added to the design matrix before the
    auxiliary regressions are run, so the reported VIFs are not inflated
    merely because a feature has a non-zero mean; the intercept itself is
    not reported.

    Args:
        X: DataFrame of candidate features.

    Returns:
        A DataFrame with columns ``Feature``, ``VIF`` and ``Severity``
        ("High" above 10, "Moderate" above 5, otherwise "Low"), sorted by
        descending VIF, or ``None`` when fewer than two usable numeric
        columns (or no complete rows) remain.
    """
    X = X.select_dtypes(include=[np.number]).dropna()
    if X.empty:
        # No complete rows (or no numeric columns): nothing to regress on.
        return None
    X = X.loc[:, X.nunique() > 1]
    if X.shape[1] < 2:
        return None

    # Column 0 of the design matrix is the intercept; features start at 1.
    design = sm.add_constant(X, has_constant='add').to_numpy(dtype=float)
    vif_data = pd.DataFrame({
        "Feature": X.columns,
        "VIF": [variance_inflation_factor(design, i) for i in range(1, design.shape[1])],
    })
    vif_data["Severity"] = np.where(vif_data["VIF"] > 10, "High",
                                    np.where(vif_data["VIF"] > 5, "Moderate", "Low"))
    return vif_data.sort_values("VIF", ascending=False)

def plot_roc_pr_curves(y_true, y_pred_prob):
    """Return a two-panel Plotly figure: ROC curve (left) and PR curve (right).

    The subplot titles carry the ROC AUC and the average precision, both
    computed from ``y_true`` and the predicted probabilities ``y_pred_prob``.
    """
    # Curve coordinates and summary scores for both panels.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred_prob)
    auc_value = roc_auc_score(y_true, y_pred_prob)
    precision_vals, recall_vals, _ = precision_recall_curve(y_true, y_pred_prob)
    ap_value = average_precision_score(y_true, y_pred_prob)

    panel_titles = (
        f"ROC Curve (AUC = {auc_value:.2f})",
        f"Precision-Recall Curve (AP = {ap_value:.2f})",
    )
    figure = make_subplots(rows=1, cols=2, subplot_titles=panel_titles)

    # Left panel: ROC curve plus the dashed diagonal from (0, 0) to (1, 1).
    roc_trace = go.Scatter(x=false_pos_rate, y=true_pos_rate, mode='lines', name='ROC Curve')
    figure.add_trace(roc_trace, row=1, col=1)
    diagonal_style = dict(color="black", dash="dash")
    figure.add_shape(type="line", x0=0, y0=0, x1=1, y1=1,
                     line=diagonal_style, row=1, col=1)

    # Right panel: precision against recall.
    pr_trace = go.Scatter(x=recall_vals, y=precision_vals, mode='lines', name='Precision-Recall')
    figure.add_trace(pr_trace, row=1, col=2)

    layout_options = dict(
        height=500,
        showlegend=False,
        template='plotly_white',
        xaxis_title="False Positive Rate",
        yaxis_title="True Positive Rate",
        xaxis2_title="Recall",
        yaxis2_title="Precision",
        margin=dict(l=50, r=50, b=50, t=50),
    )
    figure.update_layout(**layout_options)
    return figure

def _impute_missing(df, method):
    """Return ``df`` with missing values handled according to ``method``.

    Mean and median are computed on numeric columns only, because calling
    them on a frame with text columns raises in recent pandas versions.
    Non-numeric gaps are therefore left in place for those two methods.
    """
    if method == 'Drop rows':
        return df.dropna()
    if method == 'Fill with mode':
        modes = df.mode()
        # mode() is empty when every value is missing; nothing to fill with.
        return df.fillna(modes.iloc[0]) if not modes.empty else df
    numeric = df.select_dtypes(include=[np.number])
    fill_values = numeric.mean() if method == 'Fill with mean' else numeric.median()
    return df.fillna(fill_values)


def _handle_outliers(df, columns, method):
    """Clip or drop values outside the 1.5 * IQR fences of ``columns``.

    Columns with at most two distinct values or with a zero interquartile
    range are left alone: the fences would collapse such a column to a
    single value. ``method`` is 'None', 'Winsorize' or 'Remove outliers'.
    """
    if method == 'None':
        return df
    candidates = [col for col in columns if df[col].nunique() > 2]
    if not candidates:
        return df
    q1 = df[candidates].quantile(0.25)
    q3 = df[candidates].quantile(0.75)
    spread = q3 - q1
    usable = [col for col in candidates if spread[col] > 0]
    if not usable:
        return df
    lower = (q1 - 1.5 * spread)[usable]
    upper = (q3 + 1.5 * spread)[usable]
    if method == 'Winsorize':
        clipped = df.copy()
        clipped[usable] = clipped[usable].clip(lower=lower, upper=upper, axis=1)
        return clipped
    inside = ((df[usable] >= lower) & (df[usable] <= upper)).all(axis=1)
    return df[inside]


def _fit_transformations(X, transformations):
    """Apply the selected transformations column by column and record them.

    A transformation is applied to a column only when every value is in its
    domain (log: > -1, sqrt: >= 0, boxcox: > 0 and not constant).

    Returns:
        ``(transformed, fitted_steps, skipped)`` where ``fitted_steps`` is a
        list of ``(name, column, boxcox_lambda_or_None)`` in application
        order and ``skipped`` lists the ``(name, column)`` pairs that were
        not applicable.
    """
    from scipy.stats import boxcox

    transformed = X.astype(float)
    fitted_steps = []
    skipped = []
    for trans in transformations:
        for col in transformed.columns:
            values = transformed[col]
            if trans == 'log' and (values > -1).all():
                transformed[col] = np.log1p(values)
                fitted_steps.append((trans, col, None))
            elif trans == 'sqrt' and (values >= 0).all():
                transformed[col] = np.sqrt(values)
                fitted_steps.append((trans, col, None))
            elif trans == 'boxcox' and (values > 0).all() and values.nunique() > 1:
                # Keep the fitted lambda so predictions reuse the same map.
                new_values, lmbda = boxcox(values.to_numpy() + 1e-6)
                transformed[col] = new_values
                fitted_steps.append((trans, col, lmbda))
            else:
                skipped.append((trans, col))
    return transformed, fitted_steps, skipped


def _apply_transformations(X, fitted_steps):
    """Replay ``fitted_steps`` (from ``_fit_transformations``) on new rows.

    Out-of-domain inputs yield NaN/inf instead of raising; the caller is
    expected to check the result for finiteness.
    """
    from scipy.special import boxcox as boxcox_fixed_lambda

    out = X.astype(float)
    with np.errstate(invalid='ignore', divide='ignore'):
        for trans, col, lmbda in fitted_steps:
            if trans == 'log':
                out[col] = np.log1p(out[col])
            elif trans == 'sqrt':
                out[col] = np.sqrt(out[col])
            else:
                out[col] = boxcox_fixed_lambda(out[col].to_numpy() + 1e-6, lmbda)
    return out


def main():
    """Run the Streamlit app: load, clean, model, diagnose and predict.

    Steps, in order: upload data, handle missing values and outliers, pick
    numeric predictors and a two-valued target (encoded to 0/1), optionally
    transform and standardize the predictors, fit a statsmodels Logit model,
    show metrics/plots/diagnostics, and offer a single-row prediction form
    that replays the fitted preprocessing.
    """
    st.title('📊 Advanced Logistic Regression Analysis')
    st.markdown("""
    This tool provides comprehensive logistic regression analysis with diagnostics and visualizations.
    Upload your data, select variables, and explore the results!
    """)

    df = load_data()
    if df is None:
        return

    # Data Cleaning Section
    st.sidebar.header("Data Cleaning Options")
    if df.isnull().values.any():
        st.sidebar.warning("⚠️ Dataset contains missing values")
        impute_method = st.sidebar.selectbox(
            "Imputation method",
            ['Fill with mean', 'Fill with median', 'Fill with mode', 'Drop rows']
        )
        df = _impute_missing(df, impute_method)
    else:
        st.sidebar.info("No missing values detected")

    # Offered regardless of missing values; applied to the chosen predictors
    # once they are known (see below).
    outlier_handling = st.sidebar.selectbox(
        "Handle outliers",
        ['None', 'Winsorize', 'Remove outliers']
    )

    # Variable Selection
    st.header("Variable Selection")
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    col1, col2 = st.columns(2)
    with col1:
        # Only numeric columns can go through the transforms, scaler and model.
        predictors = st.multiselect(
            '🎯 Select Predictor Variables',
            [col for col in numeric_columns if df[col].nunique() > 1],
            help="Select multiple features for multiple regression. "
                 "Only numeric columns are listed."
        )
    with col2:
        target = st.selectbox(
            '📌 Select Binary Target Variable',
            [col for col in df.columns if col not in predictors]
        )

    # `target is None` rather than `not target`: a column label such as 0 is
    # falsy but perfectly valid.
    if not predictors or target is None:
        st.warning("Please select at least one predictor and a target variable")
        st.stop()

    # Work on complete rows of the selected columns only.
    model_df = df[list(predictors) + [target]].dropna()
    n_dropped = len(df) - len(model_df)
    if n_dropped:
        st.warning(f"Dropped {n_dropped} row(s) with missing values in the selected columns")
    model_df = _handle_outliers(model_df, predictors, outlier_handling)

    # Check if target is binary
    unique_values = model_df[target].nunique()
    if unique_values != 2:
        st.error(f"Target variable must have exactly 2 unique values (has {unique_values}). "
                 f"Unique values found: {model_df[target].unique()}")
        st.stop()

    # Encode the two labels as 0/1 (sorted order) so that Logit and the
    # 0.5-threshold predictions work for any pair of labels.
    codes, classes = pd.factorize(model_df[target], sort=True)
    y = pd.Series(codes, index=model_df.index, name=target)
    st.caption(f"Target encoding: 0 = {classes[0]}, 1 = {classes[1]}")

    # Data Transformation Section
    st.header("Data Transformations")
    transformations = st.multiselect(
        "Apply transformations to improve model performance",
        ['log', 'sqrt', 'boxcox'],
        help="Log and sqrt help with right-skewed data. Box-Cox requires positive values."
    )
    X, fitted_steps, skipped = _fit_transformations(model_df[predictors], transformations)
    if skipped:
        details = ", ".join(f"{trans} on {col}" for trans, col in skipped)
        st.warning(f"Skipped transformations outside their valid domain: {details}")

    # Model Configuration
    st.header("Model Configuration")
    col1, col2 = st.columns(2)
    with col1:
        test_size = st.slider('Test set size (%)', 10, 50, 20, 5) / 100
        random_state = st.number_input('Random seed', 0, 1000, 42)
    with col2:
        scale_data = st.checkbox("Standardize features", True)
        cv_folds = st.selectbox("Cross-validation folds", [3, 5, 10], index=2)

    # Split data
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
    except ValueError as exc:
        st.error(f"Could not split the data: {exc}")
        st.stop()

    # Reset indices to ensure alignment
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Standardize if requested
    scaler = None
    if scale_data:
        scaler = StandardScaler()
        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

    # has_constant='add' forces the intercept even if a split happens to
    # contain a constant column (the default would silently skip it).
    X_train_const = sm.add_constant(X_train, has_constant='add')
    X_test_const = sm.add_constant(X_test, has_constant='add')

    # Fit model with error handling
    try:
        model_sm = sm.Logit(y_train, X_train_const).fit(disp=0)
    except Exception as e:
        st.error(f"Model failed to converge: {str(e)}")
        if "perfectly predicted" in str(e):
            st.error("Solution: Check for features that perfectly predict the outcome")
        elif "indices" in str(e):
            st.error("Solution: This should be fixed by the index reset above")
        else:
            st.error("Try reducing the number of features or increasing the sample size")
        st.stop()

    # Cross-validation (cross_val_score clones and fits the estimator itself).
    try:
        cv_scores = cross_val_score(LogisticRegression(), X_train, y_train,
                                    cv=cv_folds, scoring='accuracy')
        cv_label = f"{np.mean(cv_scores):.3f}"
    except ValueError as exc:
        st.warning(f"Cross-validation skipped: {exc}")
        cv_label = "n/a"

    # Predictions
    y_pred_prob = model_sm.predict(X_test_const)
    y_pred = (y_pred_prob > 0.5).astype(int)

    # ROC-based metrics are undefined when the test split has a single class.
    both_classes_in_test = y_test.nunique() == 2

    # Performance Metrics
    st.header("Model Performance")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Accuracy", f"{accuracy_score(y_test, y_pred):.3f}")
    with col2:
        auc_label = f"{roc_auc_score(y_test, y_pred_prob):.3f}" if both_classes_in_test else "n/a"
        st.metric("ROC AUC", auc_label)
    with col3:
        st.metric("CV Accuracy (Mean)", cv_label)
    with col4:
        st.metric("Log-Likelihood", f"{model_sm.llf:.1f}")

    st.markdown("---")

    # Actual vs Predicted Probability Plot
    vis_df = pd.DataFrame({
        "Actual": y_test,
        "Predicted Probability": y_pred_prob,
        "Predicted Class": y_pred
    })

    fig_avp = px.strip(vis_df, x="Actual", y="Predicted Probability",
                       color="Actual", stripmode="overlay",
                       title="Actual vs Predicted Probability",
                       labels={"Actual": "Actual Class",
                               "Predicted Probability": "Predicted Probability"})
    fig_avp.add_hline(y=0.5, line_dash="dot", line_color="red")
    st.plotly_chart(fig_avp, use_container_width=True)

    # ROC and PR Curves
    if both_classes_in_test:
        st.plotly_chart(plot_roc_pr_curves(y_test, y_pred_prob),
                        use_container_width=True)
    else:
        st.warning("The test set contains a single class; ROC and precision-recall curves are unavailable")

    # Feature Importance
    if len(predictors) > 1:
        st.subheader("Feature Importance")
        # Position 0 is the intercept added above; the rest follow X_train.
        coefficients = model_sm.params.iloc[1:].to_numpy()
        odds_ratios = pd.DataFrame({
            'Feature': X_train.columns,
            'Odds Ratio': np.exp(coefficients),
            'Coefficient': coefficients
        }).sort_values('Odds Ratio', ascending=False)

        fig_coef = px.bar(odds_ratios, x='Feature', y='Odds Ratio',
                          color='Coefficient',
                          color_continuous_scale='RdBu',
                          title='Feature Importance (Odds Ratios)')
        st.plotly_chart(fig_coef, use_container_width=True)

    # Diagnostic Plots
    st.header("Model Diagnostics")

    with st.expander("Classification Report"):
        report = classification_report(y_test, y_pred, output_dict=True)
        report_df = pd.DataFrame(report).T
        st.dataframe(report_df.style.format({
            "precision": "{:.2f}",
            "recall": "{:.2f}",
            "f1-score": "{:.2f}",
            "support": "{:.0f}"
        }))

    with st.expander("Confusion Matrix"):
        # labels=[0, 1] keeps the matrix 2x2 even if a class is absent.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        fig_cm = px.imshow(cm, text_auto=True,
                           labels=dict(x="Predicted", y="Actual"),
                           x=['Negative', 'Positive'],
                           y=['Negative', 'Positive'])
        st.plotly_chart(fig_cm, use_container_width=True)

    with st.expander("Multicollinearity Check"):
        vif_data = calculate_vif(X_train)
        if vif_data is not None:
            fig_vif = px.bar(vif_data, x='Feature', y='VIF', color='Severity',
                             color_discrete_map={'High': 'red', 'Moderate': 'orange', 'Low': 'green'},
                             title='Variance Inflation Factors (VIF)')
            st.plotly_chart(fig_vif, use_container_width=True)

            high_vif = vif_data[vif_data['VIF'] > 10]
            if not high_vif.empty:
                st.warning("High multicollinearity detected in these features:")
                st.dataframe(high_vif)
        else:
            st.info("Not enough features to calculate VIF")

    # Model Summary
    st.header("Model Summary")
    with st.expander("Detailed Summary"):
        st.write(model_sm.summary())

    # Prediction Interface
    st.header("Make Predictions")
    st.markdown("Enter values for prediction (using original scale):")

    # Defaults come from the untransformed data, because the entered values
    # go through the fitted transformations again below.
    raw_X = model_df[predictors]
    input_values = {}
    cols = st.columns(min(3, len(predictors)))
    for i, predictor in enumerate(predictors):
        with cols[i % len(cols)]:
            spread = float(raw_X[predictor].std())
            step = spread / 10 if np.isfinite(spread) and spread > 0 else 1.0
            input_values[predictor] = st.number_input(
                str(predictor),
                value=float(raw_X[predictor].median()),
                step=step
            )

    if st.button("Predict"):
        input_df = pd.DataFrame([input_values], columns=predictors)

        # Replay exactly the transformations fitted on the training data.
        input_df = _apply_transformations(input_df, fitted_steps)
        if not np.isfinite(input_df.to_numpy()).all():
            st.error("One or more inputs are outside the valid range of the selected transformations")
            st.stop()

        # Standardize if needed
        if scaler is not None:
            input_df = pd.DataFrame(scaler.transform(input_df), columns=input_df.columns)

        # Add constant and predict
        input_df = sm.add_constant(input_df, has_constant='add')
        pred_prob = float(np.asarray(model_sm.predict(input_df))[0])
        pred_class = int(pred_prob > 0.5)

        st.success(f"**Predicted Probability:** {pred_prob:.4f}")
        st.success(f"**Predicted Class:** {pred_class}")

        # Show prediction interpretation
        if pred_prob > 0.5:
            st.info(f"The model predicts class 1 with {pred_prob:.1%} confidence")
        else:
            st.info(f"The model predicts class 0 with {1-pred_prob:.1%} confidence")

if __name__ == '__main__':
    # st.set_page_config is already called once at module level near the top
    # of this file. Calling it a second time here raises in Streamlit
    # versions that permit only one call per script run, so the duplicate
    # call is dropped and the module-level configuration applies.
    main()