File size: 15,881 Bytes
1a947f0
 
 
 
 
f1cb155
f9e2d27
 
1a947f0
 
 
 
 
ae41b32
 
 
 
 
 
 
994e406
 
 
f1cb155
4279b2f
f1cb155
ae41b32
 
f1cb155
ae41b32
f1cb155
 
 
4279b2f
ae41b32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ea9add
 
d8dc529
2ea9add
 
d8dc529
2ea9add
 
 
 
 
d8dc529
2ea9add
4279b2f
994e406
 
f1cb155
994e406
 
 
 
 
 
 
 
 
d8dc529
994e406
 
4279b2f
 
994e406
 
1a947f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8dc529
 
 
 
 
1a947f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ea9add
 
 
 
 
1a947f0
 
2ea9add
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a947f0
 
 
 
 
 
 
2ea9add
 
 
1a947f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1cb155
1a947f0
4279b2f
 
1a947f0
 
 
 
4279b2f
 
1a947f0
 
 
43959e6
 
 
1a947f0
 
994e406
 
1a947f0
 
ae41b32
 
 
2ea9add
 
 
 
 
 
43959e6
2ea9add
 
43959e6
1a947f0
 
43959e6
 
 
 
 
 
 
 
 
 
 
d8dc529
1a947f0
 
 
 
994e406
1a947f0
994e406
 
 
1a947f0
994e406
a74193e
ae41b32
43959e6
a74193e
 
43959e6
a74193e
4279b2f
 
994e406
 
 
1a947f0
994e406
1a947f0
f49cc38
 
 
 
f9e2d27
f49cc38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9e2d27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a947f0
f1cb155
1a947f0
 
 
 
 
 
 
f9e2d27
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
import streamlit as st
import joblib
import numpy as np
import os
import pandas as pd
import openpyxl
import zipfile
import io

# Load the shared preprocessor object from disk with joblib. The prediction
# code below calls preprocessor.transform(...) on the input features for every
# model that does not carry its own 'preprocessor' step.
# NOTE(review): joblib.load unpickles the file, so it must only ever point at
# a trusted artifact. This runs at module import time with no error handling;
# a missing file aborts the app. Presumably Streamlit re-executes this on each
# interaction, so caching the load may be worthwhile — TODO confirm.
preprocessor_path = 'modelExports/preprocessor.pkl'
preprocessor = joblib.load(preprocessor_path)

def find_header_row(df, required_columns, max_rows_to_check=10):
    """Return the position of the first row of *df* that holds the headers.

    A row qualifies when every name in *required_columns* appears among its
    cells. Both sides are compared as upper-cased, whitespace-stripped
    strings (cells are passed through ``str`` first). Only the first
    *max_rows_to_check* rows are examined.

    Returns the zero-based positional row index, or -1 when no row matches.
    """
    wanted = {name.upper().strip() for name in required_columns}
    rows_to_scan = min(max_rows_to_check, len(df))

    def _is_header(position):
        # Normalise the row's cells the same way as the required names.
        cells = {str(cell).upper().strip() for cell in df.iloc[position].values}
        return wanted <= cells

    return next((pos for pos in range(rows_to_scan) if _is_header(pos)), -1)

def _read_uploaded_table(uploaded_file, file_extension, **read_kwargs):
    """Read *uploaded_file* as CSV or Excel, forwarding *read_kwargs* to pandas."""
    if file_extension == 'csv':
        return pd.read_csv(uploaded_file, **read_kwargs)
    # openpyxl can only parse the .xlsx container. For legacy .xls uploads
    # leave the engine unset so pandas picks a suitable reader from the file
    # content instead of failing inside openpyxl.
    engine = 'openpyxl' if file_extension == 'xlsx' else None
    return pd.read_excel(uploaded_file, engine=engine, **read_kwargs)


def process_uploaded_file(uploaded_file, required_columns):
    """Load an uploaded CSV/Excel file into a DataFrame with normalised headers.

    The first rows of the file are scanned with ``find_header_row`` to locate
    the header line, the file is then re-read using that line as the header,
    and all column labels are upper-cased and stripped.

    Args:
        uploaded_file: Seekable file-like object exposing a ``name``
            attribute; the extension of ``name`` selects the reader.
        required_columns: Column names that must be present
            (matched case-insensitively, ignoring surrounding whitespace).

    Returns:
        The loaded DataFrame, or ``None`` when the format is unsupported, the
        header row or a required column cannot be found, or reading fails.
        Every failure is reported to the user through ``st.error``.
    """
    try:
        file_extension = uploaded_file.name.split('.')[-1].lower()

        if file_extension not in ('csv', 'xlsx', 'xls'):
            st.error("Unsupported file format. Please upload a CSV or Excel file.")
            return None

        # Start from the beginning even if the buffer was consumed earlier.
        uploaded_file.seek(0)

        # Read the first few rows, without a header, to look for the header line.
        preview = _read_uploaded_table(
            uploaded_file, file_extension, nrows=10, header=None)

        header_row = find_header_row(preview, required_columns)

        if header_row == -1:
            st.error(f"Required columns not found in the first {len(preview)} rows.")
            st.write("Expected columns:", required_columns)
            st.write("Found data rows:", preview.head().values.tolist())
            return None

        # Re-read the whole file with the detected header row.
        uploaded_file.seek(0)  # Reset file pointer
        df = _read_uploaded_table(uploaded_file, file_extension, header=header_row)

        # Normalise labels the same way find_header_row does (via str()), so
        # non-string header cells become text instead of NaN labels.
        df.columns = [str(col).upper().strip() for col in df.columns]

        required_columns_upper = [col.upper().strip() for col in required_columns]

        # Check that all required columns are present after normalisation.
        missing_columns = [col for col in required_columns_upper if col not in df.columns]
        if missing_columns:
            st.error(f"The following required columns are missing: {missing_columns}")
            return None

        st.write(f"Uploaded data has {df.shape[0]} rows and {df.shape[1]} columns.")

        return df
    except Exception as e:
        # UI boundary: surface any read/parse problem to the user rather than
        # letting the page crash.
        st.error(f"Error reading the file: {e}")
        return None

def predict_with_model(model, data, includes_preprocessor):
    """Return ``model.predict`` for *data*, preprocessing first when required.

    When *includes_preprocessor* is truthy the model receives *data* as-is;
    otherwise *data* is first passed through the module-level
    ``preprocessor.transform``.
    """
    features = data if includes_preprocessor else preprocessor.transform(data)
    return model.predict(features)

def create_model_output(df, predictions, model_name):
    """Assemble the per-row result table for one model.

    The returned frame contains the 'PID' column copied from *df*, followed
    by 'Prediction' and 'Churn Probability' (both filled with *predictions*
    unchanged) and 'Churn' ('Yes' where the prediction equals 1, else 'No').
    *df* itself is not modified. *model_name* is accepted but not used.

    NOTE(review): 'Churn Probability' holds the same values as 'Prediction',
    not a probability — confirm whether that is intended.
    """
    def _as_label(value):
        return 'Yes' if value == 1 else 'No'

    # assign() works on a copy, so the caller's frame stays untouched.
    output_df = df[['PID']].assign(**{
        'Prediction': predictions,
        'Churn Probability': predictions,
    })
    output_df['Churn'] = output_df['Prediction'].apply(_as_label)
    return output_df

# Discover the pickled models in the export folder. Each one is registered
# under a display name derived from its file name, together with a flag that
# records whether the model carries its own 'preprocessor' step.
# NOTE(review): joblib.load unpickles arbitrary objects; the folder must only
# contain trusted artifacts.
model_folder = 'modelExports'
models = {}
models_with_preprocessor = {}
for file_name in os.listdir(model_folder):
    # Ignore anything that is not a pickle, and the standalone preprocessor.
    if not file_name.endswith('.pkl') or file_name == 'preprocessor.pkl':
        continue

    # Display name: drop '.pkl', turn underscores into spaces, upper-case.
    model_name = file_name.replace('.pkl', '').replace('_', ' ').upper()
    model = joblib.load(os.path.join(model_folder, file_name))
    models[model_name] = model

    # Objects exposing `named_steps` with a 'preprocessor' entry take raw
    # data; everything else needs the shared preprocessor applied first.
    try:
        pipeline_steps = model.named_steps
    except AttributeError:
        includes_preprocessor = False
    else:
        includes_preprocessor = 'preprocessor' in pipeline_steps
    models_with_preprocessor[model_name] = includes_preprocessor

# Hard-coded accuracy figure, in percent, for each known model. Keys use the
# same upper-case, space-separated form as the display names built from the
# model file names. The single-prediction view uses these numbers as vote
# weights and the sidebar lists them.
# NOTE(review): the values are static literals — presumably taken from an
# offline evaluation; confirm they still match the exported models.
model_accuracies = {
    "GAUSSIAN NAIVE BAYES WITH SMOTE MODEL": 86,
    "GAUSSIAN NAIVE BAYES WITHOUT SMOTE MODEL": 85,
    "GRADIENT BOOSTING WITH SMOTE MODEL": 95,
    "GRADIENT BOOSTING WITHOUT SMOTE MODEL": 94,
    "LINEAR DISCRIMINANT ANALYSIS WITH SMOTE MODEL": 88,
    "LINEAR DISCRIMINANT ANALYSIS WITHOUT SMOTE MODEL": 87,
    "LOGISTIC REGRESSION WITH SMOTE MODEL": 90,
    "LOGISTIC REGRESSION WITHOUT SMOTE MODEL": 89,
    "RANDOM FOREST WITH SMOTE MODEL": 95,
    "RANDOM FOREST WITHOUT SMOTE MODEL": 93,
    "SUPPORT VECTOR MACHINE WITH SMOTE MODEL": 91,
    "SUPPORT VECTOR MACHINE WITHOUT SMOTE MODEL": 90,
    "ADABOOST WITH SMOTE MODEL": 92,
    "ADABOOST WITHOUT SMOTE MODEL": 90,
    "DECISION TREE WITH SMOTE MODEL": 88,
    "DECISION TREE WITHOUT SMOTE MODEL": 86
}

# Page title.
st.title('Customer Churn Prediction')

# Sidebar: choose between the single-customer form and the file-based batch run.
st.sidebar.header('Interface Selection')
interface = st.sidebar.radio(
    "Choose an interface", ("Single Prediction", "Batch Prediction"))

# Sidebar: choose which loaded models take part; all are selected by default.
st.sidebar.header('Model Selection')
available_model_names = list(models)
selected_models = st.sidebar.multiselect(
    'Select models for prediction',
    available_model_names,
    default=available_model_names,
)

# Choices offered by the categorical select boxes of the single-prediction form.
# NOTE(review): 'Sliver' is listed next to 'Silver' — presumably mirroring a
# value present in the training data; confirm before changing it.
crm_pid_value_segment_options = [
    'Bronze', 'Iron', 'Gold', 'Silver', 'Lead',
    'Platinum', 'SME', 'SE', 'Sliver', 'Unknown',
]
effective_segment_options = ['SOHO', 'VSE', 'Other', 'SME', 'LE', 'SE']
ka_name_options = [
    'Vladimir Manahilov', 'Desislava Ivanova', 'Martin Tilev',
    'Anna Dimitrova', 'Rumiana Jordanova', 'Anna Dimova',
    'Vania Uzunova', 'Varta Torosian', 'Daniela Stefanova',
    'Ginka Vachkova', 'Tatiana Trifonova', 'Jenia Gogova', 'Unknown',
]

if interface == "Single Prediction":
    # Single-customer form: collect one record, run every selected model on it
    # and combine the individual answers into an accuracy-weighted vote.
    st.header('Enter New Customer Data')

    # Collect input data
    input_data = {}

    # Categorical inputs
    input_data['CRM_PID_VALUE_SEGMENT'] = st.selectbox(
        'CRM_PID_VALUE_SEGMENT', crm_pid_value_segment_options)
    input_data['EFFECTIVESEGMENT'] = st.selectbox(
        'EFFECTIVESEGMENT', effective_segment_options)
    input_data['KA_NAME'] = st.selectbox('KA_NAME', ka_name_options)

    # Numerical inputs: integer-valued counts first, then monetary amounts.
    for field in ('BILLING_ZIP', 'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS',
                  'SUSPENDED_SUBSCRIBERS', 'TOTAL_SUBS'):
        input_data[field] = st.number_input(field, min_value=0, format="%d")
    for field in ('AVGMOBILEREVENUE', 'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'):
        input_data[field] = st.number_input(field, min_value=0.0, format="%.2f")

    # Predict churn
    if st.button('Predict Churn'):
        # Convert input data to a one-row DataFrame
        input_df = pd.DataFrame([input_data])

        # Standardize column names to uppercase
        input_df.columns = input_df.columns.str.upper().str.strip()

        st.write("### Model Predictions")

        predictions = {}
        weighted_votes = {'Churn': 0, 'No Churn': 0}

        # The shared preprocessor is applied lazily, at most once, and only if
        # some selected model needs it. Doing it inside the try block means a
        # transform failure is reported per model instead of crashing the page.
        input_data_transformed = None

        for model_name in selected_models:
            model = models[model_name]
            includes_preprocessor = models_with_preprocessor[model_name]

            try:
                if includes_preprocessor:
                    # Model includes preprocessor; use raw data
                    prediction = model.predict(input_df)
                else:
                    # Model does not include preprocessor; use preprocessed data
                    if input_data_transformed is None:
                        input_data_transformed = preprocessor.transform(input_df)
                    prediction = model.predict(input_data_transformed)
            except Exception as e:
                st.error(f"Error predicting with model {model_name}: {e}")
                continue

            churn_prediction = 'Churn' if prediction[0] == 1 else 'No Churn'
            predictions[model_name] = churn_prediction

            # Weighted vote: models without a recorded accuracy still count,
            # with a minimal weight of 1, but are not shown as "1%" accurate.
            accuracy = model_accuracies.get(model_name)
            weight = 1 if accuracy is None else accuracy
            weighted_votes[churn_prediction] += weight

            # Display individual model predictions
            accuracy_label = "n/a" if accuracy is None else f"{accuracy}%"
            st.write(
                f"**{model_name}:** {churn_prediction} (Accuracy: {accuracy_label})")

        # Calculate and display the overall prediction
        total_weight = sum(weighted_votes.values())
        if total_weight == 0:
            st.error(
                "No valid predictions were made. Cannot compute churn probability.")
        else:
            # Share of the total vote weight that went to 'Churn'.
            churn_probability = weighted_votes['Churn'] / total_weight
            overall_prediction = 'Churn' if churn_probability > 0.5 else 'No Churn'

            st.write("### Overall Prediction")
            st.write(f"**Final Prediction:** {overall_prediction}")
            st.write(f"**Churn Probability:** {churn_probability:.2%}")
            st.write(f"**No Churn Probability:** {1 - churn_probability:.2%}")

            # Visualize the predictions
            st.write("### Prediction Visualization")
            chart_data = pd.DataFrame(
                {
                    'Prediction': ['Churn', 'No Churn'],
                    'Weighted Vote': [
                        weighted_votes['Churn'],
                        weighted_votes['No Churn']
                    ]
                }
            )
            st.bar_chart(chart_data.set_index('Prediction'))

elif interface == "Batch Prediction":
    # Batch Prediction Interface
    st.header('Batch Prediction')
    st.write('Upload a CSV or Excel file containing customer data.')

    uploaded_file = st.file_uploader(
        "Choose a CSV or Excel file", type=["csv", "xlsx", "xls"])

    if uploaded_file is not None:
        # Check if models are selected
        if not selected_models:
            st.error(
                "No models selected for prediction. Please select at least one model in the sidebar.")
            st.stop()

        required_columns = [
            'PID', 'CRM_PID_VALUE_SEGMENT', 'EFFECTIVESEGMENT', 'BILLING_ZIP', 'KA_NAME',
            'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS', 'SUSPENDED_SUBSCRIBERS',
            'TOTAL_SUBS', 'AVGMOBILEREVENUE', 'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'
        ]

        df = process_uploaded_file(uploaded_file, required_columns)
        if df is None:
            st.stop()

        # Standardize required columns to uppercase and strip spaces
        required_columns_upper = [col.upper().strip() for col in required_columns]

        # Convert numerical columns to numeric data types
        numerical_columns = [
            'BILLING_ZIP', 'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS',
            'SUSPENDED_SUBSCRIBERS', 'TOTAL_SUBS', 'AVGMOBILEREVENUE',
            'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'
        ]

        for col in numerical_columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Fill missing values if any
        df.fillna({
            'CRM_PID_VALUE_SEGMENT': 'Unknown',
            'EFFECTIVESEGMENT': 'Unknown',
            'KA_NAME': 'Unknown',
            'BILLING_ZIP': 0,
            'ACTIVE_SUBSCRIBERS': 0,
            'NOT_ACTIVE_SUBSCRIBERS': 0,
            'SUSPENDED_SUBSCRIBERS': 0,
            'TOTAL_SUBS': 0,
            'AVGMOBILEREVENUE': 0.0,
            'AVGFIXREVENUE': 0.0,
            'TOTALREVENUE': 0.0,
            'ARPU': 0.0
        }, inplace=True)

        st.write("### Processing Batch Predictions...")

        model_outputs = {}

        for model_name in selected_models:
            model = models[model_name]
            includes_preprocessor = models_with_preprocessor[model_name]

            try:
                # Prepare data for prediction
                data_for_prediction = df[required_columns_upper[1:]]  # Exclude 'PID'

                if not includes_preprocessor:
                    data_for_prediction = preprocessor.transform(data_for_prediction)

                predictions = model.predict(data_for_prediction)
                model_outputs[model_name] = create_model_output(
                    df, predictions, model_name)
                st.success(f"Predictions completed for {model_name}")
            except Exception as e:
                st.error(f"Error predicting with model {model_name}: {e}")

        st.success('Batch predictions completed for all selected models.')

        # Download options
        st.header('Download Predictions')
        download_option = st.radio(
            "Choose how to download your predictions:",
            ("All Models in Separate Files", "Churn and Non-Churn in Separate Files", "Download All at Once")
        )

        if download_option == "All Models in Separate Files":
            # Allow user to download the results for each model
            for model_name, output_df in model_outputs.items():
                csv = output_df.to_csv(index=False).encode('utf-8')
                st.download_button(
                    label=f"Download {model_name} Predictions as CSV",
                    data=csv,
                    file_name=f'{model_name.lower().replace(" ", "_")}_predictions.csv',
                    mime='text/csv',
                )
        elif download_option == "Churn and Non-Churn in Separate Files":
            # Consolidate results for all models and split into churn and non-churn files
            for model_name, output_df in model_outputs.items():
                churn_df = output_df[output_df['Churn'] == 'Yes']
                non_churn_df = output_df[output_df['Churn'] == 'No']

                churn_csv = churn_df.to_csv(index=False).encode('utf-8')
                non_churn_csv = non_churn_df.to_csv(index=False).encode('utf-8')

                st.download_button(
                    label=f"Download {model_name} Churn Predictions as CSV",
                    data=churn_csv,
                    file_name=f'{model_name.lower().replace(" ", "_")}_churn_predictions.csv',
                    mime='text/csv',
                )
                st.download_button(
                    label=f"Download {model_name} Non-Churn Predictions as CSV",
                    data=non_churn_csv,
                    file_name=f'{model_name.lower().replace(" ", "_")}_non_churn_predictions.csv',
                    mime='text/csv',
                )
        elif download_option == "Download All at Once":
            # Create a zip file containing all outputs
            zip_buffer = io.BytesIO()
            with zipfile.ZipFile(zip_buffer, "w") as zip_file:
                for model_name, output_df in model_outputs.items():
                    csv_data = output_df.to_csv(index=False).encode('utf-8')
                    zip_file.writestr(f'{model_name.lower().replace(" ", "_")}_predictions.csv', csv_data)
            zip_buffer.seek(0)

            st.download_button(
                label="Download All Predictions as ZIP",
                data=zip_buffer,
                file_name='all_model_predictions.zip',
                mime='application/zip'
            )
    else:
        st.info('Awaiting CSV or Excel file to be uploaded.')

# Sidebar summary: how many models were loaded and selected, followed by the
# hard-coded accuracy table (every entry is listed, loaded or not).
sidebar = st.sidebar
sidebar.write("### Model Information")
sidebar.write(f"Total models available: {len(models)}")
sidebar.write(f"Models selected for prediction: {len(selected_models)}")
sidebar.write("### Model Accuracies")
for listed_name, listed_accuracy in model_accuracies.items():
    sidebar.write(f"{listed_name}: {listed_accuracy}%")