"""Streamlit app for customer churn prediction.

Loads every exported model found in ``modelExports`` and offers two
interfaces: a single-customer form whose result is an accuracy-weighted vote
across the selected models, and a batch mode that scores an uploaded
CSV/Excel file and offers the per-model results for download.
"""
import io
import os
import zipfile

import joblib
import numpy as np
import openpyxl
import pandas as pd
import streamlit as st

# Number of leading rows inspected when searching an upload for its header.
HEADER_SEARCH_ROWS = 10

# Load the preprocessor used by models that were exported without one.
# NOTE(review): joblib artifacts are pickles -- only load files from a
# trusted location.
preprocessor_path = 'modelExports/preprocessor.pkl'
preprocessor = joblib.load(preprocessor_path)


def find_header_row(df, required_columns, max_rows_to_check=10):
    """Return the index of the first row containing all required columns.

    The comparison is case-insensitive and ignores surrounding whitespace.

    Args:
        df: DataFrame read with ``header=None`` so that a header line, if
            any, appears as an ordinary row.
        required_columns: Column names that must all occur in the row.
        max_rows_to_check: Maximum number of leading rows to inspect.

    Returns:
        The zero-based row position of the header, or ``-1`` if no inspected
        row contains every required column.
    """
    required_upper = [col.upper().strip() for col in required_columns]
    for i in range(min(max_rows_to_check, len(df))):
        row_values = {str(val).upper().strip() for val in df.iloc[i].values}
        if all(col in row_values for col in required_upper):
            return i  # Header row found at row i
    return -1  # Header row not found


def _read_table(uploaded_file, file_extension, **read_kwargs):
    """Read an uploaded CSV or Excel file from its beginning."""
    uploaded_file.seek(0)  # The same buffer is read more than once.
    if file_extension == 'csv':
        return pd.read_csv(uploaded_file, **read_kwargs)
    # openpyxl only understands .xlsx; for legacy .xls let pandas choose the
    # engine instead of forcing one that cannot parse the file.
    engine = 'openpyxl' if file_extension == 'xlsx' else None
    return pd.read_excel(uploaded_file, engine=engine, **read_kwargs)


def process_uploaded_file(uploaded_file, required_columns):
    """Load an uploaded CSV/Excel file and validate its columns.

    The header row is located within the first ``HEADER_SEARCH_ROWS`` rows,
    the file is re-read using that row as the header, and column names are
    upper-cased and stripped. Problems are reported through ``st.error``.

    Args:
        uploaded_file: File-like upload exposing ``name`` and ``seek``.
        required_columns: Column names that must be present.

    Returns:
        The loaded DataFrame, or ``None`` if the format is unsupported, the
        header or required columns are missing, or reading fails.
    """
    try:
        file_extension = uploaded_file.name.split('.')[-1].lower()
        if file_extension not in ('csv', 'xlsx', 'xls'):
            st.error(
                "Unsupported file format. Please upload a CSV or Excel file.")
            return None

        # Read the first few rows to check for headers
        df = _read_table(
            uploaded_file, file_extension,
            nrows=HEADER_SEARCH_ROWS, header=None)

        header_row = find_header_row(
            df, required_columns, max_rows_to_check=HEADER_SEARCH_ROWS)
        if header_row == -1:
            st.error(
                f"Required columns not found in the first {len(df)} rows.")
            st.write("Expected columns:", required_columns)
            st.write("Found data rows:", df.head().values.tolist())
            return None

        # Re-read the file with the correct header row
        df = _read_table(uploaded_file, file_extension, header=header_row)

        # Standardize column names; cast to str first so that numeric or
        # empty header cells do not turn into NaN column labels.
        df.columns = df.columns.astype(str).str.upper().str.strip()

        required_columns_upper = [
            col.upper().strip() for col in required_columns]
        missing_columns = [
            col for col in required_columns_upper if col not in df.columns]
        if missing_columns:
            st.error(
                f"The following required columns are missing: {missing_columns}")
            return None

        st.write(
            f"Uploaded data has {df.shape[0]} rows and {df.shape[1]} columns.")
        return df
    except Exception as e:
        # UI boundary: surface any read/parse failure to the user.
        st.error(f"Error reading the file: {e}")
        return None


def predict_with_model(model, data, includes_preprocessor):
    """Predict with ``model``, preprocessing ``data`` first when required.

    Args:
        model: Object exposing ``predict``.
        data: Raw feature DataFrame.
        includes_preprocessor: True when the model preprocesses raw data
            itself; otherwise the module-level ``preprocessor`` is applied.

    Returns:
        Whatever ``model.predict`` returns.
    """
    if includes_preprocessor:
        return model.predict(data)
    return model.predict(preprocessor.transform(data))


def _churn_probabilities(model, features):
    """Return the predicted probability of class ``1``, or ``None``.

    ``None`` is returned when the model has no ``predict_proba``, does not
    list ``1`` among its ``classes_``, or fails while computing
    probabilities, so callers can fall back to the hard predictions.
    """
    if not hasattr(model, 'predict_proba'):
        return None
    try:
        classes = list(model.classes_)
        if 1 not in classes:
            return None
        return model.predict_proba(features)[:, classes.index(1)]
    except (AttributeError, IndexError, TypeError, ValueError):
        return None


def create_model_output(df, predictions, model_name, probabilities=None):
    """Build the per-model output table for download.

    Args:
        df: Source DataFrame; only its ``PID`` column is kept.
        predictions: Predicted labels, one per row of ``df``.
        model_name: Name of the model (currently unused, kept for callers).
        probabilities: Optional churn probabilities, one per row. When
            omitted, the ``Churn Probability`` column repeats the hard
            predictions, as before.

    Returns:
        DataFrame with columns ``PID``, ``Prediction``,
        ``Churn Probability`` and ``Churn`` (``'Yes'`` where the prediction
        equals 1, else ``'No'``).
    """
    output_df = df[['PID']].copy()  # Keep PID from original dataframe
    output_df['Prediction'] = predictions
    output_df['Churn Probability'] = (
        predictions if probabilities is None else probabilities)
    output_df['Churn'] = np.where(output_df['Prediction'] == 1, 'Yes', 'No')
    return output_df


def _csv_bytes(frame):
    """Serialize a DataFrame to UTF-8 encoded CSV without the index."""
    return frame.to_csv(index=False).encode('utf-8')


def _file_stem(model_name):
    """Turn a display model name into a lowercase, underscore file stem."""
    return model_name.lower().replace(" ", "_")


# Load models and record whether they include the preprocessor.
# Sorted so the model order in the UI is stable across runs/platforms.
# NOTE(review): Streamlit re-runs this script on every interaction, so the
# models are reloaded each time; consider caching if load time matters.
model_folder = 'modelExports'
models = {}
models_with_preprocessor = {}
for file_name in sorted(os.listdir(model_folder)):
    if not file_name.endswith('.pkl') or file_name == 'preprocessor.pkl':
        continue
    model_name = file_name.replace('.pkl', '').replace('_', ' ').upper()
    model = joblib.load(os.path.join(model_folder, file_name))
    models[model_name] = model
    # A pipeline with a 'preprocessor' step handles raw data itself.
    models_with_preprocessor[model_name] = (
        hasattr(model, 'named_steps') and 'preprocessor' in model.named_steps)

# Model accuracies (percent); also used as voting weights.
model_accuracies = {
    "GAUSSIAN NAIVE BAYES WITH SMOTE MODEL": 86,
    "GAUSSIAN NAIVE BAYES WITHOUT SMOTE MODEL": 85,
    "GRADIENT BOOSTING WITH SMOTE MODEL": 95,
    "GRADIENT BOOSTING WITHOUT SMOTE MODEL": 94,
    "LINEAR DISCRIMINANT ANALYSIS WITH SMOTE MODEL": 88,
    "LINEAR DISCRIMINANT ANALYSIS WITHOUT SMOTE MODEL": 87,
    "LOGISTIC REGRESSION WITH SMOTE MODEL": 90,
    "LOGISTIC REGRESSION WITHOUT SMOTE MODEL": 89,
    "RANDOM FOREST WITH SMOTE MODEL": 95,
    "RANDOM FOREST WITHOUT SMOTE MODEL": 93,
    "SUPPORT VECTOR MACHINE WITH SMOTE MODEL": 91,
    "SUPPORT VECTOR MACHINE WITHOUT SMOTE MODEL": 90,
    "ADABOOST WITH SMOTE MODEL": 92,
    "ADABOOST WITHOUT SMOTE MODEL": 90,
    "DECISION TREE WITH SMOTE MODEL": 88,
    "DECISION TREE WITHOUT SMOTE MODEL": 86,
}

# Define the Streamlit app
st.title('Customer Churn Prediction')

# Sidebar for interface selection
st.sidebar.header('Interface Selection')
interface = st.sidebar.radio(
    "Choose an interface",
    ("Single Prediction", "Batch Prediction")
)

# Sidebar for model selection
st.sidebar.header('Model Selection')
selected_models = st.sidebar.multiselect(
    'Select models for prediction',
    list(models.keys()),
    default=list(models.keys())
)

# Sidebar information. Rendered before the main interface so that it is
# still shown when a branch below halts the run with st.stop().
st.sidebar.write("### Model Information")
st.sidebar.write(f"Total models available: {len(models)}")
st.sidebar.write(f"Models selected for prediction: {len(selected_models)}")
st.sidebar.write("### Model Accuracies")
for listed_model, accuracy in model_accuracies.items():
    st.sidebar.write(f"{listed_model}: {accuracy}%")

# Define categorical options
crm_pid_value_segment_options = [
    'Bronze', 'Iron', 'Gold', 'Silver', 'Lead',
    'Platinum', 'SME', 'SE', 'Sliver', 'Unknown']
effective_segment_options = ['SOHO', 'VSE', 'Other', 'SME', 'LE', 'SE']
ka_name_options = [
    'Vladimir Manahilov', 'Desislava Ivanova', 'Martin Tilev',
    'Anna Dimitrova', 'Rumiana Jordanova', 'Anna Dimova', 'Vania Uzunova',
    'Varta Torosian', 'Daniela Stefanova', 'Ginka Vachkova',
    'Tatiana Trifonova', 'Jenia Gogova', 'Unknown']

if interface == "Single Prediction":
    # Input fields for new customer data
    st.header('Enter New Customer Data')

    # Collect input data
    input_data = {}

    # Categorical inputs
    input_data['CRM_PID_VALUE_SEGMENT'] = st.selectbox(
        'CRM_PID_VALUE_SEGMENT', crm_pid_value_segment_options)
    input_data['EFFECTIVESEGMENT'] = st.selectbox(
        'EFFECTIVESEGMENT', effective_segment_options)
    input_data['KA_NAME'] = st.selectbox('KA_NAME', ka_name_options)

    # Numerical inputs
    input_data['BILLING_ZIP'] = st.number_input(
        'BILLING_ZIP', min_value=0, format="%d")
    input_data['ACTIVE_SUBSCRIBERS'] = st.number_input(
        'ACTIVE_SUBSCRIBERS', min_value=0, format="%d")
    input_data['NOT_ACTIVE_SUBSCRIBERS'] = st.number_input(
        'NOT_ACTIVE_SUBSCRIBERS', min_value=0, format="%d")
    input_data['SUSPENDED_SUBSCRIBERS'] = st.number_input(
        'SUSPENDED_SUBSCRIBERS', min_value=0, format="%d")
    input_data['TOTAL_SUBS'] = st.number_input(
        'TOTAL_SUBS', min_value=0, format="%d")
    input_data['AVGMOBILEREVENUE'] = st.number_input(
        'AVGMOBILEREVENUE', min_value=0.0, format="%.2f")
    input_data['AVGFIXREVENUE'] = st.number_input(
        'AVGFIXREVENUE', min_value=0.0, format="%.2f")
    input_data['TOTALREVENUE'] = st.number_input(
        'TOTALREVENUE', min_value=0.0, format="%.2f")
    input_data['ARPU'] = st.number_input('ARPU', min_value=0.0, format="%.2f")

    # Predict churn
    if st.button('Predict Churn'):
        # Convert input data to DataFrame
        input_df = pd.DataFrame([input_data])

        # Standardize column names to uppercase
        input_df.columns = input_df.columns.str.upper().str.strip()

        st.write("### Model Predictions")
        predictions = {}
        weighted_votes = {'Churn': 0, 'No Churn': 0}

        for model_name in selected_models:
            try:
                # Preprocessing happens inside the helper, and only for
                # models that need it, so a preprocessing failure is
                # reported per model instead of crashing the page.
                prediction = predict_with_model(
                    models[model_name],
                    input_df,
                    models_with_preprocessor[model_name],
                )
            except Exception as e:
                st.error(f"Error predicting with model {model_name}: {e}")
                continue

            churn_prediction = 'Churn' if prediction[0] == 1 else 'No Churn'
            predictions[model_name] = churn_prediction

            # Add weighted vote
            weight = model_accuracies.get(model_name, 1)
            weighted_votes[churn_prediction] += weight

            # Display individual model predictions
            st.write(
                f"**{model_name}:** {churn_prediction} (Accuracy: {weight}%)")

        # Calculate and display the overall prediction
        total_weight = sum(weighted_votes.values())
        if total_weight == 0:
            st.error(
                "No valid predictions were made. Cannot compute churn probability.")
        else:
            churn_probability = weighted_votes['Churn'] / total_weight
            overall_prediction = (
                'Churn' if churn_probability > 0.5 else 'No Churn')

            st.write("### Overall Prediction")
            st.write(f"**Final Prediction:** {overall_prediction}")
            st.write(f"**Churn Probability:** {churn_probability:.2%}")
            st.write(f"**No Churn Probability:** {1 - churn_probability:.2%}")

            # Visualize the predictions
            st.write("### Prediction Visualization")
            chart_data = pd.DataFrame(
                {
                    'Prediction': ['Churn', 'No Churn'],
                    'Weighted Vote': [
                        weighted_votes['Churn'],
                        weighted_votes['No Churn'],
                    ],
                }
            )
            st.bar_chart(chart_data.set_index('Prediction'))

elif interface == "Batch Prediction":
    # Batch Prediction Interface
    st.header('Batch Prediction')
    st.write('Upload a CSV or Excel file containing customer data.')

    uploaded_file = st.file_uploader(
        "Choose a CSV or Excel file", type=["csv", "xlsx", "xls"])

    if uploaded_file is not None:
        # Check if models are selected
        if not selected_models:
            st.error(
                "No models selected for prediction. Please select at least one model in the sidebar.")
            st.stop()

        required_columns = [
            'PID', 'CRM_PID_VALUE_SEGMENT', 'EFFECTIVESEGMENT', 'BILLING_ZIP',
            'KA_NAME', 'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS',
            'SUSPENDED_SUBSCRIBERS', 'TOTAL_SUBS', 'AVGMOBILEREVENUE',
            'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'
        ]

        df = process_uploaded_file(uploaded_file, required_columns)
        if df is None:
            st.stop()

        # Standardize required columns to uppercase and strip spaces
        required_columns_upper = [
            col.upper().strip() for col in required_columns]

        # Convert numerical columns to numeric data types
        numerical_columns = [
            'BILLING_ZIP', 'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS',
            'SUSPENDED_SUBSCRIBERS', 'TOTAL_SUBS', 'AVGMOBILEREVENUE',
            'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'
        ]
        for col in numerical_columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Fill missing values if any
        df.fillna({
            'CRM_PID_VALUE_SEGMENT': 'Unknown',
            'EFFECTIVESEGMENT': 'Unknown',
            'KA_NAME': 'Unknown',
            'BILLING_ZIP': 0,
            'ACTIVE_SUBSCRIBERS': 0,
            'NOT_ACTIVE_SUBSCRIBERS': 0,
            'SUSPENDED_SUBSCRIBERS': 0,
            'TOTAL_SUBS': 0,
            'AVGMOBILEREVENUE': 0.0,
            'AVGFIXREVENUE': 0.0,
            'TOTALREVENUE': 0.0,
            'ARPU': 0.0
        }, inplace=True)

        st.write("### Processing Batch Predictions...")

        raw_features = df[required_columns_upper[1:]]  # Exclude 'PID'
        transformed_features = None  # Computed once, on first need.
        model_outputs = {}
        failed_models = []

        for model_name in selected_models:
            model = models[model_name]
            try:
                if models_with_preprocessor[model_name]:
                    features = raw_features
                else:
                    if transformed_features is None:
                        transformed_features = preprocessor.transform(
                            raw_features)
                    features = transformed_features

                predictions = model.predict(features)
                model_outputs[model_name] = create_model_output(
                    df,
                    predictions,
                    model_name,
                    probabilities=_churn_probabilities(model, features),
                )
                st.success(f"Predictions completed for {model_name}")
            except Exception as e:
                failed_models.append(model_name)
                st.error(f"Error predicting with model {model_name}: {e}")

        if not model_outputs:
            st.error("No model produced predictions; nothing to download.")
            st.stop()
        if failed_models:
            st.warning(
                "Batch predictions completed, but these models failed: "
                + ", ".join(failed_models))
        else:
            st.success('Batch predictions completed for all selected models.')

        # Download options
        st.header('Download Predictions')
        download_option = st.radio(
            "Choose how to download your predictions:",
            ("All Models in Separate Files",
             "Churn and Non-Churn in Separate Files",
             "Download All at Once")
        )

        if download_option == "All Models in Separate Files":
            # Allow user to download the results for each model
            for model_name, output_df in model_outputs.items():
                st.download_button(
                    label=f"Download {model_name} Predictions as CSV",
                    data=_csv_bytes(output_df),
                    file_name=f'{_file_stem(model_name)}_predictions.csv',
                    mime='text/csv',
                )

        elif download_option == "Churn and Non-Churn in Separate Files":
            # Split each model's results into churn and non-churn files
            for model_name, output_df in model_outputs.items():
                churn_df = output_df[output_df['Churn'] == 'Yes']
                non_churn_df = output_df[output_df['Churn'] == 'No']

                st.download_button(
                    label=f"Download {model_name} Churn Predictions as CSV",
                    data=_csv_bytes(churn_df),
                    file_name=f'{_file_stem(model_name)}_churn_predictions.csv',
                    mime='text/csv',
                )
                st.download_button(
                    label=f"Download {model_name} Non-Churn Predictions as CSV",
                    data=_csv_bytes(non_churn_df),
                    file_name=f'{_file_stem(model_name)}_non_churn_predictions.csv',
                    mime='text/csv',
                )

        elif download_option == "Download All at Once":
            # Create a zip file containing all outputs
            zip_buffer = io.BytesIO()
            with zipfile.ZipFile(zip_buffer, "w") as zip_file:
                for model_name, output_df in model_outputs.items():
                    zip_file.writestr(
                        f'{_file_stem(model_name)}_predictions.csv',
                        _csv_bytes(output_df))

            st.download_button(
                label="Download All Predictions as ZIP",
                data=zip_buffer.getvalue(),
                file_name='all_model_predictions.zip',
                mime='application/zip'
            )
    else:
        st.info('Awaiting CSV or Excel file to be uploaded.')