Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import joblib | |
| import numpy as np | |
| import os | |
| import pandas as pd | |
| import openpyxl | |
| import zipfile | |
| import io | |
# Shared preprocessing object, loaded once when the script starts. It is
# applied (via ``transform``) to raw input for models that do not bundle
# their own preprocessing step.
# NOTE(review): joblib.load unpickles the file, so it must only ever point at
# a trusted artifact. Presumably this object is already fitted -- confirm.
preprocessor_path = 'modelExports/preprocessor.pkl'
preprocessor = joblib.load(preprocessor_path)
def find_header_row(df, required_columns, max_rows_to_check=10):
    """Locate the row of *df* that holds the column headers.

    A row qualifies when, after converting each cell to ``str``, upper-casing
    and stripping it, the row contains every name in *required_columns*
    (normalised the same way).

    Args:
        df: DataFrame read without a header row.
        required_columns: Names that must all appear in the header row.
        max_rows_to_check: How many leading rows to inspect at most.

    Returns:
        The positional index of the first qualifying row, or ``-1`` when none
        of the inspected rows qualifies.
    """
    wanted = {name.upper().strip() for name in required_columns}
    scan_limit = min(max_rows_to_check, len(df))

    for position in range(scan_limit):
        cells = {str(cell).upper().strip() for cell in df.iloc[position].values}
        if wanted.issubset(cells):
            return position  # Header row found at this position

    return -1  # Header row not found
def process_uploaded_file(uploaded_file, required_columns):
    """Read an uploaded CSV/Excel file into a DataFrame with normalised headers.

    The first rows of the file are scanned for a row containing every required
    column name (case-insensitive, surrounding whitespace ignored). The file
    is then re-read using that row as the header, and the resulting column
    labels are upper-cased and stripped.

    Args:
        uploaded_file: File-like object with ``name`` and ``seek`` that pandas
            can read from (as returned by ``st.file_uploader``).
        required_columns: Column names that must be present in the file.

    Returns:
        The loaded DataFrame, or ``None`` when the format is unsupported, the
        header row or a required column cannot be found, or reading fails.
        Every failure is reported to the user through ``st.error``.
    """
    # Number of leading rows inspected when looking for the header row.
    header_scan_rows = 10

    try:
        file_extension = uploaded_file.name.split('.')[-1].lower()
        if file_extension not in ('csv', 'xlsx', 'xls'):
            st.error("Unsupported file format. Please upload a CSV or Excel file.")
            return None

        # openpyxl only reads the newer .xlsx format; for legacy .xls
        # workbooks let pandas pick its default engine instead of forcing
        # openpyxl, which would always fail on them.
        excel_engine = 'openpyxl' if file_extension == 'xlsx' else None

        def _read(header, nrows=None):
            """Read the upload with the reader matching its extension."""
            if file_extension == 'csv':
                return pd.read_csv(uploaded_file, header=header, nrows=nrows)
            return pd.read_excel(
                uploaded_file, header=header, nrows=nrows, engine=excel_engine)

        # Read the first few rows without a header to find where it really is.
        preview = _read(header=None, nrows=header_scan_rows)
        header_row = find_header_row(
            preview, required_columns, max_rows_to_check=header_scan_rows)
        if header_row == -1:
            st.error(f"Required columns not found in the first {len(preview)} rows.")
            st.write("Expected columns:", required_columns)
            st.write("Found data rows:", preview.head().values.tolist())
            return None

        # Re-read the whole file with the detected header row.
        uploaded_file.seek(0)  # Reset file pointer after the preview read
        df = _read(header=header_row)

        # Normalise labels with str() (not the .str accessor) so non-string
        # header cells are handled the same way find_header_row treats them
        # instead of turning into NaN labels.
        df.columns = [str(label).upper().strip() for label in df.columns]

        required_columns_upper = [col.upper().strip() for col in required_columns]
        missing_columns = [col for col in required_columns_upper if col not in df.columns]
        if missing_columns:
            st.error(f"The following required columns are missing: {missing_columns}")
            return None

        st.write(f"Uploaded data has {df.shape[0]} rows and {df.shape[1]} columns.")
        return df
    except Exception as e:
        # UI boundary: surface any read/parse failure to the user rather than
        # crashing the app.
        st.error(f"Error reading the file: {e}")
        return None
def predict_with_model(model, data, includes_preprocessor):
    """Run ``model.predict`` on *data*, preprocessing it first when required.

    Args:
        model: Object exposing ``predict``.
        data: Raw input to predict on.
        includes_preprocessor: Truthy when *model* handles preprocessing
            itself and must receive *data* unchanged; otherwise *data* is
            first passed through the module-level ``preprocessor.transform``.

    Returns:
        Whatever ``model.predict`` returns.
    """
    features = data if includes_preprocessor else preprocessor.transform(data)
    return model.predict(features)
def create_model_output(df, predictions, model_name):
    """Build the per-model result table for a batch of predictions.

    Args:
        df: Input DataFrame; only its ``PID`` column is carried over.
        predictions: One predicted value per row of *df*.
        model_name: Accepted for interface compatibility; not used here.

    Returns:
        A new DataFrame with columns ``PID``, ``Prediction``,
        ``Churn Probability`` and ``Churn`` ('Yes' where the prediction
        equals 1, 'No' otherwise).
    """
    def _label(value):
        return 'Yes' if value == 1 else 'No'

    # Copy so the caller's DataFrame is never modified.
    output_df = df[['PID']].copy()
    output_df['Prediction'] = predictions
    # NOTE(review): this column receives the same values as 'Prediction',
    # not a separately computed probability -- confirm this is intended.
    output_df['Churn Probability'] = predictions
    output_df['Churn'] = output_df['Prediction'].map(_label)
    return output_df
# Load every exported model from the model folder and record, per model,
# whether it bundles its own preprocessing step.
model_folder = 'modelExports'
models = {}
models_with_preprocessor = {}

# os.listdir returns entries in arbitrary order; sort them so the models are
# always listed (and iterated) in the same order.
for file_name in sorted(os.listdir(model_folder)):
    if not file_name.endswith('.pkl') or file_name == 'preprocessor.pkl':
        continue

    # Strip only the trailing extension; a '.pkl' occurring elsewhere in the
    # file name is kept as part of the model name.
    model_name = os.path.splitext(file_name)[0].replace('_', ' ').upper()

    # NOTE(review): joblib.load unpickles the file, so the folder must only
    # contain trusted artifacts.
    model = joblib.load(os.path.join(model_folder, file_name))
    models[model_name] = model

    # A model counts as self-contained when it exposes ``named_steps`` with a
    # step called 'preprocessor'.
    includes_preprocessor = hasattr(
        model, 'named_steps') and 'preprocessor' in model.named_steps
    models_with_preprocessor[model_name] = includes_preprocessor
# Accuracy (in percent) per model name. These figures are shown in the UI and
# used as the vote weight of each model when combining single predictions.
model_accuracies = {
    # Gaussian Naive Bayes
    "GAUSSIAN NAIVE BAYES WITH SMOTE MODEL": 86,
    "GAUSSIAN NAIVE BAYES WITHOUT SMOTE MODEL": 85,
    # Gradient Boosting
    "GRADIENT BOOSTING WITH SMOTE MODEL": 95,
    "GRADIENT BOOSTING WITHOUT SMOTE MODEL": 94,
    # Linear Discriminant Analysis
    "LINEAR DISCRIMINANT ANALYSIS WITH SMOTE MODEL": 88,
    "LINEAR DISCRIMINANT ANALYSIS WITHOUT SMOTE MODEL": 87,
    # Logistic Regression
    "LOGISTIC REGRESSION WITH SMOTE MODEL": 90,
    "LOGISTIC REGRESSION WITHOUT SMOTE MODEL": 89,
    # Random Forest
    "RANDOM FOREST WITH SMOTE MODEL": 95,
    "RANDOM FOREST WITHOUT SMOTE MODEL": 93,
    # Support Vector Machine
    "SUPPORT VECTOR MACHINE WITH SMOTE MODEL": 91,
    "SUPPORT VECTOR MACHINE WITHOUT SMOTE MODEL": 90,
    # AdaBoost
    "ADABOOST WITH SMOTE MODEL": 92,
    "ADABOOST WITHOUT SMOTE MODEL": 90,
    # Decision Tree
    "DECISION TREE WITH SMOTE MODEL": 88,
    "DECISION TREE WITHOUT SMOTE MODEL": 86,
}
# ---- Page and sidebar setup ------------------------------------------------
st.title('Customer Churn Prediction')

# Sidebar: which of the two interfaces to show.
st.sidebar.header('Interface Selection')
interface = st.sidebar.radio(
    "Choose an interface",
    ("Single Prediction", "Batch Prediction"),
)

# Sidebar: which loaded models take part in the prediction (all by default).
st.sidebar.header('Model Selection')
selected_models = st.sidebar.multiselect(
    'Select models for prediction',
    list(models),
    default=list(models),
)

# ---- Choices offered for the categorical inputs ------------------------------
# NOTE(review): both 'Silver' and 'Sliver' are listed; presumably 'Sliver'
# mirrors a category present in the training data -- confirm before removing.
crm_pid_value_segment_options = [
    'Bronze', 'Iron', 'Gold', 'Silver', 'Lead',
    'Platinum', 'SME', 'SE', 'Sliver', 'Unknown',
]
effective_segment_options = ['SOHO', 'VSE', 'Other', 'SME', 'LE', 'SE']
ka_name_options = [
    'Vladimir Manahilov', 'Desislava Ivanova', 'Martin Tilev',
    'Anna Dimitrova', 'Rumiana Jordanova', 'Anna Dimova',
    'Vania Uzunova', 'Varta Torosian', 'Daniela Stefanova',
    'Ginka Vachkova', 'Tatiana Trifonova', 'Jenia Gogova', 'Unknown',
]
# Render the interface chosen in the sidebar.
if interface == "Single Prediction":
    # ---- Single prediction: one customer entered through form widgets ------
    st.header('Enter New Customer Data')

    # Collect input data, keyed by the feature names the models expect.
    input_data = {}

    # Categorical inputs
    input_data['CRM_PID_VALUE_SEGMENT'] = st.selectbox(
        'CRM_PID_VALUE_SEGMENT', crm_pid_value_segment_options)
    input_data['EFFECTIVESEGMENT'] = st.selectbox(
        'EFFECTIVESEGMENT', effective_segment_options)
    input_data['KA_NAME'] = st.selectbox('KA_NAME', ka_name_options)

    # Integer-valued numerical inputs
    for field in ('BILLING_ZIP', 'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS',
                  'SUSPENDED_SUBSCRIBERS', 'TOTAL_SUBS'):
        input_data[field] = st.number_input(field, min_value=0, format="%d")

    # Decimal-valued numerical inputs
    for field in ('AVGMOBILEREVENUE', 'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'):
        input_data[field] = st.number_input(field, min_value=0.0, format="%.2f")

    # Predict churn
    if st.button('Predict Churn'):
        # One-row DataFrame with upper-cased, stripped column names.
        input_df = pd.DataFrame([input_data])
        input_df.columns = input_df.columns.str.upper().str.strip()

        st.write("### Model Predictions")
        predictions = {}
        weighted_votes = {'Churn': 0, 'No Churn': 0}

        for model_name in selected_models:
            model = models[model_name]
            includes_preprocessor = models_with_preprocessor[model_name]
            try:
                # predict_with_model applies the shared preprocessor only for
                # models that need it. Doing this inside the try means a
                # preprocessing failure is reported for that model instead of
                # crashing the whole page.
                prediction = predict_with_model(
                    model, input_df, includes_preprocessor)
            except Exception as e:
                st.error(f"Error predicting with model {model_name}: {e}")
                continue

            churn_prediction = 'Churn' if prediction[0] == 1 else 'No Churn'
            predictions[model_name] = churn_prediction

            # Each model votes with a weight equal to its listed accuracy
            # (1 when the model has no listed accuracy).
            weight = model_accuracies.get(model_name, 1)
            weighted_votes[churn_prediction] += weight

            # Display individual model predictions
            st.write(
                f"**{model_name}:** {churn_prediction} (Accuracy: {weight}%)")

        # Calculate and display the overall prediction
        total_weight = sum(weighted_votes.values())
        if total_weight == 0:
            st.error(
                "No valid predictions were made. Cannot compute churn probability.")
        else:
            # Share of the total vote weight that went to 'Churn'.
            churn_probability = weighted_votes['Churn'] / total_weight
            overall_prediction = 'Churn' if churn_probability > 0.5 else 'No Churn'

            st.write("### Overall Prediction")
            st.write(f"**Final Prediction:** {overall_prediction}")
            st.write(f"**Churn Probability:** {churn_probability:.2%}")
            st.write(f"**No Churn Probability:** {1 - churn_probability:.2%}")

            # Visualize the weighted votes
            st.write("### Prediction Visualization")
            chart_data = pd.DataFrame(
                {
                    'Prediction': ['Churn', 'No Churn'],
                    'Weighted Vote': [
                        weighted_votes['Churn'],
                        weighted_votes['No Churn'],
                    ],
                }
            )
            st.bar_chart(chart_data.set_index('Prediction'))

elif interface == "Batch Prediction":
    # ---- Batch prediction: many customers from an uploaded file ------------
    st.header('Batch Prediction')
    st.write('Upload a CSV or Excel file containing customer data.')
    uploaded_file = st.file_uploader(
        "Choose a CSV or Excel file", type=["csv", "xlsx", "xls"])

    if uploaded_file is not None:
        # Nothing to do without at least one model.
        if not selected_models:
            st.error(
                "No models selected for prediction. Please select at least one model in the sidebar.")
            st.stop()

        required_columns = [
            'PID', 'CRM_PID_VALUE_SEGMENT', 'EFFECTIVESEGMENT', 'BILLING_ZIP', 'KA_NAME',
            'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS', 'SUSPENDED_SUBSCRIBERS',
            'TOTAL_SUBS', 'AVGMOBILEREVENUE', 'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'
        ]
        df = process_uploaded_file(uploaded_file, required_columns)
        if df is None:
            # process_uploaded_file has already reported the problem.
            st.stop()

        # Standardize required columns to uppercase and strip spaces
        required_columns_upper = [col.upper().strip() for col in required_columns]

        # Convert numerical columns to numeric types; unparseable values
        # become NaN and are replaced by the defaults below.
        numerical_columns = [
            'BILLING_ZIP', 'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS',
            'SUSPENDED_SUBSCRIBERS', 'TOTAL_SUBS', 'AVGMOBILEREVENUE',
            'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'
        ]
        for col in numerical_columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Fill missing values if any
        df.fillna({
            'CRM_PID_VALUE_SEGMENT': 'Unknown',
            'EFFECTIVESEGMENT': 'Unknown',
            'KA_NAME': 'Unknown',
            'BILLING_ZIP': 0,
            'ACTIVE_SUBSCRIBERS': 0,
            'NOT_ACTIVE_SUBSCRIBERS': 0,
            'SUSPENDED_SUBSCRIBERS': 0,
            'TOTAL_SUBS': 0,
            'AVGMOBILEREVENUE': 0.0,
            'AVGFIXREVENUE': 0.0,
            'TOTALREVENUE': 0.0,
            'ARPU': 0.0
        }, inplace=True)

        st.write("### Processing Batch Predictions...")
        model_outputs = {}
        failed_models = []
        for model_name in selected_models:
            model = models[model_name]
            includes_preprocessor = models_with_preprocessor[model_name]
            try:
                # Feature columns only: 'PID' is an identifier, not an input.
                data_for_prediction = df[required_columns_upper[1:]]
                predictions = predict_with_model(
                    model, data_for_prediction, includes_preprocessor)
                model_outputs[model_name] = create_model_output(
                    df, predictions, model_name)
            except Exception as e:
                st.error(f"Error predicting with model {model_name}: {e}")
                failed_models.append(model_name)
            else:
                st.success(f"Predictions completed for {model_name}")

        # Only claim overall success when every selected model succeeded, and
        # do not offer downloads when there is nothing to download.
        if not model_outputs:
            st.error(
                "None of the selected models produced predictions. Nothing to download.")
            st.stop()
        if failed_models:
            st.warning(
                f"Batch predictions completed for {len(model_outputs)} of "
                f"{len(selected_models)} selected models.")
        else:
            st.success('Batch predictions completed for all selected models.')

        # Download options
        st.header('Download Predictions')
        download_option = st.radio(
            "Choose how to download your predictions:",
            ("All Models in Separate Files", "Churn and Non-Churn in Separate Files", "Download All at Once")
        )

        if download_option == "All Models in Separate Files":
            # One CSV download button per model.
            for model_name, output_df in model_outputs.items():
                file_stem = model_name.lower().replace(" ", "_")
                csv = output_df.to_csv(index=False).encode('utf-8')
                st.download_button(
                    label=f"Download {model_name} Predictions as CSV",
                    data=csv,
                    file_name=f'{file_stem}_predictions.csv',
                    mime='text/csv',
                )

        elif download_option == "Churn and Non-Churn in Separate Files":
            # Per model, split the results into a churn file and a non-churn
            # file and offer one download button for each.
            for model_name, output_df in model_outputs.items():
                file_stem = model_name.lower().replace(" ", "_")
                churn_df = output_df[output_df['Churn'] == 'Yes']
                non_churn_df = output_df[output_df['Churn'] == 'No']
                churn_csv = churn_df.to_csv(index=False).encode('utf-8')
                non_churn_csv = non_churn_df.to_csv(index=False).encode('utf-8')
                st.download_button(
                    label=f"Download {model_name} Churn Predictions as CSV",
                    data=churn_csv,
                    file_name=f'{file_stem}_churn_predictions.csv',
                    mime='text/csv',
                )
                st.download_button(
                    label=f"Download {model_name} Non-Churn Predictions as CSV",
                    data=non_churn_csv,
                    file_name=f'{file_stem}_non_churn_predictions.csv',
                    mime='text/csv',
                )

        elif download_option == "Download All at Once":
            # Bundle one CSV per model into a single in-memory zip archive.
            zip_buffer = io.BytesIO()
            with zipfile.ZipFile(zip_buffer, "w") as zip_file:
                for model_name, output_df in model_outputs.items():
                    file_stem = model_name.lower().replace(" ", "_")
                    csv_data = output_df.to_csv(index=False).encode('utf-8')
                    zip_file.writestr(f'{file_stem}_predictions.csv', csv_data)
            zip_buffer.seek(0)  # Rewind so the download starts at the first byte
            st.download_button(
                label="Download All Predictions as ZIP",
                data=zip_buffer,
                file_name='all_model_predictions.zip',
                mime='application/zip'
            )
    else:
        st.info('Awaiting CSV or Excel file to be uploaded.')
# Sidebar footer: how many models are loaded and selected, followed by the
# listed accuracy of every model in the accuracy table.
st.sidebar.write("### Model Information")
st.sidebar.write(f"Total models available: {len(models)}")
st.sidebar.write(f"Models selected for prediction: {len(selected_models)}")

st.sidebar.write("### Model Accuracies")
for listed_model, listed_accuracy in model_accuracies.items():
    st.sidebar.write(f"{listed_model}: {listed_accuracy}%")