# hhhar's picture
# Update app.py
# f9e2d27 verified
import streamlit as st
import joblib
import numpy as np
import os
import pandas as pd
import openpyxl
import zipfile
import io
# Load the shared preprocessor at module level (i.e. each time this script
# runs). It is applied to the input before predicting with any model that
# does not bundle its own preprocessing step.
preprocessor_path = 'modelExports/preprocessor.pkl'
preprocessor = joblib.load(preprocessor_path)
def find_header_row(df, required_columns, max_rows_to_check=10):
    """Locate the row of ``df`` that holds the column headers.

    Cell values and required names are compared after converting to
    upper case and stripping surrounding whitespace.

    Args:
        df: Frame read without a header row.
        required_columns: Names that must all appear in the header row.
        max_rows_to_check: How many leading rows to inspect.

    Returns:
        The positional index of the first matching row, or -1 when none
        of the inspected rows contains every required name.
    """
    wanted = {name.upper().strip() for name in required_columns}
    rows_to_scan = min(max_rows_to_check, len(df))

    def holds_all_headers(position):
        cells = {str(cell).upper().strip() for cell in df.iloc[position].values}
        return wanted <= cells

    return next(
        (position for position in range(rows_to_scan)
         if holds_all_headers(position)),
        -1,
    )
def process_uploaded_file(uploaded_file, required_columns):
    """Read an uploaded CSV/Excel file into a DataFrame with normalized headers.

    The first 10 rows are scanned for a row containing every required
    column name (case-insensitive, whitespace-insensitive); the file is then
    re-read using that row as the header. Column names of the result are
    upper-cased and stripped.

    Args:
        uploaded_file: File-like object exposing ``name`` and ``seek``.
        required_columns: Column names that must be present in the file.

    Returns:
        The parsed DataFrame, or ``None`` after reporting the problem through
        ``st.error`` when the format is unsupported, the header row cannot be
        found, required columns are missing, or reading fails.
    """
    try:
        file_extension = uploaded_file.name.split('.')[-1].lower()
        if file_extension not in ('csv', 'xlsx', 'xls'):
            st.error("Unsupported file format. Please upload a CSV or Excel file.")
            return None

        # openpyxl only reads the newer .xlsx format; for legacy .xls let
        # pandas choose the engine itself instead of forcing a failure.
        excel_engine = 'openpyxl' if file_extension == 'xlsx' else None

        def read_table(**options):
            # Always rewind first: the buffer may already have been consumed
            # by an earlier read of the same upload.
            uploaded_file.seek(0)
            if file_extension == 'csv':
                return pd.read_csv(uploaded_file, **options)
            return pd.read_excel(uploaded_file, engine=excel_engine, **options)

        # Read the first few rows without a header to locate the header row
        preview = read_table(nrows=10, header=None)
        header_row = find_header_row(preview, required_columns)
        if header_row == -1:
            st.error(f"Required columns not found in the first {len(preview)} rows.")
            st.write("Expected columns:", required_columns)
            st.write("Found data rows:", preview.head().values.tolist())
            return None

        # Re-read the whole file with the detected header row
        df = read_table(header=header_row)

        # Normalize names via str() so numeric or otherwise non-string
        # header cells do not break the `.str` accessor.
        df.columns = [str(name).upper().strip() for name in df.columns]
        required_columns_upper = [col.upper().strip() for col in required_columns]

        missing_columns = [col for col in required_columns_upper if col not in df.columns]
        if missing_columns:
            st.error(f"The following required columns are missing: {missing_columns}")
            return None

        st.write(f"Uploaded data has {df.shape[0]} rows and {df.shape[1]} columns.")
        return df
    except Exception as e:
        # UI boundary: surface any parsing problem to the user instead of crashing
        st.error(f"Error reading the file: {e}")
        return None
def predict_with_model(model, data, includes_preprocessor):
    """Return ``model.predict`` for ``data``.

    When ``includes_preprocessor`` is falsy, ``data`` is first passed through
    the module-level ``preprocessor``; otherwise it is handed to the model
    unchanged.
    """
    model_input = data if includes_preprocessor else preprocessor.transform(data)
    return model.predict(model_input)
def create_model_output(df, predictions, model_name, probabilities=None):
    """Build the per-model result table for a batch prediction.

    Args:
        df: Input frame; only its ``PID`` column is copied into the result.
        predictions: Predicted class labels, one per row of ``df``. A label
            equal to ``1`` is reported as churn.
        model_name: Name of the model that produced ``predictions``. Accepted
            for call compatibility; it is not used in the table.
        probabilities: Optional churn probabilities, one per row of ``df``.
            When omitted, the ``Churn Probability`` column repeats the class
            labels, which keeps the output of existing callers unchanged.

    Returns:
        A new DataFrame with the columns ``PID``, ``Prediction``,
        ``Churn Probability`` and ``Churn`` ('Yes' / 'No').
    """
    output_df = df[['PID']].copy()  # copy so the caller's frame is not modified
    output_df['Prediction'] = predictions
    output_df['Churn Probability'] = (
        predictions if probabilities is None else probabilities)
    # Vectorised label mapping instead of a Python-level apply per row
    output_df['Churn'] = output_df['Prediction'].eq(1).map(
        {True: 'Yes', False: 'No'})
    return output_df
# Load every exported model from the model folder and record, per model,
# whether it bundles its own 'preprocessor' step (in which case it must be
# fed raw data rather than preprocessed data).
# NOTE(review): joblib.load unpickles the files; only trusted model exports
# should ever be placed in this folder.
model_folder = 'modelExports'
models = {}
models_with_preprocessor = {}
# os.listdir returns entries in arbitrary order; sort so the model list in
# the UI is stable between runs and machines.
for file_name in sorted(os.listdir(model_folder)):
    stem, extension = os.path.splitext(file_name)
    if extension != '.pkl' or file_name == 'preprocessor.pkl':
        continue
    # Display name: file stem with underscores as spaces, upper-cased.
    # It doubles as the lookup key into model_accuracies.
    model_name = stem.replace('_', ' ').upper()
    model = joblib.load(os.path.join(model_folder, file_name))
    models[model_name] = model
    # Objects exposing `named_steps` with a 'preprocessor' entry are treated
    # as self-contained pipelines.
    includes_preprocessor = (
        hasattr(model, 'named_steps') and 'preprocessor' in model.named_steps)
    models_with_preprocessor[model_name] = includes_preprocessor
# Accuracy (in percent) per model display name. The values serve as vote
# weights for the combined single prediction and are listed in the sidebar.
# Keys have to match the names derived from the model file names.
# NOTE(review): the figures are hard-coded here; confirm they still match
# the exported models.
model_accuracies = {
    "GAUSSIAN NAIVE BAYES WITH SMOTE MODEL": 86,
    "GAUSSIAN NAIVE BAYES WITHOUT SMOTE MODEL": 85,
    "GRADIENT BOOSTING WITH SMOTE MODEL": 95,
    "GRADIENT BOOSTING WITHOUT SMOTE MODEL": 94,
    "LINEAR DISCRIMINANT ANALYSIS WITH SMOTE MODEL": 88,
    "LINEAR DISCRIMINANT ANALYSIS WITHOUT SMOTE MODEL": 87,
    "LOGISTIC REGRESSION WITH SMOTE MODEL": 90,
    "LOGISTIC REGRESSION WITHOUT SMOTE MODEL": 89,
    "RANDOM FOREST WITH SMOTE MODEL": 95,
    "RANDOM FOREST WITHOUT SMOTE MODEL": 93,
    "SUPPORT VECTOR MACHINE WITH SMOTE MODEL": 91,
    "SUPPORT VECTOR MACHINE WITHOUT SMOTE MODEL": 90,
    "ADABOOST WITH SMOTE MODEL": 92,
    "ADABOOST WITHOUT SMOTE MODEL": 90,
    "DECISION TREE WITH SMOTE MODEL": 88,
    "DECISION TREE WITHOUT SMOTE MODEL": 86,
}
# Page title
st.title('Customer Churn Prediction')

# Sidebar: choose between the single-customer form and the batch upload
st.sidebar.header('Interface Selection')
interface = st.sidebar.radio(
    "Choose an interface",
    ("Single Prediction", "Batch Prediction"),
)

# Sidebar: choose which loaded models take part; all are preselected
st.sidebar.header('Model Selection')
selected_models = st.sidebar.multiselect(
    'Select models for prediction',
    list(models),
    default=list(models),
)
# Choices offered by the select boxes of the single-prediction form.
# NOTE(review): 'Sliver' looks like a misspelling of 'Silver'; it is kept
# as-is because the models may have been trained on that literal — confirm.
crm_pid_value_segment_options = [
    'Bronze', 'Iron', 'Gold', 'Silver', 'Lead',
    'Platinum', 'SME', 'SE', 'Sliver', 'Unknown',
]
effective_segment_options = [
    'SOHO', 'VSE', 'Other', 'SME', 'LE', 'SE',
]
ka_name_options = [
    'Vladimir Manahilov',
    'Desislava Ivanova',
    'Martin Tilev',
    'Anna Dimitrova',
    'Rumiana Jordanova',
    'Anna Dimova',
    'Vania Uzunova',
    'Varta Torosian',
    'Daniela Stefanova',
    'Ginka Vachkova',
    'Tatiana Trifonova',
    'Jenia Gogova',
    'Unknown',
]
if interface == "Single Prediction":
    # Single-customer form: collect one record, let every selected model
    # vote, and combine the votes weighted by each model's listed accuracy.
    st.header('Enter New Customer Data')

    # Collected form values, keyed by feature name (insertion order is the
    # column order of the frame built below).
    input_data = {}

    # Categorical inputs
    input_data['CRM_PID_VALUE_SEGMENT'] = st.selectbox(
        'CRM_PID_VALUE_SEGMENT', crm_pid_value_segment_options)
    input_data['EFFECTIVESEGMENT'] = st.selectbox(
        'EFFECTIVESEGMENT', effective_segment_options)
    input_data['KA_NAME'] = st.selectbox('KA_NAME', ka_name_options)

    # Numerical inputs: non-negative whole numbers
    for field in ('BILLING_ZIP', 'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS',
                  'SUSPENDED_SUBSCRIBERS', 'TOTAL_SUBS'):
        input_data[field] = st.number_input(field, min_value=0, format="%d")

    # Numerical inputs: non-negative amounts shown with two decimals
    for field in ('AVGMOBILEREVENUE', 'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'):
        input_data[field] = st.number_input(field, min_value=0.0, format="%.2f")

    # Predict churn
    if st.button('Predict Churn'):
        # One-row frame with standardized (upper-case, stripped) column names
        input_df = pd.DataFrame([input_data])
        input_df.columns = input_df.columns.str.upper().str.strip()

        st.write("### Model Predictions")
        predictions = {}
        weighted_votes = {'Churn': 0, 'No Churn': 0}

        # Preprocess only if some model actually needs it, and do so inside
        # the try below so a failing transform is reported per model instead
        # of aborting the whole page.
        input_data_transformed = None

        for model_name in selected_models:
            model = models[model_name]
            includes_preprocessor = models_with_preprocessor[model_name]
            try:
                if includes_preprocessor:
                    # Model includes preprocessor; use raw data
                    prediction = model.predict(input_df)
                else:
                    # Model does not include preprocessor; use preprocessed data
                    if input_data_transformed is None:
                        input_data_transformed = preprocessor.transform(input_df)
                    prediction = model.predict(input_data_transformed)
            except Exception as e:
                st.error(f"Error predicting with model {model_name}: {e}")
                continue

            churn_prediction = 'Churn' if prediction[0] == 1 else 'No Churn'
            predictions[model_name] = churn_prediction

            # Weighted vote: a model without a listed accuracy still votes
            # with weight 1, but is not displayed as "1%" accurate.
            accuracy = model_accuracies.get(model_name)
            if accuracy is None:
                weight = 1
                accuracy_label = "unknown"
            else:
                weight = accuracy
                accuracy_label = f"{accuracy}%"
            weighted_votes[churn_prediction] += weight

            # Display individual model predictions
            st.write(
                f"**{model_name}:** {churn_prediction} (Accuracy: {accuracy_label})")

        # Calculate and display the overall prediction
        total_weight = sum(weighted_votes.values())
        if total_weight == 0:
            # No model selected, or every selected model failed
            st.error(
                "No valid predictions were made. Cannot compute churn probability.")
        else:
            # Share of the total vote weight that went to 'Churn'
            churn_probability = weighted_votes['Churn'] / total_weight
            overall_prediction = 'Churn' if churn_probability > 0.5 else 'No Churn'
            st.write("### Overall Prediction")
            st.write(f"**Final Prediction:** {overall_prediction}")
            st.write(f"**Churn Probability:** {churn_probability:.2%}")
            st.write(f"**No Churn Probability:** {1 - churn_probability:.2%}")

            # Visualize the predictions
            st.write("### Prediction Visualization")
            chart_data = pd.DataFrame(
                {
                    'Prediction': ['Churn', 'No Churn'],
                    'Weighted Vote': [
                        weighted_votes['Churn'],
                        weighted_votes['No Churn'],
                    ],
                }
            )
            st.bar_chart(chart_data.set_index('Prediction'))
elif interface == "Batch Prediction":
# Batch Prediction Interface
st.header('Batch Prediction')
st.write('Upload a CSV or Excel file containing customer data.')
uploaded_file = st.file_uploader(
"Choose a CSV or Excel file", type=["csv", "xlsx", "xls"])
if uploaded_file is not None:
# Check if models are selected
if not selected_models:
st.error(
"No models selected for prediction. Please select at least one model in the sidebar.")
st.stop()
required_columns = [
'PID', 'CRM_PID_VALUE_SEGMENT', 'EFFECTIVESEGMENT', 'BILLING_ZIP', 'KA_NAME',
'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS', 'SUSPENDED_SUBSCRIBERS',
'TOTAL_SUBS', 'AVGMOBILEREVENUE', 'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'
]
df = process_uploaded_file(uploaded_file, required_columns)
if df is None:
st.stop()
# Standardize required columns to uppercase and strip spaces
required_columns_upper = [col.upper().strip() for col in required_columns]
# Convert numerical columns to numeric data types
numerical_columns = [
'BILLING_ZIP', 'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS',
'SUSPENDED_SUBSCRIBERS', 'TOTAL_SUBS', 'AVGMOBILEREVENUE',
'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'
]
for col in numerical_columns:
df[col] = pd.to_numeric(df[col], errors='coerce')
# Fill missing values if any
df.fillna({
'CRM_PID_VALUE_SEGMENT': 'Unknown',
'EFFECTIVESEGMENT': 'Unknown',
'KA_NAME': 'Unknown',
'BILLING_ZIP': 0,
'ACTIVE_SUBSCRIBERS': 0,
'NOT_ACTIVE_SUBSCRIBERS': 0,
'SUSPENDED_SUBSCRIBERS': 0,
'TOTAL_SUBS': 0,
'AVGMOBILEREVENUE': 0.0,
'AVGFIXREVENUE': 0.0,
'TOTALREVENUE': 0.0,
'ARPU': 0.0
}, inplace=True)
st.write("### Processing Batch Predictions...")
model_outputs = {}
for model_name in selected_models:
model = models[model_name]
includes_preprocessor = models_with_preprocessor[model_name]
try:
# Prepare data for prediction
data_for_prediction = df[required_columns_upper[1:]] # Exclude 'PID'
if not includes_preprocessor:
data_for_prediction = preprocessor.transform(data_for_prediction)
predictions = model.predict(data_for_prediction)
model_outputs[model_name] = create_model_output(
df, predictions, model_name)
st.success(f"Predictions completed for {model_name}")
except Exception as e:
st.error(f"Error predicting with model {model_name}: {e}")
st.success('Batch predictions completed for all selected models.')
# Download options
st.header('Download Predictions')
download_option = st.radio(
"Choose how to download your predictions:",
("All Models in Separate Files", "Churn and Non-Churn in Separate Files", "Download All at Once")
)
if download_option == "All Models in Separate Files":
# Allow user to download the results for each model
for model_name, output_df in model_outputs.items():
csv = output_df.to_csv(index=False).encode('utf-8')
st.download_button(
label=f"Download {model_name} Predictions as CSV",
data=csv,
file_name=f'{model_name.lower().replace(" ", "_")}_predictions.csv',
mime='text/csv',
)
elif download_option == "Churn and Non-Churn in Separate Files":
# Consolidate results for all models and split into churn and non-churn files
for model_name, output_df in model_outputs.items():
churn_df = output_df[output_df['Churn'] == 'Yes']
non_churn_df = output_df[output_df['Churn'] == 'No']
churn_csv = churn_df.to_csv(index=False).encode('utf-8')
non_churn_csv = non_churn_df.to_csv(index=False).encode('utf-8')
st.download_button(
label=f"Download {model_name} Churn Predictions as CSV",
data=churn_csv,
file_name=f'{model_name.lower().replace(" ", "_")}_churn_predictions.csv',
mime='text/csv',
)
st.download_button(
label=f"Download {model_name} Non-Churn Predictions as CSV",
data=non_churn_csv,
file_name=f'{model_name.lower().replace(" ", "_")}_non_churn_predictions.csv',
mime='text/csv',
)
elif download_option == "Download All at Once":
# Create a zip file containing all outputs
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, "w") as zip_file:
for model_name, output_df in model_outputs.items():
csv_data = output_df.to_csv(index=False).encode('utf-8')
zip_file.writestr(f'{model_name.lower().replace(" ", "_")}_predictions.csv', csv_data)
zip_buffer.seek(0)
st.download_button(
label="Download All Predictions as ZIP",
data=zip_buffer,
file_name='all_model_predictions.zip',
mime='application/zip'
)
else:
st.info('Awaiting CSV or Excel file to be uploaded.')
# Sidebar information: how many models were loaded and selected, followed
# by the listed accuracy of every entry in model_accuracies.
st.sidebar.write("### Model Information")
st.sidebar.write(f"Total models available: {len(models)}")
st.sidebar.write(f"Models selected for prediction: {len(selected_models)}")
st.sidebar.write("### Model Accuracies")
# Dedicated loop name so the module-level `model` object is not overwritten
# with a string.
for listed_model_name, accuracy in model_accuracies.items():
    st.sidebar.write(f"{listed_model_name}: {accuracy}%")