Spaces:

hhhar
/

ChurnPredUpdated

Sleeping

App Files Files Community

ChurnPredUpdated / app.py

hhhar

Update app.py

f9e2d27 verified over 1 year ago

raw

history blame contribute delete

15.9 kB

	import streamlit as st
	import joblib
	import numpy as np
	import os
	import pandas as pd
	import openpyxl
	import zipfile
	import io

	# Load the preprocessor
	# The loaded object's ``transform`` is applied to raw feature frames before
	# they are handed to models that do not bundle a 'preprocessor' step.
	# NOTE(review): joblib.load unpickles the file, so this artifact has to come
	# from a trusted source.
	preprocessor_path = 'modelExports/preprocessor.pkl'
	preprocessor = joblib.load(preprocessor_path)

	def find_header_row(df, required_columns, max_rows_to_check=10):
	    """Return the index of the first row that contains every required column.

	    The frame is expected to have been read without a header, so the real
	    header line is still an ordinary data row. Cell values and required
	    names are compared after ``str()``, upper-casing and stripping.

	    Args:
	        df: DataFrame whose leading rows are inspected.
	        required_columns: Iterable of column names that must all appear in
	            the header row.
	        max_rows_to_check: Maximum number of leading rows to inspect.

	    Returns:
	        The zero-based position of the header row, or ``-1`` when none of
	        the inspected rows contains all required names.
	    """
	    # str() keeps non-string names (e.g. numeric labels) from raising.
	    required = {str(col).upper().strip() for col in required_columns}
	    rows_to_check = min(max_rows_to_check, len(df))
	    for i in range(rows_to_check):
	        row_values = {str(val).upper().strip() for val in df.iloc[i].values}
	        if required <= row_values:
	            return i  # Header row found at row i
	    return -1  # Header row not found

	def process_uploaded_file(uploaded_file, required_columns):
	    """Read an uploaded CSV/Excel file into a DataFrame with normalised headers.

	    The first rows are read without a header and scanned by
	    ``find_header_row``; the file is then re-read using the detected row as
	    the header. Column labels are upper-cased and stripped before they are
	    checked against ``required_columns``.

	    Args:
	        uploaded_file: File-like object with a ``name`` attribute and a
	            ``seek`` method; it is passed directly to the pandas readers.
	        required_columns: Column names that must be present in the file.

	    Returns:
	        The loaded DataFrame, or ``None`` when the extension is unsupported,
	        the header row or a required column is missing, or reading raises.
	        Every failure is reported through ``st.error``.
	    """
	    try:
	        file_extension = uploaded_file.name.split('.')[-1].lower()

	        if file_extension == 'csv':
	            read_table = pd.read_csv
	            reader_options = {}
	        elif file_extension == 'xlsx':
	            read_table = pd.read_excel
	            reader_options = {'engine': 'openpyxl'}
	        elif file_extension == 'xls':
	            # openpyxl cannot open legacy .xls workbooks; with engine=None
	            # pandas picks a reader that matches the file itself.
	            read_table = pd.read_excel
	            reader_options = {'engine': None}
	        else:
	            st.error("Unsupported file format. Please upload a CSV or Excel file.")
	            return None

	        # Read the first few rows to check for headers. Rewind first so a
	        # stream that was already consumed is still read from the start.
	        uploaded_file.seek(0)
	        df = read_table(uploaded_file, nrows=10, header=None, **reader_options)

	        header_row = find_header_row(df, required_columns)

	        if header_row == -1:
	            st.error(f"Required columns not found in the first {len(df)} rows.")
	            st.write("Expected columns:", required_columns)
	            st.write("Found data rows:", df.head().values.tolist())
	            return None

	        # Re-read the file with the correct header row
	        uploaded_file.seek(0)  # Reset file pointer
	        df = read_table(uploaded_file, header=header_row, **reader_options)

	        # Standardize column names to uppercase and strip spaces. str() keeps
	        # non-string labels (e.g. numeric Excel headers) from turning into NaN.
	        df.columns = [str(col).upper().strip() for col in df.columns]

	        # Standardize required columns to uppercase and strip spaces
	        required_columns_upper = [col.upper().strip() for col in required_columns]

	        # Check if all required columns are present
	        missing_columns = [col for col in required_columns_upper if col not in df.columns]
	        if missing_columns:
	            st.error(f"The following required columns are missing: {missing_columns}")
	            return None

	        st.write(f"Uploaded data has {df.shape[0]} rows and {df.shape[1]} columns.")

	        return df
	    except Exception as e:
	        # UI boundary: surface any read/parse failure to the user instead of
	        # letting it abort the script.
	        st.error(f"Error reading the file: {e}")
	        return None

	def predict_with_model(model, data, includes_preprocessor):
	    """Run ``model.predict`` on ``data``, preprocessing first when required.

	    Args:
	        model: Object exposing ``predict``.
	        data: Raw feature data.
	        includes_preprocessor: When true, ``data`` is passed to the model
	            unchanged; otherwise it is first passed through the module-level
	            ``preprocessor.transform``.

	    Returns:
	        Whatever ``model.predict`` returns.
	    """
	    if includes_preprocessor:
	        return model.predict(data)
	    return model.predict(preprocessor.transform(data))

	def create_model_output(df, predictions, model_name, probabilities=None):
	    """Build the per-model result table keyed by ``PID``.

	    Args:
	        df: Source DataFrame; only its ``PID`` column is copied.
	        predictions: One predicted label per row of ``df``, in row order.
	        model_name: Name of the model that produced the predictions. It is
	            accepted for call-site compatibility and not used in the table.
	        probabilities: Optional per-row churn probabilities. When omitted,
	            the 'Churn Probability' column repeats the predicted labels,
	            which is what callers passing three arguments have always got.

	    Returns:
	        A new DataFrame with columns ``PID``, ``Prediction``,
	        ``Churn Probability`` and ``Churn`` ('Yes' where the prediction
	        equals 1, otherwise 'No'). ``df`` itself is not modified.
	    """
	    # Assign by position: a Series with a different index would otherwise be
	    # aligned on labels and silently produce NaN rows.
	    labels = np.asarray(predictions)
	    scores = labels if probabilities is None else np.asarray(probabilities)

	    output_df = df[['PID']].copy()  # Keep PID from original dataframe
	    output_df['Prediction'] = labels
	    output_df['Churn Probability'] = scores
	    output_df['Churn'] = output_df['Prediction'].eq(1).map(
	        {True: 'Yes', False: 'No'})
	    return output_df

	# Load models and record whether they include the preprocessor.
	# ``models`` maps a display name to the loaded estimator and
	# ``models_with_preprocessor`` maps the same name to a bool.
	# NOTE(review): joblib.load unpickles each file, so the folder must only
	# contain trusted artifacts.
	model_folder = 'modelExports'
	models = {}
	models_with_preprocessor = {}
	# sorted(): os.listdir returns entries in arbitrary order, which would make
	# the order of the model list differ between runs/machines.
	for file_name in sorted(os.listdir(model_folder)):
	    if not file_name.endswith('.pkl') or file_name == 'preprocessor.pkl':
	        continue

	    # Strip only the trailing extension, then turn the file stem into the
	    # display name (underscores -> spaces, upper-cased).
	    model_name = os.path.splitext(file_name)[0].replace('_', ' ').upper()
	    model = joblib.load(os.path.join(model_folder, file_name))
	    models[model_name] = model

	    # Check if model includes preprocessor: it must expose ``named_steps``
	    # and have a step called 'preprocessor'.
	    includes_preprocessor = hasattr(
	        model, 'named_steps') and 'preprocessor' in model.named_steps
	    models_with_preprocessor[model_name] = includes_preprocessor

	# Model accuracies (percent), keyed by the display name that the loading loop
	# builds from each model's file name (underscores -> spaces, upper-cased).
	# The values are hard-coded here, not computed at run time. They are listed
	# in the sidebar and used as vote weights in the single-prediction interface,
	# where a model missing from this table falls back to a weight of 1.
	model_accuracies = {
	"GAUSSIAN NAIVE BAYES WITH SMOTE MODEL": 86,
	"GAUSSIAN NAIVE BAYES WITHOUT SMOTE MODEL": 85,
	"GRADIENT BOOSTING WITH SMOTE MODEL": 95,
	"GRADIENT BOOSTING WITHOUT SMOTE MODEL": 94,
	"LINEAR DISCRIMINANT ANALYSIS WITH SMOTE MODEL": 88,
	"LINEAR DISCRIMINANT ANALYSIS WITHOUT SMOTE MODEL": 87,
	"LOGISTIC REGRESSION WITH SMOTE MODEL": 90,
	"LOGISTIC REGRESSION WITHOUT SMOTE MODEL": 89,
	"RANDOM FOREST WITH SMOTE MODEL": 95,
	"RANDOM FOREST WITHOUT SMOTE MODEL": 93,
	"SUPPORT VECTOR MACHINE WITH SMOTE MODEL": 91,
	"SUPPORT VECTOR MACHINE WITHOUT SMOTE MODEL": 90,
	"ADABOOST WITH SMOTE MODEL": 92,
	"ADABOOST WITHOUT SMOTE MODEL": 90,
	"DECISION TREE WITH SMOTE MODEL": 88,
	"DECISION TREE WITHOUT SMOTE MODEL": 86
	}

	# Define the Streamlit app
	st.title('Customer Churn Prediction')

	# Sidebar for interface selection
	# The chosen label decides which branch of the interface code below runs.
	st.sidebar.header('Interface Selection')
	interface = st.sidebar.radio(
	"Choose an interface",
	("Single Prediction", "Batch Prediction")
	)

	# Sidebar for model selection
	# Every loaded model is offered and pre-selected; the user may deselect any.
	st.sidebar.header('Model Selection')
	selected_models = st.sidebar.multiselect(
	'Select models for prediction',
	list(models.keys()),
	default=list(models.keys())
	)

	# Define categorical options
	# These lists feed the select boxes of the single-prediction form.
	# NOTE(review): 'Sliver' is listed next to 'Silver' — presumably it mirrors a
	# value the models were trained on; confirm before "correcting" it.
	crm_pid_value_segment_options = ['Bronze', 'Iron', 'Gold', 'Silver', 'Lead',
	'Platinum', 'SME', 'SE', 'Sliver', 'Unknown']
	effective_segment_options = ['SOHO', 'VSE', 'Other', 'SME', 'LE', 'SE']
	ka_name_options = ['Vladimir Manahilov', 'Desislava Ivanova', 'Martin Tilev',
	'Anna Dimitrova', 'Rumiana Jordanova', 'Anna Dimova',
	'Vania Uzunova', 'Varta Torosian', 'Daniela Stefanova',
	'Ginka Vachkova', 'Tatiana Trifonova', 'Jenia Gogova', 'Unknown']

	# Main page: render either the single-customer form or the batch-upload flow,
	# depending on the interface chosen in the sidebar.
	if interface == "Single Prediction":
	    # Input fields for new customer data
	    st.header('Enter New Customer Data')

	    # Collect input data; insertion order fixes the column order of the
	    # one-row frame built when the button is pressed.
	    input_data = {}

	    # Categorical inputs
	    input_data['CRM_PID_VALUE_SEGMENT'] = st.selectbox(
	        'CRM_PID_VALUE_SEGMENT', crm_pid_value_segment_options)
	    input_data['EFFECTIVESEGMENT'] = st.selectbox(
	        'EFFECTIVESEGMENT', effective_segment_options)
	    input_data['KA_NAME'] = st.selectbox('KA_NAME', ka_name_options)

	    # Numerical inputs: integer-formatted fields first, then two-decimal ones
	    for field in ('BILLING_ZIP', 'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS',
	                  'SUSPENDED_SUBSCRIBERS', 'TOTAL_SUBS'):
	        input_data[field] = st.number_input(field, min_value=0, format="%d")
	    for field in ('AVGMOBILEREVENUE', 'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'):
	        input_data[field] = st.number_input(field, min_value=0.0, format="%.2f")

	    # Predict churn
	    if st.button('Predict Churn'):
	        # Convert input data to DataFrame
	        input_df = pd.DataFrame([input_data])

	        # Standardize column names to uppercase
	        input_df.columns = input_df.columns.str.upper().str.strip()

	        # Preprocess the data only if needed: the transform is computed the
	        # first time a model without its own preprocessor is reached, inside
	        # the per-model error handling, and then reused.
	        input_data_transformed = None

	        st.write("### Model Predictions")

	        predictions = {}
	        weighted_votes = {'Churn': 0, 'No Churn': 0}

	        for model_name in selected_models:
	            model = models[model_name]
	            includes_preprocessor = models_with_preprocessor[model_name]

	            try:
	                if includes_preprocessor:
	                    # Model includes preprocessor; use raw data
	                    prediction = model.predict(input_df)
	                else:
	                    # Model does not include preprocessor; use preprocessed data
	                    if input_data_transformed is None:
	                        input_data_transformed = preprocessor.transform(input_df)
	                    prediction = model.predict(input_data_transformed)
	            except Exception as e:
	                st.error(f"Error predicting with model {model_name}: {e}")
	                continue

	            churn_prediction = 'Churn' if prediction[0] == 1 else 'No Churn'
	            predictions[model_name] = churn_prediction

	            # Add weighted vote; models without a listed accuracy weigh 1
	            weight = model_accuracies.get(model_name, 1)
	            weighted_votes[churn_prediction] += weight

	            # Display individual model predictions
	            st.write(
	                f"{model_name}: {churn_prediction} (Accuracy: {weight}%)")

	        # Calculate and display the overall prediction. The "probability" is
	        # the share of accuracy-weighted votes cast for 'Churn'.
	        total_weight = sum(weighted_votes.values())
	        if total_weight == 0:
	            st.error(
	                "No valid predictions were made. Cannot compute churn probability.")
	        else:
	            churn_probability = weighted_votes['Churn'] / total_weight
	            overall_prediction = 'Churn' if churn_probability > 0.5 else 'No Churn'

	            st.write("### Overall Prediction")
	            st.write(f"Final Prediction: {overall_prediction}")
	            st.write(f"Churn Probability: {churn_probability:.2%}")
	            st.write(f"No Churn Probability: {1 - churn_probability:.2%}")

	            # Visualize the predictions (only when at least one model voted)
	            st.write("### Prediction Visualization")
	            chart_data = pd.DataFrame(
	                {
	                    'Prediction': ['Churn', 'No Churn'],
	                    'Weighted Vote': [
	                        weighted_votes['Churn'],
	                        weighted_votes['No Churn']
	                    ]
	                }
	            )
	            st.bar_chart(chart_data.set_index('Prediction'))

	elif interface == "Batch Prediction":
	    # Batch Prediction Interface
	    st.header('Batch Prediction')
	    st.write('Upload a CSV or Excel file containing customer data.')

	    uploaded_file = st.file_uploader(
	        "Choose a CSV or Excel file", type=["csv", "xlsx", "xls"])

	    if uploaded_file is not None:
	        # Check if models are selected
	        if not selected_models:
	            st.error(
	                "No models selected for prediction. Please select at least one model in the sidebar.")
	            st.stop()

	        required_columns = [
	            'PID', 'CRM_PID_VALUE_SEGMENT', 'EFFECTIVESEGMENT', 'BILLING_ZIP', 'KA_NAME',
	            'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS', 'SUSPENDED_SUBSCRIBERS',
	            'TOTAL_SUBS', 'AVGMOBILEREVENUE', 'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'
	        ]

	        df = process_uploaded_file(uploaded_file, required_columns)
	        if df is None:
	            # process_uploaded_file has already reported the problem
	            st.stop()

	        # Standardize required columns to uppercase and strip spaces
	        required_columns_upper = [col.upper().strip() for col in required_columns]

	        # Convert numerical columns to numeric data types; values that cannot
	        # be parsed become NaN and are replaced by the defaults below.
	        numerical_columns = [
	            'BILLING_ZIP', 'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS',
	            'SUSPENDED_SUBSCRIBERS', 'TOTAL_SUBS', 'AVGMOBILEREVENUE',
	            'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'
	        ]

	        for col in numerical_columns:
	            df[col] = pd.to_numeric(df[col], errors='coerce')

	        # Fill missing values if any
	        df.fillna({
	            'CRM_PID_VALUE_SEGMENT': 'Unknown',
	            'EFFECTIVESEGMENT': 'Unknown',
	            'KA_NAME': 'Unknown',
	            'BILLING_ZIP': 0,
	            'ACTIVE_SUBSCRIBERS': 0,
	            'NOT_ACTIVE_SUBSCRIBERS': 0,
	            'SUSPENDED_SUBSCRIBERS': 0,
	            'TOTAL_SUBS': 0,
	            'AVGMOBILEREVENUE': 0.0,
	            'AVGFIXREVENUE': 0.0,
	            'TOTALREVENUE': 0.0,
	            'ARPU': 0.0
	        }, inplace=True)

	        st.write("### Processing Batch Predictions...")

	        model_outputs = {}

	        # Prepare data for prediction once: the feature frame is the same for
	        # every model, and the preprocessed matrix is computed on first use
	        # and shared by all models that need it.
	        feature_frame = df[required_columns_upper[1:]]  # Exclude 'PID'
	        transformed_features = None

	        for model_name in selected_models:
	            model = models[model_name]
	            includes_preprocessor = models_with_preprocessor[model_name]

	            try:
	                if includes_preprocessor:
	                    data_for_prediction = feature_frame
	                else:
	                    if transformed_features is None:
	                        transformed_features = preprocessor.transform(feature_frame)
	                    data_for_prediction = transformed_features

	                predictions = model.predict(data_for_prediction)
	                model_outputs[model_name] = create_model_output(
	                    df, predictions, model_name)
	                st.success(f"Predictions completed for {model_name}")
	            except Exception as e:
	                st.error(f"Error predicting with model {model_name}: {e}")

	        # Report the outcome honestly: do not claim overall success when some
	        # or all models failed.
	        if not model_outputs:
	            st.error("None of the selected models produced predictions.")
	            st.stop()
	        if len(model_outputs) == len(selected_models):
	            st.success('Batch predictions completed for all selected models.')
	        else:
	            st.warning(
	                f"Batch predictions completed for {len(model_outputs)} of "
	                f"{len(selected_models)} selected models.")

	        # Download options
	        st.header('Download Predictions')
	        download_option = st.radio(
	            "Choose how to download your predictions:",
	            ("All Models in Separate Files", "Churn and Non-Churn in Separate Files", "Download All at Once")
	        )

	        if download_option == "All Models in Separate Files":
	            # Allow user to download the results for each model
	            for model_name, output_df in model_outputs.items():
	                file_stem = model_name.lower().replace(" ", "_")
	                csv = output_df.to_csv(index=False).encode('utf-8')
	                st.download_button(
	                    label=f"Download {model_name} Predictions as CSV",
	                    data=csv,
	                    file_name=f'{file_stem}_predictions.csv',
	                    mime='text/csv',
	                )
	        elif download_option == "Churn and Non-Churn in Separate Files":
	            # For each model, split its results into churn and non-churn files
	            for model_name, output_df in model_outputs.items():
	                file_stem = model_name.lower().replace(" ", "_")
	                churn_df = output_df[output_df['Churn'] == 'Yes']
	                non_churn_df = output_df[output_df['Churn'] == 'No']

	                churn_csv = churn_df.to_csv(index=False).encode('utf-8')
	                non_churn_csv = non_churn_df.to_csv(index=False).encode('utf-8')

	                st.download_button(
	                    label=f"Download {model_name} Churn Predictions as CSV",
	                    data=churn_csv,
	                    file_name=f'{file_stem}_churn_predictions.csv',
	                    mime='text/csv',
	                )
	                st.download_button(
	                    label=f"Download {model_name} Non-Churn Predictions as CSV",
	                    data=non_churn_csv,
	                    file_name=f'{file_stem}_non_churn_predictions.csv',
	                    mime='text/csv',
	                )
	        elif download_option == "Download All at Once":
	            # Create a zip file containing all outputs
	            zip_buffer = io.BytesIO()
	            with zipfile.ZipFile(zip_buffer, "w") as zip_file:
	                for model_name, output_df in model_outputs.items():
	                    file_stem = model_name.lower().replace(" ", "_")
	                    csv_data = output_df.to_csv(index=False).encode('utf-8')
	                    zip_file.writestr(f'{file_stem}_predictions.csv', csv_data)
	            # Rewind only after the archive is closed, i.e. fully written
	            zip_buffer.seek(0)

	            st.download_button(
	                label="Download All Predictions as ZIP",
	                data=zip_buffer,
	                file_name='all_model_predictions.zip',
	                mime='application/zip'
	            )
	    else:
	        st.info('Awaiting CSV or Excel file to be uploaded.')

	# Sidebar information: counts of loaded/selected models, followed by the
	# hard-coded accuracy table (every entry is listed, loaded or not).
	st.sidebar.write("### Model Information")
	st.sidebar.write(f"Total models available: {len(models)}")
	st.sidebar.write(f"Models selected for prediction: {len(selected_models)}")
	st.sidebar.write("### Model Accuracies")
	# A dedicated loop name avoids rebinding the module-level ``model`` object.
	for listed_model_name, accuracy in model_accuracies.items():
	    st.sidebar.write(f"{listed_model_name}: {accuracy}%")