# NOTE(review): the three lines that were here ("Spaces: Sleeping / Sleeping")
# were a Hugging Face Spaces page-status header captured by the scrape,
# not part of the application code.
import streamlit as st
import pandas as pd
import numpy as np
import os
import sys
from company_bankruptcy.components.model_trainer import ModelTrainer
from company_bankruptcy.components.data_transformation import DataTransformation
from company_bankruptcy.utils.utils import load_object
from company_bankruptcy.logger.logger import logging
from company_bankruptcy.exception.exception import CustomException
def get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict):
    """Return the default (bankruptcy) probability for each row of ``input_df``.

    The prediction strategy is chosen by ``trained_models_dict['best_model_name']``:
    a named single model, or one of three ensembles ('Average Ensemble',
    'Optimized Ensemble', 'Rank Ensemble').

    Parameters
    ----------
    input_df : pd.DataFrame
        Rows to score. Must contain every feature column each involved model
        was trained on (the SHAP-selected features below).
    trained_models_dict : dict
        Maps model names to fitted estimators exposing ``predict_proba``;
        also holds the metadata key ``'best_model_name'``.
        For 'Rank Ensemble' the estimators are assumed to expose
        ``best_score_`` (e.g. fitted search objects) — confirm upstream.
    feature_selection_dict : dict
        Maps model names to metadata where ``[1]['selected_shap_feats']`` is
        the exact feature list that model expects.
    opt_dict : dict
        Per-fold bundles ('opt', 'rfm', 'xgbm', 'lrm', 'svcm') used only by
        the 'Optimized Ensemble' strategy; may be empty otherwise.

    Returns
    -------
    np.ndarray
        One probability per row of ``input_df``.
    """
    # Fix: previously this name was read from a module-level global that is
    # only assigned later in the script (NameError if called early). The same
    # value is always available inside trained_models_dict, so derive it here.
    best_model_name = trained_models_dict['best_model_name']

    if best_model_name == 'Average Ensemble':
        # Unweighted mean of every base model's positive-class probability.
        default_prob = 0
        for model_name in trained_models_dict:
            if model_name == 'best_model_name':  # metadata entry, not a model
                continue
            temp_features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
            temp_prob = trained_models_dict[model_name].predict_proba(input_df[temp_features_list])[:, 1]
            default_prob += temp_prob
        # -1 excludes the 'best_model_name' metadata key from the count.
        default_prob /= (len(trained_models_dict) - 1)
    elif best_model_name == 'Optimized Ensemble':
        # Each fold in opt_dict carries four base models plus a meta-model
        # ('opt') trained on their stacked probabilities; fold predictions
        # are averaged at the end.
        rfm_features_list = feature_selection_dict['RandomForestClassifier'][1]['selected_shap_feats']
        xgbm_features_list = feature_selection_dict['XGBClassifier'][1]['selected_shap_feats']
        lrm_features_list = feature_selection_dict['LogisticRegression'][1]['selected_shap_feats']
        svcm_features_list = feature_selection_dict['SVC'][1]['selected_shap_feats']
        preds_list = []
        for idx in opt_dict:
            opt = opt_dict[idx]['opt']
            rfm = opt_dict[idx]['rfm']
            xgbm = opt_dict[idx]['xgbm']
            lrm = opt_dict[idx]['lrm']
            svcm = opt_dict[idx]['svcm']
            rfm_probs = rfm.predict_proba(input_df[rfm_features_list])[:, 1]
            xgbm_probs = xgbm.predict_proba(input_df[xgbm_features_list])[:, 1]
            lrm_probs = lrm.predict_proba(input_df[lrm_features_list])[:, 1]
            svcm_probs = svcm.predict_proba(input_df[svcm_features_list])[:, 1]
            # One column per base model: the meta-model's input layout.
            model_preds = np.column_stack([
                rfm_probs,
                xgbm_probs,
                lrm_probs,
                svcm_probs
            ])
            preds_list.append(opt.predict(model_preds))
        default_prob = np.mean(np.column_stack(preds_list), axis=1)
    elif best_model_name == 'Rank Ensemble':
        # Weight each model's probabilities by its rank (ascending CV score:
        # worst model gets weight 1, best gets weight n), then normalize by
        # the rank sum n*(n+1)/2 so the result stays a probability.
        rank_ensemble_list = []
        prob_list = []
        model_names_list = []
        for model_name in trained_models_dict:
            if model_name == 'best_model_name':  # metadata entry, not a model
                continue
            temp_features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
            model_names_list.append(model_name)
            rank_ensemble_list.append((model_name, trained_models_dict[model_name].best_score_))
            prob_list.append(trained_models_dict[model_name].predict_proba(input_df[temp_features_list])[:, 1])
        rank_ensemble_list = sorted(rank_ensemble_list, key=lambda x: x[1])
        default_prob = 0
        for i in range(len(rank_ensemble_list)):
            default_prob += (i + 1) * prob_list[model_names_list.index(rank_ensemble_list[i][0])]
        default_prob /= (len(rank_ensemble_list) * (1 + len(rank_ensemble_list)) / 2)
    else:
        # A single named best model: predict with its own selected features.
        model = trained_models_dict[best_model_name]
        temp_features_list = feature_selection_dict[best_model_name][1]['selected_shap_feats']
        default_prob = model.predict_proba(input_df[temp_features_list])[:, 1]
    return default_prob
# --- Streamlit page setup; must be the first st.* call in the script ---
st.set_page_config(
    page_title='Default Predictor',
    layout='centered'
)
try:
    st.title('Company Bankruptcy Predictor')
    logging.info('Initiating dictionaries')
    # Load the pickled models, optimizer ensembles, feature-selection metadata
    # and the example spreadsheet once, then cache everything in
    # st.session_state: Streamlit reruns this whole script on every widget
    # interaction, and the cache makes warm reruns skip the disk reads.
    if 'trained_models_dict' not in st.session_state:
        model_trainer_obj = ModelTrainer()
        trained_models_dict = load_object(
            os.path.join(
                model_trainer_obj.model_trainer_config.trained_models_path,
                'trained_models_dict.pkl'
            )
        )
        opt_dict = load_object(
            os.path.join(
                model_trainer_obj.model_trainer_config.trained_models_path,
                'opt_dict.pkl'
            )
        )
        data_transformation_obj = DataTransformation()
        feature_selection_dict = load_object(
            data_transformation_obj.data_transformation_config.feature_selection_dict_file_path
        )
        example_data = pd.read_excel('app_input_example.xlsx')
        # example_data = pd.read_csv('app_input_example.csv')
        st.session_state['trained_models_dict'] = trained_models_dict
        st.session_state['opt_dict'] = opt_dict
        st.session_state['feature_selection_dict'] = feature_selection_dict
        st.session_state['example_data'] = example_data
    else:
        # Warm rerun: everything was cached on a previous run.
        trained_models_dict = st.session_state['trained_models_dict']
        opt_dict = st.session_state['opt_dict']
        feature_selection_dict = st.session_state['feature_selection_dict']
        example_data = st.session_state['example_data']
    logging.info('Dictionaries initiated')
    logging.info('Checking button clicked')
    # 'clicked' remembers whether Submit was pressed on the previous run.
    if 'clicked' not in st.session_state:
        st.session_state.clicked = False
    logging.info(f'Button check passed with value {st.session_state.clicked}')
    st.subheader('Please, fill in the input boxes or provide an csv/excel file and click on submit button to get the default probability(ies).')
    # Name of the selected best model/ensemble, stored alongside the models.
    # NOTE(review): this module-level assignment is what get_prob() (defined
    # above) reads as a global — it must run before get_prob is called.
    best_model_name = trained_models_dict['best_model_name']
    logging.info("Getting features' list")
    # Ensembles need the union of every base model's SHAP-selected features;
    # a single best model only needs its own list.
    if best_model_name in ['Average Ensemble', 'Optimized Ensemble', 'Rank Ensemble']:
        features_list = []
        for model_name in feature_selection_dict:
            features_list.extend(
                feature_selection_dict[model_name][1]['selected_shap_feats']
            )
        features_list = list(set(features_list))  # de-duplicate shared features
    else:
        features_list = feature_selection_dict[best_model_name][1]['selected_shap_feats']
    logging.info("Features' list found")
    # File-upload area plus a downloadable CSV template with the expected columns.
    upload_container = st.container()
    with upload_container:
        upload_col1, upload_col2 = st.columns([0.6, 0.4])
        uploaded_file = upload_col1.file_uploader(
            'Upload a csv/excel file with data',
            type=["csv", "xlsx"]
        )
        # example_data = pd.read_csv('app_input_example.csv')
        # example_data = pd.read_csv('artifacts/data.csv')
        # example_data = pd.read_excel('app_input_example.xlsx')
        # @st.cache_data
        # def convert_df(df):
        #     return df.to_csv(index=False).encode("utf-8")
        #     # return df.to_excel(index=False).encode("utf-8")
        # csv_data = convert_df(df=example_data[features_list])
        csv_data = example_data[features_list].to_csv(index=False).encode("utf-8")
        upload_col2.write('An example of the data file')
        upload_col2.download_button(
            'Download',
            data=csv_data,
            file_name='input_example.csv',
            mime="text/csv"
        )
    # Manual-entry grid: two number inputs per visual row.
    n_cols = 2
    n_rows = int((len(features_list) - len(features_list) % n_cols) / n_cols)
    if len(features_list) % n_cols != 0:
        n_rows += 1  # extra row for the leftover feature when the count is odd
    logging.info('Constructing the app input structure')
    input_dict = {}  # feature name -> [value]; becomes a single-row DataFrame
    feature_idx = 0
    for i in range(n_rows):
        temp_input_container = st.container()
        with temp_input_container:
            col1, col2 = st.columns(n_cols)
            # '... Flag' features are binary, hence the integer display format.
            # NOTE(review): `i <= n_rows - 1` is always true, so this branch
            # hinges only on the feature count being even. With an odd count
            # every row renders a single input while feature_idx still advances
            # by 2, so odd-indexed features are never shown. Likely intended
            # `i < n_rows - 1 or len(features_list) % 2 == 0` — confirm.
            if i <= n_rows - 1 and len(features_list) % 2 == 0:
                input_dict[features_list[feature_idx]] = [
                    col1.number_input(
                        features_list[feature_idx],
                        format='%.6f' if features_list[feature_idx].split(' ')[-1] != 'Flag' else '%.0f'
                    )
                ]
                input_dict[features_list[feature_idx+1]] = [
                    col2.number_input(
                        features_list[feature_idx+1],
                        format='%.6f' if features_list[feature_idx+1].split(' ')[-1] != 'Flag' else '%.0f'
                    )
                ]
            else:
                input_dict[features_list[feature_idx]] = [
                    col1.number_input(
                        features_list[feature_idx],
                        format='%.6f' if features_list[feature_idx].split(' ')[-1] != 'Flag' else '%.0f'
                    )
                ]
            feature_idx += 2
    logging.info('Input structure constructed')
    def set_button_click():
        # on_click callback: flag the submit for the rerun it triggers.
        st.session_state.clicked = True
    st.button('Submit', on_click=set_button_click)
    # No file uploaded: score the single manually-entered row.
    if st.session_state.clicked and uploaded_file is None:
        st.session_state.clicked = False  # consume the click
        logging.info(f'Calculating prob for {best_model_name}')
        input_df = pd.DataFrame(input_dict)
        default_prob = get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict)
        st.write(f"Default probability: {default_prob[0]:.4f}")
        logging.info(f'Default prob: {default_prob[0]:.4f}')
    # File uploaded: score every row and offer the results as a CSV download.
    elif st.session_state.clicked and uploaded_file is not None:
        st.session_state.clicked = False  # consume the click
        # bites_data = uploaded_file.getvalue()
        # stringio = StringIO(bites_data.decode('utf-8'))
        # string_data = stringio.read()
        logging.info('Loading uploaded data')
        # Dispatch on the uploaded file's extension (uploader allows csv/xlsx).
        file_extension = uploaded_file.name.split('.')[-1]
        if file_extension == 'csv':
            input_df = pd.read_csv(uploaded_file)
        else:
            input_df = pd.read_excel(uploaded_file)
        # input_df = pd.read_excel(uploaded_file)
        logging.info('Uploaded data loaded')
        with st.spinner('Please wait...'):
            logging.info(f'Calculating probabilies for {best_model_name}')
            default_prob = get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict)
            logging.info('Probabilities calculated')
            result_df = pd.DataFrame()
            result_df['default_probability'] = default_prob
            result_data = result_df.to_csv(index=False).encode("utf-8")
        st.success('Done!')
        st.download_button(
            'Download the predicted probabilities',
            data=result_data,
            file_name='default_probabilities.csv',
            mime='text/csv'
        )
except Exception as e:
    # Any failure anywhere in the UI flow is logged and re-raised wrapped in
    # the project's CustomException (carries traceback info via sys).
    logging.info('Error occured while creating streamlit app')
    raise CustomException(e, sys)