Spaces:
Sleeping
Sleeping
initialized
Browse files- app.py +277 -0
- app_input_example.xlsx +0 -0
- artifacts/feature_selection_dict.pkl +3 -0
- artifacts/models/opt_dict.pkl +3 -0
- artifacts/models/trained_models_dict.pkl +3 -0
- company_bankruptcy/__init__.py +0 -0
- company_bankruptcy/components/__init__.py +0 -0
- company_bankruptcy/components/data_ingestion.py +61 -0
- company_bankruptcy/components/data_transformation.py +85 -0
- company_bankruptcy/components/model_evaluation.py +164 -0
- company_bankruptcy/components/model_trainer.py +68 -0
- company_bankruptcy/constants/__init__.py +0 -0
- company_bankruptcy/constants/constants.py +5 -0
- company_bankruptcy/data_access/__init__.py +0 -0
- company_bankruptcy/data_access/mongo_db_connection.py +104 -0
- company_bankruptcy/exception/__init__.py +0 -0
- company_bankruptcy/exception/exception.py +20 -0
- company_bankruptcy/logger/__init__.py +0 -0
- company_bankruptcy/logger/logger.py +20 -0
- company_bankruptcy/pipeline/__init__.py +0 -0
- company_bankruptcy/pipeline/prediction_pipeline.py +0 -0
- company_bankruptcy/pipeline/training_pipeline.py +27 -0
- company_bankruptcy/utils/__init__.py +0 -0
- company_bankruptcy/utils/utils.py +974 -0
- requirements.txt +160 -0
app.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
from company_bankruptcy.components.model_trainer import ModelTrainer
|
| 9 |
+
from company_bankruptcy.components.data_transformation import DataTransformation
|
| 10 |
+
from company_bankruptcy.utils.utils import load_object
|
| 11 |
+
from company_bankruptcy.logger.logger import logging
|
| 12 |
+
from company_bankruptcy.exception.exception import CustomException
|
| 13 |
+
|
| 14 |
+
def get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict):
    """Return default (bankruptcy) probabilities for each row of ``input_df``.

    The scoring strategy is chosen by ``trained_models_dict['best_model_name']``:

    * ``'Average Ensemble'``   — unweighted mean of every base model's
      positive-class probability.
    * ``'Optimized Ensemble'`` — per-fold stacked optimizers from ``opt_dict``
      applied to the four base models' probabilities, averaged across folds.
    * ``'Rank Ensemble'``      — rank-weighted mean of base-model probabilities
      (weights proportional to each model's rank by ``best_score_``).
    * any other value          — that single named model's probability.

    Args:
        input_df: DataFrame holding at least the features each model needs.
        trained_models_dict: fitted models keyed by name, plus a
            ``'best_model_name'`` bookkeeping entry naming the strategy.
        feature_selection_dict: per-model selected features, accessed as
            ``feature_selection_dict[model][1]['selected_shap_feats']``.
        opt_dict: per-fold stacked-ensemble optimizer and base models
            (keys ``'opt'``, ``'rfm'``, ``'xgbm'``, ``'lrm'``, ``'svcm'``).

    Returns:
        1-D numpy array with one default probability per input row.
    """
    # FIX: the original referenced `best_model_name` as a module-level global
    # that is only assigned later in the script; resolve it from the dict the
    # caller already passes so the function is self-contained.
    best_model_name = trained_models_dict['best_model_name']

    if best_model_name == 'Average Ensemble':
        # Plain average over all base models (skip the bookkeeping entry).
        default_prob = 0
        for model_name in trained_models_dict:
            if model_name == 'best_model_name':
                continue
            temp_features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
            temp_prob = trained_models_dict[model_name].predict_proba(input_df[temp_features_list])[:, 1]
            default_prob += temp_prob
        default_prob /= (len(trained_models_dict) - 1)

    elif best_model_name == 'Optimized Ensemble':
        # Each CV fold contributed an optimizer stacked on four base models.
        rfm_features_list = feature_selection_dict['RandomForestClassifier'][1]['selected_shap_feats']
        xgbm_features_list = feature_selection_dict['XGBClassifier'][1]['selected_shap_feats']
        lrm_features_list = feature_selection_dict['LogisticRegression'][1]['selected_shap_feats']
        svcm_features_list = feature_selection_dict['SVC'][1]['selected_shap_feats']

        preds_list = []

        for idx in opt_dict:
            opt = opt_dict[idx]['opt']
            rfm = opt_dict[idx]['rfm']
            xgbm = opt_dict[idx]['xgbm']
            lrm = opt_dict[idx]['lrm']
            svcm = opt_dict[idx]['svcm']

            rfm_probs = rfm.predict_proba(input_df[rfm_features_list])[:, 1]
            xgbm_probs = xgbm.predict_proba(input_df[xgbm_features_list])[:, 1]
            lrm_probs = lrm.predict_proba(input_df[lrm_features_list])[:, 1]
            svcm_probs = svcm.predict_proba(input_df[svcm_features_list])[:, 1]

            model_preds = np.column_stack([
                rfm_probs,
                xgbm_probs,
                lrm_probs,
                svcm_probs
            ])

            preds_list.append(opt.predict(model_preds))

        # Average the per-fold stacked-ensemble outputs.
        default_prob = np.mean(np.column_stack(preds_list), axis=1)

    elif best_model_name == 'Rank Ensemble':
        # Weight each model's probability by its rank (worst score gets 1).
        rank_ensemble_list = []
        prob_list = []
        model_names_list = []

        for model_name in trained_models_dict:
            if model_name == 'best_model_name':
                continue
            temp_features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
            model_names_list.append(model_name)
            rank_ensemble_list.append((model_name, trained_models_dict[model_name].best_score_))
            prob_list.append(trained_models_dict[model_name].predict_proba(input_df[temp_features_list])[:, 1])

        rank_ensemble_list = sorted(rank_ensemble_list, key=lambda x: x[1])

        default_prob = 0
        for i in range(len(rank_ensemble_list)):
            default_prob += (i+1) * prob_list[model_names_list.index(rank_ensemble_list[i][0])]
        # Normalize by the sum of ranks 1..n = n*(n+1)/2.
        default_prob /= (len(rank_ensemble_list) * (1 + len(rank_ensemble_list)) / 2)

    else:
        # A single named model won model selection.
        model = trained_models_dict[best_model_name]
        temp_features_list = feature_selection_dict[best_model_name][1]['selected_shap_feats']
        default_prob = model.predict_proba(input_df[temp_features_list])[:, 1]

    return default_prob
|
| 85 |
+
|
| 86 |
+
# ---------------------------------------------------------------------------
# Streamlit UI: collects financial-ratio inputs (manual widgets or a csv/xlsx
# upload) and reports default probabilities from the trained model artifacts.
# ---------------------------------------------------------------------------
st.set_page_config(
    page_title='Default Predictor',
    layout='centered'
)

try:

    st.title('Company Default Predictor')

    # Load the heavy pickled artifacts once per session and cache them in
    # st.session_state so reruns (every widget interaction) stay fast.
    logging.info('Initiating dictionaries')
    if 'trained_models_dict' not in st.session_state:
        model_trainer_obj = ModelTrainer()
        trained_models_dict = load_object(
            os.path.join(
                model_trainer_obj.model_trainer_config.trained_models_path,
                'trained_models_dict.pkl'
            )
        )
        opt_dict = load_object(
            os.path.join(
                model_trainer_obj.model_trainer_config.trained_models_path,
                'opt_dict.pkl'
            )
        )

        data_transformation_obj = DataTransformation()
        feature_selection_dict = load_object(
            data_transformation_obj.data_transformation_config.feature_selection_dict_file_path
        )

        example_data = pd.read_excel('app_input_example.xlsx')

        st.session_state['trained_models_dict'] = trained_models_dict
        st.session_state['opt_dict'] = opt_dict
        st.session_state['feature_selection_dict'] = feature_selection_dict
        st.session_state['example_data'] = example_data

    else:
        trained_models_dict = st.session_state['trained_models_dict']
        opt_dict = st.session_state['opt_dict']
        feature_selection_dict = st.session_state['feature_selection_dict']
        example_data = st.session_state['example_data']
    logging.info('Dictionaries initiated')

    logging.info('Checking button clicked')
    if 'clicked' not in st.session_state:
        st.session_state.clicked = False
    logging.info(f'Button check passed with value {st.session_state.clicked}')

    # FIX: "an csv" -> "a csv" in the user-facing prompt.
    st.subheader('Please, fill in the input boxes or provide a csv/excel file and click on submit button to get the default probability(ies).')

    best_model_name = trained_models_dict['best_model_name']

    logging.info("Getting features' list")
    if best_model_name in ['Average Ensemble', 'Optimized Ensemble', 'Rank Ensemble']:
        # Ensembles need the union of every base model's selected features.
        features_list = []
        for model_name in feature_selection_dict:
            features_list.extend(
                feature_selection_dict[model_name][1]['selected_shap_feats']
            )
        # FIX: sorted() instead of list() — set iteration order is not stable
        # across interpreter runs, which reshuffled the input widgets and the
        # example-file column order on every restart.
        features_list = sorted(set(features_list))
    else:
        features_list = feature_selection_dict[best_model_name][1]['selected_shap_feats']
    logging.info("Features' list found")

    upload_container = st.container()
    with upload_container:
        upload_col1, upload_col2 = st.columns([0.6, 0.4])
        uploaded_file = upload_col1.file_uploader(
            'Upload a csv/excel file with data',
            type=["csv", "xlsx"]
        )

        # Offer a template file with exactly the expected columns.
        csv_data = example_data[features_list].to_csv(index=False).encode("utf-8")

        upload_col2.write('An example of the data file')
        upload_col2.download_button(
            'Download',
            data=csv_data,
            file_name='input_example.csv',
            mime="text/csv"
        )

    # Lay the manual-entry widgets out on a two-column grid.
    n_cols = 2
    n_rows = int((len(features_list) - len(features_list) % n_cols) / n_cols)
    if len(features_list) % n_cols != 0:
        n_rows += 1

    logging.info('Constructing the app input structure')
    input_dict = {}
    feature_idx = 0
    for i in range(n_rows):

        temp_input_container = st.container()

        with temp_input_container:
            col1, col2 = st.columns(n_cols)
            # FIX: the original condition was
            #     `i <= n_rows - 1 and len(features_list) % 2 == 0`
            # `i <= n_rows - 1` is always true, so with an ODD feature count
            # every row fell into the one-widget branch while feature_idx
            # still advanced by two — silently dropping every odd-indexed
            # feature. Two widgets belong on every row except the last row of
            # an odd-sized list.
            if i < n_rows - 1 or len(features_list) % 2 == 0:
                input_dict[features_list[feature_idx]] = [
                    col1.number_input(
                        features_list[feature_idx],
                        # "... Flag" features are binary, so no decimals.
                        format='%.6f' if features_list[feature_idx].split(' ')[-1] != 'Flag' else '%.0f'
                    )
                ]
                input_dict[features_list[feature_idx+1]] = [
                    col2.number_input(
                        features_list[feature_idx+1],
                        format='%.6f' if features_list[feature_idx+1].split(' ')[-1] != 'Flag' else '%.0f'
                    )
                ]
            else:
                input_dict[features_list[feature_idx]] = [
                    col1.number_input(
                        features_list[feature_idx],
                        format='%.6f' if features_list[feature_idx].split(' ')[-1] != 'Flag' else '%.0f'
                    )
                ]

        feature_idx += 2

    logging.info('Input structure constructed')

    def set_button_click():
        # Callback: remember that Submit was pressed across the rerun.
        st.session_state.clicked = True

    st.button('Submit', on_click=set_button_click)

    if st.session_state.clicked and uploaded_file is None:
        # Manual-entry path: score the single row typed into the widgets.
        st.session_state.clicked = False

        logging.info(f'Calculating prob for {best_model_name}')

        input_df = pd.DataFrame(input_dict)

        default_prob = get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict)

        st.write(f"Default probability: {default_prob[0]:.4f}")

        logging.info(f'Default prob: {default_prob[0]:.4f}')

    elif st.session_state.clicked and uploaded_file is not None:
        # Upload path: score every row of the provided csv/xlsx file.
        st.session_state.clicked = False
        logging.info('Loading uploaded data')
        file_extension = uploaded_file.name.split('.')[-1]
        if file_extension == 'csv':
            input_df = pd.read_csv(uploaded_file)
        else:
            # file_uploader restricts types to csv/xlsx, so this is excel.
            input_df = pd.read_excel(uploaded_file)
        logging.info('Uploaded data loaded')

        with st.spinner('Please wait...'):
            # FIX: typo "probabilies" -> "probabilities" in the log message.
            logging.info(f'Calculating probabilities for {best_model_name}')
            default_prob = get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict)
            logging.info('Probabilities calculated')

            result_df = pd.DataFrame()
            result_df['default_probability'] = default_prob

            result_data = result_df.to_csv(index=False).encode("utf-8")

            st.success('Done!')

            st.download_button(
                'Download the predicted probabilities',
                data=result_data,
                file_name='default_probabilities.csv',
                mime='text/csv'
            )

except Exception as e:
    logging.info('Error occured while creating streamlit app')
    raise CustomException(e, sys)
|
| 277 |
+
|
app_input_example.xlsx
ADDED
|
Binary file (11.1 kB). View file
|
|
|
artifacts/feature_selection_dict.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ffff597549a2c76e13872a5f2048d4b83da2a0f25eeccbade02a575871d84bf9
|
| 3 |
+
size 1930217
|
artifacts/models/opt_dict.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3f79cee91a25f02eb551b29c882f6afc778d175c4d755497234c1b2f49f3bbde
|
| 3 |
+
size 15200636
|
artifacts/models/trained_models_dict.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9695d244584ea79581682da0530cbdfb3dd02c76598114626a09e5f3bac3b520
|
| 3 |
+
size 1983143
|
company_bankruptcy/__init__.py
ADDED
|
File without changes
|
company_bankruptcy/components/__init__.py
ADDED
|
File without changes
|
company_bankruptcy/components/data_ingestion.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
from company_bankruptcy.logger.logger import logging
|
| 5 |
+
from company_bankruptcy.exception.exception import CustomException
|
| 6 |
+
from company_bankruptcy.data_access.mongo_db_connection import MongoOps
|
| 7 |
+
from company_bankruptcy.constants.constants import DATABASE_NAME, COLLECTION_NAME, MONGODB_COLLECTION_STR
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
|
| 14 |
+
from sklearn.model_selection import train_test_split
|
| 15 |
+
|
| 16 |
+
# SECURITY FIX: a MongoDB connection string with embedded username/password
# was hard-coded here, overriding the value imported from constants. That
# credential is now in version-control history and MUST be rotated.
# Read the override from the environment instead; fall back to the value
# imported from company_bankruptcy.constants.constants above.
MONGODB_COLLECTION_STR = os.environ.get('MONGODB_COLLECTION_STR', MONGODB_COLLECTION_STR)
|
| 17 |
+
|
| 18 |
+
@dataclass
class DataIngestionConfig:
    """Filesystem locations for the ingestion artifacts."""
    # Full dump pulled from MongoDB before splitting.
    raw_data_path:str = os.path.join('artifacts', 'data.csv')
    # Training split (stratified on 'Bankrupt?' in DataIngestion below).
    train_data_path:str = os.path.join('artifacts', 'train_data.csv')
    # Hold-out split (test_size=0.1 in DataIngestion below).
    test_data_path:str = os.path.join('artifacts', 'test_data.csv')
|
| 23 |
+
|
| 24 |
+
class DataIngestion:
    """Fetches the bankruptcy dataset from MongoDB and materialises the raw,
    train and test CSV artifacts on disk."""

    def __init__(self):
        # Artifact paths shared by every step below.
        self.ingestion_config = DataIngestionConfig()

    def initiate_data_ingestion(self):
        """Download the data, persist it, and write a stratified 90/10 split.

        Returns:
            tuple: (train_data_path, test_data_path) as strings.

        Raises:
            CustomException: wraps any failure during download, save or split.
        """
        logging.info('Data ingestion started')
        try:
            logging.info('Reading the raw data')
            mongo_instance = MongoOps(client_url=MONGODB_COLLECTION_STR)
            raw_df = mongo_instance.get_records(coll_name=COLLECTION_NAME, db_name=DATABASE_NAME)
            logging.info('Data loaded')

            cfg = self.ingestion_config
            os.makedirs(os.path.dirname(cfg.raw_data_path), exist_ok=True)
            logging.info('Saving the data')
            raw_df.to_csv(cfg.raw_data_path, index=False)
            logging.info('Data saved')

            logging.info('Splitting the data into train and test sets')
            # Stratify on the target so both splits keep the class balance.
            train_split, test_split = train_test_split(
                raw_df,
                test_size=0.1,
                random_state=13,
                stratify=raw_df['Bankrupt?'],
            )

            logging.info('Saving train and test sets')
            train_split.to_csv(cfg.train_data_path, index=False)
            test_split.to_csv(cfg.test_data_path, index=False)
            logging.info('Sets are saved')

            logging.info('Data ingestion completed')
            return (cfg.train_data_path, cfg.test_data_path)
        except Exception as e:
            logging.info('Error occured during data ingestion')
            raise CustomException(e, sys)


if __name__ == '__main__':
    ingestion = DataIngestion()
    train_path, test_path = ingestion.initiate_data_ingestion()
|
company_bankruptcy/components/data_transformation.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from company_bankruptcy.logger.logger import logging
|
| 3 |
+
from company_bankruptcy.exception.exception import CustomException
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
|
| 9 |
+
from sklearn.model_selection import StratifiedKFold
|
| 10 |
+
|
| 11 |
+
from company_bankruptcy.utils.utils import save_object, create_feature_selection_dict
|
| 12 |
+
|
| 13 |
+
@dataclass
class DataTransformationConfig:
    """Output locations for the data-transformation step."""
    # Pickled result of the SHAP-based feature selection.
    # NOTE(review): no annotation, so @dataclass treats this as a plain class
    # attribute, not an init field — adding `: str` would change the API.
    feature_selection_dict_file_path = os.path.join('artifacts', 'feature_selection_dict.pkl')
|
| 16 |
+
|
| 17 |
+
class DataTransformation:
    """Prepares the train/test frames, builds CV folds, and runs the
    SHAP-based feature selection step, persisting its result."""

    def __init__(self):
        self.data_transformation_config = DataTransformationConfig()

    def initiate_data_transformation(self, train_path, test_path, n_cv_folds=10):
        """Load the splits, drop the constant flag column, build stratified
        CV folds, select features per model, and pickle the selection dict.

        Args:
            train_path: csv path of the training split.
            test_path: csv path of the testing split.
            n_cv_folds: number of stratified folds to create (default 10).

        Returns:
            tuple: (train_df, test_df, skfold_list, numerical_features).

        Raises:
            CustomException: wraps any failure during transformation.
        """
        try:
            logging.info('Loading training data')
            train_df = pd.read_csv(train_path)
            logging.info('Training data loaded')

            logging.info('Loading testing data')
            test_df = pd.read_csv(test_path)
            logging.info('Testing data loaded')

            logging.info('Removing Net Income Flag')
            train_df.drop(columns=' Net Income Flag', inplace=True)
            test_df.drop(columns=' Net Income Flag', inplace=True)
            logging.info('Net Income Flag removed')

            logging.info('Specifying nominal and numerical features as list')
            nominal_features = [' Liability-Assets Flag']
            # Everything except the nominal flag and the target is numeric.
            non_numeric = set(nominal_features) | {'Bankrupt?'}
            numerical_features = [col for col in train_df.columns if col not in non_numeric]
            logging.info('Nominal and numerical features specified')

            logging.info(f'Creating {n_cv_folds} CV folds for train data')
            skfold = StratifiedKFold(n_splits=n_cv_folds, random_state=42, shuffle=True)
            # split() already yields (train_idxs, valid_idxs) tuples.
            skfold_list = list(skfold.split(train_df, y=train_df['Bankrupt?']))
            logging.info('CV folds created')

            logging.info('Starting feature selection')
            selected_features_dict = create_feature_selection_dict(
                data=train_df,
                cv_fold_list=skfold_list,
                numerical_features=numerical_features,
                nominal_features=nominal_features
            )
            logging.info('Feature selection completed')

            logging.info('Saving feature selection dictionary as pkl file')
            save_object(
                file_path=self.data_transformation_config.feature_selection_dict_file_path,
                obj=selected_features_dict
            )
            logging.info('Dictionary saved')

            return (train_df, test_df, skfold_list, numerical_features)

        except Exception as e:
            logging.info('Error occured during data transformation')
            raise CustomException(e, sys)


if __name__ == '__main__':
    transformer = DataTransformation()
    train_df, test_df, cv_fold_list, numerical_features = transformer.initiate_data_transformation(
        train_path='artifacts\\train_data.csv',
        test_path='artifacts\\test_data.csv'
    )
|
company_bankruptcy/components/model_evaluation.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
from company_bankruptcy.logger.logger import logging
|
| 5 |
+
from company_bankruptcy.exception.exception import CustomException
|
| 6 |
+
from company_bankruptcy.utils.utils import load_object
|
| 7 |
+
from company_bankruptcy.components.model_trainer import ModelTrainer
|
| 8 |
+
from company_bankruptcy.components.data_transformation import DataTransformation
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import sys
|
| 12 |
+
|
| 13 |
+
import mlflow
|
| 14 |
+
import mlflow.sklearn
|
| 15 |
+
import mlflow.xgboost
|
| 16 |
+
|
| 17 |
+
from sklearn.metrics import roc_auc_score
|
| 18 |
+
|
| 19 |
+
from urllib.parse import urlparse
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class ModelEvaluation:
    """Scores the trained models and ensembles on the hold-out set and logs
    metrics (and, when possible, models) to MLflow."""

    def __init__(self):

        logging.info('Model evaluation started')

    def initiate_model_evaluation(self, test_df):
        """Compute test ROC-AUC for every strategy and log results to MLflow.

        Strategies scored: Average Ensemble, Optimized Ensemble, Rank
        Ensemble, and each individual trained model.

        Args:
            test_df: hold-out DataFrame containing the 'Bankrupt?' target and
                every feature referenced by the feature-selection dictionary.

        Raises:
            CustomException: wraps any failure during evaluation.
        """
        try:

            logging.info('Setting target variable')
            y_test = test_df['Bankrupt?'].to_frame()
            logging.info('Target variable set')

            # Load the pickled models and per-fold ensemble optimizers
            # produced by the training step.
            logging.info('Loading the trained models')
            model_trainer_obj = ModelTrainer()
            models_main_path = model_trainer_obj.model_trainer_config.trained_models_path
            trained_models_dict = load_object(
                os.path.join(models_main_path, 'trained_models_dict.pkl')
            )
            opt_dict = load_object(
                os.path.join(models_main_path, 'opt_dict.pkl')
            )
            logging.info('Trained models loaded')

            logging.info("Loading the features' dictionary")
            data_transformation_obj = DataTransformation()
            features_selection_dict_path = data_transformation_obj.data_transformation_config.feature_selection_dict_file_path
            feature_selection_dict = load_object(features_selection_dict_path)
            logging.info("Features' selection dictionary loaded")

            # Strategy name -> test ROC-AUC.
            test_score_dict = {}

            # --- Average Ensemble: unweighted mean of base-model probabilities.
            logging.info('Finding test score for Average Ensemble')
            y_test_pred_prob = 0
            for model_name in trained_models_dict:
                # 'best_model_name' is a bookkeeping entry, not a model.
                if model_name == 'best_model_name':
                    continue
                features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
                temp_prob = trained_models_dict[model_name].predict_proba(test_df[features_list])[:, 1]
                y_test_pred_prob += temp_prob
            y_test_pred_prob /= (len(trained_models_dict) - 1)
            avg_ens_score = roc_auc_score(y_test, y_test_pred_prob)
            test_score_dict['AverageEnsemble'] = avg_ens_score
            logging.info('Average Ensemble score calculated')

            # --- Optimized Ensemble: per-fold stacked optimizer over the four
            # base models, averaged across folds.
            logging.info('Finding test score for Optimized Ensemble')
            rfm_features_list = feature_selection_dict['RandomForestClassifier'][1]['selected_shap_feats']
            xgbm_features_list = feature_selection_dict['XGBClassifier'][1]['selected_shap_feats']
            lrm_features_list = feature_selection_dict['LogisticRegression'][1]['selected_shap_feats']
            svcm_features_list = feature_selection_dict['SVC'][1]['selected_shap_feats']

            preds_list = []

            for idx in opt_dict:
                opt = opt_dict[idx]['opt']
                rfm = opt_dict[idx]['rfm']
                xgbm = opt_dict[idx]['xgbm']
                lrm = opt_dict[idx]['lrm']
                svcm = opt_dict[idx]['svcm']

                rfm_probs = rfm.predict_proba(test_df[rfm_features_list])[:, 1]
                xgbm_probs = xgbm.predict_proba(test_df[xgbm_features_list])[:, 1]
                lrm_probs = lrm.predict_proba(test_df[lrm_features_list])[:, 1]
                svcm_probs = svcm.predict_proba(test_df[svcm_features_list])[:, 1]

                # One column per base model, one row per test sample.
                model_preds = np.column_stack([
                    rfm_probs,
                    xgbm_probs,
                    lrm_probs,
                    svcm_probs
                ])

                preds_list.append(opt.predict(model_preds))

            y_test_pred_prob = np.mean(np.column_stack(preds_list), axis=1)
            optimized_ens_score = roc_auc_score(y_test, y_test_pred_prob)
            test_score_dict['OptimizedEnsemble'] = optimized_ens_score
            logging.info('Optimized Ensemble score calculated')

            # --- Rank Ensemble: probabilities weighted by each model's rank
            # when sorted ascending by best_score_ (worst model gets weight 1).
            logging.info('Finding test score for Rank Ensemble')
            rank_ensemble_list = []
            prob_list = []
            model_names_list = []

            for model_name in trained_models_dict:
                if model_name == 'best_model_name':
                    continue
                features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
                model_names_list.append(model_name)
                rank_ensemble_list.append((model_name, trained_models_dict[model_name].best_score_))
                prob_list.append(trained_models_dict[model_name].predict_proba(test_df[features_list])[:, 1])

            rank_ensemble_list = sorted(rank_ensemble_list, key=lambda x: x[1])

            y_test_pred_prob = 0
            for i in range(len(rank_ensemble_list)):
                y_test_pred_prob += (i+1) * prob_list[model_names_list.index(rank_ensemble_list[i][0])]
            # Normalize by the sum of ranks 1..n = n*(n+1)/2.
            y_test_pred_prob /= (len(rank_ensemble_list) * (1 + len(rank_ensemble_list)) / 2)
            rank_ens_score = roc_auc_score(y_test, y_test_pred_prob)
            test_score_dict['RankEnsemble'] = rank_ens_score
            logging.info('Rank Ensemble score calculated')

            # --- Individual models.
            for model_name in trained_models_dict:
                if model_name == 'best_model_name':
                    continue
                logging.info(f'Finding test score for {model_name}')
                features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
                model = trained_models_dict[model_name]
                y_test_pred_prob = model.predict_proba(test_df[features_list])[:, 1]
                temp_score = roc_auc_score(y_test, y_test_pred_prob)
                test_score_dict[model_name] = temp_score
                logging.info(f'{model_name} score calculated')

            logging.info('Getting mlflow tracking uri type')
            tracking_uri_type_store = urlparse(mlflow.get_tracking_uri()).scheme
            logging.info('Tracking uri got')

            logging.info('Starting mlflow')
            with mlflow.start_run():
                for model_name in test_score_dict:
                    mlflow.log_metric(f'{model_name} ROC-AUC', test_score_dict[model_name])
                    # Ensemble entries have no single model object to log, so
                    # only log names present in trained_models_dict.
                    if model_name in trained_models_dict.keys():
                        model = trained_models_dict[model_name]
                        if tracking_uri_type_store != 'file':
                            # Remote tracking server: also register the model.
                            # if model_name == 'XGBClassifier':
                            #     mlflow.xgboost.log_model(model, f'{model_name}', registered_model_name=f'{model_name}_model')
                            # else:
                            mlflow.sklearn.log_model(model, f'{model_name}', registered_model_name=f'{model_name}_model')
                        else:
                            # Local file store: registry is unavailable.
                            # if model_name == 'XGBClassifier':
                            #     mlflow.xgboost.log_model(model, f'{model_name}')
                            # else:
                            mlflow.sklearn.log_model(model, f'{model_name}')

            logging.info('mlflow succeeded')


        except Exception as e:

            logging.info('Error occured during model evaluation')
            raise CustomException(e, sys)
|
| 164 |
+
|
company_bankruptcy/components/model_trainer.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from company_bankruptcy.logger.logger import logging
|
| 4 |
+
from company_bankruptcy.exception.exception import CustomException
|
| 5 |
+
from company_bankruptcy.utils.utils import save_object, find_optimal_model
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from dataclasses import dataclass
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class ModelTrainerConfig:
    # Directory where trained model artifacts (pickles) are persisted.
    # NOTE: the attribute is deliberately left unannotated, so under
    # @dataclass it is a plain class attribute, NOT an init field --
    # adding a type annotation would change the generated __init__.
    trained_models_path = os.path.join('artifacts', 'models')
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ModelTrainer:
    """Trains candidate models and persists the resulting artifacts."""

    def __init__(self):
        # Configuration object providing the artifact output directory.
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_training(self, train_df, test_df, features_dict_path, cv_fold_list, numerical_features):
        """Run model selection and save the trained models and ensemble weights.

        Args:
            train_df: training dataframe.
            test_df: test dataframe.
            features_dict_path: path to the pickled feature-selection dict.
            cv_fold_list: cross-validation fold definitions.
            numerical_features: names of the numerical feature columns.

        Raises:
            CustomException: wraps any error raised during training or saving.
        """
        try:
            out_dir = self.model_trainer_config.trained_models_path

            logging.info('Creating a directory to save trained models')
            os.makedirs(out_dir, exist_ok=True)
            logging.info("Models' directory created")

            logging.info('Finding the best model')
            trained_models_dict, opt_dict = find_optimal_model(
                train_df,
                test_df,
                features_dict_path,
                cv_fold_list,
                numerical_features
            )

            logging.info(
                "Saving trained models' and ensemble optimized weights' dictionaries")
            # Persist both artifacts with a single uniform loop.
            artifacts = {
                'trained_models_dict.pkl': trained_models_dict,
                'opt_dict.pkl': opt_dict,
            }
            for file_name, payload in artifacts.items():
                save_object(
                    file_path=os.path.join(out_dir, file_name),
                    obj=payload
                )
            logging.info('Saving completed')

        except Exception as e:
            logging.info('Error occured during model training')
            raise CustomException(e, sys)
|
| 59 |
+
|
| 60 |
+
# if __name__ == '__main__':
|
| 61 |
+
# model_training_obj = ModelTrainer()
|
| 62 |
+
# model_training_obj.initiate_model_training(
|
| 63 |
+
# train_df,
|
| 64 |
+
# test_df,
|
| 65 |
+
# features_dict_path,
|
| 66 |
+
# cv_fold_list,
|
| 67 |
+
# numerical_features
|
| 68 |
+
# )
|
company_bankruptcy/constants/__init__.py
ADDED
|
File without changes
|
company_bankruptcy/constants/constants.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MongoDB database holding the company-bankruptcy data.
DATABASE_NAME = "bankruptcy"

# Collection inside DATABASE_NAME containing the raw records.
COLLECTION_NAME = "data"

# NOTE(review): this holds the literal string "MONGODB_COLLECTION_STR",
# yet mongo_db_connection.py passes it directly as the MongoClient URL.
# Presumably it is meant to be an environment-variable NAME resolved via
# os.getenv(MONGODB_COLLECTION_STR) -- confirm before use.
MONGODB_COLLECTION_STR = "MONGODB_COLLECTION_STR"
|
company_bankruptcy/data_access/__init__.py
ADDED
|
File without changes
|
company_bankruptcy/data_access/mongo_db_connection.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import pymongo
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
from company_bankruptcy.exception.exception import CustomException
|
| 6 |
+
from company_bankruptcy.logger.logger import logging
|
| 7 |
+
from company_bankruptcy.constants.constants import DATABASE_NAME, COLLECTION_NAME, MONGODB_COLLECTION_STR
|
| 8 |
+
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class MongoOps:
    """Thin helper around pymongo for creating clients, databases,
    collections, and for bulk reading/writing records.

    Notes:
        Every accessor builds a fresh ``MongoClient`` via ``create_client``;
        connections are not pooled or explicitly closed.
    """

    def __init__(self, client_url: str, database_name: str = None, collection_name: str = None):
        # Connection URL used to build MongoClient instances.
        self.client_url = client_url
        self.database_name = database_name
        self.collection_name = collection_name

    def create_client(self):
        """Return a new MongoClient for the configured URL."""
        logging.info('Initiating MongoClient')
        client = pymongo.MongoClient(self.client_url)
        logging.info('MongoClient initiated')
        return client

    def create_database(self):
        """Return the database named at construction time."""
        logging.info('Creating Mongo database')
        client = self.create_client()
        database = client[self.database_name]
        logging.info(f'Mongo database {self.database_name} created')
        return database

    def create_collection(self):
        """Return the collection named at construction time."""
        logging.info('Creating Mongo collection')
        database = self.create_database()
        collection = database[self.collection_name]
        logging.info(f'Mongo collection {self.collection_name} created')
        return collection

    def get_database(self, db_name: str):
        """Return the database ``db_name`` using a fresh client."""
        logging.info(f'Accessing {db_name} database')
        client = self.create_client()
        database = client[db_name]
        logging.info(f'{db_name} database accessed')
        return database

    def get_collection(self, coll_name: str, db_name: str):
        """Return collection ``coll_name`` from database ``db_name``."""
        logging.info(f'Accessing {coll_name} collection')
        database = self.get_database(db_name)
        collection = database[coll_name]
        logging.info(f'{coll_name} collection accessed')
        return collection

    def insert_record(self, record: dict, coll_name: str, db_name: str):
        """Insert one dict (insert_one) or a list of dicts (insert_many).

        Raises:
            TypeError: if a list element is not a dict.
        """
        collection = self.get_collection(coll_name, db_name)
        logging.info(f'Starting record insertion into {coll_name} collection of {db_name} database')
        if isinstance(record, list):
            for data in record:
                # isinstance() instead of type(...) != dict so dict
                # subclasses are accepted as well.
                if not isinstance(data, dict):
                    logging.info("Records' list should have elements as dict")
                    raise TypeError("Records' list should have elements as dict")
            collection.insert_many(record)
        elif isinstance(record, dict):
            collection.insert_one(record)
        logging.info(f'Insertion into {coll_name} collection of {db_name} database completed')

    def insert_from_file(self, datafile: str, coll_name: str, db_name: str):
        """Load a .csv or .xlsx file and bulk-insert its rows as documents.

        Raises:
            ValueError: if the file extension is neither .csv nor .xlsx.
        """
        logging.info(f'Starting record insertion into {coll_name} collection of {db_name} database from {datafile}')
        self.path = datafile

        if self.path.endswith('.csv'):
            df = pd.read_csv(self.path, encoding='utf-8')
        elif self.path.endswith('.xlsx'):
            # BUGFIX: pd.read_excel() has no `encoding` parameter; passing
            # one raises TypeError. Excel files carry their own encoding.
            df = pd.read_excel(self.path)
        else:
            # BUGFIX: previously fell through with `df` undefined (NameError).
            raise ValueError(f'Unsupported file type: {self.path}')
        logging.info('Data is loaded as a pandas dataframe')

        logging.info('Converting the data into json')
        # BUGFIX: the valid orient is 'records'; 'record' raises ValueError.
        datajson = json.loads(df.to_json(orient='records'))
        logging.info('Conversion to json completed')

        collection = self.get_collection(coll_name, db_name)

        logging.info('Inserting json data')
        collection.insert_many(datajson)
        logging.info('Insertion completed')

    def get_records(self, coll_name: str, db_name: str):
        """Return every document of a collection as a DataFrame without `_id`.

        Raises:
            CustomException: if dropping `_id` fails (e.g. empty collection).
        """
        collection = self.get_collection(coll_name, db_name)
        retrieved_data = pd.DataFrame(list(collection.find()))
        try:
            retrieved_data.drop(columns='_id', inplace=True)
            logging.info('Loading the data from the database completed')
        except Exception as e:
            retrieved_data = pd.DataFrame()
            logging.info('Loading the data from the database failed')
            raise CustomException(e, sys)
        return retrieved_data
|
| 97 |
+
|
| 98 |
+
if __name__ == '__main__':

    # NOTE(review): MONGODB_COLLECTION_STR is the literal string
    # "MONGODB_COLLECTION_STR" (see constants.py), so a constant NAME --
    # not a real connection URI -- is passed as client_url here.
    # Presumably it should be resolved via os.getenv(...); confirm.
    mongo_instance = MongoOps(
        client_url=MONGODB_COLLECTION_STR
    )

    # Fetch all records from the configured collection as a DataFrame.
    retrieved_data = mongo_instance.get_records(coll_name=COLLECTION_NAME, db_name=DATABASE_NAME)
|
company_bankruptcy/exception/__init__.py
ADDED
|
File without changes
|
company_bankruptcy/exception/exception.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class CustomException(Exception):
    """Exception wrapper that records the file name and line number taken
    from the currently-active exception's traceback, for clearer reporting.
    """

    def __init__(self, error_message, error_details: sys):
        """
        Args:
            error_message: the original exception (or message) being wrapped.
            error_details: the ``sys`` module; used to call ``sys.exc_info()``.
        """
        self.error_message = error_message
        _, _, exc_tb = error_details.exc_info()
        # BUGFIX: sys.exc_info() returns (None, None, None) outside an
        # active `except` block; guard so construction never raises
        # AttributeError on a None traceback.
        if exc_tb is not None:
            self.lineno = exc_tb.tb_lineno
            self.file_name = exc_tb.tb_frame.f_code.co_filename
        else:
            self.lineno = None
            self.file_name = 'unknown'

    def __str__(self):
        return "Error occured in python script name [{0}] line number [{1}] error message [{2}]".format(
            self.file_name, self.lineno, str(self.error_message))
|
| 15 |
+
|
| 16 |
+
if __name__ == '__main__':
    # Smoke test: trigger a real exception and re-raise it wrapped so the
    # __str__ formatting (script name / line number) can be inspected.
    try:
        1 / 0
    except Exception as e:
        raise CustomException(e, sys)
|
company_bankruptcy/logger/__init__.py
ADDED
|
File without changes
|
company_bankruptcy/logger/logger.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
import os
from datetime import datetime as dt

# Timestamped log-file name, e.g. "04_27_2024_13_45_09.log" (one per import).
LOG_FILE = f"{dt.now().strftime('%m_%d_%Y_%H_%M_%S')}.log"

# Logs live under ./logs relative to the CURRENT WORKING DIRECTORY, so the
# destination depends on where the process was launched from.
log_path = os.path.join(os.getcwd(), "logs")

os.makedirs(log_path, exist_ok=True)

LOG_FILEPATH = os.path.join(log_path, LOG_FILE)

# Configure the root logger at import time: INFO level, file-only output.
logging.basicConfig(
    level=logging.INFO,
    filename=LOG_FILEPATH,
    format="[%(asctime)s] %(lineno)d %(name)s - %(levelname)s - %(message)s"
)

if __name__ == '__main__':
    # Smoke test: emits one INFO line into the freshly-created log file.
    logging.info("Log testing executed!!!")
|
company_bankruptcy/pipeline/__init__.py
ADDED
|
File without changes
|
company_bankruptcy/pipeline/prediction_pipeline.py
ADDED
|
File without changes
|
company_bankruptcy/pipeline/training_pipeline.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from company_bankruptcy.components.data_ingestion import DataIngestion
|
| 2 |
+
from company_bankruptcy.components.data_transformation import DataTransformation
|
| 3 |
+
from company_bankruptcy.components.model_trainer import ModelTrainer
|
| 4 |
+
from company_bankruptcy.components.model_evaluation import ModelEvaluation
|
| 5 |
+
|
| 6 |
+
def run_pipeline():
    """Execute the full training workflow: ingestion -> transformation ->
    model training -> evaluation."""

    # Step 1: pull the raw data and split it into train/test artifacts.
    ingestion = DataIngestion()
    train_path, test_path = ingestion.initiate_data_ingestion()

    # Step 2: transform the splits and derive CV folds + numeric columns.
    transformation = DataTransformation()
    train_df, test_df, cv_fold_list, numerical_features = transformation.initiate_data_transformation(
        train_path=train_path,
        test_path=test_path
    )

    # Step 3: train the candidate models on the selected features.
    trainer = ModelTrainer()
    trainer.initiate_model_training(
        train_df=train_df,
        test_df=test_df,
        features_dict_path=transformation.data_transformation_config.feature_selection_dict_file_path,
        cv_fold_list=cv_fold_list,
        numerical_features=numerical_features
    )

    # Step 4: evaluate the trained models on the held-out test set.
    evaluator = ModelEvaluation()
    evaluator.initiate_model_evaluation(test_df)
|
company_bankruptcy/utils/__init__.py
ADDED
|
File without changes
|
company_bankruptcy/utils/utils.py
ADDED
|
@@ -0,0 +1,974 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import pickle
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
from company_bankruptcy.logger.logger import logging
|
| 8 |
+
from company_bankruptcy.exception.exception import CustomException
|
| 9 |
+
|
| 10 |
+
from sklearn.svm import SVC
|
| 11 |
+
from sklearn.feature_selection import RFE
|
| 12 |
+
from sklearn.feature_selection import r_regression, SelectKBest
|
| 13 |
+
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
|
| 14 |
+
from sklearn.feature_selection import f_classif, chi2
|
| 15 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 16 |
+
from sklearn.linear_model import LogisticRegression
|
| 17 |
+
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
|
| 18 |
+
from sklearn.preprocessing import StandardScaler
|
| 19 |
+
from sklearn.model_selection import GridSearchCV
|
| 20 |
+
from sklearn.pipeline import Pipeline
|
| 21 |
+
from sklearn.compose import ColumnTransformer
|
| 22 |
+
|
| 23 |
+
from xgboost import XGBClassifier
|
| 24 |
+
|
| 25 |
+
from scipy import stats
|
| 26 |
+
from scipy.special import softmax
|
| 27 |
+
from scipy.optimize import fmin
|
| 28 |
+
|
| 29 |
+
from functools import partial
|
| 30 |
+
|
| 31 |
+
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
| 32 |
+
|
| 33 |
+
from boruta import BorutaPy
|
| 34 |
+
|
| 35 |
+
import shap
|
| 36 |
+
|
| 37 |
+
from collections import Counter
|
| 38 |
+
|
| 39 |
+
from tqdm.auto import tqdm
|
| 40 |
+
import gc
|
| 41 |
+
|
| 42 |
+
import warnings
|
| 43 |
+
warnings.filterwarnings('ignore')
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def save_object(file_path, obj):
    """Pickle ``obj`` to ``file_path``, creating parent directories first.

    Raises:
        CustomException: wraps any failure during directory creation
            or pickling.
    """
    try:
        parent_dir = os.path.dirname(file_path)
        os.makedirs(parent_dir, exist_ok=True)
        with open(file_path, "wb") as out_f:
            pickle.dump(obj, out_f)
    except Exception as e:
        raise CustomException(e, sys)
|
| 57 |
+
|
| 58 |
+
def load_object(file_path):
    """Unpickle and return the object stored at ``file_path``.

    Raises:
        CustomException: wraps any failure while opening or unpickling.
    """
    try:
        with open(file_path, 'rb') as in_f:
            return pickle.load(in_f)
    except Exception as e:
        logging.info('Exception Occured in load_object function utils')
        raise CustomException(e, sys)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def get_shap_features(shap_values, features, topk=10):
    '''
    Returns the topk features ranked by mean absolute SHAP value.

    Args:
        shap_values (object): shap explainer output; only its ``.values``
            attribute (an (n_samples, n_features) array) is used.
        features (list): features' names, aligned with the columns of
            ``shap_values.values``.
        topk (int): number of features to return.

    Returns:
        list: topk feature names, most important first.
    '''
    # Feature importance = mean absolute shap value per column.
    importances = [
        np.mean(np.abs(shap_values.values[:, i]))
        for i in range(shap_values.values.shape[1])
    ]

    # Pair each feature with its importance and sort descending.
    feature_importances = {fea: imp for imp, fea in zip(importances, features)}
    feature_importances = {k: v for k, v in sorted(
        feature_importances.items(), key=lambda item: item[1], reverse=True)}

    # BUGFIX: the original loop used `if idx <= topk`, returning topk + 1
    # features, contradicting the documented contract. Slicing the sorted
    # keys returns exactly topk.
    selected_topk_feats = list(feature_importances.keys())[:topk]

    return selected_topk_feats
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class FSelector():
|
| 105 |
+
'''
|
| 106 |
+
Helps to select features based on BorutaPy, RFE, and various statistics
|
| 107 |
+
'''
|
| 108 |
+
|
| 109 |
+
def __init__(self, X, y, num_feats, ordinal_feats, nominal_feats, model, is_target_cat=True, select_n_feats=15):
|
| 110 |
+
'''
|
| 111 |
+
Initializes some parameters
|
| 112 |
+
|
| 113 |
+
Args:
|
| 114 |
+
X (pd.DataFrame): contains features' values
|
| 115 |
+
y (pd.DataFrame): contains target values
|
| 116 |
+
num_feats (list): list of numerical features' names
|
| 117 |
+
ordinal_feats (list): list of ordinal features' names
|
| 118 |
+
nominal_feats (list): list of nominal features' names
|
| 119 |
+
model (model object): can be any type of model like RandomForest, LogisticRegression, etc.
|
| 120 |
+
is_target_cat (bool): indicates whether the target is categorical or not
|
| 121 |
+
select_n_feats (int): specifies the number of features to output
|
| 122 |
+
'''
|
| 123 |
+
|
| 124 |
+
self.X = X
|
| 125 |
+
self.y = y
|
| 126 |
+
self.num_feats = num_feats
|
| 127 |
+
self.ordinal_feats = ordinal_feats
|
| 128 |
+
self.nominal_feats = nominal_feats
|
| 129 |
+
self.model = model
|
| 130 |
+
self.is_target_cat = is_target_cat
|
| 131 |
+
self.select_n_feats = select_n_feats
|
| 132 |
+
|
| 133 |
+
def calculate_vif(self, X):
|
| 134 |
+
|
| 135 |
+
vif = pd.DataFrame()
|
| 136 |
+
vif["features"] = X.columns
|
| 137 |
+
vif["VIF"] = [variance_inflation_factor(
|
| 138 |
+
X.values, i) for i in range(X.shape[1])]
|
| 139 |
+
|
| 140 |
+
return vif
|
| 141 |
+
|
| 142 |
+
def select_feats_via_vif(self):
|
| 143 |
+
|
| 144 |
+
num_features = self.num_feats.copy()
|
| 145 |
+
|
| 146 |
+
vif_df = self.calculate_vif(self.X[num_features])
|
| 147 |
+
|
| 148 |
+
while vif_df[vif_df['VIF'] >= 10].shape[0] != 0:
|
| 149 |
+
vif_df.sort_values('VIF', ascending=False, inplace=True)
|
| 150 |
+
vif_df.reset_index(drop=True, inplace=True)
|
| 151 |
+
# print(vif_df)
|
| 152 |
+
elimination_candidate = vif_df.iloc[0]['features']
|
| 153 |
+
# print(elimination_candidate)
|
| 154 |
+
num_features = [i for i in num_features if i !=
|
| 155 |
+
elimination_candidate]
|
| 156 |
+
new_X = self.X[num_features]
|
| 157 |
+
vif_df = self.calculate_vif(new_X)
|
| 158 |
+
|
| 159 |
+
return list(vif_df['features'].values)
|
| 160 |
+
|
| 161 |
+
def get_spearmanr(self, X, y):
|
| 162 |
+
# return np.array([stats.spearmanr(X.values[:, f], y.values).correlation for f in range(X.shape[1])])
|
| 163 |
+
spearman_values = [stats.spearmanr(
|
| 164 |
+
X.values[:, f], y.values).correlation for f in range(X.shape[1])]
|
| 165 |
+
temp_sp_df = pd.DataFrame(
|
| 166 |
+
{'spearman': spearman_values, 'feats': list(X.columns)})
|
| 167 |
+
temp_sp_df['abs_spearman'] = np.abs(temp_sp_df['spearman'])
|
| 168 |
+
temp_sp_df.sort_values('abs_spearman', ascending=False, inplace=True)
|
| 169 |
+
temp_sp_df.reset_index(drop=True, inplace=True)
|
| 170 |
+
return temp_sp_df.iloc[:15]['feats'].to_list()
|
| 171 |
+
|
| 172 |
+
def get_kendalltau(self, X, y):
|
| 173 |
+
# return [stats.kendalltau(X.values[:, f], y.values).correlation for f in range(X.shape[1])]
|
| 174 |
+
kendall_values = [stats.spearmanr(
|
| 175 |
+
X.values[:, f], y.values).correlation for f in range(X.shape[1])]
|
| 176 |
+
temp_ken_df = pd.DataFrame(
|
| 177 |
+
{'kendall': kendall_values, 'feats': list(X.columns)})
|
| 178 |
+
temp_ken_df['abs_kendall'] = np.abs(temp_ken_df['kendall'])
|
| 179 |
+
temp_ken_df.sort_values('abs_kendall', ascending=False, inplace=True)
|
| 180 |
+
temp_ken_df.reset_index(drop=True, inplace=True)
|
| 181 |
+
return temp_ken_df.iloc[:15]['feats'].to_list()
|
| 182 |
+
|
| 183 |
+
def get_pointbiserialr(self, X, y):
|
| 184 |
+
return [stats.pointbiserialr(X.values[:, f], y.values).correlation for f in range(X.shape[1])]
|
| 185 |
+
|
| 186 |
+
def get_boruta_feats(self):
|
| 187 |
+
feat_selector = BorutaPy(
|
| 188 |
+
self.model, n_estimators='auto', verbose=2, random_state=1)
|
| 189 |
+
feat_selector.fit(np.array(self.X), np.array(self.y))
|
| 190 |
+
boruta_selected_features = list(
|
| 191 |
+
self.X.iloc[:, feat_selector.support_].columns)
|
| 192 |
+
return boruta_selected_features
|
| 193 |
+
|
| 194 |
+
def get_kbest(self, X, feats_list, metric):
|
| 195 |
+
selector = SelectKBest(metric, k=self.select_n_feats)
|
| 196 |
+
selector.fit_transform(X[feats_list], self.y)
|
| 197 |
+
selected_feats_idxs_list = list(selector.get_support(indices=True))
|
| 198 |
+
column_names = [feats_list[i] for i in selected_feats_idxs_list]
|
| 199 |
+
return column_names
|
| 200 |
+
|
| 201 |
+
def get_rfe_feats(self):
|
| 202 |
+
model_rfe = RFE(self.model, n_features_to_select=self.select_n_feats)
|
| 203 |
+
model_rfe.fit(self.X, self.y)
|
| 204 |
+
model_rfe_feats = list(
|
| 205 |
+
self.X.iloc[:, list(model_rfe.support_)].columns)
|
| 206 |
+
return model_rfe_feats
|
| 207 |
+
|
| 208 |
+
# def get_shap_feats(self, feats_list, topk=10):
|
| 209 |
+
# model = self.model
|
| 210 |
+
# X = self.X[feats_list]
|
| 211 |
+
# model.fit(self.X, self.y)
|
| 212 |
+
# explainer = shap.Explainer(model.predict, X, max_evals = int(2 * X.shape[1] + 1), verbose=0)
|
| 213 |
+
# shap_values = explainer(X)
|
| 214 |
+
# selected_shap_features = get_feature_importances_shap_values(
|
| 215 |
+
# shap_values, features=list(X.columns), topk=topk
|
| 216 |
+
# )
|
| 217 |
+
# return selected_shap_features
|
| 218 |
+
|
| 219 |
+
    def get_features(self):
        '''Run all configured selectors and return the combined feature list.

        For each feature group (numerical / ordinal / nominal) a pair of
        univariate selectors is run; the choice of selectors depends on
        whether the target is categorical (self.is_target_cat). Model-based
        selectors (Boruta, RFE) are added when self.model is set. Within each
        group, only features chosen by at least two selectors survive; the
        final result is the de-duplicated union of the surviving groups.

        Side effects: stores the intermediate per-selector lists on self and
        temporarily rebinds self.select_n_feats to 'all' (restored afterwards)
        whenever a group has fewer features than requested.

        Returns:
            list: the de-duplicated selected feature names
        '''

        if self.num_feats is not None:

            if self.is_target_cat:

                # Fewer candidates than requested -> ask SelectKBest for 'all';
                # the original value is restored after the calls below.
                temp_n_feats = self.select_n_feats
                if len(self.num_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                # self.num_kendalltau_feats = self.get_kendalltau(self.X[self.num_feats], self.y)
                self.num_f_feats = self.get_kbest(
                    X=self.X, feats_list=self.num_feats, metric=f_classif)
                self.num_mi_feats = self.get_kbest(
                    X=self.X, feats_list=self.num_feats, metric=mutual_info_classif)

                self.select_n_feats = temp_n_feats

                # Union (with duplicates kept on purpose: the >=2 vote below
                # counts how many selectors picked each feature).
                self.selected_num_feats = []
                # self.selected_num_feats.extend(self.num_kendalltau_feats)
                self.selected_num_feats.extend(self.num_f_feats)
                self.selected_num_feats.extend(self.num_mi_feats)

            else:

                # Regression target: drop multicollinear features first.
                self.vif_feats = self.select_feats_via_vif()

                temp_n_feats = self.select_n_feats
                if len(self.num_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                # NOTE(review): get_kbest as defined above takes no `k`
                # parameter, so this call (and the f_feats/mi_feats calls in
                # the nominal else-branch) raises TypeError — confirm whether
                # get_kbest should accept k or k= should be dropped here.
                self.pearson_feats = self.get_kbest(
                    X=self.X, feats_list=self.num_feats, metric=r_regression, k=self.select_n_feats)

                self.select_n_feats = temp_n_feats
                # self.num_spearmanr_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=stats.spearmanr, k=self.select_n_feats)
                # self.num_kendalltau_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=stats.kendalltau, k=self.select_n_feats)
                self.num_spearmanr_feats = self.get_spearmanr(
                    self.X[self.num_feats], self.y)
                self.num_kendalltau_feats = self.get_kendalltau(
                    self.X[self.num_feats], self.y)
                # self.num_spearmanr_feats = SelectKBest(self.get_spearmanr, k=self.select_n_feats).fit_transform(self.X[self.num_feats], self.y)
                # self.num_kendalltau_feats = SelectKBest(self.get_kendalltau, k=self.select_n_feats).fit_transform(self.X[self.num_feats], self.y)

                self.selected_num_feats = []
                self.selected_num_feats.extend(self.pearson_feats)
                self.selected_num_feats.extend(self.num_spearmanr_feats)
                self.selected_num_feats.extend(self.num_kendalltau_feats)
                # self.selected_num_feats = list(set(self.selected_num_feats))

        else:

            self.selected_num_feats = []

        if self.ordinal_feats is not None:

            if self.is_target_cat:

                temp_n_feats = self.select_n_feats
                if len(self.ordinal_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                # Categorical target + ordinal features: mutual information
                # and chi-squared tests.
                self.ordinal_mi_feats = self.get_kbest(
                    X=self.X, feats_list=self.ordinal_feats, metric=mutual_info_classif)
                self.ordinal_chi2_feats = self.get_kbest(
                    X=self.X, feats_list=self.ordinal_feats, metric=chi2)

                self.selected_ordinal_feats = []
                self.selected_ordinal_feats.extend(self.ordinal_mi_feats)
                self.selected_ordinal_feats.extend(self.ordinal_chi2_feats)

                self.select_n_feats = temp_n_feats

            else:

                # Regression target + ordinal features: rank correlations.
                self.ordinal_spearmanr_feats = self.get_spearmanr(
                    self.X[self.ordinal_feats], self.y)
                self.ordinal_kendalltau_feats = self.get_kendalltau(
                    self.X[self.ordinal_feats], self.y)

                # self.ordinal_spearmanr_feats = self.get_kbest(X=self.X, feats_list=self.ordinal_feats, metric=stats.spearmanr, k=self.select_n_feats)
                # self.ordinal_kendalltau_feats = self.get_kbest(X=self.X, feats_list=self.ordinal_feats, metric=stats.kendalltau, k=self.select_n_feats)

                # self.ordinal_spearmanr_feats = SelectKBest(self.get_spearmanr, k=self.select_n_feats).fit_transform(self.X[self.ordinal_feats], self.y)
                # self.ordinal_kendalltau_feats = SelectKBest(self.get_kendalltau, k=self.select_n_feats).fit_transform(self.X[self.ordinal_feats], self.y)

                self.selected_ordinal_feats = []
                self.selected_ordinal_feats.extend(
                    self.ordinal_spearmanr_feats)
                self.selected_ordinal_feats.extend(
                    self.ordinal_kendalltau_feats)
                # self.selected_ordinal_feats = list(set(self.selected_ordinal_feats))

        else:
            self.selected_ordinal_feats = []

        if self.nominal_feats is not None:

            if self.is_target_cat:

                temp_n_feats = self.select_n_feats
                if len(self.nominal_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                self.nominal_mi_feats = self.get_kbest(
                    X=self.X, feats_list=self.nominal_feats, metric=mutual_info_classif)
                self.nominal_chi2_feats = self.get_kbest(
                    X=self.X, feats_list=self.nominal_feats, metric=chi2)

                self.selected_nominal_feats = []
                self.selected_nominal_feats.extend(self.nominal_mi_feats)
                self.selected_nominal_feats.extend(self.nominal_chi2_feats)

                self.select_n_feats = temp_n_feats

            else:

                temp_n_feats = self.select_n_feats
                if len(self.nominal_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                # NOTE(review): same `k=` TypeError path as pearson_feats
                # above; also f_classif/mutual_info_regression with a
                # non-categorical target looks inconsistent — confirm intent.
                self.f_feats = self.get_kbest(
                    X=self.X, feats_list=self.nominal_feats, metric=f_classif, k=self.select_n_feats)
                self.mi_feats = self.get_kbest(
                    X=self.X, feats_list=self.nominal_feats, metric=mutual_info_regression, k=self.select_n_feats)

                self.select_n_feats = temp_n_feats

                # # self.f_feats = f_classif(self.X[self.nominal_feats], self.y)[0]
                # self.f_feats = SelectKBest(f_classif, k=self.select_n_feats).fit_transform(self.X[self.nominal_feats], self.y).columns

                # # self.mi_feats = mutual_info_regression(self.X[self.nominal_feats], self.y)
                # self.mi_feats = SelectKBest(mutual_info_regression, k=self.select_n_feats).fit_transform(self.X[self.nominal_feats], self.y).columns

                self.selected_nominal_feats = []
                self.selected_nominal_feats.extend(self.f_feats)
                self.selected_nominal_feats.extend(self.mi_feats)
                # self.selected_nominal_feats = list(set(self.selected_nominal_feats))

        else:

            self.selected_nominal_feats = []

        if self.model is not None:
            # np.int = np.int32
            # np.float = np.float64
            # np.bool = np.bool_
            # Boruta only for tree ensembles; RFE for any model that exposes
            # feature importances/coefs (i.e. everything except SVC).
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.boruta_feats = self.get_boruta_feats()
            if not isinstance(self.model, SVC):
                self.rfe_feats = self.get_rfe_feats()
        else:
            self.boruta_feats = []
            self.rfe_feats = []

        # Majority vote: a feature must be chosen by at least 2 selectors
        # (univariate + model-based combined) to survive its group.
        if len(self.selected_num_feats) != 0:
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.selected_num_feats.extend(self.boruta_feats)
            if not isinstance(self.model, SVC):
                self.selected_num_feats.extend(self.rfe_feats)
            num_feats_dict = dict(Counter(self.selected_num_feats))
            self.selected_num_feats = [
                i for i in num_feats_dict if num_feats_dict[i] >= 2]

        if len(self.selected_ordinal_feats) != 0:
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.selected_ordinal_feats.extend(self.boruta_feats)
            if not isinstance(self.model, SVC):
                self.selected_ordinal_feats.extend(self.rfe_feats)
            ordinal_feats_dict = dict(Counter(self.selected_ordinal_feats))
            self.selected_ordinal_feats = [
                i for i in ordinal_feats_dict if ordinal_feats_dict[i] >= 2]

        if len(self.selected_nominal_feats) != 0:
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.selected_nominal_feats.extend(self.boruta_feats)
            if not isinstance(self.model, SVC):
                self.selected_nominal_feats.extend(self.rfe_feats)
            nominal_feats_dict = dict(Counter(self.selected_nominal_feats))
            self.selected_nominal_feats = [
                i for i in nominal_feats_dict if nominal_feats_dict[i] >= 2]

        # Final result: union of the surviving groups, plus Boruta picks for
        # tree models, de-duplicated (order is not preserved by set()).
        self.selected_feats = []
        self.selected_feats.extend(self.selected_num_feats)
        self.selected_feats.extend(self.selected_ordinal_feats)
        self.selected_feats.extend(self.selected_nominal_feats)
        if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
            self.selected_feats.extend(self.boruta_feats)
        self.selected_feats = list(set(self.selected_feats))

        # self.selected_feats = self.get_shap_feats(self.selected_feats)

        return self.selected_feats
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
def create_feature_selection_dict(data, cv_fold_list, numerical_features, nominal_features):
    '''
    Returns feature selection dictionary for 4 different models
    (RandomForest, XGBoost, LogisticRegression, SVC).

    For each model: runs FSelector to pick features, refines the pick with
    SHAP values, retrains on the SHAP-selected subset and records train /
    validation accuracy, F1 and ROC-AUC.

    NOTE(review): this function extends the caller's `numerical_features`
    list in place with the generated interaction columns ("feat0", ...) —
    confirm whether downstream callers rely on that side effect.

    Args:
        data (pd.DataFrame): train data
        cv_fold_list (list): contains tuples of indices of train and validation data for each fold
        numerical_features (list): contains the names of numerical features
        nominal_features (list): contains the names of nominal features

    Returns:
        dict: contains selected features, train and validation scores, models and scalers used,
            keyed as dict[model_name][fold_number]
    '''

    selected_features_dict = {}

    # Only the first fold is used for feature selection (range(1)).
    for idx in tqdm(range(1)):

        X_train = data.iloc[cv_fold_list[idx][0]].reset_index(drop=True)
        y_train = data.iloc[cv_fold_list[idx][0]
                            ]['Bankrupt?'].to_frame().reset_index(drop=True)

        X_valid = data.iloc[cv_fold_list[idx][1]].reset_index(drop=True)
        y_valid = data.iloc[cv_fold_list[idx][1]
                            ]['Bankrupt?'].to_frame().reset_index(drop=True)

        # Interaction features: each numerical column multiplied by the
        # liability-assets flag, named "feat<i>" by position.
        new_numerical_features = []
        for feat in numerical_features:
            X_train[f"feat{numerical_features.index(feat)}"] = X_train[feat] * \
                X_train[' Liability-Assets Flag']
            X_valid[f"feat{numerical_features.index(feat)}"] = X_valid[feat] * \
                X_valid[' Liability-Assets Flag']
            new_numerical_features.append(
                f"feat{numerical_features.index(feat)}")

        # In-place mutation of the argument (see NOTE in the docstring).
        numerical_features.extend(new_numerical_features)

        # getting categorical features
        categorical_features = nominal_features.copy()

        # getting all features
        all_features = []
        all_features.extend(categorical_features)
        all_features.extend(numerical_features)

        X_train = X_train[all_features]
        X_valid = X_valid[all_features]

        models_list = [RandomForestClassifier(), XGBClassifier(
        ), LogisticRegression(), SVC(probability=True)]
        model_names_list = ['RandomForestClassifier',
                            'XGBClassifier', 'LogisticRegression', 'SVC']

        for model_idx in tqdm(range(len(model_names_list))):

            model_name = model_names_list[model_idx]

            selected_features_dict[model_name] = {}

            # feature selection
            model = models_list[model_idx]

            # Scale-sensitive models get standardized numerical columns;
            # tree models use the raw data.
            if isinstance(model, LogisticRegression) or isinstance(model, SVC):

                scaler = StandardScaler()

                X_train2 = scaler.fit_transform(X_train[numerical_features])
                X_train2 = pd.DataFrame(X_train2, columns=numerical_features)
                X_train2 = pd.concat(
                    [X_train2, X_train[categorical_features]], axis=1)

                fselector = FSelector(
                    X=X_train2,
                    y=y_train,
                    num_feats=numerical_features,
                    ordinal_feats=None,
                    nominal_feats=nominal_features,
                    model=model
                )

            else:

                fselector = FSelector(
                    X=X_train,
                    y=y_train,
                    num_feats=numerical_features,
                    ordinal_feats=None,
                    nominal_feats=nominal_features,
                    model=model
                )

            selected_features = fselector.get_features()

            # Nothing survived the selection vote: skip this model entirely.
            if len(selected_features) == 0:
                continue

            # selecting features using shap values
            if isinstance(model, LogisticRegression) or isinstance(model, SVC):

                # Reuse the scaler fitted on train for the validation fold.
                X_valid2 = scaler.transform(X_valid[numerical_features])
                X_valid2 = pd.DataFrame(X_valid2, columns=numerical_features)
                X_valid2 = pd.concat(
                    [X_valid2, X_valid[categorical_features]], axis=1)

                X_train_filtered = X_train2[selected_features]
                X_valid_filtered = X_valid2[selected_features]

            else:

                X_train_filtered = X_train[selected_features]
                X_valid_filtered = X_valid[selected_features]

            # model training using selected features
            model.fit(X_train_filtered, y_train)

            explainer = shap.Explainer(
                model.predict,
                X_train_filtered,
                # max_evals = int(2 * X_train_filtered.shape[1] + 1),
                # verbose=0
            )
            shap_values = explainer(X_train_filtered)
            selected_shap_features = get_shap_features(
                shap_values,
                features=list(X_train_filtered.columns),
                topk=10
            )

            # model training using shap features (fresh, unfitted instance)
            model = models_list[model_idx]
            model.fit(X_train_filtered[selected_shap_features], y_train)

            # metric calculation
            y_train_pred = model.predict(
                X_train_filtered[selected_shap_features])
            y_train_pred_prob = model.predict_proba(
                X_train_filtered[selected_shap_features])[:, 1]

            y_valid_pred = model.predict(
                X_valid_filtered[selected_shap_features])
            y_valid_pred_prob = model.predict_proba(
                X_valid_filtered[selected_shap_features])[:, 1]

            train_acc = accuracy_score(y_train, y_train_pred)
            train_f1 = f1_score(y_train, y_train_pred)
            train_roc_auc = roc_auc_score(y_train, y_train_pred_prob)

            valid_acc = accuracy_score(y_valid, y_valid_pred)
            valid_f1 = f1_score(y_valid, y_valid_pred)
            valid_roc_auc = roc_auc_score(y_valid, y_valid_pred_prob)

            # Fold numbers are 1-based in the output dictionary.
            selected_features_dict[model_name][idx+1] = {}
            selected_features_dict[model_name][idx +
                                               1]['selected_feats'] = selected_features
            selected_features_dict[model_name][idx +
                                               1]['selected_shap_feats'] = selected_shap_features
            selected_features_dict[model_name][idx+1]['train_acc'] = train_acc
            selected_features_dict[model_name][idx+1]['train_f1'] = train_f1
            selected_features_dict[model_name][idx +
                                               1]['train_roc_auc'] = train_roc_auc
            selected_features_dict[model_name][idx+1]['valid_acc'] = valid_acc
            selected_features_dict[model_name][idx+1]['valid_f1'] = valid_f1
            selected_features_dict[model_name][idx +
                                               1]['valid_roc_auc'] = valid_roc_auc
            selected_features_dict[model_name][idx+1]['model'] = model
            if isinstance(model, LogisticRegression) or isinstance(model, SVC):
                selected_features_dict[model_name][idx+1]['scaler'] = scaler

            # print(f"##### {model_name} #####")
            # print(f"Selected features: {selected_features}")
            # print("Train:")
            # print(f"Accuracy: {train_acc:.5f}, F1: {train_f1:.5f}, ROC-AUC: {train_roc_auc:.5f}")
            # print("Validation:")
            # print(f"Accuracy: {valid_acc:.5f}, F1: {valid_f1:.5f}, ROC-AUC: {valid_roc_auc:.5f}")

            logging.info(f"##### {model_name} #####")
            logging.info(f"Selected features: {selected_features}")
            logging.info('Train:')
            logging.info(
                f"Accuracy: {train_acc:.5f}, F1: {train_f1:.5f}, ROC-AUC: {train_roc_auc:.5f}")
            logging.info('Validation:')
            logging.info(
                f"Accuracy: {valid_acc:.5f}, F1: {valid_f1:.5f}, ROC-AUC: {valid_roc_auc:.5f}")

        # Free the fold's frames before the next iteration.
        del X_train, y_train, X_valid, y_valid, X_train_filtered, X_valid_filtered, model
        gc.collect()

    return selected_features_dict
|
| 602 |
+
|
| 603 |
+
|
| 604 |
+
def get_mean_ensemble_prediction(prob_list):
    '''Average a list of per-model probability arrays element-wise.

    Args:
        prob_list (list): 1-D arrays of predicted probabilities, one per model

    Returns:
        np.ndarray: the element-wise mean across models
    '''
    stacked = np.vstack(prob_list)
    return stacked.mean(axis=0)
|
| 607 |
+
|
| 608 |
+
|
| 609 |
+
class OptimizeAUC:
    '''Learns per-model blending weights that maximize ROC-AUC of the
    weighted sum of model predictions.'''

    def __init__(self):
        # Blending weights; populated by fit().
        self.coef_ = 0

    def _auc(self, coef, X, y):
        '''Negative ROC-AUC of the weighted predictions (minimization target).'''
        blended = np.sum(X * coef, axis=1)
        return -roc_auc_score(y, blended)

    def fit(self, X, y):
        '''Optimize the weights with scipy's fmin, starting from a random
        point on the simplex (Dirichlet draw).'''
        objective = partial(self._auc, X=X, y=y)
        start = np.random.dirichlet(np.ones(X.shape[1]), size=1)
        self.coef_ = fmin(objective, start, disp=True)

    def predict(self, X):
        '''Return the weighted sum of the column predictions.'''
        return np.sum(X * self.coef_, axis=1)
|
| 628 |
+
|
| 629 |
+
|
| 630 |
+
def get_optimized_ensemble(train_df, test_df, cv_fold_list, selected_features_dict, trained_models_dict, numerical_features):
    '''
    Finds the optimized weights for ensembling using the train data and evaluates it on test data.

    Per fold: retrains each of the four models (with the hyper-parameters
    found by GridSearchCV in trained_models_dict) on the fold's train split,
    fits OptimizeAUC on the fold's validation probabilities, and applies the
    learned weights to the test-set probabilities. The final test prediction
    is the mean of the per-fold weighted predictions.

    Args:
        train_df (pd.DataFrame): train data
        test_df (pd.DataFrame): test data
        cv_fold_list (list): contains tuples of indices of train and validation data for each fold
        selected_features_dict (dict): selected features dictionary where keys are models' names
        trained_models_dict (dict): trained models dictionary where keys are models' names
        numerical_features (list): contains the names of numerical features

    Returns:
        dict: contains all optimized weights (and fitted models) for each fold
        float: ROC-AUC score on the test set
    '''

    opt_dict = {}

    test_preds_list = []
    # valid_preds_list = []

    # Test feature frames are fixed across folds (fold 1's SHAP selection).
    X_test_rf = test_df[selected_features_dict['RandomForestClassifier']
                        [1]['selected_shap_feats']]
    X_test_xgb = test_df[selected_features_dict['XGBClassifier']
                         [1]['selected_shap_feats']]
    X_test_lr = test_df[selected_features_dict['LogisticRegression']
                        [1]['selected_shap_feats']]
    X_test_svc = test_df[selected_features_dict['SVC']
                         [1]['selected_shap_feats']]

    y_test = test_df['Bankrupt?'].to_frame()

    for idx in range(len(cv_fold_list)):

        logging.info(f'Starting calculations for Fold {idx+1}')

        X_train = train_df.iloc[cv_fold_list[idx][0]].reset_index(drop=True)
        y_train = train_df.iloc[cv_fold_list[idx][0]
                                ]['Bankrupt?'].to_frame().reset_index(drop=True)

        X_valid = train_df.iloc[cv_fold_list[idx][1]].reset_index(drop=True)
        y_valid = train_df.iloc[cv_fold_list[idx][1]
                                ]['Bankrupt?'].to_frame().reset_index(drop=True)

        # RandomForest: retrain with the tuned hyper-parameters on this fold.
        logging.info('Starting RandomForest calculations')
        rf_selected_features = selected_features_dict['RandomForestClassifier'][1]['selected_shap_feats']
        X_train_rf = X_train[rf_selected_features]
        X_valid_rf = X_valid[rf_selected_features]

        rf_gscv = trained_models_dict['RandomForestClassifier']

        rfm = RandomForestClassifier(**rf_gscv.best_params_)
        rfm.fit(X_train_rf, y_train)
        rfm_valid_probs = rfm.predict_proba(X_valid_rf)[:, 1]

        rfm_test_probs = rfm.predict_proba(X_test_rf)[:, 1]
        logging.info('RandomForest calculations completed')

        # XGBoost
        logging.info('Starting XGBoost calculations')
        xgb_selected_features = selected_features_dict['XGBClassifier'][1]['selected_shap_feats']
        X_train_xgb = X_train[xgb_selected_features]
        X_valid_xgb = X_valid[xgb_selected_features]

        xgb_gscv = trained_models_dict['XGBClassifier']

        xgbm = XGBClassifier(**xgb_gscv.best_params_)
        xgbm.fit(X_train_xgb, y_train)
        xgbm_valid_probs = xgbm.predict_proba(X_valid_xgb)[:, 1]
        xgbm_test_probs = xgbm.predict_proba(X_test_xgb)[:, 1]
        logging.info('XGBoost calculations completed')

        # LogisticRegression: the GridSearch was run on a Pipeline, so the
        # 'model__' prefix must be stripped from the tuned parameter names.
        logging.info('Starting LogisticRegression calculations')
        lr_selected_features = selected_features_dict['LogisticRegression'][1]['selected_shap_feats']
        X_train_lr = X_train[lr_selected_features]
        X_valid_lr = X_valid[lr_selected_features]

        lr_gscv = trained_models_dict['LogisticRegression']

        lr_params = {k.replace('model__', ''): v for k,
                     v in lr_gscv.best_params_.items()}
        selected_shap_features = selected_features_dict['LogisticRegression'][1]['selected_shap_feats']
        num_feat = [
            col for col in selected_shap_features if col in numerical_features]
        num_trans = Pipeline([('scale', StandardScaler())])
        preprocessor = ColumnTransformer(
            transformers=[('num', num_trans, num_feat)], remainder='passthrough')
        lrm = Pipeline(
            [
                ('preproc', preprocessor),
                ('lr', LogisticRegression(**lr_params))
            ]
        )
        lrm.fit(X_train_lr, y_train)
        lrm_valid_probs = lrm.predict_proba(X_valid_lr)[:, 1]
        lrm_test_probs = lrm.predict_proba(X_test_lr)[:, 1]
        logging.info('LogisticRegression calculations completed')

        # SVC: same scaling pipeline pattern as LogisticRegression.
        logging.info('Starting SVC calculations')
        svc_selected_features = selected_features_dict['SVC'][1]['selected_shap_feats']
        X_train_svc = X_train[svc_selected_features]
        X_valid_svc = X_valid[svc_selected_features]

        svc_gscv = trained_models_dict['SVC']

        svc_params = {k.replace('model__', ''): v for k,
                      v in svc_gscv.best_params_.items()}
        selected_shap_features = selected_features_dict['SVC'][1]['selected_shap_feats']
        num_feat = [
            col for col in selected_shap_features if col in numerical_features]
        num_trans = Pipeline([('scale', StandardScaler())])
        preprocessor = ColumnTransformer(
            transformers=[('num', num_trans, num_feat)], remainder='passthrough')
        svcm = Pipeline(
            [
                ('preproc', preprocessor),
                ('svc', SVC(probability=True, **svc_params))
            ]
        )
        svcm.fit(X_train_svc, y_train)
        svcm_valid_probs = svcm.predict_proba(X_valid_svc)[:, 1]
        svcm_test_probs = svcm.predict_proba(X_test_svc)[:, 1]
        logging.info('SVC calculations completed')

        # Learn blending weights on the fold's validation probabilities.
        logging.info('Optimizing Ensemble weights')
        valid_preds = np.column_stack([
            rfm_valid_probs,
            xgbm_valid_probs,
            lrm_valid_probs,
            svcm_valid_probs
        ])

        opt = OptimizeAUC()
        opt.fit(valid_preds, y_valid)
        opt_dict[idx] = {}
        opt_dict[idx]['opt'] = opt
        opt_dict[idx]['rfm'] = rfm
        opt_dict[idx]['xgbm'] = xgbm
        opt_dict[idx]['lrm'] = lrm
        opt_dict[idx]['svcm'] = svcm
        logging.info('Optimization finished')

        # valid_preds_list.append(opt.predict(valid_preds))

        logging.info('Calculating predictions for test set')
        test_preds = np.column_stack([
            rfm_test_probs,
            xgbm_test_probs,
            lrm_test_probs,
            svcm_test_probs
        ])

        test_preds_list.append(opt.predict(test_preds))
        logging.info('Test set predictions calculated')

    # Average the per-fold weighted predictions and score on the test set.
    logging.info('Getting the score for test set')
    opt_y_test_pred_prob = np.mean(np.column_stack(test_preds_list), axis=1)
    opt_test_roc_auc = roc_auc_score(y_test, opt_y_test_pred_prob)
    logging.info('Test score calculated')

    return (opt_dict, opt_test_roc_auc)
|
| 795 |
+
|
| 796 |
+
|
| 797 |
+
def find_optimal_model(train_df, test_df, features_dict_path, cv_fold_list, numerical_features):
    '''
    Finds the best model for the train data and evaluates it on test data.

    Tunes each of the four base models with GridSearchCV (scored by ROC-AUC
    over cv_fold_list), then compares the individual models against three
    ensembles (average, rank, and AUC-optimized weights) on the TEST set and
    records the winner under trained_models_dict['best_model_name'].

    NOTE(review): model selection here uses test-set ROC-AUC, so the reported
    best score is not an unbiased estimate — confirm this is intentional.

    Args:
        train_df (pd.DataFrame): train data
        test_df (pd.DataFrame): test data
        features_dict_path (str): path to selected features dictionary
        cv_fold_list (list): contains tuples of indices of train and validation data for each fold
        numerical_features (list): contains the names of numerical features

    Returns:
        dict: contains all trained models and the name of the best model
        dict: contains all optimized weights of ensembling for each fold
    '''
    logging.info('Loading selected features dictionary')
    selected_features_dict = load_object(file_path=features_dict_path)
    logging.info('Selected features dictionary loaded')

    models_list = [RandomForestClassifier(), XGBClassifier(),
                   LogisticRegression(), SVC(probability=True)]
    model_names_list = ['RandomForestClassifier',
                        'XGBClassifier', 'LogisticRegression', 'SVC']
    # Hyper-parameter grids, positionally aligned with models_list.
    # The 'model__' prefix targets the Pipeline step used for LR/SVC.
    model_params_list = [
        {
            'n_estimators': [5, 10, 15, 25, 50, 100, 120, 300, 500],
            'max_depth': [2, 3, 5, 8, 15, 25, 30, None]
        },
        {
            'eta': [0.01, 0.015, 0.025, 0.05, 0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9],
            'max_depth': [3, 5, 6, 7, 9, 12, 15, 17, 25],
            'n_estimators': [50, 100, 150, 200, 500, 1000]
        },
        {'model__penalty': ['l1', 'l2'], 'model__C': [
            0.001, 0.01, 0.1, 1, 10, 100, 1000]},
        {'model__C': [1, 10, 100, 1000], 'model__gamma': [
            1, 0.1, 0.001, 0.0001], 'model__kernel': ['linear', 'rbf']}
    ]

    trained_models_dict = {}

    best_score = 0
    best_model_name = None


    y_train = train_df['Bankrupt?'].to_frame()
    y_test = test_df['Bankrupt?'].to_frame()

    y_train_pred_prob_list = []
    y_test_pred_prob_list = []
    rank_ensemble_list = []

    for model_idx in tqdm(range(len(model_names_list))):

        # y_train_pred_prob = np.zeros(X_train.shape)

        model_name = model_names_list[model_idx]

        # Feature subset chosen by SHAP on fold 1 during feature selection.
        selected_shap_features = selected_features_dict[model_name][1]['selected_shap_feats']

        X_train = train_df[selected_shap_features]
        X_test = test_df[selected_shap_features]

        logging.info(f'Starting {model_name} training')
        params_dict = model_params_list[model_idx]

        model = models_list[model_idx]

        # Scale-sensitive models are wrapped in a scaling Pipeline so the
        # grid keys carry the 'model__' prefix.
        if isinstance(model, LogisticRegression) or isinstance(model, SVC):
            num_feat = [
                col for col in selected_shap_features if col in numerical_features]
            num_trans = Pipeline([('scale', StandardScaler())])
            preprocessor = ColumnTransformer(
                transformers=[('num', num_trans, num_feat)], remainder='passthrough')
            pipe = Pipeline(
                [
                    ('preproc', preprocessor),
                    ('model', model)
                ]
            )

            model_gscv = GridSearchCV(
                pipe,
                param_grid=params_dict,
                scoring='roc_auc',
                cv=cv_fold_list,
                n_jobs=-1,
                verbose=4
            )
        else:
            model_gscv = GridSearchCV(
                model,
                param_grid=params_dict,
                scoring='roc_auc',
                cv=cv_fold_list,
                n_jobs=-1,
                verbose=4
            )

        model_gscv.fit(X_train, y_train)
        logging.info(f'{model_name} training finished')

        trained_models_dict[model_name] = model_gscv

        # (name, CV score) pairs feed the rank ensemble below.
        rank_ensemble_list.append((model_name, model_gscv.best_score_))

        # for train_idxs, valid_idxs in cv_fold_list:
        #     temp_model = models_list[model_idx]
        #     y_train_pred_prob[valid_idxs, :] = model_gscv.predict_proba(X_train[valid_idxs, :])[:, 1]
        # y_train_pred_prob_list.append(y_train_pred_prob)

        logging.info('Getting ROC-AUC for test set')
        y_test_pred_prob = model_gscv.predict_proba(X_test)[:, 1]
        y_test_pred_prob_list.append(y_test_pred_prob)
        test_roc_auc = roc_auc_score(y_test, y_test_pred_prob)
        logging.info(
            f'{model_name}: Validation score = {model_gscv.best_score_:.4f}, Test score = {test_roc_auc:.4f}')

        if test_roc_auc > best_score:
            best_score = test_roc_auc
            best_model_name = model_name

    logging.info('Getting Average Ensemble score')
    # avg_ens_y_train_pred_prob = get_mean_ensemble_prediction(y_train_pred_prob_list)
    # avg_ens_train_roc_auc = roc_auc_score(y_test, avg_ens_y_train_pred_prob)

    avg_ens_y_test_pred_prob = get_mean_ensemble_prediction(
        y_test_pred_prob_list)
    avg_ens_test_roc_auc = roc_auc_score(y_test, avg_ens_y_test_pred_prob)
    logging.info(f'Average Ensemble: Test score = {avg_ens_test_roc_auc:.4f}')
    # logging.info(f'Average Ensemble: Validation score = {avg_ens_train_roc_auc:.4f}, Test score = {avg_ens_test_roc_auc:.4f}')

    if avg_ens_test_roc_auc > best_score:
        best_score = avg_ens_test_roc_auc
        best_model_name = 'Average Ensemble'

    # Rank ensemble: models sorted ascending by CV score, weighted by rank
    # (worst gets weight 1), normalized by the sum of ranks.
    logging.info('Getting Rank Ensemble score')
    rank_ensemble_list = sorted(rank_ensemble_list, key=lambda x: x[1])

    # rank_ens_y_train_pred_prob = 0
    rank_ens_y_test_pred_prob = 0
    for i in range(len(rank_ensemble_list)):
        # rank_ens_y_train_pred_prob += (i+1) * y_train_pred_prob_list[model_names_list.index(rank_ensemble_list[i][0])]
        rank_ens_y_test_pred_prob += (
            i+1) * y_test_pred_prob_list[model_names_list.index(rank_ensemble_list[i][0])]
    # rank_ens_y_train_pred_prob /= len(rank_ensemble_list) * (1+ len(rank_ensemble_list)) / 2
    rank_ens_y_test_pred_prob /= len(rank_ensemble_list) * \
        (1 + len(rank_ensemble_list)) / 2
    rank_ens_test_roc_auc = roc_auc_score(y_test, rank_ens_y_test_pred_prob)

    logging.info(f'Rank Ensemble: Test score = {rank_ens_test_roc_auc:.4f}')
    # logging.info(f'Rank Ensemble: Validation score = {rank_ens_y_train_pred_prob:.4f}, Test score = {rank_ens_y_test_pred_prob:.4f}')

    if rank_ens_test_roc_auc > best_score:
        best_score = rank_ens_test_roc_auc
        best_model_name = 'Rank Ensemble'

    logging.info('Getting Optimized Ensemble score')
    opt_dict, opt_test_roc_auc = get_optimized_ensemble(
        train_df,
        test_df,
        cv_fold_list,
        selected_features_dict,
        trained_models_dict,
        numerical_features
    )

    logging.info(f'Optimized Ensemble: Test score = {opt_test_roc_auc:.4f}')

    if opt_test_roc_auc > best_score:
        best_score = opt_test_roc_auc
        best_model_name = 'Optimized Ensemble'

    trained_models_dict['best_model_name'] = best_model_name

    logging.info(f'{best_model_name} is the best model')

    return (trained_models_dict, opt_dict)
|
requirements.txt
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
alembic==1.13.1
|
| 2 |
+
altair==5.3.0
|
| 3 |
+
aniso8601==9.0.1
|
| 4 |
+
annotated-types==0.6.0
|
| 5 |
+
anyio==4.3.0
|
| 6 |
+
appdirs==1.4.4
|
| 7 |
+
asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1698341106958/work
|
| 8 |
+
attrs==23.2.0
|
| 9 |
+
blinker==1.7.0
|
| 10 |
+
Boruta==0.3
|
| 11 |
+
BorutaShap==1.0.17
|
| 12 |
+
cachetools==5.3.3
|
| 13 |
+
certifi==2024.2.2
|
| 14 |
+
charset-normalizer==3.3.2
|
| 15 |
+
click==8.1.7
|
| 16 |
+
cloudpickle==3.0.0
|
| 17 |
+
colorama @ file:///home/conda/feedstock_root/build_artifacts/colorama_1666700638685/work
|
| 18 |
+
comm @ file:///home/conda/feedstock_root/build_artifacts/comm_1710320294760/work
|
| 19 |
+
-e git+https://github.com/VaheC/CompanyBankruptcy.git@0c9aba9c454511775cdf83313b15ca93d56c3356#egg=CompanyBankruptcy
|
| 20 |
+
contourpy==1.2.1
|
| 21 |
+
cycler==0.12.1
|
| 22 |
+
debugpy @ file:///C:/b/abs_c0y1fjipt2/croot/debugpy_1690906864587/work
|
| 23 |
+
decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1641555617451/work
|
| 24 |
+
Deprecated==1.2.14
|
| 25 |
+
distro==1.9.0
|
| 26 |
+
dnspython==1.16.0
|
| 27 |
+
docker==7.1.0
|
| 28 |
+
dynaconf==3.2.5
|
| 29 |
+
ensure==1.0.2
|
| 30 |
+
entrypoints==0.4
|
| 31 |
+
et-xmlfile==1.1.0
|
| 32 |
+
evidently==0.4.22
|
| 33 |
+
exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1704921103267/work
|
| 34 |
+
executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1698579936712/work
|
| 35 |
+
Faker==25.2.0
|
| 36 |
+
filelock==3.14.0
|
| 37 |
+
Flask==3.0.3
|
| 38 |
+
fonttools==4.51.0
|
| 39 |
+
from-root==1.3.0
|
| 40 |
+
fsspec==2024.3.1
|
| 41 |
+
gitdb==4.0.11
|
| 42 |
+
GitPython==3.1.43
|
| 43 |
+
graphene==3.3
|
| 44 |
+
graphql-core==3.2.3
|
| 45 |
+
graphql-relay==3.2.0
|
| 46 |
+
greenlet==3.0.3
|
| 47 |
+
h11==0.14.0
|
| 48 |
+
httpcore==1.0.5
|
| 49 |
+
httptools==0.6.1
|
| 50 |
+
httpx==0.27.0
|
| 51 |
+
idna==3.6
|
| 52 |
+
imbalanced-learn==0.12.2
|
| 53 |
+
imblearn==0.0
|
| 54 |
+
importlib-metadata==6.11.0
|
| 55 |
+
ipykernel @ file:///D:/bld/ipykernel_1708996677248/work
|
| 56 |
+
ipython @ file:///D:/bld/ipython_1709559926914/work
|
| 57 |
+
iterative-telemetry==0.0.8
|
| 58 |
+
itsdangerous==2.2.0
|
| 59 |
+
jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1696326070614/work
|
| 60 |
+
Jinja2==3.1.3
|
| 61 |
+
joblib==1.4.0
|
| 62 |
+
jsonschema==4.21.1
|
| 63 |
+
jsonschema-specifications==2023.12.1
|
| 64 |
+
jupyter_client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1710255804825/work
|
| 65 |
+
jupyter_core @ file:///D:/bld/jupyter_core_1710257272359/work
|
| 66 |
+
kiwisolver==1.4.5
|
| 67 |
+
lightgbm==4.3.0
|
| 68 |
+
litestar==2.8.3
|
| 69 |
+
llvmlite==0.42.0
|
| 70 |
+
Mako==1.3.5
|
| 71 |
+
Markdown==3.6
|
| 72 |
+
markdown-it-py==3.0.0
|
| 73 |
+
MarkupSafe==2.1.5
|
| 74 |
+
matplotlib==3.8.4
|
| 75 |
+
matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1713250518406/work
|
| 76 |
+
mdurl==0.1.2
|
| 77 |
+
mlflow==2.13.0
|
| 78 |
+
msgspec==0.18.6
|
| 79 |
+
multidict==6.0.5
|
| 80 |
+
mypy-extensions==1.0.0
|
| 81 |
+
nest_asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1705850609492/work
|
| 82 |
+
nltk==3.8.1
|
| 83 |
+
numba==0.59.1
|
| 84 |
+
numpy==1.26.4
|
| 85 |
+
openpyxl==3.1.2
|
| 86 |
+
opentelemetry-api==1.24.0
|
| 87 |
+
opentelemetry-sdk==1.24.0
|
| 88 |
+
opentelemetry-semantic-conventions==0.45b0
|
| 89 |
+
packaging==23.2
|
| 90 |
+
pandas==2.2.1
|
| 91 |
+
parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1712320355065/work
|
| 92 |
+
patsy==0.5.6
|
| 93 |
+
pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work
|
| 94 |
+
pillow==10.3.0
|
| 95 |
+
platformdirs @ file:///home/conda/feedstock_root/build_artifacts/platformdirs_1706713388748/work
|
| 96 |
+
plotly==5.22.0
|
| 97 |
+
polyfactory==2.16.0
|
| 98 |
+
prompt-toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1702399386289/work
|
| 99 |
+
protobuf==4.25.3
|
| 100 |
+
psutil @ file:///C:/Windows/Temp/abs_b2c2fd7f-9fd5-4756-95ea-8aed74d0039flsd9qufz/croots/recipe/psutil_1656431277748/work
|
| 101 |
+
pure-eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1642875951954/work
|
| 102 |
+
pyarrow==15.0.2
|
| 103 |
+
pydantic==2.7.1
|
| 104 |
+
pydantic_core==2.18.2
|
| 105 |
+
pydeck==0.8.1b0
|
| 106 |
+
Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1700607939962/work
|
| 107 |
+
pymongo==4.7.2
|
| 108 |
+
pyparsing==3.1.2
|
| 109 |
+
python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1709299778482/work
|
| 110 |
+
python-dotenv==1.0.1
|
| 111 |
+
pytz==2024.1
|
| 112 |
+
pywin32==305.1
|
| 113 |
+
PyYAML==6.0.1
|
| 114 |
+
pyzmq @ file:///C:/b/abs_89aq69t0up/croot/pyzmq_1705605705281/work
|
| 115 |
+
querystring-parser==1.2.4
|
| 116 |
+
referencing==0.34.0
|
| 117 |
+
regex==2024.5.10
|
| 118 |
+
requests==2.31.0
|
| 119 |
+
rich==13.7.1
|
| 120 |
+
rich-click==1.8.1
|
| 121 |
+
rpds-py==0.18.0
|
| 122 |
+
scikit-learn==1.4.2
|
| 123 |
+
scipy==1.13.0
|
| 124 |
+
seaborn==0.13.2
|
| 125 |
+
shap==0.45.0
|
| 126 |
+
shellingham==1.5.4
|
| 127 |
+
six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work
|
| 128 |
+
slicer==0.0.7
|
| 129 |
+
smmap==5.0.1
|
| 130 |
+
sniffio==1.3.1
|
| 131 |
+
SQLAlchemy==2.0.30
|
| 132 |
+
sqlparse==0.5.0
|
| 133 |
+
stack-data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1669632077133/work
|
| 134 |
+
statsmodels==0.14.2
|
| 135 |
+
streamlit==1.28.0
|
| 136 |
+
tenacity==8.2.3
|
| 137 |
+
threadpoolctl==3.5.0
|
| 138 |
+
toml==0.10.2
|
| 139 |
+
toolz==0.12.1
|
| 140 |
+
tornado @ file:///D:/bld/tornado_1656937966227/work
|
| 141 |
+
tqdm==4.66.2
|
| 142 |
+
traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1713535121073/work
|
| 143 |
+
typer==0.12.3
|
| 144 |
+
typing-inspect==0.9.0
|
| 145 |
+
typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1712329955671/work
|
| 146 |
+
tzdata==2024.1
|
| 147 |
+
tzlocal==5.2
|
| 148 |
+
ujson==5.10.0
|
| 149 |
+
urllib3==2.2.1
|
| 150 |
+
uvicorn==0.29.0
|
| 151 |
+
validators==0.28.3
|
| 152 |
+
waitress==3.0.0
|
| 153 |
+
watchdog==4.0.0
|
| 154 |
+
watchfiles==0.21.0
|
| 155 |
+
wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1704731205417/work
|
| 156 |
+
websockets==12.0
|
| 157 |
+
Werkzeug==3.0.3
|
| 158 |
+
wrapt==1.16.0
|
| 159 |
+
xgboost==2.0.3
|
| 160 |
+
zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1695255097490/work
|