Upload 12 files
Browse files- .streamlit/config.toml +7 -0
- app.py +166 -53
- auto_optimizer.py +361 -317
- best_tts.py +2 -2
- eda.py +325 -0
- feature_selections.py +6 -6
- grid_search_cv.py +284 -0
- models.py +2 -0
- requirements.txt +5 -4
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
[theme]
|
| 3 |
+
primaryColor="#F63366"
|
| 4 |
+
backgroundColor="#002148"
|
| 5 |
+
secondaryBackgroundColor="#576c86"
|
| 6 |
+
textColor="white"
|
| 7 |
+
font="serif"
|
app.py
CHANGED
|
@@ -8,6 +8,7 @@ import evaluationer,models, null_value_handling
|
|
| 8 |
import auto_optimizer
|
| 9 |
from sklearn.experimental import enable_iterative_imputer
|
| 10 |
from sklearn.impute import SimpleImputer, IterativeImputer
|
|
|
|
| 11 |
# st.set_page_config(layout="wide")
|
| 12 |
|
| 13 |
st.set_page_config(
|
|
@@ -21,7 +22,23 @@ st.set_page_config(
|
|
| 21 |
}
|
| 22 |
)
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
# Title with Rainbow Transition Effect and Neon Glow
|
| 27 |
html_code = """
|
|
@@ -67,23 +84,74 @@ html_code = """
|
|
| 67 |
"""
|
| 68 |
|
| 69 |
st.markdown(html_code, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
|
|
|
|
| 72 |
# file uploader
|
| 73 |
csv_upload = st.sidebar.file_uploader("Input CSV File for ML modelling", type=['csv'])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
csv_upload2 = st.sidebar.file_uploader("Input CSV File of Test Data Prediction",type = ["csv"])
|
|
|
|
| 75 |
test = pd.DataFrame()
|
| 76 |
if csv_upload is not None:
|
| 77 |
# read the uploaded file into dataframe
|
| 78 |
-
df = pd.read_csv(csv_upload)
|
| 79 |
|
| 80 |
# saving the dataframe to a CSV file
|
| 81 |
df.to_csv('csv_upload.csv', index=False)
|
| 82 |
-
st.
|
| 83 |
-
|
| 84 |
if csv_upload2 is not None:
|
| 85 |
-
test = pd.read_csv(csv_upload2)
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
| 87 |
submission_id = test[id_col]
|
| 88 |
# st.write("Train File upl",submission_id)
|
| 89 |
|
|
@@ -93,8 +161,10 @@ if csv_upload is not None:
|
|
| 93 |
if len(test) >0:
|
| 94 |
# saving the test dataframe to a CSV file
|
| 95 |
test.to_csv('csv_upload_test.csv', index=False)
|
| 96 |
-
|
| 97 |
|
|
|
|
|
|
|
| 98 |
display_train_data = st.radio("Display Train Data",["Yes","No"],index = 1)
|
| 99 |
if display_train_data == "Yes":
|
| 100 |
st.dataframe(df.head())
|
|
@@ -104,29 +174,40 @@ if csv_upload is not None:
|
|
| 104 |
if display_test_data == "Yes":
|
| 105 |
st.dataframe(test.head())
|
| 106 |
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
# Display the selected column
|
| 113 |
st.write('You selected:', selected_column)
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
y = df[selected_column]
|
| 116 |
|
| 117 |
if y.dtype == "O":
|
| 118 |
-
st.
|
| 119 |
-
|
|
|
|
| 120 |
from sklearn.preprocessing import LabelEncoder
|
| 121 |
le = LabelEncoder()
|
| 122 |
y= pd.Series(le.fit_transform(y))
|
| 123 |
-
st.
|
| 124 |
-
|
| 125 |
-
if st.radio("Display Target Column",["Yes","No"],index =1) == "Yes":
|
| 126 |
st.dataframe(y.head())
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
|
|
|
| 130 |
if select_target_trans == "Yes":
|
| 131 |
selected_transformation = st.selectbox("Select Transformation method",["Log Transformation","Power Transformation"])
|
| 132 |
if selected_transformation == "Log Transformation":
|
|
@@ -155,36 +236,52 @@ if csv_upload is not None:
|
|
| 155 |
|
| 156 |
if st.radio("Display Target Column after Transformation",["Yes","No"],index =1) == "Yes":
|
| 157 |
st.dataframe(y.head())
|
| 158 |
-
|
|
|
|
| 159 |
|
| 160 |
X = df.drop(columns = selected_column)
|
| 161 |
|
| 162 |
if st.radio("Display X-Train Data",["Yes","No"],index =1) == "Yes":
|
| 163 |
st.dataframe(X.head())
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
len_duplicates = len(X[X.duplicated()])
|
| 166 |
if len_duplicates >0:
|
| 167 |
st.write(f"There are {len_duplicates} duplicate values in Train")
|
|
|
|
|
|
|
| 168 |
if st.selectbox("Drop Duplicate values",["Yes","No"],index = 1) == "Yes":
|
| 169 |
X = X.drop_duplicates()
|
| 170 |
st.write("Duplicate values removed ✅")
|
| 171 |
else:
|
| 172 |
st.write("There are no duplicate values in Train")
|
|
|
|
| 173 |
# dropping not important columns
|
| 174 |
-
|
|
|
|
| 175 |
selected_drop_column = st.multiselect('Select columns to be dropped', X.columns)
|
| 176 |
X = X.drop(columns = selected_drop_column)
|
| 177 |
if len(test) >0:
|
| 178 |
test = test.drop(columns = selected_drop_column)
|
| 179 |
-
st.write("Un-Important column(s)
|
| 180 |
st.dataframe(X.head())
|
| 181 |
|
|
|
|
| 182 |
num_cols = X.select_dtypes(exclude = "O").columns
|
| 183 |
cat_cols = X.select_dtypes(include = "O").columns
|
| 184 |
st.write("Numerical Columns in Train Data: ", tuple(num_cols))
|
| 185 |
st.write("Categorical Columns in Train Data: ", tuple(cat_cols))
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
ml_cat_ao = st.radio("Select Machine Learning Category",["Regression","Classification"],index =0)
|
| 189 |
|
| 190 |
if ml_cat_ao =="Regression":
|
|
@@ -192,7 +289,7 @@ if csv_upload is not None:
|
|
| 192 |
st.write("Select ML algorithm")
|
| 193 |
reg_model_name = st.selectbox("select model",models.Regression_models.index)
|
| 194 |
reg_model = models.Regression_models.loc[reg_model_name].values[0]
|
| 195 |
-
auto_optimizer.Auto_optimizer(X,y,eva,reg_model)
|
| 196 |
|
| 197 |
elif ml_cat_ao =="Classification":
|
| 198 |
eva = "class"
|
|
@@ -201,10 +298,12 @@ if csv_upload is not None:
|
|
| 201 |
class_model = models.Classification_models.loc[class_model_name].values[0]
|
| 202 |
auto_optimizer.Auto_optimizer(X,y,eva,class_model)
|
| 203 |
|
| 204 |
-
|
| 205 |
else:
|
|
|
|
| 206 |
if X.isnull().sum().sum() >0 :
|
| 207 |
-
|
|
|
|
| 208 |
|
| 209 |
if st.selectbox("Drop null values or Impute",["Drop Null Values","Impute Null Values"],index = 1) == "Drop Null Values":
|
| 210 |
|
|
@@ -241,7 +340,9 @@ if csv_upload is not None:
|
|
| 241 |
|
| 242 |
|
| 243 |
clean_num_nvh_df_cat = pd.DataFrame()
|
|
|
|
| 244 |
if X[cat_cols].isnull().sum().sum() >0:
|
|
|
|
| 245 |
st.write("Categorical Columns with Percentage of Null Values: ")
|
| 246 |
cat_cols_nvh = X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0].index
|
| 247 |
st.dataframe(round(X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0]/len(X)*100,2))
|
|
@@ -270,33 +371,41 @@ if csv_upload is not None:
|
|
| 270 |
null_value_handling.null_handling(X,clean_num_nvh_df,clean_num_nvh_df_cat)
|
| 271 |
st.write("X Data after Null value handling", X.head())
|
| 272 |
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
st.
|
| 278 |
-
|
|
|
|
|
|
|
| 279 |
ord_enc_cols = []
|
| 280 |
|
| 281 |
if len(cat_cols) == 0:
|
| 282 |
st.write("No Categorical Columns in Train")
|
| 283 |
else:
|
| 284 |
-
st.
|
|
|
|
|
|
|
|
|
|
| 285 |
for column in cat_cols:
|
| 286 |
|
| 287 |
selected = st.checkbox(column)
|
| 288 |
if selected:
|
| 289 |
st.write(f"No. of Unique value in {column} column are", X[column].nunique())
|
| 290 |
ord_enc_cols.append(column)
|
|
|
|
| 291 |
ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
|
| 292 |
ohe_enc_cols = list(ohe_enc_cols)
|
| 293 |
if len(ord_enc_cols)>0:
|
| 294 |
st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
|
| 295 |
if len(ohe_enc_cols)>0:
|
| 296 |
st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
|
| 297 |
-
|
|
|
|
| 298 |
if len(ord_enc_cols)>0:
|
| 299 |
-
|
|
|
|
| 300 |
ordinal_order_vals = []
|
| 301 |
|
| 302 |
for column in ord_enc_cols:
|
|
@@ -317,7 +426,7 @@ if csv_upload is not None:
|
|
| 317 |
st.write("Ordinal Encoding Completed ✅")
|
| 318 |
|
| 319 |
if len(ohe_enc_cols)>0:
|
| 320 |
-
if st.
|
| 321 |
from sklearn.preprocessing import OneHotEncoder
|
| 322 |
ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
|
| 323 |
pd.options.mode.chained_assignment = None
|
|
@@ -331,39 +440,43 @@ if csv_upload is not None:
|
|
| 331 |
|
| 332 |
st.write("DataFrame after One Hot Encoding",X.head())
|
| 333 |
st.write("OneHot Encoding Completed ✅")
|
| 334 |
-
|
| 335 |
new_df = pd.concat([X,y],axis = 1)
|
| 336 |
|
| 337 |
csv = new_df.to_csv(index = False)
|
| 338 |
-
if st.
|
| 339 |
st.download_button(label="Download Ordinal Encoded CSV File",data=csv,file_name='Encoded_DataFrame.csv',mime='text/csv')
|
| 340 |
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
"KFoldCV, Default (CV = 5)"], index = 0)== f"Train_Test_split, Default (Random_state = {random_state},Test_size = {test_size})":
|
| 347 |
ttsmethod = "Train_Test_split"
|
| 348 |
else:
|
| 349 |
ttsmethod = "KFoldCV"
|
| 350 |
st.write('You selected:', ttsmethod)
|
| 351 |
if ttsmethod == "Train_Test_split":
|
|
|
|
|
|
|
| 352 |
X_train,X_Val,y_train,y_val = tts(X,y[X.index],random_state = random_state,test_size = test_size)
|
| 353 |
-
st.write('X-Training Data shape:', (X_train.info()))
|
| 354 |
|
| 355 |
st.write('X-Training Data shape:', X_train.shape)
|
| 356 |
st.write('X-Validation Data shape:', X_Val.shape)
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
|
|
|
| 360 |
if ml_cat =="Regression":
|
| 361 |
-
|
|
|
|
|
|
|
|
|
|
| 362 |
|
| 363 |
method = evaluationer.method_df.loc[method_name_selector].values[0]
|
| 364 |
reg_algorithm = []
|
| 365 |
selected_options = []
|
| 366 |
-
|
| 367 |
for option in models.Regression_models.index:
|
| 368 |
selected = st.checkbox(option)
|
| 369 |
if selected:
|
|
@@ -450,7 +563,7 @@ if csv_upload is not None:
|
|
| 450 |
|
| 451 |
cla_algorithm = []
|
| 452 |
selected_options = []
|
| 453 |
-
|
| 454 |
for option in models.Classification_models.index:
|
| 455 |
selected = st.checkbox(option)
|
| 456 |
if selected:
|
|
|
|
| 8 |
import auto_optimizer
|
| 9 |
from sklearn.experimental import enable_iterative_imputer
|
| 10 |
from sklearn.impute import SimpleImputer, IterativeImputer
|
| 11 |
+
import eda
|
| 12 |
# st.set_page_config(layout="wide")
|
| 13 |
|
| 14 |
st.set_page_config(
|
|
|
|
| 22 |
}
|
| 23 |
)
|
| 24 |
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Set the background image
|
| 28 |
+
background_image = """
|
| 29 |
+
<style>
|
| 30 |
+
[data-testid="stAppViewContainer"] > .main {
|
| 31 |
+
background-image: url("https://w.wallhaven.cc/full/jx/wallhaven-jx7w25.png");
|
| 32 |
+
background-size: 100vw 100vh; # This sets the size to cover 100% of the viewport width and height
|
| 33 |
+
background-position: center;
|
| 34 |
+
background-repeat: no-repeat;
|
| 35 |
+
}
|
| 36 |
+
</style>
|
| 37 |
+
"""
|
| 38 |
+
|
| 39 |
+
st.markdown(background_image, unsafe_allow_html=True)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
|
| 43 |
# Title with Rainbow Transition Effect and Neon Glow
|
| 44 |
html_code = """
|
|
|
|
| 84 |
"""
|
| 85 |
|
| 86 |
st.markdown(html_code, unsafe_allow_html=True)
|
| 87 |
+
st.divider()
|
| 88 |
+
|
| 89 |
+
st.markdown(
|
| 90 |
+
"""
|
| 91 |
+
<style>
|
| 92 |
+
.success-message {
|
| 93 |
+
font-family: Arial, sans-serif;
|
| 94 |
+
font-size: 24px;
|
| 95 |
+
color: green;
|
| 96 |
+
text-align: left;
|
| 97 |
+
}
|
| 98 |
+
.unsuccess-message {
|
| 99 |
+
font-family: Arial, sans-serif;
|
| 100 |
+
font-size: 24px;
|
| 101 |
+
color: red;
|
| 102 |
+
text-align: left;
|
| 103 |
+
}
|
| 104 |
+
.prompt-message {
|
| 105 |
+
font-family: Arial, sans-serif;
|
| 106 |
+
font-size: 24px;
|
| 107 |
+
color: #333;
|
| 108 |
+
text-align: center;
|
| 109 |
+
}
|
| 110 |
+
.success-message2 {
|
| 111 |
+
font-family: Arial, sans-serif;
|
| 112 |
+
font-size: 18px;
|
| 113 |
+
color: white;
|
| 114 |
+
text-align: left;
|
| 115 |
+
}
|
| 116 |
+
.message-box {
|
| 117 |
+
text-align: center;
|
| 118 |
+
background-color: white;
|
| 119 |
+
padding: 5px;
|
| 120 |
+
border-radius: 10px;
|
| 121 |
+
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
|
| 122 |
+
font-size: 24px;
|
| 123 |
+
color: #333;
|
| 124 |
+
}
|
| 125 |
+
</style>
|
| 126 |
+
""",
|
| 127 |
+
unsafe_allow_html=True
|
| 128 |
+
)
|
| 129 |
|
| 130 |
|
| 131 |
+
# st.markdown('<p class="success-message">Train File uploaded successfully. ✅</p>', unsafe_allow_html=True)
|
| 132 |
# file uploader
|
| 133 |
csv_upload = st.sidebar.file_uploader("Input CSV File for ML modelling", type=['csv'])
|
| 134 |
+
|
| 135 |
+
sep = st.sidebar.text_input("Input Seperator")
|
| 136 |
+
if (len(sep) ==0):
|
| 137 |
+
sep = ","
|
| 138 |
csv_upload2 = st.sidebar.file_uploader("Input CSV File of Test Data Prediction",type = ["csv"])
|
| 139 |
+
|
| 140 |
test = pd.DataFrame()
|
| 141 |
if csv_upload is not None:
|
| 142 |
# read the uploaded file into dataframe
|
| 143 |
+
df = pd.read_csv(csv_upload,sep = sep)
|
| 144 |
|
| 145 |
# saving the dataframe to a CSV file
|
| 146 |
df.to_csv('csv_upload.csv', index=False)
|
| 147 |
+
st.markdown('<p class="success-message">Train File uploaded successfully. ✅</p>', unsafe_allow_html=True)
|
| 148 |
+
|
| 149 |
if csv_upload2 is not None:
|
| 150 |
+
test = pd.read_csv(csv_upload2,sep = sep)
|
| 151 |
+
st.markdown('<p class="success-message">Test File uploaded successfully. ✅</p>', unsafe_allow_html=True)
|
| 152 |
+
st.divider()
|
| 153 |
+
id_col = st.selectbox("Select Column for Submission i.e, ID",test.columns)
|
| 154 |
+
st.divider()
|
| 155 |
submission_id = test[id_col]
|
| 156 |
# st.write("Train File upl",submission_id)
|
| 157 |
|
|
|
|
| 161 |
if len(test) >0:
|
| 162 |
# saving the test dataframe to a CSV file
|
| 163 |
test.to_csv('csv_upload_test.csv', index=False)
|
| 164 |
+
|
| 165 |
|
| 166 |
+
st.markdown('<p class="message-box">Display Data</p>', unsafe_allow_html=True)
|
| 167 |
+
st.write("")
|
| 168 |
display_train_data = st.radio("Display Train Data",["Yes","No"],index = 1)
|
| 169 |
if display_train_data == "Yes":
|
| 170 |
st.dataframe(df.head())
|
|
|
|
| 174 |
if display_test_data == "Yes":
|
| 175 |
st.dataframe(test.head())
|
| 176 |
|
| 177 |
+
st.divider()
|
| 178 |
+
st.markdown('<div class="message-box success">Select Supervision Category</div>', unsafe_allow_html=True)
|
| 179 |
+
if st.radio("",["Supervised","Un-Supervised"],index =0) == "Supervised":
|
| 180 |
+
st.divider()
|
| 181 |
+
|
| 182 |
+
st.write('<p class="success-message2">Select Target column</p>', unsafe_allow_html=True)
|
| 183 |
+
selected_column = st.selectbox('', df.columns, index=(len(df.columns)-1))
|
| 184 |
|
| 185 |
# Display the selected column
|
| 186 |
st.write('You selected:', selected_column)
|
| 187 |
+
st.divider()
|
| 188 |
+
|
| 189 |
+
st.markdown('<div class="message-box success ">Perform EDA</div>', unsafe_allow_html=True)
|
| 190 |
+
st.write("")
|
| 191 |
+
if st.checkbox("Proceed to perform EDA"):
|
| 192 |
+
eda.eda_analysis(df)
|
| 193 |
+
st.write('<p class="success-message">EDA Performed proceed for Pre-processing</p>', unsafe_allow_html=True)
|
| 194 |
+
st.divider()
|
| 195 |
y = df[selected_column]
|
| 196 |
|
| 197 |
if y.dtype == "O":
|
| 198 |
+
st.markdown('<p class="unsuccess-message">⚠️⚠️⚠️ Target Column is Object Type ⚠️⚠️⚠️</p>', unsafe_allow_html=True)
|
| 199 |
+
|
| 200 |
+
if st.checkbox("Proceed for Label Encoding "):
|
| 201 |
from sklearn.preprocessing import LabelEncoder
|
| 202 |
le = LabelEncoder()
|
| 203 |
y= pd.Series(le.fit_transform(y))
|
| 204 |
+
st.markdown('<p class="success-message">Label Encoding Completed ✅</p>', unsafe_allow_html=True)
|
| 205 |
+
if st.checkbox("Display Target Column"):
|
|
|
|
| 206 |
st.dataframe(y.head())
|
| 207 |
|
| 208 |
+
st.divider()
|
| 209 |
+
st.markdown('<div class="message-box success">Target column Transformation</div>', unsafe_allow_html=True)
|
| 210 |
+
select_target_trans = st.radio("",["Yes","No"],index = 1)
|
| 211 |
if select_target_trans == "Yes":
|
| 212 |
selected_transformation = st.selectbox("Select Transformation method",["Log Transformation","Power Transformation"])
|
| 213 |
if selected_transformation == "Log Transformation":
|
|
|
|
| 236 |
|
| 237 |
if st.radio("Display Target Column after Transformation",["Yes","No"],index =1) == "Yes":
|
| 238 |
st.dataframe(y.head())
|
| 239 |
+
|
| 240 |
+
|
| 241 |
|
| 242 |
X = df.drop(columns = selected_column)
|
| 243 |
|
| 244 |
if st.radio("Display X-Train Data",["Yes","No"],index =1) == "Yes":
|
| 245 |
st.dataframe(X.head())
|
| 246 |
+
st.divider()
|
| 247 |
+
|
| 248 |
+
# st.checkbox()
|
| 249 |
+
st.markdown('<div class="message-box success">Check for duplicate Values</div>', unsafe_allow_html=True)
|
| 250 |
+
if st.radio(" ",["Yes","No"],index = 1) == "Yes":
|
| 251 |
len_duplicates = len(X[X.duplicated()])
|
| 252 |
if len_duplicates >0:
|
| 253 |
st.write(f"There are {len_duplicates} duplicate values in Train")
|
| 254 |
+
if st.checkbox("Show Duplicate values"):
|
| 255 |
+
st.dataframe(X[X.duplicated()])
|
| 256 |
if st.selectbox("Drop Duplicate values",["Yes","No"],index = 1) == "Yes":
|
| 257 |
X = X.drop_duplicates()
|
| 258 |
st.write("Duplicate values removed ✅")
|
| 259 |
else:
|
| 260 |
st.write("There are no duplicate values in Train")
|
| 261 |
+
st.divider()
|
| 262 |
# dropping not important columns
|
| 263 |
+
st.markdown('<div class="message-box success">Drop Unimportant Columns</div>', unsafe_allow_html=True)
|
| 264 |
+
if st.radio(" ",["Yes","No"],index = 1) == "Yes":
|
| 265 |
selected_drop_column = st.multiselect('Select columns to be dropped', X.columns)
|
| 266 |
X = X.drop(columns = selected_drop_column)
|
| 267 |
if len(test) >0:
|
| 268 |
test = test.drop(columns = selected_drop_column)
|
| 269 |
+
st.write("Un-Important column(s) Deleted ✅")
|
| 270 |
st.dataframe(X.head())
|
| 271 |
|
| 272 |
+
st.divider()
|
| 273 |
num_cols = X.select_dtypes(exclude = "O").columns
|
| 274 |
cat_cols = X.select_dtypes(include = "O").columns
|
| 275 |
st.write("Numerical Columns in Train Data: ", tuple(num_cols))
|
| 276 |
st.write("Categorical Columns in Train Data: ", tuple(cat_cols))
|
| 277 |
+
if st.sidebar.button("Clear Evaluation DataFrame"):
|
| 278 |
+
evaluationer.reg_evaluation_df = evaluationer.reg_evaluation_df.drop(index =evaluationer.reg_evaluation_df.index)
|
| 279 |
+
evaluationer.classification_evaluation_df = evaluationer.classification_evaluation_df.drop(index =evaluationer.reg_evaluation_df.index)
|
| 280 |
+
st.divider()
|
| 281 |
+
# markdown
|
| 282 |
+
st.markdown('<div class="message-box success">Select method for ML modelling</div>', unsafe_allow_html = True)
|
| 283 |
+
if st.radio(" ", ["Manual","Auto Optimized"],index = 0) == "Auto Optimized":
|
| 284 |
+
st.divider()
|
| 285 |
ml_cat_ao = st.radio("Select Machine Learning Category",["Regression","Classification"],index =0)
|
| 286 |
|
| 287 |
if ml_cat_ao =="Regression":
|
|
|
|
| 289 |
st.write("Select ML algorithm")
|
| 290 |
reg_model_name = st.selectbox("select model",models.Regression_models.index)
|
| 291 |
reg_model = models.Regression_models.loc[reg_model_name].values[0]
|
| 292 |
+
auto_optimizer.Auto_optimizer(X,y,eva,reg_model,reg_model_name)
|
| 293 |
|
| 294 |
elif ml_cat_ao =="Classification":
|
| 295 |
eva = "class"
|
|
|
|
| 298 |
class_model = models.Classification_models.loc[class_model_name].values[0]
|
| 299 |
auto_optimizer.Auto_optimizer(X,y,eva,class_model)
|
| 300 |
|
| 301 |
+
|
| 302 |
else:
|
| 303 |
+
st.divider()
|
| 304 |
if X.isnull().sum().sum() >0 :
|
| 305 |
+
|
| 306 |
+
st.markdown('<p class="unsuccess-message">⚠️⚠️⚠️ There are missing values in Train Data ⚠️⚠️⚠️</p>', unsafe_allow_html=True)
|
| 307 |
|
| 308 |
if st.selectbox("Drop null values or Impute",["Drop Null Values","Impute Null Values"],index = 1) == "Drop Null Values":
|
| 309 |
|
|
|
|
| 340 |
|
| 341 |
|
| 342 |
clean_num_nvh_df_cat = pd.DataFrame()
|
| 343 |
+
|
| 344 |
if X[cat_cols].isnull().sum().sum() >0:
|
| 345 |
+
st.divider()
|
| 346 |
st.write("Categorical Columns with Percentage of Null Values: ")
|
| 347 |
cat_cols_nvh = X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0].index
|
| 348 |
st.dataframe(round(X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0]/len(X)*100,2))
|
|
|
|
| 371 |
null_value_handling.null_handling(X,clean_num_nvh_df,clean_num_nvh_df_cat)
|
| 372 |
st.write("X Data after Null value handling", X.head())
|
| 373 |
|
| 374 |
+
new_df = pd.concat([X,y[X.index]],axis = 1)
|
| 375 |
+
|
| 376 |
+
csv = new_df.to_csv(index = False)
|
| 377 |
+
|
| 378 |
+
st.markdown('<p class="success-message">Null Values Handled Successfully. ✅</p>', unsafe_allow_html=True)
|
| 379 |
+
if st.checkbox("Download Null Value Handled DataFrame as CSV File ? "):
|
| 380 |
+
st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
|
| 381 |
+
st.divider()
|
| 382 |
ord_enc_cols = []
|
| 383 |
|
| 384 |
if len(cat_cols) == 0:
|
| 385 |
st.write("No Categorical Columns in Train")
|
| 386 |
else:
|
| 387 |
+
st.markdown('<div class="message-box success">Features Encoding</div>', unsafe_allow_html=True)
|
| 388 |
+
st.markdown('<p class="unsuccess-message">There are Object type Features in Train Data ⚠️</p>', unsafe_allow_html=True)
|
| 389 |
+
st.markdown('<p class="success-message2">Select Columns for Ordinal Encoding</p>', unsafe_allow_html=True)
|
| 390 |
+
|
| 391 |
for column in cat_cols:
|
| 392 |
|
| 393 |
selected = st.checkbox(column)
|
| 394 |
if selected:
|
| 395 |
st.write(f"No. of Unique value in {column} column are", X[column].nunique())
|
| 396 |
ord_enc_cols.append(column)
|
| 397 |
+
st.divider()
|
| 398 |
ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
|
| 399 |
ohe_enc_cols = list(ohe_enc_cols)
|
| 400 |
if len(ord_enc_cols)>0:
|
| 401 |
st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
|
| 402 |
if len(ohe_enc_cols)>0:
|
| 403 |
st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
|
| 404 |
+
st.divider()
|
| 405 |
+
st.markdown('<div class="message-box success">Proceed for Encoding</div>', unsafe_allow_html=True)
|
| 406 |
if len(ord_enc_cols)>0:
|
| 407 |
+
|
| 408 |
+
if st.checkbox("Proceed for Ordinal Encoding"):
|
| 409 |
ordinal_order_vals = []
|
| 410 |
|
| 411 |
for column in ord_enc_cols:
|
|
|
|
| 426 |
st.write("Ordinal Encoding Completed ✅")
|
| 427 |
|
| 428 |
if len(ohe_enc_cols)>0:
|
| 429 |
+
if st.checkbox("Proceed for OneHotEncoding "): # import one hot encoder
|
| 430 |
from sklearn.preprocessing import OneHotEncoder
|
| 431 |
ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
|
| 432 |
pd.options.mode.chained_assignment = None
|
|
|
|
| 440 |
|
| 441 |
st.write("DataFrame after One Hot Encoding",X.head())
|
| 442 |
st.write("OneHot Encoding Completed ✅")
|
| 443 |
+
st.divider()
|
| 444 |
new_df = pd.concat([X,y],axis = 1)
|
| 445 |
|
| 446 |
csv = new_df.to_csv(index = False)
|
| 447 |
+
if st.checkbox("Download Encoded DataFrame as CSV File ? "):
|
| 448 |
st.download_button(label="Download Ordinal Encoded CSV File",data=csv,file_name='Encoded_DataFrame.csv',mime='text/csv')
|
| 449 |
|
| 450 |
+
st.divider()
|
| 451 |
+
st.markdown('<div class="message-box success">Modelling</div>', unsafe_allow_html=True)
|
| 452 |
+
st.write("")
|
| 453 |
+
st.markdown('<p class="success-message">Select Train Validation Split Method</p>', unsafe_allow_html=True)
|
| 454 |
+
if st.radio("",["Train_Test_split","KFoldCV, Default (CV = 5)"], index = 0)== "Train_Test_split":
|
|
|
|
| 455 |
ttsmethod = "Train_Test_split"
|
| 456 |
else:
|
| 457 |
ttsmethod = "KFoldCV"
|
| 458 |
st.write('You selected:', ttsmethod)
|
| 459 |
if ttsmethod == "Train_Test_split":
|
| 460 |
+
random_state = st.number_input("Enter Random_state",max_value=100,min_value=1,value=42)
|
| 461 |
+
test_size = st.number_input("Enter test_size",max_value=0.99, min_value = 0.01,value =0.2)
|
| 462 |
X_train,X_Val,y_train,y_val = tts(X,y[X.index],random_state = random_state,test_size = test_size)
|
|
|
|
| 463 |
|
| 464 |
st.write('X-Training Data shape:', X_train.shape)
|
| 465 |
st.write('X-Validation Data shape:', X_Val.shape)
|
| 466 |
+
st.divider()
|
| 467 |
+
st.markdown('<p class="success-message2">Select Machine Learning Category</p>', unsafe_allow_html=True)
|
| 468 |
+
ml_cat = st.radio("___",options=["Regression","Classification"],index =0)
|
| 469 |
+
st.divider()
|
| 470 |
if ml_cat =="Regression":
|
| 471 |
+
st.markdown('<p class="success-message2">Select Error Evaluation Method</p>', unsafe_allow_html=True)
|
| 472 |
+
method_name_selector = st.selectbox(" ",evaluationer.method_df.index,index = 0)
|
| 473 |
+
|
| 474 |
+
st.divider()
|
| 475 |
|
| 476 |
method = evaluationer.method_df.loc[method_name_selector].values[0]
|
| 477 |
reg_algorithm = []
|
| 478 |
selected_options = []
|
| 479 |
+
st.markdown('<div class="message-box success">Select ML Model(s)</div>', unsafe_allow_html=True)
|
| 480 |
for option in models.Regression_models.index:
|
| 481 |
selected = st.checkbox(option)
|
| 482 |
if selected:
|
|
|
|
| 563 |
|
| 564 |
cla_algorithm = []
|
| 565 |
selected_options = []
|
| 566 |
+
st.markdown('<div class="message-box success">Select ML Model(s)</div>', unsafe_allow_html=True)
|
| 567 |
for option in models.Classification_models.index:
|
| 568 |
selected = st.checkbox(option)
|
| 569 |
if selected:
|
auto_optimizer.py
CHANGED
|
@@ -1,317 +1,361 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
import numpy as np
|
| 3 |
-
import streamlit as st
|
| 4 |
-
from sklearn.impute import KNNImputer,SimpleImputer,IterativeImputer
|
| 5 |
-
import best_tts, evaluationer,models
|
| 6 |
-
from sklearn.experimental import enable_iterative_imputer
|
| 7 |
-
from sklearn.model_selection import train_test_split as tts
|
| 8 |
-
from collections import Counter
|
| 9 |
-
|
| 10 |
-
from sklearn.metrics import root_mean_squared_error
|
| 11 |
-
import seaborn as sns
|
| 12 |
-
|
| 13 |
-
import
|
| 14 |
-
import
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
if
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
if
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
if
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from sklearn.impute import KNNImputer,SimpleImputer,IterativeImputer
|
| 5 |
+
import best_tts, evaluationer,models
|
| 6 |
+
from sklearn.experimental import enable_iterative_imputer
|
| 7 |
+
from sklearn.model_selection import train_test_split as tts
|
| 8 |
+
from collections import Counter
|
| 9 |
+
from sklearn.preprocessing import PolynomialFeatures
|
| 10 |
+
from sklearn.metrics import root_mean_squared_error
|
| 11 |
+
import seaborn as sns
|
| 12 |
+
from sklearn.decomposition import PCA
|
| 13 |
+
import grid_search_cv
|
| 14 |
+
import matplotlib.pyplot as plt
|
| 15 |
+
import outliers,best_tts
|
| 16 |
+
import feature_selections
|
| 17 |
+
def Auto_optimizer(X,y,eva,model,model_name,test= None):
|
| 18 |
+
if st.button("Train Regression Model"):
|
| 19 |
+
num_cols = X.select_dtypes(exclude = "O").columns
|
| 20 |
+
cat_cols = X.select_dtypes(include = "O").columns
|
| 21 |
+
st.write("Num_cols",tuple(num_cols))
|
| 22 |
+
st.write("cat_cols",tuple(cat_cols))
|
| 23 |
+
|
| 24 |
+
# check for Duplicate and drop duplicated in X
|
| 25 |
+
|
| 26 |
+
if len(X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40]) >0:
|
| 27 |
+
X = X.drop(columns = X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40].index)
|
| 28 |
+
st.write("Columns with more than 40% null values removed")
|
| 29 |
+
# st.write("csx",X)
|
| 30 |
+
|
| 31 |
+
len_null = X.isnull().sum().sum()
|
| 32 |
+
|
| 33 |
+
st.write(f"There are {len_null} null values in Train")
|
| 34 |
+
|
| 35 |
+
knn_imputed_num_X = X.copy()
|
| 36 |
+
si_mean_imputed_num_X = X.copy()
|
| 37 |
+
# st.write("sf",si_mean_imputed_num_X)
|
| 38 |
+
si_median_imputed_num_X = X.copy()
|
| 39 |
+
si_most_frequent_imputed_num_X = X.copy()
|
| 40 |
+
iter_imputed_num_X = X.copy()
|
| 41 |
+
knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
|
| 42 |
+
si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
|
| 43 |
+
si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
|
| 44 |
+
si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
|
| 45 |
+
iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
|
| 46 |
+
if len_null >0:
|
| 47 |
+
|
| 48 |
+
if X[num_cols].isnull().sum().sum() >0:
|
| 49 |
+
|
| 50 |
+
knn_imputer = KNNImputer(n_neighbors = 5)
|
| 51 |
+
knn_imputed_num_X[num_cols] = knn_imputer.fit_transform(knn_imputed_num_X[num_cols])
|
| 52 |
+
si_imputer = SimpleImputer(strategy = "mean")
|
| 53 |
+
si_mean_imputed_num_X[num_cols] = si_imputer.fit_transform(si_mean_imputed_num_X[num_cols])
|
| 54 |
+
si_imputer = SimpleImputer(strategy = "median")
|
| 55 |
+
si_median_imputed_num_X[num_cols] = si_imputer.fit_transform(si_median_imputed_num_X[num_cols])
|
| 56 |
+
si_imputer = SimpleImputer(strategy = "most_frequent")
|
| 57 |
+
si_most_frequent_imputed_num_X[num_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[num_cols])
|
| 58 |
+
iter_imputer = IterativeImputer(max_iter = 200,random_state= 42)
|
| 59 |
+
iter_imputed_num_X[num_cols] = iter_imputer.fit_transform(iter_imputed_num_X[num_cols])
|
| 60 |
+
knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
|
| 61 |
+
si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
|
| 62 |
+
si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
|
| 63 |
+
si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
|
| 64 |
+
iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
|
| 65 |
+
|
| 66 |
+
if X[cat_cols].isnull().sum().sum() >0:
|
| 67 |
+
# treating missing values in categorical columns
|
| 68 |
+
# st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
|
| 69 |
+
si_imputer = SimpleImputer(strategy = "most_frequent")
|
| 70 |
+
|
| 71 |
+
knn_imputed_num_X[cat_cols] = si_imputer.fit_transform(knn_imputed_num_X[cat_cols])
|
| 72 |
+
si_imputer = SimpleImputer(strategy = "most_frequent")
|
| 73 |
+
si_mean_imputed_num_X.loc[:,cat_cols] = si_imputer.fit_transform(si_mean_imputed_num_X.loc[:,cat_cols])
|
| 74 |
+
# st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
|
| 75 |
+
si_median_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_median_imputed_num_X[cat_cols])
|
| 76 |
+
si_most_frequent_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[cat_cols])
|
| 77 |
+
iter_imputed_num_X[cat_cols] = si_imputer.fit_transform(iter_imputed_num_X[cat_cols])
|
| 78 |
+
|
| 79 |
+
knn_imputed_X_cat_dropped = knn_imputed_X_cat_dropped.dropna()
|
| 80 |
+
si_mean_imputed_X_cat_dropped =si_mean_imputed_X_cat_dropped.dropna()
|
| 81 |
+
si_median_imputed_X_cat_dropped =si_median_imputed_X_cat_dropped.dropna()
|
| 82 |
+
si_most_frequent_imputed_X_cat_dropped =si_most_frequent_imputed_X_cat_dropped.dropna()
|
| 83 |
+
iter_imputed_X_cat_dropped =iter_imputed_X_cat_dropped.dropna()
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
miss_val_dropped_X = X.dropna()
|
| 87 |
+
|
| 88 |
+
# list of dataframes
|
| 89 |
+
|
| 90 |
+
list_X_after_missing_values= [knn_imputed_num_X,
|
| 91 |
+
si_mean_imputed_num_X,
|
| 92 |
+
si_median_imputed_num_X,
|
| 93 |
+
si_most_frequent_imputed_num_X,
|
| 94 |
+
iter_imputed_num_X,
|
| 95 |
+
knn_imputed_X_cat_dropped,
|
| 96 |
+
si_mean_imputed_X_cat_dropped,
|
| 97 |
+
si_median_imputed_X_cat_dropped,
|
| 98 |
+
si_most_frequent_imputed_X_cat_dropped,
|
| 99 |
+
iter_imputed_X_cat_dropped,
|
| 100 |
+
miss_val_dropped_X]
|
| 101 |
+
list_X_after_missing_values_names= ["knn_imputed_num_X",
|
| 102 |
+
"si_mean_imputed_num_X",
|
| 103 |
+
"si_median_imputed_num_X",
|
| 104 |
+
"si_most_frequent_imputed_num_X",
|
| 105 |
+
"iter_imputed_num_X",
|
| 106 |
+
"knn_imputed_X_cat_dropped",
|
| 107 |
+
"si_mean_imputed_X_cat_dropped",
|
| 108 |
+
"si_median_imputed_X_cat_dropped",
|
| 109 |
+
"si_most_frequent_imputed_X_cat_dropped",
|
| 110 |
+
"iter_imputed_X_cat_dropped",
|
| 111 |
+
"miss_val_dropped_X"]
|
| 112 |
+
# st.write("si_most_frequent_imputed_num_X",si_most_frequent_imputed_num_X,)
|
| 113 |
+
ord_enc_cols = []
|
| 114 |
+
ohe_enc_cols = []
|
| 115 |
+
|
| 116 |
+
if len(cat_cols) == 0:
|
| 117 |
+
st.write("No Categorical Columns in Train")
|
| 118 |
+
else:
|
| 119 |
+
st.write("Select Columns for Ordinal Encoding")
|
| 120 |
+
for column in cat_cols:
|
| 121 |
+
selected = st.checkbox(column)
|
| 122 |
+
if selected:
|
| 123 |
+
st.write(f"No. of Unique value in {column} column are", X[column].nunique())
|
| 124 |
+
ord_enc_cols.append(column)
|
| 125 |
+
ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
|
| 126 |
+
ohe_enc_cols = list(ohe_enc_cols)
|
| 127 |
+
|
| 128 |
+
if len(ord_enc_cols)>0:
|
| 129 |
+
st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
|
| 130 |
+
if len(ohe_enc_cols)>0:
|
| 131 |
+
st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
|
| 132 |
+
|
| 133 |
+
if len(ord_enc_cols)>0:
|
| 134 |
+
|
| 135 |
+
ordinal_order_vals = []
|
| 136 |
+
|
| 137 |
+
for column in ord_enc_cols:
|
| 138 |
+
unique_vals = X.dropna()[column].unique()
|
| 139 |
+
# st.write(f"No. of Unique value in {column} column are", len(unique_vals))
|
| 140 |
+
|
| 141 |
+
ordered_unique_vals = st.multiselect("Select values in order for Ordinal Encoding",unique_vals,unique_vals)
|
| 142 |
+
ordinal_order_vals.append(ordered_unique_vals)
|
| 143 |
+
|
| 144 |
+
st.write("order of values for Ordinal Encoding",tuple(ordinal_order_vals))
|
| 145 |
+
|
| 146 |
+
if len_null > 0:
|
| 147 |
+
|
| 148 |
+
for df_name, df in enumerate(list_X_after_missing_values):
|
| 149 |
+
# st.write(f"{list_X_after_missing_values_names[df_name]}",df)
|
| 150 |
+
from sklearn.preprocessing import OrdinalEncoder
|
| 151 |
+
ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
|
| 152 |
+
df[ord_enc_cols] = ord.fit_transform(df[ord_enc_cols])
|
| 153 |
+
# st.write(f"{list_X_after_missing_values_names[df_name]}",df)
|
| 154 |
+
else :
|
| 155 |
+
from sklearn.preprocessing import OrdinalEncoder
|
| 156 |
+
ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
|
| 157 |
+
X[ord_enc_cols] = ord.fit_transform(X[ord_enc_cols])
|
| 158 |
+
|
| 159 |
+
st.write("Ordinal Encoding Completed ✅")
|
| 160 |
+
|
| 161 |
+
if len(ohe_enc_cols)>0:
|
| 162 |
+
if len_null > 0:
|
| 163 |
+
for df_name, df in enumerate(list_X_after_missing_values):
|
| 164 |
+
from sklearn.preprocessing import OneHotEncoder
|
| 165 |
+
ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
|
| 166 |
+
pd.options.mode.chained_assignment = None
|
| 167 |
+
df.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(df[ohe_enc_cols])
|
| 168 |
+
df.drop(columns = ohe_enc_cols,inplace = True)
|
| 169 |
+
pd.options.mode.chained_assignment = 'warn'
|
| 170 |
+
else:
|
| 171 |
+
from sklearn.preprocessing import OneHotEncoder
|
| 172 |
+
ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
|
| 173 |
+
pd.options.mode.chained_assignment = None
|
| 174 |
+
X.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(X[ohe_enc_cols])
|
| 175 |
+
X.drop(columns = ohe_enc_cols,inplace = True)
|
| 176 |
+
pd.options.mode.chained_assignment = 'warn'
|
| 177 |
+
st.write("OneHot Encoding Completed ✅")
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
if len(ohe_enc_cols)>0:
|
| 181 |
+
if len_null > 0:
|
| 182 |
+
for name,df in enumerate(list_X_after_missing_values):
|
| 183 |
+
X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
|
| 184 |
+
# best_tts.best_tts(df,y,model,eva)
|
| 185 |
+
evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
|
| 186 |
+
else:
|
| 187 |
+
X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =.2 ,random_state = 42)
|
| 188 |
+
# best_tts.best_tts(X,y,model,eva)
|
| 189 |
+
|
| 190 |
+
evaluationer.evaluation(f"baseline_model",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
|
| 191 |
+
|
| 192 |
+
if len_null >0:
|
| 193 |
+
for name,df in enumerate(list_X_after_missing_values):
|
| 194 |
+
X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
|
| 195 |
+
|
| 196 |
+
evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
|
| 197 |
+
|
| 198 |
+
if eva == "class":
|
| 199 |
+
counter = Counter(y)
|
| 200 |
+
total = sum(counter.values())
|
| 201 |
+
balance_ratio = {cls: count / total for cls, count in counter.items()}
|
| 202 |
+
num_classes = len(balance_ratio)
|
| 203 |
+
ideal_ratio = 1 / num_classes
|
| 204 |
+
a = all(abs(ratio - ideal_ratio) <= 0.1 * ideal_ratio for ratio in balance_ratio.values())
|
| 205 |
+
if a == True:
|
| 206 |
+
st.write("Balanced Dataset ✅")
|
| 207 |
+
st.write("Using accuracy for Evaluation")
|
| 208 |
+
value = "test_acc"
|
| 209 |
+
else:
|
| 210 |
+
st.write("Unbalanced Dataset ❌")
|
| 211 |
+
st.write("Using F1 score for Evaluation")
|
| 212 |
+
value = "test_f1"
|
| 213 |
+
|
| 214 |
+
evaluationer.classification_evaluation_df.sort_values(by = value,inplace= True)
|
| 215 |
+
name = str(evaluationer.classification_evaluation_df.iloc[-1,0])
|
| 216 |
+
st.write("df name",evaluationer.classification_evaluation_df.iloc[-1,0])
|
| 217 |
+
if len_null >0:
|
| 218 |
+
b = list_X_after_missing_values_names.index(name)
|
| 219 |
+
|
| 220 |
+
st.write("df",list_X_after_missing_values[b])
|
| 221 |
+
X = list_X_after_missing_values[b]
|
| 222 |
+
if eva == "reg":
|
| 223 |
+
st.write("Using R2 score for Evaluation",evaluationer.reg_evaluation_df)
|
| 224 |
+
value = "test_r2"
|
| 225 |
+
evaluationer.reg_evaluation_df.sort_values(by = value,inplace= True)
|
| 226 |
+
|
| 227 |
+
name = str(evaluationer.reg_evaluation_df.iloc[-1,0])
|
| 228 |
+
|
| 229 |
+
if len_null >0:
|
| 230 |
+
b = list_X_after_missing_values_names.index(name)
|
| 231 |
+
|
| 232 |
+
st.write("df",list_X_after_missing_values[b])
|
| 233 |
+
X = list_X_after_missing_values[b]
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
# Create a figure and axes
|
| 237 |
+
num_plots = len(num_cols)
|
| 238 |
+
cols = 2 # Number of columns in the subplot grid
|
| 239 |
+
rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed
|
| 240 |
+
|
| 241 |
+
fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
|
| 242 |
+
|
| 243 |
+
# Flatten the axes array for easy iteration, and remove any excess subplots
|
| 244 |
+
axes = axes.flatten()
|
| 245 |
+
for ax in axes[num_plots:]:
|
| 246 |
+
fig.delaxes(ax)
|
| 247 |
+
|
| 248 |
+
for i, col in enumerate(num_cols):
|
| 249 |
+
sns.histplot(X[col], ax=axes[i],kde = True,color=sns.color_palette('Oranges', as_cmap=True)(0.7))
|
| 250 |
+
axes[i].set_title(col)
|
| 251 |
+
|
| 252 |
+
# Adjust layout
|
| 253 |
+
plt.tight_layout()
|
| 254 |
+
|
| 255 |
+
# Show the plot in Streamlit
|
| 256 |
+
st.pyplot(fig)
|
| 257 |
+
|
| 258 |
+
# Create a figure and axes
|
| 259 |
+
num_plots = len(num_cols)
|
| 260 |
+
cols = 3 # Number of columns in the subplot grid
|
| 261 |
+
rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed
|
| 262 |
+
|
| 263 |
+
fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
|
| 264 |
+
|
| 265 |
+
# Flatten the axes array for easy iteration, and remove any excess subplots
|
| 266 |
+
axes = axes.flatten()
|
| 267 |
+
for ax in axes[num_plots:]:
|
| 268 |
+
fig.delaxes(ax)
|
| 269 |
+
|
| 270 |
+
for i, col in enumerate(num_cols):
|
| 271 |
+
sns.boxplot(y=X[col], ax=axes[i],palette="magma")
|
| 272 |
+
axes[i].set_title(col)
|
| 273 |
+
|
| 274 |
+
# Adjust layout
|
| 275 |
+
plt.tight_layout()
|
| 276 |
+
|
| 277 |
+
# Show the plot in Streamlit
|
| 278 |
+
st.pyplot(fig)
|
| 279 |
+
|
| 280 |
+
outlier_cols = st.multiselect("De-Select columns for Detecting Outliers", num_cols,default= list(num_cols))
|
| 281 |
+
|
| 282 |
+
st.write("Checking for Outliers")
|
| 283 |
+
outliers_df_X,outlier_indexes = outliers.detect_outliers(X,list(outlier_cols))
|
| 284 |
+
st.write("Outliers in Dataframe Summary",outliers_df_X)
|
| 285 |
+
st.write("Columns for Outliers handling",tuple(outliers_df_X["columns name"]))
|
| 286 |
+
|
| 287 |
+
select_outlier_cols = st.multiselect("Select columns for Outlier Handling",tuple(outliers_df_X["columns name"]),default =tuple(outliers_df_X["columns name"]))
|
| 288 |
+
resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva = "reg")
|
| 289 |
+
st.write("outlier handling with methods",resultant)
|
| 290 |
+
st.write("Best method with outlier handling",resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])
|
| 291 |
+
try :
|
| 292 |
+
st.write("Best X Data Index No.",outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0]))
|
| 293 |
+
|
| 294 |
+
st.write("Best X DataFrame after outlier handling ",outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
|
| 295 |
+
X = outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])]
|
| 296 |
+
except :
|
| 297 |
+
"evaluation of baseline model is better continuing with baseline model"
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
X_train,X_test,y_train,y_test = tts(X,y[X.index],random_state = 42,test_size = 0.2)
|
| 301 |
+
st.write("result_df",X)
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
try:
|
| 306 |
+
result_df_1 , feature_col, feature_col_name = feature_selections.feature_selection(X_train,X_test,y_train,y_test,model,alpha = 0.05)
|
| 307 |
+
X = X.drop(columns = feature_col[feature_col_name.index(result_df_1.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
|
| 308 |
+
except:
|
| 309 |
+
"evaluation by feature selection is not better than previous"
|
| 310 |
+
|
| 311 |
+
try:
|
| 312 |
+
result,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
|
| 313 |
+
st.write("result_df",result)
|
| 314 |
+
except:
|
| 315 |
+
X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =0.2,random_state = 42)
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
st.write("cheking with polynomial features")
|
| 321 |
+
poly = PolynomialFeatures(degree=(2))
|
| 322 |
+
X_train_poly = poly.fit_transform(X_train)
|
| 323 |
+
X_test_poly = poly.transform(X_test)
|
| 324 |
+
result_df_2 = evaluationer.evaluation("polynomial features degree 2",X_train_poly,X_test_poly,y_train,y_test,model,root_mean_squared_error,eva)
|
| 325 |
+
st.write("after polynomial features degree 2",evaluationer.reg_evaluation_df)
|
| 326 |
+
poly1 = PolynomialFeatures(degree=(3))
|
| 327 |
+
X_train_poly1 = poly.fit_transform(X_train)
|
| 328 |
+
X_test_poly1 = poly.transform(X_test)
|
| 329 |
+
evaluationer.evaluation("polynomial features degree 3",X_train_poly1,X_test_poly1,y_train,y_test,model,root_mean_squared_error,eva)
|
| 330 |
+
st.write("after polynomial features degree 3",evaluationer.reg_evaluation_df)
|
| 331 |
+
|
| 332 |
+
pca = PCA(n_components=0.95)
|
| 333 |
+
X_train_pca = pca.fit_transform(X_train)
|
| 334 |
+
X_test_pca = pca.transform(X_test)
|
| 335 |
+
evaluationer.evaluation("PCA",X_train_pca,X_test_pca,y_train,y_test,model,root_mean_squared_error,eva)
|
| 336 |
+
st.write("After PCA",evaluationer.reg_evaluation_df)
|
| 337 |
+
|
| 338 |
+
grid_search_cv.perform_grid_search(model,model_name,X_train,X_test,y_train,y_test,eva)
|
| 339 |
+
st.write("best param",evaluationer.reg_evaluation_df)
|
| 340 |
+
st.sidebar.button("click to clear evaluation metrics",evaluationer.reg_evaluation_df.drop(index = evaluationer.reg_evaluation_df.index))
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
|
best_tts.py
CHANGED
|
@@ -10,9 +10,9 @@ def best_tts(X,y,model,eva):
|
|
| 10 |
if eva == "reg":
|
| 11 |
|
| 12 |
test_r2_,test_r2_ts,test_r2_rs = 0,0,0
|
| 13 |
-
for k in range(10,25):
|
| 14 |
i = k/100
|
| 15 |
-
for j in range(1,100):
|
| 16 |
X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size = i, random_state = j,)
|
| 17 |
|
| 18 |
model = model
|
|
|
|
| 10 |
if eva == "reg":
|
| 11 |
|
| 12 |
test_r2_,test_r2_ts,test_r2_rs = 0,0,0
|
| 13 |
+
for k in range(10,25,3):
|
| 14 |
i = k/100
|
| 15 |
+
for j in range(1,100,10):
|
| 16 |
X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size = i, random_state = j,)
|
| 17 |
|
| 18 |
model = model
|
eda.py
ADDED
|
@@ -0,0 +1,325 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import streamlit as st
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import seaborn as sns
|
| 6 |
+
import streamlit as st
|
| 7 |
+
import streamlit.components.v1 as components
|
| 8 |
+
import plotly.express as px
|
| 9 |
+
from plotly.subplots import make_subplots
|
| 10 |
+
import plotly.graph_objects as go
|
| 11 |
+
import streamlit as st
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import datashader as ds
|
| 14 |
+
import datashader.transfer_functions as tf
|
| 15 |
+
from colorcet import fire
|
| 16 |
+
import plotly.express as px
|
| 17 |
+
# function to analysing EDA
|
| 18 |
+
def eda_analysis(df):
|
| 19 |
+
|
| 20 |
+
target_col = st.sidebar.selectbox("Select Target Column", df.columns,index = len(df.columns)-1)
|
| 21 |
+
y = df[target_col]
|
| 22 |
+
X = df.drop(columns = target_col)
|
| 23 |
+
num_cols = X.select_dtypes(exclude= "O").columns.tolist()
|
| 24 |
+
cat_cols = X.select_dtypes(include= "O").columns.tolist()
|
| 25 |
+
st.write("num_cols",tuple(num_cols))
|
| 26 |
+
st.write("cat_cols",tuple(cat_cols))
|
| 27 |
+
st.divider()
|
| 28 |
+
|
| 29 |
+
results = []
|
| 30 |
+
for column in X[num_cols].columns:
|
| 31 |
+
skewness = X[column].skew()
|
| 32 |
+
kurtosis = X[column].kurtosis()
|
| 33 |
+
|
| 34 |
+
skewness_html = f'<span style="color: {"red" if abs(skewness) > .5 else "white"}">{skewness:.2f}</span>'
|
| 35 |
+
kurtosis_html = f'<span style="color: {"red" if abs(kurtosis) > 3 else "white"}">{kurtosis:.2f}</span>'
|
| 36 |
+
|
| 37 |
+
results.append({
|
| 38 |
+
'Column': column,
|
| 39 |
+
'Skewness': skewness,
|
| 40 |
+
'Kurtosis': kurtosis,
|
| 41 |
+
'Skewness_': skewness_html,
|
| 42 |
+
'Kurtosis_': kurtosis_html
|
| 43 |
+
})
|
| 44 |
+
|
| 45 |
+
result_df = pd.DataFrame(results)
|
| 46 |
+
|
| 47 |
+
# Display the data types of Skewness and Kurtosis columns
|
| 48 |
+
# st.write("Data types of Skewness and Kurtosis columns:", result_df[["Skewness", "Kurtosis"]].dtypes)
|
| 49 |
+
|
| 50 |
+
if st.toggle("Show Skewness and Kurtosis of DataFrame columns"):
|
| 51 |
+
st.write("Columns with Skewness and Kurtosis:")
|
| 52 |
+
if st.checkbox("Filter Skewed columns"):
|
| 53 |
+
filtered_df = result_df[abs(result_df["Skewness"]) > 0.5]
|
| 54 |
+
st.write(filtered_df[['Column', 'Skewness_', 'Kurtosis_']].to_html(escape=False), unsafe_allow_html=True)
|
| 55 |
+
else:
|
| 56 |
+
st.write(result_df[['Column', 'Skewness_', 'Kurtosis_']].to_html(escape=False), unsafe_allow_html=True)
|
| 57 |
+
|
| 58 |
+
st.divider()
|
| 59 |
+
st.write("Plotting Numerical Columns for Visual EDA")
|
| 60 |
+
|
| 61 |
+
# Create two columns
|
| 62 |
+
column1, column2 = st.columns(2)
|
| 63 |
+
|
| 64 |
+
# Checkbox for plotting distribution in the first column
|
| 65 |
+
with column1:
|
| 66 |
+
plot_distribution = st.checkbox("Plot Distribution of Target Column")
|
| 67 |
+
|
| 68 |
+
# Show the second checkbox in the second column only if the first checkbox is clicked
|
| 69 |
+
if plot_distribution:
|
| 70 |
+
with column2:
|
| 71 |
+
show_kde = st.checkbox("Show KDE Plot")
|
| 72 |
+
kde = show_kde
|
| 73 |
+
else:
|
| 74 |
+
kde = False
|
| 75 |
+
|
| 76 |
+
# Plot the histogram if the first checkbox is checked
|
| 77 |
+
if plot_distribution:
|
| 78 |
+
fig, ax = plt.subplots()
|
| 79 |
+
sns.histplot(y, ax=ax, kde=kde)
|
| 80 |
+
|
| 81 |
+
# Show the plot in the Streamlit app
|
| 82 |
+
st.pyplot(fig)
|
| 83 |
+
|
| 84 |
+
column3, column4 = st.columns(2)
|
| 85 |
+
with column3:
|
| 86 |
+
plot_distribution_nc =st.checkbox("Plot Distribution of Input Numerical columns")
|
| 87 |
+
if plot_distribution_nc:
|
| 88 |
+
with column4:
|
| 89 |
+
show_kde_1 = st.checkbox("Show KDE Plot for Numerical Columns")
|
| 90 |
+
kde_1 = show_kde_1
|
| 91 |
+
if plot_distribution_nc:
|
| 92 |
+
for column in num_cols:
|
| 93 |
+
fig, ax = plt.subplots()
|
| 94 |
+
sns.histplot(df[column], ax=ax, kde=kde_1)
|
| 95 |
+
st.write(f"Distribution of {column}:")
|
| 96 |
+
st.pyplot(fig)
|
| 97 |
+
st.divider()
|
| 98 |
+
# plot count plot for categorical columns
|
| 99 |
+
st.write("Plotting Categorical Columns for Visual EDA")
|
| 100 |
+
if st.checkbox("Plot Distribution of Input Categorical columns") :
|
| 101 |
+
for column in cat_cols:
|
| 102 |
+
fig, ax = plt.subplots()
|
| 103 |
+
fig = px.histogram(df.fillna('Null'), x=column, color=target_col)
|
| 104 |
+
st.write(fig)
|
| 105 |
+
|
| 106 |
+
st.divider()
|
| 107 |
+
# plot correlation matrics using plotly
|
| 108 |
+
st.write("Plotting Correlation Matrix for Numerical Columns")
|
| 109 |
+
|
| 110 |
+
column5, column6 = st.columns(2)
|
| 111 |
+
with column5:
|
| 112 |
+
plot_distribution =st.checkbox("Plot Correlation Matrix")
|
| 113 |
+
if plot_distribution:
|
| 114 |
+
with column6:
|
| 115 |
+
show_value = st.checkbox("Correlation values > 0.5")
|
| 116 |
+
if show_value:
|
| 117 |
+
# Compute correlation matrix
|
| 118 |
+
corr_matrix = df[num_cols].corr()
|
| 119 |
+
|
| 120 |
+
# Plot correlation matrix heatmap
|
| 121 |
+
fig = px.imshow(corr_matrix[abs(corr_matrix)>0.5], color_continuous_scale='RdBu')
|
| 122 |
+
|
| 123 |
+
# Add annotations for values greater than 0.5
|
| 124 |
+
for i in range(corr_matrix.shape[0]):
|
| 125 |
+
for j in range(corr_matrix.shape[1]):
|
| 126 |
+
correlation_value = corr_matrix.iloc[i, j]
|
| 127 |
+
if abs(correlation_value) > 0.5: # Filter values greater than 0.5
|
| 128 |
+
fig.add_annotation(
|
| 129 |
+
x=i, y=j,
|
| 130 |
+
text=str(round(correlation_value, 2)),
|
| 131 |
+
showarrow=False
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
# Update layout
|
| 135 |
+
fig.update_layout(
|
| 136 |
+
xaxis=dict(side="top"),
|
| 137 |
+
width=600,
|
| 138 |
+
height=600,
|
| 139 |
+
margin=dict(l=20, r=20, t=40, b=20)
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
# Display the heatmap
|
| 143 |
+
st.write(fig)
|
| 144 |
+
if plot_distribution and not show_value:
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
corr_matrix = df[num_cols].corr()
|
| 148 |
+
fig = px.imshow(corr_matrix, color_continuous_scale='RdBu')
|
| 149 |
+
for i in range(corr_matrix.shape[0]):
|
| 150 |
+
for j in range(corr_matrix.shape[1]):
|
| 151 |
+
fig.add_annotation(
|
| 152 |
+
x=i, y=j,
|
| 153 |
+
text=str(round(corr_matrix.iloc[i, j], 2)),
|
| 154 |
+
showarrow=False
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
# Update the layout to ensure annotations are displayed properly
|
| 158 |
+
fig.update_layout(
|
| 159 |
+
xaxis=dict(side="top"),
|
| 160 |
+
width=600,
|
| 161 |
+
height=600,
|
| 162 |
+
margin=dict(l=20, r=20, t=40, b=20)
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
st.write(fig)
|
| 166 |
+
st.divider()
|
| 167 |
+
outlier_cols = st.multiselect("Select Continous numerical columns for Outlier Plot",num_cols)
|
| 168 |
+
|
| 169 |
+
# plot px.boxplot for outlier cols
|
| 170 |
+
if st.toggle("Toggle for Violin Plot"):
|
| 171 |
+
if st.checkbox("Plot BoxPlot for Outlier Cols"):
|
| 172 |
+
if st.toggle("Split by Target"):
|
| 173 |
+
for col in outlier_cols:
|
| 174 |
+
fig = px.violin(df, x=col,color=y)
|
| 175 |
+
st.write(fig)
|
| 176 |
+
st.divider()
|
| 177 |
+
else:
|
| 178 |
+
for col in outlier_cols:
|
| 179 |
+
fig = px.violin(df, x=col)
|
| 180 |
+
st.write(fig)
|
| 181 |
+
st.divider()
|
| 182 |
+
if st.checkbox("check outlier distribution of Target column"):
|
| 183 |
+
fig = px.violin(y)
|
| 184 |
+
st.write(fig)
|
| 185 |
+
|
| 186 |
+
else:
|
| 187 |
+
if st.checkbox("Plot BoxPlot for Outlier Cols"):
|
| 188 |
+
if st.toggle("Split by Target"):
|
| 189 |
+
for col in outlier_cols:
|
| 190 |
+
fig = px.box(df, x=col,color=y)
|
| 191 |
+
st.write(fig)
|
| 192 |
+
st.divider()
|
| 193 |
+
else:
|
| 194 |
+
for col in outlier_cols:
|
| 195 |
+
fig = px.box(df, x=col)
|
| 196 |
+
st.write(fig)
|
| 197 |
+
st.divider()
|
| 198 |
+
if st.checkbox("check outlier distribution of Target column"):
|
| 199 |
+
fig = px.box(y)
|
| 200 |
+
st.write(fig)
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
# plot scatter plot using px
|
| 204 |
+
st.divider()
|
| 205 |
+
|
| 206 |
+
if st.checkbox("Plot Scatter Plot"):
|
| 207 |
+
column7, column8,column9 = st.columns(3)
|
| 208 |
+
with column7:
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
# Select y-axis column
|
| 212 |
+
y_col = st.selectbox("Select y axis column", df.columns)
|
| 213 |
+
|
| 214 |
+
# Filter categorical columns for the x-axis selection
|
| 215 |
+
categorical_columns = df.columns
|
| 216 |
+
with column8:
|
| 217 |
+
# Allow user to select the x-axis column from categorical columns
|
| 218 |
+
x_col = st.selectbox("Select x axis column", categorical_columns)
|
| 219 |
+
with column9:
|
| 220 |
+
hue_col = st.selectbox("Select Hue column",categorical_columns)
|
| 221 |
+
# Plot scatter plot using Plotly
|
| 222 |
+
fig = px.scatter(df, x=x_col, y=y_col, color=hue_col)
|
| 223 |
+
st.write(fig)
|
| 224 |
+
|
| 225 |
+
# barchart and line chart
|
| 226 |
+
st.divider()
|
| 227 |
+
if st.checkbox("Plot Bar Chart"):
|
| 228 |
+
column10, column11 = st.columns(2)
|
| 229 |
+
with column10:
|
| 230 |
+
# Select y-axis column
|
| 231 |
+
y_col = st.selectbox("Select y axis column", df.columns)
|
| 232 |
+
|
| 233 |
+
# Filter categorical columns for the x-axis selection
|
| 234 |
+
categorical_columns = df.columns
|
| 235 |
+
with column11:
|
| 236 |
+
# Allow user to select the x-axis column from categorical columns
|
| 237 |
+
x_col = st.selectbox("Select x axis column", categorical_columns)
|
| 238 |
+
fig = px.bar(df, x=x_col, y=y_col,color = x_col)
|
| 239 |
+
st.write(fig)
|
| 240 |
+
st.divider()
|
| 241 |
+
if st.checkbox("Plot Line Chart"):
|
| 242 |
+
column12, column13,colx = st.columns(3)
|
| 243 |
+
with column12:
|
| 244 |
+
# Select y-axis column
|
| 245 |
+
y_col = st.selectbox("Select y axis column", df.columns)
|
| 246 |
+
|
| 247 |
+
# Filter categorical columns for the x-axis selection
|
| 248 |
+
categorical_columns = df.columns
|
| 249 |
+
with column13:
|
| 250 |
+
# Allow user to select the x-axis column from categorical columns
|
| 251 |
+
x_col = st.selectbox("Select x axis column", categorical_columns)
|
| 252 |
+
with colx:
|
| 253 |
+
hue_col1 = st.selectbox("Select Line split column",categorical_columns)
|
| 254 |
+
fig = px.line(df.sort_values(by = y_col), x=x_col, y=y_col,color = hue_col1)
|
| 255 |
+
st.write(fig)
|
| 256 |
+
st.divider()
|
| 257 |
+
# plot pie chart
|
| 258 |
+
if st.checkbox("Plot Pie Chart "):
|
| 259 |
+
column14, column15 = st.columns(2)
|
| 260 |
+
with column14:
|
| 261 |
+
# Select y-axis column
|
| 262 |
+
y_col = st.selectbox("Select values columns", df.columns)
|
| 263 |
+
|
| 264 |
+
# Filter categorical columns for the x-axis selection
|
| 265 |
+
categorical_columns = df.columns
|
| 266 |
+
with column15:
|
| 267 |
+
# Allow user to select the x-axis column from categorical columns
|
| 268 |
+
x_col = st.selectbox("Select names column", categorical_columns)
|
| 269 |
+
fig = px.pie(df, values=y_col, names=x_col)
|
| 270 |
+
st.write(fig)
|
| 271 |
+
|
| 272 |
+
st.divider()
|
| 273 |
+
# check if there are latitude and longitude columns
|
| 274 |
+
if st.checkbox("Plot on Map"):
|
| 275 |
+
lat_col = st.selectbox("Select Latitute Column",df.columns)
|
| 276 |
+
long_col = st.selectbox("Select Longitude Column",df.columns)
|
| 277 |
+
color = st.selectbox
|
| 278 |
+
|
| 279 |
+
# # Create the datashader canvas and aggregate points
|
| 280 |
+
# cvs = ds.Canvas(plot_width=1000, plot_height=1000)
|
| 281 |
+
# agg = cvs.points(df, x=long_col, y=lat_col)
|
| 282 |
+
|
| 283 |
+
# # Get the coordinates for the mapbox layer
|
| 284 |
+
# coords_lat, coords_lon = agg.coords[lat_col].values, agg.coords[long_col].values
|
| 285 |
+
# coordinates = [
|
| 286 |
+
# [coords_lon[0], coords_lat[0]],
|
| 287 |
+
# [coords_lon[-1], coords_lat[0]],
|
| 288 |
+
# [coords_lon[-1], coords_lat[-1]],
|
| 289 |
+
# [coords_lon[0], coords_lat[-1]]
|
| 290 |
+
# ]
|
| 291 |
+
|
| 292 |
+
# # Generate the datashader image
|
| 293 |
+
# img = tf.shade(agg, cmap=fire)[::-1].to_pil()
|
| 294 |
+
|
| 295 |
+
# # Create the Plotly figure with a mapbox layer
|
| 296 |
+
# fig = px.scatter_mapbox(df[:1], lat=lat_col, lon=long_col, zoom=10) # Adjust zoom level as needed
|
| 297 |
+
# fig.update_layout(mapbox_style="carto-darkmatter",
|
| 298 |
+
# mapbox_layers=[
|
| 299 |
+
# {
|
| 300 |
+
# "sourcetype": "image",
|
| 301 |
+
# "source": img,
|
| 302 |
+
# "coordinates": coordinates
|
| 303 |
+
# }
|
| 304 |
+
# ])
|
| 305 |
+
|
| 306 |
+
# # Display the figure in Streamlit
|
| 307 |
+
# st.plotly_chart(fig)
|
| 308 |
+
|
| 309 |
+
# Create a scatter mapbox plot with vibrant colors and custom marker sizes
|
| 310 |
+
if st.button("Proceed to plot map"):
|
| 311 |
+
fig = px.scatter_mapbox(df, lat=lat_col, lon=long_col,
|
| 312 |
+
|
| 313 |
+
size_max=15, # Max marker size
|
| 314 |
+
mapbox_style="open-street-map", # Using a different map style for vibrancy
|
| 315 |
+
zoom=1,
|
| 316 |
+
title='Latitude and Longitude Plotting')
|
| 317 |
+
|
| 318 |
+
# Customize the layout for more vibrant appearance
|
| 319 |
+
fig.update_layout(mapbox_accesstoken='your_mapbox_access_token')
|
| 320 |
+
st.write(fig)
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
|
feature_selections.py
CHANGED
|
@@ -8,12 +8,10 @@ import pandas as pd
|
|
| 8 |
import numpy as np
|
| 9 |
import evaluationer
|
| 10 |
import streamlit as st
|
| 11 |
-
|
|
|
|
| 12 |
from sklearn.metrics import root_mean_squared_error
|
| 13 |
def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
|
| 14 |
-
|
| 15 |
-
st.write("dvsdv",y_train)
|
| 16 |
-
st.write("dvfssdv",X_train)
|
| 17 |
|
| 18 |
model = sm.OLS(y_train, sm.add_constant(X_train))
|
| 19 |
model_fit = model.fit()
|
|
@@ -100,5 +98,7 @@ def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
|
|
| 100 |
feature_cols_name = ["pval_cols","coef_cols","pval_and_coef_cols","mi_cols","corr_u_cols","corr_l_cols","vif_cols","lasso_cols"]
|
| 101 |
st.write("feature_cols", vif_cols)
|
| 102 |
for i,j in enumerate(feature_cols):
|
| 103 |
-
evaluationer.evaluation(f"{feature_cols_name[i]}
|
| 104 |
-
return evaluationer.reg_evaluation_df
|
|
|
|
|
|
|
|
|
| 8 |
import numpy as np
|
| 9 |
import evaluationer
|
| 10 |
import streamlit as st
|
| 11 |
+
|
| 12 |
+
|
| 13 |
from sklearn.metrics import root_mean_squared_error
|
| 14 |
def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
model = sm.OLS(y_train, sm.add_constant(X_train))
|
| 17 |
model_fit = model.fit()
|
|
|
|
| 98 |
feature_cols_name = ["pval_cols","coef_cols","pval_and_coef_cols","mi_cols","corr_u_cols","corr_l_cols","vif_cols","lasso_cols"]
|
| 99 |
st.write("feature_cols", vif_cols)
|
| 100 |
for i,j in enumerate(feature_cols):
|
| 101 |
+
evaluationer.evaluation(f"{feature_cols_name[i]}" ,X_train.drop(columns = j),X_test.drop(columns = j),y_train,y_test,model_reg,method = root_mean_squared_error,eva = "reg")
|
| 102 |
+
return evaluationer.reg_evaluation_df,feature_cols,feature_cols_name
|
| 103 |
+
|
| 104 |
+
|
grid_search_cv.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
|
| 2 |
+
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
|
| 3 |
+
from sklearn.neighbors import KNeighborsRegressor
|
| 4 |
+
from sklearn.tree import DecisionTreeRegressor
|
| 5 |
+
from sklearn.svm import SVR
|
| 6 |
+
from xgboost import XGBRegressor, XGBRFRegressor
|
| 7 |
+
from sklearn.neural_network import MLPRegressor
|
| 8 |
+
from lightgbm import LGBMRegressor
|
| 9 |
+
from sklearn.naive_bayes import GaussianNB
|
| 10 |
+
from sklearn.model_selection import GridSearchCV
|
| 11 |
+
from sklearn.datasets import make_regression
|
| 12 |
+
from sklearn.model_selection import train_test_split
|
| 13 |
+
import streamlit as st
|
| 14 |
+
import evaluationer
|
| 15 |
+
|
| 16 |
+
from sklearn.metrics import root_mean_squared_error
|
| 17 |
+
|
| 18 |
+
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
|
| 19 |
+
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
|
| 20 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 21 |
+
from sklearn.tree import DecisionTreeClassifier
|
| 22 |
+
from sklearn.svm import SVC
|
| 23 |
+
from xgboost import XGBClassifier, XGBRFClassifier
|
| 24 |
+
from sklearn.neural_network import MLPClassifier
|
| 25 |
+
from lightgbm import LGBMClassifier
|
| 26 |
+
from sklearn.naive_bayes import MultinomialNB, CategoricalNB
|
| 27 |
+
|
| 28 |
+
# Hyperparameter search spaces for each classifier, keyed by the same display
# names as the `classifiers` dictionary below. Values are GridSearchCV-ready
# param_grid objects (a dict, or a list of dicts for solver-dependent options).
param_grids_class = {
    # BUG FIX: the original flat grid combined solver='lbfgs' with penalty
    # 'l1'/'elasticnet' (unsupported -> fit-time error) and used the string
    # 'none', which scikit-learn 1.4 (pinned in requirements) rejects in favor
    # of None. A list of dicts keeps every combination valid.
    "Logistic Regression": [
        {'solver': ['lbfgs'], 'penalty': ['l2', None], 'C': [0.01, 0.1, 1, 10]},
        {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10]},
        {'solver': ['saga'], 'penalty': ['l1', 'l2', None], 'C': [0.01, 0.1, 1, 10]},
    ],

    "SGD Classifier": {
        # BUG FIX: loss 'log' was removed in scikit-learn 1.3; the valid name is
        # 'log_loss' (logistic regression loss).
        'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge'],
        'penalty': ['l2', 'l1', 'elasticnet'],
        'alpha': [0.0001, 0.001, 0.01],
        'max_iter': [1000, 5000, 10000]
    },

    "Ridge Classifier": {
        'alpha': [0.1, 1, 10, 100]
    },

    "Random Forest Classifier": {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "AdaBoost Classifier": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },

    "Gradient Boosting Classifier": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "Hist Gradient Boosting Classifier": {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [None, 10, 20],
        'min_samples_leaf': [20, 50, 100]
    },

    "K Neighbors Classifier": {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },

    "Decision Tree Classifier": {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "SVC": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'poly', 'rbf'],
        'degree': [3, 4, 5],  # degree only affects the 'poly' kernel
        'gamma': ['scale', 'auto']
    },

    "XGB Classifier": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "XGBRF Classifier": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "MLP Classifier": {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['tanh', 'relu'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },

    "LGBM Classifier": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [-1, 10, 20]  # -1 means no depth limit in LightGBM
    },

    "Multinomial Naive Bayes": {
        'alpha': [0.1, 0.5, 1.0]
    },

    "Categorical Naive Bayes": {
        'alpha': [0.1, 0.5, 1.0]
    }
}
|
| 123 |
+
|
| 124 |
+
# Hyperparameter search spaces for each regressor, keyed by the same display
# names as the `regressors` dictionary below. An empty dict means GridSearchCV
# just fits the estimator with its defaults.
param_grids_reg = {
    "Linear Regression": {},

    "SGD Regressor": {
        # BUG FIX: loss 'squared_loss' was removed in scikit-learn 1.2; the
        # valid name (for the pinned scikit-learn==1.4.2) is 'squared_error'.
        'loss': ['squared_error', 'huber'],
        'penalty': ['l2', 'l1', 'elasticnet'],
        'alpha': [0.0001, 0.001, 0.01],
        'max_iter': [1000, 5000, 10000]
    },

    "Ridge Regressor": {
        'alpha': [0.1, 1, 10, 100],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr']
    },

    "Lasso Regressor": {
        'alpha': [0.1, 1, 10, 100]
    },

    "ElasticNet Regressor": {
        'alpha': [0.1, 1, 10, 100],
        'l1_ratio': [0.1, 0.5, 0.9]
    },

    "Random Forest Regressor": {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "AdaBoost Regressor": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },

    "Gradient Boosting Regressor": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "Hist Gradient Boosting Regressor": {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [None, 10, 20],
        'min_samples_leaf': [20, 50, 100]
    },

    "K Neighbors Regressor": {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },

    "Decision Tree Regressor": {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "SVR": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'poly', 'rbf'],
        'degree': [3, 4, 5],  # degree only affects the 'poly' kernel
        'gamma': ['scale', 'auto']
    },

    "XGB Regressor": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "XGBRF Regressor": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "MLP Regressor": {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['tanh', 'relu'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },

    "LGBM Regressor": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [-1, 10, 20]  # -1 means no depth limit in LightGBM
    },

    # NOTE(review): GaussianNB is a classifier; it is kept here (and in the
    # `regressors` dict) only because the existing UI lists it under regression.
    "Gaussian Naive Bayes": {
        'var_smoothing': [1e-9, 1e-8, 1e-7]
    }
}
|
| 221 |
+
|
| 222 |
+
# Define the regressors
# Lookup table of display name -> freshly constructed regressor instance.
# Built from (name, class) pairs so each entry is instantiated with defaults.
regressors = {
    label: estimator_cls()
    for label, estimator_cls in (
        ("Linear Regression", LinearRegression),
        ("SGD Regressor", SGDRegressor),
        ("Ridge Regressor", Ridge),
        ("Lasso Regressor", Lasso),
        ("ElasticNet Regressor", ElasticNet),
        ("Random Forest Regressor", RandomForestRegressor),
        ("AdaBoost Regressor", AdaBoostRegressor),
        ("Gradient Boosting Regressor", GradientBoostingRegressor),
        ("Hist Gradient Boosting Regressor", HistGradientBoostingRegressor),
        ("K Neighbors Regressor", KNeighborsRegressor),
        ("Decision Tree Regressor", DecisionTreeRegressor),
        ("SVR", SVR),
        ("XGB Regressor", XGBRegressor),
        ("XGBRF Regressor", XGBRFRegressor),
        ("MLP Regressor", MLPRegressor),
        ("LGBM Regressor", LGBMRegressor),
        ("Gaussian Naive Bayes", GaussianNB),
    )
}
|
| 242 |
+
|
| 243 |
+
# Lookup table of display name -> freshly constructed classifier instance.
# Built from (name, class) pairs so each entry is instantiated with defaults.
classifiers = {
    label: estimator_cls()
    for label, estimator_cls in (
        ("Logistic Regression", LogisticRegression),
        ("SGD Classifier", SGDClassifier),
        ("Ridge Classifier", RidgeClassifier),
        ("Random Forest Classifier", RandomForestClassifier),
        ("AdaBoost Classifier", AdaBoostClassifier),
        ("Gradient Boosting Classifier", GradientBoostingClassifier),
        ("Hist Gradient Boosting Classifier", HistGradientBoostingClassifier),
        ("K Neighbors Classifier", KNeighborsClassifier),
        ("Decision Tree Classifier", DecisionTreeClassifier),
        ("SVC", SVC),
        ("XGB Classifier", XGBClassifier),
        ("XGBRF Classifier", XGBRFClassifier),
        ("MLP Classifier", MLPClassifier),
        ("LGBM Classifier", LGBMClassifier),
        ("Multinomial Naive Bayes", MultinomialNB),
        ("Categorical Naive Bayes", CategoricalNB),
    )
}
|
| 261 |
+
def perform_grid_search(model, model_name, X_train, X_test, y_train, y_test, eva):
    """Tune `model_name` with 5-fold GridSearchCV and evaluate the best estimator.

    Parameters
    ----------
    model : estimator
        Kept for signature compatibility with existing callers; the tuned
        `best_estimator_` (not this object) is what gets evaluated.
    model_name : str
        Key into the module-level `regressors`/`classifiers` and
        `param_grids_reg`/`param_grids_class` dictionaries.
    X_train, X_test, y_train, y_test : array-like
        Train/test split of the data.
    eva : str
        "reg" for regression (scored by negative MSE) or "class" for
        classification (scored by accuracy).

    Returns
    -------
    estimator
        The fitted best estimator found by the grid search.
    """
    # Select the estimator, grid, and scoring once; the search itself is
    # identical for both problem types (the original duplicated this block).
    if eva == "reg":
        estimator = regressors[model_name]
        param_grid = param_grids_reg[model_name]
        scoring = 'neg_mean_squared_error'
    elif eva == "class":
        estimator = classifiers[model_name]
        param_grid = param_grids_class[model_name]
        scoring = 'accuracy'
    else:
        # The original silently did nothing for an unknown `eva`; fail loudly.
        raise ValueError(f"eva must be 'reg' or 'class', got {eva!r}")

    grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid,
                               cv=5, scoring=scoring)
    grid_search.fit(X_train, y_train)

    st.write(f"Best Parameters for {model_name}: {grid_search.best_params_}")
    st.write(f"Best Score for {model_name}: {grid_search.best_score_}")

    best_model = grid_search.best_estimator_
    # BUG FIX: the original evaluated the untuned `model` argument and discarded
    # both `best_model` and an unused `y_pred` — evaluate the tuned estimator.
    # NOTE(review): root_mean_squared_error is forwarded for both branches,
    # mirroring the original call; confirm evaluationer.evaluation ignores the
    # metric argument when eva == "class".
    evaluationer.evaluation("best hyperparams", X_train, X_test, y_train, y_test,
                            best_model, root_mean_squared_error, eva)
    return best_model
|
models.py
CHANGED
|
@@ -23,6 +23,8 @@ from sklearn.neural_network import MLPRegressor
|
|
| 23 |
from lightgbm import LGBMRegressor
|
| 24 |
from sklearn.naive_bayes import GaussianNB
|
| 25 |
|
|
|
|
|
|
|
| 26 |
# dictionary where keys are name of algorithm and values are algorithm for classifier
|
| 27 |
algos_class = {
|
| 28 |
"Logistic Regression": LogisticRegression(),
|
|
|
|
| 23 |
from lightgbm import LGBMRegressor
|
| 24 |
from sklearn.naive_bayes import GaussianNB
|
| 25 |
|
| 26 |
+
|
| 27 |
+
|
| 28 |
# dictionary where keys are name of algorithm and values are algorithm for classifier
|
| 29 |
algos_class = {
|
| 30 |
"Logistic Regression": LogisticRegression(),
|
requirements.txt
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
|
|
| 1 |
streamlit==1.34.0
|
| 2 |
joblib==1.4.2
|
| 3 |
numpy==1.26.4
|
| 4 |
pandas==2.2.2
|
| 5 |
scikit-learn==1.4.2
|
| 6 |
-
|
|
|
|
|
|
|
| 7 |
matplotlib==3.9.0
|
| 8 |
-
|
| 9 |
-
lightgbm==4.3.0
|
| 10 |
-
statsmodels==0.14.2
|
|
|
|
| 1 |
+
|
| 2 |
streamlit==1.34.0
|
| 3 |
joblib==1.4.2
|
| 4 |
numpy==1.26.4
|
| 5 |
pandas==2.2.2
|
| 6 |
scikit-learn==1.4.2
|
| 7 |
+
datashader==0.16.2
|
| 8 |
+
colorcet==3.1.0
|
| 9 |
+
plotly==5.22.0
|
| 10 |
matplotlib==3.9.0
|
| 11 |
+
seaborn==0.13.2
|
|
|
|
|
|