Upload 12 files
Browse files- .streamlit/config.toml +7 -0
- app.py +166 -53
- auto_optimizer.py +361 -317
- best_tts.py +2 -2
- eda.py +325 -0
- feature_selections.py +6 -6
- grid_search_cv.py +284 -0
- models.py +2 -0
- requirements.txt +5 -4
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
[theme]
|
| 3 |
+
primaryColor="#F63366"
|
| 4 |
+
backgroundColor="#002148"
|
| 5 |
+
secondaryBackgroundColor="#576c86"
|
| 6 |
+
textColor="white"
|
| 7 |
+
font="serif"
|
app.py
CHANGED
|
@@ -8,6 +8,7 @@ import evaluationer,models, null_value_handling
|
|
| 8 |
import auto_optimizer
|
| 9 |
from sklearn.experimental import enable_iterative_imputer
|
| 10 |
from sklearn.impute import SimpleImputer, IterativeImputer
|
|
|
|
| 11 |
# st.set_page_config(layout="wide")
|
| 12 |
|
| 13 |
st.set_page_config(
|
|
@@ -21,7 +22,23 @@ st.set_page_config(
|
|
| 21 |
}
|
| 22 |
)
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
# Title with Rainbow Transition Effect and Neon Glow
|
| 27 |
html_code = """
|
|
@@ -67,23 +84,74 @@ html_code = """
|
|
| 67 |
"""
|
| 68 |
|
| 69 |
st.markdown(html_code, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
|
|
|
|
| 72 |
# file uploader
|
| 73 |
csv_upload = st.sidebar.file_uploader("Input CSV File for ML modelling", type=['csv'])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
csv_upload2 = st.sidebar.file_uploader("Input CSV File of Test Data Prediction",type = ["csv"])
|
|
|
|
| 75 |
test = pd.DataFrame()
|
| 76 |
if csv_upload is not None:
|
| 77 |
# read the uploaded file into dataframe
|
| 78 |
-
df = pd.read_csv(csv_upload)
|
| 79 |
|
| 80 |
# saving the dataframe to a CSV file
|
| 81 |
df.to_csv('csv_upload.csv', index=False)
|
| 82 |
-
st.
|
| 83 |
-
|
| 84 |
if csv_upload2 is not None:
|
| 85 |
-
test = pd.read_csv(csv_upload2)
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
| 87 |
submission_id = test[id_col]
|
| 88 |
# st.write("Train File upl",submission_id)
|
| 89 |
|
|
@@ -93,8 +161,10 @@ if csv_upload is not None:
|
|
| 93 |
if len(test) >0:
|
| 94 |
# saving the test dataframe to a CSV file
|
| 95 |
test.to_csv('csv_upload_test.csv', index=False)
|
| 96 |
-
|
| 97 |
|
|
|
|
|
|
|
| 98 |
display_train_data = st.radio("Display Train Data",["Yes","No"],index = 1)
|
| 99 |
if display_train_data == "Yes":
|
| 100 |
st.dataframe(df.head())
|
|
@@ -104,29 +174,40 @@ if csv_upload is not None:
|
|
| 104 |
if display_test_data == "Yes":
|
| 105 |
st.dataframe(test.head())
|
| 106 |
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
# Display the selected column
|
| 113 |
st.write('You selected:', selected_column)
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
y = df[selected_column]
|
| 116 |
|
| 117 |
if y.dtype == "O":
|
| 118 |
-
st.
|
| 119 |
-
|
|
|
|
| 120 |
from sklearn.preprocessing import LabelEncoder
|
| 121 |
le = LabelEncoder()
|
| 122 |
y= pd.Series(le.fit_transform(y))
|
| 123 |
-
st.
|
| 124 |
-
|
| 125 |
-
if st.radio("Display Target Column",["Yes","No"],index =1) == "Yes":
|
| 126 |
st.dataframe(y.head())
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
|
|
|
| 130 |
if select_target_trans == "Yes":
|
| 131 |
selected_transformation = st.selectbox("Select Transformation method",["Log Transformation","Power Transformation"])
|
| 132 |
if selected_transformation == "Log Transformation":
|
|
@@ -155,36 +236,52 @@ if csv_upload is not None:
|
|
| 155 |
|
| 156 |
if st.radio("Display Target Column after Transformation",["Yes","No"],index =1) == "Yes":
|
| 157 |
st.dataframe(y.head())
|
| 158 |
-
|
|
|
|
| 159 |
|
| 160 |
X = df.drop(columns = selected_column)
|
| 161 |
|
| 162 |
if st.radio("Display X-Train Data",["Yes","No"],index =1) == "Yes":
|
| 163 |
st.dataframe(X.head())
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
len_duplicates = len(X[X.duplicated()])
|
| 166 |
if len_duplicates >0:
|
| 167 |
st.write(f"There are {len_duplicates} duplicate values in Train")
|
|
|
|
|
|
|
| 168 |
if st.selectbox("Drop Duplicate values",["Yes","No"],index = 1) == "Yes":
|
| 169 |
X = X.drop_duplicates()
|
| 170 |
st.write("Duplicate values removed ✅")
|
| 171 |
else:
|
| 172 |
st.write("There are no duplicate values in Train")
|
|
|
|
| 173 |
# dropping not important columns
|
| 174 |
-
|
|
|
|
| 175 |
selected_drop_column = st.multiselect('Select columns to be dropped', X.columns)
|
| 176 |
X = X.drop(columns = selected_drop_column)
|
| 177 |
if len(test) >0:
|
| 178 |
test = test.drop(columns = selected_drop_column)
|
| 179 |
-
st.write("Un-Important column(s)
|
| 180 |
st.dataframe(X.head())
|
| 181 |
|
|
|
|
| 182 |
num_cols = X.select_dtypes(exclude = "O").columns
|
| 183 |
cat_cols = X.select_dtypes(include = "O").columns
|
| 184 |
st.write("Numerical Columns in Train Data: ", tuple(num_cols))
|
| 185 |
st.write("Categorical Columns in Train Data: ", tuple(cat_cols))
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
ml_cat_ao = st.radio("Select Machine Learning Category",["Regression","Classification"],index =0)
|
| 189 |
|
| 190 |
if ml_cat_ao =="Regression":
|
|
@@ -192,7 +289,7 @@ if csv_upload is not None:
|
|
| 192 |
st.write("Select ML algorithm")
|
| 193 |
reg_model_name = st.selectbox("select model",models.Regression_models.index)
|
| 194 |
reg_model = models.Regression_models.loc[reg_model_name].values[0]
|
| 195 |
-
auto_optimizer.Auto_optimizer(X,y,eva,reg_model)
|
| 196 |
|
| 197 |
elif ml_cat_ao =="Classification":
|
| 198 |
eva = "class"
|
|
@@ -201,10 +298,12 @@ if csv_upload is not None:
|
|
| 201 |
class_model = models.Classification_models.loc[class_model_name].values[0]
|
| 202 |
auto_optimizer.Auto_optimizer(X,y,eva,class_model)
|
| 203 |
|
| 204 |
-
|
| 205 |
else:
|
|
|
|
| 206 |
if X.isnull().sum().sum() >0 :
|
| 207 |
-
|
|
|
|
| 208 |
|
| 209 |
if st.selectbox("Drop null values or Impute",["Drop Null Values","Impute Null Values"],index = 1) == "Drop Null Values":
|
| 210 |
|
|
@@ -241,7 +340,9 @@ if csv_upload is not None:
|
|
| 241 |
|
| 242 |
|
| 243 |
clean_num_nvh_df_cat = pd.DataFrame()
|
|
|
|
| 244 |
if X[cat_cols].isnull().sum().sum() >0:
|
|
|
|
| 245 |
st.write("Categorical Columns with Percentage of Null Values: ")
|
| 246 |
cat_cols_nvh = X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0].index
|
| 247 |
st.dataframe(round(X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0]/len(X)*100,2))
|
|
@@ -270,33 +371,41 @@ if csv_upload is not None:
|
|
| 270 |
null_value_handling.null_handling(X,clean_num_nvh_df,clean_num_nvh_df_cat)
|
| 271 |
st.write("X Data after Null value handling", X.head())
|
| 272 |
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
st.
|
| 278 |
-
|
|
|
|
|
|
|
| 279 |
ord_enc_cols = []
|
| 280 |
|
| 281 |
if len(cat_cols) == 0:
|
| 282 |
st.write("No Categorical Columns in Train")
|
| 283 |
else:
|
| 284 |
-
st.
|
|
|
|
|
|
|
|
|
|
| 285 |
for column in cat_cols:
|
| 286 |
|
| 287 |
selected = st.checkbox(column)
|
| 288 |
if selected:
|
| 289 |
st.write(f"No. of Unique value in {column} column are", X[column].nunique())
|
| 290 |
ord_enc_cols.append(column)
|
|
|
|
| 291 |
ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
|
| 292 |
ohe_enc_cols = list(ohe_enc_cols)
|
| 293 |
if len(ord_enc_cols)>0:
|
| 294 |
st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
|
| 295 |
if len(ohe_enc_cols)>0:
|
| 296 |
st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
|
| 297 |
-
|
|
|
|
| 298 |
if len(ord_enc_cols)>0:
|
| 299 |
-
|
|
|
|
| 300 |
ordinal_order_vals = []
|
| 301 |
|
| 302 |
for column in ord_enc_cols:
|
|
@@ -317,7 +426,7 @@ if csv_upload is not None:
|
|
| 317 |
st.write("Ordinal Encoding Completed ✅")
|
| 318 |
|
| 319 |
if len(ohe_enc_cols)>0:
|
| 320 |
-
if st.
|
| 321 |
from sklearn.preprocessing import OneHotEncoder
|
| 322 |
ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
|
| 323 |
pd.options.mode.chained_assignment = None
|
|
@@ -331,39 +440,43 @@ if csv_upload is not None:
|
|
| 331 |
|
| 332 |
st.write("DataFrame after One Hot Encoding",X.head())
|
| 333 |
st.write("OneHot Encoding Completed ✅")
|
| 334 |
-
|
| 335 |
new_df = pd.concat([X,y],axis = 1)
|
| 336 |
|
| 337 |
csv = new_df.to_csv(index = False)
|
| 338 |
-
if st.
|
| 339 |
st.download_button(label="Download Ordinal Encoded CSV File",data=csv,file_name='Encoded_DataFrame.csv',mime='text/csv')
|
| 340 |
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
"KFoldCV, Default (CV = 5)"], index = 0)== f"Train_Test_split, Default (Random_state = {random_state},Test_size = {test_size})":
|
| 347 |
ttsmethod = "Train_Test_split"
|
| 348 |
else:
|
| 349 |
ttsmethod = "KFoldCV"
|
| 350 |
st.write('You selected:', ttsmethod)
|
| 351 |
if ttsmethod == "Train_Test_split":
|
|
|
|
|
|
|
| 352 |
X_train,X_Val,y_train,y_val = tts(X,y[X.index],random_state = random_state,test_size = test_size)
|
| 353 |
-
st.write('X-Training Data shape:', (X_train.info()))
|
| 354 |
|
| 355 |
st.write('X-Training Data shape:', X_train.shape)
|
| 356 |
st.write('X-Validation Data shape:', X_Val.shape)
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
|
|
|
| 360 |
if ml_cat =="Regression":
|
| 361 |
-
|
|
|
|
|
|
|
|
|
|
| 362 |
|
| 363 |
method = evaluationer.method_df.loc[method_name_selector].values[0]
|
| 364 |
reg_algorithm = []
|
| 365 |
selected_options = []
|
| 366 |
-
|
| 367 |
for option in models.Regression_models.index:
|
| 368 |
selected = st.checkbox(option)
|
| 369 |
if selected:
|
|
@@ -450,7 +563,7 @@ if csv_upload is not None:
|
|
| 450 |
|
| 451 |
cla_algorithm = []
|
| 452 |
selected_options = []
|
| 453 |
-
|
| 454 |
for option in models.Classification_models.index:
|
| 455 |
selected = st.checkbox(option)
|
| 456 |
if selected:
|
|
|
|
| 8 |
import auto_optimizer
|
| 9 |
from sklearn.experimental import enable_iterative_imputer
|
| 10 |
from sklearn.impute import SimpleImputer, IterativeImputer
|
| 11 |
+
import eda
|
| 12 |
# st.set_page_config(layout="wide")
|
| 13 |
|
| 14 |
st.set_page_config(
|
|
|
|
| 22 |
}
|
| 23 |
)
|
| 24 |
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Set the background image
|
| 28 |
+
background_image = """
|
| 29 |
+
<style>
|
| 30 |
+
[data-testid="stAppViewContainer"] > .main {
|
| 31 |
+
background-image: url("https://w.wallhaven.cc/full/jx/wallhaven-jx7w25.png");
|
| 32 |
+
background-size: 100vw 100vh; # This sets the size to cover 100% of the viewport width and height
|
| 33 |
+
background-position: center;
|
| 34 |
+
background-repeat: no-repeat;
|
| 35 |
+
}
|
| 36 |
+
</style>
|
| 37 |
+
"""
|
| 38 |
+
|
| 39 |
+
st.markdown(background_image, unsafe_allow_html=True)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
|
| 43 |
# Title with Rainbow Transition Effect and Neon Glow
|
| 44 |
html_code = """
|
|
|
|
| 84 |
"""
|
| 85 |
|
| 86 |
st.markdown(html_code, unsafe_allow_html=True)
|
| 87 |
+
st.divider()
|
| 88 |
+
|
| 89 |
+
st.markdown(
|
| 90 |
+
"""
|
| 91 |
+
<style>
|
| 92 |
+
.success-message {
|
| 93 |
+
font-family: Arial, sans-serif;
|
| 94 |
+
font-size: 24px;
|
| 95 |
+
color: green;
|
| 96 |
+
text-align: left;
|
| 97 |
+
}
|
| 98 |
+
.unsuccess-message {
|
| 99 |
+
font-family: Arial, sans-serif;
|
| 100 |
+
font-size: 24px;
|
| 101 |
+
color: red;
|
| 102 |
+
text-align: left;
|
| 103 |
+
}
|
| 104 |
+
.prompt-message {
|
| 105 |
+
font-family: Arial, sans-serif;
|
| 106 |
+
font-size: 24px;
|
| 107 |
+
color: #333;
|
| 108 |
+
text-align: center;
|
| 109 |
+
}
|
| 110 |
+
.success-message2 {
|
| 111 |
+
font-family: Arial, sans-serif;
|
| 112 |
+
font-size: 18px;
|
| 113 |
+
color: white;
|
| 114 |
+
text-align: left;
|
| 115 |
+
}
|
| 116 |
+
.message-box {
|
| 117 |
+
text-align: center;
|
| 118 |
+
background-color: white;
|
| 119 |
+
padding: 5px;
|
| 120 |
+
border-radius: 10px;
|
| 121 |
+
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
|
| 122 |
+
font-size: 24px;
|
| 123 |
+
color: #333;
|
| 124 |
+
}
|
| 125 |
+
</style>
|
| 126 |
+
""",
|
| 127 |
+
unsafe_allow_html=True
|
| 128 |
+
)
|
| 129 |
|
| 130 |
|
| 131 |
+
# st.markdown('<p class="success-message">Train File uploaded successfully. ✅</p>', unsafe_allow_html=True)
|
| 132 |
# file uploader
|
| 133 |
csv_upload = st.sidebar.file_uploader("Input CSV File for ML modelling", type=['csv'])
|
| 134 |
+
|
| 135 |
+
sep = st.sidebar.text_input("Input Seperator")
|
| 136 |
+
if (len(sep) ==0):
|
| 137 |
+
sep = ","
|
| 138 |
csv_upload2 = st.sidebar.file_uploader("Input CSV File of Test Data Prediction",type = ["csv"])
|
| 139 |
+
|
| 140 |
test = pd.DataFrame()
|
| 141 |
if csv_upload is not None:
|
| 142 |
# read the uploaded file into dataframe
|
| 143 |
+
df = pd.read_csv(csv_upload,sep = sep)
|
| 144 |
|
| 145 |
# saving the dataframe to a CSV file
|
| 146 |
df.to_csv('csv_upload.csv', index=False)
|
| 147 |
+
st.markdown('<p class="success-message">Train File uploaded successfully. ✅</p>', unsafe_allow_html=True)
|
| 148 |
+
|
| 149 |
if csv_upload2 is not None:
|
| 150 |
+
test = pd.read_csv(csv_upload2,sep = sep)
|
| 151 |
+
st.markdown('<p class="success-message">Test File uploaded successfully. ✅</p>', unsafe_allow_html=True)
|
| 152 |
+
st.divider()
|
| 153 |
+
id_col = st.selectbox("Select Column for Submission i.e, ID",test.columns)
|
| 154 |
+
st.divider()
|
| 155 |
submission_id = test[id_col]
|
| 156 |
# st.write("Train File upl",submission_id)
|
| 157 |
|
|
|
|
| 161 |
if len(test) >0:
|
| 162 |
# saving the test dataframe to a CSV file
|
| 163 |
test.to_csv('csv_upload_test.csv', index=False)
|
| 164 |
+
|
| 165 |
|
| 166 |
+
st.markdown('<p class="message-box">Display Data</p>', unsafe_allow_html=True)
|
| 167 |
+
st.write("")
|
| 168 |
display_train_data = st.radio("Display Train Data",["Yes","No"],index = 1)
|
| 169 |
if display_train_data == "Yes":
|
| 170 |
st.dataframe(df.head())
|
|
|
|
| 174 |
if display_test_data == "Yes":
|
| 175 |
st.dataframe(test.head())
|
| 176 |
|
| 177 |
+
st.divider()
|
| 178 |
+
st.markdown('<div class="message-box success">Select Supervision Category</div>', unsafe_allow_html=True)
|
| 179 |
+
if st.radio("",["Supervised","Un-Supervised"],index =0) == "Supervised":
|
| 180 |
+
st.divider()
|
| 181 |
+
|
| 182 |
+
st.write('<p class="success-message2">Select Target column</p>', unsafe_allow_html=True)
|
| 183 |
+
selected_column = st.selectbox('', df.columns, index=(len(df.columns)-1))
|
| 184 |
|
| 185 |
# Display the selected column
|
| 186 |
st.write('You selected:', selected_column)
|
| 187 |
+
st.divider()
|
| 188 |
+
|
| 189 |
+
st.markdown('<div class="message-box success ">Perform EDA</div>', unsafe_allow_html=True)
|
| 190 |
+
st.write("")
|
| 191 |
+
if st.checkbox("Proceed to perform EDA"):
|
| 192 |
+
eda.eda_analysis(df)
|
| 193 |
+
st.write('<p class="success-message">EDA Performed proceed for Pre-processing</p>', unsafe_allow_html=True)
|
| 194 |
+
st.divider()
|
| 195 |
y = df[selected_column]
|
| 196 |
|
| 197 |
if y.dtype == "O":
|
| 198 |
+
st.markdown('<p class="unsuccess-message">⚠️⚠️⚠️ Target Column is Object Type ⚠️⚠️⚠️</p>', unsafe_allow_html=True)
|
| 199 |
+
|
| 200 |
+
if st.checkbox("Proceed for Label Encoding "):
|
| 201 |
from sklearn.preprocessing import LabelEncoder
|
| 202 |
le = LabelEncoder()
|
| 203 |
y= pd.Series(le.fit_transform(y))
|
| 204 |
+
st.markdown('<p class="success-message">Label Encoding Completed ✅</p>', unsafe_allow_html=True)
|
| 205 |
+
if st.checkbox("Display Target Column"):
|
|
|
|
| 206 |
st.dataframe(y.head())
|
| 207 |
|
| 208 |
+
st.divider()
|
| 209 |
+
st.markdown('<div class="message-box success">Target column Transformation</div>', unsafe_allow_html=True)
|
| 210 |
+
select_target_trans = st.radio("",["Yes","No"],index = 1)
|
| 211 |
if select_target_trans == "Yes":
|
| 212 |
selected_transformation = st.selectbox("Select Transformation method",["Log Transformation","Power Transformation"])
|
| 213 |
if selected_transformation == "Log Transformation":
|
|
|
|
| 236 |
|
| 237 |
if st.radio("Display Target Column after Transformation",["Yes","No"],index =1) == "Yes":
|
| 238 |
st.dataframe(y.head())
|
| 239 |
+
|
| 240 |
+
|
| 241 |
|
| 242 |
X = df.drop(columns = selected_column)
|
| 243 |
|
| 244 |
if st.radio("Display X-Train Data",["Yes","No"],index =1) == "Yes":
|
| 245 |
st.dataframe(X.head())
|
| 246 |
+
st.divider()
|
| 247 |
+
|
| 248 |
+
# st.checkbox()
|
| 249 |
+
st.markdown('<div class="message-box success">Check for duplicate Values</div>', unsafe_allow_html=True)
|
| 250 |
+
if st.radio(" ",["Yes","No"],index = 1) == "Yes":
|
| 251 |
len_duplicates = len(X[X.duplicated()])
|
| 252 |
if len_duplicates >0:
|
| 253 |
st.write(f"There are {len_duplicates} duplicate values in Train")
|
| 254 |
+
if st.checkbox("Show Duplicate values"):
|
| 255 |
+
st.dataframe(X[X.duplicated()])
|
| 256 |
if st.selectbox("Drop Duplicate values",["Yes","No"],index = 1) == "Yes":
|
| 257 |
X = X.drop_duplicates()
|
| 258 |
st.write("Duplicate values removed ✅")
|
| 259 |
else:
|
| 260 |
st.write("There are no duplicate values in Train")
|
| 261 |
+
st.divider()
|
| 262 |
# dropping not important columns
|
| 263 |
+
st.markdown('<div class="message-box success">Drop Unimportant Columns</div>', unsafe_allow_html=True)
|
| 264 |
+
if st.radio(" ",["Yes","No"],index = 1) == "Yes":
|
| 265 |
selected_drop_column = st.multiselect('Select columns to be dropped', X.columns)
|
| 266 |
X = X.drop(columns = selected_drop_column)
|
| 267 |
if len(test) >0:
|
| 268 |
test = test.drop(columns = selected_drop_column)
|
| 269 |
+
st.write("Un-Important column(s) Deleted ✅")
|
| 270 |
st.dataframe(X.head())
|
| 271 |
|
| 272 |
+
st.divider()
|
| 273 |
num_cols = X.select_dtypes(exclude = "O").columns
|
| 274 |
cat_cols = X.select_dtypes(include = "O").columns
|
| 275 |
st.write("Numerical Columns in Train Data: ", tuple(num_cols))
|
| 276 |
st.write("Categorical Columns in Train Data: ", tuple(cat_cols))
|
| 277 |
+
if st.sidebar.button("Clear Evaluation DataFrame"):
|
| 278 |
+
evaluationer.reg_evaluation_df = evaluationer.reg_evaluation_df.drop(index =evaluationer.reg_evaluation_df.index)
|
| 279 |
+
evaluationer.classification_evaluation_df = evaluationer.classification_evaluation_df.drop(index =evaluationer.reg_evaluation_df.index)
|
| 280 |
+
st.divider()
|
| 281 |
+
# markdown
|
| 282 |
+
st.markdown('<div class="message-box success">Select method for ML modelling</div>', unsafe_allow_html = True)
|
| 283 |
+
if st.radio(" ", ["Manual","Auto Optimized"],index = 0) == "Auto Optimized":
|
| 284 |
+
st.divider()
|
| 285 |
ml_cat_ao = st.radio("Select Machine Learning Category",["Regression","Classification"],index =0)
|
| 286 |
|
| 287 |
if ml_cat_ao =="Regression":
|
|
|
|
| 289 |
st.write("Select ML algorithm")
|
| 290 |
reg_model_name = st.selectbox("select model",models.Regression_models.index)
|
| 291 |
reg_model = models.Regression_models.loc[reg_model_name].values[0]
|
| 292 |
+
auto_optimizer.Auto_optimizer(X,y,eva,reg_model,reg_model_name)
|
| 293 |
|
| 294 |
elif ml_cat_ao =="Classification":
|
| 295 |
eva = "class"
|
|
|
|
| 298 |
class_model = models.Classification_models.loc[class_model_name].values[0]
|
| 299 |
auto_optimizer.Auto_optimizer(X,y,eva,class_model)
|
| 300 |
|
| 301 |
+
|
| 302 |
else:
|
| 303 |
+
st.divider()
|
| 304 |
if X.isnull().sum().sum() >0 :
|
| 305 |
+
|
| 306 |
+
st.markdown('<p class="unsuccess-message">⚠️⚠️⚠️ There are missing values in Train Data ⚠️⚠️⚠️</p>', unsafe_allow_html=True)
|
| 307 |
|
| 308 |
if st.selectbox("Drop null values or Impute",["Drop Null Values","Impute Null Values"],index = 1) == "Drop Null Values":
|
| 309 |
|
|
|
|
| 340 |
|
| 341 |
|
| 342 |
clean_num_nvh_df_cat = pd.DataFrame()
|
| 343 |
+
|
| 344 |
if X[cat_cols].isnull().sum().sum() >0:
|
| 345 |
+
st.divider()
|
| 346 |
st.write("Categorical Columns with Percentage of Null Values: ")
|
| 347 |
cat_cols_nvh = X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0].index
|
| 348 |
st.dataframe(round(X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0]/len(X)*100,2))
|
|
|
|
| 371 |
null_value_handling.null_handling(X,clean_num_nvh_df,clean_num_nvh_df_cat)
|
| 372 |
st.write("X Data after Null value handling", X.head())
|
| 373 |
|
| 374 |
+
new_df = pd.concat([X,y[X.index]],axis = 1)
|
| 375 |
+
|
| 376 |
+
csv = new_df.to_csv(index = False)
|
| 377 |
+
|
| 378 |
+
st.markdown('<p class="success-message">Null Values Handled Successfully. ✅</p>', unsafe_allow_html=True)
|
| 379 |
+
if st.checkbox("Download Null Value Handled DataFrame as CSV File ? "):
|
| 380 |
+
st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
|
| 381 |
+
st.divider()
|
| 382 |
ord_enc_cols = []
|
| 383 |
|
| 384 |
if len(cat_cols) == 0:
|
| 385 |
st.write("No Categorical Columns in Train")
|
| 386 |
else:
|
| 387 |
+
st.markdown('<div class="message-box success">Features Encoding</div>', unsafe_allow_html=True)
|
| 388 |
+
st.markdown('<p class="unsuccess-message">There are Object type Features in Train Data ⚠️</p>', unsafe_allow_html=True)
|
| 389 |
+
st.markdown('<p class="success-message2">Select Columns for Ordinal Encoding</p>', unsafe_allow_html=True)
|
| 390 |
+
|
| 391 |
for column in cat_cols:
|
| 392 |
|
| 393 |
selected = st.checkbox(column)
|
| 394 |
if selected:
|
| 395 |
st.write(f"No. of Unique value in {column} column are", X[column].nunique())
|
| 396 |
ord_enc_cols.append(column)
|
| 397 |
+
st.divider()
|
| 398 |
ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
|
| 399 |
ohe_enc_cols = list(ohe_enc_cols)
|
| 400 |
if len(ord_enc_cols)>0:
|
| 401 |
st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
|
| 402 |
if len(ohe_enc_cols)>0:
|
| 403 |
st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
|
| 404 |
+
st.divider()
|
| 405 |
+
st.markdown('<div class="message-box success">Proceed for Encoding</div>', unsafe_allow_html=True)
|
| 406 |
if len(ord_enc_cols)>0:
|
| 407 |
+
|
| 408 |
+
if st.checkbox("Proceed for Ordinal Encoding"):
|
| 409 |
ordinal_order_vals = []
|
| 410 |
|
| 411 |
for column in ord_enc_cols:
|
|
|
|
| 426 |
st.write("Ordinal Encoding Completed ✅")
|
| 427 |
|
| 428 |
if len(ohe_enc_cols)>0:
|
| 429 |
+
if st.checkbox("Proceed for OneHotEncoding "): # import one hot encoder
|
| 430 |
from sklearn.preprocessing import OneHotEncoder
|
| 431 |
ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
|
| 432 |
pd.options.mode.chained_assignment = None
|
|
|
|
| 440 |
|
| 441 |
st.write("DataFrame after One Hot Encoding",X.head())
|
| 442 |
st.write("OneHot Encoding Completed ✅")
|
| 443 |
+
st.divider()
|
| 444 |
new_df = pd.concat([X,y],axis = 1)
|
| 445 |
|
| 446 |
csv = new_df.to_csv(index = False)
|
| 447 |
+
if st.checkbox("Download Encoded DataFrame as CSV File ? "):
|
| 448 |
st.download_button(label="Download Ordinal Encoded CSV File",data=csv,file_name='Encoded_DataFrame.csv',mime='text/csv')
|
| 449 |
|
| 450 |
+
st.divider()
|
| 451 |
+
st.markdown('<div class="message-box success">Modelling</div>', unsafe_allow_html=True)
|
| 452 |
+
st.write("")
|
| 453 |
+
st.markdown('<p class="success-message">Select Train Validation Split Method</p>', unsafe_allow_html=True)
|
| 454 |
+
if st.radio("",["Train_Test_split","KFoldCV, Default (CV = 5)"], index = 0)== "Train_Test_split":
|
|
|
|
| 455 |
ttsmethod = "Train_Test_split"
|
| 456 |
else:
|
| 457 |
ttsmethod = "KFoldCV"
|
| 458 |
st.write('You selected:', ttsmethod)
|
| 459 |
if ttsmethod == "Train_Test_split":
|
| 460 |
+
random_state = st.number_input("Enter Random_state",max_value=100,min_value=1,value=42)
|
| 461 |
+
test_size = st.number_input("Enter test_size",max_value=0.99, min_value = 0.01,value =0.2)
|
| 462 |
X_train,X_Val,y_train,y_val = tts(X,y[X.index],random_state = random_state,test_size = test_size)
|
|
|
|
| 463 |
|
| 464 |
st.write('X-Training Data shape:', X_train.shape)
|
| 465 |
st.write('X-Validation Data shape:', X_Val.shape)
|
| 466 |
+
st.divider()
|
| 467 |
+
st.markdown('<p class="success-message2">Select Machine Learning Category</p>', unsafe_allow_html=True)
|
| 468 |
+
ml_cat = st.radio("___",options=["Regression","Classification"],index =0)
|
| 469 |
+
st.divider()
|
| 470 |
if ml_cat =="Regression":
|
| 471 |
+
st.markdown('<p class="success-message2">Select Error Evaluation Method</p>', unsafe_allow_html=True)
|
| 472 |
+
method_name_selector = st.selectbox(" ",evaluationer.method_df.index,index = 0)
|
| 473 |
+
|
| 474 |
+
st.divider()
|
| 475 |
|
| 476 |
method = evaluationer.method_df.loc[method_name_selector].values[0]
|
| 477 |
reg_algorithm = []
|
| 478 |
selected_options = []
|
| 479 |
+
st.markdown('<div class="message-box success">Select ML Model(s)</div>', unsafe_allow_html=True)
|
| 480 |
for option in models.Regression_models.index:
|
| 481 |
selected = st.checkbox(option)
|
| 482 |
if selected:
|
|
|
|
| 563 |
|
| 564 |
cla_algorithm = []
|
| 565 |
selected_options = []
|
| 566 |
+
st.markdown('<div class="message-box success">Select ML Model(s)</div>', unsafe_allow_html=True)
|
| 567 |
for option in models.Classification_models.index:
|
| 568 |
selected = st.checkbox(option)
|
| 569 |
if selected:
|
auto_optimizer.py
CHANGED
|
@@ -1,317 +1,361 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
import numpy as np
|
| 3 |
-
import streamlit as st
|
| 4 |
-
from sklearn.impute import KNNImputer,SimpleImputer,IterativeImputer
|
| 5 |
-
import best_tts, evaluationer,models
|
| 6 |
-
from sklearn.experimental import enable_iterative_imputer
|
| 7 |
-
from sklearn.model_selection import train_test_split as tts
|
| 8 |
-
from collections import Counter
|
| 9 |
-
|
| 10 |
-
from sklearn.metrics import root_mean_squared_error
|
| 11 |
-
import seaborn as sns
|
| 12 |
-
|
| 13 |
-
import
|
| 14 |
-
import
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
if
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
if
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
if
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from sklearn.impute import KNNImputer,SimpleImputer,IterativeImputer
|
| 5 |
+
import best_tts, evaluationer,models
|
| 6 |
+
from sklearn.experimental import enable_iterative_imputer
|
| 7 |
+
from sklearn.model_selection import train_test_split as tts
|
| 8 |
+
from collections import Counter
|
| 9 |
+
from sklearn.preprocessing import PolynomialFeatures
|
| 10 |
+
from sklearn.metrics import root_mean_squared_error
|
| 11 |
+
import seaborn as sns
|
| 12 |
+
from sklearn.decomposition import PCA
|
| 13 |
+
import grid_search_cv
|
| 14 |
+
import matplotlib.pyplot as plt
|
| 15 |
+
import outliers,best_tts
|
| 16 |
+
import feature_selections
|
| 17 |
+
def Auto_optimizer(X,y,eva,model,model_name,test= None):
|
| 18 |
+
if st.button("Train Regression Model"):
|
| 19 |
+
num_cols = X.select_dtypes(exclude = "O").columns
|
| 20 |
+
cat_cols = X.select_dtypes(include = "O").columns
|
| 21 |
+
st.write("Num_cols",tuple(num_cols))
|
| 22 |
+
st.write("cat_cols",tuple(cat_cols))
|
| 23 |
+
|
| 24 |
+
# check for Duplicate and drop duplicated in X
|
| 25 |
+
|
| 26 |
+
if len(X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40]) >0:
|
| 27 |
+
X = X.drop(columns = X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40].index)
|
| 28 |
+
st.write("Columns with more than 40% null values removed")
|
| 29 |
+
# st.write("csx",X)
|
| 30 |
+
|
| 31 |
+
len_null = X.isnull().sum().sum()
|
| 32 |
+
|
| 33 |
+
st.write(f"There are {len_null} null values in Train")
|
| 34 |
+
|
| 35 |
+
knn_imputed_num_X = X.copy()
|
| 36 |
+
si_mean_imputed_num_X = X.copy()
|
| 37 |
+
# st.write("sf",si_mean_imputed_num_X)
|
| 38 |
+
si_median_imputed_num_X = X.copy()
|
| 39 |
+
si_most_frequent_imputed_num_X = X.copy()
|
| 40 |
+
iter_imputed_num_X = X.copy()
|
| 41 |
+
knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
|
| 42 |
+
si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
|
| 43 |
+
si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
|
| 44 |
+
si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
|
| 45 |
+
iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
|
| 46 |
+
if len_null >0:
|
| 47 |
+
|
| 48 |
+
if X[num_cols].isnull().sum().sum() >0:
|
| 49 |
+
|
| 50 |
+
knn_imputer = KNNImputer(n_neighbors = 5)
|
| 51 |
+
knn_imputed_num_X[num_cols] = knn_imputer.fit_transform(knn_imputed_num_X[num_cols])
|
| 52 |
+
si_imputer = SimpleImputer(strategy = "mean")
|
| 53 |
+
si_mean_imputed_num_X[num_cols] = si_imputer.fit_transform(si_mean_imputed_num_X[num_cols])
|
| 54 |
+
si_imputer = SimpleImputer(strategy = "median")
|
| 55 |
+
si_median_imputed_num_X[num_cols] = si_imputer.fit_transform(si_median_imputed_num_X[num_cols])
|
| 56 |
+
si_imputer = SimpleImputer(strategy = "most_frequent")
|
| 57 |
+
si_most_frequent_imputed_num_X[num_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[num_cols])
|
| 58 |
+
iter_imputer = IterativeImputer(max_iter = 200,random_state= 42)
|
| 59 |
+
iter_imputed_num_X[num_cols] = iter_imputer.fit_transform(iter_imputed_num_X[num_cols])
|
| 60 |
+
knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
|
| 61 |
+
si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
|
| 62 |
+
si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
|
| 63 |
+
si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
|
| 64 |
+
iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
|
| 65 |
+
|
| 66 |
+
if X[cat_cols].isnull().sum().sum() >0:
|
| 67 |
+
# treating missing values in categorical columns
|
| 68 |
+
# st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
|
| 69 |
+
si_imputer = SimpleImputer(strategy = "most_frequent")
|
| 70 |
+
|
| 71 |
+
knn_imputed_num_X[cat_cols] = si_imputer.fit_transform(knn_imputed_num_X[cat_cols])
|
| 72 |
+
si_imputer = SimpleImputer(strategy = "most_frequent")
|
| 73 |
+
si_mean_imputed_num_X.loc[:,cat_cols] = si_imputer.fit_transform(si_mean_imputed_num_X.loc[:,cat_cols])
|
| 74 |
+
# st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
|
| 75 |
+
si_median_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_median_imputed_num_X[cat_cols])
|
| 76 |
+
si_most_frequent_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[cat_cols])
|
| 77 |
+
iter_imputed_num_X[cat_cols] = si_imputer.fit_transform(iter_imputed_num_X[cat_cols])
|
| 78 |
+
|
| 79 |
+
knn_imputed_X_cat_dropped = knn_imputed_X_cat_dropped.dropna()
|
| 80 |
+
si_mean_imputed_X_cat_dropped =si_mean_imputed_X_cat_dropped.dropna()
|
| 81 |
+
si_median_imputed_X_cat_dropped =si_median_imputed_X_cat_dropped.dropna()
|
| 82 |
+
si_most_frequent_imputed_X_cat_dropped =si_most_frequent_imputed_X_cat_dropped.dropna()
|
| 83 |
+
iter_imputed_X_cat_dropped =iter_imputed_X_cat_dropped.dropna()
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
miss_val_dropped_X = X.dropna()
|
| 87 |
+
|
| 88 |
+
# list of dataframes
|
| 89 |
+
|
| 90 |
+
list_X_after_missing_values= [knn_imputed_num_X,
|
| 91 |
+
si_mean_imputed_num_X,
|
| 92 |
+
si_median_imputed_num_X,
|
| 93 |
+
si_most_frequent_imputed_num_X,
|
| 94 |
+
iter_imputed_num_X,
|
| 95 |
+
knn_imputed_X_cat_dropped,
|
| 96 |
+
si_mean_imputed_X_cat_dropped,
|
| 97 |
+
si_median_imputed_X_cat_dropped,
|
| 98 |
+
si_most_frequent_imputed_X_cat_dropped,
|
| 99 |
+
iter_imputed_X_cat_dropped,
|
| 100 |
+
miss_val_dropped_X]
|
| 101 |
+
list_X_after_missing_values_names= ["knn_imputed_num_X",
|
| 102 |
+
"si_mean_imputed_num_X",
|
| 103 |
+
"si_median_imputed_num_X",
|
| 104 |
+
"si_most_frequent_imputed_num_X",
|
| 105 |
+
"iter_imputed_num_X",
|
| 106 |
+
"knn_imputed_X_cat_dropped",
|
| 107 |
+
"si_mean_imputed_X_cat_dropped",
|
| 108 |
+
"si_median_imputed_X_cat_dropped",
|
| 109 |
+
"si_most_frequent_imputed_X_cat_dropped",
|
| 110 |
+
"iter_imputed_X_cat_dropped",
|
| 111 |
+
"miss_val_dropped_X"]
|
| 112 |
+
# st.write("si_most_frequent_imputed_num_X",si_most_frequent_imputed_num_X,)
|
| 113 |
+
ord_enc_cols = []
|
| 114 |
+
ohe_enc_cols = []
|
| 115 |
+
|
| 116 |
+
if len(cat_cols) == 0:
|
| 117 |
+
st.write("No Categorical Columns in Train")
|
| 118 |
+
else:
|
| 119 |
+
st.write("Select Columns for Ordinal Encoding")
|
| 120 |
+
for column in cat_cols:
|
| 121 |
+
selected = st.checkbox(column)
|
| 122 |
+
if selected:
|
| 123 |
+
st.write(f"No. of Unique value in {column} column are", X[column].nunique())
|
| 124 |
+
ord_enc_cols.append(column)
|
| 125 |
+
ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
|
| 126 |
+
ohe_enc_cols = list(ohe_enc_cols)
|
| 127 |
+
|
| 128 |
+
if len(ord_enc_cols)>0:
|
| 129 |
+
st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
|
| 130 |
+
if len(ohe_enc_cols)>0:
|
| 131 |
+
st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
|
| 132 |
+
|
| 133 |
+
if len(ord_enc_cols)>0:
|
| 134 |
+
|
| 135 |
+
ordinal_order_vals = []
|
| 136 |
+
|
| 137 |
+
for column in ord_enc_cols:
|
| 138 |
+
unique_vals = X.dropna()[column].unique()
|
| 139 |
+
# st.write(f"No. of Unique value in {column} column are", len(unique_vals))
|
| 140 |
+
|
| 141 |
+
ordered_unique_vals = st.multiselect("Select values in order for Ordinal Encoding",unique_vals,unique_vals)
|
| 142 |
+
ordinal_order_vals.append(ordered_unique_vals)
|
| 143 |
+
|
| 144 |
+
st.write("order of values for Ordinal Encoding",tuple(ordinal_order_vals))
|
| 145 |
+
|
| 146 |
+
if len_null > 0:
|
| 147 |
+
|
| 148 |
+
for df_name, df in enumerate(list_X_after_missing_values):
|
| 149 |
+
# st.write(f"{list_X_after_missing_values_names[df_name]}",df)
|
| 150 |
+
from sklearn.preprocessing import OrdinalEncoder
|
| 151 |
+
ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
|
| 152 |
+
df[ord_enc_cols] = ord.fit_transform(df[ord_enc_cols])
|
| 153 |
+
# st.write(f"{list_X_after_missing_values_names[df_name]}",df)
|
| 154 |
+
else :
|
| 155 |
+
from sklearn.preprocessing import OrdinalEncoder
|
| 156 |
+
ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
|
| 157 |
+
X[ord_enc_cols] = ord.fit_transform(X[ord_enc_cols])
|
| 158 |
+
|
| 159 |
+
st.write("Ordinal Encoding Completed ✅")
|
| 160 |
+
|
| 161 |
+
if len(ohe_enc_cols)>0:
|
| 162 |
+
if len_null > 0:
|
| 163 |
+
for df_name, df in enumerate(list_X_after_missing_values):
|
| 164 |
+
from sklearn.preprocessing import OneHotEncoder
|
| 165 |
+
ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
|
| 166 |
+
pd.options.mode.chained_assignment = None
|
| 167 |
+
df.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(df[ohe_enc_cols])
|
| 168 |
+
df.drop(columns = ohe_enc_cols,inplace = True)
|
| 169 |
+
pd.options.mode.chained_assignment = 'warn'
|
| 170 |
+
else:
|
| 171 |
+
from sklearn.preprocessing import OneHotEncoder
|
| 172 |
+
ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
|
| 173 |
+
pd.options.mode.chained_assignment = None
|
| 174 |
+
X.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(X[ohe_enc_cols])
|
| 175 |
+
X.drop(columns = ohe_enc_cols,inplace = True)
|
| 176 |
+
pd.options.mode.chained_assignment = 'warn'
|
| 177 |
+
st.write("OneHot Encoding Completed ✅")
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
if len(ohe_enc_cols)>0:
|
| 181 |
+
if len_null > 0:
|
| 182 |
+
for name,df in enumerate(list_X_after_missing_values):
|
| 183 |
+
X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
|
| 184 |
+
# best_tts.best_tts(df,y,model,eva)
|
| 185 |
+
evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
|
| 186 |
+
else:
|
| 187 |
+
X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =.2 ,random_state = 42)
|
| 188 |
+
# best_tts.best_tts(X,y,model,eva)
|
| 189 |
+
|
| 190 |
+
evaluationer.evaluation(f"baseline_model",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
|
| 191 |
+
|
| 192 |
+
if len_null >0:
|
| 193 |
+
for name,df in enumerate(list_X_after_missing_values):
|
| 194 |
+
X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
|
| 195 |
+
|
| 196 |
+
evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
|
| 197 |
+
|
| 198 |
+
if eva == "class":
|
| 199 |
+
counter = Counter(y)
|
| 200 |
+
total = sum(counter.values())
|
| 201 |
+
balance_ratio = {cls: count / total for cls, count in counter.items()}
|
| 202 |
+
num_classes = len(balance_ratio)
|
| 203 |
+
ideal_ratio = 1 / num_classes
|
| 204 |
+
a = all(abs(ratio - ideal_ratio) <= 0.1 * ideal_ratio for ratio in balance_ratio.values())
|
| 205 |
+
if a == True:
|
| 206 |
+
st.write("Balanced Dataset ✅")
|
| 207 |
+
st.write("Using accuracy for Evaluation")
|
| 208 |
+
value = "test_acc"
|
| 209 |
+
else:
|
| 210 |
+
st.write("Unbalanced Dataset ❌")
|
| 211 |
+
st.write("Using F1 score for Evaluation")
|
| 212 |
+
value = "test_f1"
|
| 213 |
+
|
| 214 |
+
evaluationer.classification_evaluation_df.sort_values(by = value,inplace= True)
|
| 215 |
+
name = str(evaluationer.classification_evaluation_df.iloc[-1,0])
|
| 216 |
+
st.write("df name",evaluationer.classification_evaluation_df.iloc[-1,0])
|
| 217 |
+
if len_null >0:
|
| 218 |
+
b = list_X_after_missing_values_names.index(name)
|
| 219 |
+
|
| 220 |
+
st.write("df",list_X_after_missing_values[b])
|
| 221 |
+
X = list_X_after_missing_values[b]
|
| 222 |
+
if eva == "reg":
|
| 223 |
+
st.write("Using R2 score for Evaluation",evaluationer.reg_evaluation_df)
|
| 224 |
+
value = "test_r2"
|
| 225 |
+
evaluationer.reg_evaluation_df.sort_values(by = value,inplace= True)
|
| 226 |
+
|
| 227 |
+
name = str(evaluationer.reg_evaluation_df.iloc[-1,0])
|
| 228 |
+
|
| 229 |
+
if len_null >0:
|
| 230 |
+
b = list_X_after_missing_values_names.index(name)
|
| 231 |
+
|
| 232 |
+
st.write("df",list_X_after_missing_values[b])
|
| 233 |
+
X = list_X_after_missing_values[b]
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
# Create a figure and axes
|
| 237 |
+
num_plots = len(num_cols)
|
| 238 |
+
cols = 2 # Number of columns in the subplot grid
|
| 239 |
+
rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed
|
| 240 |
+
|
| 241 |
+
fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
|
| 242 |
+
|
| 243 |
+
# Flatten the axes array for easy iteration, and remove any excess subplots
|
| 244 |
+
axes = axes.flatten()
|
| 245 |
+
for ax in axes[num_plots:]:
|
| 246 |
+
fig.delaxes(ax)
|
| 247 |
+
|
| 248 |
+
for i, col in enumerate(num_cols):
|
| 249 |
+
sns.histplot(X[col], ax=axes[i],kde = True,color=sns.color_palette('Oranges', as_cmap=True)(0.7))
|
| 250 |
+
axes[i].set_title(col)
|
| 251 |
+
|
| 252 |
+
# Adjust layout
|
| 253 |
+
plt.tight_layout()
|
| 254 |
+
|
| 255 |
+
# Show the plot in Streamlit
|
| 256 |
+
st.pyplot(fig)
|
| 257 |
+
|
| 258 |
+
# Create a figure and axes
|
| 259 |
+
num_plots = len(num_cols)
|
| 260 |
+
cols = 3 # Number of columns in the subplot grid
|
| 261 |
+
rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed
|
| 262 |
+
|
| 263 |
+
fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
|
| 264 |
+
|
| 265 |
+
# Flatten the axes array for easy iteration, and remove any excess subplots
|
| 266 |
+
axes = axes.flatten()
|
| 267 |
+
for ax in axes[num_plots:]:
|
| 268 |
+
fig.delaxes(ax)
|
| 269 |
+
|
| 270 |
+
for i, col in enumerate(num_cols):
|
| 271 |
+
sns.boxplot(y=X[col], ax=axes[i],palette="magma")
|
| 272 |
+
axes[i].set_title(col)
|
| 273 |
+
|
| 274 |
+
# Adjust layout
|
| 275 |
+
plt.tight_layout()
|
| 276 |
+
|
| 277 |
+
# Show the plot in Streamlit
|
| 278 |
+
st.pyplot(fig)
|
| 279 |
+
|
| 280 |
+
outlier_cols = st.multiselect("De-Select columns for Detecting Outliers", num_cols,default= list(num_cols))
|
| 281 |
+
|
| 282 |
+
st.write("Checking for Outliers")
|
| 283 |
+
outliers_df_X,outlier_indexes = outliers.detect_outliers(X,list(outlier_cols))
|
| 284 |
+
st.write("Outliers in Dataframe Summary",outliers_df_X)
|
| 285 |
+
st.write("Columns for Outliers handling",tuple(outliers_df_X["columns name"]))
|
| 286 |
+
|
| 287 |
+
select_outlier_cols = st.multiselect("Select columns for Outlier Handling",tuple(outliers_df_X["columns name"]),default =tuple(outliers_df_X["columns name"]))
|
| 288 |
+
resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva = "reg")
|
| 289 |
+
st.write("outlier handling with methods",resultant)
|
| 290 |
+
st.write("Best method with outlier handling",resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])
|
| 291 |
+
try :
|
| 292 |
+
st.write("Best X Data Index No.",outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0]))
|
| 293 |
+
|
| 294 |
+
st.write("Best X DataFrame after outlier handling ",outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
|
| 295 |
+
X = outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])]
|
| 296 |
+
except :
|
| 297 |
+
"evaluation of baseline model is better continuing with baseline model"
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
X_train,X_test,y_train,y_test = tts(X,y[X.index],random_state = 42,test_size = 0.2)
|
| 301 |
+
st.write("result_df",X)
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
try:
|
| 306 |
+
result_df_1 , feature_col, feature_col_name = feature_selections.feature_selection(X_train,X_test,y_train,y_test,model,alpha = 0.05)
|
| 307 |
+
X = X.drop(columns = feature_col[feature_col_name.index(result_df_1.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
|
| 308 |
+
except:
|
| 309 |
+
"evaluation by feature selection is not better than previous"
|
| 310 |
+
|
| 311 |
+
try:
|
| 312 |
+
result,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
|
| 313 |
+
st.write("result_df",result)
|
| 314 |
+
except:
|
| 315 |
+
X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =0.2,random_state = 42)
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
st.write("cheking with polynomial features")
|
| 321 |
+
poly = PolynomialFeatures(degree=(2))
|
| 322 |
+
X_train_poly = poly.fit_transform(X_train)
|
| 323 |
+
X_test_poly = poly.transform(X_test)
|
| 324 |
+
result_df_2 = evaluationer.evaluation("polynomial features degree 2",X_train_poly,X_test_poly,y_train,y_test,model,root_mean_squared_error,eva)
|
| 325 |
+
st.write("after polynomial features degree 2",evaluationer.reg_evaluation_df)
|
| 326 |
+
poly1 = PolynomialFeatures(degree=(3))
|
| 327 |
+
X_train_poly1 = poly.fit_transform(X_train)
|
| 328 |
+
X_test_poly1 = poly.transform(X_test)
|
| 329 |
+
evaluationer.evaluation("polynomial features degree 3",X_train_poly1,X_test_poly1,y_train,y_test,model,root_mean_squared_error,eva)
|
| 330 |
+
st.write("after polynomial features degree 3",evaluationer.reg_evaluation_df)
|
| 331 |
+
|
| 332 |
+
pca = PCA(n_components=0.95)
|
| 333 |
+
X_train_pca = pca.fit_transform(X_train)
|
| 334 |
+
X_test_pca = pca.transform(X_test)
|
| 335 |
+
evaluationer.evaluation("PCA",X_train_pca,X_test_pca,y_train,y_test,model,root_mean_squared_error,eva)
|
| 336 |
+
st.write("After PCA",evaluationer.reg_evaluation_df)
|
| 337 |
+
|
| 338 |
+
grid_search_cv.perform_grid_search(model,model_name,X_train,X_test,y_train,y_test,eva)
|
| 339 |
+
st.write("best param",evaluationer.reg_evaluation_df)
|
| 340 |
+
st.sidebar.button("click to clear evaluation metrics",evaluationer.reg_evaluation_df.drop(index = evaluationer.reg_evaluation_df.index))
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
|
best_tts.py
CHANGED
|
@@ -10,9 +10,9 @@ def best_tts(X,y,model,eva):
|
|
| 10 |
if eva == "reg":
|
| 11 |
|
| 12 |
test_r2_,test_r2_ts,test_r2_rs = 0,0,0
|
| 13 |
-
for k in range(10,25):
|
| 14 |
i = k/100
|
| 15 |
-
for j in range(1,100):
|
| 16 |
X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size = i, random_state = j,)
|
| 17 |
|
| 18 |
model = model
|
|
|
|
| 10 |
if eva == "reg":
|
| 11 |
|
| 12 |
test_r2_,test_r2_ts,test_r2_rs = 0,0,0
|
| 13 |
+
for k in range(10,25,3):
|
| 14 |
i = k/100
|
| 15 |
+
for j in range(1,100,10):
|
| 16 |
X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size = i, random_state = j,)
|
| 17 |
|
| 18 |
model = model
|
eda.py
ADDED
|
@@ -0,0 +1,325 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import streamlit as st
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import seaborn as sns
|
| 6 |
+
import streamlit as st
|
| 7 |
+
import streamlit.components.v1 as components
|
| 8 |
+
import plotly.express as px
|
| 9 |
+
from plotly.subplots import make_subplots
|
| 10 |
+
import plotly.graph_objects as go
|
| 11 |
+
import streamlit as st
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import datashader as ds
|
| 14 |
+
import datashader.transfer_functions as tf
|
| 15 |
+
from colorcet import fire
|
| 16 |
+
import plotly.express as px
|
| 17 |
+
# function to analysing EDA
|
| 18 |
+
def eda_analysis(df):
|
| 19 |
+
|
| 20 |
+
target_col = st.sidebar.selectbox("Select Target Column", df.columns,index = len(df.columns)-1)
|
| 21 |
+
y = df[target_col]
|
| 22 |
+
X = df.drop(columns = target_col)
|
| 23 |
+
num_cols = X.select_dtypes(exclude= "O").columns.tolist()
|
| 24 |
+
cat_cols = X.select_dtypes(include= "O").columns.tolist()
|
| 25 |
+
st.write("num_cols",tuple(num_cols))
|
| 26 |
+
st.write("cat_cols",tuple(cat_cols))
|
| 27 |
+
st.divider()
|
| 28 |
+
|
| 29 |
+
results = []
|
| 30 |
+
for column in X[num_cols].columns:
|
| 31 |
+
skewness = X[column].skew()
|
| 32 |
+
kurtosis = X[column].kurtosis()
|
| 33 |
+
|
| 34 |
+
skewness_html = f'<span style="color: {"red" if abs(skewness) > .5 else "white"}">{skewness:.2f}</span>'
|
| 35 |
+
kurtosis_html = f'<span style="color: {"red" if abs(kurtosis) > 3 else "white"}">{kurtosis:.2f}</span>'
|
| 36 |
+
|
| 37 |
+
results.append({
|
| 38 |
+
'Column': column,
|
| 39 |
+
'Skewness': skewness,
|
| 40 |
+
'Kurtosis': kurtosis,
|
| 41 |
+
'Skewness_': skewness_html,
|
| 42 |
+
'Kurtosis_': kurtosis_html
|
| 43 |
+
})
|
| 44 |
+
|
| 45 |
+
result_df = pd.DataFrame(results)
|
| 46 |
+
|
| 47 |
+
# Display the data types of Skewness and Kurtosis columns
|
| 48 |
+
# st.write("Data types of Skewness and Kurtosis columns:", result_df[["Skewness", "Kurtosis"]].dtypes)
|
| 49 |
+
|
| 50 |
+
if st.toggle("Show Skewness and Kurtosis of DataFrame columns"):
|
| 51 |
+
st.write("Columns with Skewness and Kurtosis:")
|
| 52 |
+
if st.checkbox("Filter Skewed columns"):
|
| 53 |
+
filtered_df = result_df[abs(result_df["Skewness"]) > 0.5]
|
| 54 |
+
st.write(filtered_df[['Column', 'Skewness_', 'Kurtosis_']].to_html(escape=False), unsafe_allow_html=True)
|
| 55 |
+
else:
|
| 56 |
+
st.write(result_df[['Column', 'Skewness_', 'Kurtosis_']].to_html(escape=False), unsafe_allow_html=True)
|
| 57 |
+
|
| 58 |
+
st.divider()
|
| 59 |
+
st.write("Plotting Numerical Columns for Visual EDA")
|
| 60 |
+
|
| 61 |
+
# Create two columns
|
| 62 |
+
column1, column2 = st.columns(2)
|
| 63 |
+
|
| 64 |
+
# Checkbox for plotting distribution in the first column
|
| 65 |
+
with column1:
|
| 66 |
+
plot_distribution = st.checkbox("Plot Distribution of Target Column")
|
| 67 |
+
|
| 68 |
+
# Show the second checkbox in the second column only if the first checkbox is clicked
|
| 69 |
+
if plot_distribution:
|
| 70 |
+
with column2:
|
| 71 |
+
show_kde = st.checkbox("Show KDE Plot")
|
| 72 |
+
kde = show_kde
|
| 73 |
+
else:
|
| 74 |
+
kde = False
|
| 75 |
+
|
| 76 |
+
# Plot the histogram if the first checkbox is checked
|
| 77 |
+
if plot_distribution:
|
| 78 |
+
fig, ax = plt.subplots()
|
| 79 |
+
sns.histplot(y, ax=ax, kde=kde)
|
| 80 |
+
|
| 81 |
+
# Show the plot in the Streamlit app
|
| 82 |
+
st.pyplot(fig)
|
| 83 |
+
|
| 84 |
+
column3, column4 = st.columns(2)
|
| 85 |
+
with column3:
|
| 86 |
+
plot_distribution_nc =st.checkbox("Plot Distribution of Input Numerical columns")
|
| 87 |
+
if plot_distribution_nc:
|
| 88 |
+
with column4:
|
| 89 |
+
show_kde_1 = st.checkbox("Show KDE Plot for Numerical Columns")
|
| 90 |
+
kde_1 = show_kde_1
|
| 91 |
+
if plot_distribution_nc:
|
| 92 |
+
for column in num_cols:
|
| 93 |
+
fig, ax = plt.subplots()
|
| 94 |
+
sns.histplot(df[column], ax=ax, kde=kde_1)
|
| 95 |
+
st.write(f"Distribution of {column}:")
|
| 96 |
+
st.pyplot(fig)
|
| 97 |
+
st.divider()
|
| 98 |
+
# plot count plot for categorical columns
|
| 99 |
+
st.write("Plotting Categorical Columns for Visual EDA")
|
| 100 |
+
if st.checkbox("Plot Distribution of Input Categorical columns") :
|
| 101 |
+
for column in cat_cols:
|
| 102 |
+
fig, ax = plt.subplots()
|
| 103 |
+
fig = px.histogram(df.fillna('Null'), x=column, color=target_col)
|
| 104 |
+
st.write(fig)
|
| 105 |
+
|
| 106 |
+
st.divider()
|
| 107 |
+
# plot correlation matrics using plotly
|
| 108 |
+
st.write("Plotting Correlation Matrix for Numerical Columns")
|
| 109 |
+
|
| 110 |
+
column5, column6 = st.columns(2)
|
| 111 |
+
with column5:
|
| 112 |
+
plot_distribution =st.checkbox("Plot Correlation Matrix")
|
| 113 |
+
if plot_distribution:
|
| 114 |
+
with column6:
|
| 115 |
+
show_value = st.checkbox("Correlation values > 0.5")
|
| 116 |
+
if show_value:
|
| 117 |
+
# Compute correlation matrix
|
| 118 |
+
corr_matrix = df[num_cols].corr()
|
| 119 |
+
|
| 120 |
+
# Plot correlation matrix heatmap
|
| 121 |
+
fig = px.imshow(corr_matrix[abs(corr_matrix)>0.5], color_continuous_scale='RdBu')
|
| 122 |
+
|
| 123 |
+
# Add annotations for values greater than 0.5
|
| 124 |
+
for i in range(corr_matrix.shape[0]):
|
| 125 |
+
for j in range(corr_matrix.shape[1]):
|
| 126 |
+
correlation_value = corr_matrix.iloc[i, j]
|
| 127 |
+
if abs(correlation_value) > 0.5: # Filter values greater than 0.5
|
| 128 |
+
fig.add_annotation(
|
| 129 |
+
x=i, y=j,
|
| 130 |
+
text=str(round(correlation_value, 2)),
|
| 131 |
+
showarrow=False
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
# Update layout
|
| 135 |
+
fig.update_layout(
|
| 136 |
+
xaxis=dict(side="top"),
|
| 137 |
+
width=600,
|
| 138 |
+
height=600,
|
| 139 |
+
margin=dict(l=20, r=20, t=40, b=20)
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
# Display the heatmap
|
| 143 |
+
st.write(fig)
|
| 144 |
+
if plot_distribution and not show_value:
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
corr_matrix = df[num_cols].corr()
|
| 148 |
+
fig = px.imshow(corr_matrix, color_continuous_scale='RdBu')
|
| 149 |
+
for i in range(corr_matrix.shape[0]):
|
| 150 |
+
for j in range(corr_matrix.shape[1]):
|
| 151 |
+
fig.add_annotation(
|
| 152 |
+
x=i, y=j,
|
| 153 |
+
text=str(round(corr_matrix.iloc[i, j], 2)),
|
| 154 |
+
showarrow=False
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
# Update the layout to ensure annotations are displayed properly
|
| 158 |
+
fig.update_layout(
|
| 159 |
+
xaxis=dict(side="top"),
|
| 160 |
+
width=600,
|
| 161 |
+
height=600,
|
| 162 |
+
margin=dict(l=20, r=20, t=40, b=20)
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
st.write(fig)
|
| 166 |
+
st.divider()
|
| 167 |
+
outlier_cols = st.multiselect("Select Continous numerical columns for Outlier Plot",num_cols)
|
| 168 |
+
|
| 169 |
+
# plot px.boxplot for outlier cols
|
| 170 |
+
if st.toggle("Toggle for Violin Plot"):
|
| 171 |
+
if st.checkbox("Plot BoxPlot for Outlier Cols"):
|
| 172 |
+
if st.toggle("Split by Target"):
|
| 173 |
+
for col in outlier_cols:
|
| 174 |
+
fig = px.violin(df, x=col,color=y)
|
| 175 |
+
st.write(fig)
|
| 176 |
+
st.divider()
|
| 177 |
+
else:
|
| 178 |
+
for col in outlier_cols:
|
| 179 |
+
fig = px.violin(df, x=col)
|
| 180 |
+
st.write(fig)
|
| 181 |
+
st.divider()
|
| 182 |
+
if st.checkbox("check outlier distribution of Target column"):
|
| 183 |
+
fig = px.violin(y)
|
| 184 |
+
st.write(fig)
|
| 185 |
+
|
| 186 |
+
else:
|
| 187 |
+
if st.checkbox("Plot BoxPlot for Outlier Cols"):
|
| 188 |
+
if st.toggle("Split by Target"):
|
| 189 |
+
for col in outlier_cols:
|
| 190 |
+
fig = px.box(df, x=col,color=y)
|
| 191 |
+
st.write(fig)
|
| 192 |
+
st.divider()
|
| 193 |
+
else:
|
| 194 |
+
for col in outlier_cols:
|
| 195 |
+
fig = px.box(df, x=col)
|
| 196 |
+
st.write(fig)
|
| 197 |
+
st.divider()
|
| 198 |
+
if st.checkbox("check outlier distribution of Target column"):
|
| 199 |
+
fig = px.box(y)
|
| 200 |
+
st.write(fig)
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
# plot scatter plot using px
|
| 204 |
+
st.divider()
|
| 205 |
+
|
| 206 |
+
if st.checkbox("Plot Scatter Plot"):
|
| 207 |
+
column7, column8,column9 = st.columns(3)
|
| 208 |
+
with column7:
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
# Select y-axis column
|
| 212 |
+
y_col = st.selectbox("Select y axis column", df.columns)
|
| 213 |
+
|
| 214 |
+
# Filter categorical columns for the x-axis selection
|
| 215 |
+
categorical_columns = df.columns
|
| 216 |
+
with column8:
|
| 217 |
+
# Allow user to select the x-axis column from categorical columns
|
| 218 |
+
x_col = st.selectbox("Select x axis column", categorical_columns)
|
| 219 |
+
with column9:
|
| 220 |
+
hue_col = st.selectbox("Select Hue column",categorical_columns)
|
| 221 |
+
# Plot scatter plot using Plotly
|
| 222 |
+
fig = px.scatter(df, x=x_col, y=y_col, color=hue_col)
|
| 223 |
+
st.write(fig)
|
| 224 |
+
|
| 225 |
+
# barchart and line chart
|
| 226 |
+
st.divider()
|
| 227 |
+
if st.checkbox("Plot Bar Chart"):
|
| 228 |
+
column10, column11 = st.columns(2)
|
| 229 |
+
with column10:
|
| 230 |
+
# Select y-axis column
|
| 231 |
+
y_col = st.selectbox("Select y axis column", df.columns)
|
| 232 |
+
|
| 233 |
+
# Filter categorical columns for the x-axis selection
|
| 234 |
+
categorical_columns = df.columns
|
| 235 |
+
with column11:
|
| 236 |
+
# Allow user to select the x-axis column from categorical columns
|
| 237 |
+
x_col = st.selectbox("Select x axis column", categorical_columns)
|
| 238 |
+
fig = px.bar(df, x=x_col, y=y_col,color = x_col)
|
| 239 |
+
st.write(fig)
|
| 240 |
+
st.divider()
|
| 241 |
+
if st.checkbox("Plot Line Chart"):
|
| 242 |
+
column12, column13,colx = st.columns(3)
|
| 243 |
+
with column12:
|
| 244 |
+
# Select y-axis column
|
| 245 |
+
y_col = st.selectbox("Select y axis column", df.columns)
|
| 246 |
+
|
| 247 |
+
# Filter categorical columns for the x-axis selection
|
| 248 |
+
categorical_columns = df.columns
|
| 249 |
+
with column13:
|
| 250 |
+
# Allow user to select the x-axis column from categorical columns
|
| 251 |
+
x_col = st.selectbox("Select x axis column", categorical_columns)
|
| 252 |
+
with colx:
|
| 253 |
+
hue_col1 = st.selectbox("Select Line split column",categorical_columns)
|
| 254 |
+
fig = px.line(df.sort_values(by = y_col), x=x_col, y=y_col,color = hue_col1)
|
| 255 |
+
st.write(fig)
|
| 256 |
+
st.divider()
|
| 257 |
+
# plot pie chart
|
| 258 |
+
if st.checkbox("Plot Pie Chart "):
|
| 259 |
+
column14, column15 = st.columns(2)
|
| 260 |
+
with column14:
|
| 261 |
+
# Select y-axis column
|
| 262 |
+
y_col = st.selectbox("Select values columns", df.columns)
|
| 263 |
+
|
| 264 |
+
# Filter categorical columns for the x-axis selection
|
| 265 |
+
categorical_columns = df.columns
|
| 266 |
+
with column15:
|
| 267 |
+
# Allow user to select the x-axis column from categorical columns
|
| 268 |
+
x_col = st.selectbox("Select names column", categorical_columns)
|
| 269 |
+
fig = px.pie(df, values=y_col, names=x_col)
|
| 270 |
+
st.write(fig)
|
| 271 |
+
|
| 272 |
+
st.divider()
|
| 273 |
+
# check if there are latitude and longitude columns
|
| 274 |
+
if st.checkbox("Plot on Map"):
|
| 275 |
+
lat_col = st.selectbox("Select Latitute Column",df.columns)
|
| 276 |
+
long_col = st.selectbox("Select Longitude Column",df.columns)
|
| 277 |
+
color = st.selectbox
|
| 278 |
+
|
| 279 |
+
# # Create the datashader canvas and aggregate points
|
| 280 |
+
# cvs = ds.Canvas(plot_width=1000, plot_height=1000)
|
| 281 |
+
# agg = cvs.points(df, x=long_col, y=lat_col)
|
| 282 |
+
|
| 283 |
+
# # Get the coordinates for the mapbox layer
|
| 284 |
+
# coords_lat, coords_lon = agg.coords[lat_col].values, agg.coords[long_col].values
|
| 285 |
+
# coordinates = [
|
| 286 |
+
# [coords_lon[0], coords_lat[0]],
|
| 287 |
+
# [coords_lon[-1], coords_lat[0]],
|
| 288 |
+
# [coords_lon[-1], coords_lat[-1]],
|
| 289 |
+
# [coords_lon[0], coords_lat[-1]]
|
| 290 |
+
# ]
|
| 291 |
+
|
| 292 |
+
# # Generate the datashader image
|
| 293 |
+
# img = tf.shade(agg, cmap=fire)[::-1].to_pil()
|
| 294 |
+
|
| 295 |
+
# # Create the Plotly figure with a mapbox layer
|
| 296 |
+
# fig = px.scatter_mapbox(df[:1], lat=lat_col, lon=long_col, zoom=10) # Adjust zoom level as needed
|
| 297 |
+
# fig.update_layout(mapbox_style="carto-darkmatter",
|
| 298 |
+
# mapbox_layers=[
|
| 299 |
+
# {
|
| 300 |
+
# "sourcetype": "image",
|
| 301 |
+
# "source": img,
|
| 302 |
+
# "coordinates": coordinates
|
| 303 |
+
# }
|
| 304 |
+
# ])
|
| 305 |
+
|
| 306 |
+
# # Display the figure in Streamlit
|
| 307 |
+
# st.plotly_chart(fig)
|
| 308 |
+
|
| 309 |
+
# Create a scatter mapbox plot with vibrant colors and custom marker sizes
|
| 310 |
+
if st.button("Proceed to plot map"):
|
| 311 |
+
fig = px.scatter_mapbox(df, lat=lat_col, lon=long_col,
|
| 312 |
+
|
| 313 |
+
size_max=15, # Max marker size
|
| 314 |
+
mapbox_style="open-street-map", # Using a different map style for vibrancy
|
| 315 |
+
zoom=1,
|
| 316 |
+
title='Latitude and Longitude Plotting')
|
| 317 |
+
|
| 318 |
+
# Customize the layout for more vibrant appearance
|
| 319 |
+
fig.update_layout(mapbox_accesstoken='your_mapbox_access_token')
|
| 320 |
+
st.write(fig)
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
|
feature_selections.py
CHANGED
|
@@ -8,12 +8,10 @@ import pandas as pd
|
|
| 8 |
import numpy as np
|
| 9 |
import evaluationer
|
| 10 |
import streamlit as st
|
| 11 |
-
|
|
|
|
| 12 |
from sklearn.metrics import root_mean_squared_error
|
| 13 |
def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
|
| 14 |
-
|
| 15 |
-
st.write("dvsdv",y_train)
|
| 16 |
-
st.write("dvfssdv",X_train)
|
| 17 |
|
| 18 |
model = sm.OLS(y_train, sm.add_constant(X_train))
|
| 19 |
model_fit = model.fit()
|
|
@@ -100,5 +98,7 @@ def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
|
|
| 100 |
feature_cols_name = ["pval_cols","coef_cols","pval_and_coef_cols","mi_cols","corr_u_cols","corr_l_cols","vif_cols","lasso_cols"]
|
| 101 |
st.write("feature_cols", vif_cols)
|
| 102 |
for i,j in enumerate(feature_cols):
|
| 103 |
-
evaluationer.evaluation(f"{feature_cols_name[i]}
|
| 104 |
-
return evaluationer.reg_evaluation_df
|
|
|
|
|
|
|
|
|
| 8 |
import numpy as np
|
| 9 |
import evaluationer
|
| 10 |
import streamlit as st
|
| 11 |
+
|
| 12 |
+
|
| 13 |
from sklearn.metrics import root_mean_squared_error
|
| 14 |
def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
model = sm.OLS(y_train, sm.add_constant(X_train))
|
| 17 |
model_fit = model.fit()
|
|
|
|
| 98 |
feature_cols_name = ["pval_cols","coef_cols","pval_and_coef_cols","mi_cols","corr_u_cols","corr_l_cols","vif_cols","lasso_cols"]
|
| 99 |
st.write("feature_cols", vif_cols)
|
| 100 |
for i,j in enumerate(feature_cols):
|
| 101 |
+
evaluationer.evaluation(f"{feature_cols_name[i]}" ,X_train.drop(columns = j),X_test.drop(columns = j),y_train,y_test,model_reg,method = root_mean_squared_error,eva = "reg")
|
| 102 |
+
return evaluationer.reg_evaluation_df,feature_cols,feature_cols_name
|
| 103 |
+
|
| 104 |
+
|
grid_search_cv.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
|
| 2 |
+
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
|
| 3 |
+
from sklearn.neighbors import KNeighborsRegressor
|
| 4 |
+
from sklearn.tree import DecisionTreeRegressor
|
| 5 |
+
from sklearn.svm import SVR
|
| 6 |
+
from xgboost import XGBRegressor, XGBRFRegressor
|
| 7 |
+
from sklearn.neural_network import MLPRegressor
|
| 8 |
+
from lightgbm import LGBMRegressor
|
| 9 |
+
from sklearn.naive_bayes import GaussianNB
|
| 10 |
+
from sklearn.model_selection import GridSearchCV
|
| 11 |
+
from sklearn.datasets import make_regression
|
| 12 |
+
from sklearn.model_selection import train_test_split
|
| 13 |
+
import streamlit as st
|
| 14 |
+
import evaluationer
|
| 15 |
+
|
| 16 |
+
from sklearn.metrics import root_mean_squared_error
|
| 17 |
+
|
| 18 |
+
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
|
| 19 |
+
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
|
| 20 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 21 |
+
from sklearn.tree import DecisionTreeClassifier
|
| 22 |
+
from sklearn.svm import SVC
|
| 23 |
+
from xgboost import XGBClassifier, XGBRFClassifier
|
| 24 |
+
from sklearn.neural_network import MLPClassifier
|
| 25 |
+
from lightgbm import LGBMClassifier
|
| 26 |
+
from sklearn.naive_bayes import MultinomialNB, CategoricalNB
|
| 27 |
+
|
| 28 |
+
# Hyperparameter search spaces for each classifier, keyed by the same display
# names as the `classifiers` dictionary below. Values are GridSearchCV-ready
# param_grid objects (a dict, or a list of dicts for solver-dependent options).
param_grids_class = {
    # BUG FIX: the original flat grid combined solver='lbfgs' with penalty
    # 'l1'/'elasticnet' (unsupported -> fit-time error) and used the string
    # 'none', which scikit-learn 1.4 (pinned in requirements) rejects in favor
    # of None. A list of dicts keeps every combination valid.
    "Logistic Regression": [
        {'solver': ['lbfgs'], 'penalty': ['l2', None], 'C': [0.01, 0.1, 1, 10]},
        {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10]},
        {'solver': ['saga'], 'penalty': ['l1', 'l2', None], 'C': [0.01, 0.1, 1, 10]},
    ],

    "SGD Classifier": {
        # BUG FIX: loss 'log' was removed in scikit-learn 1.3; the valid name is
        # 'log_loss' (logistic regression loss).
        'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge'],
        'penalty': ['l2', 'l1', 'elasticnet'],
        'alpha': [0.0001, 0.001, 0.01],
        'max_iter': [1000, 5000, 10000]
    },

    "Ridge Classifier": {
        'alpha': [0.1, 1, 10, 100]
    },

    "Random Forest Classifier": {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "AdaBoost Classifier": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },

    "Gradient Boosting Classifier": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "Hist Gradient Boosting Classifier": {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [None, 10, 20],
        'min_samples_leaf': [20, 50, 100]
    },

    "K Neighbors Classifier": {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },

    "Decision Tree Classifier": {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "SVC": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'poly', 'rbf'],
        'degree': [3, 4, 5],  # degree only affects the 'poly' kernel
        'gamma': ['scale', 'auto']
    },

    "XGB Classifier": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "XGBRF Classifier": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "MLP Classifier": {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['tanh', 'relu'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },

    "LGBM Classifier": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [-1, 10, 20]  # -1 means no depth limit in LightGBM
    },

    "Multinomial Naive Bayes": {
        'alpha': [0.1, 0.5, 1.0]
    },

    "Categorical Naive Bayes": {
        'alpha': [0.1, 0.5, 1.0]
    }
}
|
| 123 |
+
|
| 124 |
+
# Hyperparameter search spaces for each regressor, keyed by the same display
# names as the `regressors` dictionary below. An empty dict means GridSearchCV
# just fits the estimator with its defaults.
param_grids_reg = {
    "Linear Regression": {},

    "SGD Regressor": {
        # BUG FIX: loss 'squared_loss' was removed in scikit-learn 1.2; the
        # valid name (for the pinned scikit-learn==1.4.2) is 'squared_error'.
        'loss': ['squared_error', 'huber'],
        'penalty': ['l2', 'l1', 'elasticnet'],
        'alpha': [0.0001, 0.001, 0.01],
        'max_iter': [1000, 5000, 10000]
    },

    "Ridge Regressor": {
        'alpha': [0.1, 1, 10, 100],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr']
    },

    "Lasso Regressor": {
        'alpha': [0.1, 1, 10, 100]
    },

    "ElasticNet Regressor": {
        'alpha': [0.1, 1, 10, 100],
        'l1_ratio': [0.1, 0.5, 0.9]
    },

    "Random Forest Regressor": {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "AdaBoost Regressor": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },

    "Gradient Boosting Regressor": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "Hist Gradient Boosting Regressor": {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [None, 10, 20],
        'min_samples_leaf': [20, 50, 100]
    },

    "K Neighbors Regressor": {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },

    "Decision Tree Regressor": {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "SVR": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'poly', 'rbf'],
        'degree': [3, 4, 5],  # degree only affects the 'poly' kernel
        'gamma': ['scale', 'auto']
    },

    "XGB Regressor": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "XGBRF Regressor": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },

    "MLP Regressor": {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['tanh', 'relu'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },

    "LGBM Regressor": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [-1, 10, 20]  # -1 means no depth limit in LightGBM
    },

    # NOTE(review): GaussianNB is a classifier; it is kept here (and in the
    # `regressors` dict) only because the existing UI lists it under regression.
    "Gaussian Naive Bayes": {
        'var_smoothing': [1e-9, 1e-8, 1e-7]
    }
}
|
| 221 |
+
|
| 222 |
+
# Define the regressors
# Lookup table of display name -> freshly constructed regressor instance.
# Built from (name, class) pairs so each entry is instantiated with defaults.
regressors = {
    label: estimator_cls()
    for label, estimator_cls in (
        ("Linear Regression", LinearRegression),
        ("SGD Regressor", SGDRegressor),
        ("Ridge Regressor", Ridge),
        ("Lasso Regressor", Lasso),
        ("ElasticNet Regressor", ElasticNet),
        ("Random Forest Regressor", RandomForestRegressor),
        ("AdaBoost Regressor", AdaBoostRegressor),
        ("Gradient Boosting Regressor", GradientBoostingRegressor),
        ("Hist Gradient Boosting Regressor", HistGradientBoostingRegressor),
        ("K Neighbors Regressor", KNeighborsRegressor),
        ("Decision Tree Regressor", DecisionTreeRegressor),
        ("SVR", SVR),
        ("XGB Regressor", XGBRegressor),
        ("XGBRF Regressor", XGBRFRegressor),
        ("MLP Regressor", MLPRegressor),
        ("LGBM Regressor", LGBMRegressor),
        ("Gaussian Naive Bayes", GaussianNB),
    )
}
|
| 242 |
+
|
| 243 |
+
# Lookup table of display name -> freshly constructed classifier instance.
# Built from (name, class) pairs so each entry is instantiated with defaults.
classifiers = {
    label: estimator_cls()
    for label, estimator_cls in (
        ("Logistic Regression", LogisticRegression),
        ("SGD Classifier", SGDClassifier),
        ("Ridge Classifier", RidgeClassifier),
        ("Random Forest Classifier", RandomForestClassifier),
        ("AdaBoost Classifier", AdaBoostClassifier),
        ("Gradient Boosting Classifier", GradientBoostingClassifier),
        ("Hist Gradient Boosting Classifier", HistGradientBoostingClassifier),
        ("K Neighbors Classifier", KNeighborsClassifier),
        ("Decision Tree Classifier", DecisionTreeClassifier),
        ("SVC", SVC),
        ("XGB Classifier", XGBClassifier),
        ("XGBRF Classifier", XGBRFClassifier),
        ("MLP Classifier", MLPClassifier),
        ("LGBM Classifier", LGBMClassifier),
        ("Multinomial Naive Bayes", MultinomialNB),
        ("Categorical Naive Bayes", CategoricalNB),
    )
}
|
| 261 |
+
def perform_grid_search(model, model_name, X_train, X_test, y_train, y_test, eva):
    """Tune `model_name` with 5-fold GridSearchCV and evaluate the best estimator.

    Parameters
    ----------
    model : estimator
        Kept for signature compatibility with existing callers; the tuned
        `best_estimator_` (not this object) is what gets evaluated.
    model_name : str
        Key into the module-level `regressors`/`classifiers` and
        `param_grids_reg`/`param_grids_class` dictionaries.
    X_train, X_test, y_train, y_test : array-like
        Train/test split of the data.
    eva : str
        "reg" for regression (scored by negative MSE) or "class" for
        classification (scored by accuracy).

    Returns
    -------
    estimator
        The fitted best estimator found by the grid search.
    """
    # Select the estimator, grid, and scoring once; the search itself is
    # identical for both problem types (the original duplicated this block).
    if eva == "reg":
        estimator = regressors[model_name]
        param_grid = param_grids_reg[model_name]
        scoring = 'neg_mean_squared_error'
    elif eva == "class":
        estimator = classifiers[model_name]
        param_grid = param_grids_class[model_name]
        scoring = 'accuracy'
    else:
        # The original silently did nothing for an unknown `eva`; fail loudly.
        raise ValueError(f"eva must be 'reg' or 'class', got {eva!r}")

    grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid,
                               cv=5, scoring=scoring)
    grid_search.fit(X_train, y_train)

    st.write(f"Best Parameters for {model_name}: {grid_search.best_params_}")
    st.write(f"Best Score for {model_name}: {grid_search.best_score_}")

    best_model = grid_search.best_estimator_
    # BUG FIX: the original evaluated the untuned `model` argument and discarded
    # both `best_model` and an unused `y_pred` — evaluate the tuned estimator.
    # NOTE(review): root_mean_squared_error is forwarded for both branches,
    # mirroring the original call; confirm evaluationer.evaluation ignores the
    # metric argument when eva == "class".
    evaluationer.evaluation("best hyperparams", X_train, X_test, y_train, y_test,
                            best_model, root_mean_squared_error, eva)
    return best_model
|
models.py
CHANGED
|
@@ -23,6 +23,8 @@ from sklearn.neural_network import MLPRegressor
|
|
| 23 |
from lightgbm import LGBMRegressor
|
| 24 |
from sklearn.naive_bayes import GaussianNB
|
| 25 |
|
|
|
|
|
|
|
| 26 |
# dictionary where keys are name of algorithm and values are algorithm for classifier
|
| 27 |
algos_class = {
|
| 28 |
"Logistic Regression": LogisticRegression(),
|
|
|
|
| 23 |
from lightgbm import LGBMRegressor
|
| 24 |
from sklearn.naive_bayes import GaussianNB
|
| 25 |
|
| 26 |
+
|
| 27 |
+
|
| 28 |
# dictionary where keys are name of algorithm and values are algorithm for classifier
|
| 29 |
algos_class = {
|
| 30 |
"Logistic Regression": LogisticRegression(),
|
requirements.txt
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
|
|
| 1 |
streamlit==1.34.0
|
| 2 |
joblib==1.4.2
|
| 3 |
numpy==1.26.4
|
| 4 |
pandas==2.2.2
|
| 5 |
scikit-learn==1.4.2
|
| 6 |
-
|
|
|
|
|
|
|
| 7 |
matplotlib==3.9.0
|
| 8 |
-
|
| 9 |
-
lightgbm==4.3.0
|
| 10 |
-
statsmodels==0.14.2
|
|
|
|
| 1 |
+
|
| 2 |
streamlit==1.34.0
|
| 3 |
joblib==1.4.2
|
| 4 |
numpy==1.26.4
|
| 5 |
pandas==2.2.2
|
| 6 |
scikit-learn==1.4.2
|
| 7 |
+
datashader==0.16.2
|
| 8 |
+
colorcet==3.1.0
|
| 9 |
+
plotly==5.22.0
|
| 10 |
matplotlib==3.9.0
|
| 11 |
+
seaborn==0.13.2
|
|
|
|
|
|