Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import numpy as np | |
| import pandas as pd | |
| import io | |
| import matplotlib.pyplot as plt | |
| from matplotlib.ticker import PercentFormatter | |
| import seaborn as sns | |
| from sklearn.preprocessing import ( | |
| OneHotEncoder, | |
| OrdinalEncoder, | |
| StandardScaler, | |
| MinMaxScaler, | |
| ) | |
| from sklearn.model_selection import train_test_split | |
| from imblearn.under_sampling import RandomUnderSampler | |
| from imblearn.over_sampling import RandomOverSampler, SMOTE | |
| from sklearn.linear_model import Ridge, Lasso, LogisticRegression | |
| from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier | |
| from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier | |
| from sklearn.svm import SVR, SVC | |
| from sklearn.naive_bayes import MultinomialNB | |
| from xgboost import XGBRFRegressor, XGBRFClassifier | |
| from lightgbm import LGBMRegressor, LGBMClassifier | |
| from sklearn.metrics import ( | |
| mean_absolute_error, | |
| mean_squared_error, | |
| mean_squared_error, | |
| r2_score, | |
| ) | |
| from sklearn.metrics import ( | |
| accuracy_score, | |
| f1_score, | |
| confusion_matrix, | |
| precision_score, | |
| recall_score, | |
| ) | |
| import pickle | |
# Page-level Streamlit configuration (browser tab title and icon).
st.set_page_config(page_title="Data Analytics", page_icon="📊")

# Global seaborn look shared by every chart in the app.
sns.set_style("white")
sns.set_context("poster", font_scale=0.7)

# Colour cycle used by the plotting helpers; individual helpers index into it
# (e.g. palette[3]) or slice it (palette[1:]).
palette = [
    "#1d7874", "#679289", "#f4c095", "#ee2e31", "#ffb563",
    "#918450", "#f85e00", "#a41623", "#9a031e", "#d6d6d6",
    "#ffee32", "#ffd100", "#333533", "#202020",
]
def main():
    """Render the Streamlit app: sidebar controls plus the selected page.

    The uploaded CSV is parsed when "Process files" is pressed and kept in
    ``st.session_state["data"]`` so it survives Streamlit reruns. Nothing but
    the landing title is shown until a dataset has been loaded.
    """
    file = st.sidebar.file_uploader("Upload Your CSV File Here: ")
    process = st.sidebar.button("Process files")
    # NOTE(review): "Preprocess" and "Training and Evaluation" are not offered
    # here, so those two pages cannot currently be reached from the UI. Their
    # handlers stay registered below; adding the labels to this tuple is all
    # that is needed to enable them.
    option = st.sidebar.radio(
        "Select Data Analysis: ",
        (
            "Exploratory Data Analysis",
            "Univariate Analysis",
            "Bivariate Analysis",
        ),
    )

    placeholder = st.empty()
    placeholder.markdown(
        "<h1 style='text-align: center;'>Tabular/CSV Data Analytics📊</h1>",
        unsafe_allow_html=True,
    )

    if file is not None and process:
        st.session_state["data"] = load_csv(file)

    if "data" not in st.session_state:
        return

    data = st.session_state["data"]
    placeholder.empty()

    pages = {
        "Exploratory Data Analysis": _render_eda,
        "Univariate Analysis": _render_univariate,
        "Bivariate Analysis": _render_bivariate,
        "Preprocess": _render_preprocess,
        "Training and Evaluation": _render_training,
    }
    pages[option](data)


def _page_title(text):
    """Write a centred <h1> page heading."""
    st.markdown(
        f"<h1 style='text-align: center;'>{text}</h1>",
        unsafe_allow_html=True,
    )


def _render_eda(data):
    """Show the overview, dtypes, missing data, value counts and describe()."""
    _page_title("Exploratory Data Analysis")
    st.subheader("Data Overview")
    st.write(data_overview(data))
    st.write(duplicate(data))
    st.dataframe(data.head())
    st.subheader("Data Types and Unique Value Counts")
    display_data_info(data)
    st.subheader("Missing Data")
    missing_data(data)
    st.subheader("Value Counts")
    value_counts(data)
    st.subheader("Descriptive Statistics")
    st.write(data.describe().T)


def _render_univariate(data):
    """Let the user pick a single-column chart and the column to draw."""
    _page_title("Univariate Analysis")
    categorical = list(data.select_dtypes("O"))
    numeric = list(data.select_dtypes(include=["int", "float"]))
    # Chart label -> (columns it can be drawn for, drawing function).
    charts = {
        "Count Plot": (categorical, countplot),
        "Pie Chart": (categorical, piechart),
        "Histogram": (numeric, histogram),
        "Violin Plot": (numeric, violinplot),
        "Scatter Plot": (numeric, scatterplot),
    }
    plot = st.radio("Select a chart: ", tuple(charts))
    candidates, draw = charts[plot]
    # The leading "" is the "nothing selected yet" placeholder entry.
    column = st.selectbox("Select a column", [""] + candidates)
    if column:
        draw(data, column)


def _render_bivariate(data):
    """Let the user pick a two-column chart (or a Pareto chart) to draw."""
    _page_title("Bivariate Analysis")
    plot = st.radio(
        "Select a chart: ",
        ("Scatter Plot", "Bar Plot", "Box Plot", "Pareto Chart"),
    )

    if plot == "Pareto Chart":
        column = st.selectbox(
            "Select a columns",
            [""] + list(data.select_dtypes(include="object")),
        )
        if column:
            paretoplot(data, column)
        return

    # Chart label -> (selectable columns, drawing function). No "" placeholder
    # here: in a multiselect it would be pickable as a (non-existent) column.
    pair_charts = {
        "Scatter Plot": (
            list(data.select_dtypes(include=["int", "float"])),
            biscatterplot,
        ),
        "Bar Plot": (list(data.columns), bibarplot),
        "Box Plot": (list(data.columns), biboxplot),
    }
    candidates, draw = pair_charts[plot]
    columns = st.multiselect("Select two columns", candidates)
    if len(columns) >= 2:
        draw(data, columns)
    elif columns:
        # Previously a single selection surfaced "list index out of range".
        st.info("Select a second column to draw the chart.")


def _render_preprocess(data):
    """Run the chosen preprocessing step, persist the result, offer a CSV."""
    _page_title("Data Preprocessing")
    operation = st.radio(
        "Select preprocessing step: ",
        (
            "Drop Columns",
            "Handling Missing Values",
            "Encode Categorical Features",
        ),
    )
    if operation == "Drop Columns":
        data = _drop_columns(data)
    elif operation == "Handling Missing Values":
        data = _handle_missing_values(data)
    elif operation == "Encode Categorical Features":
        data = _encode_categoricals(data)

    # Persist so the next rerun (and the other pages) see the processed frame.
    st.session_state["data"] = data

    if st.download_button(
        label="Download Preprocessed Data",
        key="preprocessed_data",
        on_click=None,
        data=data.to_csv(index=False).encode(),
        file_name="preprocessed_data.csv",
        mime="text/csv",
    ):
        st.success('Data Downloaded')


def _drop_columns(data):
    """Drop the user-selected columns once the button is pressed."""
    columns = st.multiselect("Select Columns to drop: ", (data.columns))
    if st.button("Drop Columns"):
        data = data.drop(columns=columns)
        st.success("Dropped selected columns✅✅✅")
    return data


def _handle_missing_values(data):
    """Impute or drop missing values with the strategies the user selected."""
    num_missing = st.selectbox(
        "Select a Approach (Numerical columns only): ",
        ("", "Drop", "Backward Fill", "Forward Fill", "Mean", "Median"),
    ).lower()
    cat_missing = st.selectbox(
        "Select a Approach (Categorical columns only): ",
        ("", "Drop", "Most Frequent Values", "Replace with 'Unknown'"),
    ).lower()
    if not st.button("Handle Missing Values"):
        return data

    if num_missing:
        num_cols = data.select_dtypes(include=["int64", "float64"]).columns
        # Strategy -> function producing the fill values for the numeric part.
        fillers = {
            "mean": pd.DataFrame.mean,
            "median": pd.DataFrame.median,
            "backward fill": pd.DataFrame.bfill,
            "forward fill": pd.DataFrame.ffill,
        }
        if num_missing == "drop":
            data = data.dropna(subset=num_cols)
        else:
            # fillna aligns on column labels, so only numeric columns change.
            data = data.fillna(value=fillers[num_missing](data[num_cols]))
        st.success(
            "Imputed missing values in numerical columns with selected approach."
        )

    if cat_missing:
        cat_cols = data.select_dtypes(exclude=["int", "float"]).columns
        if cat_missing == "drop":
            data = data.dropna(subset=cat_cols)
        elif cat_missing == "most frequent values":
            modes = data[cat_cols].mode()
            # mode() is empty when there is nothing to summarise.
            if not modes.empty:
                data[cat_cols] = data[cat_cols].fillna(modes.iloc[0])
        elif cat_missing == "replace with 'unknown'":
            data[cat_cols] = data[cat_cols].fillna("Unknown")
        st.success(
            "Imputed missing values in categorical columns with selected approach."
        )
    return data


def _bools_to_int(data):
    """Convert boolean columns to 0/1 integers in place and return the frame."""
    bool_columns = data.select_dtypes(include=bool).columns
    data[bool_columns] = data[bool_columns].astype(int)
    return data


def _encode_categoricals(data):
    """Ordinal-encode the chosen columns and one-hot encode the other objects."""
    oe_columns = st.multiselect(
        "Choose Columns for Ordinal Encoding",
        list(data.select_dtypes(include="object")),
    )
    st.info("Other columns will be One Hot Encoded.")
    if not st.button("Encode Columns"):
        return data

    data = _bools_to_int(data)
    if oe_columns:
        data[oe_columns] = OrdinalEncoder().fit_transform(
            data[oe_columns].astype("str")
        )
    remaining_cat_cols = [
        col
        for col in data.select_dtypes(include="object")
        if col not in oe_columns
    ]
    if remaining_cat_cols:
        data = pd.get_dummies(data, columns=remaining_cat_cols, drop_first=False)
    st.success("Encoded categorical columns")
    # get_dummies may emit boolean indicator columns; normalise them to 0/1.
    return _bools_to_int(data)


def _render_training(data):
    """Ask for the task type and run the matching train/evaluate flow."""
    _page_title("Training and Evaluation")
    algo = st.selectbox(
        "Choose Algorithm Type:", ("", "Regression", "Classification")
    )
    if algo == "Regression":
        _train_regression(data)
    elif algo == "Classification":
        _train_classification(data)


def _split_train_test(data, target):
    """Split *data* 80/20 into train/test sets around the *target* column.

    Returns the ``(X_train, X_test, y_train, y_test)`` tuple, or ``None`` when
    the target column cannot be separated (the error is shown to the user).
    """
    try:
        features = data.drop(columns=[target])
        labels = data[target]
    except KeyError as e:
        st.write(str(e))
        return None
    st.write(
        "80% of the data will be used for training the model, rest of 20% data will be used for evaluating the model."
    )
    return train_test_split(features, labels, test_size=0.2, random_state=42)


def _scale_features(X_train, X_test):
    """Optionally scale features; the scaler is fitted on the train set only."""
    scale = st.selectbox(
        "Choose how do you want to scale features:",
        ("", "Standard Scaler", "Min Max Scaler"),
    )
    scalers = {"Standard Scaler": StandardScaler, "Min Max Scaler": MinMaxScaler}
    if scale in scalers:
        scaler = scalers[scale]()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    return X_train, X_test


def _balance_training_set(data, target, X_train, y_train):
    """Optionally resample the training set to even out the class balance."""
    balance = st.selectbox("Do you want to balance dataset?", ("", "Yes", "No"))
    if balance != "Yes":
        return X_train, y_train

    piechart(data, target)
    sample = st.selectbox(
        "Which approach you want to use?",
        ("", "Random Under Sampling", "Random Over Sampling", "SMOTE"),
    )
    samplers = {
        "Random Under Sampling": RandomUnderSampler,
        "Random Over Sampling": RandomOverSampler,
        "SMOTE": SMOTE,
    }
    if sample in samplers:
        X_train, y_train = samplers[sample](random_state=42).fit_resample(
            X_train, y_train
        )
    return X_train, y_train


def _offer_model_download(model, file_name):
    """Offer the pickled model for download; also save it locally on click."""
    payload = pickle.dumps(model)
    if st.download_button(
        label="Download Trained Model",
        key="trained_model",
        on_click=None,
        data=payload,
        file_name=file_name,
        mime="application/octet-stream",
    ):
        with open(file_name, "wb") as model_file:
            model_file.write(payload)


def _report_regression_metrics(y_true, y_pred):
    """Write MAE, MSE, RMSE and R² for the held-out predictions."""
    mse = mean_squared_error(y_true, y_pred)
    st.write(
        "Mean Absolute Error (MAE): {:.4f}".format(
            mean_absolute_error(y_true, y_pred)
        )
    )
    st.write("Mean Squared Error (MSE): {:.4f}".format(mse))
    # sqrt(MSE) instead of the `squared=False` flag, which newer scikit-learn
    # releases no longer accept.
    st.write("Root Mean Squared Error (RMSE): {:.4f}".format(np.sqrt(mse)))
    # R² is not symmetric: ground truth must be the first argument.
    st.write("R-squared (R²): {:.4f}".format(r2_score(y_true, y_pred)))


def _report_classification_metrics(y_true, y_pred):
    """Write accuracy plus F1/precision/recall (macro-averaged if needed)."""
    st.write("Accuracy Score: {:.4f}".format(accuracy_score(y_true, y_pred)))
    try:
        # The default binary averaging raises ValueError for targets it
        # cannot treat as binary; fall back to macro averages then.
        f1 = f1_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
    except ValueError:
        st.write(
            "Macro Precision Score: {:.4f}".format(
                precision_score(y_true, y_pred, average="macro")
            )
        )
        st.write(
            "Macro Recall Score: {:.4f}".format(
                recall_score(y_true, y_pred, average="macro")
            )
        )
        st.write(
            "Macro F1 Score: {:.4f}".format(
                f1_score(y_true, y_pred, average="macro")
            )
        )
    else:
        st.write("F1 Score: {:.4f}".format(f1))
        st.write("Precision Score: {:.4f}".format(precision))
        st.write("Recall Score: {:.4f}".format(recall))


def _train_regression(data):
    """Train the selected regressor, report its metrics, offer a download."""
    target = st.selectbox("Chose Target Variable (Y): ", list(data.columns))
    split = _split_train_test(data, target)
    if split is None:
        return
    X_train, X_test, y_train, y_test = split
    X_train, X_test = _scale_features(X_train, X_test)

    # Label -> (estimator factory, download file name).
    regressors = {
        "Ridge Regression": (
            lambda: Ridge(alpha=1.0),
            "ridge_regression_model.pkl",
        ),
        "Decision Tree Regressor": (
            lambda: DecisionTreeRegressor(max_depth=10),
            "decision_tree_regression_model.pkl",
        ),
        "Random Forest Regressor": (
            lambda: RandomForestRegressor(max_depth=10, n_estimators=100),
            "random_forest_regression_model.pkl",
        ),
        "SVR": (lambda: SVR(C=1.0, epsilon=0.2), "svr_model.pkl"),
        "XGBRF Regressor": (
            lambda: XGBRFRegressor(reg_lambda=1),
            "xgbrf_regression_model.pkl",
        ),
        "LGBM Regressor": (
            lambda: LGBMRegressor(reg_lambda=1),
            "lgbm_regression_model.pkl",
        ),
    }
    model = st.selectbox(
        "Choose Regression Model for training: ", ("", *regressors)
    )
    if not model:
        return

    make_estimator, file_name = regressors[model]
    reg = make_estimator()
    reg.fit(X_train, y_train)
    pred = reg.predict(X_test)
    _report_regression_metrics(y_test, pred)
    _offer_model_download(reg, file_name)


def _train_classification(data):
    """Train the selected classifier, report its metrics, offer a download."""
    target = st.selectbox("Chose Target Variable (Y): ", list(data.columns))
    split = _split_train_test(data, target)
    if split is None:
        return
    X_train, X_test, y_train, y_test = split
    X_train, y_train = _balance_training_set(data, target, X_train, y_train)
    X_train, X_test = _scale_features(X_train, X_test)

    # Label -> (estimator factory, confusion-matrix title, download file name).
    classifiers = {
        "Logistic Regression": (
            lambda: LogisticRegression(penalty="l2"),
            "Logistic Regression Confusion Matrix ",
            "logistic_regression_model.pkl",
        ),
        "Decision Tree Classifier": (
            lambda: DecisionTreeClassifier(max_depth=5),
            "DecisionTree Classifier Confusion Matrix ",
            "decision_tree_classifier_model.pkl",
        ),
        "Random Forest Classifier": (
            lambda: RandomForestClassifier(n_estimators=100, max_depth=5),
            "RandomForest Classifier Confusion Matrix ",
            "random_forest_classifier_model.pkl",
        ),
        "SVC": (lambda: SVC(C=1.5), "SVC Confusion Matrix ", "svc_model.pkl"),
        "XGBRF Classifier": (
            lambda: XGBRFClassifier(reg_lambda=1.0),
            "XGBRF Classifier Confusion Matrix ",
            "xgbrf_classifier_model.pkl",
        ),
        "LGBM Classifier": (
            lambda: LGBMClassifier(reg_lambda=1.0),
            "LGBM Classifier Confusion Matrix ",
            "lgbm_classifier_model.pkl",
        ),
    }
    model = st.selectbox(
        "Choose Classification Model for training: ", ("", *classifiers)
    )
    if not model:
        return

    make_estimator, matrix_title, file_name = classifiers[model]
    clf = make_estimator()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    _report_classification_metrics(y_test, pred)
    # Ground truth first, so rows are true labels and columns are predictions.
    plot_confusion_matrix(y_test, pred, matrix_title)
    _offer_model_download(clf, file_name)
def load_csv(file):
    """Parse *file* (a path or file-like object) as CSV into a DataFrame."""
    return pd.read_csv(file)
def data_overview(data):
    """Write the row count to the page and return the column-count line.

    The row count is emitted directly with ``st.write``; the column count is
    returned as a string for the caller to display.
    """
    n_rows = data.shape[0]
    n_cols = data.shape[1]
    st.write(f"Number of Rows: {n_rows}")
    return f"Number of Columns: {n_cols}"
def missing_data(data):
    """Render an HTML table of columns with missing values.

    Only columns with at least one missing value are listed, each with its
    absolute count and its share of all rows as a percentage string.
    Returns whatever ``st.markdown`` returns.
    """
    na_counts = data.isna().sum()
    na_counts = na_counts[na_counts > 0]
    na_share = ((na_counts / data.shape[0]) * 100).round(2).astype(str) + "%"
    table = pd.DataFrame({"Missing Values": na_counts, "Percentage": na_share})
    table_html = table.to_html(classes="table table-striped", justify="center")
    return st.markdown(table_html, unsafe_allow_html=True)
def display_data_info(data):
    """Render an HTML table with each column's dtype and unique-value count.

    Returns whatever ``st.markdown`` returns.
    """
    # Build two (Column, value) tables and join them on the column name.
    type_table = data.dtypes.rename("Data Type").rename_axis("Column").reset_index()
    count_table = (
        data.nunique().rename("Unique Counts").rename_axis("Column").reset_index()
    )
    merged = pd.merge(type_table, count_table, on="Column")
    merged_html = merged.to_html(classes="table table-striped", justify="center")
    return st.markdown(merged_html, unsafe_allow_html=True)
def value_counts(data):
    """Let the user pick a column and show that column's value counts."""
    # The leading "" entry means "nothing selected yet".
    chosen = st.selectbox("Select a Column", ["", *data.columns])
    if not chosen:
        return
    st.write(data[chosen].value_counts())
def duplicate(data):
    """Report duplicate rows and drop them from *data* in place.

    Returns a "no duplicates" message when the frame is clean. Otherwise the
    duplicate count is written to the page, all but the first occurrence of
    each duplicated row are removed from *data*, and "" is returned.
    """
    dup_mask = data.duplicated()
    if not dup_mask.any():
        return "There are no duplicate rows in the DataFrame."

    st.write(
        f"There is/are {dup_mask.sum()} duplicate rows in the DataFrame. Duplicated values will be dropped."
    )
    data.drop_duplicates(keep="first", inplace=True)
    return ""
def countplot(data, col):
    """Draw a horizontal count plot of the values in ``data[col]``.

    The chart is rendered into the Streamlit page. The figure is created
    explicitly and closed after rendering so reruns do not accumulate open
    matplotlib figures.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(
        y=data[col],
        palette=palette[1:],
        edgecolor="#1c1c1c",
        linewidth=2,
        ax=ax,
    )
    ax.set_title(f"Countplot of {col} Column")
    st.pyplot(fig)
    plt.close(fig)
def piechart(data, col):
    """Draw a pie chart of the value distribution of ``data[col]``.

    The chart is rendered into the Streamlit page. The figure is created
    explicitly and closed after rendering so reruns do not accumulate open
    matplotlib figures.
    """
    counts = data[col].value_counts()
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.pie(
        counts,
        labels=counts.index,
        autopct="%1.1f%%",
        colors=palette,
        shadow=False,
        wedgeprops=dict(edgecolor="#1c1c1c"),
    )
    ax.set_title(f"Pie Chart of {col} Column")
    st.pyplot(fig)
    plt.close(fig)
def histogram(data, col):
    """Draw a histogram with a KDE overlay for ``data[col]``.

    The chart is rendered into the Streamlit page. The figure is created
    explicitly and closed after rendering so reruns do not accumulate open
    matplotlib figures.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(
        data[col],
        kde=True,
        color=palette[4],
        fill=True,
        edgecolor="#1c1c1c",
        linewidth=2,
        ax=ax,
    )
    ax.set_title(f"Histogram of {col} Column")
    st.pyplot(fig)
    plt.close(fig)
def violinplot(data, col):
    """Draw a violin plot of ``data[col]``.

    The chart is rendered into the Streamlit page. The figure is created
    explicitly and closed after rendering so reruns do not accumulate open
    matplotlib figures.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.violinplot(data[col], color=palette[8], ax=ax)
    ax.set_title(f"Violin Plot of {col} Column")
    st.pyplot(fig)
    plt.close(fig)
def scatterplot(data, col):
    """Draw a scatter plot of the single column ``data[col]``.

    The chart is rendered into the Streamlit page. The figure is created
    explicitly and closed after rendering so reruns do not accumulate open
    matplotlib figures.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(data[col], color=palette[3], ax=ax)
    ax.set_title(f"Scatter Plot of {col} Column")
    st.pyplot(fig)
    plt.close(fig)
def biscatterplot(data, cols):
    """Draw a scatter plot of ``cols[0]`` (x) against ``cols[1]`` (y).

    *cols* must hold at least two column names; only the first two are used.
    Plotting errors are shown on the page instead of being raised, and the
    figure is always closed so reruns do not accumulate open figures.
    """
    if len(cols) < 2:
        st.write("Select two columns to draw the scatter plot.")
        return

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.scatterplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Scatter Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        # UI boundary: report the failure to the user rather than crash.
        st.write(str(e))
    finally:
        plt.close(fig)
def bibarplot(data, cols):
    """Draw a bar plot of ``cols[1]`` (y) grouped by ``cols[0]`` (x).

    *cols* must hold at least two column names; only the first two are used.
    Plotting errors are shown on the page instead of being raised, and the
    figure is always closed so reruns do not accumulate open figures.
    """
    if len(cols) < 2:
        st.write("Select two columns to draw the bar plot.")
        return

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.barplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Bar Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        # UI boundary: report the failure to the user rather than crash.
        st.write(str(e))
    finally:
        plt.close(fig)
def biboxplot(data, cols):
    """Draw a box plot of ``cols[1]`` (y) grouped by ``cols[0]`` (x).

    *cols* must hold at least two column names; only the first two are used.
    Plotting errors are shown on the page instead of being raised, and the
    figure is always closed so reruns do not accumulate open figures.
    """
    if len(cols) < 2:
        st.write("Select two columns to draw the box plot.")
        return

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.boxplot(
            data=data,
            x=cols[0],
            y=cols[1],
            palette=palette[1:],
            linewidth=2,
            ax=ax,
        )
        ax.set_title(f"Box Plot of {cols[0]} and {cols[1]} Columns")
        st.pyplot(fig)
    except Exception as e:
        # UI boundary: report the failure to the user rather than crash.
        st.write(str(e))
    finally:
        plt.close(fig)
def paretoplot(data, categorical_col):
    """Draw a Pareto chart for ``data[categorical_col]``.

    Bars show each category's frequency in descending order (left axis); the
    line shows the cumulative share of all rows in percent (right axis).
    Errors are shown on the page instead of being silently discarded, and
    the figure is always closed so reruns do not accumulate open figures.
    """
    fig = None
    try:
        # value_counts() already returns frequencies in descending order, so
        # the cumulative share below lines up with the bars without a re-sort.
        counts = data[categorical_col].value_counts()
        cumulative_share = (counts / counts.sum()).cumsum()
        pareto_df = pd.DataFrame(
            {
                "Categories": counts.index,
                "Frequency": counts.values,
                "Cumulative Percentage": cumulative_share.values * 100,
            }
        )

        fig, ax1 = plt.subplots(figsize=(10, 8))
        ax1.bar(
            pareto_df["Categories"],
            pareto_df["Frequency"],
            color=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )
        # Second y-axis sharing the x-axis, formatted as percentages.
        ax2 = ax1.twinx()
        ax2.yaxis.set_major_formatter(PercentFormatter())
        ax2.plot(
            pareto_df["Categories"],
            pareto_df["Cumulative Percentage"],
            color=palette[3],
            marker="D",
            ms=10,
        )
        ax1.set_xlabel(categorical_col)
        ax1.set_ylabel("Frequency", color=palette[0])
        ax2.set_ylabel("Cumulative Percentage", color=palette[3])
        st.pyplot(fig)
    except Exception as e:
        # UI boundary: report the failure, matching the other plot helpers.
        st.write(str(e))
    finally:
        if fig is not None:
            plt.close(fig)
def plot_confusion_matrix(y_true, y_pred, title):
    """Render a confusion-matrix heatmap for the given labels.

    Args:
        y_true: Ground-truth labels (rows of the matrix).
        y_pred: Predicted labels (columns of the matrix).
        title: Title shown above the heatmap.

    The figure is created explicitly and closed after rendering so reruns do
    not accumulate open matplotlib figures.
    """
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    ax.set_title(title)
    st.pyplot(fig)
    plt.close(fig)
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()