RahulParajuli12 commited on
Commit
2735152
·
1 Parent(s): e64e826

Added application file

Browse files
__pycache__/classification.cpython-310.pyc ADDED
Binary file (6.59 kB). View file
 
__pycache__/data_clean.cpython-310.pyc ADDED
Binary file (1.74 kB). View file
 
__pycache__/feature_select.cpython-310.pyc ADDED
Binary file (2.56 kB). View file
 
__pycache__/visualization.cpython-310.pyc ADDED
Binary file (5.13 kB). View file
 
app.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import os
4
+ import json
5
+ import csv
6
+ # import sys
7
+ # sys.path.append('scripts/')
8
+
9
+ from visualization import st_data_visualization
10
+ from feature_select import st_feature_selection
11
+ from classification import st_classification
12
+ from data_clean import handle_missing_value
13
+
14
def try_read_df(f, f_name):
    """Read an uploaded file into a pandas DataFrame based on its extension.

    Parameters
    ----------
    f : file-like object
        The uploaded file buffer (as provided by st.file_uploader).
    f_name : str
        Original file name; only its extension is used to pick a parser.

    Returns
    -------
    pandas.DataFrame or None
        Parsed data, or None when the extension is unsupported or parsing
        fails (the error is surfaced in the Streamlit UI instead of raised).
    """
    _, file_extension = os.path.splitext(f_name)
    # BUG FIX: matching was case-sensitive and mixed startswith/endswith,
    # so "Data.CSV" or "data.JSON" silently fell through to the error
    # branch.  Normalise once and compare the whole extension.
    file_extension = file_extension.lower()
    try:
        if file_extension.startswith(".xls"):
            # ".xls" prefix covers .xls, .xlsx, .xlsm, ...
            return pd.read_excel(f)
        elif file_extension == ".csv":
            return pd.read_csv(f)
        elif file_extension == ".json":
            return pd.read_json(f)
        else:
            st.write("File Type did not match")
    except Exception as e:
        # Surface parser errors in the UI instead of crashing the app.
        st.write(e)
27
+
28
def main():
    """Render the Streamlit app.

    Shows usage instructions, lets the user upload a dataset (persisted to
    temp_data/test.csv for the other pages), and dispatches to the
    visualization / cleanup / feature-selection / classification pages.
    """
    st.title("Rahuls Data Science App")
    st.write("This is a Data Science App for Data Visualization, Data Cleaning, Feature Selection and Classification")
    st.write("This App is built using Streamlit and Python")

    st.subheader("How to use the app")
    # Procedure to upload and use other functions.
    st.write("- Use left side bar that says browse, to upload the files.")
    st.write("- Upload a CSV/Excel file and then choose the functionality you want to use.")
    st.write("- The file will be saved in the temp_data folder.")
    st.write("- Use side bar to navigate to other functionalities.")
    st.write("- The file will be deleted after the session is closed.")
    st.subheader("The app is still in development phase.")
    st.write("Need to add more functionalities in data clean up and feature selection")
    st.write("This App is built by Rahul Parajuli.")
    st.subheader("Start by uploading and see the results below - ")
    st.write("There is also a sample file 'Titanic Dataset' in the program so go ahead and press on app functionality and choose data visualization")

    # SideBar Settings
    st.sidebar.title("TBF Control Panel")
    st.sidebar.info(
        "The Byte Factory"
    )

    # App functionalities.
    primary_function = st.sidebar.selectbox(
        'Choose App Functionality', ["Upload CSV File", "Data Visualization",
                                     "Data Cleanup", "Feature Selection", "Classification"])

    if primary_function == "Upload CSV File":
        uploaded_file = st.sidebar.file_uploader("Upload a CSV/Excel file", accept_multiple_files=False,
                                                 type=("csv", "xls", "json"))

        if uploaded_file is not None:
            data = try_read_df(uploaded_file, uploaded_file.name)
            st.write("Here are the first ten rows of the File")
            st.table(data.head(10))
            file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type,
                            "FileSize": uploaded_file.size}
            st.sidebar.write(file_details)
            # BUG FIX: the raw upload bytes used to be dumped verbatim into
            # temp_data/test.csv, so Excel/JSON uploads broke every page
            # that later calls pd.read_csv("temp_data/test.csv").  Persist
            # the *parsed* DataFrame as CSV instead, and make sure the
            # temp directory exists before writing.
            if data is not None:
                os.makedirs("temp_data", exist_ok=True)
                data.to_csv(os.path.join("temp_data", "test.csv"), index=False)

    if primary_function == "Data Visualization":
        st_data_visualization()
    if primary_function == "Data Cleanup":
        handle_missing_value()
    if primary_function == "Feature Selection":
        st_feature_selection()
    if primary_function == "Classification":
        st_classification()


if __name__ == '__main__':
    main()
classification.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ # import matplotlib.pyplot as plt
4
+ # import seaborn as sns
5
+ from sklearn.metrics import classification_report, confusion_matrix
6
+ import joblib
7
+ import streamlit as st
8
+ import os
9
+ import plotly.express as px
10
+
11
def preprocess(dataset, x_iloc_list, y_iloc, testSize):
    """Split *dataset* into scaled train/test feature matrices and labels.

    dataset     : pandas.DataFrame holding features and the label column.
    x_iloc_list : positional indices of the feature columns.
    y_iloc      : positional index of the label column.
    testSize    : fraction of rows held out for the test split.

    Returns (X_train, X_test, y_train, y_test); the feature matrices are
    standardised with the scaler fit on the training split only.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    features = dataset.iloc[:, x_iloc_list].values
    labels = dataset.iloc[:, y_iloc].values

    # Deterministic split so repeated runs of the app agree.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=testSize, random_state=0)

    # Standardise: fit on train only, then apply the same transform to test.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test
26
+
27
+
28
class classification:
    """Train and evaluate several scikit-learn classifiers in Streamlit.

    Each model method (LR/KNN/SVM/NB/DT/RF) fits on the training split,
    persists the fitted model with joblib, and reports metrics
    (classification report, confusion matrix, accuracy, heatmap) on the
    held-out test split.
    """

    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def accuracy(self, confusion_matrix):
        """Return overall accuracy: trace(cm) / sum(cm).

        `confusion_matrix` must support 2-D indexing cm[i, j]
        (e.g. a numpy array as returned by sklearn).
        """
        # Renamed the accumulator: the original shadowed the builtin `sum`.
        correct, total = 0, 0
        for i in range(len(confusion_matrix)):
            for j in range(len(confusion_matrix[0])):
                if i == j:
                    correct += confusion_matrix[i, j]
                total += confusion_matrix[i, j]
        return correct / total

    def classification_report_plot(self, clf_report):
        """Render the (dict-form) classification report as a plotly heatmap."""
        # Drop the last row (support) before transposing, as in the report dict.
        fig = px.imshow(pd.DataFrame(clf_report).iloc[:-1, :].T)
        st.plotly_chart(fig)

    def _fit_dump_predict(self, model, path):
        """Fit *model* on the training split, persist it to *path* with
        joblib, and return predictions for the test split."""
        model.fit(self.X_train, self.y_train)
        joblib.dump(model, path)
        return model.predict(self.X_test)

    def _evaluate(self, title, y_pred):
        """Shared reporting: header, report table, confusion matrix,
        accuracy and heatmap for the given predictions.

        (This replaces six near-identical copies of the same code.)
        """
        separator = "-" * (len(title) + 8)
        st.write("\n")
        st.write(separator)
        st.write("### " + title + " ###")
        st.write(separator)
        st.write('Classification Report: ')
        clf = classification_report(self.y_test, y_pred, output_dict=True)
        st.table(pd.DataFrame(clf))
        st.write('Confusion Matrix: ')
        cm = confusion_matrix(self.y_test, y_pred)
        st.table(pd.DataFrame(cm))
        # BUG FIX: this value is overall accuracy, but the original
        # labelled it "Precision", which is a different metric.
        st.write('Accuracy: ', self.accuracy(cm) * 100, '%')
        self.classification_report_plot(clf)

    def LR(self):
        from sklearn.linear_model import LogisticRegression
        y_pred = self._fit_dump_predict(LogisticRegression(), "model/lr.sav")
        # BUG FIX: the header previously said "Random Forest Classifier"
        # (copy-paste error).
        self._evaluate("Logistic Regression Classifier", y_pred)

    def KNN(self):
        from sklearn.neighbors import KNeighborsClassifier
        y_pred = self._fit_dump_predict(KNeighborsClassifier(), "model/knn.sav")
        self._evaluate("K-Neighbors Classifier", y_pred)

    def SVM(self, kernel_type):
        """kernel_type: 'linear' or 'rbf' (Gaussian)."""
        from sklearn.svm import SVC
        y_pred = self._fit_dump_predict(SVC(kernel=kernel_type), "model/svm.sav")
        self._evaluate("Support Vector Classifier (" + kernel_type + ")", y_pred)

    def NB(self):
        from sklearn.naive_bayes import GaussianNB
        y_pred = self._fit_dump_predict(GaussianNB(), "model/nb.sav")
        self._evaluate("Naive Bayes Classifier", y_pred)

    def DT(self):
        from sklearn.tree import DecisionTreeClassifier
        y_pred = self._fit_dump_predict(DecisionTreeClassifier(), "model/tree.sav")
        self._evaluate("Decision Tree Classifier", y_pred)

    def RF(self):
        from sklearn.ensemble import RandomForestClassifier
        y_pred = self._fit_dump_predict(
            RandomForestClassifier(n_estimators=10, criterion='entropy'),
            "model/rf-model.pkl")
        self._evaluate("Random Forest Classifier", y_pred)
181
+
182
+ # primary App interfacing function for classification
183
def st_classification():
    """Streamlit page: pick columns from temp_data/test.csv, split/scale
    the data and run the selected classifier.

    The *last* selected column is treated as the class label; everything
    before it as features.
    """
    df = pd.read_csv("temp_data/test.csv")

    # Column picker: one checkbox per column in the sidebar.
    col_names = []
    feature_list = list(df.columns)
    st.sidebar.write("Select Column Names from the Dataset")
    for col_name in feature_list:
        check_box = st.sidebar.checkbox(col_name)
        if check_box:
            col_names.append(col_name)

    try:
        df = df[col_names]
        st.write(df)
    except KeyError:
        # Stale/invalid selection; keep the full frame and let the user retry.
        pass
    try:
        # All but the last selected column are features; last is the label.
        x_iloc_list = list(range(0, len(df.columns) - 1))
        y_iloc = len(df.columns) - 1

        test_size = st.sidebar.slider("Enter Test Data Size (default 0.2)", 0.0, 0.4, 0.2, 0.1)

        X_train, X_test, y_train, y_test = preprocess(df, x_iloc_list, y_iloc, test_size)

        model = st.sidebar.selectbox(
            'Choose Model', ["LR", "KNN", "SVM", "NB", "DT", "RF"])

        classifier = classification(X_train, X_test, y_train, y_test)

        # Each model run is wrapped so a failing fit is shown in the UI
        # rather than crashing the page.
        if model == "LR":
            try:
                classifier.LR()
            except Exception as e:
                st.write(e)

        if model == "KNN":
            try:
                classifier.KNN()
            except Exception as e:
                st.write(e)

        if model == "SVM":
            # BUG FIX: this label used to read "Select Feature Selection
            # Method" (copy-pasted from the feature-selection page).
            kernel_choice = st.sidebar.selectbox('Select SVM Kernel',
                                                 ["linear", "rbf"])
            try:
                classifier.SVM(kernel_choice)
            except Exception as e:
                st.write(e)

        if model == "NB":
            try:
                classifier.NB()
            except Exception as e:
                st.write(e)

        if model == "DT":
            try:
                classifier.DT()
            except Exception as e:
                st.write(e)

        if model == "RF":
            try:
                classifier.RF()
            except Exception as e:
                st.write(e)
    except Exception:
        st.warning('Consider selecting the columns in the left bar for classification', icon="⚠️")
data_clean.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from cv2 import dft
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.impute import KNNImputer
5
+ import streamlit as st
6
+
7
+ # def remove_col(df ,i):
8
+ # df.drop([i], axis = 1,inplace = True)
9
+ # return df
10
+
11
+ # def column_delete(df, column_name):
12
+ # print("deleting the column: ", column_name)
13
+ # # new_df = (df.drop['column_name'], axis=1)
14
+ # del df[column_name]
15
+ # df.head()
16
+ # return df
17
+
18
+ # def row_delete(df, row_number):
19
+ # print("deleting the row number: ", row_number)
20
+ # df.drop(df.index[row_number])
21
+ # df.head()
22
+ # return df
23
+
24
+ # def mean_fill(df,column_name):
25
+ # mean_value=df[column_name].mean()
26
+ # filled = df[column_name].fillna(value=mean_value, inplace=True)
27
+ # return filled
28
+
29
+ # def median_fill(df,column_name):
30
+ # median_value=df[column_name].median()
31
+ # filled = df[column_name].fillna(value=median_value, inplace=True)
32
+ # return filled
33
+
34
+ # def random_fill(df):
35
+ # for i in df.columns:
36
+ # df[i+"_imputed"] = df[i]
37
+ # df[i+"_imputed"][df[i+"_imputed"].isnull()] = df[i].dropna().sample(df[i].isnull().sum()).values
38
+
39
+ # def EndDistribution(df, column_name):
40
+
41
+ # mean = df[column_name].mean()
42
+ # std = df[column_name].std()
43
+ # #calculating extreme standard deviation
44
+ # extreme = (mean + (3*std))
45
+ # df[column_name+'_median'] = df[column_name].fillna(df[column_name].median())
46
+ # df[column_name+'_end_distribution'] = df[column_name].fillna(extreme)
47
+ # return df
48
+
49
+ # #knn imputer
50
+
51
+
52
+ # def impute_knn(df):
53
+ # '''
54
+ # function for knn imputation in missing values in the data
55
+ # df - dataset provided by the users
56
+ # '''
57
+ # from sklearn.impute import KNNImputer
58
+ # imputer =KNNImputer(n_neighbors=5)
59
+
60
+ # #finding only numeric columns
61
+ # cols_num = df.select_dtypes(include=np.number).columns
62
+ # for feature in df.columns:
63
+ # #for numeric type
64
+ # if feature in cols_num:
65
+ # df[feature] = pd.DataFrame(imputer.fit_transform(np.array(df[feature]).reshape(-1, 1)))
66
+ # else:
67
+ # #for categorical type
68
+ # df[feature] = df[feature].fillna(df[feature].mode().iloc[0])
69
+ # return df
70
+
71
+ # #Z score capping
72
+ # def zScore(df):
73
+ # cols_num = df.select_dtypes(include=np.number).columns
74
+ # for i in cols_num:
75
+ # max_threshold = df[i].mean() + 3*df[i].std()
76
+ # min_threshold = df[i].mean() - 3*df[i].std()
77
+ # # df = df[(df['cgpa'] > 8.80) | (df['cgpa'] < 5.11)]
78
+ # df[i] = np.where(
79
+ # df[i]>max_threshold,
80
+ # max_threshold,
81
+ # np.where(
82
+ # df[i]<min_threshold,
83
+ # min_threshold,
84
+ # df[i]
85
+ # )
86
+ # )
87
+ # return df
88
+
89
+ # # zscore trimming
90
+ # def zScore_trim(df):
91
+ # cols_num = df.select_dtypes(include=np.number).columns
92
+ # for i in cols_num:
93
+ # max_threshold = df[i].mean() + 3*df[i].std()
94
+ # min_threshold = df[i].mean() - 3*df[i].std()
95
+ # df = df[(df[i] < max_threshold) | (df[i] > min_threshold)]
96
+ # return df
97
+
98
+ # # Ourlier using Percentile
99
+ # # trimming
100
+ # def percentile_trimming(df):
101
+ # cols_num = df.select_dtypes(include=np.number).columns
102
+ # for i in cols_num:
103
+ # percentile25 = df[i].quantile(0.25)
104
+ # percentile75 = df[i].quantile(0.75)
105
+ # iqr = percentile75 - percentile25
106
+ # max_threshold = percentile75 + 3*iqr
107
+ # min_threshold = percentile25 - 3*iqr
108
+ # df = df[(df[i] < max_threshold) | (df[i] > min_threshold)]
109
+ # return df
110
+
111
+ # #capping
112
+ # def percentile_capping(df):
113
+ # cols_num = df.select_dtypes(include=np.number).columns
114
+ # for i in cols_num:
115
+ # percentile25 = df[i].quantile(0.25)
116
+ # percentile75 = df[i].quantile(0.75)
117
+ # iqr = percentile75 - percentile25
118
+ # max_threshold = percentile75 + 3*iqr
119
+ # min_threshold = percentile25 - 3*iqr
120
+ # df[i] = np.where(
121
+ # df[i]>max_threshold,
122
+ # max_threshold,
123
+ # np.where(
124
+ # df[i]<min_threshold,
125
+ # min_threshold,
126
+ # df[i]
127
+ # )
128
+ # )
129
+ # return df
130
+
131
+ # # Function to find date column in dataframe and convert it to datetime format
132
+ # def convert_date(df):
133
+ # '''
134
+ # function parameter : dataframe
135
+ # parameter datatype : pandas.core.frame.DataFrame
136
+ # function returns : dataframe
137
+ # return datatype : pandas.core.frame.DataFrame
138
+ # function definition : takes dataframe as input and finds the date columns in the dataframe.
139
+ # if found, converts the column to datetime format.
140
+ # '''
141
+ # df = df.apply(lambda col: pd.to_datetime(col, errors='ignore') if col.dtypes == object else col, axis=0)
142
+ # return df
143
+
144
+ # # Function to find price column in dataframe
145
+ # def price_column(df):
146
+ # '''
147
+ # function parameter : dataframe
148
+ # parameter datatype : pandas.core.frame.DataFrame
149
+ # function returns : dataframe
150
+ # return datatype : pandas.core.frame.DataFrame
151
+ # function definition : takes dataframe as input and finds the price related columns in the dataframe.
152
+ # if found, renames the column to price_1.
153
+ # '''
154
+ # numeric_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]
155
+ # price_cols = [col for col in numeric_cols if col.lower().find('price') != -1 or col.lower().find('cost') != -1 or
156
+ # col.lower().find('total') != -1 or col.lower().find('amount') != -1 or col.lower().find('revenue') != -1 or
157
+ # col.lower().find('profit') != -1 or col.lower().find('margin') != -1 or col.lower().find('sales') != -1]
158
+ # if len(price_cols) > 1:
159
+ # for i in range(len(price_cols)):
160
+ # df.rename(columns={price_cols[i]: 'price_'+str(i+1)}, inplace=True)
161
+ # elif len(price_cols) == 1:
162
+ # df.rename(columns={price_cols[0]: 'price'}, inplace=True)
163
+ # return df
164
+
165
+
166
+ # def data_cleaning(df):
167
+ # import pandas as pd
168
+ # import numpy as np
169
+ # from sklearn.impute import KNNImputer
170
+ # pd.set_option('display.max_rows', 100)
171
+ # for i in df.columns:
172
+ # if ((df[i].isna().sum())/df.shape[0]) > 0.95:
173
+ # df = remove_col(df,i)
174
+ # else:
175
+ # df = df.copy()
176
+ # df = impute_knn(df)
177
+ # return df
178
+
179
+
180
+ # class missing_df:
181
+ # def __init__(self, df):
182
+ # self.df = df
183
+ # print(self.df)
184
+ #functions for handling missing values
185
+
186
class missing_df:
    """Thin holder for a dataset whose missing values are to be handled."""

    def __init__(self, dataset):
        # Keep a reference to the raw dataset; processing happens elsewhere.
        self.dataset = dataset
189
+
190
def handle_missing_value():
    """Load temp_data/test.csv, clean it, impute missing values and show it.

    Steps:
      1. drop index-artifact "Unnamed*" columns,
      2. drop columns that are more than 90% missing,
      3. best-effort coercion of object columns to int, then to datetime,
      4. impute: KNN for numeric columns, column mode for categorical ones.

    Returns the cleaned DataFrame after rendering it with st.write.
    """
    df = pd.read_csv("temp_data/test.csv")
    missing_count = df.isnull().sum().sum()
    if missing_count != 0:
        # Console-side diagnostic only; the cleaned frame is shown in the UI.
        print(f"Found total of {missing_count} missing values.")

    # Remove columns whose name starts with "Unnamed" (CSV index artifacts).
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # Drop columns having more than 90% missing values.
    for i in df.columns.to_list():
        if df[i].isna().mean().round(4) > 0.9:
            df = df.drop(i, axis=1)

    # Convert object columns holding only integer values to int dtype.
    for j in df.columns.values.tolist():
        try:
            df[j] = df[j].astype('int')
        except (ValueError, TypeError):
            # Column contains non-integer values; leave it untouched.
            # (Was a bare `except:`, which also swallowed real bugs.)
            pass

    # Find date-like object columns and convert them to datetime.
    try:
        df = df.apply(lambda col: pd.to_datetime(col, errors='ignore')
                      if col.dtypes == object else col, axis=0)
    except (ValueError, TypeError):
        pass

    # Impute missing values: KNN for numeric columns, mode for the rest.
    # NOTE(review): fitting the imputer one column at a time gives KNN a
    # single feature to work with, which degenerates towards a mean-like
    # fill — consider fitting on all numeric columns at once.
    imputer = KNNImputer(n_neighbors=3)
    cols_num = df.select_dtypes(include=np.number).columns
    for feature in df.columns:
        if feature in cols_num:
            df[feature] = pd.DataFrame(
                imputer.fit_transform(np.array(df[feature]).reshape(-1, 1)))
        else:
            df[feature] = df[feature].fillna(df[feature].mode().iloc[0])

    st.write(df)
    return df
240
+
241
+
242
+
feature_select.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import streamlit as st
3
+ import plotly.express as px
4
+ import os
5
+
6
def feature_importance_plot(model, names):
    """Bar-plot a fitted model's feature_importances_, highest first.

    model : fitted estimator exposing `feature_importances_`.
    names : column names aligned with the importances.
    """
    # Pair each feature name with its importance score.
    importance = dict(zip(names, model.feature_importances_))
    # Rank features by importance, descending.
    ranked = sorted(importance.items(), key=lambda item: item[1], reverse=True)
    plot_df = pd.DataFrame(ranked, columns=["Features", "Importance"])
    fig = px.bar(plot_df,
                 x="Features",
                 y="Importance")
    st.plotly_chart(fig)
18
+
19
+
20
+ # Feature Importance with Random Forest Classifier
21
+ from sklearn.ensemble import RandomForestClassifier
22
+
23
def random_forest_classifier(X, Y, col_names):
    """Fit a 100-tree RandomForestClassifier on (X, Y) and plot its
    feature importances against *col_names*."""
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(X, Y)
    feature_importance_plot(forest, col_names)
28
+
29
+
30
+
31
+ # Feature Importance with Extra Trees Classifier
32
+ from sklearn.ensemble import ExtraTreesClassifier
33
+
34
def extra_tree_classifier(X, Y, col_names):
    """Fit a 100-tree ExtraTreesClassifier on (X, Y) and plot its
    feature importances against *col_names*."""
    trees = ExtraTreesClassifier(n_estimators=100)
    trees.fit(X, Y)
    feature_importance_plot(trees, col_names)
39
+
40
+
41
+
42
+ from xgboost import XGBClassifier
43
+
44
def xgboost(X, Y, col_names):
    """Fit an XGBClassifier (fixed random_state for reproducibility) on
    (X, Y) and plot its feature importances against *col_names*."""
    booster = XGBClassifier(random_state=0)
    booster.fit(X, Y)
    feature_importance_plot(booster, col_names)
49
+
50
+
51
+
52
+
53
+ # primary interface for the App
54
def st_feature_selection():
    """Streamlit page: choose columns from temp_data/test.csv and rank
    feature importance with a tree-based model.

    The *last* selected column is treated as the class label; everything
    before it as features.
    """
    df = pd.read_csv("temp_data/test.csv")
    # (The previous revision computed df.select_dtypes(exclude=['object'])
    # here but never used the result; removed as dead code.)

    consider_features = st.sidebar.selectbox(
        'Choose No. of Target Features', ["All", "Select Features"])

    if consider_features == "All":
        col_names = list(df.columns)
    if consider_features == "Select Features":
        # One checkbox per column in the sidebar.
        col_names = []
        feature_list = list(df.columns)
        for col_name in feature_list:
            check_box = st.sidebar.checkbox(col_name)
            if check_box:
                col_names.append(col_name)

    df = df[col_names]
    st.write(df)

    # Considering the last selected column as the class labels.
    array = df.values
    X = array[:, 0:len(col_names) - 1]
    Y = array[:, len(col_names) - 1]

    select_method = st.sidebar.selectbox(
        'Select Feature Selection Method', ["Random Forest", "ExtraTree", "XGBoost"])

    # Each fit is wrapped so model errors (e.g. non-numeric columns) are
    # shown in the UI rather than crashing the page.
    if select_method == "Random Forest":
        try:
            random_forest_classifier(X, Y, col_names)
        except Exception as e:
            st.write(e)

    if select_method == "ExtraTree":
        try:
            extra_tree_classifier(X, Y, col_names)
        except Exception as e:
            st.write(e)

    if select_method == "XGBoost":
        try:
            xgboost(X, Y, col_names)
        except Exception as e:
            st.write(e)
+ st.write(e)
model/knn.sav ADDED
Binary file (10 kB). View file
 
model/lr.sav ADDED
Binary file (789 Bytes). View file
 
model/rf-model.pkl ADDED
Binary file (115 Bytes). View file
 
model/svm.sav ADDED
Binary file (5.78 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ joblib==1.2.0
2
+ numpy==1.21.6
3
+ pandas==1.3.5
4
+ plotly==5.13.1
5
+ python-dateutil==2.8.2
6
+ pytz==2022.7.1
7
+ six==1.16.0
8
+ scikit-learn
9
+ tenacity==8.2.2
temp_data/5000_sales_records.csv ADDED
The diff for this file is too large to render. See raw diff
 
temp_data/Electric_Production.csv ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DATE,IPG2211A2N
2
+ 1/1/1985,72.5052
3
+ 2/1/1985,70.672
4
+ 3/1/1985,62.4502
5
+ 4/1/1985,57.4714
6
+ 5/1/1985,55.3151
7
+ 6/1/1985,58.0904
8
+ 7/1/1985,62.6202
9
+ 8/1/1985,63.2485
10
+ 9/1/1985,60.5846
11
+ 10/1/1985,56.3154
12
+ 11/1/1985,58.0005
13
+ 12/1/1985,68.7145
14
+ 1/1/1986,73.3057
15
+ 2/1/1986,67.9869
16
+ 3/1/1986,62.2221
17
+ 4/1/1986,57.0329
18
+ 5/1/1986,55.8137
19
+ 6/1/1986,59.9005
20
+ 7/1/1986,65.7655
21
+ 8/1/1986,64.4816
22
+ 9/1/1986,61.0005
23
+ 10/1/1986,57.5322
24
+ 11/1/1986,59.3417
25
+ 12/1/1986,68.1354
26
+ 1/1/1987,73.8152
27
+ 2/1/1987,70.062
28
+ 3/1/1987,65.61
29
+ 4/1/1987,60.1586
30
+ 5/1/1987,58.8734
31
+ 6/1/1987,63.8918
32
+ 7/1/1987,68.8694
33
+ 8/1/1987,70.0669
34
+ 9/1/1987,64.1151
35
+ 10/1/1987,60.3789
36
+ 11/1/1987,62.4643
37
+ 12/1/1987,70.5777
38
+ 1/1/1988,79.8703
39
+ 2/1/1988,76.1622
40
+ 3/1/1988,70.2928
41
+ 4/1/1988,63.2384
42
+ 5/1/1988,61.4065
43
+ 6/1/1988,67.1097
44
+ 7/1/1988,72.9816
45
+ 8/1/1988,75.7655
46
+ 9/1/1988,67.5152
47
+ 10/1/1988,63.2832
48
+ 11/1/1988,65.1078
49
+ 12/1/1988,73.8631
50
+ 1/1/1989,77.9188
51
+ 2/1/1989,76.6822
52
+ 3/1/1989,73.3523
53
+ 4/1/1989,65.1081
54
+ 5/1/1989,63.6892
55
+ 6/1/1989,68.4722
56
+ 7/1/1989,74.0301
57
+ 8/1/1989,75.0448
58
+ 9/1/1989,69.3053
59
+ 10/1/1989,65.8735
60
+ 11/1/1989,69.0706
61
+ 12/1/1989,84.1949
62
+ 1/1/1990,84.3598
63
+ 2/1/1990,77.1726
64
+ 3/1/1990,73.1964
65
+ 4/1/1990,67.2781
66
+ 5/1/1990,65.8218
67
+ 6/1/1990,71.4654
68
+ 7/1/1990,76.614
69
+ 8/1/1990,77.1052
70
+ 9/1/1990,73.061
71
+ 10/1/1990,67.4365
72
+ 11/1/1990,68.5665
73
+ 12/1/1990,77.6839
74
+ 1/1/1991,86.0214
75
+ 2/1/1991,77.5573
76
+ 3/1/1991,73.365
77
+ 4/1/1991,67.15
78
+ 5/1/1991,68.8162
79
+ 6/1/1991,74.8448
80
+ 7/1/1991,80.0928
81
+ 8/1/1991,79.1606
82
+ 9/1/1991,73.5743
83
+ 10/1/1991,68.7538
84
+ 11/1/1991,72.5166
85
+ 12/1/1991,79.4894
86
+ 1/1/1992,85.2855
87
+ 2/1/1992,80.1643
88
+ 3/1/1992,74.5275
89
+ 4/1/1992,69.6441
90
+ 5/1/1992,67.1784
91
+ 6/1/1992,71.2078
92
+ 7/1/1992,77.5081
93
+ 8/1/1992,76.5374
94
+ 9/1/1992,72.3541
95
+ 10/1/1992,69.0286
96
+ 11/1/1992,73.4992
97
+ 12/1/1992,84.5159
98
+ 1/1/1993,87.9464
99
+ 2/1/1993,84.5561
100
+ 3/1/1993,79.4747
101
+ 4/1/1993,71.0578
102
+ 5/1/1993,67.6762
103
+ 6/1/1993,74.3297
104
+ 7/1/1993,82.1048
105
+ 8/1/1993,82.0605
106
+ 9/1/1993,74.6031
107
+ 10/1/1993,69.681
108
+ 11/1/1993,74.4292
109
+ 12/1/1993,84.2284
110
+ 1/1/1994,94.1386
111
+ 2/1/1994,87.1607
112
+ 3/1/1994,79.2456
113
+ 4/1/1994,70.9749
114
+ 5/1/1994,69.3844
115
+ 6/1/1994,77.9831
116
+ 7/1/1994,83.277
117
+ 8/1/1994,81.8872
118
+ 9/1/1994,75.6826
119
+ 10/1/1994,71.2661
120
+ 11/1/1994,75.2458
121
+ 12/1/1994,84.8147
122
+ 1/1/1995,92.4532
123
+ 2/1/1995,87.4033
124
+ 3/1/1995,81.2661
125
+ 4/1/1995,73.8167
126
+ 5/1/1995,73.2682
127
+ 6/1/1995,78.3026
128
+ 7/1/1995,85.9841
129
+ 8/1/1995,89.5467
130
+ 9/1/1995,78.5035
131
+ 10/1/1995,73.7066
132
+ 11/1/1995,79.6543
133
+ 12/1/1995,90.8251
134
+ 1/1/1996,98.9732
135
+ 2/1/1996,92.8883
136
+ 3/1/1996,86.9356
137
+ 4/1/1996,77.2214
138
+ 5/1/1996,76.6826
139
+ 6/1/1996,81.9306
140
+ 7/1/1996,85.9606
141
+ 8/1/1996,86.5562
142
+ 9/1/1996,79.1919
143
+ 10/1/1996,74.6891
144
+ 11/1/1996,81.074
145
+ 12/1/1996,90.4855
146
+ 1/1/1997,98.4613
147
+ 2/1/1997,89.7795
148
+ 3/1/1997,83.0125
149
+ 4/1/1997,76.1476
150
+ 5/1/1997,73.8471
151
+ 6/1/1997,79.7645
152
+ 7/1/1997,88.4519
153
+ 8/1/1997,87.7828
154
+ 9/1/1997,81.9386
155
+ 10/1/1997,77.5027
156
+ 11/1/1997,82.0448
157
+ 12/1/1997,92.101
158
+ 1/1/1998,94.792
159
+ 2/1/1998,87.82
160
+ 3/1/1998,86.5549
161
+ 4/1/1998,76.7521
162
+ 5/1/1998,78.0303
163
+ 6/1/1998,86.4579
164
+ 7/1/1998,93.8379
165
+ 8/1/1998,93.531
166
+ 9/1/1998,87.5414
167
+ 10/1/1998,80.0924
168
+ 11/1/1998,81.4349
169
+ 12/1/1998,91.6841
170
+ 1/1/1999,102.1348
171
+ 2/1/1999,91.1829
172
+ 3/1/1999,90.7381
173
+ 4/1/1999,80.5176
174
+ 5/1/1999,79.3887
175
+ 6/1/1999,87.8431
176
+ 7/1/1999,97.4903
177
+ 8/1/1999,96.4157
178
+ 9/1/1999,87.2248
179
+ 10/1/1999,80.6409
180
+ 11/1/1999,82.2025
181
+ 12/1/1999,94.5113
182
+ 1/1/2000,102.2301
183
+ 2/1/2000,94.2989
184
+ 3/1/2000,88.0927
185
+ 4/1/2000,81.4425
186
+ 5/1/2000,84.4552
187
+ 6/1/2000,91.0406
188
+ 7/1/2000,95.9957
189
+ 8/1/2000,99.3704
190
+ 9/1/2000,90.9178
191
+ 10/1/2000,83.1408
192
+ 11/1/2000,88.041
193
+ 12/1/2000,102.4558
194
+ 1/1/2001,109.1081
195
+ 2/1/2001,97.1717
196
+ 3/1/2001,92.8283
197
+ 4/1/2001,82.915
198
+ 5/1/2001,82.5465
199
+ 6/1/2001,90.3955
200
+ 7/1/2001,96.074
201
+ 8/1/2001,99.5534
202
+ 9/1/2001,88.281
203
+ 10/1/2001,82.686
204
+ 11/1/2001,82.9319
205
+ 12/1/2001,93.0381
206
+ 1/1/2002,102.9955
207
+ 2/1/2002,95.2075
208
+ 3/1/2002,93.2556
209
+ 4/1/2002,85.795
210
+ 5/1/2002,85.2351
211
+ 6/1/2002,93.1896
212
+ 7/1/2002,102.393
213
+ 8/1/2002,101.6293
214
+ 9/1/2002,93.3089
215
+ 10/1/2002,86.9002
216
+ 11/1/2002,88.5749
217
+ 12/1/2002,100.8003
218
+ 1/1/2003,110.1807
219
+ 2/1/2003,103.8413
220
+ 3/1/2003,94.5532
221
+ 4/1/2003,85.062
222
+ 5/1/2003,85.4653
223
+ 6/1/2003,91.0761
224
+ 7/1/2003,102.22
225
+ 8/1/2003,104.4682
226
+ 9/1/2003,92.9135
227
+ 10/1/2003,86.5047
228
+ 11/1/2003,88.5735
229
+ 12/1/2003,103.5428
230
+ 1/1/2004,113.7226
231
+ 2/1/2004,106.159
232
+ 3/1/2004,95.4029
233
+ 4/1/2004,86.7233
234
+ 5/1/2004,89.0302
235
+ 6/1/2004,95.5045
236
+ 7/1/2004,101.7948
237
+ 8/1/2004,100.2025
238
+ 9/1/2004,94.024
239
+ 10/1/2004,87.5262
240
+ 11/1/2004,89.6144
241
+ 12/1/2004,105.7263
242
+ 1/1/2005,111.1614
243
+ 2/1/2005,101.7795
244
+ 3/1/2005,98.9565
245
+ 4/1/2005,86.4776
246
+ 5/1/2005,87.2234
247
+ 6/1/2005,99.5076
248
+ 7/1/2005,108.3501
249
+ 8/1/2005,109.4862
250
+ 9/1/2005,99.1155
251
+ 10/1/2005,89.7567
252
+ 11/1/2005,90.4587
253
+ 12/1/2005,108.2257
254
+ 1/1/2006,104.4724
255
+ 2/1/2006,101.5196
256
+ 3/1/2006,98.4017
257
+ 4/1/2006,87.5093
258
+ 5/1/2006,90.0222
259
+ 6/1/2006,100.5244
260
+ 7/1/2006,110.9503
261
+ 8/1/2006,111.5192
262
+ 9/1/2006,95.7632
263
+ 10/1/2006,90.3738
264
+ 11/1/2006,92.3566
265
+ 12/1/2006,103.066
266
+ 1/1/2007,112.0576
267
+ 2/1/2007,111.8399
268
+ 3/1/2007,99.1925
269
+ 4/1/2007,90.8177
270
+ 5/1/2007,92.0587
271
+ 6/1/2007,100.9676
272
+ 7/1/2007,107.5686
273
+ 8/1/2007,114.1036
274
+ 9/1/2007,101.5316
275
+ 10/1/2007,93.0068
276
+ 11/1/2007,93.9126
277
+ 12/1/2007,106.7528
278
+ 1/1/2008,114.8331
279
+ 2/1/2008,108.2353
280
+ 3/1/2008,100.4386
281
+ 4/1/2008,90.9944
282
+ 5/1/2008,91.2348
283
+ 6/1/2008,103.9581
284
+ 7/1/2008,110.7631
285
+ 8/1/2008,107.5665
286
+ 9/1/2008,97.7183
287
+ 10/1/2008,90.9979
288
+ 11/1/2008,93.8057
289
+ 12/1/2008,109.4221
290
+ 1/1/2009,116.8316
291
+ 2/1/2009,104.4202
292
+ 3/1/2009,97.8529
293
+ 4/1/2009,88.1973
294
+ 5/1/2009,87.5366
295
+ 6/1/2009,97.2387
296
+ 7/1/2009,103.9086
297
+ 8/1/2009,105.7486
298
+ 9/1/2009,94.8823
299
+ 10/1/2009,89.2977
300
+ 11/1/2009,89.3585
301
+ 12/1/2009,110.6844
302
+ 1/1/2010,119.0166
303
+ 2/1/2010,110.533
304
+ 3/1/2010,98.2672
305
+ 4/1/2010,86.3
306
+ 5/1/2010,90.8364
307
+ 6/1/2010,104.3538
308
+ 7/1/2010,112.8066
309
+ 8/1/2010,112.9014
310
+ 9/1/2010,100.1209
311
+ 10/1/2010,88.9251
312
+ 11/1/2010,92.775
313
+ 12/1/2010,114.3266
314
+ 1/1/2011,119.488
315
+ 2/1/2011,107.3753
316
+ 3/1/2011,99.1028
317
+ 4/1/2011,89.3583
318
+ 5/1/2011,90.0698
319
+ 6/1/2011,102.8204
320
+ 7/1/2011,114.7068
321
+ 8/1/2011,113.5958
322
+ 9/1/2011,99.4712
323
+ 10/1/2011,90.3566
324
+ 11/1/2011,93.8095
325
+ 12/1/2011,107.3312
326
+ 1/1/2012,111.9646
327
+ 2/1/2012,103.3679
328
+ 3/1/2012,93.5772
329
+ 4/1/2012,87.5566
330
+ 5/1/2012,92.7603
331
+ 6/1/2012,101.14
332
+ 7/1/2012,113.0357
333
+ 8/1/2012,109.8601
334
+ 9/1/2012,96.7431
335
+ 10/1/2012,90.3805
336
+ 11/1/2012,94.3417
337
+ 12/1/2012,105.2722
338
+ 1/1/2013,115.501
339
+ 2/1/2013,106.734
340
+ 3/1/2013,102.9948
341
+ 4/1/2013,91.0092
342
+ 5/1/2013,90.9634
343
+ 6/1/2013,100.6957
344
+ 7/1/2013,110.148
345
+ 8/1/2013,108.1756
346
+ 9/1/2013,99.2809
347
+ 10/1/2013,91.7871
348
+ 11/1/2013,97.2853
349
+ 12/1/2013,113.4732
350
+ 1/1/2014,124.2549
351
+ 2/1/2014,112.8811
352
+ 3/1/2014,104.7631
353
+ 4/1/2014,90.2867
354
+ 5/1/2014,92.134
355
+ 6/1/2014,101.878
356
+ 7/1/2014,108.5497
357
+ 8/1/2014,108.194
358
+ 9/1/2014,100.4172
359
+ 10/1/2014,92.3837
360
+ 11/1/2014,99.7033
361
+ 12/1/2014,109.3477
362
+ 1/1/2015,120.2696
363
+ 2/1/2015,116.3788
364
+ 3/1/2015,104.4706
365
+ 4/1/2015,89.7461
366
+ 5/1/2015,91.093
367
+ 6/1/2015,102.6495
368
+ 7/1/2015,111.6354
369
+ 8/1/2015,110.5925
370
+ 9/1/2015,101.9204
371
+ 10/1/2015,91.5959
372
+ 11/1/2015,93.0628
373
+ 12/1/2015,103.2203
374
+ 1/1/2016,117.0837
375
+ 2/1/2016,106.6688
376
+ 3/1/2016,95.3548
377
+ 4/1/2016,89.3254
378
+ 5/1/2016,90.7369
379
+ 6/1/2016,104.0375
380
+ 7/1/2016,114.5397
381
+ 8/1/2016,115.5159
382
+ 9/1/2016,102.7637
383
+ 10/1/2016,91.4867
384
+ 11/1/2016,92.89
385
+ 12/1/2016,112.7694
386
+ 1/1/2017,114.8505
387
+ 2/1/2017,99.4901
388
+ 3/1/2017,101.0396
389
+ 4/1/2017,88.353
390
+ 5/1/2017,92.0805
391
+ 6/1/2017,102.1532
392
+ 7/1/2017,112.1538
393
+ 8/1/2017,108.9312
394
+ 9/1/2017,98.6154
395
+ 10/1/2017,93.6137
396
+ 11/1/2017,97.3359
397
+ 12/1/2017,114.7212
398
+ 1/1/2018,129.4048
temp_data/test.csv ADDED
The diff for this file is too large to render. See raw diff
 
visualization.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+
7
class one_feature:
    """Single-column visualizations (bar, pie, histogram) rendered with Streamlit/Plotly."""

    def __init__(self, df, x_col_name):
        # df: source DataFrame; x_col_name: the column to visualize
        self.df = df
        self.x_col_name = x_col_name

    def bar_plot(self):
        """Bar chart of value counts for the selected column."""
        # Compute value_counts once and reuse it (was computed twice).
        counts = self.df[self.x_col_name].value_counts()
        key = counts.keys().tolist()    # category labels
        val = counts.values.tolist()    # frequencies
        trace = go.Bar(
            x=key, y=val,
            marker=dict(color=val, colorscale='Viridis', showscale=True),
            text=val,
        )
        fig = go.Figure(data=[trace])
        st.plotly_chart(fig)

    def pi_plot(self):
        """Pie chart of value counts for the selected column."""
        counts = self.df[self.x_col_name].value_counts()
        key = counts.keys().tolist()
        val = counts.values.tolist()
        trace = go.Pie(
            labels=key,
            values=val,
            marker=dict(colors=['red']),
            # Show the raw count (not percentage) on hover.
            hoverinfo="value",
        )
        fig = go.Figure(data=[trace])
        st.plotly_chart(fig)

    def histogram_plot(self):
        """Percent-normalized histogram (40 bins) of the selected column."""
        trace = go.Histogram(x=self.df[self.x_col_name], nbinsx=40, histnorm='percent')
        fig = go.Figure(data=[trace])
        st.plotly_chart(fig)
52
+
53
class two_features:
    """Two-column relationship plots (box, violin, scatter, stacked bar, line)."""

    def __init__(self, df, x_col_name, y_col_name):
        # df: source DataFrame; x/y_col_name: columns to plot against each other
        self.df = df
        self.x_col_name = x_col_name
        self.y_col_name = y_col_name

    def box_plot(self):
        """Box plot of y distribution per x category."""
        fig = px.box(self.df, x=self.x_col_name, y=self.y_col_name)
        st.plotly_chart(fig)

    def violin_plot(self):
        """Violin plot of y distribution per x category."""
        fig = px.violin(self.df, x=self.x_col_name, y=self.y_col_name)
        st.plotly_chart(fig)

    def scatter_plot(self):
        """Scatter of x vs y, colored by y on a Viridis scale."""
        fig = px.scatter(
            self.df, x=self.x_col_name, y=self.y_col_name,
            color=self.y_col_name,
            color_continuous_scale=px.colors.sequential.Viridis,
        )
        st.plotly_chart(fig)

    def bar_plot(self):
        """Stacked bar of pair counts for (x, y) combinations."""
        # Bug fix: group into a LOCAL frame instead of overwriting self.df,
        # so later plots on this instance still see the original data.
        grouped = (self.df
                   .groupby([self.x_col_name, self.y_col_name])
                   .size()
                   .reset_index(name='quantity'))
        fig = px.bar(grouped,
                     x=self.x_col_name,
                     y='quantity',
                     color=self.y_col_name,
                     barmode='stack')
        st.plotly_chart(fig)

    def time_series(self):
        """Line plot of y over x (x presumably a date column — not validated here)."""
        fig = px.line(self.df, x=self.x_col_name, y=self.y_col_name)
        st.plotly_chart(fig)
84
+
85
class three_features:
    """Plots relating two numeric/ordinal columns with a third categorical column."""

    def __init__(self, df, x_col_name, y_col_name, category_col_name):
        # Keep the frame and the three column names chosen in the sidebar.
        self.df = df
        self.x_col_name = x_col_name
        self.y_col_name = y_col_name
        self.category_col_name = category_col_name

    def scatter_plot(self):
        """Scatter of x vs y, one color per category value."""
        figure = px.scatter(
            self.df,
            x=self.x_col_name,
            y=self.y_col_name,
            color=self.category_col_name,
        )
        st.plotly_chart(figure)

    def line_plot(self):
        """Line plot of x vs y, one line color per category value."""
        figure = px.line(
            data_frame=self.df,
            x=self.x_col_name,
            y=self.y_col_name,
            color=self.category_col_name,
        )
        st.plotly_chart(figure)
105
+
106
def st_data_visualization():
    """Sidebar-driven visualization page.

    Loads the dataset saved by the upload step (temp_data/test.csv), lets the
    user pick how many features to visualize and which plot type, then renders
    the chosen Plotly figure via Streamlit.
    """
    # Original saved dataset -> test.csv (written by the upload step in app.py).
    df = pd.read_csv("temp_data/test.csv")
    # For code testing -> 5000_sales_records.csv
    # df = pd.read_csv("temp_data/5000_sales_records.csv")
    column_list = df.columns.values.tolist()

    target_feature_no = st.sidebar.selectbox(
        'Choose No. of Target Features', ["One", "Two", "Three", "All"])

    if target_feature_no == 'One':
        st.sidebar.write("Choose One Column")
        x_col_name = st.sidebar.selectbox('Select X column', column_list)

        plot_type = st.sidebar.selectbox('Select Plot Type',
                                         ["bar", "pi", "histogram"])

        plot = one_feature(df, x_col_name)
        # Dispatch table keeps the option-to-method mapping in one place.
        {
            "bar": plot.bar_plot,
            "pi": plot.pi_plot,
            "histogram": plot.histogram_plot,
        }[plot_type]()

    elif target_feature_no == 'Two':
        st.sidebar.write("Choose Two Columns for Viewing Relationships")
        x_col_name = st.sidebar.selectbox('Select X column', column_list)
        y_col_name = st.sidebar.selectbox('Select Y column', column_list)

        plot_type = st.sidebar.selectbox(
            'Select Plot Type', ["box", "violin", "scatter", "bar", "time_series"])

        plot = two_features(df, x_col_name, y_col_name)
        {
            "box": plot.box_plot,
            "violin": plot.violin_plot,
            "scatter": plot.scatter_plot,
            "bar": plot.bar_plot,
            "time_series": plot.time_series,
        }[plot_type]()

    elif target_feature_no == 'Three':
        # Fixed: prompt previously said "Two" columns in the Three branch.
        st.sidebar.write("Choose Three Columns for Viewing Relationships")
        x_col_name = st.sidebar.selectbox('Select X column', column_list)
        y_col_name = st.sidebar.selectbox('Select Y column', column_list)

        st.sidebar.write("Choose Category Column")
        category_col_name = st.sidebar.selectbox('Select Category', column_list)

        plot_type = st.sidebar.selectbox('Select Plot Type', ["scatter", "line"])

        plot = three_features(df, x_col_name, y_col_name, category_col_name)
        {
            "scatter": plot.scatter_plot,
            "line": plot.line_plot,
        }[plot_type]()

    else:
        # Bug fix: "All" was selectable but previously rendered nothing at all,
        # leaving the user with a blank page and no explanation.
        st.write("The 'All' option is not implemented yet. "
                 "Please choose One, Two or Three target features.")