RahulParajuli12 commited on
Commit
2735152
·
1 Parent(s): e64e826

Added application file

Browse files
__pycache__/classification.cpython-310.pyc ADDED
Binary file (6.59 kB). View file
 
__pycache__/data_clean.cpython-310.pyc ADDED
Binary file (1.74 kB). View file
 
__pycache__/feature_select.cpython-310.pyc ADDED
Binary file (2.56 kB). View file
 
__pycache__/visualization.cpython-310.pyc ADDED
Binary file (5.13 kB). View file
 
app.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import os
4
+ import json
5
+ import csv
6
+ # import sys
7
+ # sys.path.append('scripts/')
8
+
9
+ from visualization import st_data_visualization
10
+ from feature_select import st_feature_selection
11
+ from classification import st_classification
12
+ from data_clean import handle_missing_value
13
+
14
def try_read_df(f, f_name):
    """Read an uploaded file into a pandas DataFrame based on its extension.

    Parameters
    ----------
    f : file-like object
        The uploaded file buffer (as provided by st.file_uploader).
    f_name : str
        Original file name; only its extension is used to pick a parser.

    Returns
    -------
    pandas.DataFrame or None
        Parsed data, or None when the extension is unsupported or parsing
        fails (the error is surfaced in the Streamlit UI instead of raised).
    """
    _, file_extension = os.path.splitext(f_name)
    # BUG FIX: matching was case-sensitive and mixed startswith/endswith,
    # so "Data.CSV" or "data.JSON" silently fell through to the error
    # branch.  Normalise once and compare the whole extension.
    file_extension = file_extension.lower()
    try:
        if file_extension.startswith(".xls"):
            # ".xls" prefix covers .xls, .xlsx, .xlsm, ...
            return pd.read_excel(f)
        elif file_extension == ".csv":
            return pd.read_csv(f)
        elif file_extension == ".json":
            return pd.read_json(f)
        else:
            st.write("File Type did not match")
    except Exception as e:
        # Surface parser errors in the UI instead of crashing the app.
        st.write(e)
27
+
28
def main():
    """Render the Streamlit app.

    Shows usage instructions, lets the user upload a dataset (persisted to
    temp_data/test.csv for the other pages), and dispatches to the
    visualization / cleanup / feature-selection / classification pages.
    """
    st.title("Rahuls Data Science App")
    st.write("This is a Data Science App for Data Visualization, Data Cleaning, Feature Selection and Classification")
    st.write("This App is built using Streamlit and Python")

    st.subheader("How to use the app")
    # Procedure to upload and use other functions.
    st.write("- Use left side bar that says browse, to upload the files.")
    st.write("- Upload a CSV/Excel file and then choose the functionality you want to use.")
    st.write("- The file will be saved in the temp_data folder.")
    st.write("- Use side bar to navigate to other functionalities.")
    st.write("- The file will be deleted after the session is closed.")
    st.subheader("The app is still in development phase.")
    st.write("Need to add more functionalities in data clean up and feature selection")
    st.write("This App is built by Rahul Parajuli.")
    st.subheader("Start by uploading and see the results below - ")
    st.write("There is also a sample file 'Titanic Dataset' in the program so go ahead and press on app functionality and choose data visualization")

    # SideBar Settings
    st.sidebar.title("TBF Control Panel")
    st.sidebar.info(
        "The Byte Factory"
    )

    # App functionalities.
    primary_function = st.sidebar.selectbox(
        'Choose App Functionality', ["Upload CSV File", "Data Visualization",
                                     "Data Cleanup", "Feature Selection", "Classification"])

    if primary_function == "Upload CSV File":
        uploaded_file = st.sidebar.file_uploader("Upload a CSV/Excel file", accept_multiple_files=False,
                                                 type=("csv", "xls", "json"))

        if uploaded_file is not None:
            data = try_read_df(uploaded_file, uploaded_file.name)
            st.write("Here are the first ten rows of the File")
            st.table(data.head(10))
            file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type,
                            "FileSize": uploaded_file.size}
            st.sidebar.write(file_details)
            # BUG FIX: the raw upload bytes used to be dumped verbatim into
            # temp_data/test.csv, so Excel/JSON uploads broke every page
            # that later calls pd.read_csv("temp_data/test.csv").  Persist
            # the *parsed* DataFrame as CSV instead, and make sure the
            # temp directory exists before writing.
            if data is not None:
                os.makedirs("temp_data", exist_ok=True)
                data.to_csv(os.path.join("temp_data", "test.csv"), index=False)

    if primary_function == "Data Visualization":
        st_data_visualization()
    if primary_function == "Data Cleanup":
        handle_missing_value()
    if primary_function == "Feature Selection":
        st_feature_selection()
    if primary_function == "Classification":
        st_classification()


if __name__ == '__main__':
    main()
classification.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ # import matplotlib.pyplot as plt
4
+ # import seaborn as sns
5
+ from sklearn.metrics import classification_report, confusion_matrix
6
+ import joblib
7
+ import streamlit as st
8
+ import os
9
+ import plotly.express as px
10
+
11
def preprocess(dataset, x_iloc_list, y_iloc, testSize):
    """Split *dataset* into scaled train/test feature matrices and labels.

    dataset     : pandas.DataFrame holding features and the label column.
    x_iloc_list : positional indices of the feature columns.
    y_iloc      : positional index of the label column.
    testSize    : fraction of rows held out for the test split.

    Returns (X_train, X_test, y_train, y_test); the feature matrices are
    standardised with the scaler fit on the training split only.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    features = dataset.iloc[:, x_iloc_list].values
    labels = dataset.iloc[:, y_iloc].values

    # Deterministic split so repeated runs of the app agree.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=testSize, random_state=0)

    # Standardise: fit on train only, then apply the same transform to test.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test
26
+
27
+
28
class classification:
    """Train and evaluate several scikit-learn classifiers in Streamlit.

    Each model method (LR/KNN/SVM/NB/DT/RF) fits on the training split,
    persists the fitted model with joblib, and reports metrics
    (classification report, confusion matrix, accuracy, heatmap) on the
    held-out test split.
    """

    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def accuracy(self, confusion_matrix):
        """Return overall accuracy: trace(cm) / sum(cm).

        `confusion_matrix` must support 2-D indexing cm[i, j]
        (e.g. a numpy array as returned by sklearn).
        """
        # Renamed the accumulator: the original shadowed the builtin `sum`.
        correct, total = 0, 0
        for i in range(len(confusion_matrix)):
            for j in range(len(confusion_matrix[0])):
                if i == j:
                    correct += confusion_matrix[i, j]
                total += confusion_matrix[i, j]
        return correct / total

    def classification_report_plot(self, clf_report):
        """Render the (dict-form) classification report as a plotly heatmap."""
        # Drop the last row (support) before transposing, as in the report dict.
        fig = px.imshow(pd.DataFrame(clf_report).iloc[:-1, :].T)
        st.plotly_chart(fig)

    def _fit_dump_predict(self, model, path):
        """Fit *model* on the training split, persist it to *path* with
        joblib, and return predictions for the test split."""
        model.fit(self.X_train, self.y_train)
        joblib.dump(model, path)
        return model.predict(self.X_test)

    def _evaluate(self, title, y_pred):
        """Shared reporting: header, report table, confusion matrix,
        accuracy and heatmap for the given predictions.

        (This replaces six near-identical copies of the same code.)
        """
        separator = "-" * (len(title) + 8)
        st.write("\n")
        st.write(separator)
        st.write("### " + title + " ###")
        st.write(separator)
        st.write('Classification Report: ')
        clf = classification_report(self.y_test, y_pred, output_dict=True)
        st.table(pd.DataFrame(clf))
        st.write('Confusion Matrix: ')
        cm = confusion_matrix(self.y_test, y_pred)
        st.table(pd.DataFrame(cm))
        # BUG FIX: this value is overall accuracy, but the original
        # labelled it "Precision", which is a different metric.
        st.write('Accuracy: ', self.accuracy(cm) * 100, '%')
        self.classification_report_plot(clf)

    def LR(self):
        from sklearn.linear_model import LogisticRegression
        y_pred = self._fit_dump_predict(LogisticRegression(), "model/lr.sav")
        # BUG FIX: the header previously said "Random Forest Classifier"
        # (copy-paste error).
        self._evaluate("Logistic Regression Classifier", y_pred)

    def KNN(self):
        from sklearn.neighbors import KNeighborsClassifier
        y_pred = self._fit_dump_predict(KNeighborsClassifier(), "model/knn.sav")
        self._evaluate("K-Neighbors Classifier", y_pred)

    def SVM(self, kernel_type):
        """kernel_type: 'linear' or 'rbf' (Gaussian)."""
        from sklearn.svm import SVC
        y_pred = self._fit_dump_predict(SVC(kernel=kernel_type), "model/svm.sav")
        self._evaluate("Support Vector Classifier (" + kernel_type + ")", y_pred)

    def NB(self):
        from sklearn.naive_bayes import GaussianNB
        y_pred = self._fit_dump_predict(GaussianNB(), "model/nb.sav")
        self._evaluate("Naive Bayes Classifier", y_pred)

    def DT(self):
        from sklearn.tree import DecisionTreeClassifier
        y_pred = self._fit_dump_predict(DecisionTreeClassifier(), "model/tree.sav")
        self._evaluate("Decision Tree Classifier", y_pred)

    def RF(self):
        from sklearn.ensemble import RandomForestClassifier
        y_pred = self._fit_dump_predict(
            RandomForestClassifier(n_estimators=10, criterion='entropy'),
            "model/rf-model.pkl")
        self._evaluate("Random Forest Classifier", y_pred)
181
+
182
+ # primary App interfacing function for classification
183
def st_classification():
    """Streamlit page: pick columns from temp_data/test.csv, split/scale
    the data and run the selected classifier.

    The *last* selected column is treated as the class label; everything
    before it as features.
    """
    df = pd.read_csv("temp_data/test.csv")

    # Column picker: one checkbox per column in the sidebar.
    col_names = []
    feature_list = list(df.columns)
    st.sidebar.write("Select Column Names from the Dataset")
    for col_name in feature_list:
        check_box = st.sidebar.checkbox(col_name)
        if check_box:
            col_names.append(col_name)

    try:
        df = df[col_names]
        st.write(df)
    except KeyError:
        # Stale/invalid selection; keep the full frame and let the user retry.
        pass
    try:
        # All but the last selected column are features; last is the label.
        x_iloc_list = list(range(0, len(df.columns) - 1))
        y_iloc = len(df.columns) - 1

        test_size = st.sidebar.slider("Enter Test Data Size (default 0.2)", 0.0, 0.4, 0.2, 0.1)

        X_train, X_test, y_train, y_test = preprocess(df, x_iloc_list, y_iloc, test_size)

        model = st.sidebar.selectbox(
            'Choose Model', ["LR", "KNN", "SVM", "NB", "DT", "RF"])

        classifier = classification(X_train, X_test, y_train, y_test)

        # Each model run is wrapped so a failing fit is shown in the UI
        # rather than crashing the page.
        if model == "LR":
            try:
                classifier.LR()
            except Exception as e:
                st.write(e)

        if model == "KNN":
            try:
                classifier.KNN()
            except Exception as e:
                st.write(e)

        if model == "SVM":
            # BUG FIX: this label used to read "Select Feature Selection
            # Method" (copy-pasted from the feature-selection page).
            kernel_choice = st.sidebar.selectbox('Select SVM Kernel',
                                                 ["linear", "rbf"])
            try:
                classifier.SVM(kernel_choice)
            except Exception as e:
                st.write(e)

        if model == "NB":
            try:
                classifier.NB()
            except Exception as e:
                st.write(e)

        if model == "DT":
            try:
                classifier.DT()
            except Exception as e:
                st.write(e)

        if model == "RF":
            try:
                classifier.RF()
            except Exception as e:
                st.write(e)
    except Exception:
        st.warning('Consider selecting the columns in the left bar for classification', icon="⚠️")
data_clean.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from cv2 import dft
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.impute import KNNImputer
5
+ import streamlit as st
6
+
7
+ # def remove_col(df ,i):
8
+ # df.drop([i], axis = 1,inplace = True)
9
+ # return df
10
+
11
+ # def column_delete(df, column_name):
12
+ # print("deleting the column: ", column_name)
13
+ # # new_df = (df.drop['column_name'], axis=1)
14
+ # del df[column_name]
15
+ # df.head()
16
+ # return df
17
+
18
+ # def row_delete(df, row_number):
19
+ # print("deleting the row number: ", row_number)
20
+ # df.drop(df.index[row_number])
21
+ # df.head()
22
+ # return df
23
+
24
+ # def mean_fill(df,column_name):
25
+ # mean_value=df[column_name].mean()
26
+ # filled = df[column_name].fillna(value=mean_value, inplace=True)
27
+ # return filled
28
+
29
+ # def median_fill(df,column_name):
30
+ # median_value=df[column_name].median()
31
+ # filled = df[column_name].fillna(value=median_value, inplace=True)
32
+ # return filled
33
+
34
+ # def random_fill(df):
35
+ # for i in df.columns:
36
+ # df[i+"_imputed"] = df[i]
37
+ # df[i+"_imputed"][df[i+"_imputed"].isnull()] = df[i].dropna().sample(df[i].isnull().sum()).values
38
+
39
+ # def EndDistribution(df, column_name):
40
+
41
+ # mean = df[column_name].mean()
42
+ # std = df[column_name].std()
43
+ # #calculating extreme standard deviation
44
+ # extreme = (mean + (3*std))
45
+ # df[column_name+'_median'] = df[column_name].fillna(df[column_name].median())
46
+ # df[column_name+'_end_distribution'] = df[column_name].fillna(extreme)
47
+ # return df
48
+
49
+ # #knn imputer
50
+
51
+
52
+ # def impute_knn(df):
53
+ # '''
54
+ # function for knn imputation in missing values in the data
55
+ # df - dataset provided by the users
56
+ # '''
57
+ # from sklearn.impute import KNNImputer
58
+ # imputer =KNNImputer(n_neighbors=5)
59
+
60
+ # #finding only numeric columns
61
+ # cols_num = df.select_dtypes(include=np.number).columns
62
+ # for feature in df.columns:
63
+ # #for numeric type
64
+ # if feature in cols_num:
65
+ # df[feature] = pd.DataFrame(imputer.fit_transform(np.array(df[feature]).reshape(-1, 1)))
66
+ # else:
67
+ # #for categorical type
68
+ # df[feature] = df[feature].fillna(df[feature].mode().iloc[0])
69
+ # return df
70
+
71
+ # #Z score capping
72
+ # def zScore(df):
73
+ # cols_num = df.select_dtypes(include=np.number).columns
74
+ # for i in cols_num:
75
+ # max_threshold = df[i].mean() + 3*df[i].std()
76
+ # min_threshold = df[i].mean() - 3*df[i].std()
77
+ # # df = df[(df['cgpa'] > 8.80) | (df['cgpa'] < 5.11)]
78
+ # df[i] = np.where(
79
+ # df[i]>max_threshold,
80
+ # max_threshold,
81
+ # np.where(
82
+ # df[i]<min_threshold,
83
+ # min_threshold,
84
+ # df[i]
85
+ # )
86
+ # )
87
+ # return df
88
+
89
+ # # zscore trimming
90
+ # def zScore_trim(df):
91
+ # cols_num = df.select_dtypes(include=np.number).columns
92
+ # for i in cols_num:
93
+ # max_threshold = df[i].mean() + 3*df[i].std()
94
+ # min_threshold = df[i].mean() - 3*df[i].std()
95
+ # df = df[(df[i] < max_threshold) | (df[i] > min_threshold)]
96
+ # return df
97
+
98
+ # # Ourlier using Percentile
99
+ # # trimming
100
+ # def percentile_trimming(df):
101
+ # cols_num = df.select_dtypes(include=np.number).columns
102
+ # for i in cols_num:
103
+ # percentile25 = df[i].quantile(0.25)
104
+ # percentile75 = df[i].quantile(0.75)
105
+ # iqr = percentile75 - percentile25
106
+ # max_threshold = percentile75 + 3*iqr
107
+ # min_threshold = percentile25 - 3*iqr
108
+ # df = df[(df[i] < max_threshold) | (df[i] > min_threshold)]
109
+ # return df
110
+
111
+ # #capping
112
+ # def percentile_capping(df):
113
+ # cols_num = df.select_dtypes(include=np.number).columns
114
+ # for i in cols_num:
115
+ # percentile25 = df[i].quantile(0.25)
116
+ # percentile75 = df[i].quantile(0.75)
117
+ # iqr = percentile75 - percentile25
118
+ # max_threshold = percentile75 + 3*iqr
119
+ # min_threshold = percentile25 - 3*iqr
120
+ # df[i] = np.where(
121
+ # df[i]>max_threshold,
122
+ # max_threshold,
123
+ # np.where(
124
+ # df[i]<min_threshold,
125
+ # min_threshold,
126
+ # df[i]
127
+ # )
128
+ # )
129
+ # return df
130
+
131
+ # # Function to find date column in dataframe and convert it to datetime format
132
+ # def convert_date(df):
133
+ # '''
134
+ # function parameter : dataframe
135
+ # parameter datatype : pandas.core.frame.DataFrame
136
+ # function returns : dataframe
137
+ # return datatype : pandas.core.frame.DataFrame
138
+ # function definition : takes dataframe as input and finds the date columns in the dataframe.
139
+ # if found, converts the column to datetime format.
140
+ # '''
141
+ # df = df.apply(lambda col: pd.to_datetime(col, errors='ignore') if col.dtypes == object else col, axis=0)
142
+ # return df
143
+
144
+ # # Function to find price column in dataframe
145
+ # def price_column(df):
146
+ # '''
147
+ # function parameter : dataframe
148
+ # parameter datatype : pandas.core.frame.DataFrame
149
+ # function returns : dataframe
150
+ # return datatype : pandas.core.frame.DataFrame
151
+ # function definition : takes dataframe as input and finds the price related columns in the dataframe.
152
+ # if found, renames the column to price_1.
153
+ # '''
154
+ # numeric_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]
155
+ # price_cols = [col for col in numeric_cols if col.lower().find('price') != -1 or col.lower().find('cost') != -1 or
156
+ # col.lower().find('total') != -1 or col.lower().find('amount') != -1 or col.lower().find('revenue') != -1 or
157
+ # col.lower().find('profit') != -1 or col.lower().find('margin') != -1 or col.lower().find('sales') != -1]
158
+ # if len(price_cols) > 1:
159
+ # for i in range(len(price_cols)):
160
+ # df.rename(columns={price_cols[i]: 'price_'+str(i+1)}, inplace=True)
161
+ # elif len(price_cols) == 1:
162
+ # df.rename(columns={price_cols[0]: 'price'}, inplace=True)
163
+ # return df
164
+
165
+
166
+ # def data_cleaning(df):
167
+ # import pandas as pd
168
+ # import numpy as np
169
+ # from sklearn.impute import KNNImputer
170
+ # pd.set_option('display.max_rows', 100)
171
+ # for i in df.columns:
172
+ # if ((df[i].isna().sum())/df.shape[0]) > 0.95:
173
+ # df = remove_col(df,i)
174
+ # else:
175
+ # df = df.copy()
176
+ # df = impute_knn(df)
177
+ # return df
178
+
179
+
180
+ # class missing_df:
181
+ # def __init__(self, df):
182
+ # self.df = df
183
+ # print(self.df)
184
+ #functions for handling missing values
185
+
186
class missing_df:
    """Thin holder for a dataset whose missing values are to be handled."""

    def __init__(self, dataset):
        # Keep a reference to the raw dataset; processing happens elsewhere.
        self.dataset = dataset
189
+
190
def handle_missing_value():
    """Load temp_data/test.csv, clean it, impute missing values and show it.

    Steps:
      1. drop index-artifact "Unnamed*" columns,
      2. drop columns that are more than 90% missing,
      3. best-effort coercion of object columns to int, then to datetime,
      4. impute: KNN for numeric columns, column mode for categorical ones.

    Returns the cleaned DataFrame after rendering it with st.write.
    """
    df = pd.read_csv("temp_data/test.csv")
    missing_count = df.isnull().sum().sum()
    if missing_count != 0:
        # Console-side diagnostic only; the cleaned frame is shown in the UI.
        print(f"Found total of {missing_count} missing values.")

    # Remove columns whose name starts with "Unnamed" (CSV index artifacts).
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # Drop columns having more than 90% missing values.
    for i in df.columns.to_list():
        if df[i].isna().mean().round(4) > 0.9:
            df = df.drop(i, axis=1)

    # Convert object columns holding only integer values to int dtype.
    for j in df.columns.values.tolist():
        try:
            df[j] = df[j].astype('int')
        except (ValueError, TypeError):
            # Column contains non-integer values; leave it untouched.
            # (Was a bare `except:`, which also swallowed real bugs.)
            pass

    # Find date-like object columns and convert them to datetime.
    try:
        df = df.apply(lambda col: pd.to_datetime(col, errors='ignore')
                      if col.dtypes == object else col, axis=0)
    except (ValueError, TypeError):
        pass

    # Impute missing values: KNN for numeric columns, mode for the rest.
    # NOTE(review): fitting the imputer one column at a time gives KNN a
    # single feature to work with, which degenerates towards a mean-like
    # fill — consider fitting on all numeric columns at once.
    imputer = KNNImputer(n_neighbors=3)
    cols_num = df.select_dtypes(include=np.number).columns
    for feature in df.columns:
        if feature in cols_num:
            df[feature] = pd.DataFrame(
                imputer.fit_transform(np.array(df[feature]).reshape(-1, 1)))
        else:
            df[feature] = df[feature].fillna(df[feature].mode().iloc[0])

    st.write(df)
    return df
240
+
241
+
242
+
feature_select.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import streamlit as st
3
+ import plotly.express as px
4
+ import os
5
+
6
def feature_importance_plot(model, names):
    """Bar-plot a fitted model's feature_importances_, highest first.

    model : fitted estimator exposing `feature_importances_`.
    names : column names aligned with the importances.
    """
    # Pair each feature name with its importance score.
    importance = dict(zip(names, model.feature_importances_))
    # Rank features by importance, descending.
    ranked = sorted(importance.items(), key=lambda item: item[1], reverse=True)
    plot_df = pd.DataFrame(ranked, columns=["Features", "Importance"])
    fig = px.bar(plot_df,
                 x="Features",
                 y="Importance")
    st.plotly_chart(fig)
18
+
19
+
20
+ # Feature Importance with Random Forest Classifier
21
+ from sklearn.ensemble import RandomForestClassifier
22
+
23
def random_forest_classifier(X, Y, col_names):
    """Fit a 100-tree RandomForestClassifier on (X, Y) and plot its
    feature importances against *col_names*."""
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(X, Y)
    feature_importance_plot(forest, col_names)
28
+
29
+
30
+
31
+ # Feature Importance with Extra Trees Classifier
32
+ from sklearn.ensemble import ExtraTreesClassifier
33
+
34
def extra_tree_classifier(X, Y, col_names):
    """Fit a 100-tree ExtraTreesClassifier on (X, Y) and plot its
    feature importances against *col_names*."""
    trees = ExtraTreesClassifier(n_estimators=100)
    trees.fit(X, Y)
    feature_importance_plot(trees, col_names)
39
+
40
+
41
+
42
+ from xgboost import XGBClassifier
43
+
44
def xgboost(X, Y, col_names):
    """Fit an XGBClassifier (fixed random_state for reproducibility) on
    (X, Y) and plot its feature importances against *col_names*."""
    booster = XGBClassifier(random_state=0)
    booster.fit(X, Y)
    feature_importance_plot(booster, col_names)
49
+
50
+
51
+
52
+
53
+ # primary interface for the App
54
def st_feature_selection():
    """Streamlit page: choose columns from temp_data/test.csv and rank
    feature importance with a tree-based model.

    The *last* selected column is treated as the class label; everything
    before it as features.
    """
    df = pd.read_csv("temp_data/test.csv")
    # (The previous revision computed df.select_dtypes(exclude=['object'])
    # here but never used the result; removed as dead code.)

    consider_features = st.sidebar.selectbox(
        'Choose No. of Target Features', ["All", "Select Features"])

    if consider_features == "All":
        col_names = list(df.columns)
    if consider_features == "Select Features":
        # One checkbox per column in the sidebar.
        col_names = []
        feature_list = list(df.columns)
        for col_name in feature_list:
            check_box = st.sidebar.checkbox(col_name)
            if check_box:
                col_names.append(col_name)

    df = df[col_names]
    st.write(df)

    # Considering the last selected column as the class labels.
    array = df.values
    X = array[:, 0:len(col_names) - 1]
    Y = array[:, len(col_names) - 1]

    select_method = st.sidebar.selectbox(
        'Select Feature Selection Method', ["Random Forest", "ExtraTree", "XGBoost"])

    # Each fit is wrapped so model errors (e.g. non-numeric columns) are
    # shown in the UI rather than crashing the page.
    if select_method == "Random Forest":
        try:
            random_forest_classifier(X, Y, col_names)
        except Exception as e:
            st.write(e)

    if select_method == "ExtraTree":
        try:
            extra_tree_classifier(X, Y, col_names)
        except Exception as e:
            st.write(e)

    if select_method == "XGBoost":
        try:
            xgboost(X, Y, col_names)
        except Exception as e:
            st.write(e)
+ st.write(e)
model/knn.sav ADDED
Binary file (10 kB). View file
 
model/lr.sav ADDED
Binary file (789 Bytes). View file
 
model/rf-model.pkl ADDED
Binary file (115 Bytes). View file
 
model/svm.sav ADDED
Binary file (5.78 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ joblib==1.2.0
2
+ numpy==1.21.6
3
+ pandas==1.3.5
4
+ plotly==5.13.1
5
+ python-dateutil==2.8.2
6
+ pytz==2022.7.1
7
+ six==1.16.0
8
+ scikit-learn
9
+ tenacity==8.2.2
temp_data/5000_sales_records.csv ADDED
The diff for this file is too large to render. See raw diff
 
temp_data/Electric_Production.csv ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DATE,IPG2211A2N
2
+ 1/1/1985,72.5052
3
+ 2/1/1985,70.672
4
+ 3/1/1985,62.4502
5
+ 4/1/1985,57.4714
6
+ 5/1/1985,55.3151
7
+ 6/1/1985,58.0904
8
+ 7/1/1985,62.6202
9
+ 8/1/1985,63.2485
10
+ 9/1/1985,60.5846
11
+ 10/1/1985,56.3154
12
+ 11/1/1985,58.0005
13
+ 12/1/1985,68.7145
14
+ 1/1/1986,73.3057
15
+ 2/1/1986,67.9869
16
+ 3/1/1986,62.2221
17
+ 4/1/1986,57.0329
18
+ 5/1/1986,55.8137
19
+ 6/1/1986,59.9005
20
+ 7/1/1986,65.7655
21
+ 8/1/1986,64.4816
22
+ 9/1/1986,61.0005
23
+ 10/1/1986,57.5322
24
+ 11/1/1986,59.3417
25
+ 12/1/1986,68.1354
26
+ 1/1/1987,73.8152
27
+ 2/1/1987,70.062
28
+ 3/1/1987,65.61
29
+ 4/1/1987,60.1586
30
+ 5/1/1987,58.8734
31
+ 6/1/1987,63.8918
32
+ 7/1/1987,68.8694
33
+ 8/1/1987,70.0669
34
+ 9/1/1987,64.1151
35
+ 10/1/1987,60.3789
36
+ 11/1/1987,62.4643
37
+ 12/1/1987,70.5777
38
+ 1/1/1988,79.8703
39
+ 2/1/1988,76.1622
40
+ 3/1/1988,70.2928
41
+ 4/1/1988,63.2384
42
+ 5/1/1988,61.4065
43
+ 6/1/1988,67.1097
44
+ 7/1/1988,72.9816
45
+ 8/1/1988,75.7655
46
+ 9/1/1988,67.5152
47
+ 10/1/1988,63.2832
48
+ 11/1/1988,65.1078
49
+ 12/1/1988,73.8631
50
+ 1/1/1989,77.9188
51
+ 2/1/1989,76.6822
52
+ 3/1/1989,73.3523
53
+ 4/1/1989,65.1081
54
+ 5/1/1989,63.6892
55
+ 6/1/1989,68.4722
56
+ 7/1/1989,74.0301
57
+ 8/1/1989,75.0448
58
+ 9/1/1989,69.3053
59
+ 10/1/1989,65.8735
60
+ 11/1/1989,69.0706
61
+ 12/1/1989,84.1949
62
+ 1/1/1990,84.3598
63
+ 2/1/1990,77.1726
64
+ 3/1/1990,73.1964
65
+ 4/1/1990,67.2781
66
+ 5/1/1990,65.8218
67
+ 6/1/1990,71.4654
68
+ 7/1/1990,76.614
69
+ 8/1/1990,77.1052
70
+ 9/1/1990,73.061
71
+ 10/1/1990,67.4365
72
+ 11/1/1990,68.5665
73
+ 12/1/1990,77.6839
74
+ 1/1/1991,86.0214
75
+ 2/1/1991,77.5573
76
+ 3/1/1991,73.365
77
+ 4/1/1991,67.15
78
+ 5/1/1991,68.8162
79
+ 6/1/1991,74.8448
80
+ 7/1/1991,80.0928
81
+ 8/1/1991,79.1606
82
+ 9/1/1991,73.5743
83
+ 10/1/1991,68.7538
84
+ 11/1/1991,72.5166
85
+ 12/1/1991,79.4894
86
+ 1/1/1992,85.2855
87
+ 2/1/1992,80.1643
88
+ 3/1/1992,74.5275
89
+ 4/1/1992,69.6441
90
+ 5/1/1992,67.1784
91
+ 6/1/1992,71.2078
92
+ 7/1/1992,77.5081
93
+ 8/1/1992,76.5374
94
+ 9/1/1992,72.3541
95
+ 10/1/1992,69.0286
96
+ 11/1/1992,73.4992
97
+ 12/1/1992,84.5159
98
+ 1/1/1993,87.9464
99
+ 2/1/1993,84.5561
100
+ 3/1/1993,79.4747
101
+ 4/1/1993,71.0578
102
+ 5/1/1993,67.6762
103
+ 6/1/1993,74.3297
104
+ 7/1/1993,82.1048
105
+ 8/1/1993,82.0605
106
+ 9/1/1993,74.6031
107
+ 10/1/1993,69.681
108
+ 11/1/1993,74.4292
109
+ 12/1/1993,84.2284
110
+ 1/1/1994,94.1386
111
+ 2/1/1994,87.1607
112
+ 3/1/1994,79.2456
113
+ 4/1/1994,70.9749
114
+ 5/1/1994,69.3844
115
+ 6/1/1994,77.9831
116
+ 7/1/1994,83.277
117
+ 8/1/1994,81.8872
118
+ 9/1/1994,75.6826
119
+ 10/1/1994,71.2661
120
+ 11/1/1994,75.2458
121
+ 12/1/1994,84.8147
122
+ 1/1/1995,92.4532
123
+ 2/1/1995,87.4033
124
+ 3/1/1995,81.2661
125
+ 4/1/1995,73.8167
126
+ 5/1/1995,73.2682
127
+ 6/1/1995,78.3026
128
+ 7/1/1995,85.9841
129
+ 8/1/1995,89.5467
130
+ 9/1/1995,78.5035
131
+ 10/1/1995,73.7066
132
+ 11/1/1995,79.6543
133
+ 12/1/1995,90.8251
134
+ 1/1/1996,98.9732
135
+ 2/1/1996,92.8883
136
+ 3/1/1996,86.9356
137
+ 4/1/1996,77.2214
138
+ 5/1/1996,76.6826
139
+ 6/1/1996,81.9306
140
+ 7/1/1996,85.9606
141
+ 8/1/1996,86.5562
142
+ 9/1/1996,79.1919
143
+ 10/1/1996,74.6891
144
+ 11/1/1996,81.074
145
+ 12/1/1996,90.4855
146
+ 1/1/1997,98.4613
147
+ 2/1/1997,89.7795
148
+ 3/1/1997,83.0125
149
+ 4/1/1997,76.1476
150
+ 5/1/1997,73.8471
151
+ 6/1/1997,79.7645
152
+ 7/1/1997,88.4519
153
+ 8/1/1997,87.7828
154
+ 9/1/1997,81.9386
155
+ 10/1/1997,77.5027
156
+ 11/1/1997,82.0448
157
+ 12/1/1997,92.101
158
+ 1/1/1998,94.792
159
+ 2/1/1998,87.82
160
+ 3/1/1998,86.5549
161
+ 4/1/1998,76.7521
162
+ 5/1/1998,78.0303
163
+ 6/1/1998,86.4579
164
+ 7/1/1998,93.8379
165
+ 8/1/1998,93.531
166
+ 9/1/1998,87.5414
167
+ 10/1/1998,80.0924
168
+ 11/1/1998,81.4349
169
+ 12/1/1998,91.6841
170
+ 1/1/1999,102.1348
171
+ 2/1/1999,91.1829
172
+ 3/1/1999,90.7381
173
+ 4/1/1999,80.5176
174
+ 5/1/1999,79.3887
175
+ 6/1/1999,87.8431
176
+ 7/1/1999,97.4903
177
+ 8/1/1999,96.4157
178
+ 9/1/1999,87.2248
179
+ 10/1/1999,80.6409
180
+ 11/1/1999,82.2025
181
+ 12/1/1999,94.5113
182
+ 1/1/2000,102.2301
183
+ 2/1/2000,94.2989
184
+ 3/1/2000,88.0927
185
+ 4/1/2000,81.4425
186
+ 5/1/2000,84.4552
187
+ 6/1/2000,91.0406
188
+ 7/1/2000,95.9957
189
+ 8/1/2000,99.3704
190
+ 9/1/2000,90.9178
191
+ 10/1/2000,83.1408
192
+ 11/1/2000,88.041
193
+ 12/1/2000,102.4558
194
+ 1/1/2001,109.1081
195
+ 2/1/2001,97.1717
196
+ 3/1/2001,92.8283
197
+ 4/1/2001,82.915
198
+ 5/1/2001,82.5465
199
+ 6/1/2001,90.3955
200
+ 7/1/2001,96.074
201
+ 8/1/2001,99.5534
202
+ 9/1/2001,88.281
203
+ 10/1/2001,82.686
204
+ 11/1/2001,82.9319
205
+ 12/1/2001,93.0381
206
+ 1/1/2002,102.9955
207
+ 2/1/2002,95.2075
208
+ 3/1/2002,93.2556
209
+ 4/1/2002,85.795
210
+ 5/1/2002,85.2351
211
+ 6/1/2002,93.1896
212
+ 7/1/2002,102.393
213
+ 8/1/2002,101.6293
214
+ 9/1/2002,93.3089
215
+ 10/1/2002,86.9002
216
+ 11/1/2002,88.5749
217
+ 12/1/2002,100.8003
218
+ 1/1/2003,110.1807
219
+ 2/1/2003,103.8413
220
+ 3/1/2003,94.5532
221
+ 4/1/2003,85.062
222
+ 5/1/2003,85.4653
223
+ 6/1/2003,91.0761
224
+ 7/1/2003,102.22
225
+ 8/1/2003,104.4682
226
+ 9/1/2003,92.9135
227
+ 10/1/2003,86.5047
228
+ 11/1/2003,88.5735
229
+ 12/1/2003,103.5428
230
+ 1/1/2004,113.7226
231
+ 2/1/2004,106.159
232
+ 3/1/2004,95.4029
233
+ 4/1/2004,86.7233
234
+ 5/1/2004,89.0302
235
+ 6/1/2004,95.5045
236
+ 7/1/2004,101.7948
237
+ 8/1/2004,100.2025
238
+ 9/1/2004,94.024
239
+ 10/1/2004,87.5262
240
+ 11/1/2004,89.6144
241
+ 12/1/2004,105.7263
242
+ 1/1/2005,111.1614
243
+ 2/1/2005,101.7795
244
+ 3/1/2005,98.9565
245
+ 4/1/2005,86.4776
246
+ 5/1/2005,87.2234
247
+ 6/1/2005,99.5076
248
+ 7/1/2005,108.3501
249
+ 8/1/2005,109.4862
250
+ 9/1/2005,99.1155
251
+ 10/1/2005,89.7567
252
+ 11/1/2005,90.4587
253
+ 12/1/2005,108.2257
254
+ 1/1/2006,104.4724
255
+ 2/1/2006,101.5196
256
+ 3/1/2006,98.4017
257
+ 4/1/2006,87.5093
258
+ 5/1/2006,90.0222
259
+ 6/1/2006,100.5244
260
+ 7/1/2006,110.9503
261
+ 8/1/2006,111.5192
262
+ 9/1/2006,95.7632
263
+ 10/1/2006,90.3738
264
+ 11/1/2006,92.3566
265
+ 12/1/2006,103.066
266
+ 1/1/2007,112.0576
267
+ 2/1/2007,111.8399
268
+ 3/1/2007,99.1925
269
+ 4/1/2007,90.8177
270
+ 5/1/2007,92.0587
271
+ 6/1/2007,100.9676
272
+ 7/1/2007,107.5686
273
+ 8/1/2007,114.1036
274
+ 9/1/2007,101.5316
275
+ 10/1/2007,93.0068
276
+ 11/1/2007,93.9126
277
+ 12/1/2007,106.7528
278
+ 1/1/2008,114.8331
279
+ 2/1/2008,108.2353
280
+ 3/1/2008,100.4386
281
+ 4/1/2008,90.9944
282
+ 5/1/2008,91.2348
283
+ 6/1/2008,103.9581
284
+ 7/1/2008,110.7631
285
+ 8/1/2008,107.5665
286
+ 9/1/2008,97.7183
287
+ 10/1/2008,90.9979
288
+ 11/1/2008,93.8057
289
+ 12/1/2008,109.4221
290
+ 1/1/2009,116.8316
291
+ 2/1/2009,104.4202
292
+ 3/1/2009,97.8529
293
+ 4/1/2009,88.1973
294
+ 5/1/2009,87.5366
295
+ 6/1/2009,97.2387
296
+ 7/1/2009,103.9086
297
+ 8/1/2009,105.7486
298
+ 9/1/2009,94.8823
299
+ 10/1/2009,89.2977
300
+ 11/1/2009,89.3585
301
+ 12/1/2009,110.6844
302
+ 1/1/2010,119.0166
303
+ 2/1/2010,110.533
304
+ 3/1/2010,98.2672
305
+ 4/1/2010,86.3
306
+ 5/1/2010,90.8364
307
+ 6/1/2010,104.3538
308
+ 7/1/2010,112.8066
309
+ 8/1/2010,112.9014
310
+ 9/1/2010,100.1209
311
+ 10/1/2010,88.9251
312
+ 11/1/2010,92.775
313
+ 12/1/2010,114.3266
314
+ 1/1/2011,119.488
315
+ 2/1/2011,107.3753
316
+ 3/1/2011,99.1028
317
+ 4/1/2011,89.3583
318
+ 5/1/2011,90.0698
319
+ 6/1/2011,102.8204
320
+ 7/1/2011,114.7068
321
+ 8/1/2011,113.5958
322
+ 9/1/2011,99.4712
323
+ 10/1/2011,90.3566
324
+ 11/1/2011,93.8095
325
+ 12/1/2011,107.3312
326
+ 1/1/2012,111.9646
327
+ 2/1/2012,103.3679
328
+ 3/1/2012,93.5772
329
+ 4/1/2012,87.5566
330
+ 5/1/2012,92.7603
331
+ 6/1/2012,101.14
332
+ 7/1/2012,113.0357
333
+ 8/1/2012,109.8601
334
+ 9/1/2012,96.7431
335
+ 10/1/2012,90.3805
336
+ 11/1/2012,94.3417
337
+ 12/1/2012,105.2722
338
+ 1/1/2013,115.501
339
+ 2/1/2013,106.734
340
+ 3/1/2013,102.9948
341
+ 4/1/2013,91.0092
342
+ 5/1/2013,90.9634
343
+ 6/1/2013,100.6957
344
+ 7/1/2013,110.148
345
+ 8/1/2013,108.1756
346
+ 9/1/2013,99.2809
347
+ 10/1/2013,91.7871
348
+ 11/1/2013,97.2853
349
+ 12/1/2013,113.4732
350
+ 1/1/2014,124.2549
351
+ 2/1/2014,112.8811
352
+ 3/1/2014,104.7631
353
+ 4/1/2014,90.2867
354
+ 5/1/2014,92.134
355
+ 6/1/2014,101.878
356
+ 7/1/2014,108.5497
357
+ 8/1/2014,108.194
358
+ 9/1/2014,100.4172
359
+ 10/1/2014,92.3837
360
+ 11/1/2014,99.7033
361
+ 12/1/2014,109.3477
362
+ 1/1/2015,120.2696
363
+ 2/1/2015,116.3788
364
+ 3/1/2015,104.4706
365
+ 4/1/2015,89.7461
366
+ 5/1/2015,91.093
367
+ 6/1/2015,102.6495
368
+ 7/1/2015,111.6354
369
+ 8/1/2015,110.5925
370
+ 9/1/2015,101.9204
371
+ 10/1/2015,91.5959
372
+ 11/1/2015,93.0628
373
+ 12/1/2015,103.2203
374
+ 1/1/2016,117.0837
375
+ 2/1/2016,106.6688
376
+ 3/1/2016,95.3548
377
+ 4/1/2016,89.3254
378
+ 5/1/2016,90.7369
379
+ 6/1/2016,104.0375
380
+ 7/1/2016,114.5397
381
+ 8/1/2016,115.5159
382
+ 9/1/2016,102.7637
383
+ 10/1/2016,91.4867
384
+ 11/1/2016,92.89
385
+ 12/1/2016,112.7694
386
+ 1/1/2017,114.8505
387
+ 2/1/2017,99.4901
388
+ 3/1/2017,101.0396
389
+ 4/1/2017,88.353
390
+ 5/1/2017,92.0805
391
+ 6/1/2017,102.1532
392
+ 7/1/2017,112.1538
393
+ 8/1/2017,108.9312
394
+ 9/1/2017,98.6154
395
+ 10/1/2017,93.6137
396
+ 11/1/2017,97.3359
397
+ 12/1/2017,114.7212
398
+ 1/1/2018,129.4048
temp_data/test.csv ADDED
The diff for this file is too large to render. See raw diff
 
visualization.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+
7
class one_feature:
    """Single-column visualizations (bar, pie, histogram) rendered with Streamlit/Plotly."""

    def __init__(self, df, x_col_name):
        # df: source DataFrame; x_col_name: the column to visualize
        self.df = df
        self.x_col_name = x_col_name

    def bar_plot(self):
        """Bar chart of value counts for the selected column."""
        # Compute value_counts once and reuse it (was computed twice).
        counts = self.df[self.x_col_name].value_counts()
        key = counts.keys().tolist()    # category labels
        val = counts.values.tolist()    # frequencies
        trace = go.Bar(
            x=key, y=val,
            marker=dict(color=val, colorscale='Viridis', showscale=True),
            text=val,
        )
        fig = go.Figure(data=[trace])
        st.plotly_chart(fig)

    def pi_plot(self):
        """Pie chart of value counts for the selected column."""
        counts = self.df[self.x_col_name].value_counts()
        key = counts.keys().tolist()
        val = counts.values.tolist()
        trace = go.Pie(
            labels=key,
            values=val,
            marker=dict(colors=['red']),
            # Show the raw count (not percentage) on hover.
            hoverinfo="value",
        )
        fig = go.Figure(data=[trace])
        st.plotly_chart(fig)

    def histogram_plot(self):
        """Percent-normalized histogram (40 bins) of the selected column."""
        trace = go.Histogram(x=self.df[self.x_col_name], nbinsx=40, histnorm='percent')
        fig = go.Figure(data=[trace])
        st.plotly_chart(fig)
52
+
53
class two_features:
    """Two-column relationship plots (box, violin, scatter, stacked bar, line)."""

    def __init__(self, df, x_col_name, y_col_name):
        # df: source DataFrame; x/y_col_name: columns to plot against each other
        self.df = df
        self.x_col_name = x_col_name
        self.y_col_name = y_col_name

    def box_plot(self):
        """Box plot of y distribution per x category."""
        fig = px.box(self.df, x=self.x_col_name, y=self.y_col_name)
        st.plotly_chart(fig)

    def violin_plot(self):
        """Violin plot of y distribution per x category."""
        fig = px.violin(self.df, x=self.x_col_name, y=self.y_col_name)
        st.plotly_chart(fig)

    def scatter_plot(self):
        """Scatter of x vs y, colored by y on a Viridis scale."""
        fig = px.scatter(
            self.df, x=self.x_col_name, y=self.y_col_name,
            color=self.y_col_name,
            color_continuous_scale=px.colors.sequential.Viridis,
        )
        st.plotly_chart(fig)

    def bar_plot(self):
        """Stacked bar of pair counts for (x, y) combinations."""
        # Bug fix: group into a LOCAL frame instead of overwriting self.df,
        # so later plots on this instance still see the original data.
        grouped = (self.df
                   .groupby([self.x_col_name, self.y_col_name])
                   .size()
                   .reset_index(name='quantity'))
        fig = px.bar(grouped,
                     x=self.x_col_name,
                     y='quantity',
                     color=self.y_col_name,
                     barmode='stack')
        st.plotly_chart(fig)

    def time_series(self):
        """Line plot of y over x (x presumably a date column — not validated here)."""
        fig = px.line(self.df, x=self.x_col_name, y=self.y_col_name)
        st.plotly_chart(fig)
84
+
85
class three_features:
    """Plots relating two numeric/ordinal columns with a third categorical column."""

    def __init__(self, df, x_col_name, y_col_name, category_col_name):
        # Keep the frame and the three column names chosen in the sidebar.
        self.df = df
        self.x_col_name = x_col_name
        self.y_col_name = y_col_name
        self.category_col_name = category_col_name

    def scatter_plot(self):
        """Scatter of x vs y, one color per category value."""
        figure = px.scatter(
            self.df,
            x=self.x_col_name,
            y=self.y_col_name,
            color=self.category_col_name,
        )
        st.plotly_chart(figure)

    def line_plot(self):
        """Line plot of x vs y, one line color per category value."""
        figure = px.line(
            data_frame=self.df,
            x=self.x_col_name,
            y=self.y_col_name,
            color=self.category_col_name,
        )
        st.plotly_chart(figure)
105
+
106
def st_data_visualization():
    """Sidebar-driven visualization page.

    Loads the dataset saved by the upload step (temp_data/test.csv), lets the
    user pick how many features to visualize and which plot type, then renders
    the chosen Plotly figure via Streamlit.
    """
    # Original saved dataset -> test.csv (written by the upload step in app.py).
    df = pd.read_csv("temp_data/test.csv")
    # For code testing -> 5000_sales_records.csv
    # df = pd.read_csv("temp_data/5000_sales_records.csv")
    column_list = df.columns.values.tolist()

    target_feature_no = st.sidebar.selectbox(
        'Choose No. of Target Features', ["One", "Two", "Three", "All"])

    if target_feature_no == 'One':
        st.sidebar.write("Choose One Column")
        x_col_name = st.sidebar.selectbox('Select X column', column_list)

        plot_type = st.sidebar.selectbox('Select Plot Type',
                                         ["bar", "pi", "histogram"])

        plot = one_feature(df, x_col_name)
        # Dispatch table keeps the option-to-method mapping in one place.
        {
            "bar": plot.bar_plot,
            "pi": plot.pi_plot,
            "histogram": plot.histogram_plot,
        }[plot_type]()

    elif target_feature_no == 'Two':
        st.sidebar.write("Choose Two Columns for Viewing Relationships")
        x_col_name = st.sidebar.selectbox('Select X column', column_list)
        y_col_name = st.sidebar.selectbox('Select Y column', column_list)

        plot_type = st.sidebar.selectbox(
            'Select Plot Type', ["box", "violin", "scatter", "bar", "time_series"])

        plot = two_features(df, x_col_name, y_col_name)
        {
            "box": plot.box_plot,
            "violin": plot.violin_plot,
            "scatter": plot.scatter_plot,
            "bar": plot.bar_plot,
            "time_series": plot.time_series,
        }[plot_type]()

    elif target_feature_no == 'Three':
        # Fixed: prompt previously said "Two" columns in the Three branch.
        st.sidebar.write("Choose Three Columns for Viewing Relationships")
        x_col_name = st.sidebar.selectbox('Select X column', column_list)
        y_col_name = st.sidebar.selectbox('Select Y column', column_list)

        st.sidebar.write("Choose Category Column")
        category_col_name = st.sidebar.selectbox('Select Category', column_list)

        plot_type = st.sidebar.selectbox('Select Plot Type', ["scatter", "line"])

        plot = three_features(df, x_col_name, y_col_name, category_col_name)
        {
            "scatter": plot.scatter_plot,
            "line": plot.line_plot,
        }[plot_type]()

    else:
        # Bug fix: "All" was selectable but previously rendered nothing at all,
        # leaving the user with a blank page and no explanation.
        st.write("The 'All' option is not implemented yet. "
                 "Please choose One, Two or Three target features.")