Harnitha Suresh committed on
Commit
e6a69b4
·
1 Parent(s): 2aac190

initial commit - descriptive analysis

Browse files
Files changed (1) hide show
  1. app.py +264 -0
app.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import seaborn as sns
5
+ import matplotlib.pyplot as plt
6
+ import statsmodels.api as sm
7
st.set_option('deprecation.showPyplotGlobalUse', False)

uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])

st.title("EDA: Descriptive Analyzer")
# Read the dataset once an upload is available.
# `intial_df` keeps the data exactly as uploaded; `df` is the working copy the
# EDA stages modify. They must be separate objects, otherwise an in-place
# change to `df` would also change the data that "Reset Data Set" restores.
if uploaded_file is not None:
    intial_df = pd.read_csv(uploaded_file)
    df = intial_df.copy()
17
+
18
+
19
+
20
def descriptive_analysis():
    """Show a table of user-selected summary statistics for ``df``.

    The sidebar selects a statistic group and a multiselect picks the
    functions inside that group. Each selected function is applied to the
    module-level ``df`` and becomes one row of the result table, with the
    statistic's name in a leading ``Function`` column. Selecting
    ``quantile`` adds three rows (0.25, 0.5 and 0.75).

    ``count`` and ``nunique`` are computed for every column. All other
    statistics are computed on numeric columns only, because pandas 2.x
    raises ``TypeError`` instead of skipping text columns in numeric
    reductions. Columns that have no value for a statistic are shown as
    missing in that row.
    """
    global df
    groups = {
        "Descriptive Statistics": ["count", "sum", "mean", "median", "min", "max", "std", "var", "quantile"],
        "Aggregation": ["sum", "mean", "median", "std"],
        "Value Counts": ["nunique"],
        "Quantiles and Percentiles": ["quantile"],
        "Miscellaneous Statistics": ["prod", "skew", "kurt"],
        # Groups not wired up yet: cumulative statistics, correlation and
        # covariance, histograms, central tendency (mode), missing-data
        # statistics and categorical statistics.
    }
    selected_group = st.sidebar.selectbox("Select Analysis Type", list(groups.keys()))

    st.write(f"## {selected_group}")
    selected_functions = st.multiselect(f"Select functions in {selected_group}", groups[selected_group])

    if not selected_functions:
        st.info("Please select at least one function.")
        return

    numeric_df = df.select_dtypes(include="number")
    # These reductions are meaningful for any dtype; the rest need numbers.
    any_dtype_functions = {"count", "nunique"}
    quantile_levels = (0.25, 0.5, 0.75)

    labels = []
    results = []
    for function in selected_functions:
        needs_numeric = function not in any_dtype_functions
        if needs_numeric and numeric_df.columns.empty:
            st.warning(f"'{function}' needs numeric columns, but the data set has none.")
            continue

        if function == "quantile":
            for level in quantile_levels:
                results.append(numeric_df.quantile(level))
                labels.append(f"Quantile-{level}")
        else:
            source = numeric_df if needs_numeric else df
            results.append(getattr(source, function)())
            labels.append(function)

    if not results:
        return

    # One column per statistic, aligned on the data set's column names;
    # restore the data set's own column order before transposing.
    combined = pd.concat(results, axis=1, keys=labels)
    ordered_columns = [column for column in df.columns if column in combined.index]
    results_df = combined.loc[ordered_columns].transpose()

    results_df['Function'] = labels
    results_df = results_df[['Function'] + [col for col in results_df.columns if col != 'Function']]

    st.write("### Results:")
    st.dataframe(results_df, hide_index=True)
80
+
81
def data_visualization():
    """Draw the chart type chosen in the sidebar from user-selected columns.

    Every chart is built from columns of the module-level ``df`` that the
    user picks, so the function works for any uploaded data set:

    * Line Plot / Scatter Plot: one column against another.
    * Bar Chart: mean of each selected numeric column.
    * Histogram: distribution of one numeric column (10 bins).
    * Box Plot / Violin Plot: one numeric column grouped by another column.
    * Heatmap: correlation matrix of the numeric columns.
    * Pair Plot: pairwise relationships between the numeric columns.
    * Pie Chart: share of each distinct value in one column.

    Each figure is created explicitly, passed to ``st.pyplot`` and closed
    afterwards, so nothing is left on matplotlib's global figure.
    """
    global df
    visuals = ["Line Plot", "Bar Chart", "Histogram", "Scatter Plot", "Box Plot", "Violin Plot", "Heatmap", "Pair Plot", "Pie Chart"]
    selected_chart = st.sidebar.selectbox("Select Visualization Type", list(visuals))
    st.subheader(selected_chart)

    all_cols = list(df.columns)
    numeric_cols = list(df.select_dtypes(include="number").columns)

    if not all_cols:
        st.info("The data set has no columns to plot.")
        return

    needs_numeric = {"Bar Chart", "Histogram", "Box Plot", "Violin Plot", "Heatmap", "Pair Plot"}
    if selected_chart in needs_numeric and not numeric_cols:
        st.info(f"{selected_chart} needs at least one numeric column.")
        return

    if selected_chart == "Pair Plot":
        # pairplot builds its own figure, so it cannot share the axes below.
        grid = sns.pairplot(df[numeric_cols])
        st.pyplot(grid.figure)
        plt.close(grid.figure)
        return

    fig, ax = plt.subplots()
    drawn = True

    if selected_chart in ("Line Plot", "Scatter Plot"):
        x_col = st.selectbox("Select column for x-axis:", all_cols)
        y_col = st.selectbox("Select column for y-axis:", all_cols)
        if selected_chart == "Line Plot":
            ax.plot(df[x_col], df[y_col])
        else:
            ax.scatter(df[x_col], df[y_col])
        ax.set_xlabel(str(x_col))
        ax.set_ylabel(str(y_col))

    elif selected_chart == "Bar Chart":
        cols = st.multiselect("Select columns for bar-chart", numeric_cols)
        if cols:
            ax.bar([str(col) for col in cols], df[cols].mean().to_numpy())
            ax.set_ylabel("Mean")
        else:
            st.info("Select at least one numeric column.")
            drawn = False

    elif selected_chart == "Histogram":
        hist_col = st.selectbox("Select column for histogram:", numeric_cols)
        ax.hist(df[hist_col].dropna(), bins=10)
        ax.set_xlabel(str(hist_col))

    elif selected_chart in ("Box Plot", "Violin Plot"):
        category_col = st.selectbox("Select category column (x-axis):", all_cols)
        value_col = st.selectbox("Select value column (y-axis):", numeric_cols)
        plotter = sns.boxplot if selected_chart == "Box Plot" else sns.violinplot
        plotter(x=category_col, y=value_col, data=df, ax=ax)

    elif selected_chart == "Heatmap":
        sns.heatmap(df[numeric_cols].corr(), annot=True, cmap='coolwarm', ax=ax)

    elif selected_chart == "Pie Chart":
        pie_col = st.selectbox("Select column for pie chart:", all_cols)
        counts = df[pie_col].value_counts()
        if counts.empty:
            st.info("The selected column has no values to plot.")
            drawn = False
        else:
            ax.pie(counts.to_numpy(), labels=[str(value) for value in counts.index], autopct='%1.1f%%')

    if drawn:
        st.pyplot(fig)
    plt.close(fig)
134
+
135
def collinearity_pairs():
    """Show a correlation heatmap and list the strongly correlated column pairs.

    The correlation matrix is computed once, over the numeric columns of the
    module-level ``df``. The user enters a threshold between 0 and 1; every
    unordered pair of distinct columns whose absolute correlation exceeds it
    is listed once, together with its correlation value.
    """
    global df
    from itertools import combinations

    st.write("### Collinearity")
    st.sidebar.markdown("[Collinearity](#collinearity)")

    correlation_matrix = df.select_dtypes(include="number").corr()
    if correlation_matrix.shape[0] < 2:
        st.info("At least two numeric columns are needed to measure collinearity.")
        return

    st.subheader("Heatmap")
    fig, ax = plt.subplots()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
    st.pyplot(fig)
    plt.close(fig)

    collinearity_threshold = st.number_input(
        "Enter collinearity threshold from range [0 1]:",
        min_value=0.0,
        max_value=1.0,
        value=0.7,
    )

    # Walk the upper triangle only: this skips the diagonal (a column with
    # itself) and reports each pair once instead of as both (a, b) and (b, a).
    # Undefined (NaN) correlations compare False and are left out.
    columns = list(correlation_matrix.columns)
    values = correlation_matrix.to_numpy()
    pairs = [
        (columns[i], columns[j], values[i, j])
        for i, j in combinations(range(len(columns)), 2)
        if abs(values[i, j]) > collinearity_threshold
    ]

    pairs_df = pd.DataFrame(pairs, columns=['Column1', 'Column2', 'Collinearity'])
    st.write(f"Number of distinct pairs: {len(pairs_df)}")
    st.write("Collinearity Pairs")
    st.dataframe(pairs_df)
172
+
173
def missing_values():
    """Apply the user-selected strategy for missing values to ``df``.

    Strategies:

    * ``None``: leave the data unchanged.
    * ``dropna``: drop every row that contains a missing value.
    * ``Value``: fill with a user-entered value. If the entry parses as a
      number, numeric columns are filled with that number; all other columns
      are filled with the entered text. Nothing happens until a value is
      entered.
    * ``mean``: fill numeric columns with their column mean.
    * ``Previous Value`` / ``Next Value``: forward / backward fill.
    * ``interpolate``: interpolate numeric columns.

    Every strategy rebinds the module-level ``df`` to a new frame instead of
    mutating it in place, because the module start-up code binds ``df`` and
    ``intial_df`` to the same object and the reset stage relies on
    ``intial_df`` staying untouched.
    """
    global df
    st.write("### Missing Values")
    st.sidebar.markdown("[Missing Values](#missing-values)")
    methods = ["None", "dropna", "Value", "mean", "Previous Value", "Next Value", "interpolate"]
    selected_missing = st.selectbox("Select Missing Values handling method", methods)

    if selected_missing == "dropna":
        df = df.dropna()

    elif selected_missing == "Value":
        value = st.text_input("Enter value:")
        if value != "":
            number = None
            for caster in (int, float):
                try:
                    number = caster(value)
                except ValueError:
                    continue
                break
            numeric_cols = set(df.select_dtypes(include="number").columns)
            fill_map = {
                column: number if (number is not None and column in numeric_cols) else value
                for column in df.columns
            }
            df = df.fillna(fill_map)

    elif selected_missing == "mean":
        # numeric_only avoids a TypeError on text columns.
        df = df.fillna(df.mean(numeric_only=True))

    elif selected_missing == "Previous Value":
        df = df.ffill()

    elif selected_missing == "Next Value":
        df = df.bfill()

    elif selected_missing == "interpolate":
        numeric_cols = df.select_dtypes(include="number").columns
        if len(numeric_cols) > 0:
            interpolated = df.copy()
            interpolated[numeric_cols] = interpolated[numeric_cols].interpolate()
            df = interpolated
195
+
196
+
197
def replace_value():
    """Replace every occurrence of one value in ``df`` with another.

    Both values are read as text. When the "Int" or "Float" button is
    pressed, both the old and the new value are converted to that type
    before replacing; otherwise they are replaced as strings. Nothing is
    replaced while the value to be changed is empty, and an error is shown
    if a requested numeric conversion fails.
    """
    global df
    st.write("### Replace Value")
    st.sidebar.markdown("[Replace Value](#replace-value)")
    prev = st.text_input("Enter value to be changed")
    change = st.text_input("Enter new value")
    st.text("Data Type:")
    intD = st.button("Int")
    floatD = st.button("Float")

    if prev == "":
        return

    caster = int if intD else float if floatD else None
    if caster is not None:
        try:
            prev, change = caster(prev), caster(change)
        except ValueError:
            st.error(f"Both values must be valid {caster.__name__} numbers.")
            return

    # replace() returns the new frame; with inplace=True it returns None,
    # which must never be assigned back to df.
    df = df.replace(prev, change)
214
+
215
+
216
def display_df():
    """Render the current working data set ``df`` as a table."""
    # Reading a module-level name needs no ``global`` declaration.
    st.dataframe(df)
219
+
220
+
221
def reset_df():
    """Offer a button that restores ``df`` to the data set as uploaded.

    The working frame is replaced by a copy of ``intial_df`` rather than by
    ``intial_df`` itself, so later changes to ``df`` cannot alter the data
    that the next reset restores.
    """
    global df
    global intial_df
    st.write("### Reset Data Set")
    st.sidebar.markdown("[Reset Data Set](#reset-data-set)")
    if st.button("Reset Data Set"):
        df = intial_df.copy()
        st.write("Data Set reset complete.")
230
+
231
+
232
def main():
    """Lay out the EDA page: reset, column dropping, data preview, statistics.

    Works on the module-level ``df``; dropping columns rebinds it to the
    reduced frame before the later stages run.
    """
    global df
    st.sidebar.title("EDA Stages")
    reset_df()

    # Stage: drop columns
    st.sidebar.markdown("[Drop columns](#drop-columns)")
    st.write("### Drop columns")
    columns_to_drop = st.multiselect("Select any columns to be dropped", df.columns)
    if columns_to_drop:
        df = df.drop(columns=columns_to_drop)
        st.write(f"Columns Dropped:{columns_to_drop}")

    # Stage: data set preview (shown only when the button is pressed)
    st.sidebar.markdown("[Dataset](#dataset)")
    st.write("### Dataset")
    if st.button("Show Dataset"):
        display_df()

    # Stage: descriptive statistics
    descriptive_analysis()

    # Stages that exist but are not enabled yet:
    # replace_value()
    # missing_values()
    # collinearity_pairs()
    # data_visualization()
256
+
257
+
258
+
259
# Build the page only after a CSV has been uploaded; before that the
# module-level data frames do not exist yet.
if uploaded_file is not None:
    main()
263
+
264
+