File size: 8,981 Bytes
e6a69b4
 
 
245b95b
 
 
e6a69b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
# app.py
import streamlit as st
import pandas as pd
#import seaborn as sns
#import matplotlib.pyplot as plt
#import statsmodels.api as sm
# Turn off Streamlit's warning about calling st.pyplot() without an explicit
# figure; the plotting functions in this file use that argument-less form.
# NOTE(review): confirm the installed Streamlit release still recognises
# this option name.
st.set_option('deprecation.showPyplotGlobalUse', False)

uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])

st.title("EDA: Descriptive Analyzer")

# Read the dataset once a file has been uploaded.
# `intial_df` is the untouched upload that reset_df() restores from;
# `df` is the working frame. It is a copy so that edits to the working
# frame can never alter the baseline.
if uploaded_file is not None:
    intial_df = pd.read_csv(uploaded_file)
    df = intial_df.copy()

        

# Quantile levels reported whenever "quantile" is selected.
_QUANTILES = (0.25, 0.5, 0.75)

# Reductions that are restricted to numeric columns. Without this, a CSV
# containing text columns makes several of them raise instead of returning
# a result for the numeric columns.
_NUMERIC_ONLY_FUNCTIONS = frozenset(
    {"sum", "mean", "median", "min", "max", "std", "var", "prod", "skew", "kurt"}
)


def _compute_statistics(frame, functions):
    """Return a table with one row per statistic computed on ``frame``.

    Args:
        frame: DataFrame to summarise.
        functions: Non-empty iterable of DataFrame method names. The name
            ``"quantile"`` expands to one row per level in ``_QUANTILES``.

    Returns:
        DataFrame whose first column, ``Function``, labels each row and whose
        remaining columns are the columns of ``frame`` that produced a value.
    """
    labels = []
    series = []
    for function in functions:
        if function == "quantile":
            for level in _QUANTILES:
                series.append(frame.quantile(level, numeric_only=True))
                labels.append(f"Quantile-{level}")
        elif function in _NUMERIC_ONLY_FUNCTIONS:
            series.append(getattr(frame, function)(numeric_only=True))
            labels.append(function)
        else:
            # e.g. count / nunique, which are defined for every dtype.
            series.append(getattr(frame, function)())
            labels.append(function)

    # One concat for all results; transpose so each statistic is a row.
    table = pd.concat(series, axis=1, keys=labels).transpose()
    table['Function'] = labels
    ordered = ['Function'] + [col for col in table.columns if col != 'Function']
    return table[ordered]


def descriptive_analysis():
    """Render the descriptive-statistics section for the module-level ``df``.

    The sidebar picks an analysis group, a multiselect picks the functions
    within that group, and the results are shown as a table with one row per
    function (three rows for ``quantile``). Nothing is computed until at
    least one function is selected.
    """
    # Groups still disabled in this version: Cumulative Statistics,
    # Correlation and Covariance, Histograms, Central Tendency,
    # Missing Data Statistics, Categorical Statistics.
    groups = {
        "Descriptive Statistics": ["count", "sum", "mean", "median", "min", "max", "std", "var", "quantile"],
        "Aggregation": ["sum", "mean", "median", "std"],
        "Value Counts": ["nunique"],
        "Quantiles and Percentiles": ["quantile"],
        "Miscellaneous Statistics": ["prod", "skew", "kurt"],
    }
    selected_group = st.sidebar.selectbox("Select Analysis Type", list(groups.keys()))

    st.write(f"## {selected_group}")

    selected_functions = st.multiselect(
        f"Select functions in {selected_group}", groups[selected_group]
    )

    if not selected_functions:
        st.info("Please select at least one function.")
        return

    results_df = _compute_statistics(df, selected_functions)

    st.write("### Results:")
    st.dataframe(results_df, hide_index=True)

def data_visualization():
    """Render the chart chosen in the sidebar for the module-level ``df``.

    Columns are chosen by the user instead of being hard-coded, every entry
    in the chart list has a matching branch, and each chart label draws the
    chart it names. Charts are drawn on matplotlib's current figure and shown
    with a single argument-less ``st.pyplot()`` call.

    Requires ``sns`` (seaborn) and ``plt`` (matplotlib.pyplot) to be in
    scope; their imports are commented out at the top of this file.
    """
    visuals = ["Line Plot", "Bar Chart", "Histogram", "Scatter Plot", "Box Plot",
               "Violin Plot", "Heatmap", "Pair Plot", "Pie Chart"]
    data = pd.DataFrame(df)
    selected_chart = st.sidebar.selectbox("Select Visualization Type", visuals)
    st.subheader(selected_chart)

    all_cols = list(data.columns)
    numeric_cols = list(data.select_dtypes(include="number").columns)

    if not all_cols:
        st.info("The dataset has no columns to plot.")
        return
    # Every chart except the pie chart plots at least one numeric column.
    if selected_chart != "Pie Chart" and not numeric_cols:
        st.info("This chart needs at least one numeric column.")
        return

    if selected_chart in ("Line Plot", "Scatter Plot"):
        x_col = st.selectbox("Select column for x-axis:", all_cols)
        y_col = st.selectbox("Select column for y-axis:", numeric_cols)
        draw = plt.plot if selected_chart == "Line Plot" else plt.scatter
        draw(data[x_col], data[y_col])
        plt.xlabel(str(x_col))
        plt.ylabel(str(y_col))

    elif selected_chart == "Bar Chart":
        cols = st.multiselect("Select columns for bar-chart", numeric_cols)
        if not cols:
            st.info("Please select at least one column.")
            return
        # NOTE(review): the previous bar heights were a placeholder; the
        # column mean is used here - confirm this is the intended measure.
        plt.bar([str(col) for col in cols], height=data[cols].mean())

    elif selected_chart == "Histogram":
        col = st.selectbox("Select column for histogram:", numeric_cols)
        plt.hist(data[col].dropna(), bins=10)

    elif selected_chart in ("Box Plot", "Violin Plot"):
        x_col = st.selectbox("Select grouping column (x-axis):", all_cols)
        y_col = st.selectbox("Select numeric column (y-axis):", numeric_cols)
        plotter = sns.boxplot if selected_chart == "Box Plot" else sns.violinplot
        plotter(x=x_col, y=y_col, data=data)

    elif selected_chart == "Heatmap":
        sns.heatmap(data[numeric_cols].corr(), annot=True, cmap='coolwarm')

    elif selected_chart == "Pair Plot":
        sns.pairplot(data)

    elif selected_chart == "Pie Chart":
        col = st.selectbox("Select column for pie chart:", all_cols)
        counts = data[col].value_counts()
        if counts.empty:
            st.info("The selected column has no values to plot.")
            return
        plt.pie(counts.values, labels=counts.index, autopct='%1.1f%%')

    st.pyplot()

def collinearity_pairs():
    """Show a correlation heatmap and list column pairs above a threshold.

    The correlation matrix is computed once, over numeric columns only.
    Each unordered pair of distinct columns is considered exactly once, so
    (A, B) and (B, A) are not both reported and a column is never paired
    with itself. A pair is listed when the absolute correlation is strictly
    greater than the threshold; perfectly correlated distinct columns are
    therefore included.

    Requires ``sns`` (seaborn) to be in scope; its import is commented out
    at the top of this file.
    """
    from itertools import combinations

    st.write("### Collinearity")
    st.sidebar.markdown("[Collinearity](#collinearity)")

    # numeric_only avoids a failure when the CSV contains text columns.
    correlation_matrix = df.corr(numeric_only=True)

    st.subheader("Heatmap")
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
    st.pyplot()

    # Bounds match the range stated in the prompt text.
    collinearity_threshold = st.number_input(
        "Enter collinearity threshold from range [0 1]:",
        min_value=0.0,
        max_value=1.0,
    )

    names = list(correlation_matrix.columns)
    values = correlation_matrix.to_numpy()
    pairs = []
    # Positional indexing keeps this correct even with repeated column names.
    for i, j in combinations(range(len(names)), 2):
        strength = float(values[i, j])
        # A NaN correlation compares False and is skipped.
        if abs(strength) > collinearity_threshold:
            pairs.append((names[i], names[j], strength))

    pairs_df = pd.DataFrame(pairs, columns=['Column1', 'Column2', 'Collinearity'])
    st.write(f"Number of distinct pairs: {len(pairs_df)}")
    st.write("Collinearity Pairs")
    st.dataframe(pairs_df)

def missing_values():
    """Apply the selected missing-value strategy to the module-level ``df``.

    Each strategy rebinds ``df`` to a new frame instead of mutating it in
    place, because ``df`` may be the very same object as ``intial_df`` (the
    baseline used for resets) and an in-place edit would alter both.

    Strategies:
        None: leave the data unchanged.
        dropna: drop every row that contains a missing value.
        Value: fill with the text entered by the user (skipped while the
            input is empty).
        mean: fill numeric columns with their column mean.
        Previous Value / Next Value: forward / backward fill.
        interpolate: interpolate numeric columns; other columns are left
            unchanged.
    """
    global df
    st.write("### Missing Values")
    st.sidebar.markdown("[Missing Values](#missing-values)")
    methods = ["None", "dropna", "Value", "mean", "Previous Value", "Next Value", "interpolate"]
    selected_missing = st.selectbox("Select Missing Values handling method", methods)

    if selected_missing == "dropna":
        df = df.dropna()
    elif selected_missing == "Value":
        value = st.text_input("Enter value:")
        # An untouched text box yields ""; do not fill gaps with empty strings.
        if value:
            df = df.fillna(value)
    elif selected_missing == "mean":
        # numeric_only: the mean is undefined for text columns.
        df = df.fillna(df.mean(numeric_only=True))
    elif selected_missing == "Previous Value":
        df = df.ffill()
    elif selected_missing == "Next Value":
        df = df.bfill()
    elif selected_missing == "interpolate":
        numeric_cols = df.select_dtypes(include="number").columns
        if len(numeric_cols):
            filled = df.copy()
            filled[numeric_cols] = df[numeric_cols].interpolate()
            df = filled
    
    
def replace_value():
    """Replace every occurrence of one value in the module-level ``df``.

    The user types the value to find and its replacement. Pressing "Int" or
    "Float" converts both inputs to that type before replacing; otherwise
    they are used as text. Nothing happens while the search value is empty,
    and inputs that cannot be converted are reported instead of raising.
    """
    global df
    st.write("### Replace Value")
    st.sidebar.markdown("[Replace Value](#replace-value)")
    prev = st.text_input("Enter value to be changed")
    change = st.text_input("Enter new value")
    st.text("Data Type:")
    intD = st.button("Int")
    floatD = st.button("Float")

    if not prev:
        return

    caster = None
    if intD:
        caster = int
    elif floatD:
        caster = float

    if caster is not None:
        try:
            # Convert BOTH values; the replacement must match the target type.
            prev, change = caster(prev), caster(change)
        except ValueError:
            st.error("Both values must be valid numbers for the selected data type.")
            return

    # replace() returns a new frame; with inplace=True it would return None.
    df = df.replace(prev, change)
    

def display_df():
    """Render the module-level working DataFrame ``df`` with ``st.dataframe``."""
    st.dataframe(data=df)


def reset_df():
    """Render the reset section and restore ``df`` from ``intial_df`` on click.

    The working frame is rebound to a copy of the baseline, so later edits
    to ``df`` cannot alter ``intial_df`` and a subsequent reset still
    returns the data as uploaded.
    """
    global df
    st.write("### Reset Data Set")
    st.sidebar.markdown("[Reset Data Set](#reset-data-set)")
    if st.button("Reset Data Set"):
        df = intial_df.copy()
        st.write("Data Set reset complete.")
    

def main():
    """Lay out the page: reset, column dropping, dataset preview, statistics.

    Works on the module-level ``df``; dropping columns rebinds it to the
    reduced frame before the later sections run.
    """
    global df

    st.sidebar.title("EDA Stages")
    reset_df()

    # Drop columns
    st.sidebar.markdown("[Drop columns](#drop-columns)")
    st.write("### Drop columns")
    selected_cols = st.multiselect("Select any columns to be dropped", df.columns)
    if selected_cols:
        df = df.drop(columns=selected_cols)
        st.write(f"Columns Dropped:{selected_cols}")

    # Dataset preview, shown only on the run in which the button is pressed
    st.sidebar.markdown("[Dataset](#dataset)")
    st.write("### Dataset")
    if st.button("Show Dataset"):
        display_df()

    descriptive_analysis()
    # Stages currently disabled:
    # replace_value()
    # missing_values()
    # collinearity_pairs()
    # data_visualization()
        
      

# Entry point: build the page only after a CSV has been uploaded, because
# the module-level `df` that main() works on is only created at that point.

if uploaded_file is not None:
    main()