Spaces:

HarnithaS
/

EDA_DescriptiveAnalyzer

Build error

App Files Files Community

Harnitha Suresh committed on Dec 31, 2023

Commit

e6a69b4

1 Parent(s): 2aac190

initial commit - descriptive analysis

Browse files

Files changed (1) hide show

app.py +264 -0

app.py ADDED Viewed

	@@ -0,0 +1,264 @@

+# app.py
+import streamlit as st
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+import statsmodels.api as sm
+st.set_option('deprecation.showPyplotGlobalUse', False)
+uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
+st.title("EDA: Descriptive Analyzer")
+# Read the dataset
+if uploaded_file is not None:
+    df = pd.DataFrame()
+    intial_df = pd.read_csv(uploaded_file)
+    df=intial_df
+def descriptive_analysis():
+    global df
+    groups = {
+            "Descriptive Statistics": ["count", "sum", "mean", "median", "min", "max", "std", "var", "quantile"],
+            "Aggregation": ["sum", "mean", "median", "std"], #"agg"
+            # "Cumulative Statistics": ["cumsum", "cumprod", "cummax", "cummin"],# all
+            # "Correlation and Covariance": ["corr", "cov"],#all
+            "Value Counts": [ "nunique"], #["value_counts", "unique"]
+            "Quantiles and Percentiles": ["quantile"], # showing only 0.5
+            "Miscellaneous Statistics": ["prod", "skew", "kurt"], # mad
+            # "Histograms": ["hist"],# all
+            # "Central Tendency": ["mode"],# all
+            # "Missing Data Statistics": ["isna", "notna", "dropna"],# all
+            # "Categorical Statistics": ["describe", "count_categorical"] #all
+        }
+    selected_group = st.sidebar.selectbox("Select Analysis Type", list(groups.keys()))
+    # Create separate dropdowns and result tables for the selected group
+    st.write(f"## {selected_group}")
+    # Multi-select for selecting functions in the group
+    selected_functions = st.multiselect(f"Select functions in {selected_group}", groups[selected_group])
+    if not selected_functions:
+        st.info("Please select at least one function.")
+    else:
+        # Create an empty DataFrame to store the results
+        results_df = pd.DataFrame()
+        function_list=[]
+        # Compute and concatenate results based on user selection
+        for function in selected_functions:
+            if function == "quantile":
+                # For quantile_series, user needs to provide a list of quantiles
+                #quantiles = st.text_input(f"Enter quantiles for {function} (comma-separated):", "0.25,0.5,0.75")
+                quantiles = [0.25,0.5,0.75]
+                result_25 = df.quantile(0.25)
+                result_5 = df.quantile(0.5)
+                result_75 = df.quantile(0.75)
+                result = pd.concat([result_25, result_5, result_75], axis=1)
+                function_list.append('Quantite-0.25')
+                function_list.append('Quantite-0.5')
+                function_list.append('Quantite-0.75')
+            else:
+                # For other functions, apply the selected function to the DataFrame
+                result = getattr(df, function)()
+                function_list.append(function)
+            # Concatenate the result along columns
+            results_df = pd.concat([results_df, result], axis=1)
+        # Transpose the result table
+        results_df = results_df.transpose()
+        results_df['Function'] = function_list
+        results_df = results_df[['Function'] + [col for col in results_df.columns if col != 'Function']]
+        # Display the transposed results
+        st.write("### Results:")
+        st.dataframe(results_df, hide_index = True)
+def data_visualization():
+    global df
+    visuals=["Line Plot", "Bar Chart", "Histogram","Scatter Plot", "Box Plot", "Violin Plot","Heatmap", "Pair Plot", "Pie Chart"]
+    data=pd.DataFrame(df)
+    selected_chart = st.sidebar.selectbox("Select Visualization Type", list(visuals))
+    sns.boxplot(x=df['Age'])
+    st.pyplot()
+    # Display selected chart
+    if selected_chart == "Line Plot":
+        st.subheader("Line Plot")
+        x_col=st.selectbox("Select column for x-axis:",df.columns)
+        y_col=st.selectbox("Select column for y-axis:",df.columns)
+        plt.scatter(df[x_col],df[y_col])
+        st.pyplot()
+    elif selected_chart == "Bar Chart":
+        col=st.multiselect("Select columns for bar-chart",df.columns)
+        plt.bar(col,height=[range(len(col))])
+        st.pyplot()
+    elif selected_chart == "Histogram":
+        st.subheader("Histogram")
+        plt.hist(data['value'], bins=10)
+        st.pyplot()
+    elif selected_chart == "Scatter Plot":
+        st.subheader("Scatter Plot")
+        sm.qqplot(data, line='45')
+        st.pyplot()
+    elif selected_chart == "Box Plot":
+        st.subheader("Box Plot")
+        sns.boxplot(x='category', y='value', data=data)
+        st.pyplot()
+    elif selected_chart == "Violin Plot":
+        st.subheader("Violin Plot")
+        sns.violinplot(x='category', y='value', data=data)
+        st.pyplot()
+    elif selected_chart == "Pair Plot":
+        st.subheader("Pair Plot")
+        sns.pairplot(data)
+        st.pyplot()
+    elif selected_chart == "Pie Chart":
+        st.subheader("Pie Chart")
+        sizes = [15, 30, 45]
+        labels = ['Category A', 'Category B', 'Category C']
+        plt.pie(sizes, labels=labels, autopct='%1.1f%%')
+        st.pyplot()
+def collinearity_pairs():
+    global df
+    st.write("### Collinearity")
+    st.sidebar.markdown("[Collinearity](#collinearity)")
+    # Set your collinearity threshold (e.g., 0.7)
+    st.subheader("Heatmap")
+    sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
+    st.pyplot()
+    collinearity_threshold = st.number_input("Enter collinearity threshold from range [0 1]:")
+    # Calculate the correlation matrix
+    correlation_matrix = df.corr()
+    # Find distinct column pairs with collinearity above the threshold
+    high_collinear_pairs = (
+        (correlation_matrix.abs() > collinearity_threshold) & (correlation_matrix < 1)
+    ).stack().reset_index()
+    # Rename the columns for clarity
+    high_collinear_pairs.columns = ['Column1', 'Column2', 'Collinearity']
+    # Filter for pairs with collinearity above the threshold
+    high_collinear_pairs = high_collinear_pairs[high_collinear_pairs['Collinearity']]
+    # Create a list to store the column pairs and their collinearity
+    df_col = []
+    distinct_col = set()
+    for index, row in high_collinear_pairs.iterrows():
+        col1, col2 = row['Column1'], row['Column2']
+        df_col.append([col1, col2])
+        distinct_col.add(col1)
+        distinct_col.add(col2)
+    df_col = pd.DataFrame(df_col)
+    st.write(f"Number of distinct pairs: {len(distinct_col)}")
+    st.write("Collinearity Pairs")
+    st.dataframe(df_col)
+def missing_values():
+    global df
+    st.write("### Missing Values")
+    st.sidebar.markdown("[Missing Values](#missing-values)")
+    methods=["None","dropna","Value","mean","Previous Value","Next Value","interpolate"]
+    selected_missing = st.selectbox("Select Missing Values handling method",methods)
+    if selected_missing == "None":
+        df=df
+    elif selected_missing == "dropna":
+        df.dropna(inplace=True)
+    elif selected_missing == "Value":
+        value = st.text_input("Enter value:")
+        df.fillna(value, inplace=True)
+    elif selected_missing == "mean":
+        df.fillna(df.mean(), inplace=True)
+    elif selected_missing == "Previous Value":
+        df.ffill(inplace=True)
+    elif selected_missing == "Next Value":
+        df.bfill(inplace=True)
+    elif selected_missing == "interpolate":
+        df.interpolate(inplace=True)
+def replace_value():
+    global df
+    st.write("### Replace Value")
+    st.sidebar.markdown("[Replace Value](#replace-value)")
+    prev = st.text_input("Enter value to be changed")
+    change = st.text_input("Enter new value")
+    st.text("Data Type:")
+    intD = st.button("Int")
+    floatD = st.button("Float")
+    if intD:
+        prev=int(prev)
+        new=int(prev)
+    elif floatD:
+        prev=float(prev)
+        new=float(prev)
+    df=df.replace(prev, change, inplace=True)
+def display_df():
+    global df
+    st.dataframe(df)
+def reset_df():
+    global df
+    global intial_df
+    st.write("### Reset Data Set")
+    st.sidebar.markdown("[Reset Data Set](#reset-data-set)")
+    result = st.button("Reset Data Set")
+    if result:
+        st.write("Data Set reset complete.")
+        df = intial_df
+def main():
+    global df
+    global intial_df
+    st.sidebar.title("EDA Stages")
+    reset_df()
+    st.sidebar.markdown("[Drop columns](#drop-columns)")
+    # drop columns
+    st.write("### Drop columns")
+    data_cols = df.columns
+    selected_cols = st.multiselect("Select any columns to be dropped", data_cols)
+    if selected_cols:
+        df=df.drop(columns=selected_cols)
+        st.write(f"Columns Dropped:{selected_cols}")
+    st.sidebar.markdown("[Dataset](#dataset)")
+    st.write("### Dataset")
+    res = st.button("Show Dataset")
+    if res:
+        display_df()
+    descriptive_analysis()
+    # replace_value()
+    # missing_values()
+    # collinearity_pairs()
+    # data_visualization()
+# Build the UI only once a CSV has been uploaded: `df` and `intial_df` are
+# defined only in that case, and main() depends on both.
+if uploaded_file is not None:
+    main()