Spaces:

Mpavan45
/

Model

Sleeping

App Files Files Community

Mpavan45 committed on Jan 6, 2025

Commit

c4f9397

verified ·

1 Parent(s): 9c761b8

Update pages/Simple EDA.py

Browse files

Files changed (1) hide show

pages/Simple EDA.py +89 -21

pages/Simple EDA.py CHANGED Viewed

@@ -1,33 +1,101 @@
 import streamlit as st
 import pandas as pd
-# EDA and Feature Engineering Page
-st.title("Simple EDA")
 st.markdown("""
-By performing simple Exploratory Data Analysis (EDA), we can examine the data, identify patterns, and detect anomalies or inconsistencies. This process allows us to clean and preprocess the dataset effectively, ensuring it is well-structured and ready for further analysis or modeling..
 """)
-# File uploader for dataset
-uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])
-if uploaded_file is not None:
-    # Read and display the dataset
-    data = pd.read_csv(uploaded_file)
-    st.write("### Uploaded Dataset:")
-    st.dataframe(data)
-    # Overview of the dataset
-    st.write("### Dataset Overview:")
-    st.write(data.describe())
-    # Missing values in the dataset
-    st.write("### Missing Values:")
-    st.write(data.isnull().sum())
-    # Correlation matrix
-    st.write("### Correlation Matrix:")
-    st.write(data.corr())
 else:
-    st.warning("Please upload a dataset to proceed with EDA.")

+import io
+
 import streamlit as st
 import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+# Configure the Streamlit app
+st.title("Exploratory Data Analysis (EDA) App")
 st.markdown("""
+This app allows you to perform basic EDA on your dataset.
+Upload your dataset to explore, clean, and visualize your data interactively.
 """)
+# File upload
+uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type="csv")
+if uploaded_file:
+    # Load dataset
+    df = pd.read_csv(uploaded_file)
+    st.subheader("Dataset Overview")
+    st.write("First 5 Rows of the Dataset:")
+    st.write(df.head())
+    # Basic Information
+    st.subheader("Basic Information about the Dataset")
+    st.write("Structure of the Dataset:")
+    buffer = []
+    df.info(buf=buffer)
+    st.text("".join(buffer))
+    st.write("Summary of Numeric Columns:")
+    st.write(df.describe())
+    st.write("Data Types of Each Column:")
+    st.write(df.dtypes)
+    # Missing Values
+    st.subheader("Missing Values")
+    st.write("Number of Missing Values per Column:")
+    st.write(df.isnull().sum())
+    # Duplicate Rows
+    st.subheader("Duplicate Rows")
+    st.write(f"Number of Duplicate Rows: {df.duplicated().sum()}")
+    # Visualize Numeric Columns
+    st.subheader("Numeric Column Visualizations")
+    st.write("Histograms:")
+    fig, ax = plt.subplots(figsize=(10, 8))
+    df.hist(ax=ax)
+    st.pyplot(fig)
+    st.write("Boxplot:")
+    fig, ax = plt.subplots()
+    sns.boxplot(data=df, orient='h', ax=ax)
+    st.pyplot(fig)
+    # Categorical Column Analysis
+    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
+    if len(categorical_columns) > 0:
+        st.subheader("Categorical Column Analysis")
+        selected_cat_col = st.selectbox("Select a Categorical Column to Analyze", categorical_columns)
+        st.write(f"Value Counts for {selected_cat_col}:")
+        st.write(df[selected_cat_col].value_counts())
+        st.write(f"Bar Plot for {selected_cat_col}:")
+        fig, ax = plt.subplots()
+        sns.countplot(x=selected_cat_col, data=df, ax=ax)
+        st.pyplot(fig)
+    else:
+        st.write("No categorical columns available in the dataset.")
+    # Correlation Matrix
+    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
+    if len(numeric_columns) > 1:
+        st.subheader("Correlation Analysis")
+        st.write("Correlation Matrix:")
+        correlation_matrix = df[numeric_columns].corr()
+        st.write(correlation_matrix)
+        st.write("Heatmap of Correlation Matrix:")
+        fig, ax = plt.subplots()
+        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
+        st.pyplot(fig)
+    else:
+        st.write("Not enough numeric columns for correlation analysis.")
+    # Save Cleaned Data
+    st.subheader("Save Cleaned Dataset")
+    if st.button("Save Dataset (after removing duplicates)"):
+        cleaned_df = df.drop_duplicates()
+        cleaned_csv = cleaned_df.to_csv(index=False).encode('utf-8')
+        st.download_button(
+            label="Download Cleaned Dataset",
+            data=cleaned_csv,
+            file_name="cleaned_dataset.csv",
+            mime="text/csv"
+        )
+        st.success("Cleaned dataset is ready for download!")
 else:
+    st.info("Please upload a CSV file to get started.")