Update pages/Simple EDA.py
Browse files- pages/Simple EDA.py +61 -71
pages/Simple EDA.py
CHANGED
|
@@ -6,96 +6,86 @@ import matplotlib.pyplot as plt
|
|
| 6 |
# Configure the Streamlit app
|
| 7 |
st.title("Exploratory Data Analysis (EDA) App")
|
| 8 |
st.markdown("""
|
| 9 |
-
This
|
| 10 |
-
|
| 11 |
""")
|
| 12 |
|
| 13 |
-
# File
|
| 14 |
-
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type="csv")
|
| 15 |
-
|
| 16 |
-
if uploaded_file:
|
| 17 |
-
#
|
| 18 |
-
|
| 19 |
-
st.
|
| 20 |
-
st.
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
st.
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
st.
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
st.write(
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
# Missing Values
|
| 37 |
-
st.subheader("Missing Values")
|
| 38 |
-
st.write("Number of Missing Values per Column:")
|
| 39 |
-
st.write(df.isnull().sum())
|
| 40 |
-
|
| 41 |
-
# Duplicate Rows
|
| 42 |
-
st.subheader("Duplicate Rows")
|
| 43 |
-
st.write(f"Number of Duplicate Rows: {df.duplicated().sum()}")
|
| 44 |
-
|
| 45 |
-
# Visualize Numeric Columns
|
| 46 |
-
st.subheader("Numeric Column Visualizations")
|
| 47 |
st.write("Histograms:")
|
| 48 |
-
fig, ax = plt.subplots(
|
| 49 |
-
|
| 50 |
st.pyplot(fig)
|
| 51 |
|
| 52 |
st.write("Boxplot:")
|
| 53 |
fig, ax = plt.subplots()
|
| 54 |
-
sns.boxplot(data=
|
| 55 |
st.pyplot(fig)
|
| 56 |
|
| 57 |
-
#
|
| 58 |
-
categorical_columns =
|
| 59 |
if len(categorical_columns) > 0:
|
| 60 |
-
st.subheader("Categorical Column Analysis")
|
| 61 |
selected_cat_col = st.selectbox("Select a Categorical Column to Analyze", categorical_columns)
|
|
|
|
|
|
|
| 62 |
|
| 63 |
-
st.write(f"
|
| 64 |
-
st.write(df[selected_cat_col].value_counts())
|
| 65 |
-
|
| 66 |
-
st.write(f"Bar Plot for {selected_cat_col}:")
|
| 67 |
fig, ax = plt.subplots()
|
| 68 |
-
sns.countplot(x=selected_cat_col, data=
|
| 69 |
st.pyplot(fig)
|
| 70 |
else:
|
| 71 |
-
st.write("No categorical columns available
|
| 72 |
|
| 73 |
-
# Correlation
|
| 74 |
-
numeric_columns =
|
| 75 |
if len(numeric_columns) > 1:
|
| 76 |
-
st.
|
| 77 |
-
st.write(
|
| 78 |
-
correlation_matrix = df[numeric_columns].corr()
|
| 79 |
-
st.write(correlation_matrix)
|
| 80 |
|
| 81 |
st.write("Heatmap of Correlation Matrix:")
|
| 82 |
fig, ax = plt.subplots()
|
| 83 |
-
sns.heatmap(
|
| 84 |
st.pyplot(fig)
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
#
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
else:
|
| 101 |
-
st.
|
|
|
|
| 6 |
# Configure the Streamlit app
|
| 7 |
st.title("Exploratory Data Analysis (EDA) App")
|
| 8 |
st.markdown("""
|
| 9 |
+
By performing simple Exploratory Data Analysis (EDA), we can examine the data, identify patterns, and detect anomalies or inconsistencies. This process allows us to clean and preprocess the dataset effectively, ensuring it is well-structured and ready for further analysis or modeling. Simple EDA helps uncover hidden insights, address missing or erroneous values, and optimize the data for better decision-making.
|
| 10 |
+
|
| 11 |
""")
|
| 12 |
|
| 13 |
+
# File uploader for dataset
|
| 14 |
+
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])
|
| 15 |
+
|
| 16 |
+
if uploaded_file is not None:
|
| 17 |
+
# Read and display the dataset
|
| 18 |
+
data = pd.read_csv(uploaded_file)
|
| 19 |
+
st.write("### Uploaded Dataset:")
|
| 20 |
+
st.dataframe(data)
|
| 21 |
+
|
| 22 |
+
# Overview of the dataset
|
| 23 |
+
st.write("### Dataset Overview:")
|
| 24 |
+
st.write(data.describe())
|
| 25 |
+
|
| 26 |
+
# Missing values in the dataset
|
| 27 |
+
st.write("### Missing Values:")
|
| 28 |
+
st.write(data.isnull().sum())
|
| 29 |
+
|
| 30 |
+
# Duplicate rows in the dataset
|
| 31 |
+
st.write("### Duplicate Rows:")
|
| 32 |
+
st.write(f"Number of duplicate rows: {data.duplicated().sum()}")
|
| 33 |
+
|
| 34 |
+
# Visualizations for numeric columns
|
| 35 |
+
st.write("### Numeric Column Visualizations:")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
st.write("Histograms:")
|
| 37 |
+
fig, ax = plt.subplots()
|
| 38 |
+
data.hist(ax=ax, figsize=(10, 8))
|
| 39 |
st.pyplot(fig)
|
| 40 |
|
| 41 |
st.write("Boxplot:")
|
| 42 |
fig, ax = plt.subplots()
|
| 43 |
+
sns.boxplot(data=data, orient='h', ax=ax)
|
| 44 |
st.pyplot(fig)
|
| 45 |
|
| 46 |
+
# Value counts and bar plot for categorical data
|
| 47 |
+
categorical_columns = data.select_dtypes(include=['object', 'category']).columns
|
| 48 |
if len(categorical_columns) > 0:
|
|
|
|
| 49 |
selected_cat_col = st.selectbox("Select a Categorical Column to Analyze", categorical_columns)
|
| 50 |
+
st.write(f"Value Counts for '{selected_cat_col}':")
|
| 51 |
+
st.write(data[selected_cat_col].value_counts())
|
| 52 |
|
| 53 |
+
st.write(f"Bar Plot for '{selected_cat_col}':")
|
|
|
|
|
|
|
|
|
|
| 54 |
fig, ax = plt.subplots()
|
| 55 |
+
sns.countplot(x=selected_cat_col, data=data, ax=ax)
|
| 56 |
st.pyplot(fig)
|
| 57 |
else:
|
| 58 |
+
st.write("No categorical columns available for analysis.")
|
| 59 |
|
| 60 |
+
# Correlation matrix
|
| 61 |
+
numeric_columns = data.select_dtypes(include='number').columns  # 'number' covers every numeric dtype (int32, float32, ...), not just float64/int64
|
| 62 |
if len(numeric_columns) > 1:
|
| 63 |
+
st.write("### Correlation Matrix:")
|
| 64 |
+
st.write(data[numeric_columns].corr())
|
|
|
|
|
|
|
| 65 |
|
| 66 |
st.write("Heatmap of Correlation Matrix:")
|
| 67 |
fig, ax = plt.subplots()
|
| 68 |
+
sns.heatmap(data[numeric_columns].corr(), annot=True, cmap='coolwarm', ax=ax)
|
| 69 |
st.pyplot(fig)
|
| 70 |
+
|
| 71 |
+
# Clean the data: Handle missing values and duplicates
|
| 72 |
+
st.write("### Cleaned Dataset:")
|
| 73 |
+
cleaned_data = data.drop_duplicates() # Remove duplicate rows
|
| 74 |
+
cleaned_data = cleaned_data.fillna(cleaned_data.mean(numeric_only=True))  # Replace missing values with the column mean; numeric_only=True skips text columns, which would otherwise raise TypeError on pandas >= 2.0
|
| 75 |
+
st.dataframe(cleaned_data)
|
| 76 |
+
|
| 77 |
+
# Download button for the cleaned dataset
|
| 78 |
+
cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
|
| 79 |
+
st.download_button(
|
| 80 |
+
label="Download Cleaned Dataset",
|
| 81 |
+
data=cleaned_csv,
|
| 82 |
+
file_name="cleaned_dataset.csv",
|
| 83 |
+
mime="text/csv"
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
st.markdown("""
|
| 87 |
+
This analysis provides a basic understanding of the dataset.
|
| 88 |
+
You can now download the cleaned dataset and proceed with further analysis or modeling.
|
| 89 |
+
""")
|
| 90 |
else:
|
| 91 |
+
st.warning("Please upload a dataset to proceed with EDA.")
|