"""Streamlit app that runs a simple exploratory data analysis on an uploaded CSV.

The script shows the raw table, summary statistics, missing-value and
duplicate counts, histograms and a boxplot, a per-column breakdown of a
selected categorical column, and a correlation heatmap. It then offers the
de-duplicated dataset for download.
"""

import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


def _show_figure(fig: plt.Figure) -> None:
    """Render *fig* in the app, then close it.

    Streamlit re-executes the whole script on every interaction; without the
    explicit close, pyplot keeps every figure alive and memory grows with
    each rerun.
    """
    st.pyplot(fig)
    plt.close(fig)


st.title("Exploratory Data Analysis (EDA) App")
st.markdown(
    "By performing simple Exploratory Data Analysis (EDA), we can examine the "
    "data, identify patterns, and detect anomalies or inconsistencies. This "
    "process allows us to clean and preprocess the dataset effectively, "
    "ensuring it is well-structured and ready for further analysis or "
    "modeling. Simple EDA helps uncover hidden insights, address missing or "
    "erroneous values, and optimize the data for better decision-making."
)

uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    try:
        data = pd.read_csv(uploaded_file)
    except (
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
        UnicodeDecodeError,
    ) as exc:
        # Show a readable message instead of a traceback, and halt this run.
        st.error(f"Could not read the uploaded file as CSV: {exc}")
        st.stop()

    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    st.write("### Dataset Overview:")
    st.write(data.describe())

    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    st.write("### Duplicate Rows:")
    st.write(f"Number of duplicate rows: {data.duplicated().sum()}")

    # "number" covers every numeric dtype (e.g. uint64, int32), not only
    # int64/float64.
    numeric_columns = data.select_dtypes(include="number").columns

    st.write("### Numeric Column Visualizations:")
    if len(numeric_columns) > 0:
        st.write("Histograms:")
        # Let pandas build its own subplot grid: handing it a single Axes for
        # several columns makes it clear the figure with a warning and ignore
        # ``figsize``.
        hist_axes = data[numeric_columns].hist(figsize=(10, 8))
        _show_figure(hist_axes.flat[0].get_figure())

        st.write("Boxplot:")
        fig, ax = plt.subplots()
        sns.boxplot(data=data, orient="h", ax=ax)
        _show_figure(fig)
    else:
        # DataFrame.hist raises ValueError when there is nothing numeric to plot.
        st.write("No numeric columns available for visualization.")

    categorical_columns = data.select_dtypes(include=["object", "category"]).columns
    if len(categorical_columns) > 0:
        selected_cat_col = st.selectbox(
            "Select a Categorical Column to Analyze", categorical_columns
        )
        st.write(f"Value Counts for '{selected_cat_col}':")
        st.write(data[selected_cat_col].value_counts())

        st.write(f"Bar Plot for '{selected_cat_col}':")
        fig, ax = plt.subplots()
        sns.countplot(x=selected_cat_col, data=data, ax=ax)
        _show_figure(fig)
    else:
        st.write("No categorical columns available for analysis.")

    # A correlation matrix needs at least two numeric columns to be meaningful.
    if len(numeric_columns) > 1:
        correlation = data[numeric_columns].corr()

        st.write("### Correlation Matrix:")
        st.write(correlation)

        st.write("Heatmap of Correlation Matrix:")
        fig, ax = plt.subplots()
        sns.heatmap(correlation, annot=True, cmap="coolwarm", ax=ax)
        _show_figure(fig)

    st.write("### Cleaned Dataset:")
    # "Cleaning" here means dropping exact duplicate rows only.
    cleaned_data = data.drop_duplicates()
    st.dataframe(cleaned_data)

    cleaned_csv = cleaned_data.to_csv(index=False).encode("utf-8")
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv",
    )

    st.markdown(
        "This analysis provides a basic understanding of the dataset.\n"
        "You can now download the cleaned dataset and proceed with further "
        "analysis or modeling."
    )
else:
    st.warning("Please upload a dataset to proceed with Simple EDA.")