Rename pages/EDA.py to pages/Data_CLeaning_and_Preprocessing.py
Browse files- pages/Data_CLeaning_and_Preprocessing.py +90 -0
- pages/EDA.py +0 -163
pages/Data_CLeaning_and_Preprocessing.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# --- Data Cleaning and Preprocessing page ------------------------------------
# Lets the user upload a CSV, shows a quick EDA summary (overview, missing
# values, duplicates, basic plots), then offers a cleaned copy of the data
# (duplicates dropped, numeric NaNs imputed with the column mean) for download.

# Configure the Streamlit app
st.title("Exploratory Data Analysis (EDA) App")
st.markdown("""
By performing simple Exploratory Data Analysis (EDA), we can examine the data, identify patterns, and detect anomalies or inconsistencies. This process allows us to clean and preprocess the dataset effectively, ensuring it is well-structured and ready for further analysis or modeling. Simple EDA helps uncover hidden insights, address missing or erroneous values, and optimize the data for better decision-making.
""")

# File uploader for dataset
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    # Read and display the dataset
    data = pd.read_csv(uploaded_file)
    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    # Overview of the dataset (count/mean/std/quartiles for numeric columns)
    st.write("### Dataset Overview:")
    st.write(data.describe())

    # Missing values per column
    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    # Duplicate rows in the dataset
    st.write("### Duplicate Rows:")
    st.write(f"Number of duplicate rows: {data.duplicated().sum()}")

    # Hoisted up from the correlation section so every numeric-only widget
    # below can share it.
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

    # Visualizations for numeric columns
    if len(numeric_columns) > 0:
        st.write("### Numeric Column Visualizations:")
        st.write("Histograms:")
        # BUG FIX: DataFrame.hist() raises a ValueError when handed a single
        # Axes but the frame has several numeric columns. Let pandas build its
        # own subplot grid and hand the resulting figure to Streamlit.
        data.hist(figsize=(10, 8))
        st.pyplot(plt.gcf())

        st.write("Boxplot:")
        fig, ax = plt.subplots()
        sns.boxplot(data=data[numeric_columns], orient='h', ax=ax)
        st.pyplot(fig)

    # Value counts and bar plot for categorical data
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0:
        selected_cat_col = st.selectbox("Select a Categorical Column to Analyze", categorical_columns)
        st.write(f"Value Counts for '{selected_cat_col}':")
        st.write(data[selected_cat_col].value_counts())

        st.write(f"Bar Plot for '{selected_cat_col}':")
        fig, ax = plt.subplots()
        sns.countplot(x=selected_cat_col, data=data, ax=ax)
        st.pyplot(fig)
    else:
        st.write("No categorical columns available for analysis.")

    # Correlation matrix (needs at least two numeric columns)
    if len(numeric_columns) > 1:
        st.write("### Correlation Matrix:")
        st.write(data[numeric_columns].corr())

        st.write("Heatmap of Correlation Matrix:")
        fig, ax = plt.subplots()
        sns.heatmap(data[numeric_columns].corr(), annot=True, cmap='coolwarm', ax=ax)
        st.pyplot(fig)

    # Clean the data: handle missing values and duplicates
    st.write("### Cleaned Dataset:")
    cleaned_data = data.drop_duplicates()  # Remove duplicate rows
    # BUG FIX: the original mean-imputation line was commented out because
    # DataFrame.mean() fails on non-numeric columns; impute numeric columns
    # only, leaving categorical NaNs untouched.
    if len(numeric_columns) > 0:
        cleaned_data[numeric_columns] = cleaned_data[numeric_columns].fillna(
            cleaned_data[numeric_columns].mean()
        )
    st.dataframe(cleaned_data)

    # Download button for the cleaned dataset
    cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv"
    )

    st.markdown("""
    This analysis provides a basic understanding of the dataset.
    You can now download the cleaned dataset and proceed with further analysis or modeling.
    """)
else:
    st.warning("Please upload a dataset to proceed with Simple EDA.")
|
pages/EDA.py
DELETED
|
@@ -1,163 +0,0 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import seaborn as sns
|
| 4 |
-
import matplotlib.pyplot as plt
|
| 5 |
-
|
| 6 |
-
# Function to generate automatic insights for univariate analysis
def generate_univariate_insights(data, column):
    """Return a bullet-point narrative of a numeric column's basic statistics."""
    series = data[column]
    stats = {
        "mean": series.mean(),
        "median": series.median(),
        "std": series.std(),
        "min": series.min(),
        "max": series.max(),
    }

    return f"""
    - The mean value of '{column}' is {stats["mean"]:.2f}.
    - The median value is {stats["median"]:.2f}, indicating the central tendency of the data.
    - The standard deviation is {stats["std"]:.2f}, suggesting the spread of the values.
    - The minimum value observed is {stats["min"]}, and the maximum value is {stats["max"]}.
    """
|
| 21 |
-
|
| 22 |
-
# Function to generate automatic insights for bivariate analysis (scatter plot)
def generate_bivariate_insights(data, col1, col2):
    """Return a short narrative describing the linear relationship between two columns."""
    # Pearson correlation coefficient between the two selected columns.
    pearson_r = data[col1].corr(data[col2])

    return f"""
    - The correlation between '{col1}' and '{col2}' is {pearson_r:.2f}.
    - A correlation close to 1 indicates a strong positive relationship, while a correlation close to -1 indicates a strong negative relationship.
    - A correlation near 0 suggests no linear relationship between the variables.
    """
|
| 32 |
-
|
| 33 |
-
# Function to generate automatic insights for multivariate analysis (pairplot)
def generate_multivariate_insights(data, columns):
    """Return a short narrative for a pairplot of *columns*.

    ``data`` is accepted for interface symmetry with the other insight
    helpers; the generated text depends only on the column names.
    """
    # BUG FIX: the original computed data[columns].corr() into a local that
    # was never used — dead work on every call (and a crash risk if a
    # non-numeric column slipped into the selection). Dropped.
    return f"""
    - The pairplot shows the relationships between the selected numeric variables: {', '.join(columns)}.
    - The diagonal displays the distributions of each variable.
    - Strong correlations (positive or negative) can be seen in the scatter plots between some variables.
    """
|
| 42 |
-
|
| 43 |
-
# Introduction to EDA
st.markdown("""
# Exploratory Data Analysis (EDA)
Exploratory Data Analysis (EDA) is an essential step in the data analysis process. It involves:
- **Understanding the Structure**: By examining the dataset’s statistics and structure, we can identify patterns, trends, and potential issues.
- **Visualizing Distributions**: Histograms and boxplots give insight into the distribution of data, the spread of numerical values, and the presence of any outliers.
- **Finding Relationships**: Through scatter plots and correlation matrices, we can identify relationships between two or more variables, which helps in building predictive models.

EDA helps in:
- Cleaning the dataset by handling missing values, detecting outliers, and fixing errors.
- Gaining insights that can inform further analysis or modeling steps.
""")

# File uploader for dataset
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    # Read and display the dataset
    data = pd.read_csv(uploaded_file)
    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    # Dataset Overview
    st.write("### Dataset Overview:")
    st.write(data.describe())

    # Missing values in the dataset
    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    # Correlation matrix for numerical columns
    st.write("### Correlation Matrix:")
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_columns) > 1:
        st.write(data[numeric_columns].corr())

        st.write("Heatmap of Correlation Matrix:")
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(data[numeric_columns].corr(), annot=True, cmap='coolwarm', ax=ax)
        st.pyplot(fig)

    # Univariate Plots (For a single column)
    st.write("### Univariate Analysis: Distribution of Columns")
    selected_numeric_column = st.selectbox("Select a Numeric Column for Univariate Analysis", numeric_columns)

    # Histogram for univariate distribution
    st.write(f"Histogram for '{selected_numeric_column}':")
    fig, ax = plt.subplots()
    sns.histplot(data[selected_numeric_column], kde=True, ax=ax)
    st.pyplot(fig)

    # Display automatic insights for univariate analysis
    univariate_insights = generate_univariate_insights(data, selected_numeric_column)
    st.write("### Insights:")
    st.write(univariate_insights)

    # Boxplot for univariate distribution
    st.write(f"Boxplot for '{selected_numeric_column}':")
    fig, ax = plt.subplots()
    sns.boxplot(x=data[selected_numeric_column], ax=ax)
    st.pyplot(fig)

    # Bivariate Plots (For two columns)
    st.write("### Bivariate Analysis: Relationships between Two Variables")
    selected_bivariate_columns = st.multiselect(
        "Select Two Columns for Bivariate Analysis",
        options=numeric_columns,
        default=numeric_columns[:2]
    )

    if len(selected_bivariate_columns) == 2:
        st.write(f"Scatter Plot between '{selected_bivariate_columns[0]}' and '{selected_bivariate_columns[1]}':")
        fig, ax = plt.subplots()
        sns.scatterplot(x=data[selected_bivariate_columns[0]], y=data[selected_bivariate_columns[1]], ax=ax)
        st.pyplot(fig)

        # Display automatic insights for bivariate analysis
        bivariate_insights = generate_bivariate_insights(data, selected_bivariate_columns[0], selected_bivariate_columns[1])
        st.write("### Insights:")
        st.write(bivariate_insights)

    # Multivariate Plots (For multiple columns)
    st.write("### Multivariate Analysis: Relationships between Multiple Variables")
    selected_multivariate_columns = st.multiselect(
        "Select Columns for Multivariate Analysis",
        options=numeric_columns,
        default=numeric_columns[:3]
    )

    if len(selected_multivariate_columns) > 1:
        st.write(f"Pairplot for selected variables: {', '.join(selected_multivariate_columns)}")
        # BUG FIX: sns.pairplot builds its own figure; the original created an
        # unrelated empty figure with plt.subplots() and passed THAT to
        # st.pyplot, so a blank plot was rendered. Pass the pairplot's figure.
        pair_grid = sns.pairplot(data[selected_multivariate_columns])
        st.pyplot(pair_grid.figure)

        # Display automatic insights for multivariate analysis
        multivariate_insights = generate_multivariate_insights(data, selected_multivariate_columns)
        st.write("### Insights:")
        st.write(multivariate_insights)

    # Categorical vs Numeric (boxplots)
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0:
        selected_cat_column = st.selectbox("Select a Categorical Column for Analysis", categorical_columns)

        st.write(f"Boxplot for '{selected_cat_column}' vs Numeric Column:")
        selected_numeric_column_for_cat = st.selectbox("Select a Numeric Column to Plot", numeric_columns)
        fig, ax = plt.subplots()
        sns.boxplot(x=data[selected_cat_column], y=data[selected_numeric_column_for_cat], ax=ax)
        st.pyplot(fig)

        st.write(f"### Insights:")
        st.write(f"Boxplot shows the distribution of '{selected_numeric_column_for_cat}' values for each category in '{selected_cat_column}'. It helps identify if the numerical values differ across categories.")

    # Closing note for the page
    st.markdown("""
    This analysis provides a basic understanding of the dataset.
    You can now proceed with further analysis or modeling.
    """)
else:
    st.warning("Please upload a dataset to proceed with EDA.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|