import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Function to generate automatic insights for univariate analysis
def generate_univariate_insights(data, column):
    """Summarise a single column as a Markdown bullet list.

    Args:
        data: DataFrame that contains ``column``.
        column: Label of the column to describe.

    Returns:
        A string that starts and ends with a newline and holds one bullet
        each for the mean, the median, the standard deviation and the
        observed minimum/maximum of the column.
    """
    series = data[column]
    bullets = (
        f"- The mean value of '{column}' is {series.mean():.2f}.",
        f"- The median value is {series.median():.2f}, indicating the central tendency of the data.",
        f"- The standard deviation is {series.std():.2f}, suggesting the spread of the values.",
        f"- The minimum value observed is {series.min()}, and the maximum value is {series.max()}.",
    )
    # Leading and trailing newline mirror the layout of a triple-quoted block.
    return "\n" + "\n".join(bullets) + "\n"
# Function to generate automatic insights for bivariate analysis (scatter plot)
def generate_bivariate_insights(data, col1, col2):
    """Describe the linear relationship between two columns.

    Args:
        data: DataFrame that contains both columns.
        col1: Label of the first column.
        col2: Label of the second column.

    Returns:
        A string that starts and ends with a newline and holds three
        Markdown bullets: the correlation between the two columns (as
        computed by ``Series.corr``) followed by two fixed reading hints.
    """
    first_series = data[col1]
    second_series = data[col2]
    coefficient = first_series.corr(second_series)

    bullets = (
        f"- The correlation between '{col1}' and '{col2}' is {coefficient:.2f}.",
        "- A correlation close to 1 indicates a strong positive relationship, while a correlation close to -1 indicates a strong negative relationship.",
        "- A correlation near 0 suggests no linear relationship between the variables.",
    )
    return "\n" + "\n".join(bullets) + "\n"
# Function to generate automatic insights for multivariate analysis (pairplot)
def generate_multivariate_insights(data, columns):
    """Describe a pairplot of ``columns`` and name the most correlated pair.

    The correlation matrix of the selected columns is used to report the
    pair with the largest absolute correlation, so the text reflects the
    data instead of asserting that strong correlations exist.

    Args:
        data: DataFrame that contains the plotted variables.
        columns: Labels of the columns shown in the pairplot.

    Returns:
        A string that starts and ends with a newline and holds three
        Markdown bullets: the list of plotted variables, a note about the
        diagonal, and either the most correlated pair with its coefficient
        or a notice that no pairwise correlation could be computed.
    """
    # Drop repeated labels so every ``.loc`` lookup below yields a scalar.
    columns = list(dict.fromkeys(columns))
    correlations = data[columns].corr()
    # Only labels present in the matrix can be looked up in it.
    labels = list(correlations.columns)

    # Walk the upper triangle: the matrix is symmetric and its diagonal is
    # each variable compared with itself.
    strongest = None
    for position, first in enumerate(labels):
        for second in labels[position + 1:]:
            value = correlations.loc[first, second]
            # NaN appears for constant or too-short columns.
            if pd.notna(value) and (
                strongest is None or abs(value) > abs(strongest[2])
            ):
                strongest = (first, second, value)

    if strongest is None:
        relationship = (
            "- No pairwise correlation could be computed for the selected variables."
        )
    else:
        first, second, value = strongest
        relationship = (
            f"- The strongest linear relationship is between '{first}' and "
            f"'{second}' (correlation {value:.2f})."
        )

    bullets = (
        "- The pairplot shows the relationships between the selected numeric "
        f"variables: {', '.join(map(str, columns))}.",
        "- The diagonal displays the distributions of each variable.",
        relationship,
    )
    return "\n" + "\n".join(bullets) + "\n"
# Introduction to EDA
# Page introduction. Blank lines separate the heading, the paragraphs and the
# two lists: without them Markdown treats "EDA helps in:" as a continuation of
# the preceding bullet instead of a paragraph of its own.
st.markdown("""
# Exploratory Data Analysis (EDA)

Exploratory Data Analysis (EDA) is an essential step in the data analysis process. It involves:

- **Understanding the Structure**: By examining the dataset’s statistics and structure, we can identify patterns, trends, and potential issues.
- **Visualizing Distributions**: Histograms and boxplots give insight into the distribution of data, the spread of numerical values, and the presence of any outliers.
- **Finding Relationships**: Through scatter plots and correlation matrices, we can identify relationships between two or more variables, which helps in building predictive models.

EDA helps in:

- Cleaning the dataset by handling missing values, detecting outliers, and fixing errors.
- Gaining insights that can inform further analysis or modeling steps.
""")
def _show_figure(fig):
    """Render ``fig`` in the app, then close it.

    Figures created through pyplot stay registered until they are closed,
    so every chart is released as soon as it has been handed to Streamlit.
    """
    st.pyplot(fig)
    plt.close(fig)


# File uploader for dataset
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    # Read and display the dataset
    data = pd.read_csv(uploaded_file)
    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    # Dataset Overview
    st.write("### Dataset Overview:")
    st.write(data.describe())

    # Missing values in the dataset
    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    # Plain list, so emptiness checks and slicing behave like ordinary lists.
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()

    # Correlation matrix for numerical columns
    st.write("### Correlation Matrix:")
    if len(numeric_columns) > 1:
        # Computed once and reused for both the table and the heatmap.
        correlation_matrix = data[numeric_columns].corr()
        st.write(correlation_matrix)
        st.write("Heatmap of Correlation Matrix:")
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
        _show_figure(fig)

    if not numeric_columns:
        # With no options the select boxes below would return None and the
        # column lookups would raise KeyError, so the numeric sections are
        # skipped altogether.
        st.info("No numeric columns were found, so the distribution and relationship plots are skipped.")
    else:
        # Univariate Plots (For a single column)
        st.write("### Univariate Analysis: Distribution of Columns")
        selected_numeric_column = st.selectbox("Select a Numeric Column for Univariate Analysis", numeric_columns)

        # Histogram for univariate distribution
        st.write(f"Histogram for '{selected_numeric_column}':")
        fig, ax = plt.subplots()
        sns.histplot(data[selected_numeric_column], kde=True, ax=ax)
        _show_figure(fig)

        # Display automatic insights for univariate analysis
        univariate_insights = generate_univariate_insights(data, selected_numeric_column)
        st.write("### Insights:")
        st.write(univariate_insights)

        # Boxplot for univariate distribution
        st.write(f"Boxplot for '{selected_numeric_column}':")
        fig, ax = plt.subplots()
        sns.boxplot(x=data[selected_numeric_column], ax=ax)
        _show_figure(fig)

        # Bivariate Plots (For two columns)
        st.write("### Bivariate Analysis: Relationships between Two Variables")
        selected_bivariate_columns = st.multiselect(
            "Select Two Columns for Bivariate Analysis",
            options=numeric_columns,
            default=numeric_columns[:2]
        )
        if len(selected_bivariate_columns) == 2:
            x_column, y_column = selected_bivariate_columns
            st.write(f"Scatter Plot between '{x_column}' and '{y_column}':")
            fig, ax = plt.subplots()
            sns.scatterplot(x=data[x_column], y=data[y_column], ax=ax)
            _show_figure(fig)

            # Display automatic insights for bivariate analysis
            bivariate_insights = generate_bivariate_insights(data, x_column, y_column)
            st.write("### Insights:")
            st.write(bivariate_insights)

        # Multivariate Plots (For multiple columns)
        st.write("### Multivariate Analysis: Relationships between Multiple Variables")
        selected_multivariate_columns = st.multiselect(
            "Select Columns for Multivariate Analysis",
            options=numeric_columns,
            default=numeric_columns[:3]
        )
        if len(selected_multivariate_columns) > 1:
            st.write(f"Pairplot for selected variables: {', '.join(selected_multivariate_columns)}")
            # pairplot is figure-level: it builds its own figure and returns
            # the grid, so the grid's figure is the one that must be shown.
            pair_grid = sns.pairplot(data[selected_multivariate_columns])
            _show_figure(pair_grid.figure)

            # Display automatic insights for multivariate analysis
            multivariate_insights = generate_multivariate_insights(data, selected_multivariate_columns)
            st.write("### Insights:")
            st.write(multivariate_insights)

        # Categorical vs Numeric (boxplots)
        categorical_columns = data.select_dtypes(include=['object', 'category']).columns
        if len(categorical_columns) > 0:
            selected_cat_column = st.selectbox("Select a Categorical Column for Analysis", categorical_columns)
            st.write(f"Boxplot for '{selected_cat_column}' vs Numeric Column:")
            selected_numeric_column_for_cat = st.selectbox("Select a Numeric Column to Plot", numeric_columns)
            fig, ax = plt.subplots()
            sns.boxplot(x=data[selected_cat_column], y=data[selected_numeric_column_for_cat], ax=ax)
            _show_figure(fig)
            st.write("### Insights:")
            st.write(f"Boxplot shows the distribution of '{selected_numeric_column_for_cat}' values for each category in '{selected_cat_column}'. It helps identify if the numerical values differ across categories.")

    # Closing note
    st.markdown(
        "\n"
        "This analysis provides a basic understanding of the dataset.\n"
        "You can now proceed with further analysis or modeling.\n"
    )
else:
    st.warning("Please upload a dataset to proceed with EDA.")
|