| import streamlit as st |
| import pandas as pd |
| import seaborn as sns |
| import matplotlib.pyplot as plt |
|
|
| |
def generate_univariate_insights(data, column):
    """Summarise one column as a Markdown bullet list.

    Args:
        data: Table indexable by column name; the result of ``data[column]``
            must provide ``mean``, ``median``, ``std``, ``min`` and ``max``
            (presumably a pandas DataFrame).
        column: Name of the column to summarise.

    Returns:
        A string starting and ending with a newline that holds four bullets:
        mean, median, standard deviation, and the min/max pair. The first
        three are rounded to two decimals; min and max are shown as-is.
    """
    series = data[column]

    bullet_points = [
        f"- The mean value of '{column}' is {series.mean():.2f}.",
        f"- The median value is {series.median():.2f}, indicating the central tendency of the data.",
        f"- The standard deviation is {series.std():.2f}, suggesting the spread of the values.",
        f"- The minimum value observed is {series.min()}, and the maximum value is {series.max()}.",
    ]

    # Leading and trailing newlines mirror the layout of a triple-quoted block.
    return "\n" + "\n".join(bullet_points) + "\n"
|
|
| |
def generate_bivariate_insights(data, col1, col2):
    """Describe the linear relationship between two columns.

    Args:
        data: pandas DataFrame containing both columns.
        col1: Name of the first numeric column.
        col2: Name of the second numeric column.

    Returns:
        A Markdown bullet list (str, starting and ending with a newline).
        Normally it reports the correlation rounded to two decimals together
        with a short guide to reading it. When the correlation is undefined
        (NaN) it says so instead of printing "nan" next to interpretation
        hints that would not apply.
    """
    correlation = data[col1].corr(data[col2])

    if pd.isna(correlation):
        # Typically a constant column or fewer than two paired, non-missing
        # observations: there is no meaningful coefficient to interpret.
        return "\n".join([
            "",
            f"- The correlation between '{col1}' and '{col2}' could not be computed.",
            "- This typically happens when one column is constant or there are "
            "fewer than two paired, non-missing observations.",
            "",
        ])

    return "\n".join([
        "",
        f"- The correlation between '{col1}' and '{col2}' is {correlation:.2f}.",
        "- A correlation close to 1 indicates a strong positive relationship, "
        "while a correlation close to -1 indicates a strong negative relationship.",
        "- A correlation near 0 suggests no linear relationship between the variables.",
        "",
    ])
|
|
| |
def generate_multivariate_insights(data, columns):
    """Describe a pairplot of ``columns`` and name the most correlated pair.

    Args:
        data: pandas DataFrame containing every entry of ``columns``.
        columns: Names of the numeric columns shown in the pairplot.

    Returns:
        A Markdown bullet list (str, starting and ending with a newline).
        The last bullet is derived from the correlation matrix of the
        selected columns: it names the pair with the largest absolute
        correlation, or states that none could be computed.
    """
    columns = list(columns)
    correlations = data[columns].corr()
    labels = list(correlations.columns)

    # Scan only the cells above the diagonal: the diagonal is each column
    # against itself and the lower half repeats the upper half.
    strongest = None
    for row, first in enumerate(labels):
        for col in range(row + 1, len(labels)):
            value = correlations.iloc[row, col]
            if pd.isna(value):
                continue  # undefined correlation, e.g. for a constant column
            if strongest is None or abs(value) > abs(strongest[2]):
                strongest = (first, labels[col], value)

    if strongest is None:
        correlation_line = (
            "- No pairwise correlation could be computed for the selected variables."
        )
    else:
        first, second, value = strongest
        correlation_line = (
            f"- The strongest linear relationship is between '{first}' and "
            f"'{second}' (correlation {value:.2f})."
        )

    return "\n".join([
        "",
        "- The pairplot shows the relationships between the selected numeric "
        f"variables: {', '.join(map(str, columns))}.",
        "- The diagonal displays the distributions of each variable.",
        correlation_line,
        "",
    ])
|
|
| |
# ---------------------------------------------------------------------------
# Streamlit page: upload a CSV and walk through a basic exploratory analysis
# (overview, missing values, correlations, univariate / bivariate /
# multivariate plots, and a categorical-vs-numeric boxplot).
# ---------------------------------------------------------------------------


def _show_figure(fig):
    """Render a Matplotlib figure in the app, then close it.

    Figures created through pyplot stay registered until they are closed, so
    each one is released right after it has been rendered.
    """
    st.pyplot(fig)
    plt.close(fig)


st.markdown("""
# Exploratory Data Analysis (EDA)
Exploratory Data Analysis (EDA) is an essential step in the data analysis process. It involves:
- **Understanding the Structure**: By examining the dataset’s statistics and structure, we can identify patterns, trends, and potential issues.
- **Visualizing Distributions**: Histograms and boxplots give insight into the distribution of data, the spread of numerical values, and the presence of any outliers.
- **Finding Relationships**: Through scatter plots and correlation matrices, we can identify relationships between two or more variables, which helps in building predictive models.

EDA helps in:
- Cleaning the dataset by handling missing values, detecting outliers, and fixing errors.
- Gaining insights that can inform further analysis or modeling steps.
""")

uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

# `data` stays None when nothing was uploaded or the upload could not be parsed.
data = None
if uploaded_file is None:
    st.warning("Please upload a dataset to proceed with EDA.")
else:
    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # Report an unreadable upload in the page instead of an unhandled traceback.
        st.error(f"Could not read the uploaded file as CSV: {exc}")

if data is not None:
    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    st.write("### Dataset Overview:")
    st.write(data.describe())

    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    st.write("### Correlation Matrix:")
    # "number" selects every numeric dtype rather than only float64/int64.
    numeric_columns = data.select_dtypes(include="number").columns
    if len(numeric_columns) > 1:
        # Computed once and reused for both the table and the heatmap.
        correlation_matrix = data[numeric_columns].corr()
        st.write(correlation_matrix)

        st.write("Heatmap of Correlation Matrix:")
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
        _show_figure(fig)

    if len(numeric_columns) == 0:
        # Without numeric columns the select boxes below have nothing to
        # offer and the plots would fail on a missing column.
        st.info("No numeric columns were found, so the numeric analyses are skipped.")
    else:
        # --- Univariate analysis ---
        st.write("### Univariate Analysis: Distribution of Columns")
        selected_numeric_column = st.selectbox(
            "Select a Numeric Column for Univariate Analysis", numeric_columns
        )

        st.write(f"Histogram for '{selected_numeric_column}':")
        fig, ax = plt.subplots()
        sns.histplot(data[selected_numeric_column], kde=True, ax=ax)
        _show_figure(fig)

        univariate_insights = generate_univariate_insights(data, selected_numeric_column)
        st.write("### Insights:")
        st.write(univariate_insights)

        st.write(f"Boxplot for '{selected_numeric_column}':")
        fig, ax = plt.subplots()
        sns.boxplot(x=data[selected_numeric_column], ax=ax)
        _show_figure(fig)

        # --- Bivariate analysis ---
        st.write("### Bivariate Analysis: Relationships between Two Variables")
        selected_bivariate_columns = st.multiselect(
            "Select Two Columns for Bivariate Analysis",
            options=numeric_columns,
            default=numeric_columns[:2]
        )

        if len(selected_bivariate_columns) == 2:
            first_column, second_column = selected_bivariate_columns
            st.write(f"Scatter Plot between '{first_column}' and '{second_column}':")
            fig, ax = plt.subplots()
            sns.scatterplot(x=data[first_column], y=data[second_column], ax=ax)
            _show_figure(fig)

            bivariate_insights = generate_bivariate_insights(data, first_column, second_column)
            st.write("### Insights:")
            st.write(bivariate_insights)

        # --- Multivariate analysis ---
        st.write("### Multivariate Analysis: Relationships between Multiple Variables")
        selected_multivariate_columns = st.multiselect(
            "Select Columns for Multivariate Analysis",
            options=numeric_columns,
            default=numeric_columns[:3]
        )

        if len(selected_multivariate_columns) > 1:
            st.write(f"Pairplot for selected variables: {', '.join(selected_multivariate_columns)}")
            # pairplot builds its own figure; render that one rather than a
            # separately created (and therefore empty) pyplot figure.
            pair_grid = sns.pairplot(data[selected_multivariate_columns])
            _show_figure(pair_grid.figure)

            multivariate_insights = generate_multivariate_insights(data, selected_multivariate_columns)
            st.write("### Insights:")
            st.write(multivariate_insights)

    # --- Categorical vs numeric ---
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns
    # The boxplot needs one column of each kind, so both must be present.
    if len(categorical_columns) > 0 and len(numeric_columns) > 0:
        selected_cat_column = st.selectbox("Select a Categorical Column for Analysis", categorical_columns)

        st.write(f"Boxplot for '{selected_cat_column}' vs Numeric Column:")
        selected_numeric_column_for_cat = st.selectbox("Select a Numeric Column to Plot", numeric_columns)
        fig, ax = plt.subplots()
        sns.boxplot(x=data[selected_cat_column], y=data[selected_numeric_column_for_cat], ax=ax)
        _show_figure(fig)

        st.write("### Insights:")
        st.write(f"Boxplot shows the distribution of '{selected_numeric_column_for_cat}' values for each category in '{selected_cat_column}'. It helps identify if the numerical values differ across categories.")

    st.markdown("""
This analysis provides a basic understanding of the dataset.
You can now proceed with further analysis or modeling.
""")
|
|