File size: 7,257 Bytes
7b7ee83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86aea7c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Function to generate automatic insights for univariate analysis
def generate_univariate_insights(data, column):
    """Build a bullet-list summary of basic statistics for one column.

    Args:
        data: DataFrame containing ``column``.
        column: Label of the column to summarize.

    Returns:
        A string of indented markdown bullets reporting the mean, median,
        standard deviation, minimum and maximum of ``data[column]``.
    """
    series = data[column]
    pad = "    "

    # Mean/median/std are shown with two decimals; min/max are shown as-is.
    bullet_lines = [
        f"- The mean value of '{column}' is {series.mean():.2f}.",
        f"- The median value is {series.median():.2f}, indicating the central tendency of the data.",
        f"- The standard deviation is {series.std():.2f}, suggesting the spread of the values.",
        f"- The minimum value observed is {series.min()}, and the maximum value is {series.max()}.",
    ]

    # Leading newline, one indented line per bullet, and a trailing indent.
    return "\n" + "".join(f"{pad}{line}\n" for line in bullet_lines) + pad

# Function to generate automatic insights for bivariate analysis (scatter plot)
def generate_bivariate_insights(data, col1, col2):
    """Build a bullet-list note about the correlation of two columns.

    Args:
        data: DataFrame containing both columns.
        col1: Label of the first column.
        col2: Label of the second column.

    Returns:
        A string of indented markdown bullets: the correlation of
        ``data[col1]`` with ``data[col2]`` (two decimals) followed by two
        fixed lines explaining how to read a correlation coefficient.
    """
    first_series, second_series = data[col1], data[col2]
    coefficient = first_series.corr(second_series)

    pad = "    "
    bullet_lines = (
        f"- The correlation between '{col1}' and '{col2}' is {coefficient:.2f}.",
        "- A correlation close to 1 indicates a strong positive relationship, while a correlation close to -1 indicates a strong negative relationship.",
        "- A correlation near 0 suggests no linear relationship between the variables.",
    )

    # Empty first element gives the leading newline; the final bare pad
    # gives the trailing indent after the last bullet.
    return "\n".join(["", *(pad + line for line in bullet_lines), pad])

# Function to generate automatic insights for multivariate analysis (pairplot)
def generate_multivariate_insights(data, columns, strong_threshold=0.7):
    """Summarize a pairplot of ``columns`` and name the strongly correlated pairs.

    Args:
        data: DataFrame containing the selected columns.
        columns: Labels of the numeric columns shown in the pairplot.
        strong_threshold: Minimum absolute correlation for a pair of
            columns to be reported as strongly correlated.

    Returns:
        A string of indented markdown bullets: the selected variables, a
        note about the diagonal, and either the pairs whose absolute
        correlation reaches ``strong_threshold`` or a statement that no
        pair does.
    """
    labels = list(columns)
    correlations = data[labels].corr()

    # Walk the upper triangle so each unordered pair is examined once.
    strong_pairs = []
    for position, first in enumerate(labels):
        for second in labels[position + 1:]:
            value = correlations.loc[first, second]
            # A NaN correlation fails this comparison and is skipped.
            if abs(value) >= strong_threshold:
                strong_pairs.append(f"'{first}' and '{second}' ({value:.2f})")

    if strong_pairs:
        correlation_line = (
            f"Strong correlations (absolute value of at least {strong_threshold:.2f}) "
            f"were found between: {'; '.join(strong_pairs)}."
        )
    else:
        correlation_line = (
            f"No strong correlations (absolute value of at least {strong_threshold:.2f}) "
            "were found between the selected variables."
        )

    # Labels are converted with str() so non-string column labels can be joined.
    insights = f"""
    - The pairplot shows the relationships between the selected numeric variables: {', '.join(map(str, labels))}.
    - The diagonal displays the distributions of each variable.
    - {correlation_line}
    """
    return insights

# Introduction to EDA: page title plus a short explanation of what EDA is
# and why it is useful, rendered as markdown at the top of the app.
_EDA_INTRO_MARKDOWN = """
# Exploratory Data Analysis (EDA)
Exploratory Data Analysis (EDA) is an essential step in the data analysis process. It involves:
- **Understanding the Structure**: By examining the dataset’s statistics and structure, we can identify patterns, trends, and potential issues.
- **Visualizing Distributions**: Histograms and boxplots give insight into the distribution of data, the spread of numerical values, and the presence of any outliers.
- **Finding Relationships**: Through scatter plots and correlation matrices, we can identify relationships between two or more variables, which helps in building predictive models.

EDA helps in:
- Cleaning the dataset by handling missing values, detecting outliers, and fixing errors.
- Gaining insights that can inform further analysis or modeling steps.
"""

st.markdown(_EDA_INTRO_MARKDOWN)

# Figures created through pyplot stay registered until they are closed, so
# each figure is closed as soon as it has been drawn in the app.
def _render_figure(fig):
    """Draw ``fig`` in the Streamlit app, then close it in pyplot."""
    st.pyplot(fig)
    plt.close(fig)


# File uploader for dataset
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    # Read the dataset; a file that cannot be parsed ends the run with a
    # readable message instead of an unhandled exception.
    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded file as CSV: {exc}")
        st.stop()

    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    # Dataset Overview
    st.write("### Dataset Overview:")
    st.write(data.describe())

    # Missing values in the dataset
    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    # Plain lists (not pandas Index objects) are used for widget options and
    # defaults, and allow the simple emptiness checks below.
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()

    # Correlation matrix for numerical columns
    st.write("### Correlation Matrix:")
    if len(numeric_columns) > 1:
        # Computed once and reused for both the table and the heatmap.
        correlation_matrix = data[numeric_columns].corr()
        st.write(correlation_matrix)

        st.write("Heatmap of Correlation Matrix:")
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
        _render_figure(fig)
    else:
        st.info("At least two numeric columns are needed to compute a correlation matrix.")

    if numeric_columns:
        # Univariate Plots (For a single column)
        st.write("### Univariate Analysis: Distribution of Columns")
        selected_numeric_column = st.selectbox(
            "Select a Numeric Column for Univariate Analysis", numeric_columns
        )

        # Histogram for univariate distribution
        st.write(f"Histogram for '{selected_numeric_column}':")
        fig, ax = plt.subplots()
        sns.histplot(data[selected_numeric_column], kde=True, ax=ax)
        _render_figure(fig)

        # Display automatic insights for univariate analysis
        univariate_insights = generate_univariate_insights(data, selected_numeric_column)
        st.write("### Insights:")
        st.write(univariate_insights)

        # Boxplot for univariate distribution
        st.write(f"Boxplot for '{selected_numeric_column}':")
        fig, ax = plt.subplots()
        sns.boxplot(x=data[selected_numeric_column], ax=ax)
        _render_figure(fig)

        # Bivariate Plots (For two columns)
        st.write("### Bivariate Analysis: Relationships between Two Variables")
        selected_bivariate_columns = st.multiselect(
            "Select Two Columns for Bivariate Analysis",
            options=numeric_columns,
            default=numeric_columns[:2],
        )

        if len(selected_bivariate_columns) == 2:
            x_column, y_column = selected_bivariate_columns
            st.write(f"Scatter Plot between '{x_column}' and '{y_column}':")
            fig, ax = plt.subplots()
            sns.scatterplot(x=data[x_column], y=data[y_column], ax=ax)
            _render_figure(fig)

            # Display automatic insights for bivariate analysis
            bivariate_insights = generate_bivariate_insights(data, x_column, y_column)
            st.write("### Insights:")
            st.write(bivariate_insights)
        else:
            st.info("Select exactly two columns to display the scatter plot.")

        # Multivariate Plots (For multiple columns)
        st.write("### Multivariate Analysis: Relationships between Multiple Variables")
        selected_multivariate_columns = st.multiselect(
            "Select Columns for Multivariate Analysis",
            options=numeric_columns,
            default=numeric_columns[:3],
        )

        if len(selected_multivariate_columns) > 1:
            st.write(
                "Pairplot for selected variables: "
                f"{', '.join(map(str, selected_multivariate_columns))}"
            )
            # pairplot creates its own figure; that figure (not a new empty
            # one from plt.subplots) is the one that has to be rendered.
            pair_grid = sns.pairplot(data[selected_multivariate_columns])
            _render_figure(pair_grid.figure)

            # Display automatic insights for multivariate analysis
            multivariate_insights = generate_multivariate_insights(data, selected_multivariate_columns)
            st.write("### Insights:")
            st.write(multivariate_insights)
    else:
        st.info("The dataset has no numeric columns, so the numeric analyses are skipped.")

    # Categorical vs Numeric (boxplots); needs at least one column of each kind.
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
    if categorical_columns and numeric_columns:
        selected_cat_column = st.selectbox("Select a Categorical Column for Analysis", categorical_columns)

        st.write(f"Boxplot for '{selected_cat_column}' vs Numeric Column:")
        selected_numeric_column_for_cat = st.selectbox("Select a Numeric Column to Plot", numeric_columns)
        fig, ax = plt.subplots()
        sns.boxplot(x=data[selected_cat_column], y=data[selected_numeric_column_for_cat], ax=ax)
        _render_figure(fig)

        st.write("### Insights:")
        st.write(f"Boxplot shows the distribution of '{selected_numeric_column_for_cat}' values for each category in '{selected_cat_column}'. It helps identify if the numerical values differ across categories.")

    # Closing note shown after all analyses
    st.markdown("""
    This analysis provides a basic understanding of the dataset. 
    You can now proceed with further analysis or modeling.
    """)
else:
    st.warning("Please upload a dataset to proceed with EDA.")