File size: 5,351 Bytes
d939849
 
 
 
 
d2358c2
 
 
 
 
d939849
 
 
 
 
 
2d76a15
d2358c2
d939849
 
2d76a15
d2358c2
d939849
2d76a15
 
d2358c2
d939849
 
2d76a15
d2358c2
 
d939849
 
 
d2358c2
d939849
d2358c2
 
 
2d76a15
d2358c2
 
d939849
 
 
 
 
 
2d76a15
d2358c2
 
d939849
 
d2358c2
 
d939849
 
 
d2358c2
 
 
d939849
 
 
d2358c2
d939849
 
d2358c2
 
2d76a15
d2358c2
 
 
d939849
d2358c2
 
d939849
 
 
 
 
2d76a15
 
d2358c2
d939849
d2358c2
d939849
 
 
d2358c2
 
d939849
 
2d76a15
d2358c2
d939849
d2358c2
d939849
 
 
 
d2358c2
d939849
 
2d76a15
d2358c2
 
d939849
 
d2358c2
d939849
 
 
 
 
 
 
d2358c2
d939849
 
d2358c2
d939849
 
 
 
 
 
 
 
d2358c2
 
d939849
2d76a15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import html
import io

import pandas as pd
import streamlit as st

# Page header: centred title and subtitle, rendered as raw HTML.
# The title element is an <h2>, so it must be closed with </h2>.
st.markdown("""
    <div style="text-align: center; margin-bottom: 20px;">
        <h2 style="color: #c71585; font-size: 36px;">Simple EDA: Understanding Your Data🔍</h2>
        <h3 style="color: #4F4F4F; font-size: 20px;">
            This helps us understand the quality of the data and see how the data looks.
        </h3>
    </div>
""", unsafe_allow_html=True)

# Render the EDA report only when a dataset is present in the session state
# under the "df" key; otherwise ask the user to upload one (see the else branch).
if "df" in st.session_state and st.session_state.df is not None:
    df = st.session_state.df

    # Dataset Preview
    st.markdown("<h3 style='color: #2a52be;'>Dataset Preview📌</h3>", unsafe_allow_html=True)
    st.dataframe(df.head())

    # Shape of the Data
    st.markdown("<h3 style='color: #843f5b;'>Dataset Shape</h3>", unsafe_allow_html=True)
    st.write(f"🔹 The dataset contains **{df.shape[0]} rows** and **{df.shape[1]} columns**.")

    # Column Names & Data Types
    st.markdown("<h3 style='color: #e25822;'>Column Names & Data Types</h3>", unsafe_allow_html=True)
    st.write(df.dtypes)

    # 📝 Dataset Information (Equivalent to df.info())
    st.markdown("<h3 style='color: #9400d3;'>Dataset Information📝</h3>", unsafe_allow_html=True)

    # df.info() prints instead of returning a value, so capture it in a buffer.
    buffer = io.StringIO()
    df.info(buf=buffer)
    # Show the report once, in the styled <pre>. It is HTML-escaped because it
    # is injected as raw HTML: its first line ("<class '...DataFrame'>") and any
    # column name containing "<" or "&" would otherwise be parsed as markup.
    info_html = html.escape(buffer.getvalue(), quote=False)
    st.markdown(f"<pre style='background-color: #f8f8f8; padding: 10px; border-radius: 5px; font-size: 14px; font-family: monospace;'>{info_html}</pre>", unsafe_allow_html=True)

    # Numerical and categorical Columns
    st.markdown("<h3 style='color: #9400d3;'>Numerical and Categorical Columns</h3>", unsafe_allow_html=True)

    # "number" matches every numeric dtype (int32, float32, unsigned, nullable
    # Int64, ...), not just int64/float64, so no numeric column is silently
    # left out of the listing and the outlier check below.
    numerical_cols = df.select_dtypes(include="number").columns.tolist()
    categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

    # Column labels are not guaranteed to be strings (e.g. integer headers),
    # so convert them before joining.
    numerical_label = ", ".join(map(str, numerical_cols)) if numerical_cols else "None"
    categorical_label = ", ".join(map(str, categorical_cols)) if categorical_cols else "None"
    st.write(f"🔹 **Numerical Columns ({len(numerical_cols)}):** {numerical_label}")
    st.write(f"🔹 **Categorical Columns ({len(categorical_cols)}):** {categorical_label}")

    # Unique Values in Categorical Columns
    st.markdown("<h3 style='color: #e25822;'>Unique Values in Categorical Columns</h3>", unsafe_allow_html=True)

    if categorical_cols:
        for col in categorical_cols:
            unique_count = df[col].nunique()
            st.write(f"**{col}:** {unique_count} unique values")
    else:
        st.info("No categorical columns detectedℹ️.")

    # Value Counts in Categorical Columns
    st.markdown("<h3 style='color: #9400d3;'>Value Counts in Categorical Columns</h3>", unsafe_allow_html=True)

    if categorical_cols:
        for col in categorical_cols:
            st.write(f"🔹 **{col} Value Distribution:**")
            st.write(df[col].value_counts().head(10))  # Show top 10 categories
    else:
        st.info("No categorical columns detectedℹ️.")

    # Summary Statistics
    st.markdown("<h3 style='color: #843f5b;'>Summary Statistics for Numerical Columns</h3>", unsafe_allow_html=True)

    # Restrict describe() to the numeric columns so the table matches its
    # heading; without numeric columns describe() would fall back to the
    # non-numeric ones (or raise on a frame that has no columns at all).
    if numerical_cols:
        st.write("🔹 **Basic statistical insights into the dataset:**")
        st.write(df[numerical_cols].describe())
    else:
        st.info("No numerical columns detectedℹ️.")

    st.markdown("<h3 style='color: #2a52be;'>Summary Statistics for Categorical Columns</h3>", unsafe_allow_html=True)

    if categorical_cols:
        # Include both dtypes used to build categorical_cols: with
        # include='object' alone, a frame holding only 'category' columns
        # leaves describe() with nothing to summarise and it raises.
        st.write(df[categorical_cols].describe(include=["object", "category"]))
    else:
        st.info("No categorical columns detectedℹ️.")

    # Checking for Missing Values
    st.markdown("<h3 style='color: #9400d3;'>Missing Values in the Dataset⚠️</h3>", unsafe_allow_html=True)
    missing_values = df.isnull().sum()

    if missing_values.sum() == 0:
        st.success("No missing values found!")
    else:
        st.warning("Found missing values in the dataset.")
        st.write("🔹 **Columns with Missing Values:**")
        st.write(missing_values[missing_values > 0])

    # Checking for Duplicate Records
    st.markdown("<h3 style='color: #2a52be;'>Duplicate Records</h3>", unsafe_allow_html=True)
    # Compute the duplicate mask once and reuse it for the count and the sample.
    duplicate_mask = df.duplicated()
    duplicate_count = duplicate_mask.sum()

    if duplicate_count == 0:
        st.success("No duplicate records found!")
    else:
        st.warning(f"Found {duplicate_count} duplicate rows in the dataset.")
        st.write("🔹 **Example Duplicate Rows:**")
        st.dataframe(df[duplicate_mask].head())

    # 📊 Outlier Detection
    st.markdown("<h3 style='color: #e25822;'>Outlier Detection</h3>", unsafe_allow_html=True)

    if numerical_cols:
        # Maps column name -> number of values outside the 1.5 * IQR fences.
        outlier_info = {}

        for col in numerical_cols:
            q1 = df[col].quantile(0.25)
            q3 = df[col].quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            outliers = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()

            # Only report columns that actually have outliers.
            if outliers > 0:
                outlier_info[col] = outliers

        if outlier_info:
            st.warning("Outliers detected:")
            for col, count in outlier_info.items():
                st.write(f"🔹 **{col}:** {count} outliers")
        else:
            st.success("No significant outliers detected!")
    else:
        st.info("No numerical columns detectedℹ️.")

else:
    st.warning("No dataset found! Please upload a dataset first⚠️.")