Harika22 committed on
Commit
d2358c2
·
verified ·
1 Parent(s): d939849

Update pages/2_Simple_EDA.py

Browse files
Files changed (1) hide show
  1. pages/2_Simple_EDA.py +48 -54
pages/2_Simple_EDA.py CHANGED
@@ -3,112 +3,102 @@ import pandas as pd
3
  import io
4
 
5
  st.markdown("""
6
- <style>
7
- .title-container {
8
- text-align: center;
9
- margin-bottom: 20px;
10
- }
11
- .main-title {
12
- color: #c71585;
13
- font-size: 36px;
14
- font-weight: bold;
15
- }
16
- .subtitle {
17
- color: #4F4F4F;
18
- font-size: 20px;
19
- font-weight: normal;
20
- }
21
- .section-title {
22
- color: #2a52be;
23
- font-size: 24px;
24
- font-weight: bold;
25
- margin-top: 20px;
26
- }
27
- .highlight {
28
- background-color: #f8f8f8;
29
- padding: 10px;
30
- border-radius: 5px;
31
- font-size: 14px;
32
- font-family: monospace;
33
- }
34
- </style>
35
- """, unsafe_allow_html=True)
36
-
37
- st.markdown("""
38
- <div class="title-container">
39
- <h2 class="main-title">Simple EDA: Understanding Your Data🔍</h2>
40
- <h3 class="subtitle">This helps us understand the quality of the data and see how the data looks.</h3>
41
  </div>
42
  """, unsafe_allow_html=True)
43
 
44
  if "df" in st.session_state and st.session_state.df is not None:
45
  df = st.session_state.df
46
 
47
- st.markdown("<h3 class='section-title'>Dataset Preview📌</h3>", unsafe_allow_html=True)
48
  st.dataframe(df.head())
49
 
50
- st.markdown("<h3 class='section-title'>Dataset Shape</h3>", unsafe_allow_html=True)
 
51
  st.write(f"🔹 The dataset contains **{df.shape[0]} rows** and **{df.shape[1]} columns**.")
52
-
53
- st.markdown("<h3 class='section-title'>Column Names & Data Types</h3>", unsafe_allow_html=True)
54
  st.write(df.dtypes)
55
 
56
- st.markdown("<h3 class='section-title'>Dataset Information📝</h3>", unsafe_allow_html=True)
 
57
  buffer = io.StringIO()
58
  df.info(buf=buffer)
59
  info_str = buffer.getvalue()
60
- st.markdown(f"<pre class='highlight'>{info_str}</pre>", unsafe_allow_html=True)
61
 
62
- st.markdown("<h3 class='section-title'>Numerical and Categorical Columns</h3>", unsafe_allow_html=True)
 
 
 
 
63
  numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
64
  categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
65
 
66
  st.write(f"🔹 **Numerical Columns ({len(numerical_cols)}):** {', '.join(numerical_cols) if numerical_cols else 'None'}")
67
  st.write(f"🔹 **Categorical Columns ({len(categorical_cols)}):** {', '.join(categorical_cols) if categorical_cols else 'None'}")
68
 
69
- st.markdown("<h3 class='section-title'>Unique Values in Categorical Columns</h3>", unsafe_allow_html=True)
 
70
  if categorical_cols:
71
  for col in categorical_cols:
72
- st.write(f"**{col}:** {df[col].nunique()} unique values")
 
73
  else:
74
  st.info("No categorical columns detectedℹ️.")
75
 
76
- st.markdown("<h3 class='section-title'>Value Counts in Categorical Columns</h3>", unsafe_allow_html=True)
 
 
77
  if categorical_cols:
78
  for col in categorical_cols:
79
  st.write(f"🔹 **{col} Value Distribution:**")
80
- st.write(df[col].value_counts().head(10))
81
  else:
82
  st.info("No categorical columns detectedℹ️.")
83
-
84
- st.markdown("<h3 class='section-title'>Summary Statistics for Numerical Columns</h3>", unsafe_allow_html=True)
 
 
 
85
  st.write(df.describe())
 
 
86
 
87
- st.markdown("<h3 class='section-title'>Summary Statistics for Categorical Columns</h3>", unsafe_allow_html=True)
88
  if categorical_cols:
89
  st.write(df[categorical_cols].describe(include='object'))
90
  else:
91
  st.info("No categorical columns detectedℹ️.")
92
-
93
- st.markdown("<h3 class='section-title'>Missing Values in the Dataset⚠️</h3>", unsafe_allow_html=True)
94
  missing_values = df.isnull().sum()
 
95
  if missing_values.sum() == 0:
96
  st.success("No missing values found!")
97
  else:
98
- st.warning("Found missing values in the dataset.")
 
99
  st.write(missing_values[missing_values > 0])
100
 
101
- st.markdown("<h3 class='section-title'>Duplicate Records</h3>", unsafe_allow_html=True)
102
  duplicate_count = df.duplicated().sum()
 
103
  if duplicate_count == 0:
104
  st.success("No duplicate records found!")
105
  else:
106
  st.warning(f"Found {duplicate_count} duplicate rows in the dataset.")
 
107
  st.dataframe(df[df.duplicated()].head())
108
 
109
- st.markdown("<h3 class='section-title'>Outlier Detection</h3>", unsafe_allow_html=True)
 
110
  if numerical_cols:
111
  outlier_info = {}
 
112
  for col in numerical_cols:
113
  Q1 = df[col].quantile(0.25)
114
  Q3 = df[col].quantile(0.75)
@@ -116,8 +106,10 @@ if "df" in st.session_state and st.session_state.df is not None:
116
  lower_bound = Q1 - 1.5 * IQR
117
  upper_bound = Q3 + 1.5 * IQR
118
  outliers = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
 
119
  if outliers > 0:
120
  outlier_info[col] = outliers
 
121
  if outlier_info:
122
  st.warning("Outliers detected:")
123
  for col, count in outlier_info.items():
@@ -126,5 +118,7 @@ if "df" in st.session_state and st.session_state.df is not None:
126
  st.success("No significant outliers detected!")
127
  else:
128
  st.info("No numerical columns detectedℹ️.")
 
 
129
  else:
130
  st.warning("No dataset found! Please upload a dataset first⚠️.")
 
3
  import io
4
 
5
  st.markdown("""
6
+ <div style="text-align: center; margin-bottom: 20px;">
7
+ <h2 style="color: #c71585; font-size: 36px;">Simple EDA: Understanding Your Data🔍</h2>
8
+ <h3 style="color: #4F4F4F; font-size: 20px;">
9
+ This helps us understand the quality of the data and see how the data looks.
10
+ </h3>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  </div>
12
  """, unsafe_allow_html=True)
13
 
14
  if "df" in st.session_state and st.session_state.df is not None:
15
  df = st.session_state.df
16
 
17
+ st.markdown("<h3 style='color: #2a52be;'>Dataset Preview📌</h3>", unsafe_allow_html=True)
18
  st.dataframe(df.head())
19
 
20
+
21
+ st.markdown("<h3 style='color: #843f5b;'>Dataset Shape</h3>", unsafe_allow_html=True)
22
  st.write(f"🔹 The dataset contains **{df.shape[0]} rows** and **{df.shape[1]} columns**.")
23
+
24
+ st.markdown("<h3 style='color: #e25822;'>Column Names & Data Types</h3>", unsafe_allow_html=True)
25
  st.write(df.dtypes)
26
 
27
+ st.markdown("<h3 style='color: #9400d3;'>Dataset Information📝</h3>", unsafe_allow_html=True)
28
+
29
  buffer = io.StringIO()
30
  df.info(buf=buffer)
31
  info_str = buffer.getvalue()
32
+ st.text(info_str)
33
 
34
+ st.markdown(f"<pre style='background-color: #f8f8f8; padding: 10px; border-radius: 5px; font-size: 14px; font-family: monospace;'>{info_str}</pre>", unsafe_allow_html=True)
35
+
36
+
37
+ st.markdown("<h3 style='color: #9400d3;'>Numerical and Categorical Columns</h3>", unsafe_allow_html=True)
38
+
39
  numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
40
  categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
41
 
42
  st.write(f"🔹 **Numerical Columns ({len(numerical_cols)}):** {', '.join(numerical_cols) if numerical_cols else 'None'}")
43
  st.write(f"🔹 **Categorical Columns ({len(categorical_cols)}):** {', '.join(categorical_cols) if categorical_cols else 'None'}")
44
 
45
+ st.markdown("<h3 style='color: #e25822;'>Unique Values in Categorical Columns</h3>", unsafe_allow_html=True)
46
+
47
  if categorical_cols:
48
  for col in categorical_cols:
49
+ unique_count = df[col].nunique()
50
+ st.write(f"**{col}:** {unique_count} unique values")
51
  else:
52
  st.info("No categorical columns detectedℹ️.")
53
 
54
+ # Value Counts in Categorical Columns
55
+ st.markdown("<h3 style='color: #9400d3;'>Value Counts in Categorical Columns</h3>", unsafe_allow_html=True)
56
+
57
  if categorical_cols:
58
  for col in categorical_cols:
59
  st.write(f"🔹 **{col} Value Distribution:**")
60
+ st.write(df[col].value_counts().head(10)) # Show top 10 categories
61
  else:
62
  st.info("No categorical columns detectedℹ️.")
63
+
64
+
65
+ st.markdown("<h3 style='color: #843f5b;'>Summary Statistics for Numerical Columns</h3>", unsafe_allow_html=True)
66
+
67
+ st.write("🔹 **Basic statistical insights into the dataset:**")
68
  st.write(df.describe())
69
+
70
+ st.markdown("<h3 style='color: #2a52be;'>Summary Statistics for Categorical Columns</h3>", unsafe_allow_html=True)
71
 
 
72
  if categorical_cols:
73
  st.write(df[categorical_cols].describe(include='object'))
74
  else:
75
  st.info("No categorical columns detectedℹ️.")
76
+
77
+ st.markdown("<h3 style='color: #9400d3;'>Missing Values in the Dataset⚠️</h3>", unsafe_allow_html=True)
78
  missing_values = df.isnull().sum()
79
+
80
  if missing_values.sum() == 0:
81
  st.success("No missing values found!")
82
  else:
83
+ st.warning(f"Found missing values in the dataset.")
84
+ st.write("🔹 **Columns with Missing Values:**")
85
  st.write(missing_values[missing_values > 0])
86
 
87
+ st.markdown("<h3 style='color: #2a52be;'>Duplicate Records</h3>", unsafe_allow_html=True)
88
  duplicate_count = df.duplicated().sum()
89
+
90
  if duplicate_count == 0:
91
  st.success("No duplicate records found!")
92
  else:
93
  st.warning(f"Found {duplicate_count} duplicate rows in the dataset.")
94
+ st.write("🔹 **Example Duplicate Rows:**")
95
  st.dataframe(df[df.duplicated()].head())
96
 
97
+ st.markdown("<h3 style='color: #e25822;'>Outlier Detection</h3>", unsafe_allow_html=True)
98
+
99
  if numerical_cols:
100
  outlier_info = {}
101
+
102
  for col in numerical_cols:
103
  Q1 = df[col].quantile(0.25)
104
  Q3 = df[col].quantile(0.75)
 
106
  lower_bound = Q1 - 1.5 * IQR
107
  upper_bound = Q3 + 1.5 * IQR
108
  outliers = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
109
+
110
  if outliers > 0:
111
  outlier_info[col] = outliers
112
+
113
  if outlier_info:
114
  st.warning("Outliers detected:")
115
  for col, count in outlier_info.items():
 
118
  st.success("No significant outliers detected!")
119
  else:
120
  st.info("No numerical columns detectedℹ️.")
121
+
122
+
123
  else:
124
  st.warning("No dataset found! Please upload a dataset first⚠️.")