LakshmiHarika committed on
Commit
4261a67
·
verified ·
1 Parent(s): ac70638

Update pages/2Simple_EDA.py

Browse files
Files changed (1) hide show
  1. pages/2Simple_EDA.py +94 -5
pages/2Simple_EDA.py CHANGED
@@ -1,5 +1,6 @@
1
  import streamlit as st
2
  import pandas as pd
 
3
 
4
  st.markdown("""
5
  <div style="text-align: center; margin-bottom: 20px;">
@@ -14,27 +15,115 @@ if "df" in st.session_state and st.session_state.df is not None:
# Dataset previously stored in Streamlit's session state.
df = st.session_state.df

# Dataset preview: the rows returned by DataFrame.head().
preview_heading = "<h2 style='color: #2a52be;'>Dataset Preview📌</h2>"
st.markdown(preview_heading, unsafe_allow_html=True)
st.dataframe(df.head())

# Dataset shape: row and column counts taken from DataFrame.shape.
shape_heading = "<h2 style='color: #843f5b;'>Dataset Shape📏</h2>"
st.markdown(shape_heading, unsafe_allow_html=True)
row_count, column_count = df.shape
st.write(f"🔹 The dataset contains **{row_count} rows** and **{column_count} columns**.")

# Column names and their pandas dtypes.
dtypes_heading = "<h2 style='color: #e25822;'>Column Names & Data Types🛠</h2>"
st.markdown(dtypes_heading, unsafe_allow_html=True)
st.write(df.dtypes)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
# Missing values: count the null cells in every column.
st.markdown("<h2 style='color: #9400d3;'>Missing Values in the Dataset⚠️</h2>", unsafe_allow_html=True)
missing_values = df.isna().sum()

# Counts are never negative, so "no positive entry" is the same condition as
# "the total is zero".
columns_with_gaps = missing_values[missing_values > 0]

if columns_with_gaps.empty:
    st.success("No missing values found!")
else:
    st.write("🔹 **Columns with Missing Values:**")
    st.write(columns_with_gaps)
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  else:
40
  st.warning("No dataset found! Please upload a dataset first⚠️.")
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import io
4
 
5
  st.markdown("""
6
  <div style="text-align: center; margin-bottom: 20px;">
 
# Dataset previously stored in Streamlit's session state.
df = st.session_state.df

# Dataset preview: the rows returned by DataFrame.head().
preview_heading = "<h3 style='color: #2a52be;'>Dataset Preview📌</h3>"
st.markdown(preview_heading, unsafe_allow_html=True)
st.dataframe(df.head())

# Dataset shape: row and column counts taken from DataFrame.shape.
shape_heading = "<h3 style='color: #843f5b;'>Dataset Shape📏</h3>"
st.markdown(shape_heading, unsafe_allow_html=True)
row_count, column_count = df.shape
st.write(f"🔹 The dataset contains **{row_count} rows** and **{column_count} columns**.")

# Column names and their pandas dtypes.
dtypes_heading = "<h3 style='color: #e25822;'>Column Names & Data Types🛠</h3>"
st.markdown(dtypes_heading, unsafe_allow_html=True)
st.write(df.dtypes)
28
+
# Dataset information: the textual report produced by DataFrame.info().
# The heading is an <h3> so that it matches the other section headings on this
# page (it was the only <h2> left).
st.markdown("<h3 style='color: #9400d3;'>📝 Dataset Information</h3>", unsafe_allow_html=True)

# DataFrame.info() writes its report to a stream rather than returning it, so
# it is captured in an in-memory text buffer and displayed as plain text.
buffer = io.StringIO()
df.info(buf=buffer)
info_str = buffer.getvalue()
st.text(info_str)
36
+
# Numerical and categorical columns
st.markdown("<h3 style='color: #9400d3;'>Numerical and Categorical Columns</h3>", unsafe_allow_html=True)

# "number" selects every numeric dtype instead of only int64/float64, so that
# columns stored as e.g. int32, float32 or unsigned integers are not left out
# of both lists.
numerical_cols = df.select_dtypes(include="number").columns.tolist()
categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

# Column labels are not guaranteed to be strings (a file read without a header
# row gets integer labels), and str.join() raises TypeError on non-strings, so
# every label is converted explicitly before joining.
numerical_label = ", ".join(map(str, numerical_cols)) if numerical_cols else "None"
categorical_label = ", ".join(map(str, categorical_cols)) if categorical_cols else "None"

st.write(f"🔹 **Numerical Columns ({len(numerical_cols)}):** {numerical_label}")
st.write(f"🔹 **Categorical Columns ({len(categorical_cols)}):** {categorical_label}")
45
+
# Unique values: number of distinct values in each categorical column.
st.markdown("<h3 style='color: #e25822;'>Unique Values in Categorical Columns</h3>", unsafe_allow_html=True)

if not categorical_cols:
    st.info("No categorical columns detectedℹ️.")
else:
    for column_name in categorical_cols:
        st.write(f"**{column_name}:** {df[column_name].nunique()} unique values")
55
+
# Value counts: frequency table for each categorical column.
st.markdown("<h3 style='color: #9400d3;'>Value Counts in Categorical Columns</h3>", unsafe_allow_html=True)

if not categorical_cols:
    st.info("No categorical columns detectedℹ️.")
else:
    for column_name in categorical_cols:
        st.write(f"🔹 **{column_name} Value Distribution:**")
        # Only the first 10 entries of the frequency table are shown.
        top_categories = df[column_name].value_counts().head(10)
        st.write(top_categories)
65
+
66
+
# Summary statistics for numerical columns
st.markdown("<h3 style='color: #843f5b;'>Summary Statistics for Numerical Columns</h3>", unsafe_allow_html=True)

# Describe the numeric columns explicitly. A bare df.describe() falls back to
# summarising the non-numeric columns when the frame has no numeric column,
# which would show categorical statistics under this "Numerical" heading.
numeric_frame = df.select_dtypes(include="number")

if numeric_frame.shape[1] > 0:
    st.write("🔹 **Basic statistical insights into the dataset:**")
    st.write(numeric_frame.describe())
else:
    st.info("ℹ️ No numerical columns detected.")

# Summary statistics for categorical columns
st.markdown("<h3 style='color: #2a52be;'>Summary Statistics for Categorical Columns</h3>", unsafe_allow_html=True)

if categorical_cols:
    # categorical_cols is built from both "object" and "category" dtypes, so
    # both must be included here: include='object' alone drops category
    # columns and raises ValueError when every selected column is a category.
    st.write(df[categorical_cols].describe(include=["object", "category"]))
else:
    st.info("No categorical columns detectedℹ️.")
79
 
# Missing values: count the null cells in every column.
st.markdown("<h3 style='color: #9400d3;'>Missing Values in the Dataset⚠️</h3>", unsafe_allow_html=True)
missing_values = df.isna().sum()

# Counts are never negative, so "no positive entry" is the same condition as
# "the total is zero".
columns_with_gaps = missing_values[missing_values > 0]

if columns_with_gaps.empty:
    st.success("No missing values found!")
else:
    st.write("🔹 **Columns with Missing Values:**")
    st.write(columns_with_gaps)
89
 
# Duplicate records: rows that DataFrame.duplicated() flags as repeats.
st.markdown("<h3 style='color: #2a52be;'>Duplicate Records</h3>", unsafe_allow_html=True)
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()

if duplicate_count != 0:
    st.warning(f"Found {duplicate_count} duplicate rows in the dataset.")
    st.write("🔹 **Example Duplicate Rows:**")
    # Show only the first few flagged rows as examples.
    st.dataframe(df[duplicate_mask].head())
else:
    st.success("No duplicate records found!")
100
+
# Outlier detection: count, per numerical column, the values lying more than
# 1.5 * IQR below the first quartile or above the third quartile.
st.markdown("<h3 style='color: #e25822;'>📉 Outlier Detection in Numerical Columns</h3>", unsafe_allow_html=True)

if not numerical_cols:
    st.info("ℹ️ No numerical columns detected.")
else:
    outlier_info = {}

    for column_name in numerical_cols:
        values = df[column_name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        whisker = 1.5 * (third_quartile - first_quartile)

        too_low = values < first_quartile - whisker
        too_high = values > third_quartile + whisker
        outlier_count = (too_low | too_high).sum()

        # Only columns that actually contain outliers are reported.
        if outlier_count > 0:
            outlier_info[column_name] = outlier_count

    if not outlier_info:
        st.success("✅ No significant outliers detected!")
    else:
        st.warning("⚠️ Outliers detected in the following numerical columns:")
        for column_name, outlier_count in outlier_info.items():
            st.write(f"🔹 **{column_name}:** {outlier_count} outliers")
126
+
127
 
128
  else:
129
  st.warning("No dataset found! Please upload a dataset first⚠️.")