trohith89 committed on
Commit
3bf549e
·
verified ·
1 Parent(s): 23e3572

Update pages/3_EDA_and_Feature_Engineering.py

Browse files
Files changed (1) hide show
  1. pages/3_EDA_and_Feature_Engineering.py +92 -170
pages/3_EDA_and_Feature_Engineering.py CHANGED
@@ -1,177 +1,99 @@
1
  import streamlit as st
2
- import pandas as pd
3
  import seaborn as sns
4
  import matplotlib.pyplot as plt
5
- from sklearn.preprocessing import LabelEncoder
6
 
7
- st.title("Exploratory Data Analysis (EDA)")
 
8
 
9
- if 'df' in st.session_state:
10
- df = st.session_state['df'] # Retrieve dataset
11
- st.write("### Dataset Preview")
12
- st.dataframe(df)
 
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  st.write("### Summary Statistics")
15
- st.write(df.describe())
16
-
17
- # Rename columns
18
- df = df.rename(columns={'ProductCategory': 'Category', 'ProductBrand': 'Brand', 'ProductPrice': 'Price'})
19
-
20
- # Correct Price Column values (round off to 2 decimal places)
21
- df['Price'] = df['Price'].apply(lambda x: round(x, 2))
22
-
23
- st.markdown("Complete EDA + Feature Engineering")
24
- st.markdown("## **Univariate Analysis**")
25
-
26
- st.markdown("### PRODUCT CATEGORY")
27
- st.write(fig, ax = plt.subplots(figsize=(10, 6))
28
- sns.countplot(x='Category', data=df, palette='viridis', ax=ax)
29
- ax.set_title("Product Category Distribution")
30
- ax.set_xlabel("Product Category")
31
- ax.set_ylabel("Count")
32
- ax.tick_params(axis='x', rotation=45))
33
- st.plt(fig)
34
- st.markdown("""
35
- **Insights:**
36
- - 5 product categories observed.
37
- - Highest frequency: Smart Phones and Laptops.
38
- - Lowest frequency: Tablets and Headphones.
39
- """)
40
-
41
- st.markdown("### BRANDS")
42
- fig, ax = plt.subplots(figsize=(10, 6))
43
- sns.countplot(x='Brand', data=df, palette='cubehelix', ax=ax)
44
- ax.set_title("Product Brand Distribution")
45
- ax.set_xlabel("Product Brand")
46
- ax.set_ylabel("Count")
47
- ax.tick_params(axis='x', rotation=45)
48
- st.plt(fig)
49
- st.markdown("""
50
- **Insights:**
51
- - Samsung and HP have the highest frequencies.
52
- - Sony, Apple, and other brands follow behind.
53
- """)
54
-
55
- st.markdown("### PRICE DISTRIBUTION")
56
- fig, ax = plt.subplots(figsize=(10, 6))
57
- sns.histplot(df['Price'], kde=True, color='orange', ax=ax)
58
- ax.set_title("Product Price Distribution")
59
- ax.set_xlabel("Product Price")
60
- ax.set_ylabel("Count")
61
- st.plt(fig)
62
- st.markdown("""
63
- **Insights:**
64
- - Products span a wide price range, from near 0 to 3000.
65
- - Significant concentration between 200 and 2500.
66
- """)
67
-
68
- st.markdown("### PRODUCT PRICE BINNING")
69
- df['ProductPriceBucket'] = pd.cut(df['Price'], bins=[100, 500, 1000, 1500, 2000, 3000], labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
70
- fig, ax = plt.subplots(figsize=(10, 6))
71
- sns.countplot(x='ProductPriceBucket', data=df, palette='icefire', ax=ax)
72
- ax.set_title("Product Price Distribution")
73
- ax.set_xlabel("Product Price Bucket")
74
- ax.set_ylabel("Count")
75
- ax.tick_params(axis='x', rotation=45)
76
- st.plt(fig)
77
- st.markdown("""
78
- **Insights:**
79
- - "Very High" price bucket has the highest concentration.
80
- - Fewer products in the "Very Low" bucket.
81
- - "Low", "Medium", and "High" have moderate distribution.
82
- """)
83
-
84
- st.markdown("### AGE DISTRIBUTION AND BINNING")
85
- df['CustomerAgeGroup'] = pd.qcut(df['CustomerAge'], q=4, labels=['Young', 'Middle-aged', 'Mature', 'Senior'])
86
- fig, axs = plt.subplots(1, 2, figsize=(15, 6))
87
-
88
- sns.countplot(x='CustomerAgeGroup', data=df, ax=axs[0], palette='magma')
89
- axs[0].set_title("Customer Age Group Distribution")
90
- axs[0].set_xlabel("Customer Age Group")
91
-
92
- sns.histplot(df['CustomerAge'], kde=True, ax=axs[1], color='teal')
93
- axs[1].set_title("Customer Age Distribution")
94
- axs[1].set_xlabel("Customer Age")
95
-
96
- plt.tight_layout()
97
- st.plt(fig)
98
- st.markdown("""
99
- **Insights:**
100
- - Age groups are relatively evenly distributed.
101
- - Slightly higher representation for "Young".
102
- """)
103
-
104
- st.markdown("### GENDER DISTRIBUTION")
105
- fig, ax = plt.subplots(figsize=(10, 8))
106
- df['CustomerGender'].value_counts().plot(kind='pie',
107
- colors=['lightblue', 'lightpink'],
108
- autopct='%1.1f%%',
109
- startangle=90,
110
- wedgeprops={'edgecolor': 'black'},
111
- ax=ax)
112
- ax.set_title("Customer Gender Distribution")
113
- st.plt(fig)
114
- st.markdown("""
115
- **Insights:**
116
- - Gender distribution is almost equal.
117
- - Female: 50.9%, Male: 49.1%.
118
- """)
119
-
120
- st.markdown("### PURCHASE FREQUENCY DISTRIBUTION")
121
- fig, ax = plt.subplots(figsize=(10, 6))
122
- sns.histplot(df['PurchaseFrequency'], kde=True, color='purple', bins=30, ax=ax)
123
- ax.set_title("Purchase Frequency Distribution")
124
- ax.set_xlabel("Purchase Frequency")
125
- ax.set_ylabel("Count")
126
- st.plt(fig)
127
- st.markdown("""
128
- **Insights:**
129
- - Purchase frequencies range from 1 to 19.
130
- """)
131
-
132
- st.markdown("### CUSTOMER SATISFACTION DISTRIBUTION")
133
- fig, ax = plt.subplots(figsize=(10, 6))
134
- sns.histplot(df['CustomerSatisfaction'], kde=True, color=sns.color_palette("crest", n_colors=1)[0], ax=ax)
135
- ax.set_title("Customer Satisfaction Distribution")
136
- ax.set_xlabel("Customer Satisfaction")
137
- ax.set_ylabel("Count")
138
- st.plt(fig)
139
- st.markdown("""
140
- **Insights:**
141
- - Distinct peaks around integer values (1-5).
142
- - Indicates customers tend to provide whole-number ratings.
143
- """)
144
-
145
- st.markdown("### PURCHASE INTENT DISTRIBUTION")
146
- fig, ax = plt.subplots(figsize=(8, 6))
147
- purchase_intent_counts = df['PurchaseIntent'].value_counts()
148
- ax.pie(purchase_intent_counts,
149
- labels=purchase_intent_counts.index,
150
- colors=sns.color_palette("coolwarm", n_colors=len(purchase_intent_counts)),
151
- autopct='%1.1f%%',
152
- startangle=90,
153
- wedgeprops={'edgecolor': 'black'})
154
- ax.set_title("Purchase Intent Distribution")
155
- st.plt(fig)
156
- st.markdown("""
157
- **Insights:**
158
- - Binary classification problem (0: Not Purchase, 1: Purchase).
159
- - 56.6% intent to purchase, 43.4% do not.
160
- """)
161
-
162
- st.markdown("### CORRELATION HEATMAP")
163
- label_encoder = LabelEncoder()
164
- df['CustomerGender'] = label_encoder.fit_transform(df['CustomerGender'])
165
- df['Brand'] = label_encoder.fit_transform(df['Brand'])
166
- df['Category'] = label_encoder.fit_transform(df['Category'])
167
-
168
- corr_matrix = df.corr()
169
- fig, ax = plt.subplots(figsize=(12, 8))
170
- sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
171
- ax.set_title("Correlation Heatmap")
172
- st.plt(fig)
173
- st.markdown("""
174
- **Insights:**
175
- - Strong correlations can be observed between certain variables.
176
- - Customer Satisfaction and Purchase Intent might have meaningful relationships.
177
- """)
 
1
  import streamlit as st
 
2
  import seaborn as sns
3
  import matplotlib.pyplot as plt
 
4
 
5
+ # Page Title
6
+ st.title("Complete EDA Page")
7
 
8
+ st.markdown("""
9
+ ### Perform a Detailed Exploratory Data Analysis (EDA)
10
+ This page uses the previously uploaded dataset and provides comprehensive analysis, including advanced visualizations and insights.
11
+ ---
12
+ """)
13
 
14
+ # Check if dataset exists in session state
15
+ if 'df' in st.session_state:
16
+ data = st.session_state['df']
17
+ st.success("Dataset loaded successfully from previous upload.")
18
+
19
+ # Display Dataset Overview
20
+ st.write("### Dataset Overview")
21
+ st.write(data.head())
22
+
23
+ st.write("### General Information")
24
+ st.write(f"Number of Rows: {data.shape[0]}")
25
+ st.write(f"Number of Columns: {data.shape[1]}")
26
+ st.write("Column Names:", list(data.columns))
27
+
28
+ # Visualize Numeric Columns
29
+ numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
30
+ if len(numeric_columns) > 0:
31
+ st.write("### Numeric Column Analysis")
32
+
33
+ # Pairplot
34
+ st.write("#### Pairplot for Numeric Columns")
35
+ fig = sns.pairplot(data[numeric_columns], diag_kind='kde', palette='husl')
36
+ st.pyplot(fig)
37
+
38
+ # Correlation Heatmap
39
+ st.write("#### Correlation Heatmap")
40
+ corr_matrix = data[numeric_columns].corr()
41
+ fig, ax = plt.subplots(figsize=(10, 8))
42
+ sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
43
+ st.pyplot(fig)
44
+
45
+ # Histograms
46
+ st.write("#### Histograms")
47
+ for col in numeric_columns:
48
+ fig, ax = plt.subplots()
49
+ sns.histplot(data[col], kde=True, palette='crest', ax=ax)
50
+ ax.set_title(f'Histogram of {col}')
51
+ st.pyplot(fig)
52
+
53
+ else:
54
+ st.write("No numeric columns available for analysis.")
55
+
56
+ # Visualize Categorical Columns
57
+ categorical_columns = data.select_dtypes(include=['object', 'category']).columns
58
+ if len(categorical_columns) > 0:
59
+ st.write("### Categorical Column Analysis")
60
+
61
+ # Frequency Count Plots
62
+ for col in categorical_columns:
63
+ st.write(f"#### Bar Plot for {col}")
64
+ fig, ax = plt.subplots()
65
+ sns.countplot(x=col, data=data, palette='viridis', ax=ax)
66
+ ax.set_title(f'Bar Plot of {col}')
67
+ st.pyplot(fig)
68
+
69
+ else:
70
+ st.write("No categorical columns available for analysis.")
71
+
72
+ # Advanced Analysis: Boxplots
73
+ if len(numeric_columns) > 0 and len(categorical_columns) > 0:
74
+ st.write("### Boxplot Analysis")
75
+
76
+ selected_num_col = st.selectbox("Select Numeric Column for Boxplot", numeric_columns)
77
+ selected_cat_col = st.selectbox("Select Categorical Column for Boxplot", categorical_columns)
78
+
79
+ fig, ax = plt.subplots()
80
+ sns.boxplot(x=selected_cat_col, y=selected_num_col, data=data, palette='mako', ax=ax)
81
+ ax.set_title(f'Boxplot of {selected_num_col} by {selected_cat_col}')
82
+ st.pyplot(fig)
83
+
84
+ # Missing Values Heatmap
85
+ st.write("### Missing Values Analysis")
86
+ if data.isnull().values.any():
87
+ st.write("#### Missing Value Heatmap")
88
+ fig, ax = plt.subplots(figsize=(10, 6))
89
+ sns.heatmap(data.isnull(), cbar=False, cmap='viridis', ax=ax)
90
+ st.pyplot(fig)
91
+ else:
92
+ st.write("No missing values found in the dataset.")
93
+
94
+ # Summary Statistics
95
  st.write("### Summary Statistics")
96
+ st.write(data.describe(include='all'))
97
+
98
+ else:
99
+ st.error("No dataset found. Please upload a dataset on the previous page first.")