trohith89 committed on
Commit
790a915
·
verified ·
1 Parent(s): 382d490

Update pages/3_EDA_and_Feature_Engineering.py

Browse files
Files changed (1) hide show
  1. pages/3_EDA_and_Feature_Engineering.py +121 -55
pages/3_EDA_and_Feature_Engineering.py CHANGED
@@ -82,80 +82,151 @@ if df is not None:
82
  # Univariate Analysis
83
  st.write("### Univariate Analysis")
84
 
85
-
86
  # Product Category Distribution
87
- st.write("### PRODUCT CATEGORY")
88
- fig, axs = plt.subplots(figsize=(10, 6))
89
- sns.countplot(x='Category', data=df, palette='viridis', ax=axs)
90
- axs.set_title("Product Category Distribution")
91
- axs.set_xlabel("Product Category")
92
- axs.set_ylabel("Count")
93
  plt.xticks(rotation=45)
94
  st.pyplot(fig)
95
- st.markdown('''**Insights :**
96
- - We've 5 Product Categories:
97
- 1. Smart Phones & Laptops are the most highest and similar in frequency,
98
- 2. Followed by Smart Watches,
99
- 3. Tablets and Headphones are little less in frequency overall.''')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
-
102
- # Add title and legend
103
  axs.set_title("Customer Gender Distribution")
104
  axs.legend(labels=['Female', 'Male'], loc='upper left', fontsize=12, title="Customer Gender")
105
 
106
- # Adjust layout and render plot in Streamlit
107
  plt.tight_layout()
108
  st.pyplot(fig)
109
- st.markdown('''**Insights :** Almost same proportion for both the genders
110
- - Male (49.1%)
111
- - Female (50.9%)''')
 
 
112
 
113
- st.write("### PURCHASE FREQUENCY DISTRIBUTION")
114
- # Create a figure and axis for the plot
115
  fig, axs = plt.subplots(1, 1, figsize=(10, 6))
116
-
117
- # Plot the histogram with KDE
118
  sns.histplot(df['PurchaseFrequency'], kde=True, color='purple', bins=30, ax=axs)
119
-
120
- # Add title and labels
121
  axs.set_title("Purchase Frequency Distribution")
122
  axs.set_xlabel("Purchase Frequency")
123
  axs.set_ylabel("Count")
124
-
125
- # Adjust layout and render plot in Streamlit
126
  plt.tight_layout()
127
  st.pyplot(fig)
128
  st.write("#### The Range is 1 - 19")
129
 
130
- st.write("### CUSTOMER SATISFACTION DISTRIBUTION")
131
- # Create a figure and axis for the plot
132
  fig, axs = plt.subplots(1, 1, figsize=(10, 6))
133
-
134
- # Plot the histogram with KDE using the specified color
135
  sns.histplot(df['CustomerSatisfaction'], kde=True, color=sns.color_palette("crest", n_colors=1)[0], ax=axs)
136
-
137
- # Add title and labels
138
  axs.set_title("Customer Satisfaction Distribution")
139
  axs.set_xlabel("Customer Satisfaction")
140
  axs.set_ylabel("Count")
141
-
142
- # Adjust layout and render plot in Streamlit
143
  plt.tight_layout()
144
  st.pyplot(fig)
 
 
 
 
145
 
146
- st.markdown('''**Insights :**
147
-
148
- - **Multimodal Distribution:** The most striking aspect is the multimodal nature of the distribution. There are distinct peaks around the integer values (1, 2, 3, 4, 5). This suggests that customers tend to provide whole-number ratings rather than choosing intermediate values.
149
-
150
- - **Relatively Uniform Peaks:** The peaks seem relatively uniform in height, indicating a somewhat even distribution of satisfaction levels across the rating scale. This might imply that there isn't a strong concentration of extremely satisfied or dissatisfied customers.''')
151
-
152
- st.write("### CUSTOMER SATISFACTION DISTRIBUTION")
153
  purchase_intent_counts = df['PurchaseIntent'].value_counts()
154
 
155
- # Create a figure and axis for the plot
156
  fig, axs = plt.subplots(1, 1, figsize=(8, 6))
157
-
158
- # Plot the pie chart
159
  wedges, texts, autotexts = axs.pie(purchase_intent_counts,
160
  labels=purchase_intent_counts.index,
161
  colors=sns.color_palette("coolwarm", n_colors=len(purchase_intent_counts)),
@@ -163,22 +234,17 @@ if df is not None:
163
  startangle=90,
164
  wedgeprops={'edgecolor': 'black'})
165
 
166
- # Add legend and title
167
  axs.legend(wedges, purchase_intent_counts.index, title="Purchase Intent", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
168
  axs.set_title("Purchase Intent Distribution")
169
 
170
- # Adjust layout and render plot in Streamlit
171
  plt.tight_layout()
172
  st.pyplot(fig)
173
-
174
- st.markdown('''**Insights :**
175
- - We've 0 and 1 which means Not Purchase and Purchase.
176
-
177
- - A binary classification problem.
178
-
179
- - 0: Not Purchase --> 43.4%
180
-
181
- - 1: Purchase --> 56.6%''')
182
 
183
  st.write("## **Bivariate and MultivariateAnalysis**")
184
  st.write("### Ploting Each Variable Against Target Variable")
 
82
  # Univariate Analysis
83
  st.write("### Univariate Analysis")
84
 
 
85
  # Product Category Distribution
86
+ st.write("### Product Category Distribution")
87
+ fig, ax = plt.subplots(figsize=(10, 6))
88
+ sns.countplot(x='Category', data=df, palette='viridis', ax=ax)
89
+ ax.set_title("Product Category Distribution")
90
+ ax.set_xlabel("Product Category")
91
+ ax.set_ylabel("Count")
92
  plt.xticks(rotation=45)
93
  st.pyplot(fig)
94
+ st.markdown('''**Insights:**
95
+ - We have 5 Product Categories:
96
+ 1. **Smart Phones & Laptops** have the highest and similar frequency.
97
+ 2. **Smart Watches** follow, with a moderate frequency.
98
+ 3. **Tablets and Headphones** have slightly lower frequency overall.
99
+ ''')
100
+
101
+ # Product Brand Distribution
102
+ st.write("### Product Brand Distribution")
103
+ fig, ax = plt.subplots(figsize=(10, 6))
104
+ sns.countplot(x='Brand', data=df, palette='cubehelix', ax=ax)
105
+ ax.set_title("Product Brand Distribution")
106
+ ax.set_xlabel("Product Brand")
107
+ ax.set_ylabel("Count")
108
+ plt.xticks(rotation=45)
109
+ st.pyplot(fig)
110
+ st.markdown('''**Insights:**
111
+ - We have 5 Brand Categories:
112
+ 1. **Samsung & HP** are the most frequent brands, with similar counts.
113
+ 2. **Sony, Apple, and other brands** follow with lower frequencies.
114
+ ''')
115
+
116
+ # Price Distribution
117
+ st.write("### Price Distribution")
118
+ fig, ax = plt.subplots(figsize=(10, 6))
119
+ sns.histplot(df['Price'], kde=True, color='orange', ax=ax)
120
+ ax.set_title("Product Price Distribution")
121
+ ax.set_xlabel("Product Price")
122
+ ax.set_ylabel("Count")
123
+ st.pyplot(fig)
124
+ st.markdown('''**Insights:**
125
+ - **Wide Range**: The products span a considerable price range (from near 0 to 3000).
126
+ - **Concentration**: There's a noticeable concentration of products priced between 200 and 2500.
127
+ - **Uniformity**: The distribution is somewhat uniform, with some peaks and valleys, suggesting no single dominant price point.
128
+ ''')
129
+
130
+ # Product Price Binning
131
+ st.write("### Product Price Binning")
132
+ df['ProductPriceBucket'] = pd.cut(df['Price'],
133
+ bins=[100, 500, 1000, 1500, 2000, 3000],
134
+ labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
135
+
136
+ fig, ax = plt.subplots(figsize=(10, 6))
137
+ sns.countplot(x='ProductPriceBucket', data=df, palette='icefire', ax=ax)
138
+ ax.set_title("Product Price Bucket Distribution")
139
+ ax.set_xlabel("Price Bucket")
140
+ ax.set_ylabel("Count")
141
+ plt.xticks(rotation=45)
142
+ st.pyplot(fig)
143
+ st.markdown('''**Insights:**
144
+ - **Uneven Distribution**: The distribution is not even across price buckets, indicating certain price ranges are more common.
145
+ - **"Very High" Dominance**: The "Very High" bucket contains the most products, indicating a focus on premium items.
146
+ - **Lower Representation in "Very Low"**: The "Very Low" bucket has the fewest items, suggesting fewer budget-friendly products.
147
+ - **Balanced Mid-Range**: The "Low", "Medium", and "High" buckets have relatively similar counts.
148
+ ''')
149
+
150
+ # Age Distribution and Binning
151
+ st.write("### Age Distribution and Binning")
152
+ df['CustomerAgeGroup'] = pd.qcut(df['CustomerAge'], q=4, labels=['Young', 'Middle-aged', 'Mature', 'Senior'])
153
+
154
+ fig, axs = plt.subplots(1, 2, figsize=(15, 6))
155
+
156
+ # Age Group Distribution
157
+ sns.countplot(x='CustomerAgeGroup', data=df, ax=axs[0], palette='magma')
158
+ axs[0].set_title("Customer Age Group Distribution")
159
+ axs[0].set_xlabel("Customer Age Group")
160
+ axs[0].set_ylabel("Count")
161
+
162
+ # Age Histogram
163
+ sns.histplot(df['CustomerAge'], kde=True, ax=axs[1], color='teal')
164
+ axs[1].set_title("Customer Age Distribution")
165
+ axs[1].set_xlabel("Customer Age")
166
+ axs[1].set_ylabel("Count")
167
+
168
+ plt.tight_layout()
169
+ st.pyplot(fig)
170
+ st.markdown('''**Insights:**
171
+ - **Relatively Even Distribution**: The customer age groups are relatively evenly distributed, indicating broad appeal across age demographics.
172
+ - **Slight Variation**:
173
+ - **Young** has a slightly higher count.
174
+ - **Senior** has a marginally lower count than others.
175
+ - **No Dominant Group**: There's no single dominant age group, reflecting a balanced customer base.
176
+ ''')
177
+
178
+ # Gender Distribution
179
+ st.write("### Gender Distribution")
180
+ fig, axs = plt.subplots(figsize=(8, 8))
181
+
182
+ df['CustomerGender'].value_counts().plot(kind='pie',
183
+ colors=['lightblue', 'lightpink'],
184
+ autopct='%1.1f%%',
185
+ startangle=90,
186
+ wedgeprops={'edgecolor': 'black'},
187
+ ax=axs)
188
 
 
 
189
  axs.set_title("Customer Gender Distribution")
190
  axs.legend(labels=['Female', 'Male'], loc='upper left', fontsize=12, title="Customer Gender")
191
 
 
192
  plt.tight_layout()
193
  st.pyplot(fig)
194
+ st.markdown('''**Insights:**
195
+ - **Gender Distribution** is almost balanced:
196
+ - Male: 49.1%
197
+ - Female: 50.9%
198
+ ''')
199
 
200
+ # Purchase Frequency Distribution
201
+ st.write("### Purchase Frequency Distribution")
202
  fig, axs = plt.subplots(1, 1, figsize=(10, 6))
 
 
203
  sns.histplot(df['PurchaseFrequency'], kde=True, color='purple', bins=30, ax=axs)
 
 
204
  axs.set_title("Purchase Frequency Distribution")
205
  axs.set_xlabel("Purchase Frequency")
206
  axs.set_ylabel("Count")
 
 
207
  plt.tight_layout()
208
  st.pyplot(fig)
209
  st.write("#### The Range is 1 - 19")
210
 
211
+ # Customer Satisfaction Distribution
212
+ st.write("### Customer Satisfaction Distribution")
213
  fig, axs = plt.subplots(1, 1, figsize=(10, 6))
 
 
214
  sns.histplot(df['CustomerSatisfaction'], kde=True, color=sns.color_palette("crest", n_colors=1)[0], ax=axs)
 
 
215
  axs.set_title("Customer Satisfaction Distribution")
216
  axs.set_xlabel("Customer Satisfaction")
217
  axs.set_ylabel("Count")
 
 
218
  plt.tight_layout()
219
  st.pyplot(fig)
220
+ st.markdown('''**Insights:**
221
+ - **Multimodal Distribution**: There are distinct peaks at whole-number ratings (1, 2, 3, 4, 5), suggesting customers prefer integer ratings.
222
+ - **Uniform Peaks**: The peaks are relatively uniform in height, implying a diverse range of satisfaction levels across the rating scale.
223
+ ''')
224
 
225
+ # Purchase Intent Distribution
226
+ st.write("### Purchase Intent Distribution")
 
 
 
 
 
227
  purchase_intent_counts = df['PurchaseIntent'].value_counts()
228
 
 
229
  fig, axs = plt.subplots(1, 1, figsize=(8, 6))
 
 
230
  wedges, texts, autotexts = axs.pie(purchase_intent_counts,
231
  labels=purchase_intent_counts.index,
232
  colors=sns.color_palette("coolwarm", n_colors=len(purchase_intent_counts)),
 
234
  startangle=90,
235
  wedgeprops={'edgecolor': 'black'})
236
 
 
237
  axs.legend(wedges, purchase_intent_counts.index, title="Purchase Intent", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
238
  axs.set_title("Purchase Intent Distribution")
239
 
 
240
  plt.tight_layout()
241
  st.pyplot(fig)
242
+ st.markdown('''**Insights:**
243
+ - **Binary Classification**: The Purchase Intent feature is binary:
244
+ - **Not Purchase (0)**: 43.4%
245
+ - **Purchase (1)**: 56.6%
246
+ ''')
247
+
 
 
 
248
 
249
  st.write("## **Bivariate and MultivariateAnalysis**")
250
  st.write("### Ploting Each Variable Against Target Variable")