Update pages/3_EDA_and_Feature_Engineering.py
Browse files
pages/3_EDA_and_Feature_Engineering.py
CHANGED
|
@@ -82,80 +82,151 @@ if df is not None:
|
|
| 82 |
# Univariate Analysis
|
| 83 |
st.write("### Univariate Analysis")
|
| 84 |
|
| 85 |
-
|
| 86 |
# Product Category Distribution
|
| 87 |
-
st.write("###
|
| 88 |
-
fig,
|
| 89 |
-
sns.countplot(x='Category', data=df, palette='viridis', ax=
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
plt.xticks(rotation=45)
|
| 94 |
st.pyplot(fig)
|
| 95 |
-
st.markdown('''**Insights
|
| 96 |
-
- We
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
-
|
| 102 |
-
# Add title and legend
|
| 103 |
axs.set_title("Customer Gender Distribution")
|
| 104 |
axs.legend(labels=['Female', 'Male'], loc='upper left', fontsize=12, title="Customer Gender")
|
| 105 |
|
| 106 |
-
# Adjust layout and render plot in Streamlit
|
| 107 |
plt.tight_layout()
|
| 108 |
st.pyplot(fig)
|
| 109 |
-
st.markdown('''**Insights
|
| 110 |
-
-
|
| 111 |
-
|
|
|
|
|
|
|
| 112 |
|
| 113 |
-
|
| 114 |
-
|
| 115 |
fig, axs = plt.subplots(1, 1, figsize=(10, 6))
|
| 116 |
-
|
| 117 |
-
# Plot the histogram with KDE
|
| 118 |
sns.histplot(df['PurchaseFrequency'], kde=True, color='purple', bins=30, ax=axs)
|
| 119 |
-
|
| 120 |
-
# Add title and labels
|
| 121 |
axs.set_title("Purchase Frequency Distribution")
|
| 122 |
axs.set_xlabel("Purchase Frequency")
|
| 123 |
axs.set_ylabel("Count")
|
| 124 |
-
|
| 125 |
-
# Adjust layout and render plot in Streamlit
|
| 126 |
plt.tight_layout()
|
| 127 |
st.pyplot(fig)
|
| 128 |
st.write("#### The Range is 1 - 19")
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
| 132 |
fig, axs = plt.subplots(1, 1, figsize=(10, 6))
|
| 133 |
-
|
| 134 |
-
# Plot the histogram with KDE using the specified color
|
| 135 |
sns.histplot(df['CustomerSatisfaction'], kde=True, color=sns.color_palette("crest", n_colors=1)[0], ax=axs)
|
| 136 |
-
|
| 137 |
-
# Add title and labels
|
| 138 |
axs.set_title("Customer Satisfaction Distribution")
|
| 139 |
axs.set_xlabel("Customer Satisfaction")
|
| 140 |
axs.set_ylabel("Count")
|
| 141 |
-
|
| 142 |
-
# Adjust layout and render plot in Streamlit
|
| 143 |
plt.tight_layout()
|
| 144 |
st.pyplot(fig)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
- **Multimodal Distribution:** The most striking aspect is the multimodal nature of the distribution. There are distinct peaks around the integer values (1, 2, 3, 4, 5). This suggests that customers tend to provide whole-number ratings rather than choosing intermediate values.
|
| 149 |
-
|
| 150 |
-
- **Relatively Uniform Peaks:** The peaks seem relatively uniform in height, indicating a somewhat even distribution of satisfaction levels across the rating scale. This might imply that there isn't a strong concentration of extremely satisfied or dissatisfied customers.''')
|
| 151 |
-
|
| 152 |
-
st.write("### CUSTOMER SATISFACTION DISTRIBUTION")
|
| 153 |
purchase_intent_counts = df['PurchaseIntent'].value_counts()
|
| 154 |
|
| 155 |
-
# Create a figure and axis for the plot
|
| 156 |
fig, axs = plt.subplots(1, 1, figsize=(8, 6))
|
| 157 |
-
|
| 158 |
-
# Plot the pie chart
|
| 159 |
wedges, texts, autotexts = axs.pie(purchase_intent_counts,
|
| 160 |
labels=purchase_intent_counts.index,
|
| 161 |
colors=sns.color_palette("coolwarm", n_colors=len(purchase_intent_counts)),
|
|
@@ -163,22 +234,17 @@ if df is not None:
|
|
| 163 |
startangle=90,
|
| 164 |
wedgeprops={'edgecolor': 'black'})
|
| 165 |
|
| 166 |
-
# Add legend and title
|
| 167 |
axs.legend(wedges, purchase_intent_counts.index, title="Purchase Intent", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
|
| 168 |
axs.set_title("Purchase Intent Distribution")
|
| 169 |
|
| 170 |
-
# Adjust layout and render plot in Streamlit
|
| 171 |
plt.tight_layout()
|
| 172 |
st.pyplot(fig)
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
- 0: Not Purchase --> 43.4%
|
| 180 |
-
|
| 181 |
-
- 1: Purchase --> 56.6%''')
|
| 182 |
|
| 183 |
st.write("## **Bivariate and MultivariateAnalysis**")
|
| 184 |
st.write("### Ploting Each Variable Against Target Variable")
|
|
|
|
| 82 |
# Univariate Analysis
|
| 83 |
st.write("### Univariate Analysis")
|
| 84 |
|
|
|
|
| 85 |
# Product Category Distribution
|
| 86 |
+
st.write("### Product Category Distribution")
|
| 87 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
| 88 |
+
sns.countplot(x='Category', data=df, palette='viridis', ax=ax)
|
| 89 |
+
ax.set_title("Product Category Distribution")
|
| 90 |
+
ax.set_xlabel("Product Category")
|
| 91 |
+
ax.set_ylabel("Count")
|
| 92 |
plt.xticks(rotation=45)
|
| 93 |
st.pyplot(fig)
|
| 94 |
+
st.markdown('''**Insights:**
|
| 95 |
+
- We have 5 Product Categories:
|
| 96 |
+
1. **Smart Phones & Laptops** have the highest and similar frequency.
|
| 97 |
+
2. **Smart Watches** follow, with a moderate frequency.
|
| 98 |
+
3. **Tablets and Headphones** have slightly lower frequency overall.
|
| 99 |
+
''')
|
| 100 |
+
|
| 101 |
+
# Product Brand Distribution
|
| 102 |
+
st.write("### Product Brand Distribution")
|
| 103 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
| 104 |
+
sns.countplot(x='Brand', data=df, palette='cubehelix', ax=ax)
|
| 105 |
+
ax.set_title("Product Brand Distribution")
|
| 106 |
+
ax.set_xlabel("Product Brand")
|
| 107 |
+
ax.set_ylabel("Count")
|
| 108 |
+
plt.xticks(rotation=45)
|
| 109 |
+
st.pyplot(fig)
|
| 110 |
+
st.markdown('''**Insights:**
|
| 111 |
+
- We have 5 Brand Categories:
|
| 112 |
+
1. **Samsung & HP** are the most frequent brands, with similar counts.
|
| 113 |
+
2. **Sony, Apple, and other brands** follow with lower frequencies.
|
| 114 |
+
''')
|
| 115 |
+
|
| 116 |
+
# Price Distribution
|
| 117 |
+
st.write("### Price Distribution")
|
| 118 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
| 119 |
+
sns.histplot(df['Price'], kde=True, color='orange', ax=ax)
|
| 120 |
+
ax.set_title("Product Price Distribution")
|
| 121 |
+
ax.set_xlabel("Product Price")
|
| 122 |
+
ax.set_ylabel("Count")
|
| 123 |
+
st.pyplot(fig)
|
| 124 |
+
st.markdown('''**Insights:**
|
| 125 |
+
- **Wide Range**: The products span a considerable price range (from near 0 to 3000).
|
| 126 |
+
- **Concentration**: There's a noticeable concentration of products priced between 200 and 2500.
|
| 127 |
+
- **Uniformity**: The distribution is somewhat uniform, with some peaks and valleys, suggesting no single dominant price point.
|
| 128 |
+
''')
|
| 129 |
+
|
| 130 |
+
# Product Price Binning
|
| 131 |
+
st.write("### Product Price Binning")
|
| 132 |
+
df['ProductPriceBucket'] = pd.cut(df['Price'],
|
| 133 |
+
bins=[100, 500, 1000, 1500, 2000, 3000],
|
| 134 |
+
labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
|
| 135 |
+
|
| 136 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
| 137 |
+
sns.countplot(x='ProductPriceBucket', data=df, palette='icefire', ax=ax)
|
| 138 |
+
ax.set_title("Product Price Bucket Distribution")
|
| 139 |
+
ax.set_xlabel("Price Bucket")
|
| 140 |
+
ax.set_ylabel("Count")
|
| 141 |
+
plt.xticks(rotation=45)
|
| 142 |
+
st.pyplot(fig)
|
| 143 |
+
st.markdown('''**Insights:**
|
| 144 |
+
- **Uneven Distribution**: The distribution is not even across price buckets, indicating certain price ranges are more common.
|
| 145 |
+
- **"Very High" Dominance**: The "Very High" bucket contains the most products, indicating a focus on premium items.
|
| 146 |
+
- **Lower Representation in "Very Low"**: The "Very Low" bucket has the fewest items, suggesting fewer budget-friendly products.
|
| 147 |
+
- **Balanced Mid-Range**: The "Low", "Medium", and "High" buckets have relatively similar counts.
|
| 148 |
+
''')
|
| 149 |
+
|
| 150 |
+
# Age Distribution and Binning
|
| 151 |
+
st.write("### Age Distribution and Binning")
|
| 152 |
+
df['CustomerAgeGroup'] = pd.qcut(df['CustomerAge'], q=4, labels=['Young', 'Middle-aged', 'Mature', 'Senior'])
|
| 153 |
+
|
| 154 |
+
fig, axs = plt.subplots(1, 2, figsize=(15, 6))
|
| 155 |
+
|
| 156 |
+
# Age Group Distribution
|
| 157 |
+
sns.countplot(x='CustomerAgeGroup', data=df, ax=axs[0], palette='magma')
|
| 158 |
+
axs[0].set_title("Customer Age Group Distribution")
|
| 159 |
+
axs[0].set_xlabel("Customer Age Group")
|
| 160 |
+
axs[0].set_ylabel("Count")
|
| 161 |
+
|
| 162 |
+
# Age Histogram
|
| 163 |
+
sns.histplot(df['CustomerAge'], kde=True, ax=axs[1], color='teal')
|
| 164 |
+
axs[1].set_title("Customer Age Distribution")
|
| 165 |
+
axs[1].set_xlabel("Customer Age")
|
| 166 |
+
axs[1].set_ylabel("Count")
|
| 167 |
+
|
| 168 |
+
plt.tight_layout()
|
| 169 |
+
st.pyplot(fig)
|
| 170 |
+
st.markdown('''**Insights:**
|
| 171 |
+
- **Relatively Even Distribution**: The customer age groups are relatively evenly distributed, indicating broad appeal across age demographics.
|
| 172 |
+
- **Slight Variation**:
|
| 173 |
+
- **Young** has a slightly higher count.
|
| 174 |
+
- **Senior** has a marginally lower count than others.
|
| 175 |
+
- **No Dominant Group**: There's no single dominant age group, reflecting a balanced customer base.
|
| 176 |
+
''')
|
| 177 |
+
|
| 178 |
+
# Gender Distribution
|
| 179 |
+
st.write("### Gender Distribution")
|
| 180 |
+
fig, axs = plt.subplots(figsize=(8, 8))
|
| 181 |
+
|
| 182 |
+
df['CustomerGender'].value_counts().plot(kind='pie',
|
| 183 |
+
colors=['lightblue', 'lightpink'],
|
| 184 |
+
autopct='%1.1f%%',
|
| 185 |
+
startangle=90,
|
| 186 |
+
wedgeprops={'edgecolor': 'black'},
|
| 187 |
+
ax=axs)
|
| 188 |
|
|
|
|
|
|
|
| 189 |
axs.set_title("Customer Gender Distribution")
|
| 190 |
axs.legend(labels=['Female', 'Male'], loc='upper left', fontsize=12, title="Customer Gender")
|
| 191 |
|
|
|
|
| 192 |
plt.tight_layout()
|
| 193 |
st.pyplot(fig)
|
| 194 |
+
st.markdown('''**Insights:**
|
| 195 |
+
- **Gender Distribution** is almost balanced:
|
| 196 |
+
- Male: 49.1%
|
| 197 |
+
- Female: 50.9%
|
| 198 |
+
''')
|
| 199 |
|
| 200 |
+
# Purchase Frequency Distribution
|
| 201 |
+
st.write("### Purchase Frequency Distribution")
|
| 202 |
fig, axs = plt.subplots(1, 1, figsize=(10, 6))
|
|
|
|
|
|
|
| 203 |
sns.histplot(df['PurchaseFrequency'], kde=True, color='purple', bins=30, ax=axs)
|
|
|
|
|
|
|
| 204 |
axs.set_title("Purchase Frequency Distribution")
|
| 205 |
axs.set_xlabel("Purchase Frequency")
|
| 206 |
axs.set_ylabel("Count")
|
|
|
|
|
|
|
| 207 |
plt.tight_layout()
|
| 208 |
st.pyplot(fig)
|
| 209 |
st.write("#### The Range is 1 - 19")
|
| 210 |
|
| 211 |
+
# Customer Satisfaction Distribution
|
| 212 |
+
st.write("### Customer Satisfaction Distribution")
|
| 213 |
fig, axs = plt.subplots(1, 1, figsize=(10, 6))
|
|
|
|
|
|
|
| 214 |
sns.histplot(df['CustomerSatisfaction'], kde=True, color=sns.color_palette("crest", n_colors=1)[0], ax=axs)
|
|
|
|
|
|
|
| 215 |
axs.set_title("Customer Satisfaction Distribution")
|
| 216 |
axs.set_xlabel("Customer Satisfaction")
|
| 217 |
axs.set_ylabel("Count")
|
|
|
|
|
|
|
| 218 |
plt.tight_layout()
|
| 219 |
st.pyplot(fig)
|
| 220 |
+
st.markdown('''**Insights:**
|
| 221 |
+
- **Multimodal Distribution**: There are distinct peaks at whole-number ratings (1, 2, 3, 4, 5), suggesting customers prefer integer ratings.
|
| 222 |
+
- **Uniform Peaks**: The peaks are relatively uniform in height, implying a diverse range of satisfaction levels across the rating scale.
|
| 223 |
+
''')
|
| 224 |
|
| 225 |
+
# Purchase Intent Distribution
|
| 226 |
+
st.write("### Purchase Intent Distribution")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
purchase_intent_counts = df['PurchaseIntent'].value_counts()
|
| 228 |
|
|
|
|
| 229 |
fig, axs = plt.subplots(1, 1, figsize=(8, 6))
|
|
|
|
|
|
|
| 230 |
wedges, texts, autotexts = axs.pie(purchase_intent_counts,
|
| 231 |
labels=purchase_intent_counts.index,
|
| 232 |
colors=sns.color_palette("coolwarm", n_colors=len(purchase_intent_counts)),
|
|
|
|
| 234 |
startangle=90,
|
| 235 |
wedgeprops={'edgecolor': 'black'})
|
| 236 |
|
|
|
|
| 237 |
axs.legend(wedges, purchase_intent_counts.index, title="Purchase Intent", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
|
| 238 |
axs.set_title("Purchase Intent Distribution")
|
| 239 |
|
|
|
|
| 240 |
plt.tight_layout()
|
| 241 |
st.pyplot(fig)
|
| 242 |
+
st.markdown('''**Insights:**
|
| 243 |
+
- **Binary Classification**: The Purchase Intent feature is binary:
|
| 244 |
+
- **Not Purchase (0)**: 43.4%
|
| 245 |
+
- **Purchase (1)**: 56.6%
|
| 246 |
+
''')
|
| 247 |
+
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
st.write("## **Bivariate and MultivariateAnalysis**")
|
| 250 |
st.write("### Ploting Each Variable Against Target Variable")
|