Spaces:

trohith89
/

Electronics-Sales-Classification

Sleeping

App Files Community

trohith89 committed on Jan 13, 2025

Commit

790a915

verified ·

1 Parent(s): 382d490

Update pages/3_EDA_and_Feature_Engineering.py

Browse files

Files changed (1) hide show

pages/3_EDA_and_Feature_Engineering.py +121 -55

pages/3_EDA_and_Feature_Engineering.py CHANGED Viewed

@@ -82,80 +82,151 @@ if df is not None:
     # Univariate Analysis
     st.write("### Univariate Analysis")
     # Product Category Distribution
-    st.write("### PRODUCT CATEGORY")
-    fig, axs = plt.subplots(figsize=(10, 6))
-    sns.countplot(x='Category', data=df, palette='viridis', ax=axs)
-    axs.set_title("Product Category Distribution")
-    axs.set_xlabel("Product Category")
-    axs.set_ylabel("Count")
     plt.xticks(rotation=45)
     st.pyplot(fig)
-    st.markdown('''**Insights :**
-    - We've 5 Product Categories:
-    1. Smart Phones & Laptops are the most highest and similar in frequency,
-    2. Followed by Smart Watches,
-    3. Tablets and Headphones are little less in frequency overall.''')
-    # Add title and legend
     axs.set_title("Customer Gender Distribution")
     axs.legend(labels=['Female', 'Male'], loc='upper left', fontsize=12, title="Customer Gender")
-    # Adjust layout and render plot in Streamlit
     plt.tight_layout()
     st.pyplot(fig)
-    st.markdown('''**Insights :** Almost same proportion for both the genders
-    - Male (49.1%)
-    - Female (50.9%)''')
-    st.write("### PURCHASE FREQUENCY DISTRIBUTION")
-    # Create a figure and axis for the plot
     fig, axs = plt.subplots(1, 1, figsize=(10, 6))
-    # Plot the histogram with KDE
     sns.histplot(df['PurchaseFrequency'], kde=True, color='purple', bins=30, ax=axs)
-    # Add title and labels
     axs.set_title("Purchase Frequency Distribution")
     axs.set_xlabel("Purchase Frequency")
     axs.set_ylabel("Count")
-    # Adjust layout and render plot in Streamlit
     plt.tight_layout()
     st.pyplot(fig)
     st.write("#### The Range is 1 - 19")
-    st.write("### CUSTOMER SATISFACTION DISTRIBUTION")
-    # Create a figure and axis for the plot
     fig, axs = plt.subplots(1, 1, figsize=(10, 6))
-    # Plot the histogram with KDE using the specified color
     sns.histplot(df['CustomerSatisfaction'], kde=True, color=sns.color_palette("crest", n_colors=1)[0], ax=axs)
-    # Add title and labels
     axs.set_title("Customer Satisfaction Distribution")
     axs.set_xlabel("Customer Satisfaction")
     axs.set_ylabel("Count")
-    # Adjust layout and render plot in Streamlit
     plt.tight_layout()
     st.pyplot(fig)
-    st.markdown('''**Insights :**
-    - **Multimodal Distribution:** The most striking aspect is the multimodal nature of the distribution. There are distinct peaks around the integer values (1, 2, 3, 4, 5). This suggests that customers tend to provide whole-number ratings rather than choosing intermediate values.
-    - **Relatively Uniform Peaks:** The peaks seem relatively uniform in height, indicating a somewhat even distribution of satisfaction levels across the rating scale. This might imply that there isn't a strong concentration of extremely satisfied or dissatisfied customers.''')
-    st.write("### CUSTOMER SATISFACTION DISTRIBUTION")
     purchase_intent_counts = df['PurchaseIntent'].value_counts()
-    # Create a figure and axis for the plot
     fig, axs = plt.subplots(1, 1, figsize=(8, 6))
-    # Plot the pie chart
     wedges, texts, autotexts = axs.pie(purchase_intent_counts,
                                        labels=purchase_intent_counts.index,
                                        colors=sns.color_palette("coolwarm", n_colors=len(purchase_intent_counts)),
@@ -163,22 +234,17 @@ if df is not None:
                                        startangle=90,
                                        wedgeprops={'edgecolor': 'black'})
-    # Add legend and title
     axs.legend(wedges, purchase_intent_counts.index, title="Purchase Intent", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
     axs.set_title("Purchase Intent Distribution")
-    # Adjust layout and render plot in Streamlit
     plt.tight_layout()
     st.pyplot(fig)
-    st.markdown('''**Insights :**
-    - We've 0 and 1 which means Not Purchase and Purchase.
-    - A binary classification problem.
-    - 0: Not Purchase --> 43.4%
-    - 1: Purchase --> 56.6%''')
     st.write("## **Bivariate and MultivariateAnalysis**")
     st.write("### Ploting Each Variable Against Target Variable")

     # Univariate Analysis
     st.write("### Univariate Analysis")
     # Product Category Distribution
+    st.write("### Product Category Distribution")
+    fig, ax = plt.subplots(figsize=(10, 6))
+    sns.countplot(x='Category', data=df, palette='viridis', ax=ax)
+    ax.set_title("Product Category Distribution")
+    ax.set_xlabel("Product Category")
+    ax.set_ylabel("Count")
     plt.xticks(rotation=45)
     st.pyplot(fig)
+    st.markdown('''**Insights:**
+    - We have 5 Product Categories:
+        1. **Smart Phones & Laptops** have the highest and similar frequency.
+        2. **Smart Watches** follow, with a moderate frequency.
+        3. **Tablets and Headphones** have slightly lower frequency overall.
+    ''')
+    # Product Brand Distribution
+    st.write("### Product Brand Distribution")
+    fig, ax = plt.subplots(figsize=(10, 6))
+    sns.countplot(x='Brand', data=df, palette='cubehelix', ax=ax)
+    ax.set_title("Product Brand Distribution")
+    ax.set_xlabel("Product Brand")
+    ax.set_ylabel("Count")
+    plt.xticks(rotation=45)
+    st.pyplot(fig)
+    st.markdown('''**Insights:**
+    - We have 5 Brand Categories:
+        1. **Samsung & HP** are the most frequent brands, with similar counts.
+        2. **Sony, Apple, and other brands** follow with lower frequencies.
+    ''')
+    # Price Distribution
+    st.write("### Price Distribution")
+    fig, ax = plt.subplots(figsize=(10, 6))
+    sns.histplot(df['Price'], kde=True, color='orange', ax=ax)
+    ax.set_title("Product Price Distribution")
+    ax.set_xlabel("Product Price")
+    ax.set_ylabel("Count")
+    st.pyplot(fig)
+    st.markdown('''**Insights:**
+    - **Wide Range**: The products span a considerable price range (from near 0 to 3000).
+    - **Concentration**: There's a noticeable concentration of products priced between 200 and 2500.
+    - **Uniformity**: The distribution is somewhat uniform, with some peaks and valleys, suggesting no single dominant price point.
+    ''')
+    # Product Price Binning
+    st.write("### Product Price Binning")
+    df['ProductPriceBucket'] = pd.cut(df['Price'],
+                                      bins=[100, 500, 1000, 1500, 2000, 3000],
+                                      labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
+    fig, ax = plt.subplots(figsize=(10, 6))
+    sns.countplot(x='ProductPriceBucket', data=df, palette='icefire', ax=ax)
+    ax.set_title("Product Price Bucket Distribution")
+    ax.set_xlabel("Price Bucket")
+    ax.set_ylabel("Count")
+    plt.xticks(rotation=45)
+    st.pyplot(fig)
+    st.markdown('''**Insights:**
+    - **Uneven Distribution**: The distribution is not even across price buckets, indicating certain price ranges are more common.
+    - **"Very High" Dominance**: The "Very High" bucket contains the most products, indicating a focus on premium items.
+    - **Lower Representation in "Very Low"**: The "Very Low" bucket has the fewest items, suggesting fewer budget-friendly products.
+    - **Balanced Mid-Range**: The "Low", "Medium", and "High" buckets have relatively similar counts.
+    ''')
+    # Age Distribution and Binning
+    st.write("### Age Distribution and Binning")
+    df['CustomerAgeGroup'] = pd.qcut(df['CustomerAge'], q=4, labels=['Young', 'Middle-aged', 'Mature', 'Senior'])
+    fig, axs = plt.subplots(1, 2, figsize=(15, 6))
+    # Age Group Distribution
+    sns.countplot(x='CustomerAgeGroup', data=df, ax=axs[0], palette='magma')
+    axs[0].set_title("Customer Age Group Distribution")
+    axs[0].set_xlabel("Customer Age Group")
+    axs[0].set_ylabel("Count")
+    # Age Histogram
+    sns.histplot(df['CustomerAge'], kde=True, ax=axs[1], color='teal')
+    axs[1].set_title("Customer Age Distribution")
+    axs[1].set_xlabel("Customer Age")
+    axs[1].set_ylabel("Count")
+    plt.tight_layout()
+    st.pyplot(fig)
+    st.markdown('''**Insights:**
+    - **Relatively Even Distribution**: The customer age groups are relatively evenly distributed, indicating broad appeal across age demographics.
+    - **Slight Variation**:
+        - **Young** has a slightly higher count.
+        - **Senior** has a marginally lower count than others.
+    - **No Dominant Group**: There's no single dominant age group, reflecting a balanced customer base.
+    ''')
+    # Gender Distribution
+    st.write("### Gender Distribution")
+    fig, axs = plt.subplots(figsize=(8, 8))
+    df['CustomerGender'].value_counts().plot(kind='pie',
+                                              colors=['lightblue', 'lightpink'],
+                                              autopct='%1.1f%%',
+                                              startangle=90,
+                                              wedgeprops={'edgecolor': 'black'},
+                                              ax=axs)
     axs.set_title("Customer Gender Distribution")
     axs.legend(labels=['Female', 'Male'], loc='upper left', fontsize=12, title="Customer Gender")
     plt.tight_layout()
     st.pyplot(fig)
+    st.markdown('''**Insights:**
+    - **Gender Distribution** is almost balanced:
+        - Male: 49.1%
+        - Female: 50.9%
+    ''')
+    # Purchase Frequency Distribution
+    st.write("### Purchase Frequency Distribution")
     fig, axs = plt.subplots(1, 1, figsize=(10, 6))
     sns.histplot(df['PurchaseFrequency'], kde=True, color='purple', bins=30, ax=axs)
     axs.set_title("Purchase Frequency Distribution")
     axs.set_xlabel("Purchase Frequency")
     axs.set_ylabel("Count")
     plt.tight_layout()
     st.pyplot(fig)
     st.write("#### The Range is 1 - 19")
+    # Customer Satisfaction Distribution
+    st.write("### Customer Satisfaction Distribution")
     fig, axs = plt.subplots(1, 1, figsize=(10, 6))
     sns.histplot(df['CustomerSatisfaction'], kde=True, color=sns.color_palette("crest", n_colors=1)[0], ax=axs)
     axs.set_title("Customer Satisfaction Distribution")
     axs.set_xlabel("Customer Satisfaction")
     axs.set_ylabel("Count")
     plt.tight_layout()
     st.pyplot(fig)
+    st.markdown('''**Insights:**
+    - **Multimodal Distribution**: There are distinct peaks at whole-number ratings (1, 2, 3, 4, 5), suggesting customers prefer integer ratings.
+    - **Uniform Peaks**: The peaks are relatively uniform in height, implying a diverse range of satisfaction levels across the rating scale.
+    ''')
+    # Purchase Intent Distribution
+    st.write("### Purchase Intent Distribution")
     purchase_intent_counts = df['PurchaseIntent'].value_counts()
     fig, axs = plt.subplots(1, 1, figsize=(8, 6))
     wedges, texts, autotexts = axs.pie(purchase_intent_counts,
                                        labels=purchase_intent_counts.index,
                                        colors=sns.color_palette("coolwarm", n_colors=len(purchase_intent_counts)),
                                        startangle=90,
                                        wedgeprops={'edgecolor': 'black'})
     axs.legend(wedges, purchase_intent_counts.index, title="Purchase Intent", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
     axs.set_title("Purchase Intent Distribution")
     plt.tight_layout()
     st.pyplot(fig)
+    st.markdown('''**Insights:**
+    - **Binary Classification**: The Purchase Intent feature is binary:
+        - **Not Purchase (0)**: 43.4%
+        - **Purchase (1)**: 56.6%
+    ''')
     st.write("## **Bivariate and MultivariateAnalysis**")
     st.write("### Ploting Each Variable Against Target Variable")