Spaces:

trohith89
/

Electronics-Sales-Classification

Sleeping

App Files Files Community

trohith89 committed on Jan 9, 2025

Commit

f6845ce

verified ·

1 Parent(s): 3bf549e

Update pages/3_EDA_and_Feature_Engineering.py

Browse files

Files changed (1) hide show

pages/3_EDA_and_Feature_Engineering.py +82 -82

pages/3_EDA_and_Feature_Engineering.py CHANGED Viewed

@@ -1,99 +1,99 @@
 import streamlit as st
 import seaborn as sns
 import matplotlib.pyplot as plt
 # Page Title
 # Removed side of the diff: the earlier page script. It shows an overview,
 # plots and summary statistics for the DataFrame kept in st.session_state['df'],
 # or an error message when that key is absent.
-st.title("Complete EDA Page")
 st.markdown("""
-### Perform a Detailed Exploratory Data Analysis (EDA)
-This page uses the previously uploaded dataset and provides comprehensive analysis, including advanced visualizations and insights.
 ---
 """)
 # Check if dataset exists in session state
 if 'df' in st.session_state:
-    data = st.session_state['df']
-    st.success("Dataset loaded successfully from previous upload.")
     # Display Dataset Overview
     st.write("### Dataset Overview")
-    st.write(data.head())
-    st.write("### General Information")
-    st.write(f"Number of Rows: {data.shape[0]}")
-    st.write(f"Number of Columns: {data.shape[1]}")
-    st.write("Column Names:", list(data.columns))
-    # Visualize Numeric Columns
     # Only float64/int64 columns are selected; columns with any other numeric
     # dtype are not part of numeric_columns.
-    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
-    if len(numeric_columns) > 0:
-        st.write("### Numeric Column Analysis")
-        # Pairplot
-        st.write("#### Pairplot for Numeric Columns")
         # The value returned by sns.pairplot is passed straight to st.pyplot.
         # NOTE(review): palette is given without a hue variable, so it
         # presumably has no visible effect here - confirm.
-        fig = sns.pairplot(data[numeric_columns], diag_kind='kde', palette='husl')
-        st.pyplot(fig)
-        # Correlation Heatmap
-        st.write("#### Correlation Heatmap")
-        corr_matrix = data[numeric_columns].corr()
-        fig, ax = plt.subplots(figsize=(10, 8))
-        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
-        st.pyplot(fig)
-        # Histograms
-        st.write("#### Histograms")
         # One new figure is created and rendered per numeric column.
         # NOTE(review): palette is again passed without hue - likely unused.
-        for col in numeric_columns:
-            fig, ax = plt.subplots()
-            sns.histplot(data[col], kde=True, palette='crest', ax=ax)
-            ax.set_title(f'Histogram of {col}')
-            st.pyplot(fig)
-    else:
-        st.write("No numeric columns available for analysis.")
-    # Visualize Categorical Columns
     # Columns with dtype object or category are treated as categorical.
-    categorical_columns = data.select_dtypes(include=['object', 'category']).columns
-    if len(categorical_columns) > 0:
-        st.write("### Categorical Column Analysis")
-        # Frequency Count Plots
         # One count plot per categorical column, each on its own figure.
-        for col in categorical_columns:
-            st.write(f"#### Bar Plot for {col}")
-            fig, ax = plt.subplots()
-            sns.countplot(x=col, data=data, palette='viridis', ax=ax)
-            ax.set_title(f'Bar Plot of {col}')
-            st.pyplot(fig)
-    else:
-        st.write("No categorical columns available for analysis.")
-    # Advanced Analysis: Boxplots
     # Shown only when both column groups are non-empty; the two select boxes
     # choose which numeric column is plotted against which categorical column.
-    if len(numeric_columns) > 0 and len(categorical_columns) > 0:
-        st.write("### Boxplot Analysis")
-        selected_num_col = st.selectbox("Select Numeric Column for Boxplot", numeric_columns)
-        selected_cat_col = st.selectbox("Select Categorical Column for Boxplot", categorical_columns)
-        fig, ax = plt.subplots()
-        sns.boxplot(x=selected_cat_col, y=selected_num_col, data=data, palette='mako', ax=ax)
-        ax.set_title(f'Boxplot of {selected_num_col} by {selected_cat_col}')
-        st.pyplot(fig)
-    # Missing Values Heatmap
     # The heatmap of the isnull() mask is drawn only if at least one value
     # is missing; otherwise a plain message is written.
-    st.write("### Missing Values Analysis")
-    if data.isnull().values.any():
-        st.write("#### Missing Value Heatmap")
-        fig, ax = plt.subplots(figsize=(10, 6))
-        sns.heatmap(data.isnull(), cbar=False, cmap='viridis', ax=ax)
-        st.pyplot(fig)
-    else:
-        st.write("No missing values found in the dataset.")
-    # Summary Statistics
     # include='all' asks describe() to summarise every column, not just the
     # numeric ones.
-    st.write("### Summary Statistics")
-    st.write(data.describe(include='all'))
 else:
-    st.error("No dataset found. Please upload a dataset on the previous page first.")

 import streamlit as st
+import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
+from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
 # Page Title
 # Added side of the diff: the new page script. It renders EDA charts and
 # feature-engineering output for the DataFrame kept in st.session_state['df'],
 # or an error message when that key is absent.
 # The code below reads the columns 'Category', 'Price' and 'Brand' by name;
 # nothing in this script checks that they exist.
+st.title("Complete EDA and Feature Engineering")
 st.markdown("""
+This page provides advanced Exploratory Data Analysis (EDA) and Feature Engineering using the dataset loaded in memory.
 ---
 """)
 # Check if dataset exists in session state
 if 'df' in st.session_state:
     # NOTE(review): no copy is taken, so the column assignments further down
     # (PriceBucket, Category, Brand) appear to modify the DataFrame stored in
     # session state itself - confirm whether reruns or other pages expect the
     # original string values.
+    df = st.session_state['df']
+    st.success("Dataset loaded successfully.")
     # Display Dataset Overview
     st.write("### Dataset Overview")
+    st.dataframe(df.head())
+    # Shape of Dataset
+    st.write(f"The dataset has {df.shape[0]} rows and {df.shape[1]} columns.")
+    # Univariate Analysis
+    st.write("### Univariate Analysis")
+    # Product Category Distribution
+    st.write("#### Product Category Distribution")
+    fig, ax = plt.subplots(figsize=(10, 6))
+    sns.countplot(x='Category', data=df, palette='viridis', ax=ax)
+    ax.set_title("Product Category Distribution")
+    ax.set_xlabel("Product Category")
+    ax.set_ylabel("Count")
     # plt.xticks works on pyplot's current axes rather than on `ax`;
     # presumably that is the axes created just above - confirm.
+    plt.xticks(rotation=45)
+    st.pyplot(fig)
+    # Product Price Distribution
+    st.write("#### Product Price Distribution")
+    fig, ax = plt.subplots(figsize=(10, 6))
+    sns.histplot(df['Price'], kde=True, color='orange', ax=ax)
+    ax.set_title("Product Price Distribution")
+    ax.set_xlabel("Product Price")
+    ax.set_ylabel("Count")
+    st.pyplot(fig)
+    # Bivariate Analysis: Price vs Category
+    st.write("### Price vs Category")
+    fig, ax = plt.subplots(figsize=(12, 8))
+    sns.boxplot(x='Category', y='Price', data=df, palette='mako', ax=ax)
+    ax.set_title("Product Price by Category")
+    ax.set_xlabel("Category")
+    ax.set_ylabel("Price")
+    plt.xticks(rotation=45)
+    st.pyplot(fig)
+    # Feature Engineering
+    st.write("### Feature Engineering")
+    # Binning Product Price
+    st.write("#### Product Price Binning")
     # Five labelled buckets with fixed edges 100/500/1000/1500/2000/3000.
     # NOTE(review): with pd.cut's default right-closed intervals, a price of
     # 100 or less, or above 3000, falls outside every bin and becomes NaN -
     # check these edges against the actual price range.
+    df['PriceBucket'] = pd.cut(df['Price'], bins=[100, 500, 1000, 1500, 2000, 3000],
+                               labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
+    fig, ax = plt.subplots(figsize=(10, 6))
+    sns.countplot(x='PriceBucket', data=df, palette='icefire', ax=ax)
+    ax.set_title("Product Price Buckets")
+    ax.set_xlabel("Price Bucket")
+    ax.set_ylabel("Count")
+    st.pyplot(fig)
+    # Encoding Categorical Features
+    st.write("#### Encoding Categorical Features")
     # 'Category' and 'Brand' are overwritten with their integer codes.
     # The single encoder instance is fitted twice, so after the second call
     # `le` describes 'Brand' only and the 'Category' mapping is not retained.
+    le = LabelEncoder()
+    df['Category'] = le.fit_transform(df['Category'])
+    df['Brand'] = le.fit_transform(df['Brand'])
+    st.write("Label Encoding Applied on 'Category' and 'Brand'.")
+    # Polynomial Features for Numerical Columns
+    st.write("#### Polynomial Feature Engineering")
     # The selection happens after encoding, so the encoded 'Category'/'Brand'
     # columns are presumably included if their dtype is int64 - confirm.
     # No missing-value handling is visible before the transform.
     # poly_df is only displayed; it is not written back to df or session state.
+    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
+    poly = PolynomialFeatures(degree=2, include_bias=False)
+    poly_features = poly.fit_transform(df[numeric_columns])
+    poly_df = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(numeric_columns))
+    st.write("Polynomial Features Created:")
+    st.dataframe(poly_df.head())
+    # Correlation Heatmap
+    st.write("### Correlation Matrix")
     # NOTE(review): corr() is called on the whole frame, which by now also
     # holds the categorical 'PriceBucket' column; recent pandas versions do not
     # drop non-numeric columns by default, so this call may raise - verify and
     # consider restricting it to the numeric columns.
+    corr = df.corr()
+    fig, ax = plt.subplots(figsize=(12, 8))
+    sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
+    ax.set_title("Correlation Matrix")
+    st.pyplot(fig)
 else:
+    st.error("No dataset found. Please upload a dataset on the main page first.")