trohith89 committed on
Commit
f6845ce
·
verified ·
1 Parent(s): 3bf549e

Update pages/3_EDA_and_Feature_Engineering.py

Browse files
pages/3_EDA_and_Feature_Engineering.py CHANGED
@@ -1,99 +1,99 @@
1
  import streamlit as st
 
2
  import seaborn as sns
3
  import matplotlib.pyplot as plt
 
4
 
5
  # Page Title
6
# Streamlit page: generic exploratory data analysis of the DataFrame that an
# earlier page stored in st.session_state['df'].
st.title("Complete EDA Page")

# The blank line before '---' matters: a text line directly followed by '---'
# is parsed as a setext heading instead of a paragraph plus a horizontal rule.
st.markdown("""
### Perform a Detailed Exploratory Data Analysis (EDA)
This page uses the previously uploaded dataset and provides comprehensive analysis, including advanced visualizations and insights.

---
""")


def _show(fig):
    """Render *fig* in the page, then close it.

    Streamlit re-executes this script on every interaction; figures that are
    never closed stay registered with pyplot and accumulate in memory.
    """
    st.pyplot(fig)
    plt.close(fig)


# Check if dataset exists in session state
if 'df' in st.session_state:
    data = st.session_state['df']
    st.success("Dataset loaded successfully from previous upload.")

    # Display Dataset Overview
    st.write("### Dataset Overview")
    st.write(data.head())

    st.write("### General Information")
    st.write(f"Number of Rows: {data.shape[0]}")
    st.write(f"Number of Columns: {data.shape[1]}")
    st.write("Column Names:", list(data.columns))

    # Visualize Numeric Columns
    # 'number' matches every numeric width (int32, float32, ...), not only
    # the 64-bit dtypes.
    numeric_columns = data.select_dtypes(include='number').columns
    if len(numeric_columns) > 0:
        st.write("### Numeric Column Analysis")

        # Pairplot (no `palette`: without a `hue` variable seaborn ignores it)
        st.write("#### Pairplot for Numeric Columns")
        grid = sns.pairplot(data[numeric_columns], diag_kind='kde')
        _show(grid.figure)

        # Correlation Heatmap
        st.write("#### Correlation Heatmap")
        corr_matrix = data[numeric_columns].corr()
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
        _show(fig)

        # Histograms (again no `palette`: there is no `hue` to map it to)
        st.write("#### Histograms")
        for col in numeric_columns:
            fig, ax = plt.subplots()
            sns.histplot(data[col], kde=True, ax=ax)
            ax.set_title(f'Histogram of {col}')
            _show(fig)

    else:
        st.write("No numeric columns available for analysis.")

    # Visualize Categorical Columns
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0:
        st.write("### Categorical Column Analysis")

        # Frequency Count Plots
        for col in categorical_columns:
            st.write(f"#### Bar Plot for {col}")
            fig, ax = plt.subplots()
            sns.countplot(x=col, data=data, palette='viridis', ax=ax)
            ax.set_title(f'Bar Plot of {col}')
            _show(fig)

    else:
        st.write("No categorical columns available for analysis.")

    # Advanced Analysis: Boxplots (needs one numeric and one categorical column)
    if len(numeric_columns) > 0 and len(categorical_columns) > 0:
        st.write("### Boxplot Analysis")

        selected_num_col = st.selectbox("Select Numeric Column for Boxplot", numeric_columns)
        selected_cat_col = st.selectbox("Select Categorical Column for Boxplot", categorical_columns)

        fig, ax = plt.subplots()
        sns.boxplot(x=selected_cat_col, y=selected_num_col, data=data, palette='mako', ax=ax)
        ax.set_title(f'Boxplot of {selected_num_col} by {selected_cat_col}')
        _show(fig)

    # Missing Values Heatmap
    st.write("### Missing Values Analysis")
    if data.isnull().values.any():
        st.write("#### Missing Value Heatmap")
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.heatmap(data.isnull(), cbar=False, cmap='viridis', ax=ax)
        _show(fig)
    else:
        st.write("No missing values found in the dataset.")

    # Summary Statistics
    st.write("### Summary Statistics")
    st.write(data.describe(include='all'))

else:
    st.error("No dataset found. Please upload a dataset on the previous page first.")
 
1
  import streamlit as st
2
+ import pandas as pd
3
  import seaborn as sns
4
  import matplotlib.pyplot as plt
5
+ from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
6
 
7
  # Page Title
8
# Streamlit page: EDA plots and simple feature engineering for a product
# dataset stored in st.session_state['df'] by an earlier page.
st.title("Complete EDA and Feature Engineering")

# The blank line before '---' matters: a text line directly followed by '---'
# is parsed as a setext heading instead of a paragraph plus a horizontal rule.
st.markdown("""
This page provides advanced Exploratory Data Analysis (EDA) and Feature Engineering using the dataset loaded in memory.

---
""")

# Columns the hard-coded plots and encoders below depend on.
REQUIRED_COLUMNS = ('Category', 'Price', 'Brand')

# Bucket edges and names used for the 'PriceBucket' feature.
PRICE_BINS = [100, 500, 1000, 1500, 2000, 3000]
PRICE_LABELS = ['Very Low', 'Low', 'Medium', 'High', 'Very High']


def _show(fig):
    """Render *fig* in the page, then close it.

    Streamlit re-executes this script on every interaction; figures that are
    never closed stay registered with pyplot and accumulate in memory.
    """
    st.pyplot(fig)
    plt.close(fig)


def _plot_univariate(df):
    """Plot the category counts and the price distribution of *df*."""
    st.write("### Univariate Analysis")

    # Product Category Distribution
    st.write("#### Product Category Distribution")
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x='Category', data=df, palette='viridis', ax=ax)
    ax.set_title("Product Category Distribution")
    ax.set_xlabel("Product Category")
    ax.set_ylabel("Count")
    ax.tick_params(axis='x', rotation=45)
    _show(fig)

    # Product Price Distribution
    st.write("#### Product Price Distribution")
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['Price'], kde=True, color='orange', ax=ax)
    ax.set_title("Product Price Distribution")
    ax.set_xlabel("Product Price")
    ax.set_ylabel("Count")
    _show(fig)


def _plot_price_by_category(df):
    """Draw a boxplot of price grouped by category."""
    st.write("### Price vs Category")
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(x='Category', y='Price', data=df, palette='mako', ax=ax)
    ax.set_title("Product Price by Category")
    ax.set_xlabel("Category")
    ax.set_ylabel("Price")
    ax.tick_params(axis='x', rotation=45)
    _show(fig)


def _engineer_features(df):
    """Add price buckets, label-encode 'Category'/'Brand', show polynomial features.

    *df* is modified in place, so callers must pass a copy they own.
    """
    st.write("### Feature Engineering")

    # Binning Product Price
    st.write("#### Product Price Binning")
    # include_lowest=True keeps a price exactly equal to the first edge in the
    # first bucket instead of silently turning it into NaN.
    df['PriceBucket'] = pd.cut(df['Price'], bins=PRICE_BINS, labels=PRICE_LABELS,
                               include_lowest=True)
    unbinned = int(df['PriceBucket'].isna().sum())
    if unbinned:
        st.warning(
            f"{unbinned} row(s) have a missing price or a price outside "
            f"{PRICE_BINS[0]}-{PRICE_BINS[-1]} and were not assigned to a bucket."
        )
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x='PriceBucket', data=df, palette='icefire', ax=ax)
    ax.set_title("Product Price Buckets")
    ax.set_xlabel("Price Bucket")
    ax.set_ylabel("Count")
    _show(fig)

    # Encoding Categorical Features
    st.write("#### Encoding Categorical Features")
    for column in ('Category', 'Brand'):
        # Cast to str so missing values or mixed types cannot break the
        # encoder's internal sort; each column gets its own encoder.
        df[column] = LabelEncoder().fit_transform(df[column].astype(str))
    st.write("Label Encoding Applied on 'Category' and 'Brand'.")

    # Polynomial Features for Numerical Columns
    st.write("#### Polynomial Feature Engineering")
    # PolynomialFeatures rejects NaN, so only complete numeric rows are used.
    numeric_frame = df.select_dtypes(include='number').dropna()
    if numeric_frame.empty:
        st.info("No complete numeric rows available for polynomial features.")
        return
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_df = pd.DataFrame(
        poly.fit_transform(numeric_frame),
        columns=poly.get_feature_names_out(numeric_frame.columns),
        index=numeric_frame.index,
    )

    st.write("Polynomial Features Created:")
    st.dataframe(poly_df.head())


def _plot_correlation(df):
    """Show the correlation heatmap of the numeric columns of *df*."""
    st.write("### Correlation Matrix")
    # Restrict to numeric columns: 'PriceBucket' holds string labels and makes
    # DataFrame.corr() raise on recent pandas versions.
    corr = df.select_dtypes(include='number').corr()
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
    ax.set_title("Correlation Matrix")
    _show(fig)


# Check if dataset exists in session state
if 'df' in st.session_state:
    # Work on a copy: the feature engineering below adds and overwrites
    # columns, and the object in session state is shared by every rerun and
    # every other page.
    df = st.session_state['df'].copy()
    st.success("Dataset loaded successfully.")

    # Display Dataset Overview
    st.write("### Dataset Overview")
    st.dataframe(df.head())

    # Shape of Dataset
    st.write(f"The dataset has {df.shape[0]} rows and {df.shape[1]} columns.")

    missing_columns = [name for name in REQUIRED_COLUMNS if name not in df.columns]
    if missing_columns:
        st.error(
            "This page needs the following column(s), which are missing from "
            f"the dataset: {', '.join(missing_columns)}"
        )
    else:
        _plot_univariate(df)
        _plot_price_by_category(df)
        _engineer_features(df)
        _plot_correlation(df)

else:
    st.error("No dataset found. Please upload a dataset on the main page first.")