trohith89 committed on
Commit
3b7931f
·
verified ·
1 Parent(s): e14bdbd

Update pages/2_Data_CLeaning_and_Preprocessing.py

Browse files
pages/2_Data_CLeaning_and_Preprocessing.py CHANGED
@@ -1,149 +1,134 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import seaborn as sns
4
- import matplotlib.pyplot as plt
 
5
 
6
  # Page Title
7
  st.title("Exploratory Data Analysis (EDA) App")
8
 
9
  st.markdown("""
10
  ### Perform EDA and Clean Data
11
- Upload a CSV file to begin. This app will provide basic insights into the dataset,
12
- highlight missing values, and visualize numeric and categorical columns.
13
  ---
14
  """)
15
 
16
- # File Upload Section
17
- st.header("Upload Dataset")
18
-
19
- uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
20
-
21
- # Check if file is uploaded
22
- if uploaded_file is not None:
23
- if uploaded_file.size > 0:
24
- try:
25
- # Read the CSV file
26
- data = pd.read_csv(uploaded_file)
27
- st.session_state['df'] = data # Store the data for use in other pages
28
- st.success("Dataset uploaded successfully!")
29
-
30
- # Show Data Preview
31
- st.write("### Preview of Dataset")
32
- st.dataframe(data.head())
33
-
34
- # Overview Section
35
- st.write("### Dataset Overview")
36
- st.write(data.describe())
37
-
38
- # Missing Values
39
- st.write("### Missing Values")
40
- st.write(data.isnull().sum())
41
-
42
- # Duplicate Rows
43
- st.write("### Duplicate Rows")
44
- st.write(f"Number of duplicate rows: {data.duplicated().sum()}")
45
-
46
- # Visualize Numeric Data
47
- numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
48
- if len(numeric_columns) > 0:
49
- st.write("### Histograms for Numeric Columns")
50
- for col in numeric_columns:
51
- fig, ax = plt.subplots()
52
- sns.histplot(data[col], kde=True, palette="crest", ax=ax)
53
- ax.set_title(f'Histogram of {col}')
54
- st.pyplot(fig)
55
-
56
- st.write("### Boxplots for Numeric Columns")
57
- for col in numeric_columns:
58
- fig, ax = plt.subplots()
59
- sns.boxplot(x=data[col], palette="mako", ax=ax)
60
- ax.set_title(f'Boxplot of {col}')
61
- st.pyplot(fig)
62
- else:
63
- st.write("No numeric columns available for visualization.")
64
-
65
- # Visualize Categorical Data
66
- categorical_columns = data.select_dtypes(include=['object', 'category']).columns
67
- if len(categorical_columns) > 0:
68
- st.write("### Bar Plots for Categorical Columns")
69
- selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)
70
-
71
- st.write(f"Value Counts for '{selected_cat_col}':")
72
- st.write(data[selected_cat_col].value_counts())
73
-
74
- fig, ax = plt.subplots()
75
- sns.countplot(x=selected_cat_col, data=data, palette="viridis", ax=ax)
76
- ax.set_title(f'Bar Plot of {selected_cat_col}')
77
- st.pyplot(fig)
78
- else:
79
- st.write("No categorical columns available for visualization.")
80
-
81
- # Correlation Matrix
82
- if len(numeric_columns) > 1:
83
- st.write("### Correlation Matrix")
84
- corr_matrix = data[numeric_columns].corr()
85
- st.write(corr_matrix)
86
-
87
- fig, ax = plt.subplots(figsize=(10, 8))
88
- sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
89
- st.pyplot(fig)
90
-
91
- # Check the columns before renaming
92
- st.write("### Dataset Columns:")
93
- st.write(data.columns)
94
-
95
- # Renaming columns if they exist
96
- if 'ProductCategory' in data.columns and 'ProductBrand' in data.columns and 'ProductPrice' in data.columns:
97
- data = data.rename(columns={'ProductCategory': 'Category', 'ProductBrand': 'Brand', 'ProductPrice': 'Price'})
98
- st.success("Columns renamed successfully!")
99
- else:
100
- st.warning("Columns 'ProductCategory', 'ProductBrand', or 'ProductPrice' not found in the dataset.")
101
-
102
- # Now check if 'Category' exists and plot
103
- if 'Category' in data.columns:
104
- st.write("### Bar Plot for Category")
105
- fig, ax = plt.subplots()
106
- sns.countplot(x='Category', data=data, palette='plasma', ax=ax)
107
- st.pyplot(fig)
108
- else:
109
- st.warning("'Category' column not found for plotting.")
110
-
111
- # Binning of age column
112
- bins = [0, 18, 35, 50, 65, 100]
113
- labels = ['Child', 'Young Adult', 'Adult', 'Middle Aged', 'Senior']
114
- data['age_bins'] = pd.cut(data['CustomerAge'], bins=bins, labels=labels, right=False)
115
-
116
- # Data Cleaning Section
117
- st.write("### Cleaned Dataset")
118
- cleaned_data = data.drop_duplicates()
119
- st.dataframe(cleaned_data)
120
-
121
- # Save cleaned data to CSV and provide download option
122
- cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
123
- st.download_button(
124
- label="Download Cleaned Dataset",
125
- data=cleaned_csv,
126
- file_name="cleaned_dataset.csv",
127
- mime="text/csv"
128
  )
129
 
130
- # Store the cleaned dataframe in session state for use in other pages
131
- st.session_state['df'] = cleaned_data
 
 
 
 
 
 
 
 
 
 
 
132
 
133
- except pd.errors.EmptyDataError:
134
- st.error("The uploaded CSV file is empty. Please upload a valid file.")
135
- except pd.errors.ParserError:
136
- st.error("The file is not properly formatted as a CSV. Please check the data.")
137
- except Exception as e:
138
- st.error(f"An unexpected error occurred: {e}")
139
  else:
140
- st.error("The uploaded file is empty.")
141
- else:
142
- st.info("Upload a CSV file to get started.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
- # Session State Access on Other Pages
145
- if 'df' in st.session_state:
146
- data = st.session_state['df']
147
- st.write("Dataset available for further analysis.")
148
  else:
149
- st.warning("No dataset found. Please upload data to proceed.")
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import plotly.graph_objects as go
4
+ import plotly.express as px
5
+ from plotly.subplots import make_subplots
6
 
# Page Title
st.title("Exploratory Data Analysis (EDA) App")

st.markdown("""
### Perform EDA and Clean Data
This app provides basic insights into the dataset, highlights missing values,
and visualizes numeric and categorical columns.
---
""")

# This page does not upload anything itself: it reads the DataFrame stored
# under st.session_state['df'], renders the EDA views, and writes the cleaned
# frame back under the same key.
if 'df' in st.session_state:
    data = st.session_state['df']
    st.success("Dataset loaded from previous session!")

    # Show Data Preview
    st.write("### Preview of Dataset")
    st.dataframe(data.head())

    # Overview Section
    st.write("### Dataset Overview")
    st.write(data.describe())

    # Missing Values
    st.write("### Missing Values")
    st.write(data.isnull().sum())

    # Duplicate Rows
    st.write("### Duplicate Rows")
    st.write(f"Number of duplicate rows: {data.duplicated().sum()}")

    # Visualize Numeric Data
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_columns) > 0:
        # make_subplots wants a plain sequence of strings, not a pandas Index
        # (column labels are not guaranteed to be str).
        subplot_titles = [str(col) for col in numeric_columns]
        figure_height = 500 * len(numeric_columns)

        st.write("### Histograms for Numeric Columns")

        # One stacked subplot per numeric column.
        fig = make_subplots(rows=len(numeric_columns), cols=1, subplot_titles=subplot_titles)
        for row, col in enumerate(numeric_columns, start=1):
            hist = px.histogram(data, x=col, nbins=30, title=f'Histogram of {col}')
            fig.add_trace(hist.data[0], row=row, col=1)

        fig.update_layout(height=figure_height, title_text="Histograms for Numeric Columns")
        st.plotly_chart(fig)

        st.write("### Boxplots for Numeric Columns")

        # Same layout for the boxplots.
        fig = make_subplots(rows=len(numeric_columns), cols=1, subplot_titles=subplot_titles)
        for row, col in enumerate(numeric_columns, start=1):
            boxplot = px.box(data, y=col, title=f'Boxplot of {col}')
            fig.add_trace(boxplot.data[0], row=row, col=1)

        fig.update_layout(height=figure_height, title_text="Boxplots for Numeric Columns")
        st.plotly_chart(fig)
    else:
        st.write("No numeric columns available for visualization.")

    # Visualize Categorical Data
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0:
        st.write("### Bar Plots for Categorical Columns")
        selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)

        st.write(f"Value Counts for '{selected_cat_col}':")
        st.write(data[selected_cat_col].value_counts())

        fig = px.bar(data, x=selected_cat_col, title=f'Bar Plot of {selected_cat_col}', color=selected_cat_col)
        st.plotly_chart(fig)
    else:
        st.write("No categorical columns available for visualization.")

    # Correlation Matrix for Numeric Columns
    if len(numeric_columns) > 1:
        st.write("### Correlation Matrix")
        corr_matrix = data[numeric_columns].corr()
        # 'coolwarm' is a matplotlib colormap name and is rejected by Plotly;
        # 'RdBu_r' is the built-in diverging blue-to-red equivalent.
        fig = px.imshow(corr_matrix, title="Correlation Matrix", color_continuous_scale='RdBu_r')
        st.plotly_chart(fig)

    # Check the columns before renaming
    st.write("### Dataset Columns:")
    st.write(data.columns)

    # Renaming columns if they exist
    column_renames = {'ProductCategory': 'Category', 'ProductBrand': 'Brand', 'ProductPrice': 'Price'}
    if all(old_name in data.columns for old_name in column_renames):
        data = data.rename(columns=column_renames)
        st.success("Columns renamed successfully!")
    elif not all(new_name in data.columns for new_name in column_renames.values()):
        # The cleaned frame is written back to session state at the end of
        # this script, so on a rerun the columns are already renamed; only
        # warn when neither the original nor the renamed set is present.
        st.warning("Columns 'ProductCategory', 'ProductBrand', or 'ProductPrice' not found in the dataset.")

    # Now check if 'Category' exists and plot
    if 'Category' in data.columns:
        st.write("### Bar Plot for Category")
        fig = px.bar(data, x='Category', title='Bar Plot of Category', color='Category')
        st.plotly_chart(fig)
    else:
        st.warning("'Category' column not found for plotting.")

    # Binning of age column
    if 'CustomerAge' in data.columns:
        bins = [0, 18, 35, 50, 65, 100]
        labels = ['Child', 'Young Adult', 'Adult', 'Middle Aged', 'Senior']
        # right=False makes the intervals [low, high); values outside
        # [0, 100) fall in no bin and become NaN.
        # assign() returns a new frame, so the DataFrame object stored in
        # session state is not mutated in place.
        data = data.assign(
            age_bins=pd.cut(data['CustomerAge'], bins=bins, labels=labels, right=False)
        )
    else:
        st.warning("'CustomerAge' column not found; skipping age binning.")

    # Data Cleaning Section
    st.write("### Cleaned Dataset")
    cleaned_data = data.drop_duplicates()
    st.dataframe(cleaned_data)

    # Save cleaned data to CSV and provide download option
    cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv"
    )

    # Store the cleaned dataframe in session state for use in other pages
    st.session_state['df'] = cleaned_data

else:
    st.info("No dataset found in session. Please upload a CSV file on the previous page.")