trohith89 committed on
Commit
fcc29a0
·
verified ·
1 Parent(s): 129713d

Update pages/2_Data_CLeaning_and_Preprocessing.py

Browse files
pages/2_Data_CLeaning_and_Preprocessing.py CHANGED
@@ -3,11 +3,15 @@ import pandas as pd
3
  import plotly.graph_objects as go
4
  import plotly.express as px
5
  from plotly.subplots import make_subplots
 
6
 
7
- # Background Image URL
 
 
 
8
  background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/clljdAv7f_LGL8dH5vCZQ.jpeg"
9
 
10
- # Apply background image using CSS
11
  st.markdown(
12
  f"""
13
  <style>
@@ -17,79 +21,66 @@ st.markdown(
17
  background-position: center;
18
  height: 100vh;
19
  }}
 
 
 
 
 
 
 
 
 
 
 
 
20
  </style>
21
- """,
22
  unsafe_allow_html=True
23
  )
24
 
25
- # Page Title
26
- st.title("Exploratory Data Analysis (EDA) App")
27
-
28
- st.markdown("""
29
- ### Perform EDA and Clean Data
30
- This app provides basic insights into the dataset, highlights missing values,
31
- and visualizes numeric and categorical columns.
32
- ---
33
- """)
34
-
35
  # Check if the dataset is already in session state
36
- if 'df' in st.session_state:
37
- data = st.session_state['df']
38
- st.success("Dataset loaded from previous session!")
39
 
40
- # Show Data Preview
41
- st.write("### Preview of Dataset")
42
- st.dataframe(data.head())
43
 
44
- # Overview Section
45
- st.write("### Dataset Overview")
46
  st.write(data.describe())
47
 
48
- # Missing Values
49
- st.write("### Missing Values")
50
  st.write(data.isnull().sum())
51
 
52
- # Duplicate Rows
53
- st.write("### Duplicate Rows")
54
  st.write(f"Number of duplicate rows: {data.duplicated().sum()}")
55
 
56
  # Visualize Numeric Data
57
  numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
58
  if len(numeric_columns) > 0:
59
- st.write("### Histograms for Numeric Columns")
60
-
61
- # Create subplots for histograms
62
  fig = make_subplots(rows=len(numeric_columns), cols=1, subplot_titles=numeric_columns)
63
  for i, col in enumerate(numeric_columns):
64
  hist = px.histogram(data, x=col, nbins=30, title=f'Histogram of {col}')
65
- fig.add_trace(
66
- hist.data[0],
67
- row=i+1, col=1
68
- )
69
 
70
  fig.update_layout(height=500 * len(numeric_columns), title_text="Histograms for Numeric Columns")
71
  st.plotly_chart(fig)
72
 
73
- st.write("### Boxplots for Numeric Columns")
74
-
75
- # Create subplots for boxplots
76
  fig = make_subplots(rows=len(numeric_columns), cols=1, subplot_titles=numeric_columns)
77
  for i, col in enumerate(numeric_columns):
78
  boxplot = px.box(data, y=col, title=f'Boxplot of {col}')
79
- fig.add_trace(
80
- boxplot.data[0],
81
- row=i+1, col=1
82
- )
83
 
84
  fig.update_layout(height=500 * len(numeric_columns), title_text="Boxplots for Numeric Columns")
85
  st.plotly_chart(fig)
86
  else:
87
- st.write("No numeric columns available for visualization.")
88
 
89
  # Visualize Categorical Data
90
  categorical_columns = data.select_dtypes(include=['object', 'category']).columns
91
  if len(categorical_columns) > 0:
92
- st.write("### Bar Plots for Categorical Columns")
93
  selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)
94
 
95
  st.write(f"Value Counts for '{selected_cat_col}':")
@@ -98,45 +89,19 @@ if 'df' in st.session_state:
98
  fig = px.bar(data, x=selected_cat_col, title=f'Bar Plot of {selected_cat_col}', color=selected_cat_col)
99
  st.plotly_chart(fig)
100
  else:
101
- st.write("No categorical columns available for visualization.")
102
 
103
  # Correlation Matrix for Numeric Columns
104
  if len(numeric_columns) > 1:
105
- st.write("### Correlation Matrix")
106
  corr_matrix = data[numeric_columns].corr()
107
  fig = px.imshow(corr_matrix, title="Correlation Matrix", color_continuous_scale='coolwarm')
108
  st.plotly_chart(fig)
109
 
110
- # Check the columns before renaming
111
- st.write("### Dataset Columns:")
112
- st.write(data.columns)
113
-
114
- # Renaming columns if they exist
115
- if 'ProductCategory' in data.columns and 'ProductBrand' in data.columns and 'ProductPrice' in data.columns:
116
- data = data.rename(columns={'ProductCategory': 'Category', 'ProductBrand': 'Brand', 'ProductPrice': 'Price'})
117
- st.success("Columns renamed successfully!")
118
- else:
119
- st.warning("Columns 'ProductCategory', 'ProductBrand', or 'ProductPrice' not found in the dataset.")
120
-
121
- # Now check if 'Category' exists and plot
122
- if 'Category' in data.columns:
123
- st.write("### Bar Plot for Category")
124
- fig = px.bar(data, x='Category', title='Bar Plot of Category', color='Category')
125
- st.plotly_chart(fig)
126
- else:
127
- st.warning("'Category' column not found for plotting.")
128
-
129
- # Binning of age column
130
- bins = [0, 18, 35, 50, 65, 100]
131
- labels = ['Child', 'Young Adult', 'Adult', 'Middle Aged', 'Senior']
132
- data['age_bins'] = pd.cut(data['CustomerAge'], bins=bins, labels=labels, right=False)
133
-
134
- # Data Cleaning Section
135
- st.write("### Cleaned Dataset")
136
  cleaned_data = data.drop_duplicates()
137
- st.dataframe(cleaned_data)
138
 
139
- # Save cleaned data to CSV and provide download option
140
  cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
141
  st.download_button(
142
  label="Download Cleaned Dataset",
@@ -145,8 +110,7 @@ if 'df' in st.session_state:
145
  mime="text/csv"
146
  )
147
 
148
- # Store the cleaned dataframe in session state for use in other pages
149
  st.session_state['df'] = cleaned_data
150
 
151
  else:
152
- st.info("No dataset found in session. Please upload a CSV file on the previous page.")
 
3
  import plotly.graph_objects as go
4
  import plotly.express as px
5
  from plotly.subplots import make_subplots
6
+ from io import StringIO
7
 
8
+ # Page Title
9
+ st.markdown("<h1 style='text-align:center; color:#008080;'>Exploratory Data Analysis (EDA) App</h1>", unsafe_allow_html=True)
10
+
11
+ # Define the URL of the background image (use your own image URL)
12
  background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/clljdAv7f_LGL8dH5vCZQ.jpeg"
13
 
14
+ # Apply custom CSS for the background image and overlay
15
  st.markdown(
16
  f"""
17
  <style>
 
21
  background-position: center;
22
  height: 100vh;
23
  }}
24
+
25
+ /* Semi-transparent overlay */
26
+ .stApp::before {{
27
+ content: "";
28
+ position: absolute;
29
+ top: 0;
30
+ left: 0;
31
+ width: 100%;
32
+ height: 100%;
33
+ background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
34
+ z-index: -1;
35
+ }}
36
  </style>
37
+ """,
38
  unsafe_allow_html=True
39
  )
40
 
 
 
 
 
 
 
 
 
 
 
41
  # Check if the dataset is already in session state
42
+ data = st.session_state.get("df")
 
 
43
 
44
+ if data is not None:
45
+ st.subheader("Dataset Preview:")
46
+ st.write(data.head())
47
 
48
+ st.subheader("Dataset Overview:")
 
49
  st.write(data.describe())
50
 
51
+ st.subheader("Missing Values:")
 
52
  st.write(data.isnull().sum())
53
 
54
+ st.subheader("Duplicate Rows:")
 
55
  st.write(f"Number of duplicate rows: {data.duplicated().sum()}")
56
 
57
  # Visualize Numeric Data
58
  numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
59
  if len(numeric_columns) > 0:
60
+ st.subheader("Histograms for Numeric Columns:")
 
 
61
  fig = make_subplots(rows=len(numeric_columns), cols=1, subplot_titles=numeric_columns)
62
  for i, col in enumerate(numeric_columns):
63
  hist = px.histogram(data, x=col, nbins=30, title=f'Histogram of {col}')
64
+ fig.add_trace(hist.data[0], row=i + 1, col=1)
 
 
 
65
 
66
  fig.update_layout(height=500 * len(numeric_columns), title_text="Histograms for Numeric Columns")
67
  st.plotly_chart(fig)
68
 
69
+ st.subheader("Boxplots for Numeric Columns:")
 
 
70
  fig = make_subplots(rows=len(numeric_columns), cols=1, subplot_titles=numeric_columns)
71
  for i, col in enumerate(numeric_columns):
72
  boxplot = px.box(data, y=col, title=f'Boxplot of {col}')
73
+ fig.add_trace(boxplot.data[0], row=i + 1, col=1)
 
 
 
74
 
75
  fig.update_layout(height=500 * len(numeric_columns), title_text="Boxplots for Numeric Columns")
76
  st.plotly_chart(fig)
77
  else:
78
+ st.warning("No numeric columns available for visualization.")
79
 
80
  # Visualize Categorical Data
81
  categorical_columns = data.select_dtypes(include=['object', 'category']).columns
82
  if len(categorical_columns) > 0:
83
+ st.subheader("Bar Plots for Categorical Columns:")
84
  selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)
85
 
86
  st.write(f"Value Counts for '{selected_cat_col}':")
 
89
  fig = px.bar(data, x=selected_cat_col, title=f'Bar Plot of {selected_cat_col}', color=selected_cat_col)
90
  st.plotly_chart(fig)
91
  else:
92
+ st.warning("No categorical columns available for visualization.")
93
 
94
  # Correlation Matrix for Numeric Columns
95
  if len(numeric_columns) > 1:
96
+ st.subheader("Correlation Matrix:")
97
  corr_matrix = data[numeric_columns].corr()
98
  fig = px.imshow(corr_matrix, title="Correlation Matrix", color_continuous_scale='coolwarm')
99
  st.plotly_chart(fig)
100
 
101
+ st.subheader("Cleaned Dataset:")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  cleaned_data = data.drop_duplicates()
103
+ st.write(cleaned_data)
104
 
 
105
  cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
106
  st.download_button(
107
  label="Download Cleaned Dataset",
 
110
  mime="text/csv"
111
  )
112
 
 
113
  st.session_state['df'] = cleaned_data
114
 
115
  else:
116
+ st.warning("No dataset found. Please upload a dataset on the Home page.")