trohith89 committed on
Commit
419bb0b
·
verified ·
1 Parent(s): 1486007

Update pages/2_Data_CLeaning_and_Preprocessing.py

Browse files
pages/2_Data_CLeaning_and_Preprocessing.py CHANGED
@@ -1,4 +1,147 @@
1
- # Access the dataframe stored in session state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  if 'df' in st.session_state:
3
  data = st.session_state['df']
4
  st.write("Dataset available for further analysis.")
 
1
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Bin edges and labels used to derive the ``age_bins`` column from
# ``CustomerAge``. Bins are left-closed (``right=False``), so the last bin
# covers ages 65 up to, but not including, 100.
AGE_BIN_EDGES = [0, 18, 35, 50, 65, 100]
AGE_BIN_LABELS = ['Child', 'Young Adult', 'Adult', 'Middle Aged', 'Senior']

# Original column names mapped to the shorter names used after renaming.
COLUMN_RENAMES = {
    'ProductCategory': 'Category',
    'ProductBrand': 'Brand',
    'ProductPrice': 'Price',
}


def _show_figure(fig):
    """Render *fig* on the page, then close it.

    pyplot keeps every figure it creates alive until it is closed, and this
    script re-executes on each widget interaction, so unclosed figures would
    accumulate for the lifetime of the server process.
    """
    st.pyplot(fig)
    plt.close(fig)


def _show_overview(data):
    """Show the preview, summary statistics, missing-value and duplicate counts."""
    st.write("### Preview of Dataset")
    st.dataframe(data.head())

    st.write("### Dataset Overview")
    st.write(data.describe())

    st.write("### Missing Values")
    st.write(data.isnull().sum())

    st.write("### Duplicate Rows")
    st.write(f"Number of duplicate rows: {data.duplicated().sum()}")


def _plot_numeric_columns(data, numeric_columns):
    """Draw one histogram and one boxplot per column in *numeric_columns*."""
    if len(numeric_columns) == 0:
        st.write("No numeric columns available for visualization.")
        return

    st.write("### Histograms for Numeric Columns")
    for col in numeric_columns:
        fig, ax = plt.subplots()
        sns.histplot(data[col], kde=True, ax=ax)
        ax.set_title(f'Histogram of {col}')
        _show_figure(fig)

    st.write("### Boxplots for Numeric Columns")
    for col in numeric_columns:
        fig, ax = plt.subplots()
        sns.boxplot(x=data[col], ax=ax)
        ax.set_title(f'Boxplot of {col}')
        _show_figure(fig)


def _plot_categorical_column(data):
    """Let the user pick an object/category column and show its value counts."""
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) == 0:
        st.write("No categorical columns available for visualization.")
        return

    st.write("### Bar Plots for Categorical Columns")
    selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)

    st.write(f"Value Counts for '{selected_cat_col}':")
    st.write(data[selected_cat_col].value_counts())

    fig, ax = plt.subplots()
    sns.countplot(x=selected_cat_col, data=data, ax=ax)
    ax.set_title(f'Bar Plot of {selected_cat_col}')
    _show_figure(fig)


def _plot_correlation(data, numeric_columns):
    """Show the correlation table and heatmap when at least two numeric columns exist."""
    if len(numeric_columns) <= 1:
        return

    st.write("### Correlation Matrix")
    corr_matrix = data[numeric_columns].corr()
    st.write(corr_matrix)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
    _show_figure(fig)


def _rename_product_columns(data):
    """Return *data* with the product columns renamed, if all of them are present."""
    st.write("### Dataset Columns:")
    st.write(data.columns)

    if all(name in data.columns for name in COLUMN_RENAMES):
        st.success("Columns renamed successfully!")
        return data.rename(columns=COLUMN_RENAMES)

    st.warning("Columns 'ProductCategory', 'ProductBrand', or 'ProductPrice' not found in the dataset.")
    return data


def _plot_category(data):
    """Draw a count plot of the ``Category`` column when it exists."""
    if 'Category' not in data.columns:
        st.warning("'Category' column not found for plotting.")
        return

    st.write("### Bar Plot for Category")
    fig, ax = plt.subplots()
    sns.countplot(x='Category', data=data, palette='viridis', ax=ax)
    _show_figure(fig)


def _add_age_bins(data):
    """Return *data* with an ``age_bins`` column derived from ``CustomerAge``.

    The column is optional: datasets without a numeric ``CustomerAge`` are
    returned unchanged with a warning, so the cleaning and download steps
    that follow still run instead of being aborted by a ``KeyError`` or
    ``TypeError``.
    """
    if 'CustomerAge' not in data.columns:
        st.warning("'CustomerAge' column not found; skipping age binning.")
        return data
    if not pd.api.types.is_numeric_dtype(data['CustomerAge']):
        st.warning("'CustomerAge' column is not numeric; skipping age binning.")
        return data

    age_bins = pd.cut(
        data['CustomerAge'],
        bins=AGE_BIN_EDGES,
        labels=AGE_BIN_LABELS,
        right=False,
    )
    return data.assign(age_bins=age_bins)


def _show_cleaned_dataset(data):
    """Drop duplicate rows, display the result, offer it as a CSV download, and return it."""
    st.write("### Cleaned Dataset")
    cleaned_data = data.drop_duplicates()
    st.dataframe(cleaned_data)

    cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv"
    )
    return cleaned_data


def _run_eda(data):
    """Run every EDA and cleaning section for *data*, in page order."""
    st.session_state['df'] = data  # Store the data for use in other pages
    st.success("Dataset uploaded successfully!")

    _show_overview(data)

    # 'number' covers every numeric width (int32, float32, uint8, ...),
    # not only the 64-bit defaults.
    numeric_columns = data.select_dtypes(include='number').columns
    _plot_numeric_columns(data, numeric_columns)
    _plot_categorical_column(data)
    _plot_correlation(data, numeric_columns)

    data = _rename_product_columns(data)
    _plot_category(data)
    data = _add_age_bins(data)

    # Store the cleaned dataframe in session state for use in other pages
    st.session_state['df'] = _show_cleaned_dataset(data)


# Page Title
st.title("Exploratory Data Analysis (EDA) App")

st.markdown("""
### Perform EDA and Clean Data
Upload a CSV file to begin. This app will provide basic insights into the dataset,
highlight missing values, and visualize numeric and categorical columns.
---
""")

# File Upload Section
st.header("Upload Dataset")

uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

if uploaded_file is None:
    st.info("Upload a CSV file to get started.")
elif uploaded_file.size <= 0:
    st.error("The uploaded file is empty.")
else:
    try:
        _run_eda(pd.read_csv(uploaded_file))
    except pd.errors.EmptyDataError:
        st.error("The uploaded CSV file is empty. Please upload a valid file.")
    except pd.errors.ParserError:
        st.error("The file is not properly formatted as a CSV. Please check the data.")
    except Exception as e:
        # Page-level boundary: report the problem in the UI instead of
        # letting a raw traceback replace the page.
        st.error(f"An unexpected error occurred: {e}")

# Session State Access on Other Pages
if 'df' in st.session_state:
    data = st.session_state['df']
    st.write("Dataset available for further analysis.")