trohith89 committed on
Commit
1486007
·
verified ·
1 Parent(s): 05c67fa

Update pages/2_Data_CLeaning_and_Preprocessing.py

Browse files
pages/2_Data_CLeaning_and_Preprocessing.py CHANGED
@@ -1,146 +1,4 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import seaborn as sns
4
- import matplotlib.pyplot as plt
5
-
6
- # Page Title
7
- st.title("Exploratory Data Analysis (EDA) App")
8
-
9
- st.markdown("""
10
- ### Perform EDA and Clean Data
11
- Upload a CSV file to begin. This app will provide basic insights into the dataset,
12
- highlight missing values, and visualize numeric and categorical columns.
13
- ---
14
- """)
15
-
16
- # File Upload Section
17
- st.header("Upload Dataset")
18
-
19
- uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
20
-
21
- # Check if file is uploaded
22
- if uploaded_file is not None:
23
- if uploaded_file.size > 0:
24
- try:
25
- # Read the CSV file
26
- data = pd.read_csv(uploaded_file)
27
- st.session_state['df'] = data # Store the data for use in other pages
28
- st.success("Dataset uploaded successfully!")
29
-
30
- # Show Data Preview
31
- st.write("### Preview of Dataset")
32
- st.dataframe(data.head())
33
-
34
- # Overview Section
35
- st.write("### Dataset Overview")
36
- st.write(data.describe())
37
-
38
- # Missing Values
39
- st.write("### Missing Values")
40
- st.write(data.isnull().sum())
41
-
42
- # Duplicate Rows
43
- st.write("### Duplicate Rows")
44
- st.write(f"Number of duplicate rows: {data.duplicated().sum()}")
45
-
46
- # Visualize Numeric Data
47
- numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
48
- if len(numeric_columns) > 0:
49
- st.write("### Histograms for Numeric Columns")
50
- for col in numeric_columns:
51
- fig, ax = plt.subplots()
52
- sns.histplot(data[col], kde=True, ax=ax)
53
- ax.set_title(f'Histogram of {col}')
54
- st.pyplot(fig)
55
-
56
- st.write("### Boxplots for Numeric Columns")
57
- for col in numeric_columns:
58
- fig, ax = plt.subplots()
59
- sns.boxplot(x=data[col], ax=ax)
60
- ax.set_title(f'Boxplot of {col}')
61
- st.pyplot(fig)
62
- else:
63
- st.write("No numeric columns available for visualization.")
64
-
65
- # Visualize Categorical Data
66
- categorical_columns = data.select_dtypes(include=['object', 'category']).columns
67
- if len(categorical_columns) > 0:
68
- st.write("### Bar Plots for Categorical Columns")
69
- selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)
70
-
71
- st.write(f"Value Counts for '{selected_cat_col}':")
72
- st.write(data[selected_cat_col].value_counts())
73
-
74
- fig, ax = plt.subplots()
75
- sns.countplot(x=selected_cat_col, data=data, ax=ax)
76
- ax.set_title(f'Bar Plot of {selected_cat_col}')
77
- st.pyplot(fig)
78
- else:
79
- st.write("No categorical columns available for visualization.")
80
-
81
- # Correlation Matrix
82
- if len(numeric_columns) > 1:
83
- st.write("### Correlation Matrix")
84
- corr_matrix = data[numeric_columns].corr()
85
- st.write(corr_matrix)
86
-
87
- fig, ax = plt.subplots(figsize=(10, 8))
88
- sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
89
- st.pyplot(fig)
90
-
91
- # Check the columns before renaming
92
- st.write("### Dataset Columns:")
93
- st.write(data.columns)
94
-
95
- # Renaming columns if they exist
96
- if 'ProductCategory' in data.columns and 'ProductBrand' in data.columns and 'ProductPrice' in data.columns:
97
- data = data.rename(columns={'ProductCategory': 'Category', 'ProductBrand': 'Brand', 'ProductPrice': 'Price'})
98
- st.success("Columns renamed successfully!")
99
- else:
100
- st.warning("Columns 'ProductCategory', 'ProductBrand', or 'ProductPrice' not found in the dataset.")
101
-
102
- # Now check if 'Category' exists and plot
103
- if 'Category' in data.columns:
104
- st.write("### Bar Plot for Category")
105
- fig, ax = plt.subplots()
106
- sns.countplot(x='Category', data=data, palette='viridis', ax=ax)
107
- st.pyplot(fig)
108
- else:
109
- st.warning("'Category' column not found for plotting.")
110
- # binning of age column
111
-
112
- bins = [0, 18, 35, 50, 65, 100]
113
- labels = ['Child', 'Young Adult', 'Adult', 'Middle Aged', 'Senior']
114
-
115
- data['age_bins'] = pd.cut(data['CustomerAge'], bins=bins, labels=labels, right = False)
116
-
117
- # df.head()
118
- # Data Cleaning Section
119
- st.write("### Cleaned Dataset")
120
- cleaned_data = data.drop_duplicates()
121
- st.dataframe(cleaned_data)
122
-
123
- # Download Cleaned Data
124
- cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
125
- st.download_button(
126
- label="Download Cleaned Dataset",
127
- data=cleaned_csv,
128
- file_name="cleaned_dataset.csv",
129
- mime="text/csv"
130
- )
131
-
132
- except pd.errors.EmptyDataError:
133
- st.error("The uploaded CSV file is empty. Please upload a valid file.")
134
- except pd.errors.ParserError:
135
- st.error("The file is not properly formatted as a CSV. Please check the data.")
136
- except Exception as e:
137
- st.error(f"An unexpected error occurred: {e}")
138
- else:
139
- st.error("The uploaded file is empty.")
140
- else:
141
- st.info("Upload a CSV file to get started.")
142
-
143
- # Session State Access on Other Pages
144
  if 'df' in st.session_state:
145
  data = st.session_state['df']
146
  st.write("Dataset available for further analysis.")
 
1
+ # Access the dataframe stored in session state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  if 'df' in st.session_state:
3
  data = st.session_state['df']
4
  st.write("Dataset available for further analysis.")