Update pages/Data_CLeaning_and_Preprocessing.py
Browse files
pages/Data_CLeaning_and_Preprocessing.py
CHANGED
|
@@ -32,15 +32,23 @@ if uploaded_file is not None:
|
|
| 32 |
|
| 33 |
# Visualizations for numeric columns
|
| 34 |
st.write("### Numeric Column Visualizations:")
|
|
|
|
|
|
|
| 35 |
st.write("Histograms:")
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
# Value counts and bar plot for categorical data
|
| 46 |
categorical_columns = data.select_dtypes(include=['object', 'category']).columns
|
|
@@ -52,25 +60,25 @@ if uploaded_file is not None:
|
|
| 52 |
st.write(f"Bar Plot for '{selected_cat_col}':")
|
| 53 |
fig, ax = plt.subplots()
|
| 54 |
sns.countplot(x=selected_cat_col, data=data, ax=ax)
|
|
|
|
| 55 |
st.pyplot(fig)
|
| 56 |
else:
|
| 57 |
st.write("No categorical columns available for analysis.")
|
| 58 |
|
| 59 |
# Correlation matrix
|
| 60 |
-
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
|
| 61 |
if len(numeric_columns) > 1:
|
| 62 |
st.write("### Correlation Matrix:")
|
| 63 |
-
|
|
|
|
| 64 |
|
| 65 |
st.write("Heatmap of Correlation Matrix:")
|
| 66 |
-
fig, ax = plt.subplots()
|
| 67 |
-
sns.heatmap(
|
| 68 |
st.pyplot(fig)
|
| 69 |
|
| 70 |
# Clean the data: Handle missing values and duplicates
|
| 71 |
st.write("### Cleaned Dataset:")
|
| 72 |
cleaned_data = data.drop_duplicates() # Remove duplicate rows
|
| 73 |
-
#cleaned_data = cleaned_data.fillna(cleaned_data.mean()) # Replace missing values with the mean for numeric columns
|
| 74 |
st.dataframe(cleaned_data)
|
| 75 |
|
| 76 |
# Download button for the cleaned dataset
|
|
@@ -87,4 +95,4 @@ if uploaded_file is not None:
|
|
| 87 |
You can now download the cleaned dataset and proceed with further analysis or modeling.
|
| 88 |
""")
|
| 89 |
else:
|
| 90 |
-
st.warning("Please upload a dataset to proceed with Simple EDA.")
|
|
|
|
| 32 |
|
| 33 |
# Visualizations for numeric columns
|
| 34 |
st.write("### Numeric Column Visualizations:")
|
| 35 |
+
|
| 36 |
+
# Histograms for numeric data
|
| 37 |
st.write("Histograms:")
|
| 38 |
+
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
|
| 39 |
+
for col in numeric_columns:
|
| 40 |
+
fig, ax = plt.subplots()
|
| 41 |
+
sns.histplot(data[col], kde=True, ax=ax)
|
| 42 |
+
ax.set_title(f'Histogram of {col}')
|
| 43 |
+
st.pyplot(fig)
|
| 44 |
|
| 45 |
+
# Boxplots for numeric data
|
| 46 |
+
st.write("Boxplots:")
|
| 47 |
+
for col in numeric_columns:
|
| 48 |
+
fig, ax = plt.subplots()
|
| 49 |
+
sns.boxplot(x=data[col], ax=ax)
|
| 50 |
+
ax.set_title(f'Boxplot of {col}')
|
| 51 |
+
st.pyplot(fig)
|
| 52 |
|
| 53 |
# Value counts and bar plot for categorical data
|
| 54 |
categorical_columns = data.select_dtypes(include=['object', 'category']).columns
|
|
|
|
| 60 |
st.write(f"Bar Plot for '{selected_cat_col}':")
|
| 61 |
fig, ax = plt.subplots()
|
| 62 |
sns.countplot(x=selected_cat_col, data=data, ax=ax)
|
| 63 |
+
ax.set_title(f'Bar Plot of {selected_cat_col}')
|
| 64 |
st.pyplot(fig)
|
| 65 |
else:
|
| 66 |
st.write("No categorical columns available for analysis.")
|
| 67 |
|
| 68 |
# Correlation matrix
|
|
|
|
| 69 |
if len(numeric_columns) > 1:
|
| 70 |
st.write("### Correlation Matrix:")
|
| 71 |
+
corr_matrix = data[numeric_columns].corr()
|
| 72 |
+
st.write(corr_matrix)
|
| 73 |
|
| 74 |
st.write("Heatmap of Correlation Matrix:")
|
| 75 |
+
fig, ax = plt.subplots(figsize=(10, 8))
|
| 76 |
+
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
|
| 77 |
st.pyplot(fig)
|
| 78 |
|
| 79 |
# Clean the data: Handle missing values and duplicates
|
| 80 |
st.write("### Cleaned Dataset:")
|
| 81 |
cleaned_data = data.drop_duplicates() # Remove duplicate rows
|
|
|
|
| 82 |
st.dataframe(cleaned_data)
|
| 83 |
|
| 84 |
# Download button for the cleaned dataset
|
|
|
|
| 95 |
You can now download the cleaned dataset and proceed with further analysis or modeling.
|
| 96 |
""")
|
| 97 |
else:
|
| 98 |
+
st.warning("Please upload a dataset to proceed with Simple EDA.")
|